Implement proper ticket categorization based on issue type
Added intelligent categorization to match tickets with correct category and type instead of defaulting everything to Hardware/Problem.

Changes:
- Added TICKET_CATEGORIES and TICKET_TYPES mappings for API consistency
- Created _categorize_issue() method to determine proper classification:

Hardware Issues:
- SMART/drive/disk errors → Hardware + Incident (critical/failed)
- SMART warnings → Hardware + Problem (needs investigation)

Software Issues:
- LXC/container/storage usage/CPU → Software category
- Critical levels → Software + Incident (service degradation)
- Warning levels → Software + Problem (preventive investigation)

Network Issues:
- Network failures/unreachable → Network + Incident
- Network warnings → Network + Problem

- Updated ticket creation to use _categorize_issue() and _determine_ticket_priority()
- Tickets now have correct tags: [incident] vs [problem] instead of always [maintenance]
- Category field in API payload now matches issue type (Hardware/Software/Network)
- Type field in API payload now reflects actual situation (Incident/Problem/Task)

Examples:
- "LXC storage usage >80%" → Software + Problem
- "Critical SMART errors" → Hardware + Incident
- "High CPU usage" → Software + Problem
- "Network unreachable" → Network + Incident

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
152
hwmonDaemon.py
152
hwmonDaemon.py
@@ -116,7 +116,10 @@ class SystemHealthMonitor:
|
|||||||
'PRODUCTION': '[production]'
|
'PRODUCTION': '[production]'
|
||||||
},
|
},
|
||||||
'TICKET_TYPE': {
|
'TICKET_TYPE': {
|
||||||
'MAINTENANCE': '[maintenance]'
|
'INCIDENT': '[incident]',
|
||||||
|
'MAINTENANCE': '[maintenance]',
|
||||||
|
'PROBLEM': '[problem]',
|
||||||
|
'TASK': '[task]'
|
||||||
},
|
},
|
||||||
'HARDWARE_TYPE': {
|
'HARDWARE_TYPE': {
|
||||||
'HARDWARE': '[hardware]'
|
'HARDWARE': '[hardware]'
|
||||||
@@ -130,9 +133,23 @@ class SystemHealthMonitor:
|
|||||||
'SCOPE': {
|
'SCOPE': {
|
||||||
'SINGLE_NODE': '[single-node]',
|
'SINGLE_NODE': '[single-node]',
|
||||||
'CLUSTER_WIDE': '[cluster-wide]'
|
'CLUSTER_WIDE': '[cluster-wide]'
|
||||||
},
|
}
|
||||||
'DEFAULT_CATEGORY': 'Hardware',
|
}
|
||||||
'DEFAULT_ISSUE_TYPE': 'Problem'
|
|
||||||
|
# Category and Type mappings for ticket API
|
||||||
|
TICKET_CATEGORIES = {
|
||||||
|
'HARDWARE': 'Hardware',
|
||||||
|
'SOFTWARE': 'Software',
|
||||||
|
'NETWORK': 'Network',
|
||||||
|
'SECURITY': 'Security',
|
||||||
|
'OTHER': 'Other'
|
||||||
|
}
|
||||||
|
|
||||||
|
TICKET_TYPES = {
|
||||||
|
'INCIDENT': 'Incident', # Unplanned interruption or service degradation
|
||||||
|
'REQUEST': 'Request', # Service or information request
|
||||||
|
'PROBLEM': 'Problem', # Root cause investigation needed
|
||||||
|
'TASK': 'Task' # Planned work item
|
||||||
}
|
}
|
||||||
|
|
||||||
PROBLEMATIC_FIRMWARE = {
|
PROBLEMATIC_FIRMWARE = {
|
||||||
@@ -1243,6 +1260,94 @@ class SystemHealthMonitor:
|
|||||||
# P4 - Low priority monitoring alerts
|
# P4 - Low priority monitoring alerts
|
||||||
return self.PRIORITIES['LOW'] # P4
|
return self.PRIORITIES['LOW'] # P4
|
||||||
|
|
||||||
|
def _categorize_issue(self, issue: str) -> tuple:
    """
    Determine the correct category, type, and tags for an issue.

    Classification is a case-insensitive substring match of ``issue``
    against fixed keyword lists, evaluated in this order (first hit wins):

    1. LXC / container issues  -> Software
    2. Hardware keywords       -> Hardware
    3. Other software keywords -> Software
    4. Network keywords        -> Network
    5. Anything else           -> Hardware + Problem (default)

    Within a category, the ticket is an Incident when the text contains
    one of that category's severity words, otherwise a Problem.

    Args:
        issue: Issue description text to classify.

    Returns:
        tuple: (category, ticket_type, issue_tag, ticket_type_tag)
            - category: value from TICKET_CATEGORIES ('Hardware', ...)
            - ticket_type: value from TICKET_TYPES ('Incident'/'Problem')
            - issue_tag: TICKET_TEMPLATES['<CATEGORY>_TYPE'] tag
            - ticket_type_tag: TICKET_TEMPLATES['TICKET_TYPE'] tag

    Raises:
        KeyError: if the selected category or type is missing from
            TICKET_CATEGORIES, TICKET_TYPES or TICKET_TEMPLATES.
    """
    issue_lower = issue.lower()

    def mentions(keywords):
        # True when any keyword occurs as a substring of the issue text.
        return any(keyword in issue_lower for keyword in keywords)

    def classify(category_key, is_incident):
        # Build the 4-tuple for one category; lookups happen only for the
        # branch that is actually taken.
        type_key = 'INCIDENT' if is_incident else 'PROBLEM'
        return (
            self.TICKET_CATEGORIES[category_key],
            self.TICKET_TYPES[type_key],
            self.TICKET_TEMPLATES[f'{category_key}_TYPE'][category_key],
            self.TICKET_TEMPLATES['TICKET_TYPE'][type_key],
        )

    # Container issues are checked first: their text often mentions generic
    # resource words ('disk', 'memory', ...) that also appear in the
    # hardware keyword list, and they must not be filed as Hardware.
    if mentions(('lxc', 'container')):
        # Critical storage/CPU is an incident (service degradation);
        # anything else needs investigation before it becomes critical.
        return classify('SOFTWARE', 'critical' in issue_lower)

    # Hardware Issues - Physical hardware problems
    if mentions((
        'smart', 'drive', 'disk', '/dev/', 'sector', 'temperature',
        'firmware', 'power_on_hours', 'reallocated', 'pending',
        'ecc', 'memory', 'high_fly_writes', 'spin_retry', 'current_pending',
    )):
        # Errors are incidents (unplanned degradation); warnings are
        # problems (need investigation).
        return classify(
            'HARDWARE',
            mentions(('critical', 'failed', 'failure', 'error')),
        )

    # Software Issues - Application/OS issues (containers handled above)
    if mentions((
        'storage usage', 'cpu usage', 'process',
        'application', 'service', 'daemon',
    )):
        return classify('SOFTWARE', 'critical' in issue_lower)

    # Network Issues - Network connectivity/infrastructure
    if mentions((
        'network', 'connectivity', 'unreachable', 'latency', 'packet loss',
        'interface', 'link down',
    )):
        # Network failures are incidents; network warnings are problems.
        return classify(
            'NETWORK',
            mentions(('failure', 'down', 'unreachable', 'critical')),
        )

    # Default: Hardware Problem (for undefined cases)
    return classify('HARDWARE', False)
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# TICKET CREATION METHODS
|
# TICKET CREATION METHODS
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
@@ -1256,24 +1361,16 @@ class SystemHealthMonitor:
|
|||||||
hostname = socket.gethostname()
|
hostname = socket.gethostname()
|
||||||
action_type = self.TICKET_TEMPLATES['ACTION_TYPE']
|
action_type = self.TICKET_TEMPLATES['ACTION_TYPE']
|
||||||
environment = self.TICKET_TEMPLATES['ENVIRONMENT']
|
environment = self.TICKET_TEMPLATES['ENVIRONMENT']
|
||||||
ticket_type = self.TICKET_TEMPLATES['TICKET_TYPE']
|
|
||||||
hardware_type = self.TICKET_TEMPLATES['HARDWARE_TYPE']
|
|
||||||
software_type = self.TICKET_TEMPLATES['SOFTWARE_TYPE']
|
|
||||||
|
|
||||||
for issue in issues:
|
|
||||||
if issue.lower().startswith('critical') or 'critical' in issue.upper():
|
|
||||||
priority = self.PRIORITIES['CRITICAL']
|
|
||||||
elif issue.lower().startswith('warning') or 'warning' in issue.lower():
|
|
||||||
# all warnings become LOW priority (4)
|
|
||||||
priority = self.PRIORITIES['LOW']
|
|
||||||
else:
|
|
||||||
# everything else stays at MEDIUM (3)
|
|
||||||
priority = self.PRIORITIES['MEDIUM']
|
|
||||||
|
|
||||||
category = self.TICKET_TEMPLATES['DEFAULT_CATEGORY']
|
|
||||||
issue_type = self.TICKET_TEMPLATES['DEFAULT_ISSUE_TYPE']
|
|
||||||
scope = self.TICKET_TEMPLATES['SCOPE']['SINGLE_NODE']
|
scope = self.TICKET_TEMPLATES['SCOPE']['SINGLE_NODE']
|
||||||
|
|
||||||
|
for issue in issues:
|
||||||
|
# Use the comprehensive priority determination function
|
||||||
|
priority = self._determine_ticket_priority(issue, health_report)
|
||||||
|
|
||||||
|
# Get proper categorization for this issue
|
||||||
|
category, ticket_type, issue_tag, ticket_type_tag = self._categorize_issue(issue)
|
||||||
|
|
||||||
|
# Extract drive capacity if this is a drive-related issue
|
||||||
drive_size = ""
|
drive_size = ""
|
||||||
if "Drive" in issue and "/dev/" in issue:
|
if "Drive" in issue and "/dev/" in issue:
|
||||||
device_match = re.search(r'/dev/[a-zA-Z0-9]+', issue)
|
device_match = re.search(r'/dev/[a-zA-Z0-9]+', issue)
|
||||||
@@ -1285,21 +1382,16 @@ class SystemHealthMonitor:
|
|||||||
else:
|
else:
|
||||||
logger.warning(f"Could not extract device from issue: {issue}")
|
logger.warning(f"Could not extract device from issue: {issue}")
|
||||||
|
|
||||||
# Determine if this is a hardware or software issue
|
# Build ticket title with proper categorization
|
||||||
issue_category = 'SOFTWARE' if 'LXC' in issue else 'HARDWARE'
|
|
||||||
|
|
||||||
# Use the correct template based on issue category
|
|
||||||
category_template = hardware_type['HARDWARE'] if issue_category == 'HARDWARE' else software_type['SOFTWARE']
|
|
||||||
|
|
||||||
ticket_title = (
|
ticket_title = (
|
||||||
f"[{hostname}]"
|
f"[{hostname}]"
|
||||||
f"{action_type['AUTO']}"
|
f"{action_type['AUTO']}"
|
||||||
f"{category_template}"
|
f"{issue_tag}"
|
||||||
f"{drive_size}" # Insert drive capacity here
|
f"{drive_size}"
|
||||||
f"{issue}"
|
f"{issue}"
|
||||||
f"{scope}"
|
f"{scope}"
|
||||||
f"{environment['PRODUCTION']}"
|
f"{environment['PRODUCTION']}"
|
||||||
f"{ticket_type['MAINTENANCE']}"
|
f"{ticket_type_tag}"
|
||||||
)
|
)
|
||||||
description = self._generate_detailed_description(issue, health_report)
|
description = self._generate_detailed_description(issue, health_report)
|
||||||
|
|
||||||
@@ -1309,7 +1401,7 @@ class SystemHealthMonitor:
|
|||||||
"priority": priority,
|
"priority": priority,
|
||||||
"status": "Open",
|
"status": "Open",
|
||||||
"category": category,
|
"category": category,
|
||||||
"type": issue_type
|
"type": ticket_type
|
||||||
}
|
}
|
||||||
|
|
||||||
if self.dry_run:
|
if self.dry_run:
|
||||||
|
|||||||
Reference in New Issue
Block a user