Implement proper ticket categorization based on issue type
Added intelligent categorization to match tickets with correct category and type instead of defaulting everything to Hardware/Problem. Changes: - Added TICKET_CATEGORIES and TICKET_TYPES mappings for API consistency - Created _categorize_issue() method to determine proper classification: Hardware Issues: - SMART/drive/disk errors → Hardware + Incident (critical/failed) - SMART warnings → Hardware + Problem (needs investigation) Software Issues: - LXC/container/storage usage/CPU → Software category - Critical levels → Software + Incident (service degradation) - Warning levels → Software + Problem (preventive investigation) Network Issues: - Network failures/unreachable → Network + Incident - Network warnings → Network + Problem - Updated ticket creation to use _categorize_issue() and _determine_ticket_priority() - Tickets now have correct tags: [incident] vs [problem] instead of always [maintenance] - Category field in API payload now matches issue type (Hardware/Software/Network) - Type field in API payload now reflects actual situation (Incident/Problem/Task) Examples: - "LXC storage usage >80%" → Software + Problem - "Critical SMART errors" → Hardware + Incident - "High CPU usage" → Software + Problem - "Network unreachable" → Network + Incident Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
148
hwmonDaemon.py
148
hwmonDaemon.py
@@ -116,7 +116,10 @@ class SystemHealthMonitor:
|
||||
'PRODUCTION': '[production]'
|
||||
},
|
||||
'TICKET_TYPE': {
|
||||
'MAINTENANCE': '[maintenance]'
|
||||
'INCIDENT': '[incident]',
|
||||
'MAINTENANCE': '[maintenance]',
|
||||
'PROBLEM': '[problem]',
|
||||
'TASK': '[task]'
|
||||
},
|
||||
'HARDWARE_TYPE': {
|
||||
'HARDWARE': '[hardware]'
|
||||
@@ -130,9 +133,23 @@ class SystemHealthMonitor:
|
||||
'SCOPE': {
|
||||
'SINGLE_NODE': '[single-node]',
|
||||
'CLUSTER_WIDE': '[cluster-wide]'
|
||||
},
|
||||
'DEFAULT_CATEGORY': 'Hardware',
|
||||
'DEFAULT_ISSUE_TYPE': 'Problem'
|
||||
}
|
||||
}
|
||||
|
||||
# Category and Type mappings for ticket API
|
||||
TICKET_CATEGORIES = {
|
||||
'HARDWARE': 'Hardware',
|
||||
'SOFTWARE': 'Software',
|
||||
'NETWORK': 'Network',
|
||||
'SECURITY': 'Security',
|
||||
'OTHER': 'Other'
|
||||
}
|
||||
|
||||
TICKET_TYPES = {
|
||||
'INCIDENT': 'Incident', # Unplanned interruption or service degradation
|
||||
'REQUEST': 'Request', # Service or information request
|
||||
'PROBLEM': 'Problem', # Root cause investigation needed
|
||||
'TASK': 'Task' # Planned work item
|
||||
}
|
||||
|
||||
PROBLEMATIC_FIRMWARE = {
|
||||
@@ -1243,6 +1260,94 @@ class SystemHealthMonitor:
|
||||
# P4 - Low priority monitoring alerts
|
||||
return self.PRIORITIES['LOW'] # P4
|
||||
|
||||
def _categorize_issue(self, issue: str) -> tuple:
|
||||
"""
|
||||
Determine the correct category, type, and tags for an issue.
|
||||
|
||||
Returns:
|
||||
tuple: (category, ticket_type, issue_tag, ticket_type_tag)
|
||||
- category: 'Hardware', 'Software', 'Network', etc.
|
||||
- ticket_type: 'Incident', 'Problem', 'Task', 'Request'
|
||||
- issue_tag: '[hardware]', '[software]', '[network]'
|
||||
- ticket_type_tag: '[incident]', '[problem]', etc.
|
||||
"""
|
||||
issue_lower = issue.lower()
|
||||
|
||||
# Hardware Issues - Physical hardware problems
|
||||
if any(keyword in issue_lower for keyword in [
|
||||
'smart', 'drive', 'disk', '/dev/', 'sector', 'temperature',
|
||||
'firmware', 'power_on_hours', 'reallocated', 'pending',
|
||||
'ecc', 'memory', 'high_fly_writes', 'spin_retry', 'current_pending'
|
||||
]):
|
||||
# SMART errors are incidents (unplanned degradation)
|
||||
if any(error in issue_lower for error in ['critical', 'failed', 'failure', 'error']):
|
||||
return (
|
||||
self.TICKET_CATEGORIES['HARDWARE'],
|
||||
self.TICKET_TYPES['INCIDENT'],
|
||||
self.TICKET_TEMPLATES['HARDWARE_TYPE']['HARDWARE'],
|
||||
self.TICKET_TEMPLATES['TICKET_TYPE']['INCIDENT']
|
||||
)
|
||||
# SMART warnings are problems (need investigation)
|
||||
else:
|
||||
return (
|
||||
self.TICKET_CATEGORIES['HARDWARE'],
|
||||
self.TICKET_TYPES['PROBLEM'],
|
||||
self.TICKET_TEMPLATES['HARDWARE_TYPE']['HARDWARE'],
|
||||
self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM']
|
||||
)
|
||||
|
||||
# Software Issues - Application/OS/Container issues
|
||||
if any(keyword in issue_lower for keyword in [
|
||||
'lxc', 'container', 'storage usage', 'cpu usage', 'process',
|
||||
'application', 'service', 'daemon'
|
||||
]):
|
||||
# Critical storage/CPU is an incident (service degradation)
|
||||
if 'critical' in issue_lower:
|
||||
return (
|
||||
self.TICKET_CATEGORIES['SOFTWARE'],
|
||||
self.TICKET_TYPES['INCIDENT'],
|
||||
self.TICKET_TEMPLATES['SOFTWARE_TYPE']['SOFTWARE'],
|
||||
self.TICKET_TEMPLATES['TICKET_TYPE']['INCIDENT']
|
||||
)
|
||||
# Warning level is a problem (needs investigation before it becomes critical)
|
||||
else:
|
||||
return (
|
||||
self.TICKET_CATEGORIES['SOFTWARE'],
|
||||
self.TICKET_TYPES['PROBLEM'],
|
||||
self.TICKET_TEMPLATES['SOFTWARE_TYPE']['SOFTWARE'],
|
||||
self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM']
|
||||
)
|
||||
|
||||
# Network Issues - Network connectivity/infrastructure
|
||||
if any(keyword in issue_lower for keyword in [
|
||||
'network', 'connectivity', 'unreachable', 'latency', 'packet loss',
|
||||
'interface', 'link down'
|
||||
]):
|
||||
# Network failures are incidents
|
||||
if any(error in issue_lower for error in ['failure', 'down', 'unreachable', 'critical']):
|
||||
return (
|
||||
self.TICKET_CATEGORIES['NETWORK'],
|
||||
self.TICKET_TYPES['INCIDENT'],
|
||||
self.TICKET_TEMPLATES['NETWORK_TYPE']['NETWORK'],
|
||||
self.TICKET_TEMPLATES['TICKET_TYPE']['INCIDENT']
|
||||
)
|
||||
# Network warnings are problems
|
||||
else:
|
||||
return (
|
||||
self.TICKET_CATEGORIES['NETWORK'],
|
||||
self.TICKET_TYPES['PROBLEM'],
|
||||
self.TICKET_TEMPLATES['NETWORK_TYPE']['NETWORK'],
|
||||
self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM']
|
||||
)
|
||||
|
||||
# Default: Hardware Problem (for undefined cases)
|
||||
return (
|
||||
self.TICKET_CATEGORIES['HARDWARE'],
|
||||
self.TICKET_TYPES['PROBLEM'],
|
||||
self.TICKET_TEMPLATES['HARDWARE_TYPE']['HARDWARE'],
|
||||
self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM']
|
||||
)
|
||||
|
||||
# =============================================================================
|
||||
# TICKET CREATION METHODS
|
||||
# =============================================================================
|
||||
@@ -1256,24 +1361,16 @@ class SystemHealthMonitor:
|
||||
hostname = socket.gethostname()
|
||||
action_type = self.TICKET_TEMPLATES['ACTION_TYPE']
|
||||
environment = self.TICKET_TEMPLATES['ENVIRONMENT']
|
||||
ticket_type = self.TICKET_TEMPLATES['TICKET_TYPE']
|
||||
hardware_type = self.TICKET_TEMPLATES['HARDWARE_TYPE']
|
||||
software_type = self.TICKET_TEMPLATES['SOFTWARE_TYPE']
|
||||
scope = self.TICKET_TEMPLATES['SCOPE']['SINGLE_NODE']
|
||||
|
||||
for issue in issues:
|
||||
if issue.lower().startswith('critical') or 'critical' in issue.upper():
|
||||
priority = self.PRIORITIES['CRITICAL']
|
||||
elif issue.lower().startswith('warning') or 'warning' in issue.lower():
|
||||
# all warnings become LOW priority (4)
|
||||
priority = self.PRIORITIES['LOW']
|
||||
else:
|
||||
# everything else stays at MEDIUM (3)
|
||||
priority = self.PRIORITIES['MEDIUM']
|
||||
# Use the comprehensive priority determination function
|
||||
priority = self._determine_ticket_priority(issue, health_report)
|
||||
|
||||
category = self.TICKET_TEMPLATES['DEFAULT_CATEGORY']
|
||||
issue_type = self.TICKET_TEMPLATES['DEFAULT_ISSUE_TYPE']
|
||||
scope = self.TICKET_TEMPLATES['SCOPE']['SINGLE_NODE']
|
||||
# Get proper categorization for this issue
|
||||
category, ticket_type, issue_tag, ticket_type_tag = self._categorize_issue(issue)
|
||||
|
||||
# Extract drive capacity if this is a drive-related issue
|
||||
drive_size = ""
|
||||
if "Drive" in issue and "/dev/" in issue:
|
||||
device_match = re.search(r'/dev/[a-zA-Z0-9]+', issue)
|
||||
@@ -1285,21 +1382,16 @@ class SystemHealthMonitor:
|
||||
else:
|
||||
logger.warning(f"Could not extract device from issue: {issue}")
|
||||
|
||||
# Determine if this is a hardware or software issue
|
||||
issue_category = 'SOFTWARE' if 'LXC' in issue else 'HARDWARE'
|
||||
|
||||
# Use the correct template based on issue category
|
||||
category_template = hardware_type['HARDWARE'] if issue_category == 'HARDWARE' else software_type['SOFTWARE']
|
||||
|
||||
# Build ticket title with proper categorization
|
||||
ticket_title = (
|
||||
f"[{hostname}]"
|
||||
f"{action_type['AUTO']}"
|
||||
f"{category_template}"
|
||||
f"{drive_size}" # Insert drive capacity here
|
||||
f"{issue_tag}"
|
||||
f"{drive_size}"
|
||||
f"{issue}"
|
||||
f"{scope}"
|
||||
f"{environment['PRODUCTION']}"
|
||||
f"{ticket_type['MAINTENANCE']}"
|
||||
f"{ticket_type_tag}"
|
||||
)
|
||||
description = self._generate_detailed_description(issue, health_report)
|
||||
|
||||
@@ -1309,7 +1401,7 @@ class SystemHealthMonitor:
|
||||
"priority": priority,
|
||||
"status": "Open",
|
||||
"category": category,
|
||||
"type": issue_type
|
||||
"type": ticket_type
|
||||
}
|
||||
|
||||
if self.dry_run:
|
||||
|
||||
Reference in New Issue
Block a user