Implement proper ticket categorization based on issue type

Added intelligent categorization that assigns each ticket the correct category
and type instead of defaulting everything to Hardware/Problem.

Changes:
- Added TICKET_CATEGORIES and TICKET_TYPES mappings for API consistency
- Created _categorize_issue() method to determine proper classification:

  Hardware Issues:
  - SMART/drive/disk errors → Hardware + Incident (critical/failed)
  - SMART warnings → Hardware + Problem (needs investigation)

  Software Issues:
  - LXC/container/storage usage/CPU → Software category
  - Critical levels → Software + Incident (service degradation)
  - Warning levels → Software + Problem (preventive investigation)

  Network Issues:
  - Network failures/unreachable → Network + Incident
  - Network warnings → Network + Problem

- Updated ticket creation to use _categorize_issue() and _determine_ticket_priority()
- Tickets now have correct tags: [incident] vs [problem] instead of always [maintenance]
- Category field in API payload now matches issue type (Hardware/Software/Network)
- Type field in API payload now reflects actual situation (Incident or Problem; Task and Request are defined in TICKET_TYPES but are not assigned by _categorize_issue())

Examples:
- "LXC storage usage >80%" → Software + Problem
- "Critical SMART errors" → Hardware + Incident
- "High CPU usage" → Software + Problem
- "Network unreachable" → Network + Incident

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-08 13:26:17 -05:00
parent 88afc8f03e
commit 0f81d015cd

View File

@@ -116,7 +116,10 @@ class SystemHealthMonitor:
'PRODUCTION': '[production]'
},
'TICKET_TYPE': {
'MAINTENANCE': '[maintenance]'
'INCIDENT': '[incident]',
'MAINTENANCE': '[maintenance]',
'PROBLEM': '[problem]',
'TASK': '[task]'
},
'HARDWARE_TYPE': {
'HARDWARE': '[hardware]'
@@ -130,9 +133,23 @@ class SystemHealthMonitor:
'SCOPE': {
'SINGLE_NODE': '[single-node]',
'CLUSTER_WIDE': '[cluster-wide]'
},
'DEFAULT_CATEGORY': 'Hardware',
'DEFAULT_ISSUE_TYPE': 'Problem'
}
}
# Category and Type mappings for ticket API.
# Each key is the upper-cased form of the label sent to the API, so the
# mappings are derived from the label lists to keep the two in lockstep.
TICKET_CATEGORIES = {
    label.upper(): label
    for label in ('Hardware', 'Software', 'Network', 'Security', 'Other')
}
TICKET_TYPES = {
    label.upper(): label
    for label in (
        'Incident',  # Unplanned interruption or service degradation
        'Request',   # Service or information request
        'Problem',   # Root cause investigation needed
        'Task',      # Planned work item
    )
}
PROBLEMATIC_FIRMWARE = {
@@ -1243,6 +1260,94 @@ class SystemHealthMonitor:
# P4 - Low priority monitoring alerts
return self.PRIORITIES['LOW'] # P4
def _categorize_issue(self, issue: str) -> tuple:
    """
    Classify an issue string into an API category/type plus title tags.

    The issue text is lower-cased and checked against keyword rules in a
    fixed order: hardware, then software, then network. The first rule
    with a matching keyword decides the category. Within that rule the
    ticket type is 'Incident' if any of the rule's severity markers
    appears in the text, otherwise 'Problem'. Text that matches no rule
    is classified as a Hardware Problem.

    Matching is plain substring containment, not whole-word matching.

    Args:
        issue: Human-readable issue description.

    Returns:
        tuple: (category, ticket_type, issue_tag, ticket_type_tag)
            - category: value from self.TICKET_CATEGORIES
            - ticket_type: value from self.TICKET_TYPES
            - issue_tag: value from self.TICKET_TEMPLATES['<CATEGORY>_TYPE']
            - ticket_type_tag: value from self.TICKET_TEMPLATES['TICKET_TYPE']
    """
    # Ordered rules: (category key, trigger keywords, incident markers).
    # Order matters - an issue mentioning both a drive and a container is
    # treated as hardware because the hardware rule is evaluated first.
    rules = (
        (
            'HARDWARE',
            ('smart', 'drive', 'disk', '/dev/', 'sector', 'temperature',
             'firmware', 'power_on_hours', 'reallocated', 'pending',
             'ecc', 'memory', 'high_fly_writes', 'spin_retry',
             'current_pending'),
            ('critical', 'failed', 'failure', 'error'),
        ),
        (
            'SOFTWARE',
            ('lxc', 'container', 'storage usage', 'cpu usage', 'process',
             'application', 'service', 'daemon'),
            ('critical',),
        ),
        (
            'NETWORK',
            ('network', 'connectivity', 'unreachable', 'latency',
             'packet loss', 'interface', 'link down'),
            ('failure', 'down', 'unreachable', 'critical'),
        ),
    )

    text = issue.lower()

    # Fallback for text that matches no rule: Hardware Problem.
    category_key = 'HARDWARE'
    type_key = 'PROBLEM'

    for candidate, triggers, incident_markers in rules:
        if not any(trigger in text for trigger in triggers):
            continue
        category_key = candidate
        if any(marker in text for marker in incident_markers):
            type_key = 'INCIDENT'
        break

    # Template groups are looked up only for the selected category, so a
    # missing group for an unselected category is never touched.
    return (
        self.TICKET_CATEGORIES[category_key],
        self.TICKET_TYPES[type_key],
        self.TICKET_TEMPLATES[f'{category_key}_TYPE'][category_key],
        self.TICKET_TEMPLATES['TICKET_TYPE'][type_key],
    )
# =============================================================================
# TICKET CREATION METHODS
# =============================================================================
@@ -1256,24 +1361,16 @@ class SystemHealthMonitor:
hostname = socket.gethostname()
action_type = self.TICKET_TEMPLATES['ACTION_TYPE']
environment = self.TICKET_TEMPLATES['ENVIRONMENT']
ticket_type = self.TICKET_TEMPLATES['TICKET_TYPE']
hardware_type = self.TICKET_TEMPLATES['HARDWARE_TYPE']
software_type = self.TICKET_TEMPLATES['SOFTWARE_TYPE']
scope = self.TICKET_TEMPLATES['SCOPE']['SINGLE_NODE']
for issue in issues:
if issue.lower().startswith('critical') or 'critical' in issue.upper():
priority = self.PRIORITIES['CRITICAL']
elif issue.lower().startswith('warning') or 'warning' in issue.lower():
# all warnings become LOW priority (4)
priority = self.PRIORITIES['LOW']
else:
# everything else stays at MEDIUM (3)
priority = self.PRIORITIES['MEDIUM']
# Use the comprehensive priority determination function
priority = self._determine_ticket_priority(issue, health_report)
category = self.TICKET_TEMPLATES['DEFAULT_CATEGORY']
issue_type = self.TICKET_TEMPLATES['DEFAULT_ISSUE_TYPE']
scope = self.TICKET_TEMPLATES['SCOPE']['SINGLE_NODE']
# Get proper categorization for this issue
category, ticket_type, issue_tag, ticket_type_tag = self._categorize_issue(issue)
# Extract drive capacity if this is a drive-related issue
drive_size = ""
if "Drive" in issue and "/dev/" in issue:
device_match = re.search(r'/dev/[a-zA-Z0-9]+', issue)
@@ -1285,21 +1382,16 @@ class SystemHealthMonitor:
else:
logger.warning(f"Could not extract device from issue: {issue}")
# Determine if this is a hardware or software issue
issue_category = 'SOFTWARE' if 'LXC' in issue else 'HARDWARE'
# Use the correct template based on issue category
category_template = hardware_type['HARDWARE'] if issue_category == 'HARDWARE' else software_type['SOFTWARE']
# Build ticket title with proper categorization
ticket_title = (
f"[{hostname}]"
f"{action_type['AUTO']}"
f"{category_template}"
f"{drive_size}" # Insert drive capacity here
f"{issue_tag}"
f"{drive_size}"
f"{issue}"
f"{scope}"
f"{environment['PRODUCTION']}"
f"{ticket_type['MAINTENANCE']}"
f"{ticket_type_tag}"
)
description = self._generate_detailed_description(issue, health_report)
@@ -1309,7 +1401,7 @@ class SystemHealthMonitor:
"priority": priority,
"status": "Open",
"category": category,
"type": issue_type
"type": ticket_type
}
if self.dry_run: