Implement proper ticket categorization based on issue type

Added intelligent categorization so that each ticket is filed under the correct
category and type instead of defaulting everything to Hardware/Problem.

Changes:
- Added TICKET_CATEGORIES and TICKET_TYPES mappings for API consistency
- Created _categorize_issue() method to determine proper classification:

  Hardware Issues:
  - SMART/drive/disk errors → Hardware + Incident (critical/failed)
  - SMART warnings → Hardware + Problem (needs investigation)

  Software Issues:
  - LXC/container/storage usage/CPU → Software category
  - Critical levels → Software + Incident (service degradation)
  - Warning levels → Software + Problem (preventive investigation)

  Network Issues:
  - Network failures/unreachable → Network + Incident
  - Network warnings → Network + Problem

- Updated ticket creation to use _categorize_issue() and _determine_ticket_priority()
- Tickets now have correct tags: [incident] vs [problem] instead of always [maintenance]
- Category field in API payload now matches issue type (Hardware/Software/Network)
- Type field in API payload now reflects actual situation (Incident/Problem/Task)

Examples:
- "LXC storage usage >80%" → Software + Problem
- "Critical SMART errors" → Hardware + Incident
- "High CPU usage" → Software + Problem
- "Network unreachable" → Network + Incident

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-08 13:26:17 -05:00
parent 88afc8f03e
commit 0f81d015cd

View File

@@ -116,7 +116,10 @@ class SystemHealthMonitor:
'PRODUCTION': '[production]' 'PRODUCTION': '[production]'
}, },
'TICKET_TYPE': { 'TICKET_TYPE': {
'MAINTENANCE': '[maintenance]' 'INCIDENT': '[incident]',
'MAINTENANCE': '[maintenance]',
'PROBLEM': '[problem]',
'TASK': '[task]'
}, },
'HARDWARE_TYPE': { 'HARDWARE_TYPE': {
'HARDWARE': '[hardware]' 'HARDWARE': '[hardware]'
@@ -130,9 +133,23 @@ class SystemHealthMonitor:
'SCOPE': { 'SCOPE': {
'SINGLE_NODE': '[single-node]', 'SINGLE_NODE': '[single-node]',
'CLUSTER_WIDE': '[cluster-wide]' 'CLUSTER_WIDE': '[cluster-wide]'
}, }
'DEFAULT_CATEGORY': 'Hardware', }
'DEFAULT_ISSUE_TYPE': 'Problem'
# Category and Type mappings for ticket API
# Ticket category names keyed by an internal identifier. _categorize_issue()
# returns one of these values as the first element of its result tuple.
TICKET_CATEGORIES = {
'HARDWARE': 'Hardware',
'SOFTWARE': 'Software',
'NETWORK': 'Network',
# SECURITY and OTHER are defined here but not returned by _categorize_issue().
'SECURITY': 'Security',
'OTHER': 'Other'
}
TICKET_TYPES = {
'INCIDENT': 'Incident', # Unplanned interruption or service degradation
'REQUEST': 'Request', # Service or information request
'PROBLEM': 'Problem', # Root cause investigation needed
'TASK': 'Task' # Planned work item
} }
PROBLEMATIC_FIRMWARE = { PROBLEMATIC_FIRMWARE = {
@@ -1243,6 +1260,94 @@ class SystemHealthMonitor:
# P4 - Low priority monitoring alerts # P4 - Low priority monitoring alerts
return self.PRIORITIES['LOW'] # P4 return self.PRIORITIES['LOW'] # P4
def _categorize_issue(self, issue: str) -> tuple:
    """
    Classify an issue description into a ticket category, type, and title tags.

    The lower-cased issue text is tested against keyword families in a fixed
    order (hardware, then software, then network); the first family with any
    keyword present as a substring wins. Within that family the ticket is an
    Incident when one of the family's escalation words is also present,
    otherwise a Problem. Text matching no family is treated as a Hardware
    Problem.

    Args:
        issue: Free-text issue description.

    Returns:
        tuple: (category, ticket_type, issue_tag, ticket_type_tag)
            - category: value from TICKET_CATEGORIES
            - ticket_type: value from TICKET_TYPES
            - issue_tag: value from TICKET_TEMPLATES['<FAMILY>_TYPE']['<FAMILY>']
            - ticket_type_tag: value from TICKET_TEMPLATES['TICKET_TYPE']
    """
    text = issue.lower()

    # Ordered rule table; order is significant because the first match wins.
    # Each row: (family key, TICKET_TEMPLATES group holding the family's tag,
    #            keywords that select the family,
    #            keywords that escalate the ticket to an Incident).
    rules = (
        (
            'HARDWARE', 'HARDWARE_TYPE',
            (
                'smart', 'drive', 'disk', '/dev/', 'sector', 'temperature',
                'firmware', 'power_on_hours', 'reallocated', 'pending',
                'ecc', 'memory', 'high_fly_writes', 'spin_retry',
                'current_pending',
            ),
            ('critical', 'failed', 'failure', 'error'),
        ),
        (
            'SOFTWARE', 'SOFTWARE_TYPE',
            (
                'lxc', 'container', 'storage usage', 'cpu usage', 'process',
                'application', 'service', 'daemon',
            ),
            ('critical',),
        ),
        (
            'NETWORK', 'NETWORK_TYPE',
            (
                'network', 'connectivity', 'unreachable', 'latency',
                'packet loss', 'interface', 'link down',
            ),
            ('failure', 'down', 'unreachable', 'critical'),
        ),
    )

    # Fallback for text that matches no family: Hardware + Problem.
    family, tag_group, severity = 'HARDWARE', 'HARDWARE_TYPE', 'PROBLEM'

    for name, group, selectors, escalators in rules:
        if not any(word in text for word in selectors):
            continue
        family, tag_group = name, group
        if any(word in text for word in escalators):
            severity = 'INCIDENT'
        break

    # Lookups are deferred until the family is known, so only the template
    # entries for the chosen family are ever read.
    return (
        self.TICKET_CATEGORIES[family],
        self.TICKET_TYPES[severity],
        self.TICKET_TEMPLATES[tag_group][family],
        self.TICKET_TEMPLATES['TICKET_TYPE'][severity],
    )
# ============================================================================= # =============================================================================
# TICKET CREATION METHODS # TICKET CREATION METHODS
# ============================================================================= # =============================================================================
@@ -1256,24 +1361,16 @@ class SystemHealthMonitor:
hostname = socket.gethostname() hostname = socket.gethostname()
action_type = self.TICKET_TEMPLATES['ACTION_TYPE'] action_type = self.TICKET_TEMPLATES['ACTION_TYPE']
environment = self.TICKET_TEMPLATES['ENVIRONMENT'] environment = self.TICKET_TEMPLATES['ENVIRONMENT']
ticket_type = self.TICKET_TEMPLATES['TICKET_TYPE'] scope = self.TICKET_TEMPLATES['SCOPE']['SINGLE_NODE']
hardware_type = self.TICKET_TEMPLATES['HARDWARE_TYPE']
software_type = self.TICKET_TEMPLATES['SOFTWARE_TYPE']
for issue in issues: for issue in issues:
if issue.lower().startswith('critical') or 'critical' in issue.upper(): # Use the comprehensive priority determination function
priority = self.PRIORITIES['CRITICAL'] priority = self._determine_ticket_priority(issue, health_report)
elif issue.lower().startswith('warning') or 'warning' in issue.lower():
# all warnings become LOW priority (4)
priority = self.PRIORITIES['LOW']
else:
# everything else stays at MEDIUM (3)
priority = self.PRIORITIES['MEDIUM']
category = self.TICKET_TEMPLATES['DEFAULT_CATEGORY'] # Get proper categorization for this issue
issue_type = self.TICKET_TEMPLATES['DEFAULT_ISSUE_TYPE'] category, ticket_type, issue_tag, ticket_type_tag = self._categorize_issue(issue)
scope = self.TICKET_TEMPLATES['SCOPE']['SINGLE_NODE']
# Extract drive capacity if this is a drive-related issue
drive_size = "" drive_size = ""
if "Drive" in issue and "/dev/" in issue: if "Drive" in issue and "/dev/" in issue:
device_match = re.search(r'/dev/[a-zA-Z0-9]+', issue) device_match = re.search(r'/dev/[a-zA-Z0-9]+', issue)
@@ -1285,21 +1382,16 @@ class SystemHealthMonitor:
else: else:
logger.warning(f"Could not extract device from issue: {issue}") logger.warning(f"Could not extract device from issue: {issue}")
# Determine if this is a hardware or software issue # Build ticket title with proper categorization
issue_category = 'SOFTWARE' if 'LXC' in issue else 'HARDWARE'
# Use the correct template based on issue category
category_template = hardware_type['HARDWARE'] if issue_category == 'HARDWARE' else software_type['SOFTWARE']
ticket_title = ( ticket_title = (
f"[{hostname}]" f"[{hostname}]"
f"{action_type['AUTO']}" f"{action_type['AUTO']}"
f"{category_template}" f"{issue_tag}"
f"{drive_size}" # Insert drive capacity here f"{drive_size}"
f"{issue}" f"{issue}"
f"{scope}" f"{scope}"
f"{environment['PRODUCTION']}" f"{environment['PRODUCTION']}"
f"{ticket_type['MAINTENANCE']}" f"{ticket_type_tag}"
) )
description = self._generate_detailed_description(issue, health_report) description = self._generate_detailed_description(issue, health_report)
@@ -1309,7 +1401,7 @@ class SystemHealthMonitor:
"priority": priority, "priority": priority,
"status": "Open", "status": "Open",
"category": category, "category": category,
"type": issue_type "type": ticket_type
} }
if self.dry_run: if self.dry_run: