Implement proper ticket categorization based on issue type

Added intelligent categorization to match tickets with the correct category and type instead of defaulting everything to Hardware/Problem.

Changes:
- Added TICKET_CATEGORIES and TICKET_TYPES mappings for API consistency
- Created _categorize_issue() method to determine proper classification:

  Hardware Issues:
  - SMART/drive/disk errors → Hardware + Incident (critical/failed)
  - SMART warnings → Hardware + Problem (needs investigation)

  Software Issues:
  - LXC/container/storage usage/CPU → Software category
  - Critical levels → Software + Incident (service degradation)
  - Warning levels → Software + Problem (preventive investigation)

  Network Issues:
  - Network failures/unreachable → Network + Incident
  - Network warnings → Network + Problem

- Updated ticket creation to use _categorize_issue() and _determine_ticket_priority()
- Tickets now have correct tags: [incident] vs [problem] instead of always [maintenance]
- Category field in API payload now matches issue type (Hardware/Software/Network)
- Type field in API payload now reflects actual situation (Incident/Problem/Task)

Examples:
- "LXC storage usage >80%" → Software + Problem
- "Critical SMART errors" → Hardware + Incident
- "High CPU usage" → Software + Problem
- "Network unreachable" → Network + Incident

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-08 13:26:17 -05:00
parent 88afc8f03e
commit 0f81d015cd
1 changed files with 120 additions and 28 deletions
--- a/hwmonDaemon.py
+++ b/hwmonDaemon.py
@@ -116,7 +116,10 @@ class SystemHealthMonitor:
            'PRODUCTION': '[production]'
        },
        'TICKET_TYPE': {
-            'MAINTENANCE': '[maintenance]'
+            'INCIDENT': '[incident]',
+            'MAINTENANCE': '[maintenance]',
+            'PROBLEM': '[problem]',
+            'TASK': '[task]'
        },
        'HARDWARE_TYPE': {
            'HARDWARE': '[hardware]'
@@ -130,9 +133,23 @@ class SystemHealthMonitor:
        'SCOPE': {
            'SINGLE_NODE': '[single-node]',
            'CLUSTER_WIDE': '[cluster-wide]'
-        },
-        'DEFAULT_CATEGORY': 'Hardware',
-        'DEFAULT_ISSUE_TYPE': 'Problem'
+        }
+    }
+
+    # Human-readable category and type names sent in the ticket API payload
+    TICKET_CATEGORIES = {
+        'HARDWARE': 'Hardware',
+        'SOFTWARE': 'Software',
+        'NETWORK': 'Network',
+        'SECURITY': 'Security',
+        'OTHER': 'Other'
+    }
+
+    TICKET_TYPES = {
+        'INCIDENT': 'Incident',      # Unplanned interruption or service degradation
+        'REQUEST': 'Request',         # Service or information request; no '[request]' tag exists in TICKET_TEMPLATES['TICKET_TYPE']
+        'PROBLEM': 'Problem',         # Root cause investigation needed
+        'TASK': 'Task'                # Planned work item; like REQUEST, never returned by _categorize_issue()
     }
    
    PROBLEMATIC_FIRMWARE = {
@@ -1243,6 +1260,94 @@ class SystemHealthMonitor:
        # P4 - Low priority monitoring alerts
        return self.PRIORITIES['LOW']  # P4

+    def _categorize_issue(self, issue: str) -> tuple:
+        """
+        Classify an issue string by case-insensitive keyword (substring) matching.
+
+        Keyword groups are tested in order (hardware, software, network); the first
+        group that matches wins, and unmatched issues default to Hardware/Problem.
+
+        Returns:
+            tuple: (category, ticket_type, issue_tag, ticket_type_tag), taken from
+                TICKET_CATEGORIES, TICKET_TYPES and TICKET_TEMPLATES (both tags).
+        """
+        issue_lower = issue.lower()
+
+        # Hardware issues - tested first, so a keyword here outranks the software/network groups
+        if any(keyword in issue_lower for keyword in [
+            'smart', 'drive', 'disk', '/dev/', 'sector', 'temperature',
+            'firmware', 'power_on_hours', 'reallocated', 'pending',
+            'ecc', 'memory', 'high_fly_writes', 'spin_retry', 'current_pending'
+        ]):
+            # Incident when the text also contains critical/failed/failure/error (substring match)
+            if any(error in issue_lower for error in ['critical', 'failed', 'failure', 'error']):
+                return (
+                    self.TICKET_CATEGORIES['HARDWARE'],
+                    self.TICKET_TYPES['INCIDENT'],
+                    self.TICKET_TEMPLATES['HARDWARE_TYPE']['HARDWARE'],
+                    self.TICKET_TEMPLATES['TICKET_TYPE']['INCIDENT']
+                )
+            # Every other hardware match is a problem (needs investigation)
+            else:
+                return (
+                    self.TICKET_CATEGORIES['HARDWARE'],
+                    self.TICKET_TYPES['PROBLEM'],
+                    self.TICKET_TEMPLATES['HARDWARE_TYPE']['HARDWARE'],
+                    self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM']
+                )
+
+        # Software issues - reached only if no hardware keyword (e.g. 'disk', 'memory') matched above
+        if any(keyword in issue_lower for keyword in [
+            'lxc', 'container', 'storage usage', 'cpu usage', 'process',
+            'application', 'service', 'daemon'
+        ]):
+            # Only the word 'critical' escalates a software issue to an incident (service degradation)
+            if 'critical' in issue_lower:
+                return (
+                    self.TICKET_CATEGORIES['SOFTWARE'],
+                    self.TICKET_TYPES['INCIDENT'],
+                    self.TICKET_TEMPLATES['SOFTWARE_TYPE']['SOFTWARE'],
+                    self.TICKET_TEMPLATES['TICKET_TYPE']['INCIDENT']
+                )
+            # Anything else is a problem (needs investigation before it becomes critical)
+            else:
+                return (
+                    self.TICKET_CATEGORIES['SOFTWARE'],
+                    self.TICKET_TYPES['PROBLEM'],
+                    self.TICKET_TEMPLATES['SOFTWARE_TYPE']['SOFTWARE'],
+                    self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM']
+                )
+
+        # Network issues - reached only if neither the hardware nor the software group matched
+        if any(keyword in issue_lower for keyword in [
+            'network', 'connectivity', 'unreachable', 'latency', 'packet loss',
+            'interface', 'link down'
+        ]):
+            # Incident when the text contains failure/down/unreachable/critical
+            if any(error in issue_lower for error in ['failure', 'down', 'unreachable', 'critical']):
+                return (
+                    self.TICKET_CATEGORIES['NETWORK'],
+                    self.TICKET_TYPES['INCIDENT'],
+                    self.TICKET_TEMPLATES['NETWORK_TYPE']['NETWORK'],
+                    self.TICKET_TEMPLATES['TICKET_TYPE']['INCIDENT']
+                )
+            # Every other network match is a problem
+            else:
+                return (
+                    self.TICKET_CATEGORIES['NETWORK'],
+                    self.TICKET_TYPES['PROBLEM'],
+                    self.TICKET_TEMPLATES['NETWORK_TYPE']['NETWORK'],
+                    self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM']
+                )
+
+        # Default: Hardware Problem when no keyword group matched
+        return (
+            self.TICKET_CATEGORIES['HARDWARE'],
+            self.TICKET_TYPES['PROBLEM'],
+            self.TICKET_TEMPLATES['HARDWARE_TYPE']['HARDWARE'],
+            self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM']
+        )
+
    # =============================================================================
    # TICKET CREATION METHODS
    # =============================================================================
@@ -1256,24 +1361,16 @@ class SystemHealthMonitor:
        hostname = socket.gethostname()
        action_type = self.TICKET_TEMPLATES['ACTION_TYPE']
        environment = self.TICKET_TEMPLATES['ENVIRONMENT']
-        ticket_type = self.TICKET_TEMPLATES['TICKET_TYPE']
-        hardware_type = self.TICKET_TEMPLATES['HARDWARE_TYPE']
-        software_type = self.TICKET_TEMPLATES['SOFTWARE_TYPE']
+        scope = self.TICKET_TEMPLATES['SCOPE']['SINGLE_NODE']

        for issue in issues:
-            if issue.lower().startswith('critical') or 'critical' in issue.upper():
-                priority = self.PRIORITIES['CRITICAL']
-            elif issue.lower().startswith('warning') or 'warning' in issue.lower():
-                # all warnings become LOW priority (4)
-                priority = self.PRIORITIES['LOW']
-            else:
-                # everything else stays at MEDIUM (3)
-                priority = self.PRIORITIES['MEDIUM']
+            # Use the comprehensive priority determination function
+            priority = self._determine_ticket_priority(issue, health_report)

-            category = self.TICKET_TEMPLATES['DEFAULT_CATEGORY']
-            issue_type = self.TICKET_TEMPLATES['DEFAULT_ISSUE_TYPE']
-            scope = self.TICKET_TEMPLATES['SCOPE']['SINGLE_NODE']
+            # Get proper categorization for this issue
+            category, ticket_type, issue_tag, ticket_type_tag = self._categorize_issue(issue)

+            # Extract drive capacity if this is a drive-related issue
            drive_size = ""
            if "Drive" in issue and "/dev/" in issue:
                device_match = re.search(r'/dev/[a-zA-Z0-9]+', issue)
@@ -1285,21 +1382,16 @@ class SystemHealthMonitor:
                else:
                    logger.warning(f"Could not extract device from issue: {issue}")

-            # Determine if this is a hardware or software issue
-            issue_category = 'SOFTWARE' if 'LXC' in issue else 'HARDWARE'
-            
-            # Use the correct template based on issue category
-            category_template = hardware_type['HARDWARE'] if issue_category == 'HARDWARE' else software_type['SOFTWARE']
-
+            # Build ticket title with proper categorization
            ticket_title = (
                f"[{hostname}]"
                f"{action_type['AUTO']}"
-                f"{category_template}"
-                f"{drive_size}"  # Insert drive capacity here
+                f"{issue_tag}"
+                f"{drive_size}"
                f"{issue}"
                f"{scope}"
                f"{environment['PRODUCTION']}"
-                f"{ticket_type['MAINTENANCE']}"
+                f"{ticket_type_tag}"
            )
            description = self._generate_detailed_description(issue, health_report)

@@ -1309,7 +1401,7 @@ class SystemHealthMonitor:
                "priority": priority,
                "status": "Open",
                "category": category,
-                "type": issue_type
+                "type": ticket_type
            }

            if self.dry_run: