Upgrade priority system and fix ticket type alignment

- Add P5 (LOW) priority for informational/minimal impact alerts
- Expand ISSUE_PRIORITIES from 7 to 30 mappings covering P1-P5
- Fix TICKET_TYPES to match tinker_tickets API (Issue, Problem, Task,
  Maintenance, Upgrade, Install, Request)
- Restrict TICKET_CATEGORIES to the two valid categories: Hardware and Software
- Add P1 escalation logic via _count_critical_issues() helper
- Rewrite _determine_ticket_priority() with full P1-P5 support
- Add CONFIG options: INCLUDE_INFO_TICKETS, PRIORITY_ESCALATION_THRESHOLD
- Filter INFO-level alerts from ticket creation by default
- Update _categorize_issue() to use valid ticket types
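
The net effect of the new ladder is easiest to see against concrete alert strings. Below is a minimal free-standing sketch, not part of the commit: it mirrors the keyword ladder from _determine_ticket_priority in the diff that follows, with abbreviated keyword lists and without the _count_critical_issues() escalation path (which forces P1 whenever a report's critical count reaches PRIORITY_ESCALATION_THRESHOLD, default 3):

# Illustrative mirror of the new P1-P5 keyword ladder (abbreviated lists).
P1_KEYS = ('cluster', 'raid degraded', 'multiple drive', 'both networks unreachable')
P2_KEYS = ('smart failure', 'drive failure', 'disk failure', 'uncorrectable ecc',
           'hardware failure', 'critical temperature', 'network is unreachable')
P3_KEYS = ('warning', 'high temperature', 'correctable ecc', 'trend alert',
           'critical storage usage')
P4_KEYS = ('cpu usage', 'high storage usage', 'system log', 'drive age')
P5_KEYS = ('info:', 'info ', 'above optimal', 'monitor only')

def sketch_priority(issue: str) -> str:
    """Return '1'..'5' by first matching tier, as the real method does."""
    s = issue.lower()
    for keys, prio in ((P1_KEYS, '1'), (P2_KEYS, '2'), (P3_KEYS, '3'),
                       (P4_KEYS, '4'), (P5_KEYS, '5')):
        if any(k in s for k in keys):
            return prio
    return '3'  # unknown issues default to P3 (conservative)

assert sketch_priority('Cluster quorum lost') == '1'            # P1
assert sketch_priority('SMART failure on /dev/sda') == '2'      # P2
assert sketch_priority('High CPU usage on node3') == '4'        # P4
assert sketch_priority('Temperature INFO: above optimal') == '5' # P5

Against the real method, the same strings resolve identically so long as the health report carries fewer critical findings than the escalation threshold.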

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
commit 3322c5878a
parent 87b16ca822
Date:   2026-01-17 15:24:35 -05:00


@@ -23,20 +23,53 @@ class SystemHealthMonitor:
     STANDARD_WIDTH = 80
     PRIORITIES = {
-        'CRITICAL': '1',
-        'HIGH': '2',
-        'MEDIUM': '3',
-        'LOW': '4'
+        'CRITICAL': '1',  # P1 - Cluster outages, total system failure
+        'HIGH': '2',      # P2 - Hardware failures, same-day response
+        'MEDIUM': '3',    # P3 - Warnings, 1-3 day response
+        'NORMAL': '4',    # P4 - Standard monitoring alerts
+        'LOW': '5'        # P5 - Informational, minimal impact
     }
     ISSUE_PRIORITIES = {
+        # P1 - Critical System Issues (cluster-wide impact)
+        'CLUSTER_FAILURE': PRIORITIES['CRITICAL'],
+        'MULTIPLE_DRIVE_FAILURE': PRIORITIES['CRITICAL'],
+        'RAID_DEGRADED': PRIORITIES['CRITICAL'],
+        # P2 - Hardware Failures (same-day response)
         'SMART_FAILURE': PRIORITIES['HIGH'],
+        'SMART_CRITICAL': PRIORITIES['HIGH'],
         'DISK_CRITICAL': PRIORITIES['HIGH'],
-        'DISK_WARNING': PRIORITIES['MEDIUM'],
+        'UNCORRECTABLE_ECC': PRIORITIES['HIGH'],
+        'NETWORK_FAILURE': PRIORITIES['HIGH'],
+        'TEMPERATURE_CRITICAL': PRIORITIES['HIGH'],
+        'SSD_WEAR_CRITICAL': PRIORITIES['HIGH'],
+        'NVME_SPARE_CRITICAL': PRIORITIES['HIGH'],
+        'FIRMWARE_CRITICAL': PRIORITIES['HIGH'],
+        'REALLOCATED_SECTOR': PRIORITIES['HIGH'],
+        'PENDING_SECTOR': PRIORITIES['HIGH'],
+        # P3 - Warnings (1-3 day response)
         'SMART_WARNING': PRIORITIES['MEDIUM'],
+        'DISK_WARNING': PRIORITIES['MEDIUM'],
         'CORRECTABLE_ECC': PRIORITIES['MEDIUM'],
-        'CPU_HIGH': PRIORITIES['LOW'],
-        'NETWORK_FAILURE': PRIORITIES['HIGH']
+        'TEMPERATURE_WARNING': PRIORITIES['MEDIUM'],
+        'SSD_WEAR_WARNING': PRIORITIES['MEDIUM'],
+        'NVME_SPARE_WARNING': PRIORITIES['MEDIUM'],
+        'LXC_STORAGE_CRITICAL': PRIORITIES['MEDIUM'],
+        'TREND_ALERT': PRIORITIES['MEDIUM'],
+        # P4 - Normal Monitoring (standard response)
+        'CPU_HIGH': PRIORITIES['NORMAL'],
+        'LXC_STORAGE_WARNING': PRIORITIES['NORMAL'],
+        'SYSTEM_LOG_WARNING': PRIORITIES['NORMAL'],
+        'DRIVE_AGE_WARNING': PRIORITIES['NORMAL'],
+        # P5 - Informational (minimal impact)
+        'TEMPERATURE_INFO': PRIORITIES['LOW'],
+        'DRIVE_AGE_INFO': PRIORITIES['LOW'],
+        'SSD_WEAR_INFO': PRIORITIES['LOW'],
+        'SYSTEM_LOG_INFO': PRIORITIES['LOW']
     }
     CONFIG = {
@@ -69,7 +102,9 @@ class SystemHealthMonitor:
             r'.*/downloads.*'
         ],
         'HISTORY_DIR': '/var/log/hwmonDaemon',
-        'HISTORY_RETENTION_DAYS': 30
+        'HISTORY_RETENTION_DAYS': 30,
+        'INCLUDE_INFO_TICKETS': False,  # Set True to create P5 tickets for INFO alerts
+        'PRIORITY_ESCALATION_THRESHOLD': 3  # Number of criticals to trigger P1
     }

     @classmethod
@@ -116,10 +151,11 @@ class SystemHealthMonitor:
             'PRODUCTION': '[production]'
         },
         'TICKET_TYPE': {
-            'INCIDENT': '[incident]',
-            'MAINTENANCE': '[maintenance]',
-            'PROBLEM': '[problem]',
-            'TASK': '[task]'
+            'ISSUE': '[issue]',              # General issue (replaces invalid 'incident')
+            'PROBLEM': '[problem]',          # Root cause investigation
+            'TASK': '[task]',                # Planned work item
+            'MAINTENANCE': '[maintenance]',  # Scheduled/preventive work
+            'UPGRADE': '[upgrade]'           # Hardware/software upgrade
         },
         'HARDWARE_TYPE': {
             'HARDWARE': '[hardware]'
@@ -139,17 +175,17 @@ class SystemHealthMonitor:
     # Category and Type mappings for ticket API
     TICKET_CATEGORIES = {
         'HARDWARE': 'Hardware',
-        'SOFTWARE': 'Software',
-        'NETWORK': 'Network',
-        'SECURITY': 'Security',
-        'OTHER': 'Other'
+        'SOFTWARE': 'Software'
     }

     TICKET_TYPES = {
-        'INCIDENT': 'Incident',  # Unplanned interruption or service degradation
-        'REQUEST': 'Request',    # Service or information request
+        'ISSUE': 'Issue',              # General issue/incident
         'PROBLEM': 'Problem',          # Root cause investigation needed
-        'TASK': 'Task'  # Planned work item
+        'TASK': 'Task',                # Planned work item
+        'MAINTENANCE': 'Maintenance',  # Scheduled/preventive work
+        'UPGRADE': 'Upgrade',          # Hardware/software upgrade
+        'INSTALL': 'Install',          # New installation
+        'REQUEST': 'Request'           # Service or information request
     }

     PROBLEMATIC_FIRMWARE = {
@@ -1209,56 +1245,104 @@ class SystemHealthMonitor:
         return description

+    def _count_critical_issues(self, health_report: Dict[str, Any]) -> int:
+        """Count total critical issues across all health checks for P1 escalation."""
+        count = 0
+        # Count drive failures
+        for drive in health_report.get('drives_health', {}).get('drives', []):
+            if drive.get('smart_status') == 'UNHEALTHY':
+                count += 1
+            if any('critical' in issue.lower() for issue in drive.get('smart_issues', [])):
+                count += 1
+        # Count ECC errors
+        if health_report.get('memory_health', {}).get('status') == 'CRITICAL':
+            count += 1
+        # Count network failures
+        net = health_report.get('network_health', {})
+        if net.get('management_network', {}).get('status') == 'CRITICAL':
+            count += 1
+        if net.get('ceph_network', {}).get('status') == 'CRITICAL':
+            count += 1
+        # Count LXC critical issues
+        if health_report.get('lxc_health', {}).get('status') == 'CRITICAL':
+            count += 1
+        return count
+
     def _determine_ticket_priority(self, issue: str, health_report: Dict[str, Any]) -> str:
         """
-        Determine ticket priority based on issue type and severity.
-        P1 = Critical system outages (reserved for future major outages)
+        Determine ticket priority based on issue type, severity, and context.
+        P1 = Cluster outages, multiple simultaneous failures
         P2 = Hardware failures requiring same-day response
         P3 = Warnings requiring response within 1-3 days
-        P4 = Low priority monitoring alerts
+        P4 = Normal monitoring alerts
+        P5 = Informational/minimal impact
         """
         issue_lower = issue.lower()
-        # P1 - Reserved for major system outages (implement later)
-        # if 'cluster down' in issue_lower or 'total failure' in issue_lower:
-        #     return self.PRIORITIES['CRITICAL']  # P1
+        # Count total critical issues for escalation logic
+        critical_count = self._count_critical_issues(health_report)
+        escalation_threshold = self.CONFIG.get('PRIORITY_ESCALATION_THRESHOLD', 3)
+        # P1 - Multiple simultaneous critical failures (cluster risk)
+        if critical_count >= escalation_threshold:
+            logger.info(f"P1 escalation triggered: {critical_count} critical issues detected")
+            return self.PRIORITIES['CRITICAL']  # P1
+        # P1 - Specific cluster-affecting scenarios
+        if any(keyword in issue_lower for keyword in [
+            'cluster', 'raid degraded', 'multiple drive',
+            'both networks unreachable'
+        ]):
+            return self.PRIORITIES['CRITICAL']  # P1
         # P2 - Hardware failures requiring same-day response
         if any(keyword in issue_lower for keyword in [
-            'smart failure', 'drive failure', 'disk failure',
+            'smart failure', 'smart overall health check failed',
+            'drive failure', 'disk failure',
             'uncorrectable ecc', 'hardware failure',
             'critical temperature', 'firmware issue',
-            'reallocated sector', 'pending sector'
+            'reallocated_sector', 'pending_sector', 'offline_uncorrectable',
+            'critical available_spare', 'critical wear',
+            'critical reallocated', 'critical current_pending',
+            'network is unreachable'
         ]):
             return self.PRIORITIES['HIGH']  # P2
-        # P2 - SMART errors indicating potential drive failure
+        # P2 - SMART issues with critical indicators
         if 'smart issues' in issue_lower and any(error_type in issue_lower for error_type in [
-            'error', 'failed', 'reallocated', 'pending', 'uncorrectable'
-        ]):
-            return self.PRIORITIES['HIGH']  # P2
-        # P2 - Critical storage usage (>90%)
-        if 'critical storage usage' in issue_lower:
-            return self.PRIORITIES['HIGH']  # P2
-        # P2 - Network failures affecting cluster communication
-        if any(keyword in issue_lower for keyword in [
-            'network failure', 'unreachable', 'network down'
+            'critical', 'failed', 'reallocated', 'pending', 'uncorrectable', 'offline'
         ]):
             return self.PRIORITIES['HIGH']  # P2
         # P3 - Warnings requiring attention within days
         if any(keyword in issue_lower for keyword in [
-            'high temperature', 'high storage usage',
-            'correctable ecc', 'high cpu usage',
-            'warning'
+            'warning', 'high temperature', 'correctable ecc',
+            'trend alert', 'critical storage usage',
+            'low available_spare', 'high wear'
         ]):
             return self.PRIORITIES['MEDIUM']  # P3
-        # P4 - Low priority monitoring alerts
-        return self.PRIORITIES['LOW']  # P4
+        # P4 - Normal monitoring alerts
+        if any(keyword in issue_lower for keyword in [
+            'cpu usage', 'high storage usage',
+            'system log', 'drive age'
+        ]):
+            return self.PRIORITIES['NORMAL']  # P4
+        # P5 - Informational/minimal impact
+        if any(keyword in issue_lower for keyword in [
+            'info:', 'info ', 'above optimal', 'monitor only'
+        ]):
+            return self.PRIORITIES['LOW']  # P5
+        # Default to P3 for unknown issues (conservative approach)
+        return self.PRIORITIES['MEDIUM']

     def _categorize_issue(self, issue: str) -> tuple:
         """
@@ -1267,9 +1351,9 @@ class SystemHealthMonitor:
         Returns:
             tuple: (category, ticket_type, issue_tag, ticket_type_tag)
                 - category: 'Hardware', 'Software', 'Network', etc.
-                - ticket_type: 'Incident', 'Problem', 'Task', 'Request'
+                - ticket_type: 'Issue', 'Problem', 'Task', 'Maintenance', etc.
                 - issue_tag: '[hardware]', '[software]', '[network]'
-                - ticket_type_tag: '[incident]', '[problem]', etc.
+                - ticket_type_tag: '[issue]', '[problem]', etc.
         """
         issue_lower = issue.lower()
@@ -1277,15 +1361,16 @@ class SystemHealthMonitor:
         if any(keyword in issue_lower for keyword in [
             'smart', 'drive', 'disk', '/dev/', 'sector', 'temperature',
             'firmware', 'power_on_hours', 'reallocated', 'pending',
-            'ecc', 'memory', 'high_fly_writes', 'spin_retry', 'current_pending'
+            'ecc', 'memory', 'high_fly_writes', 'spin_retry', 'current_pending',
+            'nvme'
         ]):
-            # SMART errors are incidents (unplanned degradation)
+            # SMART errors/failures are issues (unplanned degradation)
             if any(error in issue_lower for error in ['critical', 'failed', 'failure', 'error']):
                 return (
                     self.TICKET_CATEGORIES['HARDWARE'],
-                    self.TICKET_TYPES['INCIDENT'],
+                    self.TICKET_TYPES['ISSUE'],
                     self.TICKET_TEMPLATES['HARDWARE_TYPE']['HARDWARE'],
-                    self.TICKET_TEMPLATES['TICKET_TYPE']['INCIDENT']
+                    self.TICKET_TEMPLATES['TICKET_TYPE']['ISSUE']
                 )
             # SMART warnings are problems (need investigation)
             else:
@@ -1301,13 +1386,13 @@ class SystemHealthMonitor:
             'lxc', 'container', 'storage usage', 'cpu usage', 'process',
             'application', 'service', 'daemon'
         ]):
-            # Critical storage/CPU is an incident (service degradation)
+            # Critical storage/CPU is an issue (service degradation)
            if 'critical' in issue_lower:
                 return (
                     self.TICKET_CATEGORIES['SOFTWARE'],
-                    self.TICKET_TYPES['INCIDENT'],
+                    self.TICKET_TYPES['ISSUE'],
                     self.TICKET_TEMPLATES['SOFTWARE_TYPE']['SOFTWARE'],
-                    self.TICKET_TEMPLATES['TICKET_TYPE']['INCIDENT']
+                    self.TICKET_TEMPLATES['TICKET_TYPE']['ISSUE']
                 )
             # Warning level is a problem (needs investigation before it becomes critical)
             else:
@@ -1318,23 +1403,23 @@ class SystemHealthMonitor:
                     self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM']
                 )

-        # Network Issues - Network connectivity/infrastructure
+        # Network Issues - Network connectivity/infrastructure (categorized as Hardware)
         if any(keyword in issue_lower for keyword in [
             'network', 'connectivity', 'unreachable', 'latency', 'packet loss',
             'interface', 'link down'
         ]):
-            # Network failures are incidents
+            # Network failures are issues
             if any(error in issue_lower for error in ['failure', 'down', 'unreachable', 'critical']):
                 return (
-                    self.TICKET_CATEGORIES['NETWORK'],
-                    self.TICKET_TYPES['INCIDENT'],
+                    self.TICKET_CATEGORIES['HARDWARE'],
+                    self.TICKET_TYPES['ISSUE'],
                     self.TICKET_TEMPLATES['NETWORK_TYPE']['NETWORK'],
-                    self.TICKET_TEMPLATES['TICKET_TYPE']['INCIDENT']
+                    self.TICKET_TEMPLATES['TICKET_TYPE']['ISSUE']
                 )
             # Network warnings are problems
             else:
                 return (
-                    self.TICKET_CATEGORIES['NETWORK'],
+                    self.TICKET_CATEGORIES['HARDWARE'],
                     self.TICKET_TYPES['PROBLEM'],
                     self.TICKET_TEMPLATES['NETWORK_TYPE']['NETWORK'],
                     self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM']
@@ -1517,7 +1602,22 @@ class SystemHealthMonitor:
         logger.info(f"CPU status: {health_report['cpu_health']['status']}")
         logger.info(f"Network status: {health_report['network_health']}")
         logger.info(f"System status: {health_report['system_health']['status']}")
-        logger.info(f"Detected issues: {issues}")
+        logger.info(f"Detected issues (pre-filter): {issues}")
+
+        # Filter out INFO-level issues unless configured to include them
+        if not self.CONFIG.get('INCLUDE_INFO_TICKETS', False):
+            actionable_issues = []
+            for issue in issues:
+                # Skip INFO-level issues (P5 candidates that shouldn't create tickets)
+                if any(info_marker in issue.lower() for info_marker in [
+                    'info:', 'info ', 'above optimal', 'monitor only'
+                ]):
+                    logger.debug(f"Filtering INFO-level issue: {issue}")
+                    continue
+                actionable_issues.append(issue)
+            issues = actionable_issues
+            logger.info(f"Filtered to actionable issues: {issues}")

         logger.info("=== Issue Detection Completed ===\n")
         return issues
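
For reference, the INFO filter in the final hunk reduces to the following free-standing sketch; filter_actionable and INFO_MARKERS are illustrative names, not identifiers from the codebase, and the daemon's in-tree version additionally logs each dropped issue at debug level:

# Standalone sketch of the new INFO filter: with INCLUDE_INFO_TICKETS left at its
# default of False, P5 candidates are dropped before ticket creation.
INFO_MARKERS = ('info:', 'info ', 'above optimal', 'monitor only')

def filter_actionable(issues, include_info=False):
    """Drop INFO-level alert strings unless configured to keep them."""
    if include_info:
        return list(issues)
    return [i for i in issues if not any(m in i.lower() for m in INFO_MARKERS)]

print(filter_actionable(['SMART failure on /dev/sda',
                         'Temperature INFO: 41C, above optimal']))
# -> ['SMART failure on /dev/sda']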