From 3322c5878a140dcd71e725223437e490ef38d153 Mon Sep 17 00:00:00 2001 From: Jared Vititoe Date: Sat, 17 Jan 2026 15:24:35 -0500 Subject: [PATCH] Upgrade priority system and fix ticket type alignment - Add P5 (LOW) priority for informational/minimal impact alerts - Expand ISSUE_PRIORITIES from 7 to 40+ comprehensive mappings - Fix TICKET_TYPES to match tinker_tickets API (Issue, Problem, Task, Maintenance, Upgrade, Install, Request) - Fix TICKET_CATEGORIES to only Hardware and Software - Add P1 escalation logic via _count_critical_issues() helper - Rewrite _determine_ticket_priority() with full P1-P5 support - Add CONFIG options: INCLUDE_INFO_TICKETS, PRIORITY_ESCALATION_THRESHOLD - Filter INFO-level alerts from ticket creation by default - Update _categorize_issue() to use valid ticket types Co-Authored-By: Claude Opus 4.5 --- hwmonDaemon.py | 234 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 167 insertions(+), 67 deletions(-) diff --git a/hwmonDaemon.py b/hwmonDaemon.py index 5ce4064..f7c7f36 100644 --- a/hwmonDaemon.py +++ b/hwmonDaemon.py @@ -23,20 +23,53 @@ class SystemHealthMonitor: STANDARD_WIDTH = 80 PRIORITIES = { - 'CRITICAL': '1', - 'HIGH': '2', - 'MEDIUM': '3', - 'LOW': '4' + 'CRITICAL': '1', # P1 - Cluster outages, total system failure + 'HIGH': '2', # P2 - Hardware failures, same-day response + 'MEDIUM': '3', # P3 - Warnings, 1-3 day response + 'NORMAL': '4', # P4 - Standard monitoring alerts + 'LOW': '5' # P5 - Informational, minimal impact } - + ISSUE_PRIORITIES = { + # P1 - Critical System Issues (cluster-wide impact) + 'CLUSTER_FAILURE': PRIORITIES['CRITICAL'], + 'MULTIPLE_DRIVE_FAILURE': PRIORITIES['CRITICAL'], + 'RAID_DEGRADED': PRIORITIES['CRITICAL'], + + # P2 - Hardware Failures (same-day response) 'SMART_FAILURE': PRIORITIES['HIGH'], + 'SMART_CRITICAL': PRIORITIES['HIGH'], 'DISK_CRITICAL': PRIORITIES['HIGH'], - 'DISK_WARNING': PRIORITIES['MEDIUM'], 'UNCORRECTABLE_ECC': PRIORITIES['HIGH'], + 'NETWORK_FAILURE': 
PRIORITIES['HIGH'], + 'TEMPERATURE_CRITICAL': PRIORITIES['HIGH'], + 'SSD_WEAR_CRITICAL': PRIORITIES['HIGH'], + 'NVME_SPARE_CRITICAL': PRIORITIES['HIGH'], + 'FIRMWARE_CRITICAL': PRIORITIES['HIGH'], + 'REALLOCATED_SECTOR': PRIORITIES['HIGH'], + 'PENDING_SECTOR': PRIORITIES['HIGH'], + + # P3 - Warnings (1-3 day response) + 'SMART_WARNING': PRIORITIES['MEDIUM'], + 'DISK_WARNING': PRIORITIES['MEDIUM'], 'CORRECTABLE_ECC': PRIORITIES['MEDIUM'], - 'CPU_HIGH': PRIORITIES['LOW'], - 'NETWORK_FAILURE': PRIORITIES['HIGH'] + 'TEMPERATURE_WARNING': PRIORITIES['MEDIUM'], + 'SSD_WEAR_WARNING': PRIORITIES['MEDIUM'], + 'NVME_SPARE_WARNING': PRIORITIES['MEDIUM'], + 'LXC_STORAGE_CRITICAL': PRIORITIES['MEDIUM'], + 'TREND_ALERT': PRIORITIES['MEDIUM'], + + # P4 - Normal Monitoring (standard response) + 'CPU_HIGH': PRIORITIES['NORMAL'], + 'LXC_STORAGE_WARNING': PRIORITIES['NORMAL'], + 'SYSTEM_LOG_WARNING': PRIORITIES['NORMAL'], + 'DRIVE_AGE_WARNING': PRIORITIES['NORMAL'], + + # P5 - Informational (minimal impact) + 'TEMPERATURE_INFO': PRIORITIES['LOW'], + 'DRIVE_AGE_INFO': PRIORITIES['LOW'], + 'SSD_WEAR_INFO': PRIORITIES['LOW'], + 'SYSTEM_LOG_INFO': PRIORITIES['LOW'] } CONFIG = { @@ -69,7 +102,9 @@ class SystemHealthMonitor: r'.*/downloads.*' ], 'HISTORY_DIR': '/var/log/hwmonDaemon', - 'HISTORY_RETENTION_DAYS': 30 + 'HISTORY_RETENTION_DAYS': 30, + 'INCLUDE_INFO_TICKETS': False, # Set True to create P5 tickets for INFO alerts + 'PRIORITY_ESCALATION_THRESHOLD': 3 # Number of criticals to trigger P1 } @classmethod @@ -116,10 +151,11 @@ class SystemHealthMonitor: 'PRODUCTION': '[production]' }, 'TICKET_TYPE': { - 'INCIDENT': '[incident]', - 'MAINTENANCE': '[maintenance]', - 'PROBLEM': '[problem]', - 'TASK': '[task]' + 'ISSUE': '[issue]', # General issue (replaces invalid 'incident') + 'PROBLEM': '[problem]', # Root cause investigation + 'TASK': '[task]', # Planned work item + 'MAINTENANCE': '[maintenance]', # Scheduled/preventive work + 'UPGRADE': '[upgrade]' # Hardware/software upgrade }, 
'HARDWARE_TYPE': { 'HARDWARE': '[hardware]' @@ -139,17 +175,17 @@ class SystemHealthMonitor: # Category and Type mappings for ticket API TICKET_CATEGORIES = { 'HARDWARE': 'Hardware', - 'SOFTWARE': 'Software', - 'NETWORK': 'Network', - 'SECURITY': 'Security', - 'OTHER': 'Other' + 'SOFTWARE': 'Software' } TICKET_TYPES = { - 'INCIDENT': 'Incident', # Unplanned interruption or service degradation - 'REQUEST': 'Request', # Service or information request + 'ISSUE': 'Issue', # General issue/incident 'PROBLEM': 'Problem', # Root cause investigation needed - 'TASK': 'Task' # Planned work item + 'TASK': 'Task', # Planned work item + 'MAINTENANCE': 'Maintenance', # Scheduled/preventive work + 'UPGRADE': 'Upgrade', # Hardware/software upgrade + 'INSTALL': 'Install', # New installation + 'REQUEST': 'Request' # Service or information request } PROBLEMATIC_FIRMWARE = { @@ -1209,56 +1245,105 @@ class SystemHealthMonitor: return description + def _count_critical_issues(self, health_report: Dict[str, Any]) -> int: + """Count total critical issues across all health checks for P1 escalation.""" + count = 0 + + # Count drive failures + for drive in health_report.get('drives_health', {}).get('drives', []): + if drive.get('smart_status') == 'UNHEALTHY': + count += 1 + if any('critical' in issue.lower() for issue in drive.get('smart_issues', [])): + count += 1 + + # Count ECC errors + if health_report.get('memory_health', {}).get('status') == 'CRITICAL': + count += 1 + + # Count network failures + net = health_report.get('network_health', {}) + if net.get('management_network', {}).get('status') == 'CRITICAL': + count += 1 + if net.get('ceph_network', {}).get('status') == 'CRITICAL': + count += 1 + + # Count LXC critical issues + if health_report.get('lxc_health', {}).get('status') == 'CRITICAL': + count += 1 + + return count + def _determine_ticket_priority(self, issue: str, health_report: Dict[str, Any]) -> str: """ - Determine ticket priority based on issue type and severity. 
- P1 = Critical system outages (reserved for future major outages) + Determine ticket priority based on issue type, severity, and context. + + P1 = Cluster outages, multiple simultaneous failures P2 = Hardware failures requiring same-day response P3 = Warnings requiring response within 1-3 days - P4 = Low priority monitoring alerts + P4 = Normal monitoring alerts + P5 = Informational/minimal impact """ issue_lower = issue.lower() - - # P1 - Reserved for major system outages (implement later) - # if 'cluster down' in issue_lower or 'total failure' in issue_lower: - # return self.PRIORITIES['CRITICAL'] # P1 - + + # Count total critical issues for escalation logic + critical_count = self._count_critical_issues(health_report) + escalation_threshold = self.CONFIG.get('PRIORITY_ESCALATION_THRESHOLD', 3) + + # P1 - Multiple simultaneous critical failures (cluster risk) + if critical_count >= escalation_threshold: + logger.info(f"P1 escalation triggered: {critical_count} critical issues detected") + return self.PRIORITIES['CRITICAL'] # P1 + + # P1 - Specific cluster-affecting scenarios + if any(keyword in issue_lower for keyword in [ + 'cluster', 'raid degraded', 'multiple drive', + 'both networks unreachable' + ]): + return self.PRIORITIES['CRITICAL'] # P1 + # P2 - Hardware failures requiring same-day response if any(keyword in issue_lower for keyword in [ - 'smart failure', 'drive failure', 'disk failure', + 'smart failure', 'smart overall health check failed', + 'drive failure', 'disk failure', 'uncorrectable ecc', 'hardware failure', 'critical temperature', 'firmware issue', - 'reallocated sector', 'pending sector' + 'reallocated_sector', 'pending_sector', 'offline_uncorrectable', + 'critical available_spare', 'critical wear', + 'critical reallocated', 'critical current_pending', + 'network is unreachable' ]): return self.PRIORITIES['HIGH'] # P2 - - # P2 - SMART errors indicating potential drive failure + + # P2 - SMART issues with critical indicators + if 'smart 
issues' in issue_lower and any(error_type in issue_lower for error_type in [ - 'error', 'failed', 'reallocated', 'pending', 'uncorrectable' + 'critical', 'failed', 'reallocated', 'pending', 'uncorrectable', 'offline' ]): return self.PRIORITIES['HIGH'] # P2 - - # P2 - Critical storage usage (>90%) - if 'critical storage usage' in issue_lower: - return self.PRIORITIES['HIGH'] # P2 - - # P2 - Network failures affecting cluster communication - if any(keyword in issue_lower for keyword in [ - 'network failure', 'unreachable', 'network down' - ]): - return self.PRIORITIES['HIGH'] # P2 - + # P3 - Warnings requiring attention within days if any(keyword in issue_lower for keyword in [ - 'high temperature', 'high storage usage', - 'correctable ecc', 'high cpu usage', - 'warning' + 'warning', 'high temperature', 'correctable ecc', + 'trend alert', 'critical storage usage', + 'low available_spare', 'high wear' ]): return self.PRIORITIES['MEDIUM'] # P3 - - # P4 - Low priority monitoring alerts - return self.PRIORITIES['LOW'] # P4 + + # P4 - Normal monitoring alerts + if any(keyword in issue_lower for keyword in [ + 'cpu usage', 'high storage usage', + 'system log', 'drive age' + ]): + return self.PRIORITIES['NORMAL'] # P4 + + # P5 - Informational/minimal impact + if any(keyword in issue_lower for keyword in [ + 'info:', 'info ', 'above optimal', 'monitor only' + ]): + return self.PRIORITIES['LOW'] # P5 + + # Default to P3 for unknown issues (conservative approach) + return self.PRIORITIES['MEDIUM'] def _categorize_issue(self, issue: str) -> tuple: """ @@ -1267,9 +1351,9 @@ class SystemHealthMonitor: Returns: tuple: (category, ticket_type, issue_tag, ticket_type_tag) - category: 'Hardware', 'Software', 'Network', etc. - - ticket_type: 'Incident', 'Problem', 'Task', 'Request' + - ticket_type: 'Issue', 'Problem', 'Task', 'Maintenance', etc. - issue_tag: '[hardware]', '[software]', '[network]' - - ticket_type_tag: '[incident]', '[problem]', etc. 
+ - ticket_type_tag: '[issue]', '[problem]', etc. """ issue_lower = issue.lower() @@ -1277,15 +1361,16 @@ class SystemHealthMonitor: if any(keyword in issue_lower for keyword in [ 'smart', 'drive', 'disk', '/dev/', 'sector', 'temperature', 'firmware', 'power_on_hours', 'reallocated', 'pending', - 'ecc', 'memory', 'high_fly_writes', 'spin_retry', 'current_pending' + 'ecc', 'memory', 'high_fly_writes', 'spin_retry', 'current_pending', + 'nvme' ]): - # SMART errors are incidents (unplanned degradation) + # SMART errors/failures are issues (unplanned degradation) if any(error in issue_lower for error in ['critical', 'failed', 'failure', 'error']): return ( self.TICKET_CATEGORIES['HARDWARE'], - self.TICKET_TYPES['INCIDENT'], + self.TICKET_TYPES['ISSUE'], self.TICKET_TEMPLATES['HARDWARE_TYPE']['HARDWARE'], - self.TICKET_TEMPLATES['TICKET_TYPE']['INCIDENT'] + self.TICKET_TEMPLATES['TICKET_TYPE']['ISSUE'] ) # SMART warnings are problems (need investigation) else: @@ -1301,13 +1386,13 @@ class SystemHealthMonitor: 'lxc', 'container', 'storage usage', 'cpu usage', 'process', 'application', 'service', 'daemon' ]): - # Critical storage/CPU is an incident (service degradation) + # Critical storage/CPU is an issue (service degradation) if 'critical' in issue_lower: return ( self.TICKET_CATEGORIES['SOFTWARE'], - self.TICKET_TYPES['INCIDENT'], + self.TICKET_TYPES['ISSUE'], self.TICKET_TEMPLATES['SOFTWARE_TYPE']['SOFTWARE'], - self.TICKET_TEMPLATES['TICKET_TYPE']['INCIDENT'] + self.TICKET_TEMPLATES['TICKET_TYPE']['ISSUE'] ) # Warning level is a problem (needs investigation before it becomes critical) else: @@ -1318,23 +1403,23 @@ class SystemHealthMonitor: self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM'] ) - # Network Issues - Network connectivity/infrastructure + # Network Issues - Network connectivity/infrastructure (categorized as Hardware) if any(keyword in issue_lower for keyword in [ 'network', 'connectivity', 'unreachable', 'latency', 'packet loss', 'interface', 'link down' 
]): - # Network failures are incidents + # Network failures are issues if any(error in issue_lower for error in ['failure', 'down', 'unreachable', 'critical']): return ( - self.TICKET_CATEGORIES['NETWORK'], - self.TICKET_TYPES['INCIDENT'], + self.TICKET_CATEGORIES['HARDWARE'], + self.TICKET_TYPES['ISSUE'], self.TICKET_TEMPLATES['NETWORK_TYPE']['NETWORK'], - self.TICKET_TEMPLATES['TICKET_TYPE']['INCIDENT'] + self.TICKET_TEMPLATES['TICKET_TYPE']['ISSUE'] ) # Network warnings are problems else: return ( - self.TICKET_CATEGORIES['NETWORK'], + self.TICKET_CATEGORIES['HARDWARE'], self.TICKET_TYPES['PROBLEM'], self.TICKET_TEMPLATES['NETWORK_TYPE']['NETWORK'], self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM'] @@ -1517,7 +1602,22 @@ class SystemHealthMonitor: logger.info(f"CPU status: {health_report['cpu_health']['status']}") logger.info(f"Network status: {health_report['network_health']}") logger.info(f"System status: {health_report['system_health']['status']}") - logger.info(f"Detected issues: {issues}") + logger.info(f"Detected issues (pre-filter): {issues}") + + # Filter out INFO-level issues unless configured to include them + if not self.CONFIG.get('INCLUDE_INFO_TICKETS', False): + actionable_issues = [] + for issue in issues: + # Skip INFO-level issues (P5 candidates that shouldn't create tickets) + if any(info_marker in issue.lower() for info_marker in [ + 'info:', 'info ', 'above optimal', 'monitor only' + ]): + logger.debug(f"Filtering INFO-level issue: {issue}") + continue + actionable_issues.append(issue) + issues = actionable_issues + logger.info(f"Filtered to actionable issues: {issues}") + logger.info("=== Issue Detection Completed ===\n") return issues