diff --git a/hwmonDaemon.py b/hwmonDaemon.py
index 4adab5c..5ce4064 100644
--- a/hwmonDaemon.py
+++ b/hwmonDaemon.py
@@ -116,7 +116,10 @@ class SystemHealthMonitor:
             'PRODUCTION': '[production]'
         },
         'TICKET_TYPE': {
-            'MAINTENANCE': '[maintenance]'
+            'INCIDENT': '[incident]',
+            'MAINTENANCE': '[maintenance]',
+            'PROBLEM': '[problem]',
+            'TASK': '[task]'
         },
         'HARDWARE_TYPE': {
             'HARDWARE': '[hardware]'
@@ -130,9 +133,23 @@ class SystemHealthMonitor:
         'SCOPE': {
             'SINGLE_NODE': '[single-node]',
             'CLUSTER_WIDE': '[cluster-wide]'
-        },
-        'DEFAULT_CATEGORY': 'Hardware',
-        'DEFAULT_ISSUE_TYPE': 'Problem'
+        }
+    }
+
+    # Category and Type mappings for ticket API
+    TICKET_CATEGORIES = {
+        'HARDWARE': 'Hardware',
+        'SOFTWARE': 'Software',
+        'NETWORK': 'Network',
+        'SECURITY': 'Security',
+        'OTHER': 'Other'
+    }
+
+    TICKET_TYPES = {
+        'INCIDENT': 'Incident',  # Unplanned interruption or service degradation
+        'REQUEST': 'Request',    # Service or information request
+        'PROBLEM': 'Problem',    # Root cause investigation needed
+        'TASK': 'Task'           # Planned work item
     }
 
     PROBLEMATIC_FIRMWARE = {
@@ -1243,6 +1260,106 @@ class SystemHealthMonitor:
         # P4 - Low priority monitoring alerts
         return self.PRIORITIES['LOW']  # P4
 
+    def _categorize_issue(self, issue: str) -> tuple:
+        """
+        Determine the correct category, type, and tags for an issue.
+
+        Matching is by case-insensitive substring, checked in the order
+        hardware, software, network; the first matching group wins.
+        Container (LXC) issues are always treated as software, even when
+        the message also mentions a disk, device path or memory.
+
+        Args:
+            issue: Issue description string to classify.
+
+        Returns:
+            tuple: (category, ticket_type, issue_tag, ticket_type_tag)
+                - category: 'Hardware', 'Software', 'Network', etc.
+                - ticket_type: 'Incident', 'Problem', 'Task', 'Request'
+                - issue_tag: '[hardware]', '[software]', '[network]'
+                - ticket_type_tag: '[incident]', '[problem]', etc.
+        """
+        issue_lower = issue.lower()
+
+        # Container (LXC) issues are software even when the message also
+        # names a disk or device path; keep them out of the hardware scan.
+        is_container = 'lxc' in issue_lower or 'container' in issue_lower
+
+        # Hardware Issues - Physical hardware problems
+        if not is_container and any(keyword in issue_lower for keyword in [
+            'smart', 'drive', 'disk', '/dev/', 'sector', 'temperature',
+            'firmware', 'power_on_hours', 'reallocated', 'pending',
+            'ecc', 'memory', 'high_fly_writes', 'spin_retry', 'current_pending'
+        ]):
+            # SMART errors are incidents (unplanned degradation)
+            if any(error in issue_lower for error in ['critical', 'failed', 'failure', 'error']):
+                return (
+                    self.TICKET_CATEGORIES['HARDWARE'],
+                    self.TICKET_TYPES['INCIDENT'],
+                    self.TICKET_TEMPLATES['HARDWARE_TYPE']['HARDWARE'],
+                    self.TICKET_TEMPLATES['TICKET_TYPE']['INCIDENT']
+                )
+            # SMART warnings are problems (need investigation)
+            else:
+                return (
+                    self.TICKET_CATEGORIES['HARDWARE'],
+                    self.TICKET_TYPES['PROBLEM'],
+                    self.TICKET_TEMPLATES['HARDWARE_TYPE']['HARDWARE'],
+                    self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM']
+                )
+
+        # Software Issues - Application/OS/Container issues
+        if any(keyword in issue_lower for keyword in [
+            'lxc', 'container', 'storage usage', 'cpu usage', 'process',
+            'application', 'service', 'daemon'
+        ]):
+            # Critical storage/CPU is an incident (service degradation)
+            if 'critical' in issue_lower:
+                return (
+                    self.TICKET_CATEGORIES['SOFTWARE'],
+                    self.TICKET_TYPES['INCIDENT'],
+                    self.TICKET_TEMPLATES['SOFTWARE_TYPE']['SOFTWARE'],
+                    self.TICKET_TEMPLATES['TICKET_TYPE']['INCIDENT']
+                )
+            # Warning level is a problem (needs investigation before it becomes critical)
+            else:
+                return (
+                    self.TICKET_CATEGORIES['SOFTWARE'],
+                    self.TICKET_TYPES['PROBLEM'],
+                    self.TICKET_TEMPLATES['SOFTWARE_TYPE']['SOFTWARE'],
+                    self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM']
+                )
+
+        # Network Issues - Network connectivity/infrastructure
+        if any(keyword in issue_lower for keyword in [
+            'network', 'connectivity', 'unreachable', 'latency', 'packet loss',
+            'interface', 'link down'
+        ]):
+            # Network failures are incidents
+            if any(error in issue_lower for error in ['failure', 'down', 'unreachable', 'critical']):
+                return (
+                    self.TICKET_CATEGORIES['NETWORK'],
+                    self.TICKET_TYPES['INCIDENT'],
+                    self.TICKET_TEMPLATES['NETWORK_TYPE']['NETWORK'],
+                    self.TICKET_TEMPLATES['TICKET_TYPE']['INCIDENT']
+                )
+            # Network warnings are problems
+            else:
+                return (
+                    self.TICKET_CATEGORIES['NETWORK'],
+                    self.TICKET_TYPES['PROBLEM'],
+                    self.TICKET_TEMPLATES['NETWORK_TYPE']['NETWORK'],
+                    self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM']
+                )
+
+        # Default: Hardware Problem (for undefined cases)
+        return (
+            self.TICKET_CATEGORIES['HARDWARE'],
+            self.TICKET_TYPES['PROBLEM'],
+            self.TICKET_TEMPLATES['HARDWARE_TYPE']['HARDWARE'],
+            self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM']
+        )
+
     # =============================================================================
     # TICKET CREATION METHODS
     # =============================================================================
@@ -1256,24 +1373,16 @@ class SystemHealthMonitor:
         hostname = socket.gethostname()
         action_type = self.TICKET_TEMPLATES['ACTION_TYPE']
         environment = self.TICKET_TEMPLATES['ENVIRONMENT']
-        ticket_type = self.TICKET_TEMPLATES['TICKET_TYPE']
-        hardware_type = self.TICKET_TEMPLATES['HARDWARE_TYPE']
-        software_type = self.TICKET_TEMPLATES['SOFTWARE_TYPE']
+        scope = self.TICKET_TEMPLATES['SCOPE']['SINGLE_NODE']
 
         for issue in issues:
-            if issue.lower().startswith('critical') or 'critical' in issue.upper():
-                priority = self.PRIORITIES['CRITICAL']
-            elif issue.lower().startswith('warning') or 'warning' in issue.lower():
-                # all warnings become LOW priority (4)
-                priority = self.PRIORITIES['LOW']
-            else:
-                # everything else stays at MEDIUM (3)
-                priority = self.PRIORITIES['MEDIUM']
+            # Use the comprehensive priority determination function
+            priority = self._determine_ticket_priority(issue, health_report)
 
-            category = self.TICKET_TEMPLATES['DEFAULT_CATEGORY']
-            issue_type = self.TICKET_TEMPLATES['DEFAULT_ISSUE_TYPE']
-            scope = self.TICKET_TEMPLATES['SCOPE']['SINGLE_NODE']
+            # Get proper categorization for this issue
+            category, ticket_type, issue_tag, ticket_type_tag = self._categorize_issue(issue)
 
+            # Extract drive capacity if this is a drive-related issue
             drive_size = ""
             if "Drive" in issue and "/dev/" in issue:
                 device_match = re.search(r'/dev/[a-zA-Z0-9]+', issue)
@@ -1285,21 +1394,16 @@ class SystemHealthMonitor:
                 else:
                     logger.warning(f"Could not extract device from issue: {issue}")
 
-            # Determine if this is a hardware or software issue
-            issue_category = 'SOFTWARE' if 'LXC' in issue else 'HARDWARE'
-
-            # Use the correct template based on issue category
-            category_template = hardware_type['HARDWARE'] if issue_category == 'HARDWARE' else software_type['SOFTWARE']
-
+            # Build ticket title with proper categorization
             ticket_title = (
                 f"[{hostname}]"
                 f"{action_type['AUTO']}"
-                f"{category_template}"
-                f"{drive_size}"  # Insert drive capacity here
+                f"{issue_tag}"
+                f"{drive_size}"
                 f"{issue}"
                 f"{scope}"
                 f"{environment['PRODUCTION']}"
-                f"{ticket_type['MAINTENANCE']}"
+                f"{ticket_type_tag}"
             )
 
             description = self._generate_detailed_description(issue, health_report)
@@ -1309,7 +1413,7 @@ class SystemHealthMonitor:
                 "priority": priority,
                 "status": "Open",
                 "category": category,
-                "type": issue_type
+                "type": ticket_type
             }
 
             if self.dry_run: