Upgrade priority system and fix ticket type alignment

- Add P5 (LOW) priority for informational/minimal-impact alerts
- Expand ISSUE_PRIORITIES from 7 to 30 comprehensive mappings
- Fix TICKET_TYPES to match the tinker_tickets API (Issue, Problem, Task,
  Maintenance, Upgrade, Install, Request)
- Restrict TICKET_CATEGORIES to Hardware and Software
- Add P1 escalation logic via a _count_critical_issues() helper
- Rewrite _determine_ticket_priority() with full P1-P5 support
- Add CONFIG options: INCLUDE_INFO_TICKETS, PRIORITY_ESCALATION_THRESHOLD
- Filter INFO-level alerts out of ticket creation by default
- Update _categorize_issue() to use valid ticket types

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
commit 3322c5878a
parent 87b16ca822
Date:   2026-01-17 15:24:35 -05:00

@@ -23,20 +23,53 @@ class SystemHealthMonitor:
     STANDARD_WIDTH = 80
     PRIORITIES = {
-        'CRITICAL': '1',
-        'HIGH': '2',
-        'MEDIUM': '3',
-        'LOW': '4'
+        'CRITICAL': '1',  # P1 - Cluster outages, total system failure
+        'HIGH': '2',      # P2 - Hardware failures, same-day response
+        'MEDIUM': '3',    # P3 - Warnings, 1-3 day response
+        'NORMAL': '4',    # P4 - Standard monitoring alerts
+        'LOW': '5'        # P5 - Informational, minimal impact
     }
     ISSUE_PRIORITIES = {
+        # P1 - Critical System Issues (cluster-wide impact)
+        'CLUSTER_FAILURE': PRIORITIES['CRITICAL'],
+        'MULTIPLE_DRIVE_FAILURE': PRIORITIES['CRITICAL'],
+        'RAID_DEGRADED': PRIORITIES['CRITICAL'],
+        # P2 - Hardware Failures (same-day response)
         'SMART_FAILURE': PRIORITIES['HIGH'],
+        'SMART_CRITICAL': PRIORITIES['HIGH'],
         'DISK_CRITICAL': PRIORITIES['HIGH'],
-        'DISK_WARNING': PRIORITIES['MEDIUM'],
         'UNCORRECTABLE_ECC': PRIORITIES['HIGH'],
+        'NETWORK_FAILURE': PRIORITIES['HIGH'],
+        'TEMPERATURE_CRITICAL': PRIORITIES['HIGH'],
+        'SSD_WEAR_CRITICAL': PRIORITIES['HIGH'],
+        'NVME_SPARE_CRITICAL': PRIORITIES['HIGH'],
+        'FIRMWARE_CRITICAL': PRIORITIES['HIGH'],
+        'REALLOCATED_SECTOR': PRIORITIES['HIGH'],
+        'PENDING_SECTOR': PRIORITIES['HIGH'],
+        # P3 - Warnings (1-3 day response)
+        'SMART_WARNING': PRIORITIES['MEDIUM'],
+        'DISK_WARNING': PRIORITIES['MEDIUM'],
         'CORRECTABLE_ECC': PRIORITIES['MEDIUM'],
-        'CPU_HIGH': PRIORITIES['LOW'],
-        'NETWORK_FAILURE': PRIORITIES['HIGH']
+        'TEMPERATURE_WARNING': PRIORITIES['MEDIUM'],
+        'SSD_WEAR_WARNING': PRIORITIES['MEDIUM'],
+        'NVME_SPARE_WARNING': PRIORITIES['MEDIUM'],
+        'LXC_STORAGE_CRITICAL': PRIORITIES['MEDIUM'],
+        'TREND_ALERT': PRIORITIES['MEDIUM'],
+        # P4 - Normal Monitoring (standard response)
+        'CPU_HIGH': PRIORITIES['NORMAL'],
+        'LXC_STORAGE_WARNING': PRIORITIES['NORMAL'],
+        'SYSTEM_LOG_WARNING': PRIORITIES['NORMAL'],
+        'DRIVE_AGE_WARNING': PRIORITIES['NORMAL'],
+        # P5 - Informational (minimal impact)
+        'TEMPERATURE_INFO': PRIORITIES['LOW'],
+        'DRIVE_AGE_INFO': PRIORITIES['LOW'],
+        'SSD_WEAR_INFO': PRIORITIES['LOW'],
+        'SYSTEM_LOG_INFO': PRIORITIES['LOW']
     }
     CONFIG = {
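
For illustration, the expanded map resolves named issue classes directly to
tinker_tickets priority strings (a minimal sketch, assuming only the class
constants above):

    >>> SystemHealthMonitor.ISSUE_PRIORITIES['RAID_DEGRADED']
    '1'
    >>> SystemHealthMonitor.ISSUE_PRIORITIES['NVME_SPARE_WARNING']
    '3'
    >>> SystemHealthMonitor.ISSUE_PRIORITIES['DRIVE_AGE_INFO']
    '5'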
@@ -69,7 +102,9 @@ class SystemHealthMonitor:
             r'.*/downloads.*'
         ],
         'HISTORY_DIR': '/var/log/hwmonDaemon',
-        'HISTORY_RETENTION_DAYS': 30
+        'HISTORY_RETENTION_DAYS': 30,
+        'INCLUDE_INFO_TICKETS': False,  # Set True to create P5 tickets for INFO alerts
+        'PRIORITY_ESCALATION_THRESHOLD': 3  # Number of criticals to trigger P1
     }

     @classmethod
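
Both knobs are read with .get() and safe defaults later in the diff, so a
site can tune them without touching the dispatch logic. A hypothetical
override (not part of this commit):

    # Hypothetical site tuning: escalate to P1 after two simultaneous
    # criticals, and let INFO alerts open P5 tickets.
    SystemHealthMonitor.CONFIG['PRIORITY_ESCALATION_THRESHOLD'] = 2
    SystemHealthMonitor.CONFIG['INCLUDE_INFO_TICKETS'] = True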
@@ -116,10 +151,11 @@ class SystemHealthMonitor:
             'PRODUCTION': '[production]'
         },
         'TICKET_TYPE': {
-            'INCIDENT': '[incident]',
-            'MAINTENANCE': '[maintenance]',
-            'PROBLEM': '[problem]',
-            'TASK': '[task]'
+            'ISSUE': '[issue]',              # General issue (replaces invalid 'incident')
+            'PROBLEM': '[problem]',          # Root cause investigation
+            'TASK': '[task]',                # Planned work item
+            'MAINTENANCE': '[maintenance]',  # Scheduled/preventive work
+            'UPGRADE': '[upgrade]'           # Hardware/software upgrade
         },
         'HARDWARE_TYPE': {
             'HARDWARE': '[hardware]'
@@ -139,17 +175,17 @@ class SystemHealthMonitor:
     # Category and Type mappings for ticket API
     TICKET_CATEGORIES = {
         'HARDWARE': 'Hardware',
-        'SOFTWARE': 'Software',
-        'NETWORK': 'Network',
-        'SECURITY': 'Security',
-        'OTHER': 'Other'
+        'SOFTWARE': 'Software'
     }
     TICKET_TYPES = {
-        'INCIDENT': 'Incident',  # Unplanned interruption or service degradation
-        'REQUEST': 'Request',    # Service or information request
-        'PROBLEM': 'Problem',    # Root cause investigation needed
-        'TASK': 'Task'           # Planned work item
+        'ISSUE': 'Issue',              # General issue/incident
+        'PROBLEM': 'Problem',          # Root cause investigation needed
+        'TASK': 'Task',                # Planned work item
+        'MAINTENANCE': 'Maintenance',  # Scheduled/preventive work
+        'UPGRADE': 'Upgrade',          # Hardware/software upgrade
+        'INSTALL': 'Install',          # New installation
+        'REQUEST': 'Request'           # Service or information request
     }
     PROBLEMATIC_FIRMWARE = {
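
The diff does not show the tinker_tickets request itself; purely as an
illustration (field names hypothetical), a ticket built from these mappings
might look like:

    payload = {
        'type': SystemHealthMonitor.TICKET_TYPES['ISSUE'],              # 'Issue'
        'category': SystemHealthMonitor.TICKET_CATEGORIES['HARDWARE'],  # 'Hardware'
        'priority': SystemHealthMonitor.PRIORITIES['HIGH'],             # '2'
        'subject': "[hardware] [issue] SMART failure on /dev/sda",
    }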
@@ -1209,56 +1245,104 @@ class SystemHealthMonitor:
         return description

+    def _count_critical_issues(self, health_report: Dict[str, Any]) -> int:
+        """Count total critical issues across all health checks for P1 escalation."""
+        count = 0
+        # Count drive failures
+        for drive in health_report.get('drives_health', {}).get('drives', []):
+            if drive.get('smart_status') == 'UNHEALTHY':
+                count += 1
+            if any('critical' in issue.lower() for issue in drive.get('smart_issues', [])):
+                count += 1
+        # Count ECC errors
+        if health_report.get('memory_health', {}).get('status') == 'CRITICAL':
+            count += 1
+        # Count network failures
+        net = health_report.get('network_health', {})
+        if net.get('management_network', {}).get('status') == 'CRITICAL':
+            count += 1
+        if net.get('ceph_network', {}).get('status') == 'CRITICAL':
+            count += 1
+        # Count LXC critical issues
+        if health_report.get('lxc_health', {}).get('status') == 'CRITICAL':
+            count += 1
+        return count
+
     def _determine_ticket_priority(self, issue: str, health_report: Dict[str, Any]) -> str:
         """
-        Determine ticket priority based on issue type and severity.
+        Determine ticket priority based on issue type, severity, and context.

-        P1 = Critical system outages (reserved for future major outages)
+        P1 = Cluster outages, multiple simultaneous failures
         P2 = Hardware failures requiring same-day response
         P3 = Warnings requiring response within 1-3 days
-        P4 = Low priority monitoring alerts
+        P4 = Normal monitoring alerts
+        P5 = Informational/minimal impact
         """
         issue_lower = issue.lower()

-        # P1 - Reserved for major system outages (implement later)
-        # if 'cluster down' in issue_lower or 'total failure' in issue_lower:
-        #     return self.PRIORITIES['CRITICAL']  # P1
+        # Count total critical issues for escalation logic
+        critical_count = self._count_critical_issues(health_report)
+        escalation_threshold = self.CONFIG.get('PRIORITY_ESCALATION_THRESHOLD', 3)
+
+        # P1 - Multiple simultaneous critical failures (cluster risk)
+        if critical_count >= escalation_threshold:
+            logger.info(f"P1 escalation triggered: {critical_count} critical issues detected")
+            return self.PRIORITIES['CRITICAL']  # P1
+
+        # P1 - Specific cluster-affecting scenarios
+        if any(keyword in issue_lower for keyword in [
+            'cluster', 'raid degraded', 'multiple drive',
+            'both networks unreachable'
+        ]):
+            return self.PRIORITIES['CRITICAL']  # P1

         # P2 - Hardware failures requiring same-day response
         if any(keyword in issue_lower for keyword in [
-            'smart failure', 'drive failure', 'disk failure',
+            'smart failure', 'smart overall health check failed',
+            'drive failure', 'disk failure',
             'uncorrectable ecc', 'hardware failure',
             'critical temperature', 'firmware issue',
-            'reallocated sector', 'pending sector'
+            'reallocated_sector', 'pending_sector', 'offline_uncorrectable',
+            'critical available_spare', 'critical wear',
+            'critical reallocated', 'critical current_pending',
+            'network is unreachable'
         ]):
             return self.PRIORITIES['HIGH']  # P2

-        # P2 - SMART errors indicating potential drive failure
+        # P2 - SMART issues with critical indicators
         if 'smart issues' in issue_lower and any(error_type in issue_lower for error_type in [
-            'error', 'failed', 'reallocated', 'pending', 'uncorrectable'
+            'critical', 'failed', 'reallocated', 'pending', 'uncorrectable', 'offline'
         ]):
             return self.PRIORITIES['HIGH']  # P2

+        # P2 - Critical storage usage (>90%)
+        if 'critical storage usage' in issue_lower:
+            return self.PRIORITIES['HIGH']  # P2
+
+        # P2 - Network failures affecting cluster communication
+        if any(keyword in issue_lower for keyword in [
+            'network failure', 'unreachable', 'network down'
+        ]):
+            return self.PRIORITIES['HIGH']  # P2
+
         # P3 - Warnings requiring attention within days
         if any(keyword in issue_lower for keyword in [
-            'high temperature', 'high storage usage',
-            'correctable ecc', 'high cpu usage',
-            'warning'
+            'warning', 'high temperature', 'correctable ecc',
+            'trend alert', 'critical storage usage',
+            'low available_spare', 'high wear'
         ]):
             return self.PRIORITIES['MEDIUM']  # P3

-        # P4 - Low priority monitoring alerts
-        return self.PRIORITIES['LOW']  # P4
+        # P4 - Normal monitoring alerts
+        if any(keyword in issue_lower for keyword in [
+            'cpu usage', 'high storage usage',
+            'system log', 'drive age'
+        ]):
+            return self.PRIORITIES['NORMAL']  # P4
+
+        # P5 - Informational/minimal impact
+        if any(keyword in issue_lower for keyword in [
+            'info:', 'info ', 'above optimal', 'monitor only'
+        ]):
+            return self.PRIORITIES['LOW']  # P5
+
+        # Default to P3 for unknown issues (conservative approach)
+        return self.PRIORITIES['MEDIUM']
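
A worked example of the escalation path (a sketch only; the health_report
keys mirror the ones _count_critical_issues reads, and a default-constructed
monitor is assumed):

    monitor = SystemHealthMonitor()  # assumes a no-argument constructor
    report = {
        'drives_health': {'drives': [
            {'smart_status': 'UNHEALTHY', 'smart_issues': []},
            {'smart_status': 'UNHEALTHY', 'smart_issues': []},
        ]},
        'memory_health': {'status': 'CRITICAL'},  # third critical hits the threshold
        'network_health': {},
        'lxc_health': {},
    }
    monitor._determine_ticket_priority('High CPU usage on node3', report)
    # -> '1' (P1): three criticals reach PRIORITY_ESCALATION_THRESHOLD, so
    # even a P4-looking issue escalates.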
     def _categorize_issue(self, issue: str) -> tuple:
         """
@@ -1267,9 +1351,9 @@ class SystemHealthMonitor:
         Returns:
             tuple: (category, ticket_type, issue_tag, ticket_type_tag)
             - category: 'Hardware', 'Software', 'Network', etc.
-            - ticket_type: 'Incident', 'Problem', 'Task', 'Request'
+            - ticket_type: 'Issue', 'Problem', 'Task', 'Maintenance', etc.
             - issue_tag: '[hardware]', '[software]', '[network]'
-            - ticket_type_tag: '[incident]', '[problem]', etc.
+            - ticket_type_tag: '[issue]', '[problem]', etc.
         """

         issue_lower = issue.lower()
@@ -1277,15 +1361,16 @@ class SystemHealthMonitor:
         if any(keyword in issue_lower for keyword in [
             'smart', 'drive', 'disk', '/dev/', 'sector', 'temperature',
             'firmware', 'power_on_hours', 'reallocated', 'pending',
-            'ecc', 'memory', 'high_fly_writes', 'spin_retry', 'current_pending'
+            'ecc', 'memory', 'high_fly_writes', 'spin_retry', 'current_pending',
+            'nvme'
         ]):
-            # SMART errors are incidents (unplanned degradation)
+            # SMART errors/failures are issues (unplanned degradation)
             if any(error in issue_lower for error in ['critical', 'failed', 'failure', 'error']):
                 return (
                     self.TICKET_CATEGORIES['HARDWARE'],
-                    self.TICKET_TYPES['INCIDENT'],
+                    self.TICKET_TYPES['ISSUE'],
                     self.TICKET_TEMPLATES['HARDWARE_TYPE']['HARDWARE'],
-                    self.TICKET_TEMPLATES['TICKET_TYPE']['INCIDENT']
+                    self.TICKET_TEMPLATES['TICKET_TYPE']['ISSUE']
                 )
             # SMART warnings are problems (need investigation)
             else:
@@ -1301,13 +1386,13 @@ class SystemHealthMonitor:
             'lxc', 'container', 'storage usage', 'cpu usage', 'process',
             'application', 'service', 'daemon'
         ]):
-            # Critical storage/CPU is an incident (service degradation)
+            # Critical storage/CPU is an issue (service degradation)
             if 'critical' in issue_lower:
                 return (
                     self.TICKET_CATEGORIES['SOFTWARE'],
-                    self.TICKET_TYPES['INCIDENT'],
+                    self.TICKET_TYPES['ISSUE'],
                     self.TICKET_TEMPLATES['SOFTWARE_TYPE']['SOFTWARE'],
-                    self.TICKET_TEMPLATES['TICKET_TYPE']['INCIDENT']
+                    self.TICKET_TEMPLATES['TICKET_TYPE']['ISSUE']
                 )
             # Warning level is a problem (needs investigation before it becomes critical)
             else:
@@ -1318,23 +1403,23 @@ class SystemHealthMonitor:
                     self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM']
                 )

-        # Network Issues - Network connectivity/infrastructure
+        # Network Issues - Network connectivity/infrastructure (categorized as Hardware)
         if any(keyword in issue_lower for keyword in [
             'network', 'connectivity', 'unreachable', 'latency', 'packet loss',
             'interface', 'link down'
         ]):
-            # Network failures are incidents
+            # Network failures are issues
             if any(error in issue_lower for error in ['failure', 'down', 'unreachable', 'critical']):
                 return (
-                    self.TICKET_CATEGORIES['NETWORK'],
-                    self.TICKET_TYPES['INCIDENT'],
+                    self.TICKET_CATEGORIES['HARDWARE'],
+                    self.TICKET_TYPES['ISSUE'],
                     self.TICKET_TEMPLATES['NETWORK_TYPE']['NETWORK'],
-                    self.TICKET_TEMPLATES['TICKET_TYPE']['INCIDENT']
+                    self.TICKET_TEMPLATES['TICKET_TYPE']['ISSUE']
                 )
             # Network warnings are problems
             else:
                 return (
-                    self.TICKET_CATEGORIES['NETWORK'],
+                    self.TICKET_CATEGORIES['HARDWARE'],
                     self.TICKET_TYPES['PROBLEM'],
                     self.TICKET_TEMPLATES['NETWORK_TYPE']['NETWORK'],
                     self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM']
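
The categorization now returns only values the API accepts; expected tuples
for two representative alerts (a sketch, same assumptions as above):

    monitor._categorize_issue('SMART failure on /dev/sda')
    # -> ('Hardware', 'Issue', '[hardware]', '[issue]')
    monitor._categorize_issue('Management network is unreachable')
    # -> ('Hardware', 'Issue', '[network]', '[issue]')  # network now files under Hardware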
@@ -1517,7 +1602,22 @@ class SystemHealthMonitor:
         logger.info(f"CPU status: {health_report['cpu_health']['status']}")
         logger.info(f"Network status: {health_report['network_health']}")
         logger.info(f"System status: {health_report['system_health']['status']}")
-        logger.info(f"Detected issues: {issues}")
+        logger.info(f"Detected issues (pre-filter): {issues}")
+
+        # Filter out INFO-level issues unless configured to include them
+        if not self.CONFIG.get('INCLUDE_INFO_TICKETS', False):
+            actionable_issues = []
+            for issue in issues:
+                # Skip INFO-level issues (P5 candidates that shouldn't create tickets)
+                if any(info_marker in issue.lower() for info_marker in [
+                    'info:', 'info ', 'above optimal', 'monitor only'
+                ]):
+                    logger.debug(f"Filtering INFO-level issue: {issue}")
+                    continue
+                actionable_issues.append(issue)
+            issues = actionable_issues
+            logger.info(f"Filtered to actionable issues: {issues}")
+
         logger.info("=== Issue Detection Completed ===\n")
         return issues
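
With INCLUDE_INFO_TICKETS left at its default of False, a mixed batch is
pruned before any tickets are opened (sketch):

    issues = [
        'SMART failure on /dev/sda',
        'INFO: drive age above optimal on /dev/sdb',
    ]
    # The filter above keeps only the actionable alert:
    # ['SMART failure on /dev/sda']
    # Setting CONFIG['INCLUDE_INFO_TICKETS'] = True would keep both and let
    # the INFO alert open a P5 ticket.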