Upgrade priority system and fix ticket type alignment

- Add P5 (LOW) priority for informational/minimal impact alerts
- Expand ISSUE_PRIORITIES from 7 to 30 mappings covering P1-P5
- Fix TICKET_TYPES to match tinker_tickets API (Issue, Problem, Task,
  Maintenance, Upgrade, Install, Request)
- Restrict TICKET_CATEGORIES to the two valid categories: Hardware and Software
- Add P1 escalation logic via _count_critical_issues() helper
- Rewrite _determine_ticket_priority() with full P1-P5 support
- Add CONFIG options: INCLUDE_INFO_TICKETS, PRIORITY_ESCALATION_THRESHOLD
- Filter INFO-level alerts from ticket creation by default
- Update _categorize_issue() to use valid ticket types
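
The net effect of the new ladder is easiest to see against concrete alert strings. Below is a minimal free-standing sketch, not part of the commit: it mirrors the keyword ladder from _determine_ticket_priority in the diff that follows, with abbreviated keyword lists and without the _count_critical_issues() escalation path (which forces P1 whenever a report's critical count reaches PRIORITY_ESCALATION_THRESHOLD, default 3):

# Illustrative mirror of the new P1-P5 keyword ladder (abbreviated lists).
P1_KEYS = ('cluster', 'raid degraded', 'multiple drive', 'both networks unreachable')
P2_KEYS = ('smart failure', 'drive failure', 'disk failure', 'uncorrectable ecc',
           'hardware failure', 'critical temperature', 'network is unreachable')
P3_KEYS = ('warning', 'high temperature', 'correctable ecc', 'trend alert',
           'critical storage usage')
P4_KEYS = ('cpu usage', 'high storage usage', 'system log', 'drive age')
P5_KEYS = ('info:', 'info ', 'above optimal', 'monitor only')

def sketch_priority(issue: str) -> str:
    """Return '1'..'5' by first matching tier, as the real method does."""
    s = issue.lower()
    for keys, prio in ((P1_KEYS, '1'), (P2_KEYS, '2'), (P3_KEYS, '3'),
                       (P4_KEYS, '4'), (P5_KEYS, '5')):
        if any(k in s for k in keys):
            return prio
    return '3'  # unknown issues default to P3 (conservative)

assert sketch_priority('Cluster quorum lost') == '1'            # P1
assert sketch_priority('SMART failure on /dev/sda') == '2'      # P2
assert sketch_priority('High CPU usage on node3') == '4'        # P4
assert sketch_priority('Temperature INFO: above optimal') == '5' # P5

Against the real method, the same strings resolve identically so long as the health report carries fewer critical findings than the escalation threshold.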

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
commit 3322c5878a
parent 87b16ca822
Date:   2026-01-17 15:24:35 -05:00


@@ -23,20 +23,53 @@ class SystemHealthMonitor:
     STANDARD_WIDTH = 80
     PRIORITIES = {
-        'CRITICAL': '1',
-        'HIGH': '2',
-        'MEDIUM': '3',
-        'LOW': '4'
+        'CRITICAL': '1',  # P1 - Cluster outages, total system failure
+        'HIGH': '2',      # P2 - Hardware failures, same-day response
+        'MEDIUM': '3',    # P3 - Warnings, 1-3 day response
+        'NORMAL': '4',    # P4 - Standard monitoring alerts
+        'LOW': '5'        # P5 - Informational, minimal impact
     }
     ISSUE_PRIORITIES = {
+        # P1 - Critical System Issues (cluster-wide impact)
+        'CLUSTER_FAILURE': PRIORITIES['CRITICAL'],
+        'MULTIPLE_DRIVE_FAILURE': PRIORITIES['CRITICAL'],
+        'RAID_DEGRADED': PRIORITIES['CRITICAL'],
+        # P2 - Hardware Failures (same-day response)
         'SMART_FAILURE': PRIORITIES['HIGH'],
+        'SMART_CRITICAL': PRIORITIES['HIGH'],
         'DISK_CRITICAL': PRIORITIES['HIGH'],
-        'DISK_WARNING': PRIORITIES['MEDIUM'],
+        'UNCORRECTABLE_ECC': PRIORITIES['HIGH'],
+        'NETWORK_FAILURE': PRIORITIES['HIGH'],
+        'TEMPERATURE_CRITICAL': PRIORITIES['HIGH'],
+        'SSD_WEAR_CRITICAL': PRIORITIES['HIGH'],
+        'NVME_SPARE_CRITICAL': PRIORITIES['HIGH'],
+        'FIRMWARE_CRITICAL': PRIORITIES['HIGH'],
+        'REALLOCATED_SECTOR': PRIORITIES['HIGH'],
+        'PENDING_SECTOR': PRIORITIES['HIGH'],
+        # P3 - Warnings (1-3 day response)
         'SMART_WARNING': PRIORITIES['MEDIUM'],
+        'DISK_WARNING': PRIORITIES['MEDIUM'],
         'CORRECTABLE_ECC': PRIORITIES['MEDIUM'],
-        'CPU_HIGH': PRIORITIES['LOW'],
-        'NETWORK_FAILURE': PRIORITIES['HIGH']
+        'TEMPERATURE_WARNING': PRIORITIES['MEDIUM'],
+        'SSD_WEAR_WARNING': PRIORITIES['MEDIUM'],
+        'NVME_SPARE_WARNING': PRIORITIES['MEDIUM'],
+        'LXC_STORAGE_CRITICAL': PRIORITIES['MEDIUM'],
+        'TREND_ALERT': PRIORITIES['MEDIUM'],
+        # P4 - Normal Monitoring (standard response)
+        'CPU_HIGH': PRIORITIES['NORMAL'],
+        'LXC_STORAGE_WARNING': PRIORITIES['NORMAL'],
+        'SYSTEM_LOG_WARNING': PRIORITIES['NORMAL'],
+        'DRIVE_AGE_WARNING': PRIORITIES['NORMAL'],
+        # P5 - Informational (minimal impact)
+        'TEMPERATURE_INFO': PRIORITIES['LOW'],
+        'DRIVE_AGE_INFO': PRIORITIES['LOW'],
+        'SSD_WEAR_INFO': PRIORITIES['LOW'],
+        'SYSTEM_LOG_INFO': PRIORITIES['LOW']
     }
     CONFIG = {
@@ -69,7 +102,9 @@ class SystemHealthMonitor:
             r'.*/downloads.*'
         ],
         'HISTORY_DIR': '/var/log/hwmonDaemon',
-        'HISTORY_RETENTION_DAYS': 30
+        'HISTORY_RETENTION_DAYS': 30,
+        'INCLUDE_INFO_TICKETS': False,  # Set True to create P5 tickets for INFO alerts
+        'PRIORITY_ESCALATION_THRESHOLD': 3  # Number of criticals to trigger P1
     }

     @classmethod
@@ -116,10 +151,11 @@ class SystemHealthMonitor:
             'PRODUCTION': '[production]'
         },
         'TICKET_TYPE': {
-            'INCIDENT': '[incident]',
-            'MAINTENANCE': '[maintenance]',
-            'PROBLEM': '[problem]',
-            'TASK': '[task]'
+            'ISSUE': '[issue]',              # General issue (replaces invalid 'incident')
+            'PROBLEM': '[problem]',          # Root cause investigation
+            'TASK': '[task]',                # Planned work item
+            'MAINTENANCE': '[maintenance]',  # Scheduled/preventive work
+            'UPGRADE': '[upgrade]'           # Hardware/software upgrade
         },
         'HARDWARE_TYPE': {
             'HARDWARE': '[hardware]'
@@ -139,17 +175,17 @@ class SystemHealthMonitor:
     # Category and Type mappings for ticket API
     TICKET_CATEGORIES = {
         'HARDWARE': 'Hardware',
-        'SOFTWARE': 'Software',
-        'NETWORK': 'Network',
-        'SECURITY': 'Security',
-        'OTHER': 'Other'
+        'SOFTWARE': 'Software'
     }

     TICKET_TYPES = {
-        'INCIDENT': 'Incident',  # Unplanned interruption or service degradation
-        'REQUEST': 'Request',    # Service or information request
+        'ISSUE': 'Issue',              # General issue/incident
         'PROBLEM': 'Problem',          # Root cause investigation needed
-        'TASK': 'Task'  # Planned work item
+        'TASK': 'Task',                # Planned work item
+        'MAINTENANCE': 'Maintenance',  # Scheduled/preventive work
+        'UPGRADE': 'Upgrade',          # Hardware/software upgrade
+        'INSTALL': 'Install',          # New installation
+        'REQUEST': 'Request'           # Service or information request
     }

     PROBLEMATIC_FIRMWARE = {
@@ -1209,56 +1245,104 @@ class SystemHealthMonitor:
         return description

+    def _count_critical_issues(self, health_report: Dict[str, Any]) -> int:
+        """Count total critical issues across all health checks for P1 escalation."""
+        count = 0
+        # Count drive failures
+        for drive in health_report.get('drives_health', {}).get('drives', []):
+            if drive.get('smart_status') == 'UNHEALTHY':
+                count += 1
+            if any('critical' in issue.lower() for issue in drive.get('smart_issues', [])):
+                count += 1
+        # Count ECC errors
+        if health_report.get('memory_health', {}).get('status') == 'CRITICAL':
+            count += 1
+        # Count network failures
+        net = health_report.get('network_health', {})
+        if net.get('management_network', {}).get('status') == 'CRITICAL':
+            count += 1
+        if net.get('ceph_network', {}).get('status') == 'CRITICAL':
+            count += 1
+        # Count LXC critical issues
+        if health_report.get('lxc_health', {}).get('status') == 'CRITICAL':
+            count += 1
+        return count
+
     def _determine_ticket_priority(self, issue: str, health_report: Dict[str, Any]) -> str:
         """
-        Determine ticket priority based on issue type and severity.
-        P1 = Critical system outages (reserved for future major outages)
+        Determine ticket priority based on issue type, severity, and context.
+        P1 = Cluster outages, multiple simultaneous failures
         P2 = Hardware failures requiring same-day response
         P3 = Warnings requiring response within 1-3 days
-        P4 = Low priority monitoring alerts
+        P4 = Normal monitoring alerts
+        P5 = Informational/minimal impact
         """
         issue_lower = issue.lower()
-        # P1 - Reserved for major system outages (implement later)
-        # if 'cluster down' in issue_lower or 'total failure' in issue_lower:
-        #     return self.PRIORITIES['CRITICAL']  # P1
+        # Count total critical issues for escalation logic
+        critical_count = self._count_critical_issues(health_report)
+        escalation_threshold = self.CONFIG.get('PRIORITY_ESCALATION_THRESHOLD', 3)
+        # P1 - Multiple simultaneous critical failures (cluster risk)
+        if critical_count >= escalation_threshold:
+            logger.info(f"P1 escalation triggered: {critical_count} critical issues detected")
+            return self.PRIORITIES['CRITICAL']  # P1
+        # P1 - Specific cluster-affecting scenarios
+        if any(keyword in issue_lower for keyword in [
+            'cluster', 'raid degraded', 'multiple drive',
+            'both networks unreachable'
+        ]):
+            return self.PRIORITIES['CRITICAL']  # P1
         # P2 - Hardware failures requiring same-day response
         if any(keyword in issue_lower for keyword in [
-            'smart failure', 'drive failure', 'disk failure',
+            'smart failure', 'smart overall health check failed',
+            'drive failure', 'disk failure',
             'uncorrectable ecc', 'hardware failure',
             'critical temperature', 'firmware issue',
-            'reallocated sector', 'pending sector'
+            'reallocated_sector', 'pending_sector', 'offline_uncorrectable',
+            'critical available_spare', 'critical wear',
+            'critical reallocated', 'critical current_pending',
+            'network is unreachable'
         ]):
             return self.PRIORITIES['HIGH']  # P2
-        # P2 - SMART errors indicating potential drive failure
+        # P2 - SMART issues with critical indicators
         if 'smart issues' in issue_lower and any(error_type in issue_lower for error_type in [
-            'error', 'failed', 'reallocated', 'pending', 'uncorrectable'
-        ]):
-            return self.PRIORITIES['HIGH']  # P2
-        # P2 - Critical storage usage (>90%)
-        if 'critical storage usage' in issue_lower:
-            return self.PRIORITIES['HIGH']  # P2
-        # P2 - Network failures affecting cluster communication
-        if any(keyword in issue_lower for keyword in [
-            'network failure', 'unreachable', 'network down'
+            'critical', 'failed', 'reallocated', 'pending', 'uncorrectable', 'offline'
         ]):
             return self.PRIORITIES['HIGH']  # P2
         # P3 - Warnings requiring attention within days
         if any(keyword in issue_lower for keyword in [
-            'high temperature', 'high storage usage',
-            'correctable ecc', 'high cpu usage',
-            'warning'
+            'warning', 'high temperature', 'correctable ecc',
+            'trend alert', 'critical storage usage',
+            'low available_spare', 'high wear'
         ]):
             return self.PRIORITIES['MEDIUM']  # P3
-        # P4 - Low priority monitoring alerts
-        return self.PRIORITIES['LOW']  # P4
+        # P4 - Normal monitoring alerts
+        if any(keyword in issue_lower for keyword in [
+            'cpu usage', 'high storage usage',
+            'system log', 'drive age'
+        ]):
+            return self.PRIORITIES['NORMAL']  # P4
+        # P5 - Informational/minimal impact
+        if any(keyword in issue_lower for keyword in [
+            'info:', 'info ', 'above optimal', 'monitor only'
+        ]):
+            return self.PRIORITIES['LOW']  # P5
+        # Default to P3 for unknown issues (conservative approach)
+        return self.PRIORITIES['MEDIUM']

     def _categorize_issue(self, issue: str) -> tuple:
         """
@@ -1267,9 +1351,9 @@ class SystemHealthMonitor:
         Returns:
             tuple: (category, ticket_type, issue_tag, ticket_type_tag)
                 - category: 'Hardware', 'Software', 'Network', etc.
-                - ticket_type: 'Incident', 'Problem', 'Task', 'Request'
+                - ticket_type: 'Issue', 'Problem', 'Task', 'Maintenance', etc.
                 - issue_tag: '[hardware]', '[software]', '[network]'
-                - ticket_type_tag: '[incident]', '[problem]', etc.
+                - ticket_type_tag: '[issue]', '[problem]', etc.
         """
         issue_lower = issue.lower()
@@ -1277,15 +1361,16 @@ class SystemHealthMonitor:
         if any(keyword in issue_lower for keyword in [
             'smart', 'drive', 'disk', '/dev/', 'sector', 'temperature',
             'firmware', 'power_on_hours', 'reallocated', 'pending',
-            'ecc', 'memory', 'high_fly_writes', 'spin_retry', 'current_pending'
+            'ecc', 'memory', 'high_fly_writes', 'spin_retry', 'current_pending',
+            'nvme'
         ]):
-            # SMART errors are incidents (unplanned degradation)
+            # SMART errors/failures are issues (unplanned degradation)
             if any(error in issue_lower for error in ['critical', 'failed', 'failure', 'error']):
                 return (
                     self.TICKET_CATEGORIES['HARDWARE'],
-                    self.TICKET_TYPES['INCIDENT'],
+                    self.TICKET_TYPES['ISSUE'],
                     self.TICKET_TEMPLATES['HARDWARE_TYPE']['HARDWARE'],
-                    self.TICKET_TEMPLATES['TICKET_TYPE']['INCIDENT']
+                    self.TICKET_TEMPLATES['TICKET_TYPE']['ISSUE']
                 )
             # SMART warnings are problems (need investigation)
             else:
@@ -1301,13 +1386,13 @@ class SystemHealthMonitor:
             'lxc', 'container', 'storage usage', 'cpu usage', 'process',
             'application', 'service', 'daemon'
         ]):
-            # Critical storage/CPU is an incident (service degradation)
+            # Critical storage/CPU is an issue (service degradation)
            if 'critical' in issue_lower:
                 return (
                     self.TICKET_CATEGORIES['SOFTWARE'],
-                    self.TICKET_TYPES['INCIDENT'],
+                    self.TICKET_TYPES['ISSUE'],
                     self.TICKET_TEMPLATES['SOFTWARE_TYPE']['SOFTWARE'],
-                    self.TICKET_TEMPLATES['TICKET_TYPE']['INCIDENT']
+                    self.TICKET_TEMPLATES['TICKET_TYPE']['ISSUE']
                 )
             # Warning level is a problem (needs investigation before it becomes critical)
             else:
@@ -1318,23 +1403,23 @@ class SystemHealthMonitor:
                     self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM']
                 )

-        # Network Issues - Network connectivity/infrastructure
+        # Network Issues - Network connectivity/infrastructure (categorized as Hardware)
         if any(keyword in issue_lower for keyword in [
             'network', 'connectivity', 'unreachable', 'latency', 'packet loss',
             'interface', 'link down'
         ]):
-            # Network failures are incidents
+            # Network failures are issues
             if any(error in issue_lower for error in ['failure', 'down', 'unreachable', 'critical']):
                 return (
-                    self.TICKET_CATEGORIES['NETWORK'],
-                    self.TICKET_TYPES['INCIDENT'],
+                    self.TICKET_CATEGORIES['HARDWARE'],
+                    self.TICKET_TYPES['ISSUE'],
                     self.TICKET_TEMPLATES['NETWORK_TYPE']['NETWORK'],
-                    self.TICKET_TEMPLATES['TICKET_TYPE']['INCIDENT']
+                    self.TICKET_TEMPLATES['TICKET_TYPE']['ISSUE']
                 )
             # Network warnings are problems
             else:
                 return (
-                    self.TICKET_CATEGORIES['NETWORK'],
+                    self.TICKET_CATEGORIES['HARDWARE'],
                     self.TICKET_TYPES['PROBLEM'],
                     self.TICKET_TEMPLATES['NETWORK_TYPE']['NETWORK'],
                     self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM']
@@ -1517,7 +1602,22 @@ class SystemHealthMonitor:
         logger.info(f"CPU status: {health_report['cpu_health']['status']}")
         logger.info(f"Network status: {health_report['network_health']}")
         logger.info(f"System status: {health_report['system_health']['status']}")
-        logger.info(f"Detected issues: {issues}")
+        logger.info(f"Detected issues (pre-filter): {issues}")
+
+        # Filter out INFO-level issues unless configured to include them
+        if not self.CONFIG.get('INCLUDE_INFO_TICKETS', False):
+            actionable_issues = []
+            for issue in issues:
+                # Skip INFO-level issues (P5 candidates that shouldn't create tickets)
+                if any(info_marker in issue.lower() for info_marker in [
+                    'info:', 'info ', 'above optimal', 'monitor only'
+                ]):
+                    logger.debug(f"Filtering INFO-level issue: {issue}")
+                    continue
+                actionable_issues.append(issue)
+            issues = actionable_issues
+            logger.info(f"Filtered to actionable issues: {issues}")

         logger.info("=== Issue Detection Completed ===\n")
         return issues
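
For reference, the INFO filter in the final hunk reduces to the following free-standing sketch; filter_actionable and INFO_MARKERS are illustrative names, not identifiers from the codebase, and the daemon's in-tree version additionally logs each dropped issue at debug level:

# Standalone sketch of the new INFO filter: with INCLUDE_INFO_TICKETS left at its
# default of False, P5 candidates are dropped before ticket creation.
INFO_MARKERS = ('info:', 'info ', 'above optimal', 'monitor only')

def filter_actionable(issues, include_info=False):
    """Drop INFO-level alert strings unless configured to keep them."""
    if include_info:
        return list(issues)
    return [i for i in issues if not any(m in i.lower() for m in INFO_MARKERS)]

print(filter_actionable(['SMART failure on /dev/sda',
                         'Temperature INFO: 41C, above optimal']))
# -> ['SMART failure on /dev/sda']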