Add more detailed ticket description and fix thresholds

2025-03-03 18:34:58 -05:00
parent 0507203140
commit b969f8c0e4

@@ -133,13 +133,7 @@ class SystemHealthMonitor:
        return health_report
    def _generate_detailed_description(self, issue: str, health_report: Dict[str, Any]) -> str:
        """
        Generate a detailed description for the issue based on the health report.
        :param issue: The issue description.
        :param health_report: The comprehensive health report from the checks.
        :return: A detailed description for the issue.
        """
        # Keep existing banner and initial description
        banner = """
=================================================================
AUTOMATED TICKET - Generated by Hardware Monitoring Service (hwmonDaemon)
@@ -151,9 +145,137 @@ class SystemHealthMonitor:
        description = banner + issue + "\n\n"
        # Add issue explanation section
        description += "=== Issue Details ===\n"
        # Add SMART attribute explanations
        SMART_DESCRIPTIONS = {
            'Reallocated_Sector_Ct': """
            Number of sectors that have been reallocated due to errors.
            - High counts indicate degrading media
            - Each reallocation uses one of the drive's limited spare sectors
            - Rapid increases suggest accelerating drive wear
            """,
            'Current_Pending_Sector': """
            Sectors waiting to be reallocated due to read/write errors.
            - Indicates potentially unstable sectors
            - May result in data loss if unrecoverable
            - Should be monitored for increases
            """,
            'Offline_Uncorrectable': """
            Count of uncorrectable errors detected during offline data collection.
            - Direct indicator of media reliability issues
            - May affect data integrity
            - High values suggest drive replacement needed
            """,
            'Reported_Uncorrect': """
            Number of errors that could not be recovered using hardware ECC.
            - Critical indicator of drive health
            - Directly impacts data reliability
            - Any non-zero value requires attention
            """,
            'Spin_Retry_Count': """
            Number of spin start retry attempts.
            - Indicates potential motor or bearing issues
            - May predict imminent mechanical failure
            - Increasing values suggest degrading drive health
            """,
            'Power_On_Hours': """
            Total number of hours the device has been powered on.
            - Normal aging metric
            - Used to gauge overall drive lifetime
            - Compare against manufacturer's MTBF rating
            """,
            'Media_Wearout_Indicator': """
            Percentage of drive's rated life remaining (SSDs).
            - 100 indicates a new drive
            - 0 indicates exceeded rated writes
            - Critical for SSD lifecycle management
            """,
            'Temperature_Celsius': """
            Current drive temperature.
            - High temperatures accelerate wear
            - Optimal range: 20-45°C
            - Sustained high temps reduce lifespan
            """,
            'Available_Spare': """
            Percentage of spare blocks remaining (SSDs).
            - Critical for SSD endurance
            - Low values indicate approaching end-of-life
            - Rapid decreases suggest excessive writes
            """,
            'Program_Fail_Count': """
            Number of flash program operation failures.
            - Indicates NAND cell reliability
            - Important for SSD health assessment
            - Increasing values suggest flash degradation
            """,
            'Erase_Fail_Count': """
            Number of flash erase operation failures.
            - Related to NAND block health
            - Critical for SSD reliability
            - High counts suggest failing flash blocks
            """
        }
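        # NOTE (editorial illustration, not part of this commit): the dictionary
        # above is presumably consumed later in this method; a minimal sketch of
        # that lookup, assuming issue strings mention attributes by their SMART
        # names:
        #
        #     for attr_name, attr_text in SMART_DESCRIPTIONS.items():
        #         if attr_name in issue:
        #             description += f"\n{attr_name}:{attr_text}"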
if "SMART" in issue:
description += """
SMART (Self-Monitoring, Analysis, and Reporting Technology) issues indicate potential drive reliability problems.
- Reallocated sectors indicate bad blocks that have been remapped
- Pending sectors are potentially failing blocks waiting to be remapped
- Uncorrectable errors indicate data that could not be read
"""
if "Temperature" in issue:
description += """
High drive temperatures can:
- Reduce drive lifespan
- Cause performance degradation
- Lead to data corruption in extreme cases
Optimal temperature range: 20-45°C
"""
if "ECC" in issue:
description += """
ECC (Error Correction Code) Memory Issues:
- Correctable: Memory errors that were successfully fixed
- Uncorrectable: Serious memory errors that could not be corrected
Frequent ECC corrections may indicate degrading memory modules
"""
if "CPU" in issue:
description += """
High CPU usage sustained over time can indicate:
- Resource constraints
- Runaway processes
- Need for performance optimization
- Potential cooling issues
"""
if "Network" in issue:
description += """
Network connectivity issues can impact:
- Cluster communication
- Data replication
- Service availability
- Management access
"""
        # Keep existing detailed metrics section
        if "Disk" in issue:
            for partition in health_report.get('drives_health', {}).get('drives', []):
                if partition.get('mountpoint') in issue:
                    description += f"\n=== Disk Metrics ===\n"
                    description += f"Disk Device: {partition['device']}\n"
                    description += f"Mount Point: {partition['mountpoint']}\n"
                    description += f"Total Space: {partition['total_space']}\n"
@@ -161,41 +283,8 @@ class SystemHealthMonitor:
description += f"Free Space: {partition['free_space']}\n"
description += f"Usage Percent: {partition['usage_percent']}%\n"
if partition.get('smart_status') == 'UNHEALTHY':
try:
# Get additional disk information using smartctl
result = subprocess.run(
['smartctl', '-a', partition['device']],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True
)
output = result.stdout + result.stderr
description += "\nSMART Information:\n"
description += output
except Exception as e:
description += f"Error getting SMART information: {str(e)}\n"
break
elif "Memory" in issue:
memory_health = health_report.get('memory_health', {})
description += f"Total Memory: {memory_health['total_memory']}\n"
description += f"Used Memory: {memory_health['used_memory']}\n"
description += f"Memory Usage Percent: {memory_health['memory_percent']}%\n"
elif "CPU" in issue:
cpu_health = health_report.get('cpu_health', {})
description += f"CPU Usage Percent: {cpu_health['cpu_usage_percent']}%\n"
elif "Network" in issue:
network_health = health_report.get('network_health', {})
for network in ['management_network', 'ceph_network']:
if network_health[network]['issues']:
description += f"{network.replace('_', ' ').title()} Issues:\n"
description += "\n".join(network_health[network]['issues'])
description += "\n"
return description
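    # NOTE (editorial illustration, not part of this commit): the raw smartctl
    # text captured above could also be parsed structurally; smartmontools 7.0+
    # supports JSON output via the -j flag. A minimal sketch, assuming an ATA
    # device and smartctl >= 7.0:
    #
    #     import json
    #     result = subprocess.run(['smartctl', '-a', '-j', device],
    #                             stdout=subprocess.PIPE, stderr=subprocess.PIPE,
    #                             text=True)
    #     data = json.loads(result.stdout)
    #     table = data.get('ata_smart_attributes', {}).get('table', [])
    #     raw_values = {row['name']: row['raw']['value'] for row in table}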
    def _create_tickets_for_issues(self, health_report: Dict[str, Any]):
        issues = self._detect_issues(health_report)
        if not issues:
@@ -435,18 +524,18 @@ class SystemHealthMonitor:
            'Offline_Uncorrectable': {'warning': 1, 'critical': 2},
            'Reported_Uncorrect': {'warning': 1, 'critical': 2},
            'Spin_Retry_Count': {'warning': 1, 'critical': 5},
            'Command_Timeout': {'warning': 5, 'critical': 10},
            # 'Command_Timeout': {'warning': 5, 'critical': 10}, # Removed
            'Power_Cycle_Count': {'warning': 5000, 'critical': 10000},
            'Power_On_Hours': {'warning': 35040, 'critical': 43800}, # ~4-5 years
            'Media_Wearout_Indicator': {'warning': 30, 'critical': 10}, # Percentage remaining
            'Power_On_Hours': {'warning': 61320, 'critical': 70080}, # ~7-8 years
            'Media_Wearout_Indicator': {'warning': 30, 'critical': 10},
            'Temperature_Celsius': {'warning': 65, 'critical': 75},
            'Host_Writes_32MiB': {'warning': 50000000, 'critical': 100000000},
            'Wear_Leveling_Count': {'warning': 50, 'critical': 20},
            'Available_Spare': {'warning': 30, 'critical': 10},
            'Program_Fail_Count': {'warning': 10, 'critical': 20},
            'Erase_Fail_Count': {'warning': 10, 'critical': 20},
            'Raw_Read_Error_Rate': {'warning': 50, 'critical': 100},
            'Seek_Error_Rate': {'warning': 50, 'critical': 100},
            # 'Raw_Read_Error_Rate': {'warning': 50, 'critical': 100}, # Removed
            # 'Seek_Error_Rate': {'warning': 50, 'critical': 100}, # Removed
            'Load_Cycle_Count': {'warning': 300000, 'critical': 600000},
            'SSD_Life_Left': {'warning': 30, 'critical': 10}
        }
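
For context, a minimal editorial sketch (not code from this commit) of how thresholds like these are typically applied: compare an attribute's raw value against the warning/critical levels, inverting the comparison for percent-remaining attributes (Media_Wearout_Indicator, Available_Spare, Wear_Leveling_Count, SSD_Life_Left), where lower is worse. The function name and structure below are illustrative assumptions, not taken from this repository.

    # Illustrative only -- the real evaluation logic is not shown in this diff.
    LOWER_IS_WORSE = {'Media_Wearout_Indicator', 'Available_Spare',
                      'Wear_Leveling_Count', 'SSD_Life_Left'}

    def classify_attribute(name, raw_value, thresholds):
        """Return 'critical', 'warning', or 'ok' for one SMART attribute."""
        limits = thresholds.get(name)
        if limits is None:
            return 'ok'
        if name in LOWER_IS_WORSE:
            # Percent-remaining metrics degrade downward.
            if raw_value <= limits['critical']:
                return 'critical'
            if raw_value <= limits['warning']:
                return 'warning'
        elif raw_value >= limits['critical']:
            return 'critical'
        elif raw_value >= limits['warning']:
            return 'warning'
        return 'ok'

Under the new limits, classify_attribute('Power_On_Hours', 65000, thresholds) would return 'warning' (65000 is above 61320 but below 70080), while classify_attribute('Available_Spare', 8, thresholds) would return 'critical' (8 is at or below the critical floor of 10).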