More detailed ticket description and fix thresholds
 hwmonDaemon.py | 181
@@ -133,13 +133,7 @@ class SystemHealthMonitor:
         return health_report
 
     def _generate_detailed_description(self, issue: str, health_report: Dict[str, Any]) -> str:
-        """
-        Generate a detailed description for the issue based on the health report.
-
-        :param issue: The issue description.
-        :param health_report: The comprehensive health report from the checks.
-        :return: A detailed description for the issue.
-        """
+        # Keep existing banner and initial description
         banner = """
         =================================================================
         AUTOMATED TICKET - Generated by Hardware Monitoring Service (hwmonDaemon)
@@ -151,9 +145,137 @@ class SystemHealthMonitor:
 
         description = banner + issue + "\n\n"
 
+        # Add issue explanation section
+        description += "=== Issue Details ===\n"
+
+        # Add SMART attribute explanations
+        SMART_DESCRIPTIONS = {
+            'Reallocated_Sector_Ct': """
+            Number of sectors that have been reallocated due to errors.
+            - High counts indicate degrading media
+            - Each reallocation uses one of the drive's limited spare sectors
+            - Rapid increases suggest accelerating drive wear
+            """,
+
+            'Current_Pending_Sector': """
+            Sectors waiting to be reallocated due to read/write errors.
+            - Indicates potentially unstable sectors
+            - May result in data loss if unrecoverable
+            - Should be monitored for increases
+            """,
+
+            'Offline_Uncorrectable': """
+            Count of uncorrectable errors detected during offline data collection.
+            - Direct indicator of media reliability issues
+            - May affect data integrity
+            - High values suggest drive replacement needed
+            """,
+
+            'Reported_Uncorrect': """
+            Number of errors that could not be recovered using hardware ECC.
+            - Critical indicator of drive health
+            - Directly impacts data reliability
+            - Any non-zero value requires attention
+            """,
+
+            'Spin_Retry_Count': """
+            Number of spin start retry attempts.
+            - Indicates potential motor or bearing issues
+            - May predict imminent mechanical failure
+            - Increasing values suggest degrading drive health
+            """,
+
+            'Power_On_Hours': """
+            Total number of hours the device has been powered on.
+            - Normal aging metric
+            - Used to gauge overall drive lifetime
+            - Compare against manufacturer's MTBF rating
+            """,
+
+            'Media_Wearout_Indicator': """
+            Percentage of drive's rated life remaining (SSDs).
+            - 100 indicates new drive
+            - 0 indicates exceeded rated writes
+            - Critical for SSD lifecycle management
+            """,
+
+            'Temperature_Celsius': """
+            Current drive temperature.
+            - High temperatures accelerate wear
+            - Optimal range: 20-45°C
+            - Sustained high temps reduce lifespan
+            """,
+
+            'Available_Spare': """
+            Percentage of spare blocks remaining (SSDs).
+            - Critical for SSD endurance
+            - Low values indicate approaching end-of-life
+            - Rapid decreases suggest excessive writes
+            """,
+
+            'Program_Fail_Count': """
+            Number of flash program operation failures.
+            - Indicates NAND cell reliability
+            - Important for SSD health assessment
+            - Increasing values suggest flash degradation
+            """,
+
+            'Erase_Fail_Count': """
+            Number of flash erase operation failures.
+            - Related to NAND block health
+            - Critical for SSD reliability
+            - High counts suggest failing flash blocks
+            """
+        }
+
+        if "SMART" in issue:
+            description += """
+            SMART (Self-Monitoring, Analysis, and Reporting Technology) issues indicate potential drive reliability problems.
+            - Reallocated sectors indicate bad blocks that have been remapped
+            - Pending sectors are potentially failing blocks waiting to be remapped
+            - Uncorrectable errors indicate data that could not be read
+            """
+
+        if "Temperature" in issue:
+            description += """
+            High drive temperatures can:
+            - Reduce drive lifespan
+            - Cause performance degradation
+            - Lead to data corruption in extreme cases
+            Optimal temperature range: 20-45°C
+            """
+
+        if "ECC" in issue:
+            description += """
+            ECC (Error Correction Code) Memory Issues:
+            - Correctable: Memory errors that were successfully fixed
+            - Uncorrectable: Serious memory errors that could not be corrected
+            Frequent ECC corrections may indicate degrading memory modules
+            """
+
+        if "CPU" in issue:
+            description += """
+            High CPU usage sustained over time can indicate:
+            - Resource constraints
+            - Runaway processes
+            - Need for performance optimization
+            - Potential cooling issues
+            """
+
+        if "Network" in issue:
+            description += """
+            Network connectivity issues can impact:
+            - Cluster communication
+            - Data replication
+            - Service availability
+            - Management access
+            """
+
+        # Keep existing detailed metrics section
         if "Disk" in issue:
             for partition in health_report.get('drives_health', {}).get('drives', []):
                 if partition.get('mountpoint') in issue:
+                    description += f"\n=== Disk Metrics ===\n"
                     description += f"Disk Device: {partition['device']}\n"
                     description += f"Mount Point: {partition['mountpoint']}\n"
                     description += f"Total Space: {partition['total_space']}\n"
@@ -161,41 +283,8 @@ class SystemHealthMonitor:
                     description += f"Free Space: {partition['free_space']}\n"
                     description += f"Usage Percent: {partition['usage_percent']}%\n"
 
-                    if partition.get('smart_status') == 'UNHEALTHY':
-                        try:
-                            # Get additional disk information using smartctl
-                            result = subprocess.run(
-                                ['smartctl', '-a', partition['device']],
-                                stdout=subprocess.PIPE,
-                                stderr=subprocess.PIPE,
-                                text=True
-                            )
-                            output = result.stdout + result.stderr
-                            description += "\nSMART Information:\n"
-                            description += output
-                        except Exception as e:
-                            description += f"Error getting SMART information: {str(e)}\n"
-                    break
-
-        elif "Memory" in issue:
-            memory_health = health_report.get('memory_health', {})
-            description += f"Total Memory: {memory_health['total_memory']}\n"
-            description += f"Used Memory: {memory_health['used_memory']}\n"
-            description += f"Memory Usage Percent: {memory_health['memory_percent']}%\n"
-
-        elif "CPU" in issue:
-            cpu_health = health_report.get('cpu_health', {})
-            description += f"CPU Usage Percent: {cpu_health['cpu_usage_percent']}%\n"
-
-        elif "Network" in issue:
-            network_health = health_report.get('network_health', {})
-            for network in ['management_network', 'ceph_network']:
-                if network_health[network]['issues']:
-                    description += f"{network.replace('_', ' ').title()} Issues:\n"
-                    description += "\n".join(network_health[network]['issues'])
-                    description += "\n"
         return description
 
     def _create_tickets_for_issues(self, health_report: Dict[str, Any]):
        issues = self._detect_issues(health_report)
        if not issues:
@@ -435,18 +524,18 @@ class SystemHealthMonitor:
             'Offline_Uncorrectable': {'warning': 1, 'critical': 2},
             'Reported_Uncorrect': {'warning': 1, 'critical': 2},
             'Spin_Retry_Count': {'warning': 1, 'critical': 5},
-            'Command_Timeout': {'warning': 5, 'critical': 10},
+            # 'Command_Timeout': {'warning': 5, 'critical': 10}, # Removed
             'Power_Cycle_Count': {'warning': 5000, 'critical': 10000},
-            'Power_On_Hours': {'warning': 35040, 'critical': 43800}, # ~4-5 years
-            'Media_Wearout_Indicator': {'warning': 30, 'critical': 10}, # Percentage remaining
+            'Power_On_Hours': {'warning': 61320, 'critical': 70080}, # ~7-8 years
+            'Media_Wearout_Indicator': {'warning': 30, 'critical': 10},
             'Temperature_Celsius': {'warning': 65, 'critical': 75},
             'Host_Writes_32MiB': {'warning': 50000000, 'critical': 100000000},
             'Wear_Leveling_Count': {'warning': 50, 'critical': 20},
             'Available_Spare': {'warning': 30, 'critical': 10},
             'Program_Fail_Count': {'warning': 10, 'critical': 20},
             'Erase_Fail_Count': {'warning': 10, 'critical': 20},
-            'Raw_Read_Error_Rate': {'warning': 50, 'critical': 100},
-            'Seek_Error_Rate': {'warning': 50, 'critical': 100},
+            # 'Raw_Read_Error_Rate': {'warning': 50, 'critical': 100}, # Removed
+            # 'Seek_Error_Rate': {'warning': 50, 'critical': 100}, # Removed
             'Load_Cycle_Count': {'warning': 300000, 'critical': 600000},
             'SSD_Life_Left': {'warning': 30, 'critical': 10}
         }
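A note on the pattern the new description code relies on: SMART_DESCRIPTIONS maps an attribute name to explanatory text, and the per-issue blocks append explanation text to the ticket description based on substring checks against the issue string. The sketch below shows how such a mapping can be folded into a description in that style; it is an illustration under assumptions rather than the daemon's actual wiring, and the trimmed mapping and example issue string are hypothetical.

# Sketch only: a trimmed stand-in for the SMART_DESCRIPTIONS mapping above.
SMART_DESCRIPTIONS = {
    'Reallocated_Sector_Ct': "Number of sectors that have been reallocated due to errors.",
    'Current_Pending_Sector': "Sectors waiting to be reallocated due to read/write errors.",
}

def explain_smart_attributes(issue: str, descriptions: dict) -> str:
    # Append an explanation for every SMART attribute named in the issue text.
    explanation = ""
    for attribute, text in descriptions.items():
        if attribute in issue:
            explanation += f"{attribute}: {text}\n"
    return explanation

# Hypothetical issue string, for illustration only.
print(explain_smart_attributes(
    "SMART warning on /dev/sda: Reallocated_Sector_Ct above threshold",
    SMART_DESCRIPTIONS,
))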
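On the updated thresholds: each attribute carries a warning and a critical limit, but the comparison direction depends on the attribute. For error and usage counters such as Program_Fail_Count or Power_On_Hours, higher raw values are worse; for remaining-life attributes such as Available_Spare, Media_Wearout_Indicator and SSD_Life_Left, where the warning limit (30) sits above the critical limit (10), lower values are worse. A minimal sketch of evaluating such a table, with classify_attribute as a hypothetical helper and not code from hwmonDaemon.py:

# Sketch only: classify a raw SMART value against warning/critical limits.
# The entries below copy a few thresholds from the table in this commit.
SMART_THRESHOLDS = {
    'Power_On_Hours': {'warning': 61320, 'critical': 70080},   # higher is worse
    'Program_Fail_Count': {'warning': 10, 'critical': 20},     # higher is worse
    'Available_Spare': {'warning': 30, 'critical': 10},        # lower is worse
    'SSD_Life_Left': {'warning': 30, 'critical': 10},          # lower is worse
}

def classify_attribute(name: str, value: float) -> str:
    limits = SMART_THRESHOLDS.get(name)
    if limits is None:
        return 'UNKNOWN'
    # When the warning limit sits above the critical limit, lower values are worse.
    lower_is_worse = limits['warning'] > limits['critical']
    if lower_is_worse:
        if value <= limits['critical']:
            return 'CRITICAL'
        if value <= limits['warning']:
            return 'WARNING'
    else:
        if value >= limits['critical']:
            return 'CRITICAL'
        if value >= limits['warning']:
            return 'WARNING'
    return 'OK'

print(classify_attribute('Available_Spare', 8))     # CRITICAL
print(classify_attribute('Power_On_Hours', 65000))  # WARNING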