From b969f8c0e461c5bdfc574702ceb094b9b2cbce9e Mon Sep 17 00:00:00 2001 From: Jared Vititoe Date: Mon, 3 Mar 2025 18:34:58 -0500 Subject: [PATCH] More detailed ticket description and fix thresholds --- hwmonDaemon.py | 181 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 135 insertions(+), 46 deletions(-) diff --git a/hwmonDaemon.py b/hwmonDaemon.py index aeda846..7e4ec68 100644 --- a/hwmonDaemon.py +++ b/hwmonDaemon.py @@ -133,13 +133,7 @@ class SystemHealthMonitor: return health_report def _generate_detailed_description(self, issue: str, health_report: Dict[str, Any]) -> str: - """ - Generate a detailed description for the issue based on the health report. - - :param issue: The issue description. - :param health_report: The comprehensive health report from the checks. - :return: A detailed description for the issue. - """ + # Keep existing banner and initial description banner = """ ================================================================= AUTOMATED TICKET - Generated by Hardware Monitoring Service (hwmonDaemon) @@ -151,9 +145,137 @@ class SystemHealthMonitor: description = banner + issue + "\n\n" + # Add issue explanation section + description += "=== Issue Details ===\n" + + # Add SMART attribute explanations + SMART_DESCRIPTIONS = { + 'Reallocated_Sector_Ct': """ + Number of sectors that have been reallocated due to errors. + - High counts indicate degrading media + - Each reallocation uses one of the drive's limited spare sectors + - Rapid increases suggest accelerating drive wear + """, + + 'Current_Pending_Sector': """ + Sectors waiting to be reallocated due to read/write errors. + - Indicates potentially unstable sectors + - May result in data loss if unrecoverable + - Should be monitored for increases + """, + + 'Offline_Uncorrectable': """ + Count of uncorrectable errors detected during offline data collection. + - Direct indicator of media reliability issues + - May affect data integrity + - High values suggest drive replacement needed + """, + + 'Reported_Uncorrect': """ + Number of errors that could not be recovered using hardware ECC. + - Critical indicator of drive health + - Directly impacts data reliability + - Any non-zero value requires attention + """, + + 'Spin_Retry_Count': """ + Number of spin start retry attempts. + - Indicates potential motor or bearing issues + - May predict imminent mechanical failure + - Increasing values suggest degrading drive health + """, + + 'Power_On_Hours': """ + Total number of hours the device has been powered on. + - Normal aging metric + - Used to gauge overall drive lifetime + - Compare against manufacturer's MTBF rating + """, + + 'Media_Wearout_Indicator': """ + Percentage of drive's rated life remaining (SSDs). + - 100 indicates new drive + - 0 indicates exceeded rated writes + - Critical for SSD lifecycle management + """, + + 'Temperature_Celsius': """ + Current drive temperature. + - High temperatures accelerate wear + - Optimal range: 20-45°C + - Sustained high temps reduce lifespan + """, + + 'Available_Spare': """ + Percentage of spare blocks remaining (SSDs). + - Critical for SSD endurance + - Low values indicate approaching end-of-life + - Rapid decreases suggest excessive writes + """, + + 'Program_Fail_Count': """ + Number of flash program operation failures. + - Indicates NAND cell reliability + - Important for SSD health assessment + - Increasing values suggest flash degradation + """, + + 'Erase_Fail_Count': """ + Number of flash erase operation failures. + - Related to NAND block health + - Critical for SSD reliability + - High counts suggest failing flash blocks + """ + } + + if "SMART" in issue: + description += """ + SMART (Self-Monitoring, Analysis, and Reporting Technology) issues indicate potential drive reliability problems. + - Reallocated sectors indicate bad blocks that have been remapped + - Pending sectors are potentially failing blocks waiting to be remapped + - Uncorrectable errors indicate data that could not be read + """ + + if "Temperature" in issue: + description += """ + High drive temperatures can: + - Reduce drive lifespan + - Cause performance degradation + - Lead to data corruption in extreme cases + Optimal temperature range: 20-45°C + """ + + if "ECC" in issue: + description += """ + ECC (Error Correction Code) Memory Issues: + - Correctable: Memory errors that were successfully fixed + - Uncorrectable: Serious memory errors that could not be corrected + Frequent ECC corrections may indicate degrading memory modules + """ + + if "CPU" in issue: + description += """ + High CPU usage sustained over time can indicate: + - Resource constraints + - Runaway processes + - Need for performance optimization + - Potential cooling issues + """ + + if "Network" in issue: + description += """ + Network connectivity issues can impact: + - Cluster communication + - Data replication + - Service availability + - Management access + """ + + # Keep existing detailed metrics section if "Disk" in issue: for partition in health_report.get('drives_health', {}).get('drives', []): if partition.get('mountpoint') in issue: + description += f"\n=== Disk Metrics ===\n" description += f"Disk Device: {partition['device']}\n" description += f"Mount Point: {partition['mountpoint']}\n" description += f"Total Space: {partition['total_space']}\n" @@ -161,41 +283,8 @@ class SystemHealthMonitor: description += f"Free Space: {partition['free_space']}\n" description += f"Usage Percent: {partition['usage_percent']}%\n" - if partition.get('smart_status') == 'UNHEALTHY': - try: - # Get additional disk information using smartctl - result = subprocess.run( - ['smartctl', '-a', partition['device']], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True - ) - output = result.stdout + result.stderr - description += "\nSMART Information:\n" - description += output - except Exception as e: - description += f"Error getting SMART information: {str(e)}\n" - break - - elif "Memory" in issue: - memory_health = health_report.get('memory_health', {}) - description += f"Total Memory: {memory_health['total_memory']}\n" - description += f"Used Memory: {memory_health['used_memory']}\n" - description += f"Memory Usage Percent: {memory_health['memory_percent']}%\n" - - elif "CPU" in issue: - cpu_health = health_report.get('cpu_health', {}) - description += f"CPU Usage Percent: {cpu_health['cpu_usage_percent']}%\n" - - elif "Network" in issue: - network_health = health_report.get('network_health', {}) - for network in ['management_network', 'ceph_network']: - if network_health[network]['issues']: - description += f"{network.replace('_', ' ').title()} Issues:\n" - description += "\n".join(network_health[network]['issues']) - description += "\n" - return description + def _create_tickets_for_issues(self, health_report: Dict[str, Any]): issues = self._detect_issues(health_report) if not issues: @@ -435,18 +524,18 @@ class SystemHealthMonitor: 'Offline_Uncorrectable': {'warning': 1, 'critical': 2}, 'Reported_Uncorrect': {'warning': 1, 'critical': 2}, 'Spin_Retry_Count': {'warning': 1, 'critical': 5}, - 'Command_Timeout': {'warning': 5, 'critical': 10}, + # 'Command_Timeout': {'warning': 5, 'critical': 10}, # Removed 'Power_Cycle_Count': {'warning': 5000, 'critical': 10000}, - 'Power_On_Hours': {'warning': 35040, 'critical': 43800}, # ~4-5 years - 'Media_Wearout_Indicator': {'warning': 30, 'critical': 10}, # Percentage remaining + 'Power_On_Hours': {'warning': 61320, 'critical': 70080}, # ~7-8 years + 'Media_Wearout_Indicator': {'warning': 30, 'critical': 10}, 'Temperature_Celsius': {'warning': 65, 'critical': 75}, 'Host_Writes_32MiB': {'warning': 50000000, 'critical': 100000000}, 'Wear_Leveling_Count': {'warning': 50, 'critical': 20}, 'Available_Spare': {'warning': 30, 'critical': 10}, 'Program_Fail_Count': {'warning': 10, 'critical': 20}, 'Erase_Fail_Count': {'warning': 10, 'critical': 20}, - 'Raw_Read_Error_Rate': {'warning': 50, 'critical': 100}, - 'Seek_Error_Rate': {'warning': 50, 'critical': 100}, + # 'Raw_Read_Error_Rate': {'warning': 50, 'critical': 100}, # Removed + # 'Seek_Error_Rate': {'warning': 50, 'critical': 100}, # Removed 'Load_Cycle_Count': {'warning': 300000, 'critical': 600000}, 'SSD_Life_Left': {'warning': 30, 'critical': 10} }