Add more detailed ticket description and fix thresholds

2025-03-03 18:34:58 -05:00
parent 0507203140
commit b969f8c0e4

@@ -133,13 +133,7 @@ class SystemHealthMonitor:
        return health_report
    def _generate_detailed_description(self, issue: str, health_report: Dict[str, Any]) -> str:
        """
        Generate a detailed description for the issue based on the health report.
        :param issue: The issue description.
        :param health_report: The comprehensive health report from the checks.
        :return: A detailed description for the issue.
        """
        # Keep existing banner and initial description
        banner = """
=================================================================
AUTOMATED TICKET - Generated by Hardware Monitoring Service (hwmonDaemon)
@@ -151,9 +145,137 @@ class SystemHealthMonitor:
        description = banner + issue + "\n\n"
        # Add issue explanation section
        description += "=== Issue Details ===\n"
        # Add SMART attribute explanations
        SMART_DESCRIPTIONS = {
            'Reallocated_Sector_Ct': """
            Number of sectors that have been reallocated due to errors.
            - High counts indicate degrading media
            - Each reallocation uses one of the drive's limited spare sectors
            - Rapid increases suggest accelerating drive wear
            """,
            'Current_Pending_Sector': """
            Sectors waiting to be reallocated due to read/write errors.
            - Indicates potentially unstable sectors
            - May result in data loss if unrecoverable
            - Should be monitored for increases
            """,
            'Offline_Uncorrectable': """
            Count of uncorrectable errors detected during offline data collection.
            - Direct indicator of media reliability issues
            - May affect data integrity
            - High values suggest drive replacement needed
            """,
            'Reported_Uncorrect': """
            Number of errors that could not be recovered using hardware ECC.
            - Critical indicator of drive health
            - Directly impacts data reliability
            - Any non-zero value requires attention
            """,
            'Spin_Retry_Count': """
            Number of spin start retry attempts.
            - Indicates potential motor or bearing issues
            - May predict imminent mechanical failure
            - Increasing values suggest degrading drive health
            """,
            'Power_On_Hours': """
            Total number of hours the device has been powered on.
            - Normal aging metric
            - Used to gauge overall drive lifetime
            - Compare against manufacturer's MTBF rating
            """,
            'Media_Wearout_Indicator': """
            Percentage of drive's rated life remaining (SSDs).
            - 100 indicates a new drive
            - 0 indicates exceeded rated writes
            - Critical for SSD lifecycle management
            """,
            'Temperature_Celsius': """
            Current drive temperature.
            - High temperatures accelerate wear
            - Optimal range: 20-45°C
            - Sustained high temps reduce lifespan
            """,
            'Available_Spare': """
            Percentage of spare blocks remaining (SSDs).
            - Critical for SSD endurance
            - Low values indicate approaching end-of-life
            - Rapid decreases suggest excessive writes
            """,
            'Program_Fail_Count': """
            Number of flash program operation failures.
            - Indicates NAND cell reliability
            - Important for SSD health assessment
            - Increasing values suggest flash degradation
            """,
            'Erase_Fail_Count': """
            Number of flash erase operation failures.
            - Related to NAND block health
            - Critical for SSD reliability
            - High counts suggest failing flash blocks
            """
        }
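        # NOTE (editorial illustration, not part of this commit): the dictionary
        # above is presumably consumed later in this method; a minimal sketch of
        # that lookup, assuming issue strings mention attributes by their SMART
        # names:
        #
        #     for attr_name, attr_text in SMART_DESCRIPTIONS.items():
        #         if attr_name in issue:
        #             description += f"\n{attr_name}:{attr_text}"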
if "SMART" in issue:
description += """
SMART (Self-Monitoring, Analysis, and Reporting Technology) issues indicate potential drive reliability problems.
- Reallocated sectors indicate bad blocks that have been remapped
- Pending sectors are potentially failing blocks waiting to be remapped
- Uncorrectable errors indicate data that could not be read
"""
if "Temperature" in issue:
description += """
High drive temperatures can:
- Reduce drive lifespan
- Cause performance degradation
- Lead to data corruption in extreme cases
Optimal temperature range: 20-45°C
"""
if "ECC" in issue:
description += """
ECC (Error Correction Code) Memory Issues:
- Correctable: Memory errors that were successfully fixed
- Uncorrectable: Serious memory errors that could not be corrected
Frequent ECC corrections may indicate degrading memory modules
"""
if "CPU" in issue:
description += """
High CPU usage sustained over time can indicate:
- Resource constraints
- Runaway processes
- Need for performance optimization
- Potential cooling issues
"""
if "Network" in issue:
description += """
Network connectivity issues can impact:
- Cluster communication
- Data replication
- Service availability
- Management access
"""
        # Keep existing detailed metrics section
        if "Disk" in issue:
            for partition in health_report.get('drives_health', {}).get('drives', []):
                if partition.get('mountpoint') in issue:
                    description += f"\n=== Disk Metrics ===\n"
                    description += f"Disk Device: {partition['device']}\n"
                    description += f"Mount Point: {partition['mountpoint']}\n"
                    description += f"Total Space: {partition['total_space']}\n"
@@ -161,41 +283,8 @@ class SystemHealthMonitor:
description += f"Free Space: {partition['free_space']}\n"
description += f"Usage Percent: {partition['usage_percent']}%\n"
if partition.get('smart_status') == 'UNHEALTHY':
try:
# Get additional disk information using smartctl
result = subprocess.run(
['smartctl', '-a', partition['device']],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True
)
output = result.stdout + result.stderr
description += "\nSMART Information:\n"
description += output
except Exception as e:
description += f"Error getting SMART information: {str(e)}\n"
break
elif "Memory" in issue:
memory_health = health_report.get('memory_health', {})
description += f"Total Memory: {memory_health['total_memory']}\n"
description += f"Used Memory: {memory_health['used_memory']}\n"
description += f"Memory Usage Percent: {memory_health['memory_percent']}%\n"
elif "CPU" in issue:
cpu_health = health_report.get('cpu_health', {})
description += f"CPU Usage Percent: {cpu_health['cpu_usage_percent']}%\n"
elif "Network" in issue:
network_health = health_report.get('network_health', {})
for network in ['management_network', 'ceph_network']:
if network_health[network]['issues']:
description += f"{network.replace('_', ' ').title()} Issues:\n"
description += "\n".join(network_health[network]['issues'])
description += "\n"
return description
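    # NOTE (editorial illustration, not part of this commit): the raw smartctl
    # text captured above could also be parsed structurally; smartmontools 7.0+
    # supports JSON output via the -j flag. A minimal sketch, assuming an ATA
    # device and smartctl >= 7.0:
    #
    #     import json
    #     result = subprocess.run(['smartctl', '-a', '-j', device],
    #                             stdout=subprocess.PIPE, stderr=subprocess.PIPE,
    #                             text=True)
    #     data = json.loads(result.stdout)
    #     table = data.get('ata_smart_attributes', {}).get('table', [])
    #     raw_values = {row['name']: row['raw']['value'] for row in table}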
    def _create_tickets_for_issues(self, health_report: Dict[str, Any]):
        issues = self._detect_issues(health_report)
        if not issues:
@@ -435,18 +524,18 @@ class SystemHealthMonitor:
            'Offline_Uncorrectable': {'warning': 1, 'critical': 2},
            'Reported_Uncorrect': {'warning': 1, 'critical': 2},
            'Spin_Retry_Count': {'warning': 1, 'critical': 5},
            'Command_Timeout': {'warning': 5, 'critical': 10},
            # 'Command_Timeout': {'warning': 5, 'critical': 10}, # Removed
            'Power_Cycle_Count': {'warning': 5000, 'critical': 10000},
            'Power_On_Hours': {'warning': 35040, 'critical': 43800}, # ~4-5 years
            'Media_Wearout_Indicator': {'warning': 30, 'critical': 10}, # Percentage remaining
            'Power_On_Hours': {'warning': 61320, 'critical': 70080}, # ~7-8 years
            'Media_Wearout_Indicator': {'warning': 30, 'critical': 10},
            'Temperature_Celsius': {'warning': 65, 'critical': 75},
            'Host_Writes_32MiB': {'warning': 50000000, 'critical': 100000000},
            'Wear_Leveling_Count': {'warning': 50, 'critical': 20},
            'Available_Spare': {'warning': 30, 'critical': 10},
            'Program_Fail_Count': {'warning': 10, 'critical': 20},
            'Erase_Fail_Count': {'warning': 10, 'critical': 20},
            'Raw_Read_Error_Rate': {'warning': 50, 'critical': 100},
            'Seek_Error_Rate': {'warning': 50, 'critical': 100},
            # 'Raw_Read_Error_Rate': {'warning': 50, 'critical': 100}, # Removed
            # 'Seek_Error_Rate': {'warning': 50, 'critical': 100}, # Removed
            'Load_Cycle_Count': {'warning': 300000, 'critical': 600000},
            'SSD_Life_Left': {'warning': 30, 'critical': 10}
        }
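
For context, a minimal editorial sketch (not code from this commit) of how thresholds like these are typically applied: compare an attribute's raw value against the warning/critical levels, inverting the comparison for percent-remaining attributes (Media_Wearout_Indicator, Available_Spare, Wear_Leveling_Count, SSD_Life_Left), where lower is worse. The function name and structure below are illustrative assumptions, not taken from this repository.

    # Illustrative only -- the real evaluation logic is not shown in this diff.
    LOWER_IS_WORSE = {'Media_Wearout_Indicator', 'Available_Spare',
                      'Wear_Leveling_Count', 'SSD_Life_Left'}

    def classify_attribute(name, raw_value, thresholds):
        """Return 'critical', 'warning', or 'ok' for one SMART attribute."""
        limits = thresholds.get(name)
        if limits is None:
            return 'ok'
        if name in LOWER_IS_WORSE:
            # Percent-remaining metrics degrade downward.
            if raw_value <= limits['critical']:
                return 'critical'
            if raw_value <= limits['warning']:
                return 'warning'
        elif raw_value >= limits['critical']:
            return 'critical'
        elif raw_value >= limits['warning']:
            return 'warning'
        return 'ok'

Under the new limits, classify_attribute('Power_On_Hours', 65000, thresholds) would return 'warning' (65000 is above 61320 but below 70080), while classify_attribute('Available_Spare', 8, thresholds) would return 'critical' (8 is at or below the critical floor of 10).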