More detailed ticket description and fix thresholds
 hwmonDaemon.py | 181
@@ -133,13 +133,7 @@ class SystemHealthMonitor:
         return health_report
 
     def _generate_detailed_description(self, issue: str, health_report: Dict[str, Any]) -> str:
-        """
-        Generate a detailed description for the issue based on the health report.
-
-        :param issue: The issue description.
-        :param health_report: The comprehensive health report from the checks.
-        :return: A detailed description for the issue.
-        """
+        # Keep existing banner and initial description
         banner = """
         =================================================================
         AUTOMATED TICKET - Generated by Hardware Monitoring Service (hwmonDaemon)
@@ -151,9 +145,137 @@ class SystemHealthMonitor:
 
         description = banner + issue + "\n\n"
 
+        # Add issue explanation section
+        description += "=== Issue Details ===\n"
+
+        # Add SMART attribute explanations
+        SMART_DESCRIPTIONS = {
+            'Reallocated_Sector_Ct': """
+            Number of sectors that have been reallocated due to errors.
+            - High counts indicate degrading media
+            - Each reallocation uses one of the drive's limited spare sectors
+            - Rapid increases suggest accelerating drive wear
+            """,
+
+            'Current_Pending_Sector': """
+            Sectors waiting to be reallocated due to read/write errors.
+            - Indicates potentially unstable sectors
+            - May result in data loss if unrecoverable
+            - Should be monitored for increases
+            """,
+
+            'Offline_Uncorrectable': """
+            Count of uncorrectable errors detected during offline data collection.
+            - Direct indicator of media reliability issues
+            - May affect data integrity
+            - High values suggest drive replacement needed
+            """,
+
+            'Reported_Uncorrect': """
+            Number of errors that could not be recovered using hardware ECC.
+            - Critical indicator of drive health
+            - Directly impacts data reliability
+            - Any non-zero value requires attention
+            """,
+
+            'Spin_Retry_Count': """
+            Number of spin start retry attempts.
+            - Indicates potential motor or bearing issues
+            - May predict imminent mechanical failure
+            - Increasing values suggest degrading drive health
+            """,
+
+            'Power_On_Hours': """
+            Total number of hours the device has been powered on.
+            - Normal aging metric
+            - Used to gauge overall drive lifetime
+            - Compare against manufacturer's MTBF rating
+            """,
+
+            'Media_Wearout_Indicator': """
+            Percentage of drive's rated life remaining (SSDs).
+            - 100 indicates new drive
+            - 0 indicates exceeded rated writes
+            - Critical for SSD lifecycle management
+            """,
+
+            'Temperature_Celsius': """
+            Current drive temperature.
+            - High temperatures accelerate wear
+            - Optimal range: 20-45°C
+            - Sustained high temps reduce lifespan
+            """,
+
+            'Available_Spare': """
+            Percentage of spare blocks remaining (SSDs).
+            - Critical for SSD endurance
+            - Low values indicate approaching end-of-life
+            - Rapid decreases suggest excessive writes
+            """,
+
+            'Program_Fail_Count': """
+            Number of flash program operation failures.
+            - Indicates NAND cell reliability
+            - Important for SSD health assessment
+            - Increasing values suggest flash degradation
+            """,
+
+            'Erase_Fail_Count': """
+            Number of flash erase operation failures.
+            - Related to NAND block health
+            - Critical for SSD reliability
+            - High counts suggest failing flash blocks
+            """
+        }
+
+        if "SMART" in issue:
+            description += """
+            SMART (Self-Monitoring, Analysis, and Reporting Technology) issues indicate potential drive reliability problems.
+            - Reallocated sectors indicate bad blocks that have been remapped
+            - Pending sectors are potentially failing blocks waiting to be remapped
+            - Uncorrectable errors indicate data that could not be read
+            """
+
+        if "Temperature" in issue:
+            description += """
+            High drive temperatures can:
+            - Reduce drive lifespan
+            - Cause performance degradation
+            - Lead to data corruption in extreme cases
+            Optimal temperature range: 20-45°C
+            """
+
+        if "ECC" in issue:
+            description += """
+            ECC (Error Correction Code) Memory Issues:
+            - Correctable: Memory errors that were successfully fixed
+            - Uncorrectable: Serious memory errors that could not be corrected
+            Frequent ECC corrections may indicate degrading memory modules
+            """
+
+        if "CPU" in issue:
+            description += """
+            High CPU usage sustained over time can indicate:
+            - Resource constraints
+            - Runaway processes
+            - Need for performance optimization
+            - Potential cooling issues
+            """
+
+        if "Network" in issue:
+            description += """
+            Network connectivity issues can impact:
+            - Cluster communication
+            - Data replication
+            - Service availability
+            - Management access
+            """
+
+        # Keep existing detailed metrics section
         if "Disk" in issue:
             for partition in health_report.get('drives_health', {}).get('drives', []):
                 if partition.get('mountpoint') in issue:
+                    description += f"\n=== Disk Metrics ===\n"
                     description += f"Disk Device: {partition['device']}\n"
                     description += f"Mount Point: {partition['mountpoint']}\n"
                     description += f"Total Space: {partition['total_space']}\n"
@@ -161,41 +283,8 @@ class SystemHealthMonitor:
                     description += f"Free Space: {partition['free_space']}\n"
                     description += f"Usage Percent: {partition['usage_percent']}%\n"
 
-                    if partition.get('smart_status') == 'UNHEALTHY':
-                        try:
-                            # Get additional disk information using smartctl
-                            result = subprocess.run(
-                                ['smartctl', '-a', partition['device']],
-                                stdout=subprocess.PIPE,
-                                stderr=subprocess.PIPE,
-                                text=True
-                            )
-                            output = result.stdout + result.stderr
-                            description += "\nSMART Information:\n"
-                            description += output
-                        except Exception as e:
-                            description += f"Error getting SMART information: {str(e)}\n"
-                    break
-
-        elif "Memory" in issue:
-            memory_health = health_report.get('memory_health', {})
-            description += f"Total Memory: {memory_health['total_memory']}\n"
-            description += f"Used Memory: {memory_health['used_memory']}\n"
-            description += f"Memory Usage Percent: {memory_health['memory_percent']}%\n"
-
-        elif "CPU" in issue:
-            cpu_health = health_report.get('cpu_health', {})
-            description += f"CPU Usage Percent: {cpu_health['cpu_usage_percent']}%\n"
-
-        elif "Network" in issue:
-            network_health = health_report.get('network_health', {})
-            for network in ['management_network', 'ceph_network']:
-                if network_health[network]['issues']:
-                    description += f"{network.replace('_', ' ').title()} Issues:\n"
-                    description += "\n".join(network_health[network]['issues'])
-                    description += "\n"
         return description
 
     def _create_tickets_for_issues(self, health_report: Dict[str, Any]):
        issues = self._detect_issues(health_report)
        if not issues:
@@ -435,18 +524,18 @@ class SystemHealthMonitor:
             'Offline_Uncorrectable': {'warning': 1, 'critical': 2},
             'Reported_Uncorrect': {'warning': 1, 'critical': 2},
             'Spin_Retry_Count': {'warning': 1, 'critical': 5},
-            'Command_Timeout': {'warning': 5, 'critical': 10},
+            # 'Command_Timeout': {'warning': 5, 'critical': 10}, # Removed
             'Power_Cycle_Count': {'warning': 5000, 'critical': 10000},
-            'Power_On_Hours': {'warning': 35040, 'critical': 43800}, # ~4-5 years
-            'Media_Wearout_Indicator': {'warning': 30, 'critical': 10}, # Percentage remaining
+            'Power_On_Hours': {'warning': 61320, 'critical': 70080}, # ~7-8 years
+            'Media_Wearout_Indicator': {'warning': 30, 'critical': 10},
             'Temperature_Celsius': {'warning': 65, 'critical': 75},
             'Host_Writes_32MiB': {'warning': 50000000, 'critical': 100000000},
             'Wear_Leveling_Count': {'warning': 50, 'critical': 20},
             'Available_Spare': {'warning': 30, 'critical': 10},
             'Program_Fail_Count': {'warning': 10, 'critical': 20},
             'Erase_Fail_Count': {'warning': 10, 'critical': 20},
-            'Raw_Read_Error_Rate': {'warning': 50, 'critical': 100},
-            'Seek_Error_Rate': {'warning': 50, 'critical': 100},
+            # 'Raw_Read_Error_Rate': {'warning': 50, 'critical': 100}, # Removed
+            # 'Seek_Error_Rate': {'warning': 50, 'critical': 100}, # Removed
             'Load_Cycle_Count': {'warning': 300000, 'critical': 600000},
             'SSD_Life_Left': {'warning': 30, 'critical': 10}
         }
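A note on the pattern the new description code relies on: SMART_DESCRIPTIONS maps an attribute name to explanatory text, and the per-issue blocks append explanation text to the ticket description based on substring checks against the issue string. The sketch below shows how such a mapping can be folded into a description in that style; it is an illustration under assumptions rather than the daemon's actual wiring, and the trimmed mapping and example issue string are hypothetical.

# Sketch only: a trimmed stand-in for the SMART_DESCRIPTIONS mapping above.
SMART_DESCRIPTIONS = {
    'Reallocated_Sector_Ct': "Number of sectors that have been reallocated due to errors.",
    'Current_Pending_Sector': "Sectors waiting to be reallocated due to read/write errors.",
}

def explain_smart_attributes(issue: str, descriptions: dict) -> str:
    # Append an explanation for every SMART attribute named in the issue text.
    explanation = ""
    for attribute, text in descriptions.items():
        if attribute in issue:
            explanation += f"{attribute}: {text}\n"
    return explanation

# Hypothetical issue string, for illustration only.
print(explain_smart_attributes(
    "SMART warning on /dev/sda: Reallocated_Sector_Ct above threshold",
    SMART_DESCRIPTIONS,
))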
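On the updated thresholds: each attribute carries a warning and a critical limit, but the comparison direction depends on the attribute. For error and usage counters such as Program_Fail_Count or Power_On_Hours, higher raw values are worse; for remaining-life attributes such as Available_Spare, Media_Wearout_Indicator and SSD_Life_Left, where the warning limit (30) sits above the critical limit (10), lower values are worse. A minimal sketch of evaluating such a table, with classify_attribute as a hypothetical helper and not code from hwmonDaemon.py:

# Sketch only: classify a raw SMART value against warning/critical limits.
# The entries below copy a few thresholds from the table in this commit.
SMART_THRESHOLDS = {
    'Power_On_Hours': {'warning': 61320, 'critical': 70080},   # higher is worse
    'Program_Fail_Count': {'warning': 10, 'critical': 20},     # higher is worse
    'Available_Spare': {'warning': 30, 'critical': 10},        # lower is worse
    'SSD_Life_Left': {'warning': 30, 'critical': 10},          # lower is worse
}

def classify_attribute(name: str, value: float) -> str:
    limits = SMART_THRESHOLDS.get(name)
    if limits is None:
        return 'UNKNOWN'
    # When the warning limit sits above the critical limit, lower values are worse.
    lower_is_worse = limits['warning'] > limits['critical']
    if lower_is_worse:
        if value <= limits['critical']:
            return 'CRITICAL'
        if value <= limits['warning']:
            return 'WARNING'
    else:
        if value >= limits['critical']:
            return 'CRITICAL'
        if value >= limits['warning']:
            return 'WARNING'
    return 'OK'

print(classify_attribute('Available_Spare', 8))     # CRITICAL
print(classify_attribute('Power_On_Hours', 65000))  # WARNING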