From a74c4c03091bf6ab7ac4b3f804d51346e677ae56 Mon Sep 17 00:00:00 2001 From: Jared Vititoe Date: Tue, 24 Jun 2025 15:14:35 -0400 Subject: [PATCH] Erase_Fail_Count matched two values --- hwmonDaemon.py | 64 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 14 deletions(-) diff --git a/hwmonDaemon.py b/hwmonDaemon.py index d351c6d..b8faa3f 100644 --- a/hwmonDaemon.py +++ b/hwmonDaemon.py @@ -109,7 +109,7 @@ class SystemHealthMonitor: } MANUFACTURER_SMART_PROFILES = { 'Ridata': { - 'aliases': ['Ridata', 'Ritek', 'RIDATA', 'RITEK', 'SSD 512GB'], # Keep the generic model + 'aliases': ['Ridata', 'Ritek', 'RIDATA', 'RITEK', 'SSD 512GB'], 'firmware_patterns': ['HT3618B7', 'HT36'], # Add exact firmware match first 'wear_leveling_behavior': 'countup', 'wear_leveling_baseline': 0, @@ -1226,13 +1226,36 @@ class SystemHealthMonitor: # Parse SMART attributes with manufacturer-specific handling power_on_hours = 0 + # First pass: collect all SMART attributes with priority for _Total versions + smart_attributes_raw = {} + for line in output.split('\n'): # Extract Power_On_Hours first to determine if drive is new if 'Power_On_Hours' in line: parts = line.split() if len(parts) >= 10: power_on_hours = self._parse_smart_value(parts[9]) - smart_health['attributes']['Power_On_Hours'] = power_on_hours + smart_attributes_raw['Power_On_Hours'] = power_on_hours + + # Handle SMART attributes with preference for _Total versions + for attr in ['Erase_Fail_Count', 'Program_Fail_Count']: + # Check for _Total version first (more accurate) + if f'{attr}_Total' in line: + parts = line.split() + if len(parts) >= 10: + raw_value = self._parse_smart_value(parts[9]) + smart_attributes_raw[attr] = raw_value + logger.debug(f"Found {attr}_Total: {raw_value}") + break + # Only use non-_Total version if _Total not found + elif attr in line and f'{attr}_Total' not in smart_attributes_raw: + parts = line.split() + if len(parts) >= 10: + raw_value = self._parse_smart_value(parts[9]) + smart_attributes_raw[attr] = raw_value + logger.debug(f"Found {attr} (non-Total): {raw_value}") + + smart_health['attributes'] = smart_attributes_raw # Check if this is a new drive is_new_drive = self._is_new_drive(power_on_hours) @@ -1255,7 +1278,7 @@ class SystemHealthMonitor: 'SSD_Life_Left': {'warning': 30, 'critical': 10} } - # Parse all SMART attributes + # Parse remaining SMART attributes for line in output.split('\n'): # Handle manufacturer-specific Wear_Leveling_Count if 'Wear_Leveling_Count' in line: @@ -1295,9 +1318,9 @@ class SystemHealthMonitor: smart_health['severity'] = 'WARNING' smart_health['issues'].append(f"Low wear leveling remaining: {raw_value}") - # Handle all other standard SMART attributes + # Handle all other standard SMART attributes (except those already processed) for attr, thresholds in BASE_SMART_THRESHOLDS.items(): - if attr in line and attr != 'Wear_Leveling_Count': # Skip wear leveling as it's handled above + if attr in line and attr not in ['Wear_Leveling_Count', 'Erase_Fail_Count', 'Program_Fail_Count']: parts = line.split() if len(parts) >= 10: raw_value = self._parse_smart_value(parts[9]) @@ -1313,15 +1336,28 @@ class SystemHealthMonitor: smart_health['severity'] = 'WARNING' smart_health['issues'].append(f"High temperature: {raw_value}°C") else: - # Fix: Only trigger alerts if the raw value actually exceeds thresholds - if raw_value > 0: # Only check non-zero values - if raw_value >= thresholds['critical']: - smart_health['severity'] = 'CRITICAL' - smart_health['issues'].append(f"Critical {attr}: {raw_value}") - elif raw_value >= thresholds['warning']: - if smart_health['severity'] != 'CRITICAL': - smart_health['severity'] = 'WARNING' - smart_health['issues'].append(f"Warning {attr}: {raw_value}") + # Only trigger alerts if the raw value actually exceeds thresholds + if raw_value >= thresholds['critical']: + smart_health['severity'] = 'CRITICAL' + smart_health['issues'].append(f"Critical {attr}: {raw_value}") + elif raw_value >= thresholds['warning']: + if smart_health['severity'] != 'CRITICAL': + smart_health['severity'] = 'WARNING' + smart_health['issues'].append(f"Warning {attr}: {raw_value}") + + # Now check the collected Erase_Fail_Count and Program_Fail_Count + for attr in ['Erase_Fail_Count', 'Program_Fail_Count']: + if attr in smart_health['attributes']: + raw_value = smart_health['attributes'][attr] + thresholds = BASE_SMART_THRESHOLDS[attr] + + if raw_value >= thresholds['critical']: + smart_health['severity'] = 'CRITICAL' + smart_health['issues'].append(f"Critical {attr}: {raw_value}") + elif raw_value >= thresholds['warning']: + if smart_health['severity'] != 'CRITICAL': + smart_health['severity'] = 'WARNING' + smart_health['issues'].append(f"Warning {attr}: {raw_value}") # Check for recent SMART errors error_log_pattern = r"Error \d+ occurred at disk power-on lifetime: (\d+) hours"