Erase_Fail_Count matched two values

2025-06-24 15:14:35 -04:00
parent 9a700e9853
commit a74c4c0309
1 changed file with 50 additions and 14 deletions
--- a/hwmonDaemon.py
+++ b/hwmonDaemon.py
@@ -109,7 +109,7 @@ class SystemHealthMonitor:
    }
    MANUFACTURER_SMART_PROFILES = {
        'Ridata': {
-            'aliases': ['Ridata', 'Ritek', 'RIDATA', 'RITEK', 'SSD 512GB'],  # Keep the generic model
+            'aliases': ['Ridata', 'Ritek', 'RIDATA', 'RITEK', 'SSD 512GB'],
            'firmware_patterns': ['HT3618B7', 'HT36'],  # Add exact firmware match first
            'wear_leveling_behavior': 'countup',
            'wear_leveling_baseline': 0,
@@ -1226,13 +1226,36 @@ class SystemHealthMonitor:
            # Parse SMART attributes with manufacturer-specific handling
            power_on_hours = 0
            
+            # First pass: collect Power_On_Hours plus Erase_Fail_Count/Program_Fail_Count, preferring their _Total variants
+            smart_attributes_raw = {}
+            
            for line in output.split('\n'):
                # Extract Power_On_Hours first to determine if drive is new
                if 'Power_On_Hours' in line:
                    parts = line.split()
                    if len(parts) >= 10:
                        power_on_hours = self._parse_smart_value(parts[9])
-                        smart_health['attributes']['Power_On_Hours'] = power_on_hours
+                        smart_attributes_raw['Power_On_Hours'] = power_on_hours
+
+                # Handle SMART attributes with preference for _Total versions
+                for attr in ['Erase_Fail_Count', 'Program_Fail_Count']:
+                    # Check for _Total version first (more accurate)
+                    if f'{attr}_Total' in line:
+                        parts = line.split()
+                        if len(parts) >= 10:
+                            raw_value = self._parse_smart_value(parts[9])
+                            smart_attributes_raw[attr] = raw_value
+                            logger.debug(f"Found {attr}_Total: {raw_value}")
+                            break
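+                    # Intended: use the non-_Total version only if _Total was not found. NOTE(review): values are stored under attr, never f'{attr}_Total', so the guard below is always true and a later non-_Total line can overwrite a _Total value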
+                    elif attr in line and f'{attr}_Total' not in smart_attributes_raw:
+                        parts = line.split()
+                        if len(parts) >= 10:
+                            raw_value = self._parse_smart_value(parts[9])
+                            smart_attributes_raw[attr] = raw_value
+                            logger.debug(f"Found {attr} (non-Total): {raw_value}")
+
+            smart_health['attributes'] = smart_attributes_raw

            # Check if this is a new drive
            is_new_drive = self._is_new_drive(power_on_hours)
@@ -1255,7 +1278,7 @@ class SystemHealthMonitor:
                'SSD_Life_Left': {'warning': 30, 'critical': 10}
            }

-            # Parse all SMART attributes
+            # Parse remaining SMART attributes
            for line in output.split('\n'):
                # Handle manufacturer-specific Wear_Leveling_Count
                if 'Wear_Leveling_Count' in line:
@@ -1295,9 +1318,9 @@ class SystemHealthMonitor:
                                        smart_health['severity'] = 'WARNING'
                                    smart_health['issues'].append(f"Low wear leveling remaining: {raw_value}")

-                # Handle all other standard SMART attributes
+                # Handle all other standard SMART attributes (except those already processed)
                for attr, thresholds in BASE_SMART_THRESHOLDS.items():
-                    if attr in line and attr != 'Wear_Leveling_Count':  # Skip wear leveling as it's handled above
+                    if attr in line and attr not in ['Wear_Leveling_Count', 'Erase_Fail_Count', 'Program_Fail_Count']:
                        parts = line.split()
                        if len(parts) >= 10:
                            raw_value = self._parse_smart_value(parts[9])
@@ -1313,15 +1336,28 @@ class SystemHealthMonitor:
                                        smart_health['severity'] = 'WARNING'
                                    smart_health['issues'].append(f"High temperature: {raw_value}°C")
                            else:
-                                # Fix: Only trigger alerts if the raw value actually exceeds thresholds
-                                if raw_value > 0:  # Only check non-zero values
-                                    if raw_value >= thresholds['critical']:
-                                        smart_health['severity'] = 'CRITICAL'
-                                        smart_health['issues'].append(f"Critical {attr}: {raw_value}")
-                                    elif raw_value >= thresholds['warning']:
-                                        if smart_health['severity'] != 'CRITICAL':
-                                            smart_health['severity'] = 'WARNING'
-                                        smart_health['issues'].append(f"Warning {attr}: {raw_value}")
+                                # Only trigger alerts if the raw value meets or exceeds a threshold
+                                if raw_value >= thresholds['critical']:
+                                    smart_health['severity'] = 'CRITICAL'
+                                    smart_health['issues'].append(f"Critical {attr}: {raw_value}")
+                                elif raw_value >= thresholds['warning']:
+                                    if smart_health['severity'] != 'CRITICAL':
+                                        smart_health['severity'] = 'WARNING'
+                                    smart_health['issues'].append(f"Warning {attr}: {raw_value}")
+
+            # Now check the collected Erase_Fail_Count and Program_Fail_Count
+            for attr in ['Erase_Fail_Count', 'Program_Fail_Count']:
+                if attr in smart_health['attributes']:
+                    raw_value = smart_health['attributes'][attr]
+                    thresholds = BASE_SMART_THRESHOLDS[attr]
+                    
+                    if raw_value >= thresholds['critical']:
+                        smart_health['severity'] = 'CRITICAL'
+                        smart_health['issues'].append(f"Critical {attr}: {raw_value}")
+                    elif raw_value >= thresholds['warning']:
+                        if smart_health['severity'] != 'CRITICAL':
+                            smart_health['severity'] = 'WARNING'
+                        smart_health['issues'].append(f"Warning {attr}: {raw_value}")

            # Check for recent SMART errors
            error_log_pattern = r"Error \d+ occurred at disk power-on lifetime: (\d+) hours"