From a74c4c03091bf6ab7ac4b3f804d51346e677ae56 Mon Sep 17 00:00:00 2001
From: Jared Vititoe <jjvititoe1@gmail.com>
Date: Tue, 24 Jun 2025 15:14:35 -0400
Subject: [PATCH] Erase_Fail_Count matched two values

---
 hwmonDaemon.py | 64 +++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 50 insertions(+), 14 deletions(-)

diff --git a/hwmonDaemon.py b/hwmonDaemon.py
index d351c6d..b8faa3f 100644
--- a/hwmonDaemon.py
+++ b/hwmonDaemon.py
@@ -109,7 +109,7 @@ class SystemHealthMonitor:
     }
     MANUFACTURER_SMART_PROFILES = {
         'Ridata': {
-            'aliases': ['Ridata', 'Ritek', 'RIDATA', 'RITEK', 'SSD 512GB'],  # Keep the generic model
+            'aliases': ['Ridata', 'Ritek', 'RIDATA', 'RITEK', 'SSD 512GB'],
             'firmware_patterns': ['HT3618B7', 'HT36'],  # Add exact firmware match first
             'wear_leveling_behavior': 'countup',
             'wear_leveling_baseline': 0,
@@ -1226,13 +1226,36 @@ class SystemHealthMonitor:
             # Parse SMART attributes with manufacturer-specific handling
             power_on_hours = 0
             
+            # First pass: collect Power_On_Hours and the fail-count attributes, preferring their _Total variants
+            smart_attributes_raw = {}
+            
             for line in output.split('\n'):
                 # Extract Power_On_Hours first to determine if drive is new
                 if 'Power_On_Hours' in line:
                     parts = line.split()
                     if len(parts) >= 10:
                         power_on_hours = self._parse_smart_value(parts[9])
-                        smart_health['attributes']['Power_On_Hours'] = power_on_hours
+                        smart_attributes_raw['Power_On_Hours'] = power_on_hours
+
+                # Handle SMART attributes with preference for _Total versions
+                for attr in ['Erase_Fail_Count', 'Program_Fail_Count']:
+                    # Check for _Total version first (more accurate)
+                    if f'{attr}_Total' in line:
+                        parts = line.split()
+                        if len(parts) >= 10:
+                            raw_value = self._parse_smart_value(parts[9])
+                            smart_attributes_raw[attr] = raw_value
+                            logger.debug(f"Found {attr}_Total: {raw_value}")
+                            break
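+                    # Fall back to the non-_Total line. NOTE(review): the guard below tests a '{attr}_Total' key that is never stored, so a later non-_Total line can overwrite a _Total value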
+                    elif attr in line and f'{attr}_Total' not in smart_attributes_raw:
+                        parts = line.split()
+                        if len(parts) >= 10:
+                            raw_value = self._parse_smart_value(parts[9])
+                            smart_attributes_raw[attr] = raw_value
+                            logger.debug(f"Found {attr} (non-Total): {raw_value}")
+
+            smart_health['attributes'] = smart_attributes_raw
 
             # Check if this is a new drive
             is_new_drive = self._is_new_drive(power_on_hours)
@@ -1255,7 +1278,7 @@ class SystemHealthMonitor:
                 'SSD_Life_Left': {'warning': 30, 'critical': 10}
             }
 
-            # Parse all SMART attributes
+            # Parse remaining SMART attributes
             for line in output.split('\n'):
                 # Handle manufacturer-specific Wear_Leveling_Count
                 if 'Wear_Leveling_Count' in line:
@@ -1295,9 +1318,9 @@ class SystemHealthMonitor:
                                         smart_health['severity'] = 'WARNING'
                                     smart_health['issues'].append(f"Low wear leveling remaining: {raw_value}")
 
-                # Handle all other standard SMART attributes
+                # Handle all other standard SMART attributes (except those already processed)
                 for attr, thresholds in BASE_SMART_THRESHOLDS.items():
-                    if attr in line and attr != 'Wear_Leveling_Count':  # Skip wear leveling as it's handled above
+                    if attr in line and attr not in ['Wear_Leveling_Count', 'Erase_Fail_Count', 'Program_Fail_Count']:
                         parts = line.split()
                         if len(parts) >= 10:
                             raw_value = self._parse_smart_value(parts[9])
@@ -1313,15 +1336,28 @@ class SystemHealthMonitor:
                                         smart_health['severity'] = 'WARNING'
                                     smart_health['issues'].append(f"High temperature: {raw_value}°C")
                             else:
-                                # Fix: Only trigger alerts if the raw value actually exceeds thresholds
-                                if raw_value > 0:  # Only check non-zero values
-                                    if raw_value >= thresholds['critical']:
-                                        smart_health['severity'] = 'CRITICAL'
-                                        smart_health['issues'].append(f"Critical {attr}: {raw_value}")
-                                    elif raw_value >= thresholds['warning']:
-                                        if smart_health['severity'] != 'CRITICAL':
-                                            smart_health['severity'] = 'WARNING'
-                                        smart_health['issues'].append(f"Warning {attr}: {raw_value}")
+                                # Only trigger alerts when the raw value meets or exceeds a threshold
+                                if raw_value >= thresholds['critical']:
+                                    smart_health['severity'] = 'CRITICAL'
+                                    smart_health['issues'].append(f"Critical {attr}: {raw_value}")
+                                elif raw_value >= thresholds['warning']:
+                                    if smart_health['severity'] != 'CRITICAL':
+                                        smart_health['severity'] = 'WARNING'
+                                    smart_health['issues'].append(f"Warning {attr}: {raw_value}")
+
+            # Now check the collected Erase_Fail_Count and Program_Fail_Count
+            for attr in ['Erase_Fail_Count', 'Program_Fail_Count']:
+                if attr in smart_health['attributes']:
+                    raw_value = smart_health['attributes'][attr]
+                    thresholds = BASE_SMART_THRESHOLDS[attr]
+                    
+                    if raw_value >= thresholds['critical']:
+                        smart_health['severity'] = 'CRITICAL'
+                        smart_health['issues'].append(f"Critical {attr}: {raw_value}")
+                    elif raw_value >= thresholds['warning']:
+                        if smart_health['severity'] != 'CRITICAL':
+                            smart_health['severity'] = 'WARNING'
+                        smart_health['issues'].append(f"Warning {attr}: {raw_value}")
 
             # Check for recent SMART errors
             error_log_pattern = r"Error \d+ occurred at disk power-on lifetime: (\d+) hours"