Fixed thresholds for thermals and SMART

2025-09-03 12:58:30 -04:00
parent bc73a691df
commit 2d6626cece
1 changed files with 24 additions and 10 deletions
--- a/hwmonDaemon.py
+++ b/hwmonDaemon.py
@@ -144,6 +144,15 @@ class SystemHealthMonitor:
                    'monitor': False,  # Skip monitoring entirely
                    'description': 'Operation counter, not actual failures - IGNORED'
                },
+                # The regular Erase_Fail_Count is also an operation counter on Ridata drives
+                'Erase_Fail_Count': {
+                    'monitor': False,  # Skip monitoring entirely for Ridata
+                    'description': 'Operation counter for Ridata drives, not actual failures - IGNORED'
+                },
+                'Program_Fail_Count': {
+                    'monitor': False,  # Skip monitoring entirely for Ridata
+                    'description': 'Operation counter for Ridata drives, not actual failures - IGNORED'
+                },
                # These are the REAL failure counters - monitor with standard thresholds
                'Program_Fail_Cnt_Total': {
                    'monitor': True,
@@ -578,11 +587,11 @@ class SystemHealthMonitor:
        if temperature is None:
            return issues
        
-        # Drive-type specific temperature thresholds
+        # Drive-type specific temperature thresholds - ADJUSTED TO BE LESS SENSITIVE
        if drive_type == 'SSD':
-            temp_thresholds = {'warning': 70, 'critical': 85, 'optimal_max': 60}
+            temp_thresholds = {'warning': 70, 'critical': 85, 'optimal_max': 65}  # Raised from 60
        else:  # HDD
-            temp_thresholds = {'warning': 55, 'critical': 65, 'optimal_max': 45}
+            temp_thresholds = {'warning': 60, 'critical': 70, 'optimal_max': 55}  # Raised from 45/55/65
        
        if temperature >= temp_thresholds['critical']:
            issues.append(f"CRITICAL: Drive temperature {temperature}°C exceeds safe operating limit for {drive_type}")
@@ -1519,7 +1528,7 @@ class SystemHealthMonitor:
                    'behavior': attr_config.get('behavior', 'countup')
                }
        
-        # Enhanced BASE_SMART_THRESHOLDS with additional attributes
+        # Enhanced BASE_SMART_THRESHOLDS with manufacturer-specific handling
        BASE_SMART_THRESHOLDS = {
            'Reallocated_Sector_Ct': {'warning': 5, 'critical': 10},
            'Current_Pending_Sector': {'warning': 1, 'critical': 5},
@@ -1536,10 +1545,10 @@ class SystemHealthMonitor:
            'SSD_Life_Left': {'warning': 30, 'critical': 10},
            'Program_Fail_Cnt_Total': {'warning': 1, 'critical': 5},
            'Erase_Fail_Count_Total': {'warning': 1, 'critical': 5},
-            # Enhanced SMART attributes for better failure detection
-            'Raw_Read_Error_Rate': {'warning': 100000, 'critical': 1000000},
-            'Seek_Error_Rate': {'warning': 100000, 'critical': 1000000},
-            'Command_Timeout': {'warning': 1, 'critical': 5},
+            # ADJUSTED: More lenient thresholds for error rates on unknown drives
+            'Raw_Read_Error_Rate': {'warning': 10000000, 'critical': 100000000},  # Raised significantly
+            'Seek_Error_Rate': {'warning': 10000000, 'critical': 100000000},     # Raised significantly  
+            'Command_Timeout': {'warning': 100, 'critical': 1000},               # Raised significantly
            'High_Fly_Writes': {'warning': 1, 'critical': 5},
            'Airflow_Temperature_Cel': {'warning': 65, 'critical': 75},
            'G_Sense_Error_Rate': {'warning': 100, 'critical': 1000},
@@ -1658,11 +1667,16 @@ class SystemHealthMonitor:
                        parts = line.split()
                        if len(parts) >= 10:
                            raw_value = self._parse_smart_value(parts[9])
-                            smart_attributes_raw[attr] = raw_value
+                            smart_attributes_raw[f'{attr}_Total'] = raw_value  # Store as _Total
                            logger.debug(f"Found {attr}_Total: {raw_value}")
                            break
-                    # Only use non-_Total version if _Total not found
+                    # Only use non-_Total version if _Total not found AND not Ridata
                    elif attr in line and f'{attr}_Total' not in smart_attributes_raw:
+                        # Check if this is a Ridata drive and should skip regular counters
+                        if manufacturer_profile and manufacturer_profile.get('aliases', [{}])[0] == 'Ridata':
+                            logger.debug(f"Skipping {attr} for Ridata drive - using _Total version only")
+                            continue
+                            
                        parts = line.split()
                        if len(parts) >= 10:
                            raw_value = self._parse_smart_value(parts[9])