Huge update to vendor profiles

2025-07-24 19:15:21 -04:00
parent a74c4c0309
commit 0faf7654d6
1 changed files with 163 additions and 29 deletions
--- a/hwmonDaemon.py
+++ b/hwmonDaemon.py
@@ -110,21 +110,48 @@ class SystemHealthMonitor:
    MANUFACTURER_SMART_PROFILES = {
        'Ridata': {
            'aliases': ['Ridata', 'Ritek', 'RIDATA', 'RITEK', 'SSD 512GB'],
-            'firmware_patterns': ['HT3618B7', 'HT36'],  # Add exact firmware match first
+            'firmware_patterns': ['HT3618B7', 'HT36'],
            'wear_leveling_behavior': 'countup',
            'wear_leveling_baseline': 0,
            'wear_leveling_thresholds': {
-                'warning': 1000000,   # Increase threshold significantly
-                'critical': 2000000   # Very high threshold for countup behavior
+                'warning': 1000000000,    # 1 billion - very conservative
+                'critical': 2000000000    # 2 billion - extremely conservative
            },
            'attributes': {
                'Wear_Leveling_Count': {
                    'behavior': 'countup',
                    'baseline': 0,
-                    'warning_threshold': 1000000,    # Much higher threshold
-                    'critical_threshold': 2000000,   # Very high threshold
-                    'description': 'Total wear leveling operations performed (countup from 0)',
-                    'ignore_on_new_drive': True      # Don't alert on new drives
+                    'warning_threshold': 1000000000,
+                    'critical_threshold': 2000000000,
+                    'description': 'Total wear leveling operations (countup from 0)',
+                    'ignore_on_new_drive': False,
+                    'monitor': True  # Include in health checks
+                },
+                # These are operation counters, NOT actual failures - ignore completely
+                'Erase_Fail_Count_Chip': {
+                    'monitor': False,  # Skip monitoring entirely
+                    'description': 'Operation counter, not actual failures - IGNORED'
+                },
+                'Program_Fail_Count_Chip': {
+                    'monitor': False,  # Skip monitoring entirely
+                    'description': 'Operation counter, not actual failures - IGNORED'
+                },
+                # These are the REAL failure counters - monitor with standard thresholds
+                'Program_Fail_Cnt_Total': {
+                    'monitor': True,
+                    'behavior': 'countup',
+                    'baseline': 0,
+                    'warning_threshold': 1,      # Any failures are concerning
+                    'critical_threshold': 5,
+                    'description': 'Actual program failures (real failures)'
+                },
+                'Erase_Fail_Count_Total': {
+                    'monitor': True,
+                    'behavior': 'countup',
+                    'baseline': 0,
+                    'warning_threshold': 1,      # Any failures are concerning
+                    'critical_threshold': 5,
+                    'description': 'Actual erase failures (real failures)'
                }
            }
        },
@@ -142,7 +169,19 @@ class SystemHealthMonitor:
                    'baseline': 0,
                    'warning_threshold': 2000,
                    'critical_threshold': 3000,
-                    'description': 'Total wear leveling operations performed'
+                    'description': 'Total wear leveling operations performed',
+                    'monitor': True
+                },
+                # Standard monitoring for all other attributes
+                'Program_Fail_Count': {
+                    'monitor': True,
+                    'warning_threshold': 10,
+                    'critical_threshold': 20
+                },
+                'Erase_Fail_Count': {
+                    'monitor': True,
+                    'warning_threshold': 10,
+                    'critical_threshold': 20
                }
            }
        },
@@ -160,7 +199,8 @@ class SystemHealthMonitor:
                    'baseline': 100,
                    'warning_threshold': 30,
                    'critical_threshold': 10,
-                    'description': 'Percentage of rated life remaining'
+                    'description': 'Percentage of rated life remaining',
+                    'monitor': True
                }
            }
        },
@@ -171,6 +211,9 @@ class SystemHealthMonitor:
            'wear_leveling_thresholds': {
                'warning': 30,
                'critical': 10
+            },
+            'attributes': {
+                # All attributes use default monitoring unless specified
            }
        },
        'Generic': {  # Fallback for unknown manufacturers
@@ -180,6 +223,9 @@ class SystemHealthMonitor:
            'wear_leveling_thresholds': {
                'warning': None,  # Don't trigger on unknown
                'critical': None
+            },
+            'attributes': {
+                # All attributes use default monitoring
            }
        }
    }
@@ -1142,6 +1188,64 @@ class SystemHealthMonitor:
        logger.debug(f"No specific profile found for Model: '{model}', Firmware: '{firmware}', using Generic profile")
        return self.MANUFACTURER_SMART_PROFILES['Generic']

+    def _should_monitor_attribute(self, attr_name: str, manufacturer_profile: dict) -> bool:
+        """Return whether ``attr_name`` takes part in health checks for this profile.
+
+        Monitoring is opt-out: the result is ``False`` only when the profile's
+        ``'attributes'`` entry for ``attr_name`` sets ``'monitor'`` to the literal
+        ``False``. A missing/empty profile or an unlisted attribute yields ``True``.
+        """
+        # A falsy profile (None or {}) has no per-attribute settings to consult.
+        per_attribute = manufacturer_profile.get('attributes', {}) if manufacturer_profile else {}
+        # Identity check on purpose: an absent or None 'monitor' key must not disable monitoring.
+        explicitly_disabled = per_attribute.get(attr_name, {}).get('monitor') is False
+
+        if explicitly_disabled:
+            logger.debug(f"Skipping monitoring for {attr_name} - explicitly disabled")
+        return not explicitly_disabled
+
+    def _get_attribute_thresholds(self, attr_name: str, manufacturer_profile: dict) -> 'dict | None':
+        """Return ``{'warning', 'critical', 'behavior'}`` for ``attr_name``, or ``None`` if unknown.
+
+        A profile entry under ``'attributes'`` wins when it defines both thresholds
+        (its ``'behavior'`` defaults to ``'countup'``); otherwise the built-in table
+        below is used. ``'countup'`` means higher raw values are worse and
+        ``'countdown'`` means lower raw values are worse.
+        """
+        if manufacturer_profile:
+            attr_config = manufacturer_profile.get('attributes', {}).get(attr_name, {})
+            if 'warning_threshold' in attr_config and 'critical_threshold' in attr_config:
+                return {
+                    'warning': attr_config['warning_threshold'],
+                    'critical': attr_config['critical_threshold'],
+                    'behavior': attr_config.get('behavior', 'countup')
+                }
+        # Built-in defaults. The two "remaining" gauges shrink as the drive wears, so
+        # they must be 'countdown': as 'countup' a healthy value of 100 would already
+        # be >= critical (10) and raise a false CRITICAL.
+        BASE_SMART_THRESHOLDS = {
+            'Reallocated_Sector_Ct': {'warning': 5, 'critical': 10},
+            'Current_Pending_Sector': {'warning': 1, 'critical': 5},
+            'Offline_Uncorrectable': {'warning': 1, 'critical': 2},
+            'Reported_Uncorrect': {'warning': 1, 'critical': 10},
+            'Spin_Retry_Count': {'warning': 1, 'critical': 5},
+            'Power_Cycle_Count': {'warning': 5000, 'critical': 10000},
+            'Power_On_Hours': {'warning': 61320, 'critical': 70080},
+            'Temperature_Celsius': {'warning': 65, 'critical': 75},
+            'Available_Spare': {'warning': 30, 'critical': 10, 'behavior': 'countdown'},
+            'Program_Fail_Count': {'warning': 10, 'critical': 20},
+            'Erase_Fail_Count': {'warning': 10, 'critical': 20},
+            'Load_Cycle_Count': {'warning': 900000, 'critical': 1000000},
+            'SSD_Life_Left': {'warning': 30, 'critical': 10, 'behavior': 'countdown'},
+            'Program_Fail_Cnt_Total': {'warning': 1, 'critical': 5},
+            'Erase_Fail_Count_Total': {'warning': 1, 'critical': 5}
+        }
+        defaults = BASE_SMART_THRESHOLDS.get(attr_name)
+        if defaults is None:
+            return None  # No thresholds defined for this attribute
+        # Build a fresh dict; an entry's own 'behavior' overrides the 'countup' default.
+        return {'behavior': 'countup', **defaults}
+
    def _is_new_drive(self, power_on_hours: int) -> bool:
        """
        Determine if a drive is considered "new" based on power-on hours.
@@ -1318,29 +1422,59 @@ class SystemHealthMonitor:
                                        smart_health['severity'] = 'WARNING'
                                    smart_health['issues'].append(f"Low wear leveling remaining: {raw_value}")

-                # Handle all other standard SMART attributes (except those already processed)
-                for attr, thresholds in BASE_SMART_THRESHOLDS.items():
-                    if attr in line and attr not in ['Wear_Leveling_Count', 'Erase_Fail_Count', 'Program_Fail_Count']:
-                        parts = line.split()
-                        if len(parts) >= 10:
-                            raw_value = self._parse_smart_value(parts[9])
-                            smart_health['attributes'][attr] = raw_value
+            # Handle all SMART attributes with manufacturer-specific logic
+            # Define all possible attributes we might encounter
+            ALL_SMART_ATTRIBUTES = [
+                'Reallocated_Sector_Ct', 'Current_Pending_Sector', 'Offline_Uncorrectable',
+                'Reported_Uncorrect', 'Spin_Retry_Count', 'Power_Cycle_Count', 'Power_On_Hours',
+                'Temperature_Celsius', 'Available_Spare', 'Program_Fail_Count', 'Erase_Fail_Count',
+                'Load_Cycle_Count', 'SSD_Life_Left', 'Program_Fail_Cnt_Total', 'Erase_Fail_Count_Total',
+                'Program_Fail_Count_Chip', 'Erase_Fail_Count_Chip'
+            ]
+            
+            for attr in ALL_SMART_ATTRIBUTES:
+                if attr in line and attr not in ['Wear_Leveling_Count']:  # Wear_Leveling handled separately above
+                    # Check if we should monitor this attribute
+                    if not self._should_monitor_attribute(attr, manufacturer_profile):
+                        logger.debug(f"Skipping {attr} - disabled for this manufacturer")
+                        continue
+                    
+                    parts = line.split()
+                    if len(parts) >= 10:
+                        raw_value = self._parse_smart_value(parts[9])
+                        smart_health['attributes'][attr] = raw_value

-                            if attr == 'Temperature_Celsius':
-                                smart_health['temp'] = raw_value
-                                if raw_value >= thresholds['critical']:
-                                    smart_health['severity'] = 'CRITICAL'
-                                    smart_health['issues'].append(f"Critical temperature: {raw_value}°C")
-                                elif raw_value >= thresholds['warning']:
-                                    if smart_health['severity'] != 'CRITICAL':
-                                        smart_health['severity'] = 'WARNING'
-                                    smart_health['issues'].append(f"High temperature: {raw_value}°C")
-                            else:
-                                # Only trigger alerts if the raw value actually exceeds thresholds
-                                if raw_value >= thresholds['critical']:
+                        # Get manufacturer-specific or default thresholds
+                        attr_thresholds = self._get_attribute_thresholds(attr, manufacturer_profile)
+                        if not attr_thresholds:
+                            continue
+                        
+                        # Apply thresholds based on behavior
+                        if attr == 'Temperature_Celsius':
+                            smart_health['temp'] = raw_value
+                            if raw_value >= attr_thresholds['critical']:
+                                smart_health['severity'] = 'CRITICAL'
+                                smart_health['issues'].append(f"Critical temperature: {raw_value}°C")
+                            elif raw_value >= attr_thresholds['warning']:
+                                if smart_health['severity'] != 'CRITICAL':
+                                    smart_health['severity'] = 'WARNING'
+                                smart_health['issues'].append(f"High temperature: {raw_value}°C")
+                        else:
+                            # Handle countup/countdown behavior
+                            behavior = attr_thresholds.get('behavior', 'countup')
+                            if behavior == 'countup':
+                                if raw_value >= attr_thresholds['critical']:
                                    smart_health['severity'] = 'CRITICAL'
                                    smart_health['issues'].append(f"Critical {attr}: {raw_value}")
-                                elif raw_value >= thresholds['warning']:
+                                elif raw_value >= attr_thresholds['warning']:
+                                    if smart_health['severity'] != 'CRITICAL':
+                                        smart_health['severity'] = 'WARNING'
+                                    smart_health['issues'].append(f"Warning {attr}: {raw_value}")
+                            elif behavior == 'countdown':
+                                if raw_value <= attr_thresholds['critical']:
+                                    smart_health['severity'] = 'CRITICAL'
+                                    smart_health['issues'].append(f"Critical {attr}: {raw_value}")
+                                elif raw_value <= attr_thresholds['warning']:
                                    if smart_health['severity'] != 'CRITICAL':
                                        smart_health['severity'] = 'WARNING'
                                    smart_health['issues'].append(f"Warning {attr}: {raw_value}")