Huge update to vendor profiles

2025-07-24 19:15:21 -04:00
parent a74c4c0309
commit 0faf7654d6
1 changed files with 163 additions and 29 deletions
--- a/hwmonDaemon.py
+++ b/hwmonDaemon.py
@@ -110,21 +110,48 @@ class SystemHealthMonitor:
    MANUFACTURER_SMART_PROFILES = {
        'Ridata': {
            'aliases': ['Ridata', 'Ritek', 'RIDATA', 'RITEK', 'SSD 512GB'],
-            'firmware_patterns': ['HT3618B7', 'HT36'],  # Add exact firmware match first
+            'firmware_patterns': ['HT3618B7', 'HT36'],
            'wear_leveling_behavior': 'countup',
            'wear_leveling_baseline': 0,
            'wear_leveling_thresholds': {
-                'warning': 1000000,   # Increase threshold significantly
+                'warning': 1000000000,    # 1 billion - very conservative
-                'critical': 2000000   # Very high threshold for countup behavior
+                'critical': 2000000000    # 2 billion - extremely conservative
            },
            'attributes': {
                'Wear_Leveling_Count': {
                    'behavior': 'countup',
                    'baseline': 0,
-                    'warning_threshold': 1000000,    # Much higher threshold
+                    'warning_threshold': 1000000000,
-                    'critical_threshold': 2000000,   # Very high threshold
+                    'critical_threshold': 2000000000,
-                    'description': 'Total wear leveling operations performed (countup from 0)',
+                    'description': 'Total wear leveling operations (countup from 0)',
-                    'ignore_on_new_drive': True      # Don't alert on new drives
+                    'ignore_on_new_drive': False,
                    'monitor': True  # Include in health checks
                },
                # These are operation counters, NOT actual failures - ignore completely
                'Erase_Fail_Count_Chip': {
                    'monitor': False,  # Skip monitoring entirely
                    'description': 'Operation counter, not actual failures - IGNORED'
                },
                'Program_Fail_Count_Chip': {
                    'monitor': False,  # Skip monitoring entirely
                    'description': 'Operation counter, not actual failures - IGNORED'
                },
                # These are the REAL failure counters - monitor with standard thresholds
                'Program_Fail_Cnt_Total': {
                    'monitor': True,
                    'behavior': 'countup',
                    'baseline': 0,
                    'warning_threshold': 1,      # Any failures are concerning
                    'critical_threshold': 5,
                    'description': 'Actual program failures (real failures)'
                },
                'Erase_Fail_Count_Total': {
                    'monitor': True,
                    'behavior': 'countup',
                    'baseline': 0,
                    'warning_threshold': 1,      # Any failures are concerning
                    'critical_threshold': 5,
                    'description': 'Actual erase failures (real failures)'
                }
            }
        },
@@ -142,7 +169,19 @@ class SystemHealthMonitor:
                    'baseline': 0,
                    'warning_threshold': 2000,
                    'critical_threshold': 3000,
-                    'description': 'Total wear leveling operations performed'
+                    'description': 'Total wear leveling operations performed',
                    'monitor': True
                },
                # Standard monitoring for all other attributes
                'Program_Fail_Count': {
                    'monitor': True,
                    'warning_threshold': 10,
                    'critical_threshold': 20
                },
                'Erase_Fail_Count': {
                    'monitor': True,
                    'warning_threshold': 10,
                    'critical_threshold': 20
                }
            }
        },
@@ -160,7 +199,8 @@ class SystemHealthMonitor:
                    'baseline': 100,
                    'warning_threshold': 30,
                    'critical_threshold': 10,
-                    'description': 'Percentage of rated life remaining'
+                    'description': 'Percentage of rated life remaining',
                    'monitor': True
                }
            }
        },
@@ -171,6 +211,9 @@ class SystemHealthMonitor:
            'wear_leveling_thresholds': {
                'warning': 30,
                'critical': 10
            },
            'attributes': {
                # All attributes use default monitoring unless specified
            }
        },
        'Generic': {  # Fallback for unknown manufacturers
@@ -180,6 +223,9 @@ class SystemHealthMonitor:
            'wear_leveling_thresholds': {
                'warning': None,  # Don't trigger on unknown
                'critical': None
            },
            'attributes': {
                # All attributes use default monitoring
            }
        }
    }
@@ -1142,6 +1188,64 @@ class SystemHealthMonitor:
        logger.debug(f"No specific profile found for Model: '{model}', Firmware: '{firmware}', using Generic profile")
        return self.MANUFACTURER_SMART_PROFILES['Generic']
    def _should_monitor_attribute(self, attr_name: str, manufacturer_profile: dict) -> bool:
        """
        Check if an attribute should be monitored based on manufacturer profile
        """
        if not manufacturer_profile:
            return True  # Default: monitor everything
        attr_config = manufacturer_profile.get('attributes', {}).get(attr_name, {})
        # Check if explicitly set to not monitor
        if attr_config.get('monitor') is False:
            logger.debug(f"Skipping monitoring for {attr_name} - explicitly disabled")
            return False
        return True  # Default: monitor unless explicitly disabled
    def _get_attribute_thresholds(self, attr_name: str, manufacturer_profile: dict) -> dict:
        """
        Get attribute-specific thresholds, falling back to defaults
        """
        # Check for manufacturer-specific thresholds first
        if manufacturer_profile:
            attr_config = manufacturer_profile.get('attributes', {}).get(attr_name, {})
            if 'warning_threshold' in attr_config and 'critical_threshold' in attr_config:
                return {
                    'warning': attr_config['warning_threshold'],
                    'critical': attr_config['critical_threshold'],
                    'behavior': attr_config.get('behavior', 'countup')
                }
        # Fall back to BASE_SMART_THRESHOLDS (your existing thresholds)
        BASE_SMART_THRESHOLDS = {
            'Reallocated_Sector_Ct': {'warning': 5, 'critical': 10},
            'Current_Pending_Sector': {'warning': 1, 'critical': 5},
            'Offline_Uncorrectable': {'warning': 1, 'critical': 2},
            'Reported_Uncorrect': {'warning': 1, 'critical': 10},
            'Spin_Retry_Count': {'warning': 1, 'critical': 5},
            'Power_Cycle_Count': {'warning': 5000, 'critical': 10000},
            'Power_On_Hours': {'warning': 61320, 'critical': 70080},
            'Temperature_Celsius': {'warning': 65, 'critical': 75},
            'Available_Spare': {'warning': 30, 'critical': 10},
            'Program_Fail_Count': {'warning': 10, 'critical': 20},
            'Erase_Fail_Count': {'warning': 10, 'critical': 20},
            'Load_Cycle_Count': {'warning': 900000, 'critical': 1000000},
            'SSD_Life_Left': {'warning': 30, 'critical': 10},
            'Program_Fail_Cnt_Total': {'warning': 1, 'critical': 5},
            'Erase_Fail_Count_Total': {'warning': 1, 'critical': 5}
        }
        if attr_name in BASE_SMART_THRESHOLDS:
            return {
                'warning': BASE_SMART_THRESHOLDS[attr_name]['warning'],
                'critical': BASE_SMART_THRESHOLDS[attr_name]['critical'],
                'behavior': 'countup'
            }
        return None  # No thresholds defined
    def _is_new_drive(self, power_on_hours: int) -> bool:
        """
        Determine if a drive is considered "new" based on power-on hours.
@@ -1318,29 +1422,59 @@ class SystemHealthMonitor:
                                        smart_health['severity'] = 'WARNING'
                                    smart_health['issues'].append(f"Low wear leveling remaining: {raw_value}")
-                # Handle all other standard SMART attributes (except those already processed)
+            # Handle all SMART attributes with manufacturer-specific logic
-                for attr, thresholds in BASE_SMART_THRESHOLDS.items():
+            # Define all possible attributes we might encounter
-                    if attr in line and attr not in ['Wear_Leveling_Count', 'Erase_Fail_Count', 'Program_Fail_Count']:
+            ALL_SMART_ATTRIBUTES = [
-                        parts = line.split()
+                'Reallocated_Sector_Ct', 'Current_Pending_Sector', 'Offline_Uncorrectable',
-                        if len(parts) >= 10:
+                'Reported_Uncorrect', 'Spin_Retry_Count', 'Power_Cycle_Count', 'Power_On_Hours',
-                            raw_value = self._parse_smart_value(parts[9])
+                'Temperature_Celsius', 'Available_Spare', 'Program_Fail_Count', 'Erase_Fail_Count',
-                            smart_health['attributes'][attr] = raw_value
+                'Load_Cycle_Count', 'SSD_Life_Left', 'Program_Fail_Cnt_Total', 'Erase_Fail_Count_Total',
                'Program_Fail_Count_Chip', 'Erase_Fail_Count_Chip'
            ]
            for attr in ALL_SMART_ATTRIBUTES:
                if attr in line and attr not in ['Wear_Leveling_Count']:  # Wear_Leveling handled separately above
                    # Check if we should monitor this attribute
                    if not self._should_monitor_attribute(attr, manufacturer_profile):
                        logger.debug(f"Skipping {attr} - disabled for this manufacturer")
                        continue
                    parts = line.split()
                    if len(parts) >= 10:
                        raw_value = self._parse_smart_value(parts[9])
                        smart_health['attributes'][attr] = raw_value
-                            if attr == 'Temperature_Celsius':
+                        # Get manufacturer-specific or default thresholds
-                                smart_health['temp'] = raw_value
+                        attr_thresholds = self._get_attribute_thresholds(attr, manufacturer_profile)
-                                if raw_value >= thresholds['critical']:
+                        if not attr_thresholds:
-                                    smart_health['severity'] = 'CRITICAL'
+                            continue
-                                    smart_health['issues'].append(f"Critical temperature: {raw_value}°C")
+                        
-                                elif raw_value >= thresholds['warning']:
+                        # Apply thresholds based on behavior
-                                    if smart_health['severity'] != 'CRITICAL':
+                        if attr == 'Temperature_Celsius':
-                                        smart_health['severity'] = 'WARNING'
+                            smart_health['temp'] = raw_value
-                                    smart_health['issues'].append(f"High temperature: {raw_value}°C")
+                            if raw_value >= attr_thresholds['critical']:
-                            else:
+                                smart_health['severity'] = 'CRITICAL'
-                                # Only trigger alerts if the raw value actually exceeds thresholds
+                                smart_health['issues'].append(f"Critical temperature: {raw_value}°C")
-                                if raw_value >= thresholds['critical']:
+                            elif raw_value >= attr_thresholds['warning']:
                                if smart_health['severity'] != 'CRITICAL':
                                    smart_health['severity'] = 'WARNING'
                                smart_health['issues'].append(f"High temperature: {raw_value}°C")
                        else:
                            # Handle countup/countdown behavior
                            behavior = attr_thresholds.get('behavior', 'countup')
                            if behavior == 'countup':
                                if raw_value >= attr_thresholds['critical']:
                                    smart_health['severity'] = 'CRITICAL'
                                    smart_health['issues'].append(f"Critical {attr}: {raw_value}")
-                                elif raw_value >= thresholds['warning']:
+                                elif raw_value >= attr_thresholds['warning']:
                                    if smart_health['severity'] != 'CRITICAL':
                                        smart_health['severity'] = 'WARNING'
                                    smart_health['issues'].append(f"Warning {attr}: {raw_value}")
                            elif behavior == 'countdown':
                                if raw_value <= attr_thresholds['critical']:
                                    smart_health['severity'] = 'CRITICAL'
                                    smart_health['issues'].append(f"Critical {attr}: {raw_value}")
                                elif raw_value <= attr_thresholds['warning']:
                                    if smart_health['severity'] != 'CRITICAL':
                                        smart_health['severity'] = 'WARNING'
                                    smart_health['issues'].append(f"Warning {attr}: {raw_value}")