diff --git a/hwmonDaemon.py b/hwmonDaemon.py
index b8faa3f..6439b8a 100644
--- a/hwmonDaemon.py
+++ b/hwmonDaemon.py
@@ -110,21 +110,48 @@ class SystemHealthMonitor:
     MANUFACTURER_SMART_PROFILES = {
         'Ridata': {
             'aliases': ['Ridata', 'Ritek', 'RIDATA', 'RITEK', 'SSD 512GB'],
-            'firmware_patterns': ['HT3618B7', 'HT36'],  # Add exact firmware match first
+            'firmware_patterns': ['HT3618B7', 'HT36'],
             'wear_leveling_behavior': 'countup',
             'wear_leveling_baseline': 0,
             'wear_leveling_thresholds': {
-                'warning': 1000000,   # Increase threshold significantly
-                'critical': 2000000   # Very high threshold for countup behavior
+                'warning': 1000000000,   # 1 billion - very conservative
+                'critical': 2000000000   # 2 billion - extremely conservative
            },
             'attributes': {
                 'Wear_Leveling_Count': {
                     'behavior': 'countup',
                     'baseline': 0,
-                    'warning_threshold': 1000000,   # Much higher threshold
-                    'critical_threshold': 2000000,  # Very high threshold
-                    'description': 'Total wear leveling operations performed (countup from 0)',
-                    'ignore_on_new_drive': True  # Don't alert on new drives
+                    'warning_threshold': 1000000000,
+                    'critical_threshold': 2000000000,
+                    'description': 'Total wear leveling operations (countup from 0)',
+                    'ignore_on_new_drive': False,
+                    'monitor': True  # Include in health checks
+                },
+                # These are operation counters, NOT actual failures - ignore completely
+                'Erase_Fail_Count_Chip': {
+                    'monitor': False,  # Skip monitoring entirely
+                    'description': 'Operation counter, not actual failures - IGNORED'
+                },
+                'Program_Fail_Count_Chip': {
+                    'monitor': False,  # Skip monitoring entirely
+                    'description': 'Operation counter, not actual failures - IGNORED'
+                },
+                # These are the REAL failure counters - monitor with standard thresholds
+                'Program_Fail_Cnt_Total': {
+                    'monitor': True,
+                    'behavior': 'countup',
+                    'baseline': 0,
+                    'warning_threshold': 1,  # Any failures are concerning
+                    'critical_threshold': 5,
+                    'description': 'Actual program failures (real failures)'
+                },
+                'Erase_Fail_Count_Total': {
+                    'monitor': True,
+                    'behavior': 'countup',
+                    'baseline': 0,
+                    'warning_threshold': 1,  # Any failures are concerning
+                    'critical_threshold': 5,
+                    'description': 'Actual erase failures (real failures)'
                 }
             }
         },
@@ -142,7 +169,19 @@ class SystemHealthMonitor:
                     'baseline': 0,
                     'warning_threshold': 2000,
                     'critical_threshold': 3000,
-                    'description': 'Total wear leveling operations performed'
+                    'description': 'Total wear leveling operations performed',
+                    'monitor': True
+                },
+                # Standard monitoring for all other attributes
+                'Program_Fail_Count': {
+                    'monitor': True,
+                    'warning_threshold': 10,
+                    'critical_threshold': 20
+                },
+                'Erase_Fail_Count': {
+                    'monitor': True,
+                    'warning_threshold': 10,
+                    'critical_threshold': 20
                 }
             }
         },
@@ -160,7 +199,8 @@ class SystemHealthMonitor:
                     'baseline': 100,
                     'warning_threshold': 30,
                     'critical_threshold': 10,
-                    'description': 'Percentage of rated life remaining'
+                    'description': 'Percentage of rated life remaining',
+                    'monitor': True
                 }
             }
         },
@@ -171,6 +211,9 @@ class SystemHealthMonitor:
             'wear_leveling_thresholds': {
                 'warning': 30,
                 'critical': 10
+            },
+            'attributes': {
+                # All attributes use default monitoring unless specified
             }
         },
         'Generic': {  # Fallback for unknown manufacturers
@@ -180,6 +223,9 @@ class SystemHealthMonitor:
             'wear_leveling_thresholds': {
                 'warning': None,  # Don't trigger on unknown
                 'critical': None
+            },
+            'attributes': {
+                # All attributes use default monitoring
             }
         }
     }
@@ -1142,6 +1188,64 @@ class SystemHealthMonitor:
             logger.debug(f"No specific profile found for Model: '{model}', Firmware: '{firmware}', using Generic profile")
             return self.MANUFACTURER_SMART_PROFILES['Generic']
 
+    def _should_monitor_attribute(self, attr_name: str, manufacturer_profile: dict) -> bool:
+        """
+        Check if an attribute should be monitored based on manufacturer profile
+        """
+        if not manufacturer_profile:
+            return True  # Default: monitor everything
+
+        attr_config = manufacturer_profile.get('attributes', {}).get(attr_name, {})
+
+        # Check if explicitly set to not monitor
+        if attr_config.get('monitor') is False:
+            logger.debug(f"Skipping monitoring for {attr_name} - explicitly disabled")
+            return False
+
+        return True  # Default: monitor unless explicitly disabled
+
+    def _get_attribute_thresholds(self, attr_name: str, manufacturer_profile: dict) -> dict:
+        """
+        Get attribute-specific thresholds, falling back to defaults
+        """
+        # Check for manufacturer-specific thresholds first
+        if manufacturer_profile:
+            attr_config = manufacturer_profile.get('attributes', {}).get(attr_name, {})
+            if 'warning_threshold' in attr_config and 'critical_threshold' in attr_config:
+                return {
+                    'warning': attr_config['warning_threshold'],
+                    'critical': attr_config['critical_threshold'],
+                    'behavior': attr_config.get('behavior', 'countup')
+                }
+
+        # Fall back to the default thresholds
+        BASE_SMART_THRESHOLDS = {
+            'Reallocated_Sector_Ct': {'warning': 5, 'critical': 10},
+            'Current_Pending_Sector': {'warning': 1, 'critical': 5},
+            'Offline_Uncorrectable': {'warning': 1, 'critical': 2},
+            'Reported_Uncorrect': {'warning': 1, 'critical': 10},
+            'Spin_Retry_Count': {'warning': 1, 'critical': 5},
+            'Power_Cycle_Count': {'warning': 5000, 'critical': 10000},
+            'Power_On_Hours': {'warning': 61320, 'critical': 70080},
+            'Temperature_Celsius': {'warning': 65, 'critical': 75},
+            'Available_Spare': {'warning': 30, 'critical': 10},
+            'Program_Fail_Count': {'warning': 10, 'critical': 20},
+            'Erase_Fail_Count': {'warning': 10, 'critical': 20},
+            'Load_Cycle_Count': {'warning': 900000, 'critical': 1000000},
+            'SSD_Life_Left': {'warning': 30, 'critical': 10},
+            'Program_Fail_Cnt_Total': {'warning': 1, 'critical': 5},
+            'Erase_Fail_Count_Total': {'warning': 1, 'critical': 5}
+        }
+
+        if attr_name in BASE_SMART_THRESHOLDS:
+            return {
+                'warning': BASE_SMART_THRESHOLDS[attr_name]['warning'],
+                'critical': BASE_SMART_THRESHOLDS[attr_name]['critical'],
+                'behavior': 'countup'
+            }
+
+        return None  # No thresholds defined
+
     def _is_new_drive(self, power_on_hours: int) -> bool:
         """
         Determine if a drive is considered "new" based on power-on hours.
@@ -1318,29 +1422,59 @@ class SystemHealthMonitor:
                         smart_health['severity'] = 'WARNING'
                     smart_health['issues'].append(f"Low wear leveling remaining: {raw_value}")
 
-            # Handle all other standard SMART attributes (except those already processed)
-            for attr, thresholds in BASE_SMART_THRESHOLDS.items():
-                if attr in line and attr not in ['Wear_Leveling_Count', 'Erase_Fail_Count', 'Program_Fail_Count']:
-                    parts = line.split()
-                    if len(parts) >= 10:
-                        raw_value = self._parse_smart_value(parts[9])
-                        smart_health['attributes'][attr] = raw_value
+            # Handle all SMART attributes with manufacturer-specific logic
+            # Define all possible attributes we might encounter
+            ALL_SMART_ATTRIBUTES = [
+                'Reallocated_Sector_Ct', 'Current_Pending_Sector', 'Offline_Uncorrectable',
+                'Reported_Uncorrect', 'Spin_Retry_Count', 'Power_Cycle_Count', 'Power_On_Hours',
+                'Temperature_Celsius', 'Available_Spare', 'Program_Fail_Count', 'Erase_Fail_Count',
+                'Load_Cycle_Count', 'SSD_Life_Left', 'Program_Fail_Cnt_Total', 'Erase_Fail_Count_Total',
+                'Program_Fail_Count_Chip', 'Erase_Fail_Count_Chip'
+            ]
+
+            for attr in ALL_SMART_ATTRIBUTES:
+                if attr in line and attr not in ['Wear_Leveling_Count']:  # Wear_Leveling handled separately above
+                    # Check if we should monitor this attribute
+                    if not self._should_monitor_attribute(attr, manufacturer_profile):
+                        logger.debug(f"Skipping {attr} - disabled for this manufacturer")
+                        continue
+
+                    parts = line.split()
+                    if len(parts) >= 10:
+                        raw_value = self._parse_smart_value(parts[9])
+                        smart_health['attributes'][attr] = raw_value
 
-                        if attr == 'Temperature_Celsius':
-                            smart_health['temp'] = raw_value
-                            if raw_value >= thresholds['critical']:
-                                smart_health['severity'] = 'CRITICAL'
-                                smart_health['issues'].append(f"Critical temperature: {raw_value}°C")
-                            elif raw_value >= thresholds['warning']:
-                                if smart_health['severity'] != 'CRITICAL':
-                                    smart_health['severity'] = 'WARNING'
-                                smart_health['issues'].append(f"High temperature: {raw_value}°C")
-                        else:
-                            # Only trigger alerts if the raw value actually exceeds thresholds
-                            if raw_value >= thresholds['critical']:
+                        # Get manufacturer-specific or default thresholds
+                        attr_thresholds = self._get_attribute_thresholds(attr, manufacturer_profile)
+                        if not attr_thresholds:
+                            continue
+
+                        # Apply thresholds based on behavior
+                        if attr == 'Temperature_Celsius':
+                            smart_health['temp'] = raw_value
+                            if raw_value >= attr_thresholds['critical']:
+                                smart_health['severity'] = 'CRITICAL'
+                                smart_health['issues'].append(f"Critical temperature: {raw_value}°C")
+                            elif raw_value >= attr_thresholds['warning']:
+                                if smart_health['severity'] != 'CRITICAL':
+                                    smart_health['severity'] = 'WARNING'
+                                smart_health['issues'].append(f"High temperature: {raw_value}°C")
+                        else:
+                            # Handle countup/countdown behavior
+                            behavior = attr_thresholds.get('behavior', 'countup')
+                            if behavior == 'countup':
+                                if raw_value >= attr_thresholds['critical']:
                                     smart_health['severity'] = 'CRITICAL'
                                     smart_health['issues'].append(f"Critical {attr}: {raw_value}")
-                            elif raw_value >= thresholds['warning']:
+                                elif raw_value >= attr_thresholds['warning']:
+                                    if smart_health['severity'] != 'CRITICAL':
+                                        smart_health['severity'] = 'WARNING'
+                                    smart_health['issues'].append(f"Warning {attr}: {raw_value}")
+                            elif behavior == 'countdown':
+                                if raw_value <= attr_thresholds['critical']:
+                                    smart_health['severity'] = 'CRITICAL'
+                                    smart_health['issues'].append(f"Critical {attr}: {raw_value}")
+                                elif raw_value <= attr_thresholds['warning']:
                                     if smart_health['severity'] != 'CRITICAL':
                                         smart_health['severity'] = 'WARNING'
                                     smart_health['issues'].append(f"Warning {attr}: {raw_value}")
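As a sanity check on the new countup/countdown branch, a small standalone sketch; the sample `smartctl -A` row and the `evaluate()` helper are hypothetical stand-ins, not daemon code:

```python
# Hypothetical sample row from `smartctl -A`; column 10 (index 9) is RAW_VALUE,
# which the patch reads via parts[9].
line = ("  5 Reallocated_Sector_Ct   0x0033   100   100   010    "
        "Pre-fail  Always       -       0")
raw_value = int(line.split()[9])

def evaluate(raw: int, thresholds: dict) -> str:
    """Mirror the countup/countdown comparison added in this hunk."""
    if thresholds.get('behavior', 'countup') == 'countup':
        # Countup: a higher raw value is worse (e.g. Reallocated_Sector_Ct).
        if raw >= thresholds['critical']:
            return 'CRITICAL'
        if raw >= thresholds['warning']:
            return 'WARNING'
    else:
        # Countdown: a lower raw value is worse (e.g. SSD_Life_Left, Available_Spare).
        if raw <= thresholds['critical']:
            return 'CRITICAL'
        if raw <= thresholds['warning']:
            return 'WARNING'
    return 'OK'

print(evaluate(raw_value, {'behavior': 'countup', 'warning': 5, 'critical': 10}))  # OK
print(evaluate(25, {'behavior': 'countdown', 'warning': 30, 'critical': 10}))      # WARNING
```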