Huge update to vendor profiles
This commit is contained in:
192
hwmonDaemon.py
192
hwmonDaemon.py
@ -110,21 +110,48 @@ class SystemHealthMonitor:
|
||||
MANUFACTURER_SMART_PROFILES = {
|
||||
'Ridata': {
|
||||
'aliases': ['Ridata', 'Ritek', 'RIDATA', 'RITEK', 'SSD 512GB'],
|
||||
'firmware_patterns': ['HT3618B7', 'HT36'], # Add exact firmware match first
|
||||
'firmware_patterns': ['HT3618B7', 'HT36'],
|
||||
'wear_leveling_behavior': 'countup',
|
||||
'wear_leveling_baseline': 0,
|
||||
'wear_leveling_thresholds': {
|
||||
'warning': 1000000, # Increase threshold significantly
|
||||
'critical': 2000000 # Very high threshold for countup behavior
|
||||
'warning': 1000000000, # 1 billion - very conservative
|
||||
'critical': 2000000000 # 2 billion - extremely conservative
|
||||
},
|
||||
'attributes': {
|
||||
'Wear_Leveling_Count': {
|
||||
'behavior': 'countup',
|
||||
'baseline': 0,
|
||||
'warning_threshold': 1000000, # Much higher threshold
|
||||
'critical_threshold': 2000000, # Very high threshold
|
||||
'description': 'Total wear leveling operations performed (countup from 0)',
|
||||
'ignore_on_new_drive': True # Don't alert on new drives
|
||||
'warning_threshold': 1000000000,
|
||||
'critical_threshold': 2000000000,
|
||||
'description': 'Total wear leveling operations (countup from 0)',
|
||||
'ignore_on_new_drive': False,
|
||||
'monitor': True # Include in health checks
|
||||
},
|
||||
# These are operation counters, NOT actual failures - ignore completely
|
||||
'Erase_Fail_Count_Chip': {
|
||||
'monitor': False, # Skip monitoring entirely
|
||||
'description': 'Operation counter, not actual failures - IGNORED'
|
||||
},
|
||||
'Program_Fail_Count_Chip': {
|
||||
'monitor': False, # Skip monitoring entirely
|
||||
'description': 'Operation counter, not actual failures - IGNORED'
|
||||
},
|
||||
# These are the REAL failure counters - monitor with standard thresholds
|
||||
'Program_Fail_Cnt_Total': {
|
||||
'monitor': True,
|
||||
'behavior': 'countup',
|
||||
'baseline': 0,
|
||||
'warning_threshold': 1, # Any failures are concerning
|
||||
'critical_threshold': 5,
|
||||
'description': 'Actual program failures (real failures)'
|
||||
},
|
||||
'Erase_Fail_Count_Total': {
|
||||
'monitor': True,
|
||||
'behavior': 'countup',
|
||||
'baseline': 0,
|
||||
'warning_threshold': 1, # Any failures are concerning
|
||||
'critical_threshold': 5,
|
||||
'description': 'Actual erase failures (real failures)'
|
||||
}
|
||||
}
|
||||
},
|
||||
@ -142,7 +169,19 @@ class SystemHealthMonitor:
|
||||
'baseline': 0,
|
||||
'warning_threshold': 2000,
|
||||
'critical_threshold': 3000,
|
||||
'description': 'Total wear leveling operations performed'
|
||||
'description': 'Total wear leveling operations performed',
|
||||
'monitor': True
|
||||
},
|
||||
# Standard monitoring for all other attributes
|
||||
'Program_Fail_Count': {
|
||||
'monitor': True,
|
||||
'warning_threshold': 10,
|
||||
'critical_threshold': 20
|
||||
},
|
||||
'Erase_Fail_Count': {
|
||||
'monitor': True,
|
||||
'warning_threshold': 10,
|
||||
'critical_threshold': 20
|
||||
}
|
||||
}
|
||||
},
|
||||
@ -160,7 +199,8 @@ class SystemHealthMonitor:
|
||||
'baseline': 100,
|
||||
'warning_threshold': 30,
|
||||
'critical_threshold': 10,
|
||||
'description': 'Percentage of rated life remaining'
|
||||
'description': 'Percentage of rated life remaining',
|
||||
'monitor': True
|
||||
}
|
||||
}
|
||||
},
|
||||
@ -171,6 +211,9 @@ class SystemHealthMonitor:
|
||||
'wear_leveling_thresholds': {
|
||||
'warning': 30,
|
||||
'critical': 10
|
||||
},
|
||||
'attributes': {
|
||||
# All attributes use default monitoring unless specified
|
||||
}
|
||||
},
|
||||
'Generic': { # Fallback for unknown manufacturers
|
||||
@ -180,6 +223,9 @@ class SystemHealthMonitor:
|
||||
'wear_leveling_thresholds': {
|
||||
'warning': None, # Don't trigger on unknown
|
||||
'critical': None
|
||||
},
|
||||
'attributes': {
|
||||
# All attributes use default monitoring
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1142,6 +1188,64 @@ class SystemHealthMonitor:
|
||||
logger.debug(f"No specific profile found for Model: '{model}', Firmware: '{firmware}', using Generic profile")
|
||||
return self.MANUFACTURER_SMART_PROFILES['Generic']
|
||||
|
||||
def _should_monitor_attribute(self, attr_name: str, manufacturer_profile: dict) -> bool:
|
||||
"""
|
||||
Check if an attribute should be monitored based on manufacturer profile
|
||||
"""
|
||||
if not manufacturer_profile:
|
||||
return True # Default: monitor everything
|
||||
|
||||
attr_config = manufacturer_profile.get('attributes', {}).get(attr_name, {})
|
||||
|
||||
# Check if explicitly set to not monitor
|
||||
if attr_config.get('monitor') is False:
|
||||
logger.debug(f"Skipping monitoring for {attr_name} - explicitly disabled")
|
||||
return False
|
||||
|
||||
return True # Default: monitor unless explicitly disabled
|
||||
|
||||
def _get_attribute_thresholds(self, attr_name: str, manufacturer_profile: dict) -> dict:
|
||||
"""
|
||||
Get attribute-specific thresholds, falling back to defaults
|
||||
"""
|
||||
# Check for manufacturer-specific thresholds first
|
||||
if manufacturer_profile:
|
||||
attr_config = manufacturer_profile.get('attributes', {}).get(attr_name, {})
|
||||
if 'warning_threshold' in attr_config and 'critical_threshold' in attr_config:
|
||||
return {
|
||||
'warning': attr_config['warning_threshold'],
|
||||
'critical': attr_config['critical_threshold'],
|
||||
'behavior': attr_config.get('behavior', 'countup')
|
||||
}
|
||||
|
||||
# Fall back to BASE_SMART_THRESHOLDS (your existing thresholds)
|
||||
BASE_SMART_THRESHOLDS = {
|
||||
'Reallocated_Sector_Ct': {'warning': 5, 'critical': 10},
|
||||
'Current_Pending_Sector': {'warning': 1, 'critical': 5},
|
||||
'Offline_Uncorrectable': {'warning': 1, 'critical': 2},
|
||||
'Reported_Uncorrect': {'warning': 1, 'critical': 10},
|
||||
'Spin_Retry_Count': {'warning': 1, 'critical': 5},
|
||||
'Power_Cycle_Count': {'warning': 5000, 'critical': 10000},
|
||||
'Power_On_Hours': {'warning': 61320, 'critical': 70080},
|
||||
'Temperature_Celsius': {'warning': 65, 'critical': 75},
|
||||
'Available_Spare': {'warning': 30, 'critical': 10},
|
||||
'Program_Fail_Count': {'warning': 10, 'critical': 20},
|
||||
'Erase_Fail_Count': {'warning': 10, 'critical': 20},
|
||||
'Load_Cycle_Count': {'warning': 900000, 'critical': 1000000},
|
||||
'SSD_Life_Left': {'warning': 30, 'critical': 10},
|
||||
'Program_Fail_Cnt_Total': {'warning': 1, 'critical': 5},
|
||||
'Erase_Fail_Count_Total': {'warning': 1, 'critical': 5}
|
||||
}
|
||||
|
||||
if attr_name in BASE_SMART_THRESHOLDS:
|
||||
return {
|
||||
'warning': BASE_SMART_THRESHOLDS[attr_name]['warning'],
|
||||
'critical': BASE_SMART_THRESHOLDS[attr_name]['critical'],
|
||||
'behavior': 'countup'
|
||||
}
|
||||
|
||||
return None # No thresholds defined
|
||||
|
||||
def _is_new_drive(self, power_on_hours: int) -> bool:
|
||||
"""
|
||||
Determine if a drive is considered "new" based on power-on hours.
|
||||
@ -1318,29 +1422,59 @@ class SystemHealthMonitor:
|
||||
smart_health['severity'] = 'WARNING'
|
||||
smart_health['issues'].append(f"Low wear leveling remaining: {raw_value}")
|
||||
|
||||
# Handle all other standard SMART attributes (except those already processed)
|
||||
for attr, thresholds in BASE_SMART_THRESHOLDS.items():
|
||||
if attr in line and attr not in ['Wear_Leveling_Count', 'Erase_Fail_Count', 'Program_Fail_Count']:
|
||||
parts = line.split()
|
||||
if len(parts) >= 10:
|
||||
raw_value = self._parse_smart_value(parts[9])
|
||||
smart_health['attributes'][attr] = raw_value
|
||||
# Handle all SMART attributes with manufacturer-specific logic
|
||||
# Define all possible attributes we might encounter
|
||||
ALL_SMART_ATTRIBUTES = [
|
||||
'Reallocated_Sector_Ct', 'Current_Pending_Sector', 'Offline_Uncorrectable',
|
||||
'Reported_Uncorrect', 'Spin_Retry_Count', 'Power_Cycle_Count', 'Power_On_Hours',
|
||||
'Temperature_Celsius', 'Available_Spare', 'Program_Fail_Count', 'Erase_Fail_Count',
|
||||
'Load_Cycle_Count', 'SSD_Life_Left', 'Program_Fail_Cnt_Total', 'Erase_Fail_Count_Total',
|
||||
'Program_Fail_Count_Chip', 'Erase_Fail_Count_Chip'
|
||||
]
|
||||
|
||||
for attr in ALL_SMART_ATTRIBUTES:
|
||||
if attr in line and attr not in ['Wear_Leveling_Count']: # Wear_Leveling handled separately above
|
||||
# Check if we should monitor this attribute
|
||||
if not self._should_monitor_attribute(attr, manufacturer_profile):
|
||||
logger.debug(f"Skipping {attr} - disabled for this manufacturer")
|
||||
continue
|
||||
|
||||
parts = line.split()
|
||||
if len(parts) >= 10:
|
||||
raw_value = self._parse_smart_value(parts[9])
|
||||
smart_health['attributes'][attr] = raw_value
|
||||
|
||||
if attr == 'Temperature_Celsius':
|
||||
smart_health['temp'] = raw_value
|
||||
if raw_value >= thresholds['critical']:
|
||||
smart_health['severity'] = 'CRITICAL'
|
||||
smart_health['issues'].append(f"Critical temperature: {raw_value}°C")
|
||||
elif raw_value >= thresholds['warning']:
|
||||
if smart_health['severity'] != 'CRITICAL':
|
||||
smart_health['severity'] = 'WARNING'
|
||||
smart_health['issues'].append(f"High temperature: {raw_value}°C")
|
||||
else:
|
||||
# Only trigger alerts if the raw value actually exceeds thresholds
|
||||
if raw_value >= thresholds['critical']:
|
||||
# Get manufacturer-specific or default thresholds
|
||||
attr_thresholds = self._get_attribute_thresholds(attr, manufacturer_profile)
|
||||
if not attr_thresholds:
|
||||
continue
|
||||
|
||||
# Apply thresholds based on behavior
|
||||
if attr == 'Temperature_Celsius':
|
||||
smart_health['temp'] = raw_value
|
||||
if raw_value >= attr_thresholds['critical']:
|
||||
smart_health['severity'] = 'CRITICAL'
|
||||
smart_health['issues'].append(f"Critical temperature: {raw_value}°C")
|
||||
elif raw_value >= attr_thresholds['warning']:
|
||||
if smart_health['severity'] != 'CRITICAL':
|
||||
smart_health['severity'] = 'WARNING'
|
||||
smart_health['issues'].append(f"High temperature: {raw_value}°C")
|
||||
else:
|
||||
# Handle countup/countdown behavior
|
||||
behavior = attr_thresholds.get('behavior', 'countup')
|
||||
if behavior == 'countup':
|
||||
if raw_value >= attr_thresholds['critical']:
|
||||
smart_health['severity'] = 'CRITICAL'
|
||||
smart_health['issues'].append(f"Critical {attr}: {raw_value}")
|
||||
elif raw_value >= thresholds['warning']:
|
||||
elif raw_value >= attr_thresholds['warning']:
|
||||
if smart_health['severity'] != 'CRITICAL':
|
||||
smart_health['severity'] = 'WARNING'
|
||||
smart_health['issues'].append(f"Warning {attr}: {raw_value}")
|
||||
elif behavior == 'countdown':
|
||||
if raw_value <= attr_thresholds['critical']:
|
||||
smart_health['severity'] = 'CRITICAL'
|
||||
smart_health['issues'].append(f"Critical {attr}: {raw_value}")
|
||||
elif raw_value <= attr_thresholds['warning']:
|
||||
if smart_health['severity'] != 'CRITICAL':
|
||||
smart_health['severity'] = 'WARNING'
|
||||
smart_health['issues'].append(f"Warning {attr}: {raw_value}")
|
||||
|
||||
Reference in New Issue
Block a user