Huge update to vendor profiles
This commit is contained in:
192
hwmonDaemon.py
192
hwmonDaemon.py
@ -110,21 +110,48 @@ class SystemHealthMonitor:
|
|||||||
MANUFACTURER_SMART_PROFILES = {
|
MANUFACTURER_SMART_PROFILES = {
|
||||||
'Ridata': {
|
'Ridata': {
|
||||||
'aliases': ['Ridata', 'Ritek', 'RIDATA', 'RITEK', 'SSD 512GB'],
|
'aliases': ['Ridata', 'Ritek', 'RIDATA', 'RITEK', 'SSD 512GB'],
|
||||||
'firmware_patterns': ['HT3618B7', 'HT36'], # Add exact firmware match first
|
'firmware_patterns': ['HT3618B7', 'HT36'],
|
||||||
'wear_leveling_behavior': 'countup',
|
'wear_leveling_behavior': 'countup',
|
||||||
'wear_leveling_baseline': 0,
|
'wear_leveling_baseline': 0,
|
||||||
'wear_leveling_thresholds': {
|
'wear_leveling_thresholds': {
|
||||||
'warning': 1000000, # Increase threshold significantly
|
'warning': 1000000000, # 1 billion - very conservative
|
||||||
'critical': 2000000 # Very high threshold for countup behavior
|
'critical': 2000000000 # 2 billion - extremely conservative
|
||||||
},
|
},
|
||||||
'attributes': {
|
'attributes': {
|
||||||
'Wear_Leveling_Count': {
|
'Wear_Leveling_Count': {
|
||||||
'behavior': 'countup',
|
'behavior': 'countup',
|
||||||
'baseline': 0,
|
'baseline': 0,
|
||||||
'warning_threshold': 1000000, # Much higher threshold
|
'warning_threshold': 1000000000,
|
||||||
'critical_threshold': 2000000, # Very high threshold
|
'critical_threshold': 2000000000,
|
||||||
'description': 'Total wear leveling operations performed (countup from 0)',
|
'description': 'Total wear leveling operations (countup from 0)',
|
||||||
'ignore_on_new_drive': True # Don't alert on new drives
|
'ignore_on_new_drive': False,
|
||||||
|
'monitor': True # Include in health checks
|
||||||
|
},
|
||||||
|
# These are operation counters, NOT actual failures - ignore completely
|
||||||
|
'Erase_Fail_Count_Chip': {
|
||||||
|
'monitor': False, # Skip monitoring entirely
|
||||||
|
'description': 'Operation counter, not actual failures - IGNORED'
|
||||||
|
},
|
||||||
|
'Program_Fail_Count_Chip': {
|
||||||
|
'monitor': False, # Skip monitoring entirely
|
||||||
|
'description': 'Operation counter, not actual failures - IGNORED'
|
||||||
|
},
|
||||||
|
# These are the REAL failure counters - monitor with standard thresholds
|
||||||
|
'Program_Fail_Cnt_Total': {
|
||||||
|
'monitor': True,
|
||||||
|
'behavior': 'countup',
|
||||||
|
'baseline': 0,
|
||||||
|
'warning_threshold': 1, # Any failures are concerning
|
||||||
|
'critical_threshold': 5,
|
||||||
|
'description': 'Actual program failures (real failures)'
|
||||||
|
},
|
||||||
|
'Erase_Fail_Count_Total': {
|
||||||
|
'monitor': True,
|
||||||
|
'behavior': 'countup',
|
||||||
|
'baseline': 0,
|
||||||
|
'warning_threshold': 1, # Any failures are concerning
|
||||||
|
'critical_threshold': 5,
|
||||||
|
'description': 'Actual erase failures (real failures)'
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
@ -142,7 +169,19 @@ class SystemHealthMonitor:
|
|||||||
'baseline': 0,
|
'baseline': 0,
|
||||||
'warning_threshold': 2000,
|
'warning_threshold': 2000,
|
||||||
'critical_threshold': 3000,
|
'critical_threshold': 3000,
|
||||||
'description': 'Total wear leveling operations performed'
|
'description': 'Total wear leveling operations performed',
|
||||||
|
'monitor': True
|
||||||
|
},
|
||||||
|
# Standard monitoring for all other attributes
|
||||||
|
'Program_Fail_Count': {
|
||||||
|
'monitor': True,
|
||||||
|
'warning_threshold': 10,
|
||||||
|
'critical_threshold': 20
|
||||||
|
},
|
||||||
|
'Erase_Fail_Count': {
|
||||||
|
'monitor': True,
|
||||||
|
'warning_threshold': 10,
|
||||||
|
'critical_threshold': 20
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
@ -160,7 +199,8 @@ class SystemHealthMonitor:
|
|||||||
'baseline': 100,
|
'baseline': 100,
|
||||||
'warning_threshold': 30,
|
'warning_threshold': 30,
|
||||||
'critical_threshold': 10,
|
'critical_threshold': 10,
|
||||||
'description': 'Percentage of rated life remaining'
|
'description': 'Percentage of rated life remaining',
|
||||||
|
'monitor': True
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
@ -171,6 +211,9 @@ class SystemHealthMonitor:
|
|||||||
'wear_leveling_thresholds': {
|
'wear_leveling_thresholds': {
|
||||||
'warning': 30,
|
'warning': 30,
|
||||||
'critical': 10
|
'critical': 10
|
||||||
|
},
|
||||||
|
'attributes': {
|
||||||
|
# All attributes use default monitoring unless specified
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
'Generic': { # Fallback for unknown manufacturers
|
'Generic': { # Fallback for unknown manufacturers
|
||||||
@ -180,6 +223,9 @@ class SystemHealthMonitor:
|
|||||||
'wear_leveling_thresholds': {
|
'wear_leveling_thresholds': {
|
||||||
'warning': None, # Don't trigger on unknown
|
'warning': None, # Don't trigger on unknown
|
||||||
'critical': None
|
'critical': None
|
||||||
|
},
|
||||||
|
'attributes': {
|
||||||
|
# All attributes use default monitoring
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1142,6 +1188,64 @@ class SystemHealthMonitor:
|
|||||||
logger.debug(f"No specific profile found for Model: '{model}', Firmware: '{firmware}', using Generic profile")
|
logger.debug(f"No specific profile found for Model: '{model}', Firmware: '{firmware}', using Generic profile")
|
||||||
return self.MANUFACTURER_SMART_PROFILES['Generic']
|
return self.MANUFACTURER_SMART_PROFILES['Generic']
|
||||||
|
|
||||||
|
def _should_monitor_attribute(self, attr_name: str, manufacturer_profile: dict) -> bool:
|
||||||
|
"""
|
||||||
|
Check if an attribute should be monitored based on manufacturer profile
|
||||||
|
"""
|
||||||
|
if not manufacturer_profile:
|
||||||
|
return True # Default: monitor everything
|
||||||
|
|
||||||
|
attr_config = manufacturer_profile.get('attributes', {}).get(attr_name, {})
|
||||||
|
|
||||||
|
# Check if explicitly set to not monitor
|
||||||
|
if attr_config.get('monitor') is False:
|
||||||
|
logger.debug(f"Skipping monitoring for {attr_name} - explicitly disabled")
|
||||||
|
return False
|
||||||
|
|
||||||
|
return True # Default: monitor unless explicitly disabled
|
||||||
|
|
||||||
|
def _get_attribute_thresholds(self, attr_name: str, manufacturer_profile: dict) -> dict:
|
||||||
|
"""
|
||||||
|
Get attribute-specific thresholds, falling back to defaults
|
||||||
|
"""
|
||||||
|
# Check for manufacturer-specific thresholds first
|
||||||
|
if manufacturer_profile:
|
||||||
|
attr_config = manufacturer_profile.get('attributes', {}).get(attr_name, {})
|
||||||
|
if 'warning_threshold' in attr_config and 'critical_threshold' in attr_config:
|
||||||
|
return {
|
||||||
|
'warning': attr_config['warning_threshold'],
|
||||||
|
'critical': attr_config['critical_threshold'],
|
||||||
|
'behavior': attr_config.get('behavior', 'countup')
|
||||||
|
}
|
||||||
|
|
||||||
|
# Fall back to BASE_SMART_THRESHOLDS (your existing thresholds)
|
||||||
|
BASE_SMART_THRESHOLDS = {
|
||||||
|
'Reallocated_Sector_Ct': {'warning': 5, 'critical': 10},
|
||||||
|
'Current_Pending_Sector': {'warning': 1, 'critical': 5},
|
||||||
|
'Offline_Uncorrectable': {'warning': 1, 'critical': 2},
|
||||||
|
'Reported_Uncorrect': {'warning': 1, 'critical': 10},
|
||||||
|
'Spin_Retry_Count': {'warning': 1, 'critical': 5},
|
||||||
|
'Power_Cycle_Count': {'warning': 5000, 'critical': 10000},
|
||||||
|
'Power_On_Hours': {'warning': 61320, 'critical': 70080},
|
||||||
|
'Temperature_Celsius': {'warning': 65, 'critical': 75},
|
||||||
|
'Available_Spare': {'warning': 30, 'critical': 10},
|
||||||
|
'Program_Fail_Count': {'warning': 10, 'critical': 20},
|
||||||
|
'Erase_Fail_Count': {'warning': 10, 'critical': 20},
|
||||||
|
'Load_Cycle_Count': {'warning': 900000, 'critical': 1000000},
|
||||||
|
'SSD_Life_Left': {'warning': 30, 'critical': 10},
|
||||||
|
'Program_Fail_Cnt_Total': {'warning': 1, 'critical': 5},
|
||||||
|
'Erase_Fail_Count_Total': {'warning': 1, 'critical': 5}
|
||||||
|
}
|
||||||
|
|
||||||
|
if attr_name in BASE_SMART_THRESHOLDS:
|
||||||
|
return {
|
||||||
|
'warning': BASE_SMART_THRESHOLDS[attr_name]['warning'],
|
||||||
|
'critical': BASE_SMART_THRESHOLDS[attr_name]['critical'],
|
||||||
|
'behavior': 'countup'
|
||||||
|
}
|
||||||
|
|
||||||
|
return None # No thresholds defined
|
||||||
|
|
||||||
def _is_new_drive(self, power_on_hours: int) -> bool:
|
def _is_new_drive(self, power_on_hours: int) -> bool:
|
||||||
"""
|
"""
|
||||||
Determine if a drive is considered "new" based on power-on hours.
|
Determine if a drive is considered "new" based on power-on hours.
|
||||||
@ -1318,29 +1422,59 @@ class SystemHealthMonitor:
|
|||||||
smart_health['severity'] = 'WARNING'
|
smart_health['severity'] = 'WARNING'
|
||||||
smart_health['issues'].append(f"Low wear leveling remaining: {raw_value}")
|
smart_health['issues'].append(f"Low wear leveling remaining: {raw_value}")
|
||||||
|
|
||||||
# Handle all other standard SMART attributes (except those already processed)
|
# Handle all SMART attributes with manufacturer-specific logic
|
||||||
for attr, thresholds in BASE_SMART_THRESHOLDS.items():
|
# Define all possible attributes we might encounter
|
||||||
if attr in line and attr not in ['Wear_Leveling_Count', 'Erase_Fail_Count', 'Program_Fail_Count']:
|
ALL_SMART_ATTRIBUTES = [
|
||||||
parts = line.split()
|
'Reallocated_Sector_Ct', 'Current_Pending_Sector', 'Offline_Uncorrectable',
|
||||||
if len(parts) >= 10:
|
'Reported_Uncorrect', 'Spin_Retry_Count', 'Power_Cycle_Count', 'Power_On_Hours',
|
||||||
raw_value = self._parse_smart_value(parts[9])
|
'Temperature_Celsius', 'Available_Spare', 'Program_Fail_Count', 'Erase_Fail_Count',
|
||||||
smart_health['attributes'][attr] = raw_value
|
'Load_Cycle_Count', 'SSD_Life_Left', 'Program_Fail_Cnt_Total', 'Erase_Fail_Count_Total',
|
||||||
|
'Program_Fail_Count_Chip', 'Erase_Fail_Count_Chip'
|
||||||
|
]
|
||||||
|
|
||||||
|
for attr in ALL_SMART_ATTRIBUTES:
|
||||||
|
if attr in line and attr not in ['Wear_Leveling_Count']: # Wear_Leveling handled separately above
|
||||||
|
# Check if we should monitor this attribute
|
||||||
|
if not self._should_monitor_attribute(attr, manufacturer_profile):
|
||||||
|
logger.debug(f"Skipping {attr} - disabled for this manufacturer")
|
||||||
|
continue
|
||||||
|
|
||||||
|
parts = line.split()
|
||||||
|
if len(parts) >= 10:
|
||||||
|
raw_value = self._parse_smart_value(parts[9])
|
||||||
|
smart_health['attributes'][attr] = raw_value
|
||||||
|
|
||||||
if attr == 'Temperature_Celsius':
|
# Get manufacturer-specific or default thresholds
|
||||||
smart_health['temp'] = raw_value
|
attr_thresholds = self._get_attribute_thresholds(attr, manufacturer_profile)
|
||||||
if raw_value >= thresholds['critical']:
|
if not attr_thresholds:
|
||||||
smart_health['severity'] = 'CRITICAL'
|
continue
|
||||||
smart_health['issues'].append(f"Critical temperature: {raw_value}°C")
|
|
||||||
elif raw_value >= thresholds['warning']:
|
# Apply thresholds based on behavior
|
||||||
if smart_health['severity'] != 'CRITICAL':
|
if attr == 'Temperature_Celsius':
|
||||||
smart_health['severity'] = 'WARNING'
|
smart_health['temp'] = raw_value
|
||||||
smart_health['issues'].append(f"High temperature: {raw_value}°C")
|
if raw_value >= attr_thresholds['critical']:
|
||||||
else:
|
smart_health['severity'] = 'CRITICAL'
|
||||||
# Only trigger alerts if the raw value actually exceeds thresholds
|
smart_health['issues'].append(f"Critical temperature: {raw_value}°C")
|
||||||
if raw_value >= thresholds['critical']:
|
elif raw_value >= attr_thresholds['warning']:
|
||||||
|
if smart_health['severity'] != 'CRITICAL':
|
||||||
|
smart_health['severity'] = 'WARNING'
|
||||||
|
smart_health['issues'].append(f"High temperature: {raw_value}°C")
|
||||||
|
else:
|
||||||
|
# Handle countup/countdown behavior
|
||||||
|
behavior = attr_thresholds.get('behavior', 'countup')
|
||||||
|
if behavior == 'countup':
|
||||||
|
if raw_value >= attr_thresholds['critical']:
|
||||||
smart_health['severity'] = 'CRITICAL'
|
smart_health['severity'] = 'CRITICAL'
|
||||||
smart_health['issues'].append(f"Critical {attr}: {raw_value}")
|
smart_health['issues'].append(f"Critical {attr}: {raw_value}")
|
||||||
elif raw_value >= thresholds['warning']:
|
elif raw_value >= attr_thresholds['warning']:
|
||||||
|
if smart_health['severity'] != 'CRITICAL':
|
||||||
|
smart_health['severity'] = 'WARNING'
|
||||||
|
smart_health['issues'].append(f"Warning {attr}: {raw_value}")
|
||||||
|
elif behavior == 'countdown':
|
||||||
|
if raw_value <= attr_thresholds['critical']:
|
||||||
|
smart_health['severity'] = 'CRITICAL'
|
||||||
|
smart_health['issues'].append(f"Critical {attr}: {raw_value}")
|
||||||
|
elif raw_value <= attr_thresholds['warning']:
|
||||||
if smart_health['severity'] != 'CRITICAL':
|
if smart_health['severity'] != 'CRITICAL':
|
||||||
smart_health['severity'] = 'WARNING'
|
smart_health['severity'] = 'WARNING'
|
||||||
smart_health['issues'].append(f"Warning {attr}: {raw_value}")
|
smart_health['issues'].append(f"Warning {attr}: {raw_value}")
|
||||||
|
|||||||
Reference in New Issue
Block a user