Huge update to vendor profiles

This commit is contained in:
2025-07-24 19:15:21 -04:00
parent a74c4c0309
commit 0faf7654d6

View File

@ -110,21 +110,48 @@ class SystemHealthMonitor:
MANUFACTURER_SMART_PROFILES = {
'Ridata': {
'aliases': ['Ridata', 'Ritek', 'RIDATA', 'RITEK', 'SSD 512GB'],
'firmware_patterns': ['HT3618B7', 'HT36'], # Add exact firmware match first
'firmware_patterns': ['HT3618B7', 'HT36'],
'wear_leveling_behavior': 'countup',
'wear_leveling_baseline': 0,
'wear_leveling_thresholds': {
'warning': 1000000, # Increase threshold significantly
'critical': 2000000 # Very high threshold for countup behavior
'warning': 1000000000, # 1 billion - very conservative
'critical': 2000000000 # 2 billion - extremely conservative
},
'attributes': {
'Wear_Leveling_Count': {
'behavior': 'countup',
'baseline': 0,
'warning_threshold': 1000000, # Much higher threshold
'critical_threshold': 2000000, # Very high threshold
'description': 'Total wear leveling operations performed (countup from 0)',
'ignore_on_new_drive': True # Don't alert on new drives
'warning_threshold': 1000000000,
'critical_threshold': 2000000000,
'description': 'Total wear leveling operations (countup from 0)',
'ignore_on_new_drive': False,
'monitor': True # Include in health checks
},
# These are operation counters, NOT actual failures - ignore completely
'Erase_Fail_Count_Chip': {
'monitor': False, # Skip monitoring entirely
'description': 'Operation counter, not actual failures - IGNORED'
},
'Program_Fail_Count_Chip': {
'monitor': False, # Skip monitoring entirely
'description': 'Operation counter, not actual failures - IGNORED'
},
# These are the REAL failure counters - monitor with standard thresholds
'Program_Fail_Cnt_Total': {
'monitor': True,
'behavior': 'countup',
'baseline': 0,
'warning_threshold': 1, # Any failures are concerning
'critical_threshold': 5,
'description': 'Actual program failures (real failures)'
},
'Erase_Fail_Count_Total': {
'monitor': True,
'behavior': 'countup',
'baseline': 0,
'warning_threshold': 1, # Any failures are concerning
'critical_threshold': 5,
'description': 'Actual erase failures (real failures)'
}
}
},
@ -142,7 +169,19 @@ class SystemHealthMonitor:
'baseline': 0,
'warning_threshold': 2000,
'critical_threshold': 3000,
'description': 'Total wear leveling operations performed'
'description': 'Total wear leveling operations performed',
'monitor': True
},
# Standard failure counters, monitored with explicit thresholds
'Program_Fail_Count': {
'monitor': True,
'warning_threshold': 10,
'critical_threshold': 20
},
'Erase_Fail_Count': {
'monitor': True,
'warning_threshold': 10,
'critical_threshold': 20
}
}
},
@ -160,7 +199,8 @@ class SystemHealthMonitor:
'baseline': 100,
'warning_threshold': 30,
'critical_threshold': 10,
'description': 'Percentage of rated life remaining'
'description': 'Percentage of rated life remaining',
'monitor': True
}
}
},
@ -171,6 +211,9 @@ class SystemHealthMonitor:
'wear_leveling_thresholds': {
'warning': 30,
'critical': 10
},
'attributes': {
# All attributes use default monitoring unless specified
}
},
'Generic': { # Fallback for unknown manufacturers
@ -180,6 +223,9 @@ class SystemHealthMonitor:
'wear_leveling_thresholds': {
'warning': None, # Don't trigger on unknown
'critical': None
},
'attributes': {
# All attributes use default monitoring
}
}
}
@ -1142,6 +1188,64 @@ class SystemHealthMonitor:
logger.debug(f"No specific profile found for Model: '{model}', Firmware: '{firmware}', using Generic profile")
return self.MANUFACTURER_SMART_PROFILES['Generic']
def _should_monitor_attribute(self, attr_name: str, manufacturer_profile: dict) -> bool:
"""
Check if an attribute should be monitored based on manufacturer profile
"""
if not manufacturer_profile:
return True # Default: monitor everything
attr_config = manufacturer_profile.get('attributes', {}).get(attr_name, {})
# Check if explicitly set to not monitor
if attr_config.get('monitor') is False:
logger.debug(f"Skipping monitoring for {attr_name} - explicitly disabled")
return False
return True # Default: monitor unless explicitly disabled
def _get_attribute_thresholds(self, attr_name: str, manufacturer_profile: dict) -> dict:
"""
Get attribute-specific thresholds, falling back to defaults
"""
# Check for manufacturer-specific thresholds first
if manufacturer_profile:
attr_config = manufacturer_profile.get('attributes', {}).get(attr_name, {})
if 'warning_threshold' in attr_config and 'critical_threshold' in attr_config:
return {
'warning': attr_config['warning_threshold'],
'critical': attr_config['critical_threshold'],
'behavior': attr_config.get('behavior', 'countup')
}
# Fall back to BASE_SMART_THRESHOLDS (your existing thresholds)
BASE_SMART_THRESHOLDS = {
'Reallocated_Sector_Ct': {'warning': 5, 'critical': 10},
'Current_Pending_Sector': {'warning': 1, 'critical': 5},
'Offline_Uncorrectable': {'warning': 1, 'critical': 2},
'Reported_Uncorrect': {'warning': 1, 'critical': 10},
'Spin_Retry_Count': {'warning': 1, 'critical': 5},
'Power_Cycle_Count': {'warning': 5000, 'critical': 10000},
'Power_On_Hours': {'warning': 61320, 'critical': 70080},
'Temperature_Celsius': {'warning': 65, 'critical': 75},
'Available_Spare': {'warning': 30, 'critical': 10},
'Program_Fail_Count': {'warning': 10, 'critical': 20},
'Erase_Fail_Count': {'warning': 10, 'critical': 20},
'Load_Cycle_Count': {'warning': 900000, 'critical': 1000000},
'SSD_Life_Left': {'warning': 30, 'critical': 10},
'Program_Fail_Cnt_Total': {'warning': 1, 'critical': 5},
'Erase_Fail_Count_Total': {'warning': 1, 'critical': 5}
}
if attr_name in BASE_SMART_THRESHOLDS:
return {
'warning': BASE_SMART_THRESHOLDS[attr_name]['warning'],
'critical': BASE_SMART_THRESHOLDS[attr_name]['critical'],
'behavior': 'countup'
}
return None # No thresholds defined
def _is_new_drive(self, power_on_hours: int) -> bool:
"""
Determine if a drive is considered "new" based on power-on hours.
@ -1318,29 +1422,59 @@ class SystemHealthMonitor:
smart_health['severity'] = 'WARNING'
smart_health['issues'].append(f"Low wear leveling remaining: {raw_value}")
# Handle all other standard SMART attributes (except those already processed)
for attr, thresholds in BASE_SMART_THRESHOLDS.items():
if attr in line and attr not in ['Wear_Leveling_Count', 'Erase_Fail_Count', 'Program_Fail_Count']:
parts = line.split()
if len(parts) >= 10:
raw_value = self._parse_smart_value(parts[9])
smart_health['attributes'][attr] = raw_value
# Handle all SMART attributes with manufacturer-specific logic
# Define all possible attributes we might encounter
ALL_SMART_ATTRIBUTES = [
'Reallocated_Sector_Ct', 'Current_Pending_Sector', 'Offline_Uncorrectable',
'Reported_Uncorrect', 'Spin_Retry_Count', 'Power_Cycle_Count', 'Power_On_Hours',
'Temperature_Celsius', 'Available_Spare', 'Program_Fail_Count', 'Erase_Fail_Count',
'Load_Cycle_Count', 'SSD_Life_Left', 'Program_Fail_Cnt_Total', 'Erase_Fail_Count_Total',
'Program_Fail_Count_Chip', 'Erase_Fail_Count_Chip'
]
for attr in ALL_SMART_ATTRIBUTES:
if attr in line and attr not in ['Wear_Leveling_Count']: # Wear_Leveling handled separately above
# Check if we should monitor this attribute
if not self._should_monitor_attribute(attr, manufacturer_profile):
logger.debug(f"Skipping {attr} - disabled for this manufacturer")
continue
parts = line.split()
if len(parts) >= 10:
raw_value = self._parse_smart_value(parts[9])
smart_health['attributes'][attr] = raw_value
if attr == 'Temperature_Celsius':
smart_health['temp'] = raw_value
if raw_value >= thresholds['critical']:
smart_health['severity'] = 'CRITICAL'
smart_health['issues'].append(f"Critical temperature: {raw_value}°C")
elif raw_value >= thresholds['warning']:
if smart_health['severity'] != 'CRITICAL':
smart_health['severity'] = 'WARNING'
smart_health['issues'].append(f"High temperature: {raw_value}°C")
else:
# Only trigger alerts if the raw value actually exceeds thresholds
if raw_value >= thresholds['critical']:
# Get manufacturer-specific or default thresholds
attr_thresholds = self._get_attribute_thresholds(attr, manufacturer_profile)
if not attr_thresholds:
continue
# Apply thresholds based on behavior
if attr == 'Temperature_Celsius':
smart_health['temp'] = raw_value
if raw_value >= attr_thresholds['critical']:
smart_health['severity'] = 'CRITICAL'
smart_health['issues'].append(f"Critical temperature: {raw_value}°C")
elif raw_value >= attr_thresholds['warning']:
if smart_health['severity'] != 'CRITICAL':
smart_health['severity'] = 'WARNING'
smart_health['issues'].append(f"High temperature: {raw_value}°C")
else:
# Handle countup/countdown behavior
behavior = attr_thresholds.get('behavior', 'countup')
if behavior == 'countup':
if raw_value >= attr_thresholds['critical']:
smart_health['severity'] = 'CRITICAL'
smart_health['issues'].append(f"Critical {attr}: {raw_value}")
elif raw_value >= thresholds['warning']:
elif raw_value >= attr_thresholds['warning']:
if smart_health['severity'] != 'CRITICAL':
smart_health['severity'] = 'WARNING'
smart_health['issues'].append(f"Warning {attr}: {raw_value}")
elif behavior == 'countdown':
if raw_value <= attr_thresholds['critical']:
smart_health['severity'] = 'CRITICAL'
smart_health['issues'].append(f"Critical {attr}: {raw_value}")
elif raw_value <= attr_thresholds['warning']:
if smart_health['severity'] != 'CRITICAL':
smart_health['severity'] = 'WARNING'
smart_health['issues'].append(f"Warning {attr}: {raw_value}")