Fixed thresholds for thermals and SMART

This commit is contained in:
2025-09-03 12:58:30 -04:00
parent bc73a691df
commit 2d6626cece

View File

@ -144,6 +144,15 @@ class SystemHealthMonitor:
'monitor': False, # Skip monitoring entirely
'description': 'Operation counter, not actual failures - IGNORED'
},
# ADD THIS: Regular Erase_Fail_Count is also an operation counter for Ridata
'Erase_Fail_Count': {
'monitor': False, # Skip monitoring entirely for Ridata
'description': 'Operation counter for Ridata drives, not actual failures - IGNORED'
},
'Program_Fail_Count': {
'monitor': False, # Skip monitoring entirely for Ridata
'description': 'Operation counter for Ridata drives, not actual failures - IGNORED'
},
# These are the REAL failure counters - monitor with standard thresholds
'Program_Fail_Cnt_Total': {
'monitor': True,
@ -578,11 +587,11 @@ class SystemHealthMonitor:
if temperature is None:
return issues
# Drive-type specific temperature thresholds
# Drive-type specific temperature thresholds - ADJUSTED TO BE LESS SENSITIVE
if drive_type == 'SSD':
temp_thresholds = {'warning': 70, 'critical': 85, 'optimal_max': 60}
temp_thresholds = {'warning': 70, 'critical': 85, 'optimal_max': 65} # Raised from 60
else: # HDD
temp_thresholds = {'warning': 55, 'critical': 65, 'optimal_max': 45}
temp_thresholds = {'warning': 60, 'critical': 70, 'optimal_max': 55} # Raised from 45/55/65
if temperature >= temp_thresholds['critical']:
issues.append(f"CRITICAL: Drive temperature {temperature}°C exceeds safe operating limit for {drive_type}")
@ -1519,7 +1528,7 @@ class SystemHealthMonitor:
'behavior': attr_config.get('behavior', 'countup')
}
# Enhanced BASE_SMART_THRESHOLDS with additional attributes
# Enhanced BASE_SMART_THRESHOLDS with manufacturer-specific handling
BASE_SMART_THRESHOLDS = {
'Reallocated_Sector_Ct': {'warning': 5, 'critical': 10},
'Current_Pending_Sector': {'warning': 1, 'critical': 5},
@ -1536,10 +1545,10 @@ class SystemHealthMonitor:
'SSD_Life_Left': {'warning': 30, 'critical': 10},
'Program_Fail_Cnt_Total': {'warning': 1, 'critical': 5},
'Erase_Fail_Count_Total': {'warning': 1, 'critical': 5},
# Enhanced SMART attributes for better failure detection
'Raw_Read_Error_Rate': {'warning': 100000, 'critical': 1000000},
'Seek_Error_Rate': {'warning': 100000, 'critical': 1000000},
'Command_Timeout': {'warning': 1, 'critical': 5},
# ADJUSTED: More lenient thresholds for error rates on unknown drives
'Raw_Read_Error_Rate': {'warning': 10000000, 'critical': 100000000}, # Raised significantly
'Seek_Error_Rate': {'warning': 10000000, 'critical': 100000000}, # Raised significantly
'Command_Timeout': {'warning': 100, 'critical': 1000}, # Raised significantly
'High_Fly_Writes': {'warning': 1, 'critical': 5},
'Airflow_Temperature_Cel': {'warning': 65, 'critical': 75},
'G_Sense_Error_Rate': {'warning': 100, 'critical': 1000},
@ -1658,11 +1667,16 @@ class SystemHealthMonitor:
parts = line.split()
if len(parts) >= 10:
raw_value = self._parse_smart_value(parts[9])
smart_attributes_raw[attr] = raw_value
smart_attributes_raw[f'{attr}_Total'] = raw_value # Store as _Total
logger.debug(f"Found {attr}_Total: {raw_value}")
break
# Only use non-_Total version if _Total not found
# Only use non-_Total version if _Total not found AND not Ridata
elif attr in line and f'{attr}_Total' not in smart_attributes_raw:
# Check if this is a Ridata drive and should skip regular counters
if manufacturer_profile and manufacturer_profile.get('aliases', [{}])[0] == 'Ridata':
logger.debug(f"Skipping {attr} for Ridata drive - using _Total version only")
continue
parts = line.split()
if len(parts) >= 10:
raw_value = self._parse_smart_value(parts[9])