More Detailed SMART Monitoring

This commit is contained in:
2025-03-03 16:17:43 -05:00
parent d45e866b63
commit 6a498ed33a

View File

@ -295,65 +295,95 @@ class SystemHealthMonitor:
def _check_smart_health(self, device: str) -> Dict[str, Any]:
    """
    Run ``smartctl`` against a drive and grade its SMART data.

    The attribute table and the ATA error log are requested in a single
    ``smartctl -A -H -l error`` call. Each attribute listed in
    ``SMART_THRESHOLDS`` is compared with its warning/critical limits.

    Args:
        device: Device path handed to ``smartctl`` (e.g. ``/dev/sda``).

    Returns:
        A dict with the keys:
          * ``status``     - 'HEALTHY', 'UNHEALTHY' or 'ERROR'
          * ``severity``   - 'NORMAL', 'WARNING', 'CRITICAL' or 'UNKNOWN'
          * ``issues``     - human-readable findings, in detection order
          * ``temp``       - Temperature_Celsius reading, or None
          * ``attributes`` - parsed value for every monitored attribute

    Any exception (missing binary, unexpected output, ...) is caught and
    reported as status 'ERROR' / severity 'UNKNOWN' rather than raised.
    """
    smart_health = {
        'status': 'HEALTHY',
        'severity': 'NORMAL',
        'issues': [],
        'temp': None,
        'attributes': {}
    }

    # Define critical SMART attributes and their thresholds
    SMART_THRESHOLDS = {
        'Reallocated_Sector_Ct': {'warning': 5, 'critical': 10},
        'Current_Pending_Sector': {'warning': 1, 'critical': 5},
        'Offline_Uncorrectable': {'warning': 1, 'critical': 2},
        'Reported_Uncorrect': {'warning': 1, 'critical': 2},
        'Spin_Retry_Count': {'warning': 1, 'critical': 5},
        'Command_Timeout': {'warning': 5, 'critical': 10},
        'Power_Cycle_Count': {'warning': 5000, 'critical': 10000},
        'Power_On_Hours': {'warning': 35040, 'critical': 43800},  # ~4-5 years
        'Media_Wearout_Indicator': {'warning': 30, 'critical': 10},  # Percentage remaining
        'Temperature_Celsius': {'warning': 65, 'critical': 75}
    }

    # Attributes whose reading counts DOWN toward failure. Their limits are
    # "percentage remaining", so they breach when the value drops to or
    # below the limit. The reading is taken from the normalised VALUE
    # column instead of RAW_VALUE.
    # NOTE(review): assumes VALUE carries the remaining-life percentage for
    # Media_Wearout_Indicator on the monitored drives - confirm per vendor.
    LOWER_IS_WORSE = {'Media_Wearout_Indicator'}

    SEVERITY_RANK = {'NORMAL': 0, 'WARNING': 1, 'CRITICAL': 2}

    def escalate(level: str) -> None:
        """Raise the recorded severity to *level*; never lower it."""
        current = SEVERITY_RANK.get(smart_health['severity'], 0)
        if SEVERITY_RANK[level] > current:
            smart_health['severity'] = level

    try:
        # Get detailed SMART data (attributes, overall health, error log)
        result = subprocess.run(
            ['smartctl', '-A', '-H', '-l', 'error', device],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True
        )
        output = result.stdout

        # Check overall health status
        if 'FAILED' in output and 'PASSED' not in output:
            smart_health['status'] = 'UNHEALTHY'
            escalate('CRITICAL')
            smart_health['issues'].append("SMART overall health check failed")

        # Parse SMART attributes with thresholds
        for line in output.split('\n'):
            parts = line.split()
            # Attribute rows have ten columns and start with a numeric ID:
            # ID# NAME FLAG VALUE WORST THRESH TYPE UPDATED WHEN_FAILED RAW
            # This also skips the header row and free-form text.
            if len(parts) < 10 or not parts[0].isdigit():
                continue

            name = parts[1]

            # Check for current failures only ('In_the_past' and '-' are
            # deliberately ignored).
            if parts[8] == 'FAILING_NOW':
                smart_health['status'] = 'UNHEALTHY'
                escalate('CRITICAL')
                smart_health['issues'].append(f"Current failure detected: {line}")

            thresholds = SMART_THRESHOLDS.get(name)
            if thresholds is None:
                continue

            counts_down = name in LOWER_IS_WORSE
            field = parts[3] if counts_down else parts[9]
            # Raw values may carry a suffix such as "1000h+12m+05.123s";
            # only the leading integer is meaningful here.
            leading_digits = re.match(r'\d+', field)
            if leading_digits is None:
                continue
            value = int(leading_digits.group())

            smart_health['attributes'][name] = value

            if name == 'Temperature_Celsius':
                smart_health['temp'] = value
                critical_msg = f"Critical temperature: {value}°C"
                warning_msg = f"High temperature: {value}°C"
            else:
                critical_msg = f"Critical {name}: {value}"
                warning_msg = f"Warning {name}: {value}"

            if counts_down:
                is_critical = value <= thresholds['critical']
                is_warning = value <= thresholds['warning']
            else:
                is_critical = value >= thresholds['critical']
                is_warning = value >= thresholds['warning']

            if is_critical:
                escalate('CRITICAL')
                smart_health['issues'].append(critical_msg)
            elif is_warning:
                escalate('WARNING')
                smart_health['issues'].append(warning_msg)

        # Check for recent SMART errors. Recency can only be judged when
        # the drive's current power-on hours are known; without them every
        # logged error would look recent, so the check is skipped.
        current_hours = smart_health['attributes'].get('Power_On_Hours')
        if current_hours is not None:
            error_log_pattern = r"Error \d+ occurred at disk power-on lifetime: (\d+) hours"
            recent_errors = [
                match.group(0)
                for match in re.finditer(error_log_pattern, output)
                if current_hours - int(match.group(1)) < 168  # Errors within last week
            ]
            if recent_errors:
                escalate('WARNING')
                smart_health['issues'].extend(recent_errors)

    except Exception as e:
        smart_health['status'] = 'ERROR'
        smart_health['severity'] = 'UNKNOWN'
        smart_health['issues'].append(f"Error checking SMART: {str(e)}")

    return smart_health
def _check_drives_health(self) -> Dict[str, Any]:
"""
Check overall health of physical SATA and NVMe drives including disk usage and SMART status.