More Detailed SMART Monitoring

This commit is contained in:
2025-03-03 16:17:43 -05:00
parent d45e866b63
commit 6a498ed33a

View File

@ -295,18 +295,34 @@ class SystemHealthMonitor:
def _check_smart_health(self, device: str) -> Dict[str, Any]: def _check_smart_health(self, device: str) -> Dict[str, Any]:
""" """
Check comprehensive SMART health metrics for a drive. Enhanced SMART health check with detailed failure thresholds.
""" """
smart_health = { smart_health = {
'status': 'HEALTHY', 'status': 'HEALTHY',
'severity': 'NORMAL',
'issues': [], 'issues': [],
'temp': None, 'temp': None,
'attributes': {} 'attributes': {}
} }
# Define critical SMART attributes and their thresholds
SMART_THRESHOLDS = {
'Reallocated_Sector_Ct': {'warning': 5, 'critical': 10},
'Current_Pending_Sector': {'warning': 1, 'critical': 5},
'Offline_Uncorrectable': {'warning': 1, 'critical': 2},
'Reported_Uncorrect': {'warning': 1, 'critical': 2},
'Spin_Retry_Count': {'warning': 1, 'critical': 5},
'Command_Timeout': {'warning': 5, 'critical': 10},
'Power_Cycle_Count': {'warning': 5000, 'critical': 10000},
'Power_On_Hours': {'warning': 35040, 'critical': 43800}, # ~4-5 years
'Media_Wearout_Indicator': {'warning': 30, 'critical': 10}, # Percentage remaining
'Temperature_Celsius': {'warning': 65, 'critical': 75}
}
try: try:
# Get detailed SMART data
result = subprocess.run( result = subprocess.run(
['smartctl', '-A', '-H', device], ['smartctl', '-A', '-H', '-l', 'error', device],
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.PIPE, stderr=subprocess.PIPE,
text=True text=True
@ -314,46 +330,60 @@ class SystemHealthMonitor:
output = result.stdout output = result.stdout
# Check overall SMART status first # Check overall health status
if 'FAILED' in output and 'PASSED' not in output: if 'FAILED' in output and 'PASSED' not in output:
smart_health['status'] = 'UNHEALTHY' smart_health['status'] = 'UNHEALTHY'
smart_health['severity'] = 'CRITICAL'
smart_health['issues'].append("SMART overall health check failed") smart_health['issues'].append("SMART overall health check failed")
# Parse SMART attributes # Parse SMART attributes with thresholds
for line in output.split('\n'): for line in output.split('\n'):
if 'ATTRIBUTE_NAME' in line: for attr, thresholds in SMART_THRESHOLDS.items():
continue
# Check for current failures only
if 'WHEN_FAILED' in line and 'In_the_past' not in line and '-' not in line:
smart_health['status'] = 'UNHEALTHY'
smart_health['issues'].append(f"Current failure detected: {line}")
# Monitor critical attributes
for attr in ['Reallocated_Sector_Ct', 'Current_Pending_Sector',
'Offline_Uncorrectable', 'Reported_Uncorrect']:
if attr in line: if attr in line:
parts = line.split() parts = line.split()
if len(parts) >= 10: if len(parts) >= 10:
raw_value = int(parts[9]) raw_value = int(parts[9])
if raw_value > 0: smart_health['attributes'][attr] = raw_value
smart_health['status'] = 'UNHEALTHY'
smart_health['issues'].append(f"{attr} has value {raw_value}")
# Check temperature if attr == 'Temperature_Celsius':
if 'Temperature_Celsius' in line or 'Airflow_Temperature_Cel' in line: smart_health['temp'] = raw_value
parts = line.split() if raw_value >= thresholds['critical']:
if len(parts) >= 10: smart_health['severity'] = 'CRITICAL'
temp = int(parts[9]) smart_health['issues'].append(f"Critical temperature: {raw_value}°C")
smart_health['temp'] = temp elif raw_value >= thresholds['warning']:
if temp > self.CONFIG['THRESHOLDS']['TEMPERATURE_WARNING']: smart_health['severity'] = 'WARNING'
smart_health['issues'].append(f"High drive temperature: {temp}°C") smart_health['issues'].append(f"High temperature: {raw_value}°C")
else:
if raw_value >= thresholds['critical']:
smart_health['severity'] = 'CRITICAL'
smart_health['issues'].append(f"Critical {attr}: {raw_value}")
elif raw_value >= thresholds['warning']:
if smart_health['severity'] != 'CRITICAL':
smart_health['severity'] = 'WARNING'
smart_health['issues'].append(f"Warning {attr}: {raw_value}")
# Check for recent SMART errors
error_log_pattern = r"Error \d+ occurred at disk power-on lifetime: (\d+) hours"
error_matches = re.finditer(error_log_pattern, output)
recent_errors = []
for match in error_matches:
error_hour = int(match.group(1))
current_hours = smart_health['attributes'].get('Power_On_Hours', 0)
if current_hours - error_hour < 168: # Errors within last week
recent_errors.append(match.group(0))
if recent_errors:
smart_health['severity'] = 'WARNING'
smart_health['issues'].extend(recent_errors)
except Exception as e: except Exception as e:
smart_health['status'] = 'ERROR' smart_health['status'] = 'ERROR'
smart_health['severity'] = 'UNKNOWN'
smart_health['issues'].append(f"Error checking SMART: {str(e)}") smart_health['issues'].append(f"Error checking SMART: {str(e)}")
return smart_health return smart_health
def _check_drives_health(self) -> Dict[str, Any]: def _check_drives_health(self) -> Dict[str, Any]:
""" """
Check overall health of physical SATA and NVMe drives including disk usage and SMART status. Check overall health of physical SATA and NVMe drives including disk usage and SMART status.