Changed detection for SMART

This commit is contained in:
2024-12-05 21:17:09 -05:00
parent 883aee7390
commit d1d41106bd

View File

@@ -248,7 +248,7 @@ class SystemHealthMonitor:
'temp': None, 'temp': None,
'attributes': {} 'attributes': {}
} }
try: try:
result = subprocess.run( result = subprocess.run(
['smartctl', '-A', '-H', device], ['smartctl', '-A', '-H', device],
@@ -256,74 +256,48 @@ class SystemHealthMonitor:
stderr=subprocess.PIPE, stderr=subprocess.PIPE,
text=True text=True
) )
output = result.stdout output = result.stdout
# Check overall SMART status first # Check overall SMART status first
if 'FAILED' in output and not 'PASSED' in output: if 'FAILED' in output and 'PASSED' not in output:
smart_health['status'] = 'UNHEALTHY' smart_health['status'] = 'UNHEALTHY'
smart_health['issues'].append("SMART overall health check failed") smart_health['issues'].append("SMART overall health check failed")
# Define critical attributes and their thresholds
critical_attributes = {
'Reallocated_Sector_Ct': {'threshold': 0, 'critical': True},
'Current_Pending_Sector': {'threshold': 0, 'critical': True},
'Offline_Uncorrectable': {'threshold': 0, 'critical': True},
'Reported_Uncorrect': {'threshold': 0, 'critical': True},
'Command_Timeout': {'threshold': 5, 'critical': False},
'Temperature_Celsius': {'threshold': 65, 'critical': False},
'Wear_Leveling_Count': {'threshold': 10, 'critical': True},
'Media_Wearout_Indicator': {'threshold': 20, 'critical': True}
}
# Parse SMART attributes
for line in output.split('\n'): for line in output.split('\n'):
# Skip header lines if 'ATTRIBUTE_NAME' in line:
if 'ATTRIBUTE_NAME' in line or '===' in line:
continue continue
for attr_name, limits in critical_attributes.items(): # Check for current failures only
if attr_name in line: if 'WHEN_FAILED' in line and 'In_the_past' not in line and '-' not in line:
smart_health['status'] = 'UNHEALTHY'
smart_health['issues'].append(f"Current failure detected: {line}")
# Monitor critical attributes
for attr in ['Reallocated_Sector_Ct', 'Current_Pending_Sector',
'Offline_Uncorrectable', 'Reported_Uncorrect']:
if attr in line:
parts = line.split() parts = line.split()
if len(parts) >= 10: if len(parts) >= 10:
value = int(parts[9]) # Raw value raw_value = int(parts[9])
normalized = int(parts[3]) # Normalized value if raw_value > 0:
smart_health['status'] = 'UNHEALTHY'
smart_health['issues'].append(f"{attr} has value {raw_value}")
smart_health['attributes'][attr_name] = { # Check temperature
'raw': value, if 'Temperature_Celsius' in line or 'Airflow_Temperature_Cel' in line:
'normalized': normalized parts = line.split()
} if len(parts) >= 10:
temp = int(parts[9])
# Check thresholds smart_health['temp'] = temp
if attr_name == 'Temperature_Celsius': if temp > 65:
smart_health['temp'] = value smart_health['issues'].append(f"High drive temperature: {temp}°C")
if value > limits['threshold']:
smart_health['issues'].append(
f"Drive temperature critical: {value}°C"
)
elif value > limits['threshold']:
if limits['critical']:
smart_health['status'] = 'UNHEALTHY'
smart_health['issues'].append(
f"{attr_name} above threshold: {value}"
)
# Check for very low normalized values
if normalized <= 10 and attr_name != 'Temperature_Celsius':
smart_health['issues'].append(
f"{attr_name} normalized value critical: {normalized}"
)
# Check if WHEN_FAILED is present and not in the past
if 'WHEN_FAILED' in output:
for line in output.split('\n'):
if 'WHEN_FAILED' in line and 'In_the_past' not in line and '-' not in line:
smart_health['status'] = 'UNHEALTHY'
smart_health['issues'].append(f"Current failure detected: {line}")
except Exception as e: except Exception as e:
smart_health['status'] = 'ERROR' smart_health['status'] = 'ERROR'
smart_health['issues'].append(f"Error checking SMART: {str(e)}") smart_health['issues'].append(f"Error checking SMART: {str(e)}")
return smart_health return smart_health
def _check_drives_health(self) -> Dict[str, Any]: def _check_drives_health(self) -> Dict[str, Any]:
""" """