Improved SMART status checks

This commit is contained in:
2024-12-05 21:09:37 -05:00
parent db3ce2e64b
commit 0a51bee132

View File

@ -244,13 +244,9 @@ class SystemHealthMonitor:
:return: Boolean indicating if it's a physical disk :return: Boolean indicating if it's a physical disk
""" """
return bool(re.match(r'/dev/(sd[a-z]|nvme\d+n\d+|mmcblk\d+)', device_path)) return bool(re.match(r'/dev/(sd[a-z]|nvme\d+n\d+|mmcblk\d+)', device_path))
def _check_smart_health(self, device: str) -> Dict[str, Any]: def _check_smart_health(self, device: str) -> Dict[str, Any]:
""" """
Check comprehensive SMART health metrics for a drive. Check comprehensive SMART health metrics for a drive.
:param device: Path to device
:return: Dictionary containing health metrics and status
""" """
smart_health = { smart_health = {
'status': 'HEALTHY', 'status': 'HEALTHY',
@ -259,7 +255,6 @@ class SystemHealthMonitor:
'attributes': {} 'attributes': {}
} }
# Get detailed SMART attributes
try: try:
result = subprocess.run( result = subprocess.run(
['smartctl', '-A', '-H', device], ['smartctl', '-A', '-H', device],
@ -270,42 +265,72 @@ class SystemHealthMonitor:
output = result.stdout output = result.stdout
# Check critical attributes # Check overall SMART status first
critical_thresholds = { if 'FAILED' in output and not 'PASSED' in output:
'Reallocated_Sector_Ct': 10, smart_health['status'] = 'UNHEALTHY'
'Current_Pending_Sector': 1, smart_health['issues'].append("SMART overall health check failed")
'Offline_Uncorrectable': 1,
'Reported_Uncorrect': 1, # Define critical attributes and their thresholds
'Command_Timeout': 5, critical_attributes = {
'Temperature_Celsius': 60 'Reallocated_Sector_Ct': {'threshold': 0, 'critical': True},
'Current_Pending_Sector': {'threshold': 0, 'critical': True},
'Offline_Uncorrectable': {'threshold': 0, 'critical': True},
'Reported_Uncorrect': {'threshold': 0, 'critical': True},
'Command_Timeout': {'threshold': 5, 'critical': False},
'Temperature_Celsius': {'threshold': 65, 'critical': False},
'Wear_Leveling_Count': {'threshold': 10, 'critical': True},
'Media_Wearout_Indicator': {'threshold': 20, 'critical': True}
} }
for line in output.split('\n'): for line in output.split('\n'):
for attr, threshold in critical_thresholds.items(): # Skip header lines
if attr in line: if 'ATTRIBUTE_NAME' in line or '===' in line:
try: continue
value = int(line.split()[9]) # Raw value is typically in column 10
smart_health['attributes'][attr] = value for attr_name, limits in critical_attributes.items():
if attr_name in line:
parts = line.split()
if len(parts) >= 10:
value = int(parts[9]) # Raw value
normalized = int(parts[3]) # Normalized value
if attr == 'Temperature_Celsius': smart_health['attributes'][attr_name] = {
'raw': value,
'normalized': normalized
}
# Check thresholds
if attr_name == 'Temperature_Celsius':
smart_health['temp'] = value smart_health['temp'] = value
if value > threshold: if value > limits['threshold']:
smart_health['issues'].append(f"Drive temperature critical: {value}°C") smart_health['issues'].append(
elif value > threshold: f"Drive temperature critical: {value}°C"
smart_health['issues'].append(f"{attr} above threshold: {value}") )
except (IndexError, ValueError): elif value > limits['threshold']:
continue if limits['critical']:
smart_health['status'] = 'UNHEALTHY'
# Check overall SMART status smart_health['issues'].append(
if 'FAILED' in output or smart_health['issues']: f"{attr_name} above threshold: {value}"
smart_health['status'] = 'UNHEALTHY' )
# Check for very low normalized values
if normalized <= 10 and attr_name != 'Temperature_Celsius':
smart_health['issues'].append(
f"{attr_name} normalized value critical: {normalized}"
)
# Check if WHEN_FAILED is present and not in the past
if 'WHEN_FAILED' in output:
for line in output.split('\n'):
if 'WHEN_FAILED' in line and 'In_the_past' not in line and '-' not in line:
smart_health['status'] = 'UNHEALTHY'
smart_health['issues'].append(f"Current failure detected: {line}")
except Exception as e: except Exception as e:
smart_health['status'] = 'ERROR' smart_health['status'] = 'ERROR'
smart_health['issues'].append(f"Error checking SMART: {str(e)}") smart_health['issues'].append(f"Error checking SMART: {str(e)}")
return smart_health return smart_health
def _check_drives_health(self) -> Dict[str, Any]: def _check_drives_health(self) -> Dict[str, Any]:
""" """
Check overall health of physical SATA and NVMe drives including disk usage and SMART status. Check overall health of physical SATA and NVMe drives including disk usage and SMART status.