Improved SMART status checks
This commit is contained in:
@ -244,13 +244,9 @@ class SystemHealthMonitor:
|
|||||||
:return: Boolean indicating if it's a physical disk
|
:return: Boolean indicating if it's a physical disk
|
||||||
"""
|
"""
|
||||||
return bool(re.match(r'/dev/(sd[a-z]|nvme\d+n\d+|mmcblk\d+)', device_path))
|
return bool(re.match(r'/dev/(sd[a-z]|nvme\d+n\d+|mmcblk\d+)', device_path))
|
||||||
|
|
||||||
def _check_smart_health(self, device: str) -> Dict[str, Any]:
|
def _check_smart_health(self, device: str) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Check comprehensive SMART health metrics for a drive.
|
Check comprehensive SMART health metrics for a drive.
|
||||||
|
|
||||||
:param device: Path to device
|
|
||||||
:return: Dictionary containing health metrics and status
|
|
||||||
"""
|
"""
|
||||||
smart_health = {
|
smart_health = {
|
||||||
'status': 'HEALTHY',
|
'status': 'HEALTHY',
|
||||||
@ -259,7 +255,6 @@ class SystemHealthMonitor:
|
|||||||
'attributes': {}
|
'attributes': {}
|
||||||
}
|
}
|
||||||
|
|
||||||
# Get detailed SMART attributes
|
|
||||||
try:
|
try:
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
['smartctl', '-A', '-H', device],
|
['smartctl', '-A', '-H', device],
|
||||||
@ -270,42 +265,72 @@ class SystemHealthMonitor:
|
|||||||
|
|
||||||
output = result.stdout
|
output = result.stdout
|
||||||
|
|
||||||
# Check critical attributes
|
# Check overall SMART status first
|
||||||
critical_thresholds = {
|
if 'FAILED' in output and not 'PASSED' in output:
|
||||||
'Reallocated_Sector_Ct': 10,
|
smart_health['status'] = 'UNHEALTHY'
|
||||||
'Current_Pending_Sector': 1,
|
smart_health['issues'].append("SMART overall health check failed")
|
||||||
'Offline_Uncorrectable': 1,
|
|
||||||
'Reported_Uncorrect': 1,
|
# Define critical attributes and their thresholds
|
||||||
'Command_Timeout': 5,
|
critical_attributes = {
|
||||||
'Temperature_Celsius': 60
|
'Reallocated_Sector_Ct': {'threshold': 0, 'critical': True},
|
||||||
|
'Current_Pending_Sector': {'threshold': 0, 'critical': True},
|
||||||
|
'Offline_Uncorrectable': {'threshold': 0, 'critical': True},
|
||||||
|
'Reported_Uncorrect': {'threshold': 0, 'critical': True},
|
||||||
|
'Command_Timeout': {'threshold': 5, 'critical': False},
|
||||||
|
'Temperature_Celsius': {'threshold': 65, 'critical': False},
|
||||||
|
'Wear_Leveling_Count': {'threshold': 10, 'critical': True},
|
||||||
|
'Media_Wearout_Indicator': {'threshold': 20, 'critical': True}
|
||||||
}
|
}
|
||||||
|
|
||||||
for line in output.split('\n'):
|
for line in output.split('\n'):
|
||||||
for attr, threshold in critical_thresholds.items():
|
# Skip header lines
|
||||||
if attr in line:
|
if 'ATTRIBUTE_NAME' in line or '===' in line:
|
||||||
try:
|
continue
|
||||||
value = int(line.split()[9]) # Raw value is typically in column 10
|
|
||||||
smart_health['attributes'][attr] = value
|
for attr_name, limits in critical_attributes.items():
|
||||||
|
if attr_name in line:
|
||||||
|
parts = line.split()
|
||||||
|
if len(parts) >= 10:
|
||||||
|
value = int(parts[9]) # Raw value
|
||||||
|
normalized = int(parts[3]) # Normalized value
|
||||||
|
|
||||||
if attr == 'Temperature_Celsius':
|
smart_health['attributes'][attr_name] = {
|
||||||
|
'raw': value,
|
||||||
|
'normalized': normalized
|
||||||
|
}
|
||||||
|
|
||||||
|
# Check thresholds
|
||||||
|
if attr_name == 'Temperature_Celsius':
|
||||||
smart_health['temp'] = value
|
smart_health['temp'] = value
|
||||||
if value > threshold:
|
if value > limits['threshold']:
|
||||||
smart_health['issues'].append(f"Drive temperature critical: {value}°C")
|
smart_health['issues'].append(
|
||||||
elif value > threshold:
|
f"Drive temperature critical: {value}°C"
|
||||||
smart_health['issues'].append(f"{attr} above threshold: {value}")
|
)
|
||||||
except (IndexError, ValueError):
|
elif value > limits['threshold']:
|
||||||
continue
|
if limits['critical']:
|
||||||
|
smart_health['status'] = 'UNHEALTHY'
|
||||||
# Check overall SMART status
|
smart_health['issues'].append(
|
||||||
if 'FAILED' in output or smart_health['issues']:
|
f"{attr_name} above threshold: {value}"
|
||||||
smart_health['status'] = 'UNHEALTHY'
|
)
|
||||||
|
|
||||||
|
# Check for very low normalized values
|
||||||
|
if normalized <= 10 and attr_name != 'Temperature_Celsius':
|
||||||
|
smart_health['issues'].append(
|
||||||
|
f"{attr_name} normalized value critical: {normalized}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Check if WHEN_FAILED is present and not in the past
|
||||||
|
if 'WHEN_FAILED' in output:
|
||||||
|
for line in output.split('\n'):
|
||||||
|
if 'WHEN_FAILED' in line and 'In_the_past' not in line and '-' not in line:
|
||||||
|
smart_health['status'] = 'UNHEALTHY'
|
||||||
|
smart_health['issues'].append(f"Current failure detected: {line}")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
smart_health['status'] = 'ERROR'
|
smart_health['status'] = 'ERROR'
|
||||||
smart_health['issues'].append(f"Error checking SMART: {str(e)}")
|
smart_health['issues'].append(f"Error checking SMART: {str(e)}")
|
||||||
|
|
||||||
return smart_health
|
return smart_health
|
||||||
|
|
||||||
def _check_drives_health(self) -> Dict[str, Any]:
|
def _check_drives_health(self) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Check overall health of physical SATA and NVMe drives including disk usage and SMART status.
|
Check overall health of physical SATA and NVMe drives including disk usage and SMART status.
|
||||||
|
|||||||
Reference in New Issue
Block a user