Added MMC support and more thorough SMART status checks

This commit is contained in:
2024-12-05 21:03:45 -05:00
parent 81d723f2a4
commit db3ce2e64b

View File

@ -238,12 +238,73 @@ class SystemHealthMonitor:
def _is_physical_disk(self, device_path):
"""
Check if the device is a physical SATA or NVMe disk.
Check if the device is a physical SATA, NVMe, or MMC disk.
:param device_path: Path to the device
:return: Boolean indicating if it's a physical disk
"""
return bool(re.match(r'/dev/(sd[a-z]|nvme\d+n\d+)', device_path))
return bool(re.match(r'/dev/(sd[a-z]|nvme\d+n\d+|mmcblk\d+)', device_path))
def _check_smart_health(self, device: str) -> Dict[str, Any]:
"""
Check comprehensive SMART health metrics for a drive.
:param device: Path to device
:return: Dictionary containing health metrics and status
"""
smart_health = {
'status': 'HEALTHY',
'issues': [],
'temp': None,
'attributes': {}
}
# Get detailed SMART attributes
try:
result = subprocess.run(
['smartctl', '-A', '-H', device],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True
)
output = result.stdout
# Check critical attributes
critical_thresholds = {
'Reallocated_Sector_Ct': 10,
'Current_Pending_Sector': 1,
'Offline_Uncorrectable': 1,
'Reported_Uncorrect': 1,
'Command_Timeout': 5,
'Temperature_Celsius': 60
}
for line in output.split('\n'):
for attr, threshold in critical_thresholds.items():
if attr in line:
try:
value = int(line.split()[9]) # Raw value is typically in column 10
smart_health['attributes'][attr] = value
if attr == 'Temperature_Celsius':
smart_health['temp'] = value
if value > threshold:
smart_health['issues'].append(f"Drive temperature critical: {value}°C")
elif value > threshold:
smart_health['issues'].append(f"{attr} above threshold: {value}")
except (IndexError, ValueError):
continue
# Check overall SMART status
if 'FAILED' in output or smart_health['issues']:
smart_health['status'] = 'UNHEALTHY'
except Exception as e:
smart_health['status'] = 'ERROR'
smart_health['issues'].append(f"Error checking SMART: {str(e)}")
return smart_health
def _check_drives_health(self) -> Dict[str, Any]:
"""
@ -260,50 +321,45 @@ class SystemHealthMonitor:
'device': partition.device,
'mountpoint': partition.mountpoint
}
try:
# Check disk usage
usage = psutil.disk_usage(partition.mountpoint)
disk_usage_status = 'NORMAL'
if usage.percent > 90:
disk_usage_status = 'CRITICAL_HIGH_USAGE'
elif usage.percent > 80:
disk_usage_status = 'WARNING_HIGH_USAGE'
drive_report.update({
'total_space': self._convert_bytes(usage.total),
'used_space': self._convert_bytes(usage.used),
'free_space': self._convert_bytes(usage.free),
'usage_percent': usage.percent,
'usage_status': disk_usage_status
})
# Update overall status based on usage
if disk_usage_status == 'CRITICAL_HIGH_USAGE':
overall_status = 'CRITICAL_HIGH_USAGE'
elif disk_usage_status == 'WARNING_HIGH_USAGE' and overall_status != 'CRITICAL_HIGH_USAGE':
overall_status = 'WARNING_HIGH_USAGE'
# Check SMART status of the drive
try:
result = subprocess.run(
['smartctl', '-H', '-d', 'auto', partition.device],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True
)
output = result.stdout + result.stderr
drive_smart_status = 'HEALTHY' if 'PASSED' in output else 'UNHEALTHY'
drive_report['smart_status'] = drive_smart_status
# Update overall status if SMART status is unhealthy
if drive_smart_status == 'UNHEALTHY' and overall_status != 'CRITICAL_HIGH_USAGE':
overall_status = 'UNHEALTHY'
except Exception as e:
print(f"Error checking SMART status for {partition.device}: {str(e)}")
drive_report['smart_status'] = 'ERROR'
except Exception as e:
logger.error(f"Could not check drive: {str(e)}")
drive_report['error'] = str(e)
# Check disk usage
usage = psutil.disk_usage(partition.mountpoint)
disk_usage_status = 'NORMAL'
if usage.percent > 90:
disk_usage_status = 'CRITICAL_HIGH_USAGE'
elif usage.percent > 80:
disk_usage_status = 'WARNING_HIGH_USAGE'
drive_report.update({
'total_space': self._convert_bytes(usage.total),
'used_space': self._convert_bytes(usage.used),
'free_space': self._convert_bytes(usage.free),
'usage_percent': usage.percent,
'usage_status': disk_usage_status
})
# Check SMART health
smart_health = self._check_smart_health(partition.device)
drive_report.update({
'smart_status': smart_health['status'],
'smart_issues': smart_health['issues'],
'temperature': smart_health['temp'],
'smart_attributes': smart_health['attributes']
})
# Update overall status
if smart_health['status'] == 'UNHEALTHY' or disk_usage_status == 'CRITICAL_HIGH_USAGE':
overall_status = 'CRITICAL'
elif disk_usage_status == 'WARNING_HIGH_USAGE' and overall_status != 'CRITICAL':
overall_status = 'WARNING'
drives_health['drives'].append(drive_report)
drives_health['overall_status'] = overall_status
except Exception as e:
logger.error(f"Error checking drives health: {str(e)}")
return drives_health
@staticmethod