Added MMC support and more thorough SMART status checks
This commit is contained in:
142
hwmonDaemon.py
142
hwmonDaemon.py
@ -238,12 +238,73 @@ class SystemHealthMonitor:
|
||||
|
||||
def _is_physical_disk(self, device_path):
|
||||
"""
|
||||
Check if the device is a physical SATA or NVMe disk.
|
||||
|
||||
Check if the device is a physical SATA, NVMe, or MMC disk.
|
||||
|
||||
:param device_path: Path to the device
|
||||
:return: Boolean indicating if it's a physical disk
|
||||
"""
|
||||
return bool(re.match(r'/dev/(sd[a-z]|nvme\d+n\d+)', device_path))
|
||||
return bool(re.match(r'/dev/(sd[a-z]|nvme\d+n\d+|mmcblk\d+)', device_path))
|
||||
|
||||
def _check_smart_health(self, device: str) -> Dict[str, Any]:
|
||||
"""
|
||||
Check comprehensive SMART health metrics for a drive.
|
||||
|
||||
:param device: Path to device
|
||||
:return: Dictionary containing health metrics and status
|
||||
"""
|
||||
smart_health = {
|
||||
'status': 'HEALTHY',
|
||||
'issues': [],
|
||||
'temp': None,
|
||||
'attributes': {}
|
||||
}
|
||||
|
||||
# Get detailed SMART attributes
|
||||
try:
|
||||
result = subprocess.run(
|
||||
['smartctl', '-A', '-H', device],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True
|
||||
)
|
||||
|
||||
output = result.stdout
|
||||
|
||||
# Check critical attributes
|
||||
critical_thresholds = {
|
||||
'Reallocated_Sector_Ct': 10,
|
||||
'Current_Pending_Sector': 1,
|
||||
'Offline_Uncorrectable': 1,
|
||||
'Reported_Uncorrect': 1,
|
||||
'Command_Timeout': 5,
|
||||
'Temperature_Celsius': 60
|
||||
}
|
||||
|
||||
for line in output.split('\n'):
|
||||
for attr, threshold in critical_thresholds.items():
|
||||
if attr in line:
|
||||
try:
|
||||
value = int(line.split()[9]) # Raw value is typically in column 10
|
||||
smart_health['attributes'][attr] = value
|
||||
|
||||
if attr == 'Temperature_Celsius':
|
||||
smart_health['temp'] = value
|
||||
if value > threshold:
|
||||
smart_health['issues'].append(f"Drive temperature critical: {value}°C")
|
||||
elif value > threshold:
|
||||
smart_health['issues'].append(f"{attr} above threshold: {value}")
|
||||
except (IndexError, ValueError):
|
||||
continue
|
||||
|
||||
# Check overall SMART status
|
||||
if 'FAILED' in output or smart_health['issues']:
|
||||
smart_health['status'] = 'UNHEALTHY'
|
||||
|
||||
except Exception as e:
|
||||
smart_health['status'] = 'ERROR'
|
||||
smart_health['issues'].append(f"Error checking SMART: {str(e)}")
|
||||
|
||||
return smart_health
|
||||
|
||||
def _check_drives_health(self) -> Dict[str, Any]:
|
||||
"""
|
||||
@ -260,50 +321,45 @@ class SystemHealthMonitor:
|
||||
'device': partition.device,
|
||||
'mountpoint': partition.mountpoint
|
||||
}
|
||||
try:
|
||||
# Check disk usage
|
||||
usage = psutil.disk_usage(partition.mountpoint)
|
||||
disk_usage_status = 'NORMAL'
|
||||
if usage.percent > 90:
|
||||
disk_usage_status = 'CRITICAL_HIGH_USAGE'
|
||||
elif usage.percent > 80:
|
||||
disk_usage_status = 'WARNING_HIGH_USAGE'
|
||||
drive_report.update({
|
||||
'total_space': self._convert_bytes(usage.total),
|
||||
'used_space': self._convert_bytes(usage.used),
|
||||
'free_space': self._convert_bytes(usage.free),
|
||||
'usage_percent': usage.percent,
|
||||
'usage_status': disk_usage_status
|
||||
})
|
||||
# Update overall status based on usage
|
||||
if disk_usage_status == 'CRITICAL_HIGH_USAGE':
|
||||
overall_status = 'CRITICAL_HIGH_USAGE'
|
||||
elif disk_usage_status == 'WARNING_HIGH_USAGE' and overall_status != 'CRITICAL_HIGH_USAGE':
|
||||
overall_status = 'WARNING_HIGH_USAGE'
|
||||
# Check SMART status of the drive
|
||||
try:
|
||||
result = subprocess.run(
|
||||
['smartctl', '-H', '-d', 'auto', partition.device],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True
|
||||
)
|
||||
output = result.stdout + result.stderr
|
||||
drive_smart_status = 'HEALTHY' if 'PASSED' in output else 'UNHEALTHY'
|
||||
drive_report['smart_status'] = drive_smart_status
|
||||
# Update overall status if SMART status is unhealthy
|
||||
if drive_smart_status == 'UNHEALTHY' and overall_status != 'CRITICAL_HIGH_USAGE':
|
||||
overall_status = 'UNHEALTHY'
|
||||
except Exception as e:
|
||||
print(f"Error checking SMART status for {partition.device}: {str(e)}")
|
||||
drive_report['smart_status'] = 'ERROR'
|
||||
except Exception as e:
|
||||
logger.error(f"Could not check drive: {str(e)}")
|
||||
drive_report['error'] = str(e)
|
||||
|
||||
# Check disk usage
|
||||
usage = psutil.disk_usage(partition.mountpoint)
|
||||
disk_usage_status = 'NORMAL'
|
||||
if usage.percent > 90:
|
||||
disk_usage_status = 'CRITICAL_HIGH_USAGE'
|
||||
elif usage.percent > 80:
|
||||
disk_usage_status = 'WARNING_HIGH_USAGE'
|
||||
|
||||
drive_report.update({
|
||||
'total_space': self._convert_bytes(usage.total),
|
||||
'used_space': self._convert_bytes(usage.used),
|
||||
'free_space': self._convert_bytes(usage.free),
|
||||
'usage_percent': usage.percent,
|
||||
'usage_status': disk_usage_status
|
||||
})
|
||||
|
||||
# Check SMART health
|
||||
smart_health = self._check_smart_health(partition.device)
|
||||
drive_report.update({
|
||||
'smart_status': smart_health['status'],
|
||||
'smart_issues': smart_health['issues'],
|
||||
'temperature': smart_health['temp'],
|
||||
'smart_attributes': smart_health['attributes']
|
||||
})
|
||||
|
||||
# Update overall status
|
||||
if smart_health['status'] == 'UNHEALTHY' or disk_usage_status == 'CRITICAL_HIGH_USAGE':
|
||||
overall_status = 'CRITICAL'
|
||||
elif disk_usage_status == 'WARNING_HIGH_USAGE' and overall_status != 'CRITICAL':
|
||||
overall_status = 'WARNING'
|
||||
|
||||
drives_health['drives'].append(drive_report)
|
||||
|
||||
drives_health['overall_status'] = overall_status
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error checking drives health: {str(e)}")
|
||||
|
||||
return drives_health
|
||||
|
||||
@staticmethod
|
||||
|
||||
Reference in New Issue
Block a user