Aggregated disk checks into 1 function
This commit is contained in:
105
hwmonDaemon.py
105
hwmonDaemon.py
@ -42,11 +42,10 @@ class SystemHealthMonitor:
|
||||
:return: Dictionary with health check results
|
||||
"""
|
||||
health_report = {
|
||||
'disk_health': self._check_disk_health(),
|
||||
'drives_health': self._check_drives_health(),
|
||||
'memory_health': self._check_memory_usage(),
|
||||
'cpu_health': self._check_cpu_usage(),
|
||||
'network_health': self._check_network_status(),
|
||||
'drive_smart_status': self._check_drive_smart_status(),
|
||||
'temperature_health': self._check_system_temperatures()
|
||||
}
|
||||
return health_report
|
||||
@ -158,47 +157,77 @@ class SystemHealthMonitor:
|
||||
)
|
||||
return critical_issues
|
||||
|
||||
def _check_drives_health(self) -> Dict[str, Any]:
    """
    Check overall health of drives including disk usage and SMART status.

    Every partition returned by ``psutil.disk_partitions()`` gets one
    report holding its device, mountpoint, usage figures, a usage status
    (above 90 % is ``CRITICAL_HIGH_USAGE``, above 80 % is
    ``WARNING_HIGH_USAGE``) and the verdict of ``smartctl -H``.

    :return: ``{'overall_status': str, 'drives': [report, ...]}`` where
             ``overall_status`` is the most severe status seen on any
             drive, or ``{'error': str}`` when the partitions cannot be
             enumerated at all
    """
    # Least to most severe. The ranking keeps the precedence the per-drive
    # checks already implied (SMART failure outranks a usage warning but not
    # critical usage) and makes the result independent of partition order.
    severity = {
        'NORMAL': 0,
        'WARNING_HIGH_USAGE': 1,
        'UNHEALTHY': 2,
        'CRITICAL_HIGH_USAGE': 3,
    }
    drives_health = {'overall_status': 'NORMAL', 'drives': []}
    try:
        # Get all mounted partitions
        partitions = psutil.disk_partitions()
        overall_status = 'NORMAL'

        for partition in partitions:
            drive_report = {
                'device': partition.device,
                'mountpoint': partition.mountpoint
            }
            try:
                # Disk usage
                usage = psutil.disk_usage(partition.mountpoint)
                usage_status = 'NORMAL'
                if usage.percent > 90:
                    usage_status = 'CRITICAL_HIGH_USAGE'
                elif usage.percent > 80:
                    usage_status = 'WARNING_HIGH_USAGE'

                drive_report.update({
                    'total_space': self._convert_bytes(usage.total),
                    'used_space': self._convert_bytes(usage.used),
                    'free_space': self._convert_bytes(usage.free),
                    'usage_percent': usage.percent,
                    'usage_status': usage_status
                })

                # Only ever escalate: a milder status on a later drive must
                # not hide a worse one found earlier.
                if severity[usage_status] > severity[overall_status]:
                    overall_status = usage_status

                # SMART status
                try:
                    result = subprocess.run(
                        ['smartctl', '-H', partition.device],
                        stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE,
                        text=True,
                        # A wedged device must not stall the whole check;
                        # the timeout surfaces as an ERROR entry below.
                        timeout=30
                    )
                    output = result.stdout + result.stderr
                    # ATA/NVMe drives report "PASSED"; SCSI/SAS drives
                    # report "SMART Health Status: OK".
                    passed = (
                        'PASSED' in output
                        or 'SMART Health Status: OK' in output
                    )
                    smart_status = 'HEALTHY' if passed else 'UNHEALTHY'
                    drive_report['smart_status'] = smart_status

                    if severity[smart_status if not passed else 'NORMAL'] > severity[overall_status]:
                        overall_status = 'UNHEALTHY'

                except Exception as e:
                    # smartctl missing, not permitted, or timed out:
                    # record it on the drive, keep the usage data.
                    drive_report['smart_status'] = f"ERROR: {str(e)}"

            except Exception as e:
                drive_report['error'] = f"Could not check drive: {str(e)}"

            drives_health['drives'].append(drive_report)

        drives_health['overall_status'] = overall_status
        return drives_health

    except Exception as e:
        print(f"Drive health check failed: {e}")
        return {'error': str(e)}
|
||||
|
||||
|
||||
|
||||
def _convert_bytes(self, bytes_value: int, suffix: str = 'B') -> str:
|
||||
"""
|
||||
Convert bytes to human-readable format
|
||||
@ -246,34 +275,6 @@ class SystemHealthMonitor:
|
||||
print(f"CPU health check failed: {e}")
|
||||
return {'error': str(e)}
|
||||
|
||||
def _check_drive_smart_status(self) -> List[Dict[str, Any]]:
    """
    Check SMART status of drives using smartctl.

    Runs ``smartctl -H`` once for each distinct device that backs a
    partition returned by ``psutil.disk_partitions()``.

    :return: List of SMART status for drives. Each entry is
             ``{'drive': device, 'status': 'HEALTHY' | 'UNHEALTHY'}``, or
             ``{'drive': device, 'status': 'ERROR', 'error': message}``
             when smartctl could not be run for that device. If the
             partitions cannot be enumerated the list is
             ``[{'error': message}]``.
    """
    drives = []
    try:
        # A device mounted at several points appears once per mount;
        # probe it only once.
        seen = set()
        for disk in psutil.disk_partitions():
            drive = disk.device
            if drive in seen:
                continue
            seen.add(drive)
            try:
                # Use smartctl to check the drive's SMART status
                result = subprocess.run(
                    ['smartctl', '-H', drive],
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                    text=True,
                    # A wedged device must not stall the whole check; the
                    # timeout is reported as an ERROR entry below.
                    timeout=30
                )
                output = result.stdout + result.stderr
                # ATA/NVMe drives report "PASSED"; SCSI/SAS drives report
                # "SMART Health Status: OK".
                passed = (
                    'PASSED' in output
                    or 'SMART Health Status: OK' in output
                )
                status = 'HEALTHY' if passed else 'UNHEALTHY'
                drives.append({'drive': drive, 'status': status})
            except Exception as e:
                drives.append({'drive': drive, 'status': 'ERROR', 'error': str(e)})
    except Exception as e:
        print(f"SMART status check failed: {e}")
        return [{'error': str(e)}]
    return drives
|
||||
|
||||
def _check_network_status(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Check network connectivity between nodes and include detailed identifiers.
|
||||
|
||||
Reference in New Issue
Block a user