Aggregated the disk usage and SMART status checks into one function
This commit is contained in:
103
hwmonDaemon.py
103
hwmonDaemon.py
@ -42,11 +42,10 @@ class SystemHealthMonitor:
|
|||||||
:return: Dictionary with health check results
|
:return: Dictionary with health check results
|
||||||
"""
|
"""
|
||||||
health_report = {
|
health_report = {
|
||||||
'disk_health': self._check_disk_health(),
|
'drives_health': self._check_drives_health(),
|
||||||
'memory_health': self._check_memory_usage(),
|
'memory_health': self._check_memory_usage(),
|
||||||
'cpu_health': self._check_cpu_usage(),
|
'cpu_health': self._check_cpu_usage(),
|
||||||
'network_health': self._check_network_status(),
|
'network_health': self._check_network_status(),
|
||||||
'drive_smart_status': self._check_drive_smart_status(),
|
|
||||||
'temperature_health': self._check_system_temperatures()
|
'temperature_health': self._check_system_temperatures()
|
||||||
}
|
}
|
||||||
return health_report
|
return health_report
|
||||||
@ -158,47 +157,77 @@ class SystemHealthMonitor:
|
|||||||
)
|
)
|
||||||
return critical_issues
|
return critical_issues
|
||||||
|
|
||||||
def _check_disk_health(self) -> Dict[str, Any]:
|
def _check_drives_health(self) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Check disk usage and health
|
Check overall health of drives including disk usage and SMART status.
|
||||||
|
|
||||||
:return: Disk health metrics
|
:return: Combined health report of all drives
|
||||||
"""
|
"""
|
||||||
disk_health = {'partitions': []}
|
drives_health = {'overall_status': 'NORMAL', 'drives': []}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Get all mounted partitions
|
|
||||||
partitions = psutil.disk_partitions()
|
partitions = psutil.disk_partitions()
|
||||||
|
overall_status = 'NORMAL'
|
||||||
|
|
||||||
for partition in partitions:
|
for partition in partitions:
|
||||||
|
drive_report = {
|
||||||
|
'device': partition.device,
|
||||||
|
'mountpoint': partition.mountpoint
|
||||||
|
}
|
||||||
try:
|
try:
|
||||||
|
# Disk usage
|
||||||
usage = psutil.disk_usage(partition.mountpoint)
|
usage = psutil.disk_usage(partition.mountpoint)
|
||||||
partition_info = {
|
usage_status = 'NORMAL'
|
||||||
'mountpoint': partition.mountpoint,
|
if usage.percent > 90:
|
||||||
|
usage_status = 'CRITICAL_HIGH_USAGE'
|
||||||
|
elif usage.percent > 80:
|
||||||
|
usage_status = 'WARNING_HIGH_USAGE'
|
||||||
|
|
||||||
|
drive_report.update({
|
||||||
'total_space': self._convert_bytes(usage.total),
|
'total_space': self._convert_bytes(usage.total),
|
||||||
'used_space': self._convert_bytes(usage.used),
|
'used_space': self._convert_bytes(usage.used),
|
||||||
'free_space': self._convert_bytes(usage.free),
|
'free_space': self._convert_bytes(usage.free),
|
||||||
'usage_percent': usage.percent
|
'usage_percent': usage.percent,
|
||||||
}
|
'usage_status': usage_status
|
||||||
|
})
|
||||||
|
|
||||||
# Flag high usage
|
# Update overall status
|
||||||
if usage.percent > 90:
|
if usage_status == 'CRITICAL_HIGH_USAGE':
|
||||||
partition_info['status'] = 'CRITICAL_HIGH_USAGE'
|
overall_status = 'CRITICAL_HIGH_USAGE'
|
||||||
elif usage.percent > 80:
|
elif usage_status == 'WARNING_HIGH_USAGE' and overall_status != 'CRITICAL_HIGH_USAGE':
|
||||||
partition_info['status'] = 'WARNING_HIGH_USAGE'
|
overall_status = 'WARNING_HIGH_USAGE'
|
||||||
else:
|
|
||||||
partition_info['status'] = 'NORMAL'
|
# SMART status
|
||||||
|
try:
|
||||||
|
result = subprocess.run(
|
||||||
|
['smartctl', '-H', partition.device],
|
||||||
|
stdout=subprocess.PIPE,
|
||||||
|
stderr=subprocess.PIPE,
|
||||||
|
text=True
|
||||||
|
)
|
||||||
|
output = result.stdout + result.stderr
|
||||||
|
smart_status = 'HEALTHY' if 'PASSED' in output else 'UNHEALTHY'
|
||||||
|
drive_report['smart_status'] = smart_status
|
||||||
|
|
||||||
|
if smart_status == 'UNHEALTHY' and overall_status != 'CRITICAL_HIGH_USAGE':
|
||||||
|
overall_status = 'UNHEALTHY'
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
drive_report['smart_status'] = f"ERROR: {str(e)}"
|
||||||
|
|
||||||
disk_health['partitions'].append(partition_info)
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Could not check partition {partition.mountpoint}: {e}")
|
drive_report['error'] = f"Could not check drive: {str(e)}"
|
||||||
|
|
||||||
return disk_health
|
drives_health['drives'].append(drive_report)
|
||||||
|
|
||||||
|
drives_health['overall_status'] = overall_status
|
||||||
|
return drives_health
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Disk health check failed: {e}")
|
print(f"Drive health check failed: {e}")
|
||||||
return {'error': str(e)}
|
return {'error': str(e)}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def _convert_bytes(self, bytes_value: int, suffix: str = 'B') -> str:
|
def _convert_bytes(self, bytes_value: int, suffix: str = 'B') -> str:
|
||||||
"""
|
"""
|
||||||
Convert bytes to human-readable format
|
Convert bytes to human-readable format
|
||||||
@ -246,34 +275,6 @@ class SystemHealthMonitor:
|
|||||||
print(f"CPU health check failed: {e}")
|
print(f"CPU health check failed: {e}")
|
||||||
return {'error': str(e)}
|
return {'error': str(e)}
|
||||||
|
|
||||||
def _check_drive_smart_status(self) -> List[Dict[str, Any]]:
|
|
||||||
"""
|
|
||||||
Check SMART status of drives using smartctl.
|
|
||||||
|
|
||||||
:return: List of SMART status for drives
|
|
||||||
"""
|
|
||||||
drives = []
|
|
||||||
try:
|
|
||||||
for disk in psutil.disk_partitions():
|
|
||||||
drive = disk.device
|
|
||||||
try:
|
|
||||||
# Use smartctl to check the drive's SMART status
|
|
||||||
result = subprocess.run(
|
|
||||||
['smartctl', '-H', drive],
|
|
||||||
stdout=subprocess.PIPE,
|
|
||||||
stderr=subprocess.PIPE,
|
|
||||||
text=True
|
|
||||||
)
|
|
||||||
output = result.stdout + result.stderr
|
|
||||||
status = 'HEALTHY' if 'PASSED' in output else 'UNHEALTHY'
|
|
||||||
drives.append({'drive': drive, 'status': status})
|
|
||||||
except Exception as e:
|
|
||||||
drives.append({'drive': drive, 'status': 'ERROR', 'error': str(e)})
|
|
||||||
except Exception as e:
|
|
||||||
print(f"SMART status check failed: {e}")
|
|
||||||
return [{'error': str(e)}]
|
|
||||||
return drives
|
|
||||||
|
|
||||||
def _check_network_status(self) -> Dict[str, Any]:
|
def _check_network_status(self) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Check network connectivity between nodes and include detailed identifiers.
|
Check network connectivity between nodes and include detailed identifiers.
|
||||||
|
|||||||
Reference in New Issue
Block a user