Aggregated disk checks into 1 function

This commit is contained in:
2024-12-04 21:20:52 -05:00
parent 0b26b67019
commit aeabb69b94

View File

@ -42,11 +42,10 @@ class SystemHealthMonitor:
:return: Dictionary with health check results
"""
health_report = {
'disk_health': self._check_disk_health(),
'drives_health': self._check_drives_health(),
'memory_health': self._check_memory_usage(),
'cpu_health': self._check_cpu_usage(),
'network_health': self._check_network_status(),
'drive_smart_status': self._check_drive_smart_status(),
'temperature_health': self._check_system_temperatures()
}
return health_report
@ -158,47 +157,77 @@ class SystemHealthMonitor:
)
return critical_issues
def _check_disk_health(self) -> Dict[str, Any]:
    """
    Check disk usage of every mounted partition.

    This is the pre-aggregation check (usage only, no SMART data); it is
    kept alongside ``_check_drives_health`` because the health report
    still references both names.

    :return: ``{'partitions': [...]}`` with one entry per partition that
        could be read (mountpoint, human-readable sizes, usage percent and
        a NORMAL / WARNING_HIGH_USAGE / CRITICAL_HIGH_USAGE status), or
        ``{'error': <message>}`` if the partitions could not be listed.
    """
    disk_health = {'partitions': []}
    try:
        # Get all mounted partitions
        for partition in psutil.disk_partitions():
            try:
                usage = psutil.disk_usage(partition.mountpoint)
                disk_health['partitions'].append({
                    'mountpoint': partition.mountpoint,
                    'total_space': self._convert_bytes(usage.total),
                    'used_space': self._convert_bytes(usage.used),
                    'free_space': self._convert_bytes(usage.free),
                    'usage_percent': usage.percent,
                    'status': self._usage_status(usage.percent),
                })
            except Exception as e:
                # Best effort: one unreadable partition must not hide the rest.
                print(f"Could not check partition {partition.mountpoint}: {e}")
        return disk_health
    except Exception as e:
        print(f"Disk health check failed: {e}")
        return {'error': str(e)}


def _check_drives_health(self) -> Dict[str, Any]:
    """
    Check overall health of drives including disk usage and SMART status.

    For every mounted partition the report holds the device, mountpoint,
    usage figures with a ``usage_status`` and the ``smart_status`` obtained
    from ``smartctl -H``. The usage check and the SMART check are
    independent: if reading the usage fails, an ``error`` entry is recorded
    and the SMART status is still probed.

    ``overall_status`` is the most severe status seen on any partition,
    ranked NORMAL < WARNING_HIGH_USAGE < UNHEALTHY < CRITICAL_HIGH_USAGE,
    regardless of the order in which partitions are listed.

    :return: ``{'overall_status': <status>, 'drives': [...]}``, or
        ``{'error': <message>}`` if the partitions could not be listed.
    """
    # Explicit ranking so a later, milder finding can never downgrade an
    # earlier, more severe one.
    severity = {
        'NORMAL': 0,
        'WARNING_HIGH_USAGE': 1,
        'UNHEALTHY': 2,
        'CRITICAL_HIGH_USAGE': 3,
    }
    drives_health = {'overall_status': 'NORMAL', 'drives': []}
    try:
        overall_status = 'NORMAL'
        for partition in psutil.disk_partitions():
            drive_report = {
                'device': partition.device,
                'mountpoint': partition.mountpoint,
            }
            findings = []

            # Disk usage
            try:
                usage = psutil.disk_usage(partition.mountpoint)
                usage_status = self._usage_status(usage.percent)
                drive_report.update({
                    'total_space': self._convert_bytes(usage.total),
                    'used_space': self._convert_bytes(usage.used),
                    'free_space': self._convert_bytes(usage.free),
                    'usage_percent': usage.percent,
                    'usage_status': usage_status,
                })
                findings.append(usage_status)
            except Exception as e:
                drive_report['error'] = f"Could not check drive: {str(e)}"

            # SMART status
            try:
                smart_status = self._smart_status(partition.device)
                drive_report['smart_status'] = smart_status
                if smart_status == 'UNHEALTHY':
                    findings.append('UNHEALTHY')
            except Exception as e:
                drive_report['smart_status'] = f"ERROR: {str(e)}"

            # max() keeps the current value on ties, so the status only
            # ever escalates.
            overall_status = max(
                [overall_status, *findings], key=severity.__getitem__
            )
            drives_health['drives'].append(drive_report)

        drives_health['overall_status'] = overall_status
        return drives_health
    except Exception as e:
        print(f"Drive health check failed: {e}")
        return {'error': str(e)}


def _usage_status(self, percent: float) -> str:
    """Classify a usage percentage: above 90 is critical, above 80 is a warning."""
    if percent > 90:
        return 'CRITICAL_HIGH_USAGE'
    if percent > 80:
        return 'WARNING_HIGH_USAGE'
    return 'NORMAL'


def _smart_status(self, device: str) -> str:
    """
    Run ``smartctl -H`` on *device* and return 'HEALTHY' or 'UNHEALTHY'.

    Exceptions from launching the process (for example smartctl not being
    installed) propagate to the caller.
    """
    result = subprocess.run(
        ['smartctl', '-H', device],
        capture_output=True,
        text=True,
    )
    output = result.stdout + result.stderr
    # ATA and NVMe devices report "... test result: PASSED"; SCSI/SAS
    # devices report "SMART Health Status: OK" instead.
    healthy = 'PASSED' in output or 'SMART Health Status: OK' in output
    return 'HEALTHY' if healthy else 'UNHEALTHY'
def _convert_bytes(self, bytes_value: int, suffix: str = 'B') -> str:
"""
Convert bytes to human-readable format
@ -246,34 +275,6 @@ class SystemHealthMonitor:
print(f"CPU health check failed: {e}")
return {'error': str(e)}
def _check_drive_smart_status(self) -> List[Dict[str, Any]]:
    """
    Check SMART status of drives using smartctl.

    Runs ``smartctl -H`` once for the device of every mounted partition
    and classifies the combined stdout/stderr text.

    :return: List with one ``{'drive': <device>, 'status': <status>}``
        entry per partition, where status is 'HEALTHY', 'UNHEALTHY' or
        'ERROR' (with an extra ``'error'`` message when smartctl could not
        be run). If the partitions cannot be listed at all, a single
        ``[{'error': <message>}]`` entry is returned.
    """
    drives = []
    try:
        for disk in psutil.disk_partitions():
            drive = disk.device
            try:
                # Use smartctl to check the drive's SMART status
                result = subprocess.run(
                    ['smartctl', '-H', drive],
                    capture_output=True,
                    text=True,
                )
                output = result.stdout + result.stderr
                # ATA and NVMe devices report "... test result: PASSED";
                # SCSI/SAS devices report "SMART Health Status: OK", which
                # must not be mistaken for a failing drive.
                healthy = (
                    'PASSED' in output
                    or 'SMART Health Status: OK' in output
                )
                status = 'HEALTHY' if healthy else 'UNHEALTHY'
                drives.append({'drive': drive, 'status': status})
            except Exception as e:
                # Typically smartctl is missing or not executable; report
                # it per drive instead of aborting the whole scan.
                drives.append({'drive': drive, 'status': 'ERROR', 'error': str(e)})
    except Exception as e:
        print(f"SMART status check failed: {e}")
        return [{'error': str(e)}]
    return drives
def _check_network_status(self) -> Dict[str, Any]:
"""
Check network connectivity between nodes and include detailed identifiers.