From aeabb69b9460585ec643b7a0a8d8964d48b2aa4b Mon Sep 17 00:00:00 2001 From: Jared Vititoe Date: Wed, 4 Dec 2024 21:20:52 -0500 Subject: [PATCH] Aggregated disk checks into 1 function --- hwmonDaemon.py | 113 +++++++++++++++++++++++++------------------------ 1 file changed, 57 insertions(+), 56 deletions(-) diff --git a/hwmonDaemon.py b/hwmonDaemon.py index 525fe90..759fa69 100644 --- a/hwmonDaemon.py +++ b/hwmonDaemon.py @@ -42,11 +42,10 @@ class SystemHealthMonitor: :return: Dictionary with health check results """ health_report = { - 'disk_health': self._check_disk_health(), + 'drives_health': self._check_drives_health(), 'memory_health': self._check_memory_usage(), 'cpu_health': self._check_cpu_usage(), 'network_health': self._check_network_status(), - 'drive_smart_status': self._check_drive_smart_status(), 'temperature_health': self._check_system_temperatures() } return health_report @@ -158,46 +157,76 @@ class SystemHealthMonitor: ) return critical_issues - def _check_disk_health(self) -> Dict[str, Any]: + def _check_drives_health(self) -> Dict[str, Any]: """ - Check disk usage and health + Check overall health of drives including disk usage and SMART status. - :return: Disk health metrics + :return: Combined health report of all drives """ - disk_health = {'partitions': []} - + drives_health = {'overall_status': 'NORMAL', 'drives': []} try: - # Get all mounted partitions partitions = psutil.disk_partitions() - + overall_status = 'NORMAL' + for partition in partitions: + drive_report = { + 'device': partition.device, + 'mountpoint': partition.mountpoint + } try: + # Disk usage usage = psutil.disk_usage(partition.mountpoint) - partition_info = { - 'mountpoint': partition.mountpoint, + usage_status = 'NORMAL' + if usage.percent > 90: + usage_status = 'CRITICAL_HIGH_USAGE' + elif usage.percent > 80: + usage_status = 'WARNING_HIGH_USAGE' + + drive_report.update({ 'total_space': self._convert_bytes(usage.total), 'used_space': self._convert_bytes(usage.used), 'free_space': self._convert_bytes(usage.free), - 'usage_percent': usage.percent - } - - # Flag high usage - if usage.percent > 90: - partition_info['status'] = 'CRITICAL_HIGH_USAGE' - elif usage.percent > 80: - partition_info['status'] = 'WARNING_HIGH_USAGE' - else: - partition_info['status'] = 'NORMAL' - - disk_health['partitions'].append(partition_info) + 'usage_percent': usage.percent, + 'usage_status': usage_status + }) + + # Update overall status + if usage_status == 'CRITICAL_HIGH_USAGE': + overall_status = 'CRITICAL_HIGH_USAGE' + elif usage_status == 'WARNING_HIGH_USAGE' and overall_status != 'CRITICAL_HIGH_USAGE': + overall_status = 'WARNING_HIGH_USAGE' + + # SMART status + try: + result = subprocess.run( + ['smartctl', '-H', partition.device], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True + ) + output = result.stdout + result.stderr + smart_status = 'HEALTHY' if 'PASSED' in output else 'UNHEALTHY' + drive_report['smart_status'] = smart_status + + if smart_status == 'UNHEALTHY' and overall_status != 'CRITICAL_HIGH_USAGE': + overall_status = 'UNHEALTHY' + + except Exception as e: + drive_report['smart_status'] = f"ERROR: {str(e)}" + except Exception as e: - print(f"Could not check partition {partition.mountpoint}: {e}") - - return disk_health - + drive_report['error'] = f"Could not check drive: {str(e)}" + + drives_health['drives'].append(drive_report) + + drives_health['overall_status'] = overall_status + return drives_health + except Exception as e: - print(f"Disk health check failed: {e}") + print(f"Drive health check failed: {e}") return {'error': str(e)} + + def _convert_bytes(self, bytes_value: int, suffix: str = 'B') -> str: """ @@ -246,34 +275,6 @@ class SystemHealthMonitor: print(f"CPU health check failed: {e}") return {'error': str(e)} - def _check_drive_smart_status(self) -> List[Dict[str, Any]]: - """ - Check SMART status of drives using smartctl. - - :return: List of SMART status for drives - """ - drives = [] - try: - for disk in psutil.disk_partitions(): - drive = disk.device - try: - # Use smartctl to check the drive's SMART status - result = subprocess.run( - ['smartctl', '-H', drive], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True - ) - output = result.stdout + result.stderr - status = 'HEALTHY' if 'PASSED' in output else 'UNHEALTHY' - drives.append({'drive': drive, 'status': status}) - except Exception as e: - drives.append({'drive': drive, 'status': 'ERROR', 'error': str(e)}) - except Exception as e: - print(f"SMART status check failed: {e}") - return [{'error': str(e)}] - return drives - def _check_network_status(self) -> Dict[str, Any]: """ Check network connectivity between nodes and include detailed identifiers.