From 05072031402f55c33613f231041a0e035f6cfb13 Mon Sep 17 00:00:00 2001 From: Jared Vititoe Date: Mon, 3 Mar 2025 17:57:07 -0500 Subject: [PATCH] Less partitions more disks --- hwmonDaemon.py | 60 ++++++++++++++------------------------------------ 1 file changed, 16 insertions(+), 44 deletions(-) diff --git a/hwmonDaemon.py b/hwmonDaemon.py index a3a3a01..aeda846 100644 --- a/hwmonDaemon.py +++ b/hwmonDaemon.py @@ -263,18 +263,12 @@ class SystemHealthMonitor: issues = [] # Check for drive-related issues - for partition in health_report.get('drives_health', {}).get('drives', []): - if partition.get('usage_status') == 'CRITICAL_HIGH_USAGE': - issues.append( - f"Disk {partition['mountpoint']} is {partition['usage_percent']}% full" - ) - elif partition.get('usage_status') == 'WARNING_HIGH_USAGE': - issues.append( - f"Disk {partition['mountpoint']} is {partition['usage_percent']}% full (Warning)" - ) - if partition.get('smart_status') == 'UNHEALTHY': - issues.append(f"Disk {partition['mountpoint']} has an unhealthy SMART status") - + for drive in health_report.get('drives_health', {}).get('drives', []): + if drive.get('smart_issues'): + issues.append(f"Drive {drive['device']} has SMART issues: {', '.join(drive['smart_issues'])}") + if drive.get('temperature') and drive['temperature'] > self.CONFIG['THRESHOLDS']['TEMPERATURE_WARNING']: + issues.append(f"Drive {drive['device']} temperature is high: {drive['temperature']}°C") + # Check for ECC memory errors memory_health = health_report.get('memory_health', {}) if memory_health.get('has_ecc') and memory_health.get('ecc_errors'): @@ -581,11 +575,10 @@ class SystemHealthMonitor: drives_health = {'overall_status': 'NORMAL', 'drives': []} try: - physical_disks = self._get_all_disks() - logger.debug(f"Found physical disks: {physical_disks}") - - # Filter out RBD devices - physical_disks = [disk for disk in physical_disks if not disk.startswith('/dev/rbd')] + # Get physical disks only (exclude RBD devices) + physical_disks = [disk for disk in self._get_all_disks() + if disk.startswith(('/dev/sd', '/dev/nvme'))] + logger.debug(f"Checking physical disks: {physical_disks}") overall_status = 'NORMAL' for disk in physical_disks: @@ -596,30 +589,11 @@ class SystemHealthMonitor: 'usage_percent': 0, 'total_space': '0B', 'used_space': '0B', - 'free_space': '0B' + 'free_space': '0B', + 'smart_status': 'UNKNOWN' } - # Check if disk is mounted - try: - partitions = [p for p in psutil.disk_partitions() - if p.device.startswith(disk)] - if partitions: - partition = partitions[0] # Use first partition for stats - usage = psutil.disk_usage(partition.mountpoint) - drive_report.update({ - 'mountpoint': partition.mountpoint, - 'total_space': self._convert_bytes(usage.total), - 'used_space': self._convert_bytes(usage.used), - 'free_space': self._convert_bytes(usage.free), - 'usage_percent': usage.percent, - 'usage_status': ('CRITICAL_HIGH_USAGE' if usage.percent > self.CONFIG['THRESHOLDS']['DISK_CRITICAL'] - else 'WARNING_HIGH_USAGE' if usage.percent > self.CONFIG['THRESHOLDS']['DISK_WARNING'] - else 'NORMAL') - }) - except Exception as e: - logger.debug(f"Could not get usage stats for {disk}: {e}") - - # Check SMART health + # Check SMART health first smart_health = self._check_smart_health(disk) drive_report.update({ 'smart_status': smart_health['status'], @@ -628,12 +602,10 @@ class SystemHealthMonitor: 'smart_attributes': smart_health['attributes'] }) - # Update overall status - if (smart_health['status'] == 'UNHEALTHY' or - drive_report['usage_status'] == 'CRITICAL_HIGH_USAGE'): + # Update overall status based on SMART health + if smart_health['status'] == 'UNHEALTHY': overall_status = 'CRITICAL' - elif (drive_report['usage_status'] == 'WARNING_HIGH_USAGE' and - overall_status != 'CRITICAL'): + elif smart_health['issues'] and overall_status != 'CRITICAL': overall_status = 'WARNING' drives_health['drives'].append(drive_report)