diff --git a/hwmonDaemon.py b/hwmonDaemon.py
index 90ff224..d2cd812 100644
--- a/hwmonDaemon.py
+++ b/hwmonDaemon.py
@@ -238,12 +238,93 @@ class SystemHealthMonitor:
 
     def _is_physical_disk(self, device_path):
         """
-        Check if the device is a physical SATA or NVMe disk.
-        
+        Check if the device is a physical SATA, NVMe, or MMC disk.
+
         :param device_path: Path to the device
         :return: Boolean indicating if it's a physical disk
         """
-        return bool(re.match(r'/dev/(sd[a-z]|nvme\d+n\d+)', device_path))
+        return bool(re.match(r'/dev/(sd[a-z]|nvme\d+n\d+|mmcblk\d+)', device_path))
+
+    def _check_smart_health(self, device: str) -> Dict[str, Any]:
+        """
+        Check comprehensive SMART health metrics for a drive.
+
+        Runs ``smartctl -A -H`` on the device, compares the raw value of a
+        fixed set of attributes against thresholds and reads the overall
+        health verdict line from the same output.
+
+        :param device: Path to device
+        :return: Dictionary with 'status' (HEALTHY, UNHEALTHY or ERROR),
+                 'issues' (list of messages), 'temp' (raw temperature
+                 value or None) and 'attributes' (raw values that were read)
+        """
+        smart_health = {
+            'status': 'HEALTHY',
+            'issues': [],
+            'temp': None,
+            'attributes': {}
+        }
+
+        # Raw values above these limits are reported as issues
+        critical_thresholds = {
+            'Reallocated_Sector_Ct': 10,
+            'Current_Pending_Sector': 1,
+            'Offline_Uncorrectable': 1,
+            'Reported_Uncorrect': 1,
+            'Command_Timeout': 5,
+            'Temperature_Celsius': 60
+        }
+
+        try:
+            result = subprocess.run(
+                ['smartctl', '-A', '-H', device],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True
+            )
+            overall_failed = False
+            verdict_re = re.compile(
+                r'(?:self-assessment test result|SMART Health Status):\s*(\S+)'
+            )
+
+            for line in result.stdout.splitlines():
+                # The verdict has its own line. Searching the whole output
+                # for 'FAILED' would also match the WHEN_FAILED column
+                # header of the attribute table, flagging healthy drives.
+                verdict = verdict_re.search(line)
+                if verdict:
+                    if verdict.group(1) not in ('PASSED', 'OK'):
+                        overall_failed = True
+                    continue
+
+                # Attribute rows: ID# NAME FLAG VALUE WORST THRESH TYPE
+                # UPDATED WHEN_FAILED RAW_VALUE (name is column 2, raw is 10)
+                fields = line.split()
+                if len(fields) < 10 or fields[1] not in critical_thresholds:
+                    continue
+
+                attr = fields[1]
+                try:
+                    value = int(fields[9])
+                except ValueError:
+                    continue
+
+                smart_health['attributes'][attr] = value
+                if attr == 'Temperature_Celsius':
+                    smart_health['temp'] = value
+                    if value > critical_thresholds[attr]:
+                        smart_health['issues'].append(f"Drive temperature critical: {value}°C")
+                elif value > critical_thresholds[attr]:
+                    smart_health['issues'].append(f"{attr} above threshold: {value}")
+
+            if overall_failed or smart_health['issues']:
+                smart_health['status'] = 'UNHEALTHY'
+
+        except Exception as e:
+            smart_health['status'] = 'ERROR'
+            smart_health['issues'].append(f"Error checking SMART: {str(e)}")
+
+        return smart_health
 
     def _check_drives_health(self) -> Dict[str, Any]:
         """
@@ -260,50 +341,45 @@ class SystemHealthMonitor:
                     'device': partition.device,
                     'mountpoint': partition.mountpoint
                 }
-                try:
-                    # Check disk usage
-                    usage = psutil.disk_usage(partition.mountpoint)
-                    disk_usage_status = 'NORMAL'
-                    if usage.percent > 90:
-                        disk_usage_status = 'CRITICAL_HIGH_USAGE'
-                    elif usage.percent > 80:
-                        disk_usage_status = 'WARNING_HIGH_USAGE'
-                    drive_report.update({
-                        'total_space': self._convert_bytes(usage.total),
-                        'used_space': self._convert_bytes(usage.used),
-                        'free_space': self._convert_bytes(usage.free),
-                        'usage_percent': usage.percent,
-                        'usage_status': disk_usage_status
-                    })
-                    # Update overall status based on usage
-                    if disk_usage_status == 'CRITICAL_HIGH_USAGE':
-                        overall_status = 'CRITICAL_HIGH_USAGE'
-                    elif disk_usage_status == 'WARNING_HIGH_USAGE' and overall_status != 'CRITICAL_HIGH_USAGE':
-                        overall_status = 'WARNING_HIGH_USAGE'
-                    # Check SMART status of the drive
-                    try:
-                        result = subprocess.run(
-                            ['smartctl', '-H', '-d', 'auto', partition.device],
-                            stdout=subprocess.PIPE,
-                            stderr=subprocess.PIPE,
-                            text=True
-                        )
-                        output = result.stdout + result.stderr
-                        drive_smart_status = 'HEALTHY' if 'PASSED' in output else 'UNHEALTHY'
-                        drive_report['smart_status'] = drive_smart_status
-                        # Update overall status if SMART status is unhealthy
-                        if drive_smart_status == 'UNHEALTHY' and overall_status != 'CRITICAL_HIGH_USAGE':
-                            overall_status = 'UNHEALTHY'
-                    except Exception as e:
-                        print(f"Error checking SMART status for {partition.device}: {str(e)}")
-                        drive_report['smart_status'] = 'ERROR'
-                except Exception as e:
-                    logger.error(f"Could not check drive: {str(e)}")
-                    drive_report['error'] = str(e)
+
+                # Check disk usage
+                usage = psutil.disk_usage(partition.mountpoint)
+                disk_usage_status = 'NORMAL'
+                if usage.percent > 90:
+                    disk_usage_status = 'CRITICAL_HIGH_USAGE'
+                elif usage.percent > 80:
+                    disk_usage_status = 'WARNING_HIGH_USAGE'
+
+                drive_report.update({
+                    'total_space': self._convert_bytes(usage.total),
+                    'used_space': self._convert_bytes(usage.used),
+                    'free_space': self._convert_bytes(usage.free),
+                    'usage_percent': usage.percent,
+                    'usage_status': disk_usage_status
+                })
+
+                # Check SMART health
+                smart_health = self._check_smart_health(partition.device)
+                drive_report.update({
+                    'smart_status': smart_health['status'],
+                    'smart_issues': smart_health['issues'],
+                    'temperature': smart_health['temp'],
+                    'smart_attributes': smart_health['attributes']
+                })
+
+                # Update overall status
+                if smart_health['status'] == 'UNHEALTHY' or disk_usage_status == 'CRITICAL_HIGH_USAGE':
+                    overall_status = 'CRITICAL'
+                elif disk_usage_status == 'WARNING_HIGH_USAGE' and overall_status != 'CRITICAL':
+                    overall_status = 'WARNING'
+
                 drives_health['drives'].append(drive_report)
+
             drives_health['overall_status'] = overall_status
+
         except Exception as e:
             logger.error(f"Error checking drives health: {str(e)}")
+
         return drives_health
 
     @staticmethod