def _check_smart_health(self, device: str) -> Dict[str, Any]:
    """
    Run ``smartctl`` against *device* and grade the drive's SMART data.

    The attribute table, the overall health verdict and the ATA error log
    (``smartctl -A -H -l error``) are collected in a single call.

    Args:
        device: Path of the block device to query (e.g. ``/dev/sda``).

    Returns:
        A dict with the keys:

        * ``status``     -- ``'HEALTHY'``, ``'UNHEALTHY'`` (overall verdict
          failed, an attribute is failing now, or a critical threshold was
          crossed) or ``'ERROR'`` (the check itself could not be completed).
        * ``severity``   -- ``'NORMAL'``, ``'WARNING'``, ``'CRITICAL'`` or
          ``'UNKNOWN'`` (only together with ``status == 'ERROR'``).  Within
          one check the severity only ever escalates.
        * ``issues``     -- human-readable descriptions of every finding.
        * ``temp``       -- hottest temperature reading found, else ``None``.
        * ``attributes`` -- parsed value of each monitored attribute that
          was present in the table, keyed by its name in the threshold
          table below.

    Never raises: any exception (missing binary, timeout, unexpected output)
    is reported through ``status == 'ERROR'``.
    """
    # Imported locally so the method does not depend on a module-level
    # import of ``re``.
    import re

    smart_health: Dict[str, Any] = {
        'status': 'HEALTHY',
        'severity': 'NORMAL',
        'issues': [],
        'temp': None,
        'attributes': {}
    }

    # Monitored attributes and the values at which they become a problem.
    SMART_THRESHOLDS = {
        'Reallocated_Sector_Ct': {'warning': 5, 'critical': 10},
        'Current_Pending_Sector': {'warning': 1, 'critical': 5},
        'Offline_Uncorrectable': {'warning': 1, 'critical': 2},
        'Reported_Uncorrect': {'warning': 1, 'critical': 2},
        'Spin_Retry_Count': {'warning': 1, 'critical': 5},
        'Command_Timeout': {'warning': 5, 'critical': 10},
        'Power_Cycle_Count': {'warning': 5000, 'critical': 10000},
        'Power_On_Hours': {'warning': 35040, 'critical': 43800},  # ~4-5 years
        'Media_Wearout_Indicator': {'warning': 30, 'critical': 10},  # Percentage remaining
        'Temperature_Celsius': {'warning': 65, 'critical': 75}
    }
    # Alternative names some drives report for a monitored attribute.
    attribute_aliases = {
        'Airflow_Temperature_Cel': 'Temperature_Celsius',
        'Power_On_Hours_and_Msec': 'Power_On_Hours',
    }
    # Attributes that count *down* towards failure.  Their health lives in
    # the normalized VALUE column and is compared with "<=".
    lower_is_worse = {'Media_Wearout_Indicator'}
    severity_rank = {'NORMAL': 0, 'WARNING': 1, 'CRITICAL': 2}

    def escalate(level: str) -> None:
        """Raise the recorded severity to *level*; never lower it."""
        if severity_rank[level] > severity_rank.get(smart_health['severity'], 0):
            smart_health['severity'] = level
        if level == 'CRITICAL':
            # Keep ``status`` in step with ``severity`` so callers that only
            # look at ``status`` still see critical findings.
            smart_health['status'] = 'UNHEALTHY'

    try:
        # Get detailed SMART data.  The timeout keeps an unresponsive drive
        # from blocking the caller; TimeoutExpired is reported as ERROR below.
        result = subprocess.run(
            ['smartctl', '-A', '-H', '-l', 'error', device],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=30
        )
        output = result.stdout
        issues = smart_health['issues']
        attributes = smart_health['attributes']

        # Overall verdict.  Only the verdict line itself is inspected: the
        # attribute table header contains "WHEN_FAILED", so a plain
        # "'FAILED' in output" test matches on every drive.
        verdict = re.search(
            r"(?:SMART overall-health self-assessment test result"
            r"|SMART Health Status):\s*(.+)",
            output
        )
        if verdict and verdict.group(1).strip() not in ('PASSED', 'OK'):
            escalate('CRITICAL')
            issues.append("SMART overall health check failed")

        # Pass 1: read the attribute table.  Row layout:
        # ID# NAME FLAG VALUE WORST THRESH TYPE UPDATED WHEN_FAILED RAW_VALUE
        for line in output.split('\n'):
            parts = line.split()
            if len(parts) < 10 or not parts[0].isdigit():
                continue

            name = parts[1]
            if parts[8] == 'FAILING_NOW':
                escalate('CRITICAL')
                issues.append(f"Current failure detected: {name}")

            attr = attribute_aliases.get(name, name)
            if attr not in SMART_THRESHOLDS:
                continue

            field = parts[3] if attr in lower_is_worse else parts[9]
            # Raw values can carry trailing text ("1000h+12m+01.000s");
            # only the leading integer is meaningful here.
            leading_digits = re.match(r"\d+", field)
            if leading_digits is None:
                continue
            value = int(leading_digits.group())

            if attr == 'Temperature_Celsius' and attr in attributes:
                # Several temperature attributes may exist; keep the hottest.
                value = max(value, attributes[attr])
            attributes[attr] = value

        smart_health['temp'] = attributes.get('Temperature_Celsius')

        # Pass 2: grade every collected attribute exactly once.
        for attr, value in attributes.items():
            limits = SMART_THRESHOLDS[attr]
            if attr in lower_is_worse:
                is_critical = value <= limits['critical']
                is_warning = value <= limits['warning']
            else:
                is_critical = value >= limits['critical']
                is_warning = value >= limits['warning']

            if not (is_critical or is_warning):
                continue

            if attr == 'Temperature_Celsius':
                label = "Critical temperature" if is_critical else "High temperature"
                issues.append(f"{label}: {value}°C")
            else:
                label = "Critical" if is_critical else "Warning"
                issues.append(f"{label} {attr}: {value}")
            escalate('CRITICAL' if is_critical else 'WARNING')

        # Check for recent SMART errors.  Without a Power_On_Hours reading
        # there is no reference point, so the age of an error is unknown and
        # nothing is reported as recent.
        current_hours = attributes.get('Power_On_Hours')
        if current_hours is not None:
            error_log_pattern = r"Error \d+ occurred at disk power-on lifetime: (\d+) hours"
            recent_errors = [
                match.group(0)
                for match in re.finditer(error_log_pattern, output)
                if current_hours - int(match.group(1)) < 168  # Errors within last week
            ]
            if recent_errors:
                escalate('WARNING')
                issues.extend(recent_errors)

    except Exception as e:
        # Best-effort boundary: a failed check is reported, never raised.
        smart_health['status'] = 'ERROR'
        smart_health['severity'] = 'UNKNOWN'
        smart_health['issues'].append(f"Error checking SMART: {str(e)}")

    return smart_health