From d1d41106bd3970835b6b90aa6ae56c2caa410892 Mon Sep 17 00:00:00 2001 From: Jared Vititoe Date: Thu, 5 Dec 2024 21:17:09 -0500 Subject: [PATCH] Changed detection for smart --- hwmonDaemon.py | 84 +++++++++++++++++--------------------------------- 1 file changed, 29 insertions(+), 55 deletions(-) diff --git a/hwmonDaemon.py b/hwmonDaemon.py index 4d53a97..23a4318 100644 --- a/hwmonDaemon.py +++ b/hwmonDaemon.py @@ -248,7 +248,7 @@ class SystemHealthMonitor: 'temp': None, 'attributes': {} } - + try: result = subprocess.run( ['smartctl', '-A', '-H', device], @@ -256,74 +256,48 @@ class SystemHealthMonitor: stderr=subprocess.PIPE, text=True ) - + output = result.stdout - + # Check overall SMART status first - if 'FAILED' in output and not 'PASSED' in output: + if 'FAILED' in output and 'PASSED' not in output: smart_health['status'] = 'UNHEALTHY' smart_health['issues'].append("SMART overall health check failed") - - # Define critical attributes and their thresholds - critical_attributes = { - 'Reallocated_Sector_Ct': {'threshold': 0, 'critical': True}, - 'Current_Pending_Sector': {'threshold': 0, 'critical': True}, - 'Offline_Uncorrectable': {'threshold': 0, 'critical': True}, - 'Reported_Uncorrect': {'threshold': 0, 'critical': True}, - 'Command_Timeout': {'threshold': 5, 'critical': False}, - 'Temperature_Celsius': {'threshold': 65, 'critical': False}, - 'Wear_Leveling_Count': {'threshold': 10, 'critical': True}, - 'Media_Wearout_Indicator': {'threshold': 20, 'critical': True} - } + # Parse SMART attributes for line in output.split('\n'): - # Skip header lines - if 'ATTRIBUTE_NAME' in line or '===' in line: + if 'ATTRIBUTE_NAME' in line: continue - - for attr_name, limits in critical_attributes.items(): - if attr_name in line: + + # Check for current failures only + if 'WHEN_FAILED' in line and 'In_the_past' not in line and '-' not in line: + smart_health['status'] = 'UNHEALTHY' + smart_health['issues'].append(f"Current failure detected: {line}") + + # Monitor critical attributes + for attr in ['Reallocated_Sector_Ct', 'Current_Pending_Sector', + 'Offline_Uncorrectable', 'Reported_Uncorrect']: + if attr in line: parts = line.split() if len(parts) >= 10: - value = int(parts[9]) # Raw value - normalized = int(parts[3]) # Normalized value + raw_value = int(parts[9]) + if raw_value > 0: + smart_health['status'] = 'UNHEALTHY' + smart_health['issues'].append(f"{attr} has value {raw_value}") - smart_health['attributes'][attr_name] = { - 'raw': value, - 'normalized': normalized - } - - # Check thresholds - if attr_name == 'Temperature_Celsius': - smart_health['temp'] = value - if value > limits['threshold']: - smart_health['issues'].append( - f"Drive temperature critical: {value}°C" - ) - elif value > limits['threshold']: - if limits['critical']: - smart_health['status'] = 'UNHEALTHY' - smart_health['issues'].append( - f"{attr_name} above threshold: {value}" - ) - - # Check for very low normalized values - if normalized <= 10 and attr_name != 'Temperature_Celsius': - smart_health['issues'].append( - f"{attr_name} normalized value critical: {normalized}" - ) - - # Check if WHEN_FAILED is present and not in the past - if 'WHEN_FAILED' in output: - for line in output.split('\n'): - if 'WHEN_FAILED' in line and 'In_the_past' not in line and '-' not in line: - smart_health['status'] = 'UNHEALTHY' - smart_health['issues'].append(f"Current failure detected: {line}") + # Check temperature + if 'Temperature_Celsius' in line or 'Airflow_Temperature_Cel' in line: + parts = line.split() + if len(parts) >= 10: + temp = int(parts[9]) + smart_health['temp'] = temp + if temp > 65: + smart_health['issues'].append(f"High drive temperature: {temp}°C") except Exception as e: smart_health['status'] = 'ERROR' smart_health['issues'].append(f"Error checking SMART: {str(e)}") - + return smart_health def _check_drives_health(self) -> Dict[str, Any]: """