More Detailed SMART Monitoring
@@ -295,18 +295,34 @@ class SystemHealthMonitor:
     def _check_smart_health(self, device: str) -> Dict[str, Any]:
         """
-        Check comprehensive SMART health metrics for a drive.
+        Enhanced SMART health check with detailed failure thresholds.
         """
         smart_health = {
             'status': 'HEALTHY',
+            'severity': 'NORMAL',
             'issues': [],
             'temp': None,
+            'attributes': {}
         }

+        # Define critical SMART attributes and their thresholds
+        SMART_THRESHOLDS = {
+            'Reallocated_Sector_Ct': {'warning': 5, 'critical': 10},
+            'Current_Pending_Sector': {'warning': 1, 'critical': 5},
+            'Offline_Uncorrectable': {'warning': 1, 'critical': 2},
+            'Reported_Uncorrect': {'warning': 1, 'critical': 2},
+            'Spin_Retry_Count': {'warning': 1, 'critical': 5},
+            'Command_Timeout': {'warning': 5, 'critical': 10},
+            'Power_Cycle_Count': {'warning': 5000, 'critical': 10000},
+            'Power_On_Hours': {'warning': 35040, 'critical': 43800},  # ~4-5 years
+            'Media_Wearout_Indicator': {'warning': 30, 'critical': 10},  # Percentage remaining
+            'Temperature_Celsius': {'warning': 65, 'critical': 75}
+        }
+
         try:
             # Get detailed SMART data
             result = subprocess.run(
-                ['smartctl', '-A', '-H', device],
+                ['smartctl', '-A', '-H', '-l', 'error', device],
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,
                 text=True
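The attribute parsing in the second hunk depends on the column layout of smartctl's ATA attribute table, where RAW_VALUE is the tenth whitespace-separated field, hence parts[9]. A minimal sketch of that assumption, with an illustrative output line (real values vary by drive, and some drives report composite raw values such as "36 (Min/Max 20/52)", for which split() still yields the leading number at index 9):

    # Illustrative line from `smartctl -A`; the parser relies on this column order:
    # ID# ATTRIBUTE_NAME  FLAG  VALUE WORST THRESH TYPE  UPDATED  WHEN_FAILED RAW_VALUE
    sample = "  5 Reallocated_Sector_Ct   0x0033   100   100   010    Pre-fail  Always       -       0"
    parts = sample.split()
    assert parts[1] == 'Reallocated_Sector_Ct'  # the code matches the name as a substring of the line
    assert int(parts[9]) == 0                   # RAW_VALUE, the figure compared against SMART_THRESHOLDS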
@@ -314,46 +330,60 @@ class SystemHealthMonitor:

             output = result.stdout

-            # Check overall SMART status first
+            # Check overall health status
             if 'FAILED' in output and 'PASSED' not in output:
                 smart_health['status'] = 'UNHEALTHY'
+                smart_health['severity'] = 'CRITICAL'
                 smart_health['issues'].append("SMART overall health check failed")

-            # Parse SMART attributes
+            # Parse SMART attributes with thresholds
             for line in output.split('\n'):
                 if 'ATTRIBUTE_NAME' in line:
                     continue

                 # Check for current failures only
                 if 'WHEN_FAILED' in line and 'In_the_past' not in line and '-' not in line:
                     smart_health['status'] = 'UNHEALTHY'
                     smart_health['issues'].append(f"Current failure detected: {line}")

-                # Monitor critical attributes
-                for attr in ['Reallocated_Sector_Ct', 'Current_Pending_Sector',
-                             'Offline_Uncorrectable', 'Reported_Uncorrect']:
+                for attr, thresholds in SMART_THRESHOLDS.items():
                     if attr in line:
                         parts = line.split()
                         if len(parts) >= 10:
                             raw_value = int(parts[9])
-                            if raw_value > 0:
-                                smart_health['status'] = 'UNHEALTHY'
-                                smart_health['issues'].append(f"{attr} has value {raw_value}")
+                            smart_health['attributes'][attr] = raw_value

-                # Check temperature
-                if 'Temperature_Celsius' in line or 'Airflow_Temperature_Cel' in line:
-                    parts = line.split()
-                    if len(parts) >= 10:
-                        temp = int(parts[9])
-                        smart_health['temp'] = temp
-                        if temp > self.CONFIG['THRESHOLDS']['TEMPERATURE_WARNING']:
-                            smart_health['issues'].append(f"High drive temperature: {temp}°C")
+                            if attr == 'Temperature_Celsius':
+                                smart_health['temp'] = raw_value
+                                if raw_value >= thresholds['critical']:
+                                    smart_health['severity'] = 'CRITICAL'
+                                    smart_health['issues'].append(f"Critical temperature: {raw_value}°C")
+                                elif raw_value >= thresholds['warning']:
+                                    smart_health['severity'] = 'WARNING'
+                                    smart_health['issues'].append(f"High temperature: {raw_value}°C")
+                            else:
+                                if raw_value >= thresholds['critical']:
+                                    smart_health['severity'] = 'CRITICAL'
+                                    smart_health['issues'].append(f"Critical {attr}: {raw_value}")
+                                elif raw_value >= thresholds['warning']:
+                                    if smart_health['severity'] != 'CRITICAL':
+                                        smart_health['severity'] = 'WARNING'
+                                    smart_health['issues'].append(f"Warning {attr}: {raw_value}")

+            # Check for recent SMART errors
+            error_log_pattern = r"Error \d+ occurred at disk power-on lifetime: (\d+) hours"
+            error_matches = re.finditer(error_log_pattern, output)
+            recent_errors = []
+
+            for match in error_matches:
+                error_hour = int(match.group(1))
+                current_hours = smart_health['attributes'].get('Power_On_Hours', 0)
+                if current_hours - error_hour < 168:  # Errors within last week
+                    recent_errors.append(match.group(0))
+
+            if recent_errors:
+                smart_health['severity'] = 'WARNING'
+                smart_health['issues'].extend(recent_errors)

         except Exception as e:
             smart_health['status'] = 'ERROR'
+            smart_health['severity'] = 'UNKNOWN'
             smart_health['issues'].append(f"Error checking SMART: {str(e)}")

         return smart_health

     def _check_drives_health(self) -> Dict[str, Any]:
         """
         Check overall health of physical SATA and NVMe drives including disk usage and SMART status.
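The recent-error scan matches the header line of each ATA error log entry that smartctl -l error prints, and counts an error as recent when it falls within 168 power-on hours (one week) of the drive's current Power_On_Hours. Note the dependence on Power_On_Hours having been parsed: the .get(..., 0) default otherwise makes the difference negative, so no error is flagged. A self-contained sketch of what the regex captures, using an illustrative log line:

    import re

    # Illustrative header line from `smartctl -l error` output (values are drive-specific).
    sample = "Error 4 occurred at disk power-on lifetime: 21053 hours (877 days + 5 hours)"
    pattern = r"Error \d+ occurred at disk power-on lifetime: (\d+) hours"

    match = re.search(pattern, sample)
    if match:
        error_hour = int(match.group(1))  # drive age, in power-on hours, when the error was logged
        # With Power_On_Hours currently at 21100, this error would be 47 hours old: recent.
        print(f"Error at {error_hour} power-on hours")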
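A hypothetical call site, assuming an already-constructed SystemHealthMonitor (its initialization is outside this diff):

    # Hypothetical usage; SystemHealthMonitor's constructor is not part of this commit.
    monitor = SystemHealthMonitor()
    health = monitor._check_smart_health('/dev/sda')

    print(health['status'], health['severity'])  # e.g. "HEALTHY NORMAL"
    for issue in health['issues']:
        print(' -', issue)

Note that after this change, 'status' still reflects only the overall PASSED/FAILED verdict and current WHEN_FAILED hits; threshold breaches now surface through 'severity' and 'issues' rather than flipping 'status' to UNHEALTHY as the old attribute loop did.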