Improved SMART status checks

2024-12-05 21:09:37 -05:00
parent db3ce2e64b
commit 0a51bee132
1 changed files with 56 additions and 31 deletions
--- a/hwmonDaemon.py
+++ b/hwmonDaemon.py
@@ -244,13 +244,9 @@ class SystemHealthMonitor:
        :return: Boolean indicating if it's a physical disk
        """
        return bool(re.match(r'/dev/(sd[a-z]|nvme\d+n\d+|mmcblk\d+)', device_path))
    def _check_smart_health(self, device: str) -> Dict[str, Any]:
        """
        Check comprehensive SMART health metrics for a drive.
        :param device: Path to device
        :return: Dictionary containing health metrics and status
        """
        smart_health = {
            'status': 'HEALTHY',
@@ -259,7 +255,6 @@ class SystemHealthMonitor:
            'attributes': {}
        }
        # Get detailed SMART attributes
        try:
            result = subprocess.run(
                ['smartctl', '-A', '-H', device],
@@ -270,42 +265,72 @@ class SystemHealthMonitor:
            output = result.stdout
-            # Check critical attributes
+            # Check overall SMART status first
-            critical_thresholds = {
+            if 'FAILED' in output and not 'PASSED' in output:
-                'Reallocated_Sector_Ct': 10,
+                smart_health['status'] = 'UNHEALTHY'
-                'Current_Pending_Sector': 1,
+                smart_health['issues'].append("SMART overall health check failed")
-                'Offline_Uncorrectable': 1,
+                
-                'Reported_Uncorrect': 1,
+            # Define critical attributes and their thresholds
-                'Command_Timeout': 5,
+            critical_attributes = {
-                'Temperature_Celsius': 60
+                'Reallocated_Sector_Ct': {'threshold': 0, 'critical': True},
                'Current_Pending_Sector': {'threshold': 0, 'critical': True},
                'Offline_Uncorrectable': {'threshold': 0, 'critical': True},
                'Reported_Uncorrect': {'threshold': 0, 'critical': True},
                'Command_Timeout': {'threshold': 5, 'critical': False},
                'Temperature_Celsius': {'threshold': 65, 'critical': False},
                'Wear_Leveling_Count': {'threshold': 10, 'critical': True},
                'Media_Wearout_Indicator': {'threshold': 20, 'critical': True}
            }
            for line in output.split('\n'):
-                for attr, threshold in critical_thresholds.items():
+                # Skip header lines
-                    if attr in line:
+                if 'ATTRIBUTE_NAME' in line or '===' in line:
-                        try:
+                    continue
-                            value = int(line.split()[9])  # Raw value is typically in column 10
+                    
-                            smart_health['attributes'][attr] = value
+                for attr_name, limits in critical_attributes.items():
                    if attr_name in line:
                        parts = line.split()
                        if len(parts) >= 10:
                            value = int(parts[9])  # Raw value
                            normalized = int(parts[3])  # Normalized value
-                            if attr == 'Temperature_Celsius':
+                            smart_health['attributes'][attr_name] = {
                                'raw': value,
                                'normalized': normalized
                            }
                            # Check thresholds
                            if attr_name == 'Temperature_Celsius':
                                smart_health['temp'] = value
-                                if value > threshold:
+                                if value > limits['threshold']:
-                                    smart_health['issues'].append(f"Drive temperature critical: {value}°C")
+                                    smart_health['issues'].append(
-                            elif value > threshold:
+                                        f"Drive temperature critical: {value}°C"
-                                smart_health['issues'].append(f"{attr} above threshold: {value}")
+                                    )
-                        except (IndexError, ValueError):
+                            elif value > limits['threshold']:
-                            continue
+                                if limits['critical']:
-            
+                                    smart_health['status'] = 'UNHEALTHY'
-            # Check overall SMART status
+                                smart_health['issues'].append(
-            if 'FAILED' in output or smart_health['issues']:
+                                    f"{attr_name} above threshold: {value}"
-                smart_health['status'] = 'UNHEALTHY'
+                                )
-                
+                                
                            # Check for very low normalized values
                            if normalized <= 10 and attr_name != 'Temperature_Celsius':
                                smart_health['issues'].append(
                                    f"{attr_name} normalized value critical: {normalized}"
                                )
            # Check if WHEN_FAILED is present and not in the past
            if 'WHEN_FAILED' in output:
                for line in output.split('\n'):
                    if 'WHEN_FAILED' in line and 'In_the_past' not in line and '-' not in line:
                        smart_health['status'] = 'UNHEALTHY'
                        smart_health['issues'].append(f"Current failure detected: {line}")
        except Exception as e:
            smart_health['status'] = 'ERROR'
            smart_health['issues'].append(f"Error checking SMART: {str(e)}")
        return smart_health
    def _check_drives_health(self) -> Dict[str, Any]:
        """
        Check overall health of physical SATA and NVMe drives including disk usage and SMART status.