Added MMC support and more thorough SMART status checks

2024-12-05 21:03:45 -05:00
parent 81d723f2a4
commit db3ce2e64b
1 changed files with 99 additions and 43 deletions
--- a/hwmonDaemon.py
+++ b/hwmonDaemon.py
@@ -238,12 +238,73 @@ class SystemHealthMonitor:
        
    def _is_physical_disk(self, device_path):
        """
-        Check if the device is a physical SATA or NVMe disk.
-        
+        Check if the device is a physical SATA, NVMe, or MMC disk.
+    
        :param device_path: Path to the device
        :return: Boolean indicating if it's a physical disk
        """
-        return bool(re.match(r'/dev/(sd[a-z]|nvme\d+n\d+)', device_path))
+        # re.match anchors only at the start of the string, so any path that
+        # merely begins with one of these device names (for example a
+        # partition such as /dev/sda1 or /dev/mmcblk0p1) matches as well.
+        return bool(re.match(r'/dev/(sd[a-z]|nvme\d+n\d+|mmcblk\d+)', device_path))
+
+    def _check_smart_health(self, device: str) -> Dict[str, Any]:
+        """
+        Check comprehensive SMART health metrics for a drive.
+
+        Runs ``smartctl -A -H`` on the device, records the raw value of a
+        fixed set of critical attributes and combines them with the
+        overall-health verdict printed by smartctl.
+
+        :param device: Path to device
+        :return: Dictionary with the keys ``status`` ('HEALTHY', 'UNHEALTHY'
+                 or 'ERROR'), ``issues`` (list of messages), ``temp`` (raw
+                 Temperature_Celsius value, or None if not reported) and
+                 ``attributes`` (attribute name -> raw integer value)
+        """
+        smart_health = {
+            'status': 'HEALTHY',
+            'issues': [],
+            'temp': None,
+            'attributes': {}
+        }
+
+        # A raw value strictly greater than its limit is reported as an issue.
+        critical_thresholds = {
+            'Reallocated_Sector_Ct': 10,
+            'Current_Pending_Sector': 1,
+            'Offline_Uncorrectable': 1,
+            'Reported_Uncorrect': 1,
+            'Command_Timeout': 5,
+            'Temperature_Celsius': 60
+        }
+
+        try:
+            result = subprocess.run(
+                ['smartctl', '-A', '-H', device],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True
+            )
+            output = result.stdout
+
+            for line in output.splitlines():
+                fields = line.split()
+                # Attribute rows have at least ten columns: the attribute
+                # name is the second one and the raw value the tenth.
+                if len(fields) < 10 or fields[1] not in critical_thresholds:
+                    continue
+                attr = fields[1]
+                try:
+                    value = int(fields[9])
+                except ValueError:
+                    # Raw value is not a plain integer; skip this row.
+                    continue
+
+                smart_health['attributes'][attr] = value
+                threshold = critical_thresholds[attr]
+                if attr == 'Temperature_Celsius':
+                    smart_health['temp'] = value
+                    if value > threshold:
+                        smart_health['issues'].append(f"Drive temperature critical: {value}°C")
+                elif value > threshold:
+                    smart_health['issues'].append(f"{attr} above threshold: {value}")
+
+            # Unhealthy if the drive itself reports failure or any tracked
+            # attribute crossed its limit.
+            if self._smart_overall_failed(output) or smart_health['issues']:
+                smart_health['status'] = 'UNHEALTHY'
+
+        except Exception as e:
+            # Covers a missing smartctl binary as well as unexpected output.
+            smart_health['status'] = 'ERROR'
+            smart_health['issues'].append(f"Error checking SMART: {str(e)}")
+
+        return smart_health
+
+    @staticmethod
+    def _smart_overall_failed(output: str) -> bool:
+        """
+        Tell whether smartctl's overall-health verdict reports a failure.
+
+        Only the "overall-health self-assessment test result" line is
+        inspected. Searching the whole output for 'FAILED' would also hit
+        the WHEN_FAILED column header of the attribute table and flag every
+        drive that prints the table.
+
+        :param output: Standard output of ``smartctl -A -H``
+        :return: True if the verdict line contains FAILED, False otherwise
+                 (including when no verdict line is present)
+        """
+        for line in output.splitlines():
+            label, _, verdict = line.partition(':')
+            if 'overall-health self-assessment test result' in label:
+                return 'FAILED' in verdict
+        return False

    def _check_drives_health(self) -> Dict[str, Any]:
        """
@@ -260,50 +321,45 @@ class SystemHealthMonitor:
                    'device': partition.device,
                    'mountpoint': partition.mountpoint
                }
-                try:
-                    # Check disk usage
-                    usage = psutil.disk_usage(partition.mountpoint)
-                    disk_usage_status = 'NORMAL'
-                    if usage.percent > 90:
-                        disk_usage_status = 'CRITICAL_HIGH_USAGE'
-                    elif usage.percent > 80:
-                        disk_usage_status = 'WARNING_HIGH_USAGE'
-                    drive_report.update({
-                        'total_space': self._convert_bytes(usage.total),
-                        'used_space': self._convert_bytes(usage.used),
-                        'free_space': self._convert_bytes(usage.free),
-                        'usage_percent': usage.percent,
-                        'usage_status': disk_usage_status
-                    })
-                    # Update overall status based on usage
-                    if disk_usage_status == 'CRITICAL_HIGH_USAGE':
-                        overall_status = 'CRITICAL_HIGH_USAGE'
-                    elif disk_usage_status == 'WARNING_HIGH_USAGE' and overall_status != 'CRITICAL_HIGH_USAGE':
-                        overall_status = 'WARNING_HIGH_USAGE'
-                    # Check SMART status of the drive
-                    try:
-                        result = subprocess.run(
-                            ['smartctl', '-H', '-d', 'auto', partition.device],
-                            stdout=subprocess.PIPE,
-                            stderr=subprocess.PIPE,
-                            text=True
-                        )
-                        output = result.stdout + result.stderr
-                        drive_smart_status = 'HEALTHY' if 'PASSED' in output else 'UNHEALTHY'
-                        drive_report['smart_status'] = drive_smart_status
-                        # Update overall status if SMART status is unhealthy
-                        if drive_smart_status == 'UNHEALTHY' and overall_status != 'CRITICAL_HIGH_USAGE':
-                            overall_status = 'UNHEALTHY'
-                    except Exception as e:
-                        print(f"Error checking SMART status for {partition.device}: {str(e)}")
-                        drive_report['smart_status'] = 'ERROR'
-                except Exception as e:
-                    logger.error(f"Could not check drive: {str(e)}")
-                    drive_report['error'] = str(e)
+                
+                # Check disk usage
+                usage = psutil.disk_usage(partition.mountpoint)
+                disk_usage_status = 'NORMAL'
+                if usage.percent > 90:
+                    disk_usage_status = 'CRITICAL_HIGH_USAGE'
+                elif usage.percent > 80:
+                    disk_usage_status = 'WARNING_HIGH_USAGE'
+                    
+                drive_report.update({
+                    'total_space': self._convert_bytes(usage.total),
+                    'used_space': self._convert_bytes(usage.used),
+                    'free_space': self._convert_bytes(usage.free),
+                    'usage_percent': usage.percent,
+                    'usage_status': disk_usage_status
+                })
+                
+                # Check SMART health
+                smart_health = self._check_smart_health(partition.device)
+                drive_report.update({
+                    'smart_status': smart_health['status'],
+                    'smart_issues': smart_health['issues'],
+                    'temperature': smart_health['temp'],
+                    'smart_attributes': smart_health['attributes']
+                })
+                
+                # Update overall status
+                if smart_health['status'] == 'UNHEALTHY' or disk_usage_status == 'CRITICAL_HIGH_USAGE':
+                    overall_status = 'CRITICAL'
+                elif disk_usage_status == 'WARNING_HIGH_USAGE' and overall_status != 'CRITICAL':
+                    overall_status = 'WARNING'
+                    
                drives_health['drives'].append(drive_report)
+                
            drives_health['overall_status'] = overall_status
+            
        except Exception as e:
            logger.error(f"Error checking drives health: {str(e)}")
+            
        return drives_health

    @staticmethod