From 6907f71de1cf25652ad0cf9cf76d9b53e36ce67c Mon Sep 17 00:00:00 2001
From: Jared Vititoe <jjvititoe1@gmail.com>
Date: Thu, 29 May 2025 19:50:17 -0400
Subject: [PATCH] Updated LXC storage checks

---
 hwmonDaemon.py | 182 +++++++++++++++++++++++++++++++------------------
 1 file changed, 117 insertions(+), 65 deletions(-)

diff --git a/hwmonDaemon.py b/hwmonDaemon.py
index 7c73397..00e1520 100644
--- a/hwmonDaemon.py
+++ b/hwmonDaemon.py
@@ -885,23 +885,34 @@ class SystemHealthMonitor:
         """
         disks = set()
         
-        # Method 1: Use lsblk to get physical disks
+        # Method 1: Use lsblk to get physical disks, excluding virtual devices
         try:
             result = subprocess.run(
-                ['lsblk', '-d', '-n', '-o', 'NAME'],
+                ['lsblk', '-d', '-n', '-o', 'NAME,TYPE'],
                 stdout=subprocess.PIPE,
                 text=True
             )
-            disks.update(f"/dev/{disk}" for disk in result.stdout.strip().split('\n'))
-            logger.debug(f"Disks found via lsblk: {disks}")
+            for line in result.stdout.strip().split('\n'):
+                if line:
+                    parts = line.split()
+                    if len(parts) >= 2:
+                        name, device_type = parts[0], parts[1]
+                        # Only include actual disks, exclude virtual devices
+                        if device_type == 'disk' and not name.startswith('rbd'):
+                            disks.add(f"/dev/{name}")
+            logger.debug(f"Physical disks found via lsblk: {disks}")
         except Exception as e:
             logger.debug(f"lsblk detection failed: {e}")
 
-        # Method 2: Direct device scanning
-        for pattern in ['/dev/sd*', '/dev/nvme*n*']:
+        # Method 2: Direct device scanning for physical devices only
+        for pattern in ['/dev/sd[a-z]', '/dev/nvme[0-9]n[0-9]']:
             try:
+                import glob
                 matches = glob.glob(pattern)
-                disks.update(d for d in matches if not d[-1].isdigit())
+                # Defensive: drop sd* names ending in a digit (partitions); the [a-z] glob should not match any, and the nvme glob only matches whole namespaces
+                if 'sd' in pattern:
+                    matches = [d for d in matches if not d[-1].isdigit()]
+                disks.update(matches)
                 logger.debug(f"Disks found via glob {pattern}: {matches}")
             except Exception as e:
                 logger.debug(f"Glob detection failed for {pattern}: {e}")
@@ -1151,6 +1162,12 @@ class SystemHealthMonitor:
         }
 
         try:
+            # Skip virtual devices
+            if '/dev/rbd' in device or '/dev/dm-' in device or '/dev/mapper/' in device:
+                smart_health['status'] = 'NOT_SUPPORTED'
+                smart_health['issues'].append("Virtual device - SMART not applicable")
+                return smart_health
+
             # First verify the device is SMART-capable
             drive_details = self._get_drive_details(device)
             if not drive_details.get('smart_capable', False):
@@ -1158,6 +1175,10 @@ class SystemHealthMonitor:
                 smart_health['issues'].append("SMART not supported on this device")
                 return smart_health
 
+            # Special handling for NVMe devices
+            if 'nvme' in device:
+                return self._check_nvme_smart_health(device)
+
             # If we have no model info, the device might not be responding properly
             if not drive_details.get('model'):
                 smart_health['status'] = 'ERROR'
@@ -1169,7 +1190,7 @@ class SystemHealthMonitor:
             manufacturer_profile = self._get_manufacturer_profile(
                 drive_details.get('model', ''), 
                 drive_details.get('manufacturer', ''),
-                drive_details.get('firmware', '')  # Add firmware to the call
+                drive_details.get('firmware', '')
             )
             smart_health['manufacturer_profile'] = manufacturer_profile
             
@@ -1382,6 +1403,65 @@ class SystemHealthMonitor:
 
         return smart_health
 
+    def _check_nvme_smart_health(self, device: str) -> Dict[str, Any]:
+        """
+        Check NVMe health from ``nvme smart-log <device>`` output.
+
+        Only the exact ``temperature`` and ``available_spare`` fields are read, so
+        ``available_spare_threshold`` / ``Temperature Sensor N`` cannot clobber them.
+        Returns the ``smart_health`` dict; failures set ``status`` to ``'ERROR'``.
+        """
+        smart_health = {
+            'status': 'UNKNOWN',
+            'severity': 'NORMAL',
+            'issues': [],
+            'temp': None,
+            'attributes': {},
+            'manufacturer_profile': None
+        }
+
+        try:
+            result = subprocess.run(
+                ['nvme', 'smart-log', device],
+                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+                text=True, timeout=30
+            )
+            if result.returncode != 0:
+                smart_health['status'] = 'ERROR'
+                smart_health['issues'].append("Failed to read NVMe SMART data")
+                return smart_health
+
+            smart_health['status'] = 'HEALTHY'
+            for line in result.stdout.splitlines():
+                # Fields are printed as "<name> : <value>"; match on the exact name.
+                field, _, value = line.partition(':')
+                field = field.strip().lower()
+                if field == 'temperature':
+                    # Value looks like "35 C (308 Kelvin)"; the first integer is Celsius.
+                    temp_match = re.search(r'(\d+)', value)
+                    if temp_match:
+                        smart_health['temp'] = int(temp_match.group(1))
+                        smart_health['attributes']['Temperature_Celsius'] = smart_health['temp']
+                elif field == 'available_spare':
+                    spare_match = re.search(r'(\d+)\s*%', value)
+                    if spare_match:
+                        spare_pct = int(spare_match.group(1))
+                        smart_health['attributes']['Available_Spare'] = spare_pct
+                        if spare_pct < 10:
+                            smart_health['severity'] = 'CRITICAL'
+                            smart_health['issues'].append(f"Critical Available_Spare: {spare_pct}%")
+                        elif spare_pct < 30:
+                            smart_health['severity'] = 'WARNING'
+                            smart_health['issues'].append(f"Low Available_Spare: {spare_pct}%")
+        except subprocess.TimeoutExpired:
+            smart_health['status'] = 'ERROR'
+            smart_health['issues'].append("NVMe SMART check timed out")
+        except Exception as e:
+            smart_health['status'] = 'ERROR'
+            smart_health['issues'].append(f"Error checking NVMe SMART: {str(e)}")
+
+        return smart_health
+
     def _check_drives_health(self) -> Dict[str, Any]:
         drives_health = {'overall_status': 'NORMAL', 'drives': []}
         
@@ -1665,13 +1745,12 @@ class SystemHealthMonitor:
             )
             logger.debug(f"pct list output:\n{result.stdout}")
             
-            for line in result.stdout.split('\n')[1:]:
+            for line in result.stdout.split('\n')[1:]:  # Skip header
                 if not line.strip():
                     continue
                     
                 parts = line.split()
                 if len(parts) < 2:
-                    logger.debug(f"Skipping invalid line: {line}")
                     continue
                     
                 vmid, status = parts[0], parts[1]
@@ -1690,84 +1769,57 @@ class SystemHealthMonitor:
                         'filesystems': []
                     }
                     
-                    for fs_line in disk_info.stdout.split('\n')[1:]:
-                        if not fs_line.strip() or 'MP' in fs_line:
+                    # Parse df output correctly
+                    for fs_line in disk_info.stdout.split('\n')[1:]:  # Skip header
+                        if not fs_line.strip():
                             continue
                         
-                        columns = line.split()
-                        logger.debug(f"Split parts: {parts}")
-                        if len(columns) >= 6:
+                        # Split the df line properly
+                        fs_parts = fs_line.split()
+                        if len(fs_parts) >= 6:
                             try:
+                                filesystem = fs_parts[0]
+                                total_kb = int(fs_parts[1])
+                                used_kb = int(fs_parts[2]) 
+                                avail_kb = int(fs_parts[3])
+                                usage_pct = int(fs_parts[4].rstrip('%'))
+                                mountpoint = fs_parts[5]
+                                
                                 # Skip excluded mounts
-                                if parts[0].startswith('appPool:') or '/mnt/pve/mediaf' in parts[0]:
-                                    continue
-                                
-                                # Get the mountpoint (last column)
-                                if len(parts) > 5:
-                                    # The mountpoint is the last column
-                                    mountpoint = columns[-1]
-                                else:
-                                    mountpoint = "/"
-                                
-                                # Skip excluded mountpoints
                                 if self._is_excluded_mount(mountpoint):
-                                    logger.debug(f"Skipping excluded mount: {mountpoint}")
                                     continue
                                     
-                                # Parse size values safely
-                                total_space = self._parse_size(columns[-5])
-                                used_space = self._parse_size(columns[-4])
-                                available_space = self._parse_size(columns[-3])
-                                
-                                # Parse percentage safely
-                                try:
-                                    usage_percent = float(columns[-2].rstrip('%'))
-                                except (ValueError, IndexError):
-                                    # Calculate percentage if parsing fails
-                                    usage_percent = (used_space / total_space * 100) if total_space > 0 else 0
-                                
-                                filesystem = {
+                                filesystem_info = {
                                     'mountpoint': mountpoint,
-                                    'total_space': total_space,
-                                    'used_space': used_space,
-                                    'available': available_space,
-                                    'usage_percent': usage_percent
+                                    'total_space': total_kb * 1024,  # Convert to bytes
+                                    'used_space': used_kb * 1024,
+                                    'available': avail_kb * 1024,
+                                    'usage_percent': usage_pct
                                 }
-                                container_info['filesystems'].append(filesystem)
+                                container_info['filesystems'].append(filesystem_info)
 
                                 # Check thresholds
-                                if usage_percent >= self.CONFIG['THRESHOLDS']['LXC_CRITICAL']:
+                                if usage_pct >= self.CONFIG['THRESHOLDS']['LXC_CRITICAL']:
                                     lxc_health['status'] = 'CRITICAL'
-                                    issue = f"LXC {vmid} critical storage usage: {usage_percent:.1f}% on {mountpoint}"
+                                    issue = f"LXC {vmid} critical storage usage: {usage_pct}% on {mountpoint}"
                                     lxc_health['issues'].append(issue)
-                                elif usage_percent >= self.CONFIG['THRESHOLDS']['LXC_WARNING']:
+                                elif usage_pct >= self.CONFIG['THRESHOLDS']['LXC_WARNING']:
                                     if lxc_health['status'] != 'CRITICAL':
                                         lxc_health['status'] = 'WARNING'
-                                    issue = f"LXC {vmid} high storage usage: {usage_percent:.1f}% on {mountpoint}"
+                                    issue = f"LXC {vmid} high storage usage: {usage_pct}% on {mountpoint}"
                                     lxc_health['issues'].append(issue)
 
-                                logger.debug(f"Filesystem details: {filesystem}")
-                            except Exception as e:
-                                logger.debug(f"Error processing line: {str(e)}")
-                                logger.debug(f"Full exception: {repr(e)}")
+                            except (ValueError, IndexError) as e:
+                                logger.debug(f"Error parsing df line '{fs_line}': {e}")
                                 continue
                     
-                    # Only add container info if we have filesystem data
                     if container_info['filesystems']:
                         lxc_health['containers'].append(container_info)
-                        logger.debug(f"Added container info for VMID {vmid}")
-            
-            logger.debug("=== LXC Storage Check Summary ===")
-            logger.debug(f"Status: {lxc_health['status']}")
-            logger.debug(f"Total containers checked: {len(lxc_health['containers'])}")
-            logger.debug(f"Issues found: {len(lxc_health['issues'])}")
-            logger.debug("=== End LXC Storage Check ===")
-                    
+                        
         except Exception as e:
-            logger.debug(f"Critical error during LXC storage check: {str(e)}")
+            logger.error(f"Error in LXC storage check: {e}")
             lxc_health['status'] = 'ERROR'
-            error_msg = f"Error checking LXC storage: {str(e)}"
-            lxc_health['issues'].append(error_msg)
+            lxc_health['issues'].append(f"Error checking LXC storage: {str(e)}")
             
         return lxc_health