From 6907f71de1cf25652ad0cf9cf76d9b53e36ce67c Mon Sep 17 00:00:00 2001 From: Jared Vititoe Date: Thu, 29 May 2025 19:50:17 -0400 Subject: [PATCH] Updated LXC storage checks --- hwmonDaemon.py | 182 +++++++++++++++++++++++++++++++------------------ 1 file changed, 117 insertions(+), 65 deletions(-) diff --git a/hwmonDaemon.py b/hwmonDaemon.py index 7c73397..00e1520 100644 --- a/hwmonDaemon.py +++ b/hwmonDaemon.py @@ -885,23 +885,34 @@ class SystemHealthMonitor: """ disks = set() - # Method 1: Use lsblk to get physical disks + # Method 1: Use lsblk to get physical disks, excluding virtual devices try: result = subprocess.run( - ['lsblk', '-d', '-n', '-o', 'NAME'], + ['lsblk', '-d', '-n', '-o', 'NAME,TYPE'], stdout=subprocess.PIPE, text=True ) - disks.update(f"/dev/{disk}" for disk in result.stdout.strip().split('\n')) - logger.debug(f"Disks found via lsblk: {disks}") + for line in result.stdout.strip().split('\n'): + if line: + parts = line.split() + if len(parts) >= 2: + name, device_type = parts[0], parts[1] + # Only include actual disks, exclude virtual devices + if device_type == 'disk' and not name.startswith('rbd'): + disks.add(f"/dev/{name}") + logger.debug(f"Physical disks found via lsblk: {disks}") except Exception as e: logger.debug(f"lsblk detection failed: {e}") - # Method 2: Direct device scanning - for pattern in ['/dev/sd*', '/dev/nvme*n*']: + # Method 2: Direct device scanning for physical devices only + for pattern in ['/dev/sd[a-z]', '/dev/nvme[0-9]n[0-9]']: try: + import glob matches = glob.glob(pattern) - disks.update(d for d in matches if not d[-1].isdigit()) + # Filter out partitions (devices ending in numbers for sd*, already filtered for nvme) + if 'sd' in pattern: + matches = [d for d in matches if not d[-1].isdigit()] + disks.update(matches) logger.debug(f"Disks found via glob {pattern}: {matches}") except Exception as e: logger.debug(f"Glob detection failed for {pattern}: {e}") @@ -1151,6 +1162,12 @@ class SystemHealthMonitor: } try: + # 
def _check_nvme_smart_health(self, device: str) -> Dict[str, Any]:
    """
    Dedicated NVMe SMART health check.

    Runs ``nvme smart-log <device>`` (nvme-cli) and parses its
    ``field : value`` lines.

    :param device: Device path passed verbatim to nvme-cli.
    :return: Dict with the keys ``status`` ('HEALTHY', 'ERROR' or the
        initial 'UNKNOWN'), ``severity`` ('NORMAL', 'WARNING' or
        'CRITICAL'), ``issues`` (list of messages), ``temp`` (int or None),
        ``attributes`` (parsed values) and ``manufacturer_profile``
        (always None here).

    Never raises: a timeout or any other failure is reported through
    ``status == 'ERROR'`` plus an entry in ``issues``.
    """
    smart_health = {
        'status': 'UNKNOWN',
        'severity': 'NORMAL',
        'issues': [],
        'temp': None,
        'attributes': {},
        'manufacturer_profile': None
    }

    try:
        # Use nvme-cli for NVMe devices
        result = subprocess.run(
            ['nvme', 'smart-log', device],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=30
        )

        if result.returncode == 0:
            smart_health['status'] = 'HEALTHY'

            # Parse NVMe smart log output
            for line in result.stdout.splitlines():
                # Split on the first ':' and compare the exact field name.
                # Substring matching would also hit
                # "available_spare_threshold", "Warning Temperature Time",
                # "Temperature Sensor N", ... and clobber the real values.
                field, sep, value = line.partition(':')
                if not sep:
                    continue
                field = field.strip().lower()

                if field == 'temperature':
                    # First integer of the value is the reading in Celsius
                    temp_match = re.search(r'(\d+)', value)
                    if temp_match:
                        smart_health['temp'] = int(temp_match.group(1))
                        smart_health['attributes']['Temperature_Celsius'] = smart_health['temp']

                elif field == 'available_spare':
                    spare_match = re.search(r'(\d+)%', value)
                    if spare_match:
                        spare_pct = int(spare_match.group(1))
                        smart_health['attributes']['Available_Spare'] = spare_pct
                        if spare_pct < 10:
                            smart_health['severity'] = 'CRITICAL'
                            smart_health['issues'].append(f"Critical Available_Spare: {spare_pct}%")
                        elif spare_pct < 30:
                            smart_health['severity'] = 'WARNING'
                            smart_health['issues'].append(f"Low Available_Spare: {spare_pct}%")
        else:
            smart_health['status'] = 'ERROR'
            smart_health['issues'].append("Failed to read NVMe SMART data")

    except subprocess.TimeoutExpired:
        smart_health['status'] = 'ERROR'
        smart_health['issues'].append("NVMe SMART check timed out")
    except Exception as e:
        # Best-effort check: e.g. nvme-cli not installed. Report, don't crash.
        smart_health['status'] = 'ERROR'
        smart_health['issues'].append(f"Error checking NVMe SMART: {str(e)}")

    return smart_health
logger.debug(f"Skipping invalid line: {line}") continue vmid, status = parts[0], parts[1] @@ -1690,84 +1769,57 @@ class SystemHealthMonitor: 'filesystems': [] } - for fs_line in disk_info.stdout.split('\n')[1:]: - if not fs_line.strip() or 'MP' in fs_line: + # Parse df output correctly + for fs_line in disk_info.stdout.split('\n')[1:]: # Skip header + if not fs_line.strip(): continue - columns = line.split() - logger.debug(f"Split parts: {parts}") - if len(columns) >= 6: + # Split the df line properly + fs_parts = fs_line.split() + if len(fs_parts) >= 6: try: + filesystem = fs_parts[0] + total_kb = int(fs_parts[1]) + used_kb = int(fs_parts[2]) + avail_kb = int(fs_parts[3]) + usage_pct = int(fs_parts[4].rstrip('%')) + mountpoint = fs_parts[5] + # Skip excluded mounts - if parts[0].startswith('appPool:') or '/mnt/pve/mediaf' in parts[0]: - continue - - # Get the mountpoint (last column) - if len(parts) > 5: - # The mountpoint is the last column - mountpoint = columns[-1] - else: - mountpoint = "/" - - # Skip excluded mountpoints if self._is_excluded_mount(mountpoint): - logger.debug(f"Skipping excluded mount: {mountpoint}") continue - # Parse size values safely - total_space = self._parse_size(columns[-5]) - used_space = self._parse_size(columns[-4]) - available_space = self._parse_size(columns[-3]) - - # Parse percentage safely - try: - usage_percent = float(columns[-2].rstrip('%')) - except (ValueError, IndexError): - # Calculate percentage if parsing fails - usage_percent = (used_space / total_space * 100) if total_space > 0 else 0 - - filesystem = { + filesystem_info = { 'mountpoint': mountpoint, - 'total_space': total_space, - 'used_space': used_space, - 'available': available_space, - 'usage_percent': usage_percent + 'total_space': total_kb * 1024, # Convert to bytes + 'used_space': used_kb * 1024, + 'available': avail_kb * 1024, + 'usage_percent': usage_pct } - container_info['filesystems'].append(filesystem) + 
container_info['filesystems'].append(filesystem_info) # Check thresholds - if usage_percent >= self.CONFIG['THRESHOLDS']['LXC_CRITICAL']: + if usage_pct >= self.CONFIG['THRESHOLDS']['LXC_CRITICAL']: lxc_health['status'] = 'CRITICAL' - issue = f"LXC {vmid} critical storage usage: {usage_percent:.1f}% on {mountpoint}" + issue = f"LXC {vmid} critical storage usage: {usage_pct}% on {mountpoint}" lxc_health['issues'].append(issue) - elif usage_percent >= self.CONFIG['THRESHOLDS']['LXC_WARNING']: + elif usage_pct >= self.CONFIG['THRESHOLDS']['LXC_WARNING']: if lxc_health['status'] != 'CRITICAL': lxc_health['status'] = 'WARNING' - issue = f"LXC {vmid} high storage usage: {usage_percent:.1f}% on {mountpoint}" + issue = f"LXC {vmid} high storage usage: {usage_pct}% on {mountpoint}" lxc_health['issues'].append(issue) - logger.debug(f"Filesystem details: {filesystem}") - except Exception as e: - logger.debug(f"Error processing line: {str(e)}") - logger.debug(f"Full exception: {repr(e)}") + except (ValueError, IndexError) as e: + logger.debug(f"Error parsing df line '{fs_line}': {e}") continue - # Only add container info if we have filesystem data if container_info['filesystems']: lxc_health['containers'].append(container_info) - logger.debug(f"Added container info for VMID {vmid}") - - logger.debug("=== LXC Storage Check Summary ===") - logger.debug(f"Status: {lxc_health['status']}") - logger.debug(f"Total containers checked: {len(lxc_health['containers'])}") - logger.debug(f"Issues found: {len(lxc_health['issues'])}") - logger.debug("=== End LXC Storage Check ===") - + except Exception as e: - logger.debug(f"Critical error during LXC storage check: {str(e)}") + logger.error(f"Error in LXC storage check: {e}") lxc_health['status'] = 'ERROR' - error_msg = f"Error checking LXC storage: {str(e)}" - lxc_health['issues'].append(error_msg) + lxc_health['issues'].append(f"Error checking LXC storage: {str(e)}") return lxc_health