Attempted fix for LXC storage

2025-05-29 20:23:21 -04:00
parent 1371592b9e
commit 9a700e9853
1 changed file with 63 additions and 48 deletions
--- a/hwmonDaemon.py
+++ b/hwmonDaemon.py
@@ -1297,13 +1297,10 @@ class SystemHealthMonitor:

                # Handle all other standard SMART attributes
                for attr, thresholds in BASE_SMART_THRESHOLDS.items():
-                    if attr in line and attr != 'Wear_Leveling_Count':
+                    if attr in line and attr != 'Wear_Leveling_Count':  # Skip wear leveling as it's handled above
                        parts = line.split()
                        if len(parts) >= 10:
-                            if attr == 'Erase_Fail_Count':
-                                raw_value = int(parts[9]) if parts[9].isdigit() else 0
-                            else:
-                                raw_value = self._parse_smart_value(parts[9])
+                            raw_value = self._parse_smart_value(parts[9])
                            smart_health['attributes'][attr] = raw_value

                            if attr == 'Temperature_Celsius':
@@ -1316,13 +1313,15 @@ class SystemHealthMonitor:
                                        smart_health['severity'] = 'WARNING'
                                    smart_health['issues'].append(f"High temperature: {raw_value}°C")
                            else:
-                                if raw_value >= thresholds['critical']:
-                                    smart_health['severity'] = 'CRITICAL'
-                                    smart_health['issues'].append(f"Critical {attr}: {raw_value}")
-                                elif raw_value >= thresholds['warning']:
-                                    if smart_health['severity'] != 'CRITICAL':
-                                        smart_health['severity'] = 'WARNING'
-                                    smart_health['issues'].append(f"Warning {attr}: {raw_value}")
+                                # Fix: Only trigger alerts if the raw value actually exceeds thresholds
+                                if raw_value > 0:  # Only check non-zero values
+                                    if raw_value >= thresholds['critical']:
+                                        smart_health['severity'] = 'CRITICAL'
+                                        smart_health['issues'].append(f"Critical {attr}: {raw_value}")
+                                    elif raw_value >= thresholds['warning']:
+                                        if smart_health['severity'] != 'CRITICAL':
+                                            smart_health['severity'] = 'WARNING'
+                                        smart_health['issues'].append(f"Warning {attr}: {raw_value}")

            # Check for recent SMART errors
            error_log_pattern = r"Error \d+ occurred at disk power-on lifetime: (\d+) hours"
@@ -1748,12 +1747,13 @@ class SystemHealthMonitor:
            )
            logger.debug(f"pct list output:\n{result.stdout}")
            
-            for line in result.stdout.split('\n')[1:]:  # Skip header
+            for line in result.stdout.split('\n')[1:]:
                if not line.strip():
                    continue
                    
                parts = line.split()
                if len(parts) < 2:
+                    logger.debug(f"Skipping invalid line: {line}")
                    continue
                    
                vmid, status = parts[0], parts[1]
@@ -1772,68 +1772,83 @@ class SystemHealthMonitor:
                        'filesystems': []
                    }
                    
-                    # Parse df output correctly
-                    for fs_line in disk_info.stdout.split('\n')[1:]:  # Skip header
+                    for fs_line in disk_info.stdout.split('\n')[1:]:
                        if not fs_line.strip() or 'MP' in fs_line:
                            continue
                        
-                        # Split the df line properly
-                        fs_parts = fs_line.split()
-                        logger.debug(f"Split parts: {fs_parts}")
-                        if len(fs_parts) >= 6:
+                        # Fix: Use fs_line instead of line, and columns consistently
+                        columns = fs_line.split()
+                        logger.debug(f"Processing df line: {fs_line}")
+                        logger.debug(f"Split columns: {columns}")
+                        
+                        if len(columns) >= 6:
                            try:
-                                filesystem = fs_parts[0]
-                                
-                                # Skip excluded mounts by filesystem name
-                                if filesystem.startswith('appPool:') or '/mnt/pve/mediafs' in filesystem:
-                                    logger.debug(f"Skipping excluded filesystem: {filesystem}")
+                                # Skip excluded mounts by checking the first column
+                                if columns[0].startswith('appPool:') or '/mnt/pve/mediaf' in columns[1]:
                                    continue
                                
-                                # Parse the values - pct df output format:
-                                # Filesystem     1K-blocks    Used Available Use% Mounted on
-                                total_kb = int(fs_parts[1])
-                                used_kb = int(fs_parts[2]) 
-                                avail_kb = int(fs_parts[3])
-                                usage_pct = int(fs_parts[4].rstrip('%'))
-                                mountpoint = fs_parts[5]
+                                # Get the mountpoint (last column)
+                                mountpoint = columns[-1]
                                
-                                # Skip excluded mounts by mountpoint
+                                # Skip excluded mountpoints
                                if self._is_excluded_mount(mountpoint):
                                    logger.debug(f"Skipping excluded mount: {mountpoint}")
                                    continue
                                    
-                                filesystem_info = {
+                                # Parse size values safely - use correct column indices
+                                total_space = self._parse_size(columns[2])  # 3rd column
+                                used_space = self._parse_size(columns[3])   # 4th column  
+                                available_space = self._parse_size(columns[4])  # 5th column
+                                
+                                # Parse percentage safely
+                                try:
+                                    usage_percent = float(columns[5].rstrip('%'))  # 6th column
+                                except (ValueError, IndexError):
+                                    # Calculate percentage if parsing fails
+                                    usage_percent = (used_space / total_space * 100) if total_space > 0 else 0
+                                
+                                filesystem = {
                                    'mountpoint': mountpoint,
-                                    'filesystem': filesystem,
-                                    'total_space': total_kb * 1024,  # Convert to bytes
-                                    'used_space': used_kb * 1024,
-                                    'available': avail_kb * 1024,
-                                    'usage_percent': usage_pct
+                                    'total_space': total_space,
+                                    'used_space': used_space,
+                                    'available': available_space,
+                                    'usage_percent': usage_percent
                                }
-                                container_info['filesystems'].append(filesystem_info)
+                                container_info['filesystems'].append(filesystem)

                                # Check thresholds
-                                if usage_pct >= self.CONFIG['THRESHOLDS']['LXC_CRITICAL']:
+                                if usage_percent >= self.CONFIG['THRESHOLDS']['LXC_CRITICAL']:
                                    lxc_health['status'] = 'CRITICAL'
-                                    issue = f"LXC {vmid} critical storage usage: {usage_pct}% on {mountpoint}"
+                                    issue = f"LXC {vmid} critical storage usage: {usage_percent:.1f}% on {mountpoint}"
                                    lxc_health['issues'].append(issue)
-                                elif usage_pct >= self.CONFIG['THRESHOLDS']['LXC_WARNING']:
+                                elif usage_percent >= self.CONFIG['THRESHOLDS']['LXC_WARNING']:
                                    if lxc_health['status'] != 'CRITICAL':
                                        lxc_health['status'] = 'WARNING'
-                                    issue = f"LXC {vmid} high storage usage: {usage_pct}% on {mountpoint}"
+                                    issue = f"LXC {vmid} high storage usage: {usage_percent:.1f}% on {mountpoint}"
                                    lxc_health['issues'].append(issue)

-                            except (ValueError, IndexError) as e:
-                                logger.debug(f"Error parsing df line '{fs_line}': {e}")
+                                logger.debug(f"Filesystem details: {filesystem}")
+                            except Exception as e:
+                                logger.debug(f"Error processing line: {str(e)}")
+                                logger.debug(f"Full exception: {repr(e)}")
                                continue
                    
+                    # Only add container info if we have filesystem data
                    if container_info['filesystems']:
                        lxc_health['containers'].append(container_info)
-                        
+                        logger.debug(f"Added container info for VMID {vmid}")
+            
+            logger.debug("=== LXC Storage Check Summary ===")
+            logger.debug(f"Status: {lxc_health['status']}")
+            logger.debug(f"Total containers checked: {len(lxc_health['containers'])}")
+            logger.debug(f"Issues found: {len(lxc_health['issues'])}")
+            logger.debug("=== End LXC Storage Check ===")
+                    
        except Exception as e:
-            logger.error(f"Error in LXC storage check: {e}")
+            logger.debug(f"Critical error during LXC storage check: {str(e)}")
            lxc_health['status'] = 'ERROR'
-            lxc_health['issues'].append(f"Error checking LXC storage: {str(e)}")
+            error_msg = f"Error checking LXC storage: {str(e)}"
+            lxc_health['issues'].append(error_msg)
            
        return lxc_health