diff --git a/hwmonDaemon.py b/hwmonDaemon.py index bd1f654..d351c6d 100644 --- a/hwmonDaemon.py +++ b/hwmonDaemon.py @@ -1297,13 +1297,10 @@ class SystemHealthMonitor: # Handle all other standard SMART attributes for attr, thresholds in BASE_SMART_THRESHOLDS.items(): - if attr in line and attr != 'Wear_Leveling_Count': + if attr in line and attr != 'Wear_Leveling_Count': # Skip wear leveling as it's handled above parts = line.split() if len(parts) >= 10: - if attr == 'Erase_Fail_Count': - raw_value = int(parts[9]) if parts[9].isdigit() else 0 - else: - raw_value = self._parse_smart_value(parts[9]) + raw_value = self._parse_smart_value(parts[9]) smart_health['attributes'][attr] = raw_value if attr == 'Temperature_Celsius': @@ -1316,13 +1313,15 @@ class SystemHealthMonitor: smart_health['severity'] = 'WARNING' smart_health['issues'].append(f"High temperature: {raw_value}°C") else: - if raw_value >= thresholds['critical']: - smart_health['severity'] = 'CRITICAL' - smart_health['issues'].append(f"Critical {attr}: {raw_value}") - elif raw_value >= thresholds['warning']: - if smart_health['severity'] != 'CRITICAL': - smart_health['severity'] = 'WARNING' - smart_health['issues'].append(f"Warning {attr}: {raw_value}") + # Fix: Only trigger alerts if the raw value actually exceeds thresholds + if raw_value > 0: # Only check non-zero values + if raw_value >= thresholds['critical']: + smart_health['severity'] = 'CRITICAL' + smart_health['issues'].append(f"Critical {attr}: {raw_value}") + elif raw_value >= thresholds['warning']: + if smart_health['severity'] != 'CRITICAL': + smart_health['severity'] = 'WARNING' + smart_health['issues'].append(f"Warning {attr}: {raw_value}") # Check for recent SMART errors error_log_pattern = r"Error \d+ occurred at disk power-on lifetime: (\d+) hours" @@ -1748,12 +1747,13 @@ class SystemHealthMonitor: ) logger.debug(f"pct list output:\n{result.stdout}") - for line in result.stdout.split('\n')[1:]: # Skip header + for line in result.stdout.split('\n')[1:]: if not line.strip(): continue parts = line.split() if len(parts) < 2: + logger.debug(f"Skipping invalid line: {line}") continue vmid, status = parts[0], parts[1] @@ -1772,68 +1772,83 @@ class SystemHealthMonitor: 'filesystems': [] } - # Parse df output correctly - for fs_line in disk_info.stdout.split('\n')[1:]: # Skip header + for fs_line in disk_info.stdout.split('\n')[1:]: if not fs_line.strip() or 'MP' in fs_line: continue - # Split the df line properly - fs_parts = fs_line.split() - logger.debug(f"Split parts: {fs_parts}") - if len(fs_parts) >= 6: + # Fix: Use fs_line instead of line, and columns consistently + columns = fs_line.split() + logger.debug(f"Processing df line: {fs_line}") + logger.debug(f"Split columns: {columns}") + + if len(columns) >= 6: try: - filesystem = fs_parts[0] - - # Skip excluded mounts by filesystem name - if filesystem.startswith('appPool:') or '/mnt/pve/mediafs' in filesystem: - logger.debug(f"Skipping excluded filesystem: {filesystem}") + # Skip excluded mounts by checking the first column + if columns[0].startswith('appPool:') or '/mnt/pve/mediaf' in columns[1]: continue - # Parse the values - pct df output format: - # Filesystem 1K-blocks Used Available Use% Mounted on - total_kb = int(fs_parts[1]) - used_kb = int(fs_parts[2]) - avail_kb = int(fs_parts[3]) - usage_pct = int(fs_parts[4].rstrip('%')) - mountpoint = fs_parts[5] + # Get the mountpoint (last column) + mountpoint = columns[-1] - # Skip excluded mounts by mountpoint + # Skip excluded mountpoints if self._is_excluded_mount(mountpoint): logger.debug(f"Skipping excluded mount: {mountpoint}") continue - filesystem_info = { + # Parse size values safely - use correct column indices + total_space = self._parse_size(columns[2]) # 3rd column + used_space = self._parse_size(columns[3]) # 4th column + available_space = self._parse_size(columns[4]) # 5th column + + # Parse percentage safely + try: + usage_percent = float(columns[5].rstrip('%')) # 6th column + except (ValueError, IndexError): + # Calculate percentage if parsing fails + usage_percent = (used_space / total_space * 100) if total_space > 0 else 0 + + filesystem = { 'mountpoint': mountpoint, - 'filesystem': filesystem, - 'total_space': total_kb * 1024, # Convert to bytes - 'used_space': used_kb * 1024, - 'available': avail_kb * 1024, - 'usage_percent': usage_pct + 'total_space': total_space, + 'used_space': used_space, + 'available': available_space, + 'usage_percent': usage_percent } - container_info['filesystems'].append(filesystem_info) + container_info['filesystems'].append(filesystem) # Check thresholds - if usage_pct >= self.CONFIG['THRESHOLDS']['LXC_CRITICAL']: + if usage_percent >= self.CONFIG['THRESHOLDS']['LXC_CRITICAL']: lxc_health['status'] = 'CRITICAL' - issue = f"LXC {vmid} critical storage usage: {usage_pct}% on {mountpoint}" + issue = f"LXC {vmid} critical storage usage: {usage_percent:.1f}% on {mountpoint}" lxc_health['issues'].append(issue) - elif usage_pct >= self.CONFIG['THRESHOLDS']['LXC_WARNING']: + elif usage_percent >= self.CONFIG['THRESHOLDS']['LXC_WARNING']: if lxc_health['status'] != 'CRITICAL': lxc_health['status'] = 'WARNING' - issue = f"LXC {vmid} high storage usage: {usage_pct}% on {mountpoint}" + issue = f"LXC {vmid} high storage usage: {usage_percent:.1f}% on {mountpoint}" lxc_health['issues'].append(issue) - except (ValueError, IndexError) as e: - logger.debug(f"Error parsing df line '{fs_line}': {e}") + logger.debug(f"Filesystem details: {filesystem}") + except Exception as e: + logger.debug(f"Error processing line: {str(e)}") + logger.debug(f"Full exception: {repr(e)}") continue + # Only add container info if we have filesystem data if container_info['filesystems']: lxc_health['containers'].append(container_info) - + logger.debug(f"Added container info for VMID {vmid}") + + logger.debug("=== LXC Storage Check Summary ===") + logger.debug(f"Status: {lxc_health['status']}") + logger.debug(f"Total containers checked: {len(lxc_health['containers'])}") + logger.debug(f"Issues found: {len(lxc_health['issues'])}") + logger.debug("=== End LXC Storage Check ===") + except Exception as e: - logger.error(f"Error in LXC storage check: {e}") + logger.debug(f"Critical error during LXC storage check: {str(e)}") lxc_health['status'] = 'ERROR' - lxc_health['issues'].append(f"Error checking LXC storage: {str(e)}") + error_msg = f"Error checking LXC storage: {str(e)}" + lxc_health['issues'].append(error_msg) return lxc_health