Attempted fix for lxc storage

This commit is contained in:
2025-05-29 20:23:21 -04:00
parent 1371592b9e
commit 9a700e9853

View File

@@ -1297,12 +1297,9 @@ class SystemHealthMonitor:
# Handle all other standard SMART attributes # Handle all other standard SMART attributes
for attr, thresholds in BASE_SMART_THRESHOLDS.items(): for attr, thresholds in BASE_SMART_THRESHOLDS.items():
if attr in line and attr != 'Wear_Leveling_Count': if attr in line and attr != 'Wear_Leveling_Count': # Skip wear leveling as it's handled above
parts = line.split() parts = line.split()
if len(parts) >= 10: if len(parts) >= 10:
if attr == 'Erase_Fail_Count':
raw_value = int(parts[9]) if parts[9].isdigit() else 0
else:
raw_value = self._parse_smart_value(parts[9]) raw_value = self._parse_smart_value(parts[9])
smart_health['attributes'][attr] = raw_value smart_health['attributes'][attr] = raw_value
@@ -1316,6 +1313,8 @@ class SystemHealthMonitor:
smart_health['severity'] = 'WARNING' smart_health['severity'] = 'WARNING'
smart_health['issues'].append(f"High temperature: {raw_value}°C") smart_health['issues'].append(f"High temperature: {raw_value}°C")
else: else:
# Fix: Only trigger alerts if the raw value actually exceeds thresholds
if raw_value > 0: # Only check non-zero values
if raw_value >= thresholds['critical']: if raw_value >= thresholds['critical']:
smart_health['severity'] = 'CRITICAL' smart_health['severity'] = 'CRITICAL'
smart_health['issues'].append(f"Critical {attr}: {raw_value}") smart_health['issues'].append(f"Critical {attr}: {raw_value}")
@@ -1748,12 +1747,13 @@ class SystemHealthMonitor:
) )
logger.debug(f"pct list output:\n{result.stdout}") logger.debug(f"pct list output:\n{result.stdout}")
for line in result.stdout.split('\n')[1:]: # Skip header for line in result.stdout.split('\n')[1:]:
if not line.strip(): if not line.strip():
continue continue
parts = line.split() parts = line.split()
if len(parts) < 2: if len(parts) < 2:
logger.debug(f"Skipping invalid line: {line}")
continue continue
vmid, status = parts[0], parts[1] vmid, status = parts[0], parts[1]
@@ -1772,68 +1772,83 @@ class SystemHealthMonitor:
'filesystems': [] 'filesystems': []
} }
# Parse df output correctly for fs_line in disk_info.stdout.split('\n')[1:]:
for fs_line in disk_info.stdout.split('\n')[1:]: # Skip header
if not fs_line.strip() or 'MP' in fs_line: if not fs_line.strip() or 'MP' in fs_line:
continue continue
# Split the df line properly # Fix: Use fs_line instead of line, and columns consistently
fs_parts = fs_line.split() columns = fs_line.split()
logger.debug(f"Split parts: {fs_parts}") logger.debug(f"Processing df line: {fs_line}")
if len(fs_parts) >= 6: logger.debug(f"Split columns: {columns}")
try:
filesystem = fs_parts[0]
# Skip excluded mounts by filesystem name if len(columns) >= 6:
if filesystem.startswith('appPool:') or '/mnt/pve/mediafs' in filesystem: try:
logger.debug(f"Skipping excluded filesystem: {filesystem}") # Skip excluded mounts by checking the first column
if columns[0].startswith('appPool:') or '/mnt/pve/mediaf' in columns[1]:
continue continue
# Parse the values - pct df output format: # Get the mountpoint (last column)
# Filesystem 1K-blocks Used Available Use% Mounted on mountpoint = columns[-1]
total_kb = int(fs_parts[1])
used_kb = int(fs_parts[2])
avail_kb = int(fs_parts[3])
usage_pct = int(fs_parts[4].rstrip('%'))
mountpoint = fs_parts[5]
# Skip excluded mounts by mountpoint # Skip excluded mountpoints
if self._is_excluded_mount(mountpoint): if self._is_excluded_mount(mountpoint):
logger.debug(f"Skipping excluded mount: {mountpoint}") logger.debug(f"Skipping excluded mount: {mountpoint}")
continue continue
filesystem_info = { # Parse size values safely - use correct column indices
total_space = self._parse_size(columns[2]) # 3rd column
used_space = self._parse_size(columns[3]) # 4th column
available_space = self._parse_size(columns[4]) # 5th column
# Parse percentage safely
try:
usage_percent = float(columns[5].rstrip('%')) # 6th column
except (ValueError, IndexError):
# Calculate percentage if parsing fails
usage_percent = (used_space / total_space * 100) if total_space > 0 else 0
filesystem = {
'mountpoint': mountpoint, 'mountpoint': mountpoint,
'filesystem': filesystem, 'total_space': total_space,
'total_space': total_kb * 1024, # Convert to bytes 'used_space': used_space,
'used_space': used_kb * 1024, 'available': available_space,
'available': avail_kb * 1024, 'usage_percent': usage_percent
'usage_percent': usage_pct
} }
container_info['filesystems'].append(filesystem_info) container_info['filesystems'].append(filesystem)
# Check thresholds # Check thresholds
if usage_pct >= self.CONFIG['THRESHOLDS']['LXC_CRITICAL']: if usage_percent >= self.CONFIG['THRESHOLDS']['LXC_CRITICAL']:
lxc_health['status'] = 'CRITICAL' lxc_health['status'] = 'CRITICAL'
issue = f"LXC {vmid} critical storage usage: {usage_pct}% on {mountpoint}" issue = f"LXC {vmid} critical storage usage: {usage_percent:.1f}% on {mountpoint}"
lxc_health['issues'].append(issue) lxc_health['issues'].append(issue)
elif usage_pct >= self.CONFIG['THRESHOLDS']['LXC_WARNING']: elif usage_percent >= self.CONFIG['THRESHOLDS']['LXC_WARNING']:
if lxc_health['status'] != 'CRITICAL': if lxc_health['status'] != 'CRITICAL':
lxc_health['status'] = 'WARNING' lxc_health['status'] = 'WARNING'
issue = f"LXC {vmid} high storage usage: {usage_pct}% on {mountpoint}" issue = f"LXC {vmid} high storage usage: {usage_percent:.1f}% on {mountpoint}"
lxc_health['issues'].append(issue) lxc_health['issues'].append(issue)
except (ValueError, IndexError) as e: logger.debug(f"Filesystem details: {filesystem}")
logger.debug(f"Error parsing df line '{fs_line}': {e}") except Exception as e:
logger.debug(f"Error processing line: {str(e)}")
logger.debug(f"Full exception: {repr(e)}")
continue continue
# Only add container info if we have filesystem data
if container_info['filesystems']: if container_info['filesystems']:
lxc_health['containers'].append(container_info) lxc_health['containers'].append(container_info)
logger.debug(f"Added container info for VMID {vmid}")
logger.debug("=== LXC Storage Check Summary ===")
logger.debug(f"Status: {lxc_health['status']}")
logger.debug(f"Total containers checked: {len(lxc_health['containers'])}")
logger.debug(f"Issues found: {len(lxc_health['issues'])}")
logger.debug("=== End LXC Storage Check ===")
except Exception as e: except Exception as e:
logger.error(f"Error in LXC storage check: {e}") logger.debug(f"Critical error during LXC storage check: {str(e)}")
lxc_health['status'] = 'ERROR' lxc_health['status'] = 'ERROR'
lxc_health['issues'].append(f"Error checking LXC storage: {str(e)}") error_msg = f"Error checking LXC storage: {str(e)}"
lxc_health['issues'].append(error_msg)
return lxc_health return lxc_health