Attempted fix for LXC storage

This commit is contained in:
2025-05-29 20:23:21 -04:00
parent 1371592b9e
commit 9a700e9853

View File

@@ -1297,13 +1297,10 @@ class SystemHealthMonitor:
# Handle all other standard SMART attributes
for attr, thresholds in BASE_SMART_THRESHOLDS.items():
if attr in line and attr != 'Wear_Leveling_Count':
if attr in line and attr != 'Wear_Leveling_Count': # Skip wear leveling as it's handled above
parts = line.split()
if len(parts) >= 10:
if attr == 'Erase_Fail_Count':
raw_value = int(parts[9]) if parts[9].isdigit() else 0
else:
raw_value = self._parse_smart_value(parts[9])
raw_value = self._parse_smart_value(parts[9])
smart_health['attributes'][attr] = raw_value
if attr == 'Temperature_Celsius':
@@ -1316,13 +1313,15 @@ class SystemHealthMonitor:
smart_health['severity'] = 'WARNING'
smart_health['issues'].append(f"High temperature: {raw_value}°C")
else:
if raw_value >= thresholds['critical']:
smart_health['severity'] = 'CRITICAL'
smart_health['issues'].append(f"Critical {attr}: {raw_value}")
elif raw_value >= thresholds['warning']:
if smart_health['severity'] != 'CRITICAL':
smart_health['severity'] = 'WARNING'
smart_health['issues'].append(f"Warning {attr}: {raw_value}")
# Fix: Only trigger alerts if the raw value actually exceeds thresholds
if raw_value > 0: # Only check non-zero values
if raw_value >= thresholds['critical']:
smart_health['severity'] = 'CRITICAL'
smart_health['issues'].append(f"Critical {attr}: {raw_value}")
elif raw_value >= thresholds['warning']:
if smart_health['severity'] != 'CRITICAL':
smart_health['severity'] = 'WARNING'
smart_health['issues'].append(f"Warning {attr}: {raw_value}")
# Check for recent SMART errors
error_log_pattern = r"Error \d+ occurred at disk power-on lifetime: (\d+) hours"
@@ -1748,12 +1747,13 @@ class SystemHealthMonitor:
)
logger.debug(f"pct list output:\n{result.stdout}")
for line in result.stdout.split('\n')[1:]: # Skip header
for line in result.stdout.split('\n')[1:]:
if not line.strip():
continue
parts = line.split()
if len(parts) < 2:
logger.debug(f"Skipping invalid line: {line}")
continue
vmid, status = parts[0], parts[1]
@@ -1772,68 +1772,83 @@ class SystemHealthMonitor:
'filesystems': []
}
# Parse df output correctly
for fs_line in disk_info.stdout.split('\n')[1:]: # Skip header
for fs_line in disk_info.stdout.split('\n')[1:]:
if not fs_line.strip() or 'MP' in fs_line:
continue
# Split the df line properly
fs_parts = fs_line.split()
logger.debug(f"Split parts: {fs_parts}")
if len(fs_parts) >= 6:
# Fix: Use fs_line instead of line, and columns consistently
columns = fs_line.split()
logger.debug(f"Processing df line: {fs_line}")
logger.debug(f"Split columns: {columns}")
if len(columns) >= 6:
try:
filesystem = fs_parts[0]
# Skip excluded mounts by filesystem name
if filesystem.startswith('appPool:') or '/mnt/pve/mediafs' in filesystem:
logger.debug(f"Skipping excluded filesystem: {filesystem}")
# Skip excluded mounts by checking the first column
if columns[0].startswith('appPool:') or '/mnt/pve/mediaf' in columns[1]:
continue
# Parse the values - pct df output format:
# Filesystem 1K-blocks Used Available Use% Mounted on
total_kb = int(fs_parts[1])
used_kb = int(fs_parts[2])
avail_kb = int(fs_parts[3])
usage_pct = int(fs_parts[4].rstrip('%'))
mountpoint = fs_parts[5]
# Get the mountpoint (last column)
mountpoint = columns[-1]
# Skip excluded mounts by mountpoint
# Skip excluded mountpoints
if self._is_excluded_mount(mountpoint):
logger.debug(f"Skipping excluded mount: {mountpoint}")
continue
filesystem_info = {
# Parse size values safely - use correct column indices
total_space = self._parse_size(columns[2]) # 3rd column
used_space = self._parse_size(columns[3]) # 4th column
available_space = self._parse_size(columns[4]) # 5th column
# Parse percentage safely
try:
usage_percent = float(columns[5].rstrip('%')) # 6th column
except (ValueError, IndexError):
# Calculate percentage if parsing fails
usage_percent = (used_space / total_space * 100) if total_space > 0 else 0
filesystem = {
'mountpoint': mountpoint,
'filesystem': filesystem,
'total_space': total_kb * 1024, # Convert to bytes
'used_space': used_kb * 1024,
'available': avail_kb * 1024,
'usage_percent': usage_pct
'total_space': total_space,
'used_space': used_space,
'available': available_space,
'usage_percent': usage_percent
}
container_info['filesystems'].append(filesystem_info)
container_info['filesystems'].append(filesystem)
# Check thresholds
if usage_pct >= self.CONFIG['THRESHOLDS']['LXC_CRITICAL']:
if usage_percent >= self.CONFIG['THRESHOLDS']['LXC_CRITICAL']:
lxc_health['status'] = 'CRITICAL'
issue = f"LXC {vmid} critical storage usage: {usage_pct}% on {mountpoint}"
issue = f"LXC {vmid} critical storage usage: {usage_percent:.1f}% on {mountpoint}"
lxc_health['issues'].append(issue)
elif usage_pct >= self.CONFIG['THRESHOLDS']['LXC_WARNING']:
elif usage_percent >= self.CONFIG['THRESHOLDS']['LXC_WARNING']:
if lxc_health['status'] != 'CRITICAL':
lxc_health['status'] = 'WARNING'
issue = f"LXC {vmid} high storage usage: {usage_pct}% on {mountpoint}"
issue = f"LXC {vmid} high storage usage: {usage_percent:.1f}% on {mountpoint}"
lxc_health['issues'].append(issue)
except (ValueError, IndexError) as e:
logger.debug(f"Error parsing df line '{fs_line}': {e}")
logger.debug(f"Filesystem details: {filesystem}")
except Exception as e:
logger.debug(f"Error processing line: {str(e)}")
logger.debug(f"Full exception: {repr(e)}")
continue
# Only add container info if we have filesystem data
if container_info['filesystems']:
lxc_health['containers'].append(container_info)
logger.debug(f"Added container info for VMID {vmid}")
logger.debug("=== LXC Storage Check Summary ===")
logger.debug(f"Status: {lxc_health['status']}")
logger.debug(f"Total containers checked: {len(lxc_health['containers'])}")
logger.debug(f"Issues found: {len(lxc_health['issues'])}")
logger.debug("=== End LXC Storage Check ===")
except Exception as e:
logger.error(f"Error in LXC storage check: {e}")
logger.debug(f"Critical error during LXC storage check: {str(e)}")
lxc_health['status'] = 'ERROR'
lxc_health['issues'].append(f"Error checking LXC storage: {str(e)}")
error_msg = f"Error checking LXC storage: {str(e)}"
lxc_health['issues'].append(error_msg)
return lxc_health