Attempted fix for LXC storage
This commit is contained in:
111
hwmonDaemon.py
111
hwmonDaemon.py
@@ -1297,13 +1297,10 @@ class SystemHealthMonitor:
|
||||
|
||||
# Handle all other standard SMART attributes
|
||||
for attr, thresholds in BASE_SMART_THRESHOLDS.items():
|
||||
if attr in line and attr != 'Wear_Leveling_Count':
|
||||
if attr in line and attr != 'Wear_Leveling_Count': # Skip wear leveling as it's handled above
|
||||
parts = line.split()
|
||||
if len(parts) >= 10:
|
||||
if attr == 'Erase_Fail_Count':
|
||||
raw_value = int(parts[9]) if parts[9].isdigit() else 0
|
||||
else:
|
||||
raw_value = self._parse_smart_value(parts[9])
|
||||
raw_value = self._parse_smart_value(parts[9])
|
||||
smart_health['attributes'][attr] = raw_value
|
||||
|
||||
if attr == 'Temperature_Celsius':
|
||||
@@ -1316,13 +1313,15 @@ class SystemHealthMonitor:
|
||||
smart_health['severity'] = 'WARNING'
|
||||
smart_health['issues'].append(f"High temperature: {raw_value}°C")
|
||||
else:
|
||||
if raw_value >= thresholds['critical']:
|
||||
smart_health['severity'] = 'CRITICAL'
|
||||
smart_health['issues'].append(f"Critical {attr}: {raw_value}")
|
||||
elif raw_value >= thresholds['warning']:
|
||||
if smart_health['severity'] != 'CRITICAL':
|
||||
smart_health['severity'] = 'WARNING'
|
||||
smart_health['issues'].append(f"Warning {attr}: {raw_value}")
|
||||
# Fix: Only trigger alerts if the raw value actually exceeds thresholds
|
||||
if raw_value > 0: # Only check non-zero values
|
||||
if raw_value >= thresholds['critical']:
|
||||
smart_health['severity'] = 'CRITICAL'
|
||||
smart_health['issues'].append(f"Critical {attr}: {raw_value}")
|
||||
elif raw_value >= thresholds['warning']:
|
||||
if smart_health['severity'] != 'CRITICAL':
|
||||
smart_health['severity'] = 'WARNING'
|
||||
smart_health['issues'].append(f"Warning {attr}: {raw_value}")
|
||||
|
||||
# Check for recent SMART errors
|
||||
error_log_pattern = r"Error \d+ occurred at disk power-on lifetime: (\d+) hours"
|
||||
@@ -1748,12 +1747,13 @@ class SystemHealthMonitor:
|
||||
)
|
||||
logger.debug(f"pct list output:\n{result.stdout}")
|
||||
|
||||
for line in result.stdout.split('\n')[1:]: # Skip header
|
||||
for line in result.stdout.split('\n')[1:]:
|
||||
if not line.strip():
|
||||
continue
|
||||
|
||||
parts = line.split()
|
||||
if len(parts) < 2:
|
||||
logger.debug(f"Skipping invalid line: {line}")
|
||||
continue
|
||||
|
||||
vmid, status = parts[0], parts[1]
|
||||
@@ -1772,68 +1772,83 @@ class SystemHealthMonitor:
|
||||
'filesystems': []
|
||||
}
|
||||
|
||||
# Parse df output correctly
|
||||
for fs_line in disk_info.stdout.split('\n')[1:]: # Skip header
|
||||
for fs_line in disk_info.stdout.split('\n')[1:]:
|
||||
if not fs_line.strip() or 'MP' in fs_line:
|
||||
continue
|
||||
|
||||
# Split the df line properly
|
||||
fs_parts = fs_line.split()
|
||||
logger.debug(f"Split parts: {fs_parts}")
|
||||
if len(fs_parts) >= 6:
|
||||
# Fix: Use fs_line instead of line, and columns consistently
|
||||
columns = fs_line.split()
|
||||
logger.debug(f"Processing df line: {fs_line}")
|
||||
logger.debug(f"Split columns: {columns}")
|
||||
|
||||
if len(columns) >= 6:
|
||||
try:
|
||||
filesystem = fs_parts[0]
|
||||
|
||||
# Skip excluded mounts by filesystem name
|
||||
if filesystem.startswith('appPool:') or '/mnt/pve/mediafs' in filesystem:
|
||||
logger.debug(f"Skipping excluded filesystem: {filesystem}")
|
||||
# Skip excluded mounts by checking the first column
|
||||
if columns[0].startswith('appPool:') or '/mnt/pve/mediaf' in columns[1]:
|
||||
continue
|
||||
|
||||
# Parse the values - pct df output format:
|
||||
# Filesystem 1K-blocks Used Available Use% Mounted on
|
||||
total_kb = int(fs_parts[1])
|
||||
used_kb = int(fs_parts[2])
|
||||
avail_kb = int(fs_parts[3])
|
||||
usage_pct = int(fs_parts[4].rstrip('%'))
|
||||
mountpoint = fs_parts[5]
|
||||
# Get the mountpoint (last column)
|
||||
mountpoint = columns[-1]
|
||||
|
||||
# Skip excluded mounts by mountpoint
|
||||
# Skip excluded mountpoints
|
||||
if self._is_excluded_mount(mountpoint):
|
||||
logger.debug(f"Skipping excluded mount: {mountpoint}")
|
||||
continue
|
||||
|
||||
filesystem_info = {
|
||||
# Parse size values safely - use correct column indices
|
||||
total_space = self._parse_size(columns[2]) # 3rd column
|
||||
used_space = self._parse_size(columns[3]) # 4th column
|
||||
available_space = self._parse_size(columns[4]) # 5th column
|
||||
|
||||
# Parse percentage safely
|
||||
try:
|
||||
usage_percent = float(columns[5].rstrip('%')) # 6th column
|
||||
except (ValueError, IndexError):
|
||||
# Calculate percentage if parsing fails
|
||||
usage_percent = (used_space / total_space * 100) if total_space > 0 else 0
|
||||
|
||||
filesystem = {
|
||||
'mountpoint': mountpoint,
|
||||
'filesystem': filesystem,
|
||||
'total_space': total_kb * 1024, # Convert to bytes
|
||||
'used_space': used_kb * 1024,
|
||||
'available': avail_kb * 1024,
|
||||
'usage_percent': usage_pct
|
||||
'total_space': total_space,
|
||||
'used_space': used_space,
|
||||
'available': available_space,
|
||||
'usage_percent': usage_percent
|
||||
}
|
||||
container_info['filesystems'].append(filesystem_info)
|
||||
container_info['filesystems'].append(filesystem)
|
||||
|
||||
# Check thresholds
|
||||
if usage_pct >= self.CONFIG['THRESHOLDS']['LXC_CRITICAL']:
|
||||
if usage_percent >= self.CONFIG['THRESHOLDS']['LXC_CRITICAL']:
|
||||
lxc_health['status'] = 'CRITICAL'
|
||||
issue = f"LXC {vmid} critical storage usage: {usage_pct}% on {mountpoint}"
|
||||
issue = f"LXC {vmid} critical storage usage: {usage_percent:.1f}% on {mountpoint}"
|
||||
lxc_health['issues'].append(issue)
|
||||
elif usage_pct >= self.CONFIG['THRESHOLDS']['LXC_WARNING']:
|
||||
elif usage_percent >= self.CONFIG['THRESHOLDS']['LXC_WARNING']:
|
||||
if lxc_health['status'] != 'CRITICAL':
|
||||
lxc_health['status'] = 'WARNING'
|
||||
issue = f"LXC {vmid} high storage usage: {usage_pct}% on {mountpoint}"
|
||||
issue = f"LXC {vmid} high storage usage: {usage_percent:.1f}% on {mountpoint}"
|
||||
lxc_health['issues'].append(issue)
|
||||
|
||||
except (ValueError, IndexError) as e:
|
||||
logger.debug(f"Error parsing df line '{fs_line}': {e}")
|
||||
logger.debug(f"Filesystem details: {filesystem}")
|
||||
except Exception as e:
|
||||
logger.debug(f"Error processing line: {str(e)}")
|
||||
logger.debug(f"Full exception: {repr(e)}")
|
||||
continue
|
||||
|
||||
# Only add container info if we have filesystem data
|
||||
if container_info['filesystems']:
|
||||
lxc_health['containers'].append(container_info)
|
||||
|
||||
logger.debug(f"Added container info for VMID {vmid}")
|
||||
|
||||
logger.debug("=== LXC Storage Check Summary ===")
|
||||
logger.debug(f"Status: {lxc_health['status']}")
|
||||
logger.debug(f"Total containers checked: {len(lxc_health['containers'])}")
|
||||
logger.debug(f"Issues found: {len(lxc_health['issues'])}")
|
||||
logger.debug("=== End LXC Storage Check ===")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in LXC storage check: {e}")
|
||||
logger.debug(f"Critical error during LXC storage check: {str(e)}")
|
||||
lxc_health['status'] = 'ERROR'
|
||||
lxc_health['issues'].append(f"Error checking LXC storage: {str(e)}")
|
||||
error_msg = f"Error checking LXC storage: {str(e)}"
|
||||
lxc_health['issues'].append(error_msg)
|
||||
|
||||
return lxc_health
|
||||
|
||||
|
||||
Reference in New Issue
Block a user