Updated LXC storage checks
hwmonDaemon.py (182 changed lines)
@@ -885,23 +885,34 @@ class SystemHealthMonitor:
         """
         disks = set()
 
-        # Method 1: Use lsblk to get physical disks
+        # Method 1: Use lsblk to get physical disks, excluding virtual devices
         try:
             result = subprocess.run(
-                ['lsblk', '-d', '-n', '-o', 'NAME'],
+                ['lsblk', '-d', '-n', '-o', 'NAME,TYPE'],
                 stdout=subprocess.PIPE,
                 text=True
             )
-            disks.update(f"/dev/{disk}" for disk in result.stdout.strip().split('\n'))
-            logger.debug(f"Disks found via lsblk: {disks}")
+            for line in result.stdout.strip().split('\n'):
+                if line:
+                    parts = line.split()
+                    if len(parts) >= 2:
+                        name, device_type = parts[0], parts[1]
+                        # Only include actual disks, exclude virtual devices
+                        if device_type == 'disk' and not name.startswith('rbd'):
+                            disks.add(f"/dev/{name}")
+            logger.debug(f"Physical disks found via lsblk: {disks}")
         except Exception as e:
             logger.debug(f"lsblk detection failed: {e}")
 
-        # Method 2: Direct device scanning
-        for pattern in ['/dev/sd*', '/dev/nvme*n*']:
+        # Method 2: Direct device scanning for physical devices only
+        for pattern in ['/dev/sd[a-z]', '/dev/nvme[0-9]n[0-9]']:
             try:
+                import glob
                 matches = glob.glob(pattern)
-                disks.update(d for d in matches if not d[-1].isdigit())
+                # Filter out partitions (devices ending in numbers for sd*, already filtered for nvme)
+                if 'sd' in pattern:
+                    matches = [d for d in matches if not d[-1].isdigit()]
+                disks.update(matches)
                 logger.debug(f"Disks found via glob {pattern}: {matches}")
             except Exception as e:
                 logger.debug(f"Glob detection failed for {pattern}: {e}")
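
For reference, the new detection path in isolation. This is a minimal sketch, not code from the commit: the standalone function name and logger setup are illustrative, but the lsblk invocation, the TYPE filter, and the bracketed glob patterns mirror the diff above.

import glob
import logging
import subprocess

logger = logging.getLogger(__name__)

def detect_physical_disks() -> set:
    # Illustrative helper mirroring the updated detection logic.
    disks = set()

    # lsblk with NAME,TYPE lets us keep only TYPE == 'disk' and drop
    # Ceph RBD and other virtual block devices.
    try:
        result = subprocess.run(
            ['lsblk', '-d', '-n', '-o', 'NAME,TYPE'],
            stdout=subprocess.PIPE,
            text=True
        )
        for line in result.stdout.strip().split('\n'):
            parts = line.split()
            if len(parts) >= 2 and parts[1] == 'disk' and not parts[0].startswith('rbd'):
                disks.add(f"/dev/{parts[0]}")
    except Exception as e:
        logger.debug(f"lsblk detection failed: {e}")

    # Fallback: the bracketed globs match whole-disk nodes only, never partitions.
    for pattern in ['/dev/sd[a-z]', '/dev/nvme[0-9]n[0-9]']:
        disks.update(glob.glob(pattern))

    return disks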
@@ -1151,6 +1162,12 @@ class SystemHealthMonitor:
         }
 
         try:
+            # Skip virtual devices
+            if '/dev/rbd' in device or '/dev/dm-' in device or '/dev/mapper/' in device:
+                smart_health['status'] = 'NOT_SUPPORTED'
+                smart_health['issues'].append("Virtual device - SMART not applicable")
+                return smart_health
+
             # First verify the device is SMART-capable
             drive_details = self._get_drive_details(device)
             if not drive_details.get('smart_capable', False):
@@ -1158,6 +1175,10 @@ class SystemHealthMonitor:
                 smart_health['issues'].append("SMART not supported on this device")
                 return smart_health
 
+            # Special handling for NVMe devices
+            if 'nvme' in device:
+                return self._check_nvme_smart_health(device)
+
             # If we have no model info, the device might not be responding properly
             if not drive_details.get('model'):
                 smart_health['status'] = 'ERROR'
@@ -1169,7 +1190,7 @@
             manufacturer_profile = self._get_manufacturer_profile(
                 drive_details.get('model', ''),
                 drive_details.get('manufacturer', ''),
-                drive_details.get('firmware', '')  # Add firmware to the call
+                drive_details.get('firmware', '')
             )
             smart_health['manufacturer_profile'] = manufacturer_profile
 
@@ -1382,6 +1403,65 @@ class SystemHealthMonitor:
 
         return smart_health
 
+    def _check_nvme_smart_health(self, device: str) -> Dict[str, Any]:
+        """
+        Dedicated NVMe SMART health check.
+        """
+        smart_health = {
+            'status': 'UNKNOWN',
+            'severity': 'NORMAL',
+            'issues': [],
+            'temp': None,
+            'attributes': {},
+            'manufacturer_profile': None
+        }
+
+        try:
+            # Use nvme-cli for NVMe devices
+            result = subprocess.run(
+                ['nvme', 'smart-log', device],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+                timeout=30
+            )
+
+            if result.returncode == 0:
+                smart_health['status'] = 'HEALTHY'
+
+                # Parse NVMe smart log output
+                for line in result.stdout.split('\n'):
+                    if 'temperature' in line.lower():
+                        # Extract temperature
+                        temp_match = re.search(r'(\d+)', line)
+                        if temp_match:
+                            smart_health['temp'] = int(temp_match.group(1))
+                            smart_health['attributes']['Temperature_Celsius'] = smart_health['temp']
+
+                    elif 'available_spare' in line.lower():
+                        spare_match = re.search(r'(\d+)%', line)
+                        if spare_match:
+                            spare_pct = int(spare_match.group(1))
+                            smart_health['attributes']['Available_Spare'] = spare_pct
+                            if spare_pct < 10:
+                                smart_health['severity'] = 'CRITICAL'
+                                smart_health['issues'].append(f"Critical Available_Spare: {spare_pct}%")
+                            elif spare_pct < 30:
+                                smart_health['severity'] = 'WARNING'
+                                smart_health['issues'].append(f"Low Available_Spare: {spare_pct}%")
+            else:
+                smart_health['status'] = 'ERROR'
+                smart_health['issues'].append("Failed to read NVMe SMART data")
+
+        except subprocess.TimeoutExpired:
+            smart_health['status'] = 'ERROR'
+            smart_health['issues'].append("NVMe SMART check timed out")
+        except Exception as e:
+            smart_health['status'] = 'ERROR'
+            smart_health['issues'].append(f"Error checking NVMe SMART: {str(e)}")
+
+        return smart_health
+
     def _check_drives_health(self) -> Dict[str, Any]:
         drives_health = {'overall_status': 'NORMAL', 'drives': []}
 
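
The new _check_nvme_smart_health method parses the plain-text output of `nvme smart-log` line by line. A quick standalone check of that parsing logic against sample output (the sample lines are illustrative; exact field formatting varies between nvme-cli versions):

import re

sample = '''temperature                         : 38 C
available_spare                     : 100%'''

temp = spare = None
for line in sample.split('\n'):
    if 'temperature' in line.lower():
        m = re.search(r'(\d+)', line)
        if m:
            temp = int(m.group(1))    # -> 38
    elif 'available_spare' in line.lower():
        m = re.search(r'(\d+)%', line)
        if m:
            spare = int(m.group(1))   # -> 100

print(temp, spare)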
@@ -1665,13 +1745,12 @@ class SystemHealthMonitor:
             )
             logger.debug(f"pct list output:\n{result.stdout}")
 
-            for line in result.stdout.split('\n')[1:]:
+            for line in result.stdout.split('\n')[1:]:  # Skip header
                 if not line.strip():
                     continue
 
                 parts = line.split()
                 if len(parts) < 2:
-                    logger.debug(f"Skipping invalid line: {line}")
                     continue
 
                 vmid, status = parts[0], parts[1]
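
The container loop above expects `pct list` output with a header row followed by whitespace-separated columns. A small sanity check of the vmid/status extraction (the sample rows and column spacing are illustrative):

sample = '''VMID       Status     Lock         Name
101        running                 web01
102        stopped                 backup'''

for line in sample.split('\n')[1:]:   # Skip header, as in the diff
    parts = line.split()
    if len(parts) >= 2:
        vmid, status = parts[0], parts[1]
        print(vmid, status)           # 101 running / 102 stopped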
@@ -1690,84 +1769,57 @@ class SystemHealthMonitor:
                     'filesystems': []
                 }
 
-                for fs_line in disk_info.stdout.split('\n')[1:]:
-                    if not fs_line.strip() or 'MP' in fs_line:
+                # Parse df output correctly
+                for fs_line in disk_info.stdout.split('\n')[1:]:  # Skip header
+                    if not fs_line.strip():
                         continue
 
-                    columns = line.split()
-                    logger.debug(f"Split parts: {parts}")
-                    if len(columns) >= 6:
+                    # Split the df line properly
+                    fs_parts = fs_line.split()
+                    if len(fs_parts) >= 6:
                         try:
+                            filesystem = fs_parts[0]
+                            total_kb = int(fs_parts[1])
+                            used_kb = int(fs_parts[2])
+                            avail_kb = int(fs_parts[3])
+                            usage_pct = int(fs_parts[4].rstrip('%'))
+                            mountpoint = fs_parts[5]
+
                             # Skip excluded mounts
-                            if parts[0].startswith('appPool:') or '/mnt/pve/mediaf' in parts[0]:
-                                continue
-
-                            # Get the mountpoint (last column)
-                            if len(parts) > 5:
-                                # The mountpoint is the last column
-                                mountpoint = columns[-1]
-                            else:
-                                mountpoint = "/"
-
-                            # Skip excluded mountpoints
                             if self._is_excluded_mount(mountpoint):
-                                logger.debug(f"Skipping excluded mount: {mountpoint}")
                                 continue
 
-                            # Parse size values safely
-                            total_space = self._parse_size(columns[-5])
-                            used_space = self._parse_size(columns[-4])
-                            available_space = self._parse_size(columns[-3])
-
-                            # Parse percentage safely
-                            try:
-                                usage_percent = float(columns[-2].rstrip('%'))
-                            except (ValueError, IndexError):
-                                # Calculate percentage if parsing fails
-                                usage_percent = (used_space / total_space * 100) if total_space > 0 else 0
-
-                            filesystem = {
+                            filesystem_info = {
                                 'mountpoint': mountpoint,
-                                'total_space': total_space,
-                                'used_space': used_space,
-                                'available': available_space,
-                                'usage_percent': usage_percent
+                                'total_space': total_kb * 1024,  # Convert to bytes
+                                'used_space': used_kb * 1024,
+                                'available': avail_kb * 1024,
+                                'usage_percent': usage_pct
                             }
-                            container_info['filesystems'].append(filesystem)
+                            container_info['filesystems'].append(filesystem_info)
 
                             # Check thresholds
-                            if usage_percent >= self.CONFIG['THRESHOLDS']['LXC_CRITICAL']:
+                            if usage_pct >= self.CONFIG['THRESHOLDS']['LXC_CRITICAL']:
                                 lxc_health['status'] = 'CRITICAL'
-                                issue = f"LXC {vmid} critical storage usage: {usage_percent:.1f}% on {mountpoint}"
+                                issue = f"LXC {vmid} critical storage usage: {usage_pct}% on {mountpoint}"
                                 lxc_health['issues'].append(issue)
-                            elif usage_percent >= self.CONFIG['THRESHOLDS']['LXC_WARNING']:
+                            elif usage_pct >= self.CONFIG['THRESHOLDS']['LXC_WARNING']:
                                 if lxc_health['status'] != 'CRITICAL':
                                     lxc_health['status'] = 'WARNING'
-                                issue = f"LXC {vmid} high storage usage: {usage_percent:.1f}% on {mountpoint}"
+                                issue = f"LXC {vmid} high storage usage: {usage_pct}% on {mountpoint}"
                                 lxc_health['issues'].append(issue)
 
-                            logger.debug(f"Filesystem details: {filesystem}")
-                        except Exception as e:
-                            logger.debug(f"Error processing line: {str(e)}")
-                            logger.debug(f"Full exception: {repr(e)}")
+                        except (ValueError, IndexError) as e:
+                            logger.debug(f"Error parsing df line '{fs_line}': {e}")
                             continue
 
-                # Only add container info if we have filesystem data
                 if container_info['filesystems']:
                     lxc_health['containers'].append(container_info)
-                    logger.debug(f"Added container info for VMID {vmid}")
-
-                logger.debug("=== LXC Storage Check Summary ===")
-                logger.debug(f"Status: {lxc_health['status']}")
-                logger.debug(f"Total containers checked: {len(lxc_health['containers'])}")
-                logger.debug(f"Issues found: {len(lxc_health['issues'])}")
-                logger.debug("=== End LXC Storage Check ===")
 
         except Exception as e:
-            logger.debug(f"Critical error during LXC storage check: {str(e)}")
+            logger.error(f"Error in LXC storage check: {e}")
             lxc_health['status'] = 'ERROR'
-            error_msg = f"Error checking LXC storage: {str(e)}"
-            lxc_health['issues'].append(error_msg)
+            lxc_health['issues'].append(f"Error checking LXC storage: {str(e)}")
 
         return lxc_health
 
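
The rewritten df parsing reads fixed columns and treats the size fields as 1K blocks (df's default), converting them to bytes for the report. A minimal sketch of that mapping against a sample line (the device path and numbers are illustrative, not taken from the commit):

sample_line = "/dev/mapper/pve-vm--101--disk--0  8154588  2612424  5106352  34% /"

fs_parts = sample_line.split()
filesystem_info = {
    'mountpoint': fs_parts[5],
    'total_space': int(fs_parts[1]) * 1024,   # KB -> bytes
    'used_space': int(fs_parts[2]) * 1024,
    'available': int(fs_parts[3]) * 1024,
    'usage_percent': int(fs_parts[4].rstrip('%')),
}
print(filesystem_info)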