Better identification of RiData drives
hwmonDaemon.py (490 lines changed)
@@ -109,8 +109,9 @@ class SystemHealthMonitor:
     }
     MANUFACTURER_SMART_PROFILES = {
         'Ridata': {
-            'aliases': ['Ridata', 'Ritek', 'RIDATA', 'RITEK'],
-            'wear_leveling_behavior': 'countup',  # Based on your data, it counts up
+            'aliases': ['Ridata', 'Ritek', 'RIDATA', 'RITEK', 'SSD 512GB'],  # Add the generic model
+            'firmware_patterns': ['HT3618B7', 'HT36'],  # Add firmware pattern matching
+            'wear_leveling_behavior': 'countup',
             'wear_leveling_baseline': 0,
             'wear_leveling_thresholds': {
                 'warning': 500000,  # Much higher threshold for countup behavior
@@ -1105,12 +1106,20 @@ class SystemHealthMonitor:
                 logger.debug(f"Could not parse SMART value: {raw_value}")
                 return 0

-    def _get_manufacturer_profile(self, model: str, manufacturer: str = None) -> Dict[str, Any]:
+    def _get_manufacturer_profile(self, model: str, manufacturer: str = None, firmware: str = None) -> Dict[str, Any]:
         """
-        Get manufacturer-specific SMART profile based on drive model/manufacturer.
+        Get manufacturer-specific SMART profile based on drive model/manufacturer/firmware.
         """
         # Check each manufacturer profile
         for mfg, profile in self.MANUFACTURER_SMART_PROFILES.items():
+            # Check firmware patterns first (most specific for OEM drives)
+            if firmware and 'firmware_patterns' in profile:
+                for pattern in profile['firmware_patterns']:
+                    if pattern in firmware:
+                        logger.debug(f"Matched manufacturer profile: {mfg} for firmware: {firmware}")
+                        return profile
+
+            # Check model/manufacturer aliases
             for alias in profile['aliases']:
                 if alias.lower() in model.lower() or (manufacturer and alias.lower() in manufacturer.lower()):
                     logger.debug(f"Matched manufacturer profile: {mfg} for model: {model}")
@@ -1124,7 +1133,7 @@ class SystemHealthMonitor:
         """
        Determine if a drive is considered "new" based on power-on hours.
         """
-        return power_on_hours < 168  # Less than 1 week of runtime
+        return power_on_hours < 720  # Less than 30 days of runtime

     def _check_smart_health(self, device: str) -> Dict[str, Any]:
         """
@@ -1302,473 +1311,4 @@ class SystemHealthMonitor:
             logger.debug(f"=== SMART Health Check for {device} ===")
             logger.debug(f"Manufacturer profile: {manufacturer_profile.get('aliases', ['Unknown'])[0] if manufacturer_profile else 'None'}")
             logger.debug("Raw SMART attributes:")
             for attr, value in smart_health['attributes'].items():
-                logger.debug(f"{attr}: {value}")
-            logger.debug(f"Temperature: {smart_health['temp']}°C")
-            logger.debug(f"Is new drive: {is_new_drive}")
-            logger.debug(f"Detected Issues: {smart_health['issues']}")
-            logger.debug("=== End SMART Check ===\n")
-
-            # Special handling for NVMe drives
-            if 'nvme' in device:
-                try:
-                    nvme_result = subprocess.run(
-                        ['nvme', 'smart-log', device],
-                        stdout=subprocess.PIPE,
-                        stderr=subprocess.PIPE,
-                        text=True,
-                        timeout=10
-                    )
-                    logger.debug(f"NVMe smart-log raw output for {device}:")
-                    logger.debug(nvme_result.stdout)
-
-                    # Initialize the temperature attribute
-                    if smart_health['temp'] is None:
-                        smart_health['attributes']['Temperature_Celsius'] = None
-
-                    for line in nvme_result.stdout.split('\n'):
-                        # Fix the NoneType error by checking if line exists and has content
-                        if line and line.strip() and 'temperature' in line.lower():
-                            try:
-                                temp_str = line.split(':')[1].strip() if ':' in line else line.strip()
-                                logger.debug(f"Raw temperature string: {temp_str}")
-
-                                # Extract first temperature value more safely
-                                digits = ''.join(c for c in temp_str if c.isdigit())
-                                if len(digits) >= 2:
-                                    temp_value = int(digits[:2])
-                                    logger.debug(f"Parsed temperature value: {temp_value}")
-
-                                    # Set both temperature fields
-                                    smart_health['temp'] = temp_value
-                                    smart_health['attributes']['Temperature_Celsius'] = temp_value
-
-                                    logger.debug(f"Final temperature recorded: {smart_health['temp']}")
-                                    break
-                            except (ValueError, IndexError, AttributeError) as e:
-                                logger.debug(f"Error parsing NVMe temperature from line '{line}': {e}")
-                                continue
-                except subprocess.TimeoutExpired:
-                    logger.debug(f"NVMe smart-log for {device} timed out")
-                except Exception as e:
-                    logger.debug(f"Error getting NVMe smart data for {device}: {e}")
-
-        except subprocess.TimeoutExpired:
-            smart_health['status'] = 'ERROR'
-            smart_health['issues'].append("SMART check timed out")
-        except Exception as e:
-            smart_health['status'] = 'ERROR'
-            smart_health['severity'] = 'UNKNOWN'
-            smart_health['issues'].append(f"Error checking SMART: {str(e)}")
-            logger.debug(f"Exception in _check_smart_health for {device}: {e}")
-            import traceback
-            logger.debug(traceback.format_exc())
-
-        return smart_health
-
-    def _check_drives_health(self) -> Dict[str, Any]:
-        drives_health = {'overall_status': 'NORMAL', 'drives': []}
-
-        try:
-            # Get only valid physical disks
-            physical_disks = self._get_all_disks()
-            logger.debug(f"Checking physical disks: {physical_disks}")
-
-            if not physical_disks:
-                logger.warning("No valid physical disks found for monitoring")
-                drives_health['overall_status'] = 'WARNING'
-                return drives_health
-
-            # Get ALL partition information including device mapper
-            partitions = psutil.disk_partitions(all=True)
-
-            # Create mapping of base devices to their partitions
-            device_partitions = {}
-            for part in partitions:
-                # Extract base device (e.g., /dev/sda from /dev/sda1)
-                base_device = re.match(r'(/dev/[a-z]+)', part.device)
-                if base_device:
-                    base_dev = base_device.group(1)
-                    if base_dev not in device_partitions:
-                        device_partitions[base_dev] = []
-                    device_partitions[base_dev].append(part)
-
-            overall_status = 'NORMAL'
-            for disk in physical_disks:
-                drive_report = {
-                    'device': disk,
-                    'partitions': [],
-                    'smart_status': 'UNKNOWN',
-                    'usage_percent': 0
-                }
-
-                # Add partition information if available
-                if disk in device_partitions:
-                    total_used = 0
-                    total_space = 0
-                    for partition in device_partitions[disk]:
-                        try:
-                            usage = psutil.disk_usage(partition.mountpoint)
-                            total_used += usage.used
-                            total_space += usage.total
-                            part_info = {
-                                'device': partition.device,
-                                'mountpoint': partition.mountpoint,
-                                'fstype': partition.fstype,
-                                'total_space': self._convert_bytes(usage.total),
-                                'used_space': self._convert_bytes(usage.used),
-                                'free_space': self._convert_bytes(usage.free),
-                                'usage_percent': usage.percent
-                            }
-                            drive_report['partitions'].append(part_info)
-                        except Exception as e:
-                            logger.debug(f"Error getting partition usage for {partition.device}: {e}")
-
-                    # Calculate overall drive usage percentage
-                    if total_space > 0:
-                        drive_report['usage_percent'] = (total_used / total_space) * 100
-
-                # Check SMART health
-                smart_health = self._check_smart_health(disk)
-                drive_report.update({
-                    'smart_status': smart_health['status'],
-                    'smart_issues': smart_health['issues'],
-                    'temperature': smart_health['temp'],
-                    'smart_attributes': smart_health['attributes']
-                })
-
-                # Only report issues for drives that should be monitored
-                if smart_health['status'] == 'UNHEALTHY':
-                    overall_status = 'CRITICAL'
-                elif smart_health['status'] == 'ERROR':
-                    # Don't escalate overall status for ERROR drives (might be virtual)
-                    logger.debug(f"Drive {disk} returned ERROR status, skipping from issue detection")
-                elif smart_health['issues'] and smart_health['status'] not in ['ERROR', 'NOT_SUPPORTED']:
-                    if overall_status != 'CRITICAL':
-                        overall_status = 'WARNING'
-
-                drives_health['drives'].append(drive_report)
-
-            drives_health['overall_status'] = overall_status
-
-        except Exception as e:
-            logger.error(f"Error checking drives health: {str(e)}")
-
-        return drives_health
-
-    @staticmethod
-    def _convert_bytes(bytes_value: int, suffix: str = 'B') -> str:
-        """
-        Convert bytes to a human-readable format.
-
-        :param bytes_value: Number of bytes to convert.
-        :param suffix: Suffix to append (default is 'B' for bytes).
-        :return: Formatted string with the size in human-readable form.
-        """
-        for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']:
-            if abs(bytes_value) < 1024.0:
-                return f"{bytes_value:.1f}{unit}{suffix}"
-            bytes_value /= 1024.0
-        return f"{bytes_value:.1f}Y{suffix}"
-
-    def _convert_size_to_bytes(self, size_str: str) -> float:
-        """Convert size string with units to bytes"""
-        units = {'B': 1, 'K': 1024, 'M': 1024**2, 'G': 1024**3, 'T': 1024**4}
-        size = float(size_str[:-1])
-        unit = size_str[-1].upper()
-        return size * units[unit]
-
-    def _check_memory_usage(self) -> Dict[str, Any]:
-        """
-        Check for ECC memory errors if ECC memory is present.
-        """
-        memory_health = {
-            'has_ecc': False,
-            'ecc_errors': [],
-            'status': 'OK',
-            'total_memory': self._convert_bytes(psutil.virtual_memory().total),
-            'used_memory': self._convert_bytes(psutil.virtual_memory().used),
-            'memory_percent': psutil.virtual_memory().percent
-        }
-
-        try:
-            # First check using dmidecode
-            result = subprocess.run(
-                ['dmidecode', '--type', 'memory'],
-                stdout=subprocess.PIPE,
-                stderr=subprocess.PIPE,
-                text=True
-            )
-            if 'Error Correction Type: Multi-bit ECC' in result.stdout:
-                memory_health['has_ecc'] = True
-
-            # If dmidecode didn't find ECC, try the edac method as backup
-            if not memory_health['has_ecc']:
-                edac_path = '/sys/devices/system/edac/mc'
-                if os.path.exists(edac_path) and os.listdir(edac_path):
-                    for mc_dir in glob.glob('/sys/devices/system/edac/mc/mc[0-9]*'):
-                        if os.path.exists(f"{mc_dir}/csrow0"):
-                            memory_health['has_ecc'] = True
-                            break
-
-            # If ECC is present, check for errors
-            if memory_health['has_ecc']:
-                for mc_dir in glob.glob('/sys/devices/system/edac/mc/mc[0-9]*'):
-                    if os.path.exists(f"{mc_dir}/csrow0"):
-                        ue_count = self._read_ecc_count(f"{mc_dir}/csrow0/ue_count")
-                        if ue_count > 0:
-                            memory_health['status'] = 'CRITICAL'
-                            memory_health['ecc_errors'].append(
-                                f"Uncorrectable ECC errors detected in {os.path.basename(mc_dir)}: {ue_count}"
-                            )
-
-                        ce_count = self._read_ecc_count(f"{mc_dir}/csrow0/ce_count")
-                        if ce_count > 0:
-                            if memory_health['status'] != 'CRITICAL':
-                                memory_health['status'] = 'WARNING'
-                            memory_health['ecc_errors'].append(
-                                f"Correctable ECC errors detected in {os.path.basename(mc_dir)}: {ce_count}"
-                            )
-
-        except Exception as e:
-            memory_health['status'] = 'ERROR'
-            memory_health['ecc_errors'].append(f"Error checking ECC status: {str(e)}")
-
-        return memory_health
-
-    def _read_ecc_count(self, filepath: str) -> int:
-        """
-        Read ECC error count from a file.
-
-        :param filepath: Path to the ECC count file
-        :return: Number of ECC errors
-        """
-        try:
-            with open(filepath, 'r') as f:
-                return int(f.read().strip())
-        except:
-            return 0
-
-    def _check_cpu_usage(self) -> Dict[str, Any]:
-        """
-        Check CPU usage and return health metrics.
-
-        :return: Dictionary with CPU health metrics.
-        """
-        cpu_usage_percent = psutil.cpu_percent(interval=1)
-        cpu_health = {
-            'cpu_usage_percent': cpu_usage_percent,
-            'status': 'OK' if cpu_usage_percent < self.CONFIG['THRESHOLDS']['CPU_WARNING'] else 'WARNING'
-        }
-        return cpu_health
-
-    def _check_network_status(self) -> Dict[str, Any]:
-        """
-        Check the status of network interfaces and report any issues.
-
-        :return: Dictionary containing network health metrics and any issues found.
-        """
-        network_health = {
-            'management_network': {
-                'issues': [],
-                'status': 'OK',
-                'latency': None
-            },
-            'ceph_network': {
-                'issues': [],
-                'status': 'OK',
-                'latency': None
-            }
-        }
-
-        try:
-            # Check management network connectivity
-            mgmt_result = subprocess.run(
-                [
-                    "ping",
-                    "-c", str(self.CONFIG['NETWORKS']['PING_COUNT']),
-                    "-W", str(self.CONFIG['NETWORKS']['PING_TIMEOUT']),
-                    self.CONFIG['NETWORKS']['MANAGEMENT']
-                ],
-                stdout=subprocess.PIPE,
-                stderr=subprocess.PIPE,
-                text=True
-            )
-
-            if mgmt_result.returncode != 0:
-                network_health['management_network']['status'] = 'CRITICAL'
-                network_health['management_network']['issues'].append(
-                    "Management network is unreachable"
-                )
-
-            # Check Ceph network connectivity
-            ceph_result = subprocess.run(
-                [
-                    "ping",
-                    "-c", str(self.CONFIG['NETWORKS']['PING_COUNT']),
-                    "-W", str(self.CONFIG['NETWORKS']['PING_TIMEOUT']),
-                    self.CONFIG['NETWORKS']['CEPH']
-                ],
-                stdout=subprocess.PIPE,
-                stderr=subprocess.PIPE,
-                text=True
-            )
-
-            if ceph_result.returncode != 0:
-                network_health['ceph_network']['status'] = 'CRITICAL'
-                network_health['ceph_network']['issues'].append(
-                    "Ceph network is unreachable"
-                )
-
-            return network_health
-
-        except Exception as e:
-            logger.error(f"Network health check failed: {e}")
-            return {
-                'status': 'ERROR',
-                'error': str(e)
-            }
-
-    def _check_lxc_storage(self) -> Dict[str, Any]:
-        """
-        Check storage utilization for all running LXC containers
-        """
-        logger.debug("Starting LXC storage check")
-        lxc_health = {
-            'status': 'OK',
-            'containers': [],
-            'issues': []
-        }
-
-        try:
-            result = subprocess.run(
-                ['pct', 'list'],
-                stdout=subprocess.PIPE,
-                stderr=subprocess.PIPE,
-                text=True
-            )
-            logger.debug(f"pct list output:\n{result.stdout}")
-
-            for line in result.stdout.split('\n')[1:]:
-                if not line.strip():
-                    continue
-
-                parts = line.split()
-                if len(parts) < 2:
-                    logger.debug(f"Skipping invalid line: {line}")
-                    continue
-
-                vmid, status = parts[0], parts[1]
-
-                if status.lower() == 'running':
-                    logger.debug(f"Checking container {vmid} disk usage")
-                    disk_info = subprocess.run(
-                        ['pct', 'df', vmid],
-                        stdout=subprocess.PIPE,
-                        stderr=subprocess.PIPE,
-                        text=True
-                    )
-
-                    container_info = {
-                        'vmid': vmid,
-                        'filesystems': []
-                    }
-
-                    for fs_line in disk_info.stdout.split('\n')[1:]:
-                        if not fs_line.strip() or 'MP' in fs_line:
-                            continue
-
-                        columns = line.split()
-                        logger.debug(f"Split parts: {parts}")
-                        if len(columns) >= 6:
-                            try:
-                                # Skip excluded mounts
-                                if parts[0].startswith('appPool:') or '/mnt/pve/mediaf' in parts[0]:
-                                    continue
-
-                                # Get the mountpoint (last column)
-                                if len(parts) > 5:
-                                    # The mountpoint is the last column
-                                    mountpoint = columns[-1]
-                                else:
-                                    mountpoint = "/"
-
-                                # Skip excluded mountpoints
-                                if self._is_excluded_mount(mountpoint):
-                                    logger.debug(f"Skipping excluded mount: {mountpoint}")
-                                    continue
-
-                                # Parse size values safely
-                                total_space = self._parse_size(columns[-5])
-                                used_space = self._parse_size(columns[-4])
-                                available_space = self._parse_size(columns[-3])
-
-                                # Parse percentage safely
-                                try:
-                                    usage_percent = float(columns[-2].rstrip('%'))
-                                except (ValueError, IndexError):
-                                    # Calculate percentage if parsing fails
-                                    usage_percent = (used_space / total_space * 100) if total_space > 0 else 0
-
-                                filesystem = {
-                                    'mountpoint': mountpoint,
-                                    'total_space': total_space,
-                                    'used_space': used_space,
-                                    'available': available_space,
-                                    'usage_percent': usage_percent
-                                }
-                                container_info['filesystems'].append(filesystem)
-
-                                # Check thresholds
-                                if usage_percent >= self.CONFIG['THRESHOLDS']['LXC_CRITICAL']:
-                                    lxc_health['status'] = 'CRITICAL'
-                                    issue = f"LXC {vmid} critical storage usage: {usage_percent:.1f}% on {mountpoint}"
-                                    lxc_health['issues'].append(issue)
-                                elif usage_percent >= self.CONFIG['THRESHOLDS']['LXC_WARNING']:
-                                    if lxc_health['status'] != 'CRITICAL':
-                                        lxc_health['status'] = 'WARNING'
-                                    issue = f"LXC {vmid} high storage usage: {usage_percent:.1f}% on {mountpoint}"
-                                    lxc_health['issues'].append(issue)
-
-                                logger.debug(f"Filesystem details: {filesystem}")
-                            except Exception as e:
-                                logger.debug(f"Error processing line: {str(e)}")
-                                logger.debug(f"Full exception: {repr(e)}")
-                                continue
-
-                    # Only add container info if we have filesystem data
-                    if container_info['filesystems']:
-                        lxc_health['containers'].append(container_info)
-                        logger.debug(f"Added container info for VMID {vmid}")
-
-            logger.debug("=== LXC Storage Check Summary ===")
-            logger.debug(f"Status: {lxc_health['status']}")
-            logger.debug(f"Total containers checked: {len(lxc_health['containers'])}")
-            logger.debug(f"Issues found: {len(lxc_health['issues'])}")
-            logger.debug("=== End LXC Storage Check ===")
-
-        except Exception as e:
-            logger.debug(f"Critical error during LXC storage check: {str(e)}")
-            lxc_health['status'] = 'ERROR'
-            error_msg = f"Error checking LXC storage: {str(e)}"
-            lxc_health['issues'].append(error_msg)
-
-        return lxc_health
-
-def main():
-    parser = argparse.ArgumentParser(description="System Health Monitor")
-    parser.add_argument(
-        "--dry-run",
-        action="store_true",
-        help="Enable dry-run mode (simulate ticket creation without actual API calls)."
-    )
-    args = parser.parse_args()
-
-    monitor = SystemHealthMonitor(
-        ticket_api_url=SystemHealthMonitor.CONFIG['TICKET_API_URL'],
-        dry_run=args.dry_run
-    )
-    monitor.run()
-
-if __name__ == "__main__":
-    main()
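
Below is a minimal, self-contained sketch of how the firmware-based matching introduced in this commit is expected to behave. It mirrors the names used in the diff, but as a module-level function with a trimmed profile table; the model and firmware strings in the demo calls at the bottom are hypothetical examples of what smartctl might report, not values taken from the repository.

import logging
from typing import Any, Dict, Optional

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# Trimmed copy of the Ridata profile from the diff above.
MANUFACTURER_SMART_PROFILES: Dict[str, Dict[str, Any]] = {
    'Ridata': {
        'aliases': ['Ridata', 'Ritek', 'RIDATA', 'RITEK', 'SSD 512GB'],
        'firmware_patterns': ['HT3618B7', 'HT36'],
        'wear_leveling_behavior': 'countup',
    },
}


def get_manufacturer_profile(model: str,
                             manufacturer: Optional[str] = None,
                             firmware: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """Standalone sketch of the lookup added in this commit."""
    for mfg, profile in MANUFACTURER_SMART_PROFILES.items():
        # Firmware patterns are checked first: OEM drives often report a generic
        # model string ('SSD 512GB'), so the firmware revision is the stronger signal.
        if firmware and 'firmware_patterns' in profile:
            for pattern in profile['firmware_patterns']:
                if pattern in firmware:
                    logger.debug(f"Matched manufacturer profile: {mfg} for firmware: {firmware}")
                    return profile
        # Fall back to model/manufacturer alias matching.
        for alias in profile['aliases']:
            if alias.lower() in model.lower() or (manufacturer and alias.lower() in manufacturer.lower()):
                logger.debug(f"Matched manufacturer profile: {mfg} for model: {model}")
                return profile
    return None


# Hypothetical identify data: a generically labelled drive resolves via its firmware,
# a drive with a recognisable model string still resolves via the alias list.
print(get_manufacturer_profile('SSD 512GB', firmware='HT3618B7') is not None)  # True
print(get_manufacturer_profile('Ritek R5 256GB') is not None)                  # True

Checking firmware_patterns before the alias list is what lets a drive that only reports the generic 'SSD 512GB' model string still be recognised as a Ridata/Ritek unit.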