From 5ac12fd6b71ae4241cadd0cd3108d80554701207 Mon Sep 17 00:00:00 2001 From: Jared Vititoe Date: Thu, 29 May 2025 19:04:45 -0400 Subject: [PATCH] Correction of deleted code --- hwmonDaemon.py | 471 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 470 insertions(+), 1 deletion(-) diff --git a/hwmonDaemon.py b/hwmonDaemon.py index 4bad238..f64b145 100644 --- a/hwmonDaemon.py +++ b/hwmonDaemon.py @@ -1311,4 +1311,473 @@ class SystemHealthMonitor: logger.debug(f"=== SMART Health Check for {device} ===") logger.debug(f"Manufacturer profile: {manufacturer_profile.get('aliases', ['Unknown'])[0] if manufacturer_profile else 'None'}") logger.debug("Raw SMART attributes:") - for attr, value in smart_health['attributes']. \ No newline at end of file + for attr, value in smart_health['attributes'].items(): + logger.debug(f"{attr}: {value}") + logger.debug(f"Temperature: {smart_health['temp']}°C") + logger.debug(f"Is new drive: {is_new_drive}") + logger.debug(f"Detected Issues: {smart_health['issues']}") + logger.debug("=== End SMART Check ===\n") + + # Special handling for NVMe drives + if 'nvme' in device: + try: + nvme_result = subprocess.run( + ['nvme', 'smart-log', device], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + timeout=10 + ) + logger.debug(f"NVMe smart-log raw output for {device}:") + logger.debug(nvme_result.stdout) + + # Initialize the temperature attribute + if smart_health['temp'] is None: + smart_health['attributes']['Temperature_Celsius'] = None + + for line in nvme_result.stdout.split('\n'): + # Fix the NoneType error by checking if line exists and has content + if line and line.strip() and 'temperature' in line.lower(): + try: + temp_str = line.split(':')[1].strip() if ':' in line else line.strip() + logger.debug(f"Raw temperature string: {temp_str}") + + # Extract first temperature value more safely + digits = ''.join(c for c in temp_str if c.isdigit()) + if len(digits) >= 2: + temp_value = int(digits[:2]) + logger.debug(f"Parsed temperature value: {temp_value}") + + # Set both temperature fields + smart_health['temp'] = temp_value + smart_health['attributes']['Temperature_Celsius'] = temp_value + + logger.debug(f"Final temperature recorded: {smart_health['temp']}") + break + except (ValueError, IndexError, AttributeError) as e: + logger.debug(f"Error parsing NVMe temperature from line '{line}': {e}") + continue + except subprocess.TimeoutExpired: + logger.debug(f"NVMe smart-log for {device} timed out") + except Exception as e: + logger.debug(f"Error getting NVMe smart data for {device}: {e}") + + except subprocess.TimeoutExpired: + smart_health['status'] = 'ERROR' + smart_health['issues'].append("SMART check timed out") + except Exception as e: + smart_health['status'] = 'ERROR' + smart_health['severity'] = 'UNKNOWN' + smart_health['issues'].append(f"Error checking SMART: {str(e)}") + logger.debug(f"Exception in _check_smart_health for {device}: {e}") + import traceback + logger.debug(traceback.format_exc()) + + return smart_health + + def _check_drives_health(self) -> Dict[str, Any]: + drives_health = {'overall_status': 'NORMAL', 'drives': []} + + try: + # Get only valid physical disks + physical_disks = self._get_all_disks() + logger.debug(f"Checking physical disks: {physical_disks}") + + if not physical_disks: + logger.warning("No valid physical disks found for monitoring") + drives_health['overall_status'] = 'WARNING' + return drives_health + + # Get ALL partition information including device mapper + partitions = 
psutil.disk_partitions(all=True) + + # Create mapping of base devices to their partitions + device_partitions = {} + for part in partitions: + # Extract base device (e.g., /dev/sda from /dev/sda1) + base_device = re.match(r'(/dev/[a-z]+)', part.device) + if base_device: + base_dev = base_device.group(1) + if base_dev not in device_partitions: + device_partitions[base_dev] = [] + device_partitions[base_dev].append(part) + + overall_status = 'NORMAL' + for disk in physical_disks: + drive_report = { + 'device': disk, + 'partitions': [], + 'smart_status': 'UNKNOWN', + 'usage_percent': 0 + } + + # Add partition information if available + if disk in device_partitions: + total_used = 0 + total_space = 0 + for partition in device_partitions[disk]: + try: + usage = psutil.disk_usage(partition.mountpoint) + total_used += usage.used + total_space += usage.total + part_info = { + 'device': partition.device, + 'mountpoint': partition.mountpoint, + 'fstype': partition.fstype, + 'total_space': self._convert_bytes(usage.total), + 'used_space': self._convert_bytes(usage.used), + 'free_space': self._convert_bytes(usage.free), + 'usage_percent': usage.percent + } + drive_report['partitions'].append(part_info) + except Exception as e: + logger.debug(f"Error getting partition usage for {partition.device}: {e}") + + # Calculate overall drive usage percentage + if total_space > 0: + drive_report['usage_percent'] = (total_used / total_space) * 100 + + # Check SMART health + smart_health = self._check_smart_health(disk) + drive_report.update({ + 'smart_status': smart_health['status'], + 'smart_issues': smart_health['issues'], + 'temperature': smart_health['temp'], + 'smart_attributes': smart_health['attributes'] + }) + + # Only report issues for drives that should be monitored + if smart_health['status'] == 'UNHEALTHY': + overall_status = 'CRITICAL' + elif smart_health['status'] == 'ERROR': + # Don't escalate overall status for ERROR drives (might be virtual) + logger.debug(f"Drive {disk} returned ERROR status, skipping from issue detection") + elif smart_health['issues'] and smart_health['status'] not in ['ERROR', 'NOT_SUPPORTED']: + if overall_status != 'CRITICAL': + overall_status = 'WARNING' + + drives_health['drives'].append(drive_report) + + drives_health['overall_status'] = overall_status + + except Exception as e: + logger.error(f"Error checking drives health: {str(e)}") + + return drives_health + + @staticmethod + def _convert_bytes(bytes_value: int, suffix: str = 'B') -> str: + """ + Convert bytes to a human-readable format. + + :param bytes_value: Number of bytes to convert. + :param suffix: Suffix to append (default is 'B' for bytes). + :return: Formatted string with the size in human-readable form. + """ + for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']: + if abs(bytes_value) < 1024.0: + return f"{bytes_value:.1f}{unit}{suffix}" + bytes_value /= 1024.0 + return f"{bytes_value:.1f}Y{suffix}" + + def _convert_size_to_bytes(self, size_str: str) -> float: + """Convert size string with units to bytes""" + units = {'B': 1, 'K': 1024, 'M': 1024**2, 'G': 1024**3, 'T': 1024**4} + size = float(size_str[:-1]) + unit = size_str[-1].upper() + return size * units[unit] + + def _check_memory_usage(self) -> Dict[str, Any]: + """ + Check for ECC memory errors if ECC memory is present. 
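+
+        :return: Dictionary with ECC presence ('has_ecc'), any ECC error messages
+                 ('ecc_errors'), overall 'status', and total/used memory figures.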
+ """ + memory_health = { + 'has_ecc': False, + 'ecc_errors': [], + 'status': 'OK', + 'total_memory': self._convert_bytes(psutil.virtual_memory().total), + 'used_memory': self._convert_bytes(psutil.virtual_memory().used), + 'memory_percent': psutil.virtual_memory().percent + } + + try: + # First check using dmidecode + result = subprocess.run( + ['dmidecode', '--type', 'memory'], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True + ) + if 'Error Correction Type: Multi-bit ECC' in result.stdout: + memory_health['has_ecc'] = True + + # If dmidecode didn't find ECC, try the edac method as backup + if not memory_health['has_ecc']: + edac_path = '/sys/devices/system/edac/mc' + if os.path.exists(edac_path) and os.listdir(edac_path): + for mc_dir in glob.glob('/sys/devices/system/edac/mc/mc[0-9]*'): + if os.path.exists(f"{mc_dir}/csrow0"): + memory_health['has_ecc'] = True + break + + # If ECC is present, check for errors + if memory_health['has_ecc']: + for mc_dir in glob.glob('/sys/devices/system/edac/mc/mc[0-9]*'): + if os.path.exists(f"{mc_dir}/csrow0"): + ue_count = self._read_ecc_count(f"{mc_dir}/csrow0/ue_count") + if ue_count > 0: + memory_health['status'] = 'CRITICAL' + memory_health['ecc_errors'].append( + f"Uncorrectable ECC errors detected in {os.path.basename(mc_dir)}: {ue_count}" + ) + + ce_count = self._read_ecc_count(f"{mc_dir}/csrow0/ce_count") + if ce_count > 0: + if memory_health['status'] != 'CRITICAL': + memory_health['status'] = 'WARNING' + memory_health['ecc_errors'].append( + f"Correctable ECC errors detected in {os.path.basename(mc_dir)}: {ce_count}" + ) + + except Exception as e: + memory_health['status'] = 'ERROR' + memory_health['ecc_errors'].append(f"Error checking ECC status: {str(e)}") + + return memory_health + + def _read_ecc_count(self, filepath: str) -> int: + """ + Read ECC error count from a file. + + :param filepath: Path to the ECC count file + :return: Number of ECC errors + """ + try: + with open(filepath, 'r') as f: + return int(f.read().strip()) + except: + return 0 + + def _check_cpu_usage(self) -> Dict[str, Any]: + """ + Check CPU usage and return health metrics. + + :return: Dictionary with CPU health metrics. + """ + cpu_usage_percent = psutil.cpu_percent(interval=1) + cpu_health = { + 'cpu_usage_percent': cpu_usage_percent, + 'status': 'OK' if cpu_usage_percent < self.CONFIG['THRESHOLDS']['CPU_WARNING'] else 'WARNING' + } + return cpu_health + + def _check_network_status(self) -> Dict[str, Any]: + """ + Check the status of network interfaces and report any issues. + + :return: Dictionary containing network health metrics and any issues found. 
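+
+        Reachability is determined by pinging the configured management and Ceph
+        addresses using the PING_COUNT and PING_TIMEOUT settings from CONFIG['NETWORKS'].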
+        """
+        network_health = {
+            'management_network': {
+                'issues': [],
+                'status': 'OK',
+                'latency': None
+            },
+            'ceph_network': {
+                'issues': [],
+                'status': 'OK',
+                'latency': None
+            }
+        }
+
+        try:
+            # Check management network connectivity
+            mgmt_result = subprocess.run(
+                [
+                    "ping",
+                    "-c", str(self.CONFIG['NETWORKS']['PING_COUNT']),
+                    "-W", str(self.CONFIG['NETWORKS']['PING_TIMEOUT']),
+                    self.CONFIG['NETWORKS']['MANAGEMENT']
+                ],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True
+            )
+
+            if mgmt_result.returncode != 0:
+                network_health['management_network']['status'] = 'CRITICAL'
+                network_health['management_network']['issues'].append(
+                    "Management network is unreachable"
+                )
+
+            # Check Ceph network connectivity
+            ceph_result = subprocess.run(
+                [
+                    "ping",
+                    "-c", str(self.CONFIG['NETWORKS']['PING_COUNT']),
+                    "-W", str(self.CONFIG['NETWORKS']['PING_TIMEOUT']),
+                    self.CONFIG['NETWORKS']['CEPH']
+                ],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True
+            )
+
+            if ceph_result.returncode != 0:
+                network_health['ceph_network']['status'] = 'CRITICAL'
+                network_health['ceph_network']['issues'].append(
+                    "Ceph network is unreachable"
+                )
+
+            return network_health
+
+        except Exception as e:
+            logger.error(f"Network health check failed: {e}")
+            return {
+                'status': 'ERROR',
+                'error': str(e)
+            }
+
+    def _check_lxc_storage(self) -> Dict[str, Any]:
+        """
+        Check storage utilization for all running LXC containers.
+        """
+        logger.debug("Starting LXC storage check")
+        lxc_health = {
+            'status': 'OK',
+            'containers': [],
+            'issues': []
+        }
+
+        try:
+            result = subprocess.run(
+                ['pct', 'list'],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True
+            )
+            logger.debug(f"pct list output:\n{result.stdout}")
+
+            for line in result.stdout.split('\n')[1:]:
+                if not line.strip():
+                    continue
+
+                parts = line.split()
+                if len(parts) < 2:
+                    logger.debug(f"Skipping invalid line: {line}")
+                    continue
+
+                vmid, status = parts[0], parts[1]
+
+                if status.lower() == 'running':
+                    logger.debug(f"Checking container {vmid} disk usage")
+                    disk_info = subprocess.run(
+                        ['pct', 'df', vmid],
+                        stdout=subprocess.PIPE,
+                        stderr=subprocess.PIPE,
+                        text=True
+                    )
+
+                    container_info = {
+                        'vmid': vmid,
+                        'filesystems': []
+                    }
+
+                    for fs_line in disk_info.stdout.split('\n')[1:]:
+                        if not fs_line.strip() or 'MP' in fs_line:
+                            continue
+
+                        # Split the 'pct df' line, not the outer 'pct list' line
+                        columns = fs_line.split()
+                        logger.debug(f"Split columns: {columns}")
+                        if len(columns) >= 6:
+                            try:
+                                # Skip excluded storage pools and mounts
+                                if 'appPool:' in fs_line or '/mnt/pve/mediaf' in fs_line:
+                                    continue
+
+                                # Get the mountpoint (last column)
+                                mountpoint = columns[-1]
+
+                                # Skip excluded mountpoints
+                                if self._is_excluded_mount(mountpoint):
+                                    logger.debug(f"Skipping excluded mount: {mountpoint}")
+                                    continue
+
+                                # Parse size values safely
+                                total_space = self._parse_size(columns[-5])
+                                used_space = self._parse_size(columns[-4])
+                                available_space = self._parse_size(columns[-3])
+
+                                # Parse percentage safely
+                                try:
+                                    usage_percent = float(columns[-2].rstrip('%'))
+                                except (ValueError, IndexError):
+                                    # Calculate percentage if parsing fails
+                                    usage_percent = (used_space / total_space * 100) if total_space > 0 else 0
+
+                                filesystem = {
+                                    'mountpoint': mountpoint,
+                                    'total_space': total_space,
+                                    'used_space': used_space,
+                                    'available': available_space,
+                                    'usage_percent': usage_percent
+                                }
+                                container_info['filesystems'].append(filesystem)
+
+                                # Check thresholds
+                                if usage_percent >= 
self.CONFIG['THRESHOLDS']['LXC_CRITICAL']: + lxc_health['status'] = 'CRITICAL' + issue = f"LXC {vmid} critical storage usage: {usage_percent:.1f}% on {mountpoint}" + lxc_health['issues'].append(issue) + elif usage_percent >= self.CONFIG['THRESHOLDS']['LXC_WARNING']: + if lxc_health['status'] != 'CRITICAL': + lxc_health['status'] = 'WARNING' + issue = f"LXC {vmid} high storage usage: {usage_percent:.1f}% on {mountpoint}" + lxc_health['issues'].append(issue) + + logger.debug(f"Filesystem details: {filesystem}") + except Exception as e: + logger.debug(f"Error processing line: {str(e)}") + logger.debug(f"Full exception: {repr(e)}") + continue + + # Only add container info if we have filesystem data + if container_info['filesystems']: + lxc_health['containers'].append(container_info) + logger.debug(f"Added container info for VMID {vmid}") + + logger.debug("=== LXC Storage Check Summary ===") + logger.debug(f"Status: {lxc_health['status']}") + logger.debug(f"Total containers checked: {len(lxc_health['containers'])}") + logger.debug(f"Issues found: {len(lxc_health['issues'])}") + logger.debug("=== End LXC Storage Check ===") + + except Exception as e: + logger.debug(f"Critical error during LXC storage check: {str(e)}") + lxc_health['status'] = 'ERROR' + error_msg = f"Error checking LXC storage: {str(e)}" + lxc_health['issues'].append(error_msg) + + return lxc_health + +def main(): + parser = argparse.ArgumentParser(description="System Health Monitor") + parser.add_argument( + "--dry-run", + action="store_true", + help="Enable dry-run mode (simulate ticket creation without actual API calls)." + ) + args = parser.parse_args() + + monitor = SystemHealthMonitor( + ticket_api_url=SystemHealthMonitor.CONFIG['TICKET_API_URL'], + dry_run=args.dry_run + ) + monitor.run() + +if __name__ == "__main__": + main() \ No newline at end of file