diff --git a/hwmonDaemon.py b/hwmonDaemon.py
index 6b4d4de..4bad238 100644
--- a/hwmonDaemon.py
+++ b/hwmonDaemon.py
@@ -109,8 +109,9 @@ class SystemHealthMonitor:
     }
     MANUFACTURER_SMART_PROFILES = {
         'Ridata': {
-            'aliases': ['Ridata', 'Ritek', 'RIDATA', 'RITEK'],
-            'wear_leveling_behavior': 'countup',  # Based on your data, it counts up
+            'aliases': ['Ridata', 'Ritek', 'RIDATA', 'RITEK', 'SSD 512GB'],  # Add the generic model
+            'firmware_patterns': ['HT3618B7', 'HT36'],  # Add firmware pattern matching
+            'wear_leveling_behavior': 'countup',
             'wear_leveling_baseline': 0,
             'wear_leveling_thresholds': {
                 'warning': 500000,  # Much higher threshold for countup behavior
@@ -1105,12 +1106,20 @@ class SystemHealthMonitor:
             logger.debug(f"Could not parse SMART value: {raw_value}")
             return 0
 
-    def _get_manufacturer_profile(self, model: str, manufacturer: str = None) -> Dict[str, Any]:
+    def _get_manufacturer_profile(self, model: str, manufacturer: str = None, firmware: str = None) -> Dict[str, Any]:
         """
-        Get manufacturer-specific SMART profile based on drive model/manufacturer.
+        Get manufacturer-specific SMART profile based on drive model/manufacturer/firmware.
         """
         # Check each manufacturer profile
         for mfg, profile in self.MANUFACTURER_SMART_PROFILES.items():
+            # Check firmware patterns first (most specific for OEM drives)
+            if firmware and 'firmware_patterns' in profile:
+                for pattern in profile['firmware_patterns']:
+                    if pattern in firmware:
+                        logger.debug(f"Matched manufacturer profile: {mfg} for firmware: {firmware}")
+                        return profile
+
+            # Check model/manufacturer aliases
             for alias in profile['aliases']:
                 if alias.lower() in model.lower() or (manufacturer and alias.lower() in manufacturer.lower()):
                     logger.debug(f"Matched manufacturer profile: {mfg} for model: {model}")
@@ -1124,7 +1133,7 @@ class SystemHealthMonitor:
         """
         Determine if a drive is considered "new" based on power-on hours.
""" - return power_on_hours < 168 # Less than 1 week of runtime + return power_on_hours < 720 # Less than 1 week of runtime def _check_smart_health(self, device: str) -> Dict[str, Any]: """ @@ -1302,473 +1311,4 @@ class SystemHealthMonitor: logger.debug(f"=== SMART Health Check for {device} ===") logger.debug(f"Manufacturer profile: {manufacturer_profile.get('aliases', ['Unknown'])[0] if manufacturer_profile else 'None'}") logger.debug("Raw SMART attributes:") - for attr, value in smart_health['attributes'].items(): - logger.debug(f"{attr}: {value}") - logger.debug(f"Temperature: {smart_health['temp']}°C") - logger.debug(f"Is new drive: {is_new_drive}") - logger.debug(f"Detected Issues: {smart_health['issues']}") - logger.debug("=== End SMART Check ===\n") - - # Special handling for NVMe drives - if 'nvme' in device: - try: - nvme_result = subprocess.run( - ['nvme', 'smart-log', device], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - timeout=10 - ) - logger.debug(f"NVMe smart-log raw output for {device}:") - logger.debug(nvme_result.stdout) - - # Initialize the temperature attribute - if smart_health['temp'] is None: - smart_health['attributes']['Temperature_Celsius'] = None - - for line in nvme_result.stdout.split('\n'): - # Fix the NoneType error by checking if line exists and has content - if line and line.strip() and 'temperature' in line.lower(): - try: - temp_str = line.split(':')[1].strip() if ':' in line else line.strip() - logger.debug(f"Raw temperature string: {temp_str}") - - # Extract first temperature value more safely - digits = ''.join(c for c in temp_str if c.isdigit()) - if len(digits) >= 2: - temp_value = int(digits[:2]) - logger.debug(f"Parsed temperature value: {temp_value}") - - # Set both temperature fields - smart_health['temp'] = temp_value - smart_health['attributes']['Temperature_Celsius'] = temp_value - - logger.debug(f"Final temperature recorded: {smart_health['temp']}") - break - except (ValueError, IndexError, AttributeError) as e: - logger.debug(f"Error parsing NVMe temperature from line '{line}': {e}") - continue - except subprocess.TimeoutExpired: - logger.debug(f"NVMe smart-log for {device} timed out") - except Exception as e: - logger.debug(f"Error getting NVMe smart data for {device}: {e}") - - except subprocess.TimeoutExpired: - smart_health['status'] = 'ERROR' - smart_health['issues'].append("SMART check timed out") - except Exception as e: - smart_health['status'] = 'ERROR' - smart_health['severity'] = 'UNKNOWN' - smart_health['issues'].append(f"Error checking SMART: {str(e)}") - logger.debug(f"Exception in _check_smart_health for {device}: {e}") - import traceback - logger.debug(traceback.format_exc()) - - return smart_health - - def _check_drives_health(self) -> Dict[str, Any]: - drives_health = {'overall_status': 'NORMAL', 'drives': []} - - try: - # Get only valid physical disks - physical_disks = self._get_all_disks() - logger.debug(f"Checking physical disks: {physical_disks}") - - if not physical_disks: - logger.warning("No valid physical disks found for monitoring") - drives_health['overall_status'] = 'WARNING' - return drives_health - - # Get ALL partition information including device mapper - partitions = psutil.disk_partitions(all=True) - - # Create mapping of base devices to their partitions - device_partitions = {} - for part in partitions: - # Extract base device (e.g., /dev/sda from /dev/sda1) - base_device = re.match(r'(/dev/[a-z]+)', part.device) - if base_device: - base_dev = base_device.group(1) - if base_dev not 
in device_partitions: - device_partitions[base_dev] = [] - device_partitions[base_dev].append(part) - - overall_status = 'NORMAL' - for disk in physical_disks: - drive_report = { - 'device': disk, - 'partitions': [], - 'smart_status': 'UNKNOWN', - 'usage_percent': 0 - } - - # Add partition information if available - if disk in device_partitions: - total_used = 0 - total_space = 0 - for partition in device_partitions[disk]: - try: - usage = psutil.disk_usage(partition.mountpoint) - total_used += usage.used - total_space += usage.total - part_info = { - 'device': partition.device, - 'mountpoint': partition.mountpoint, - 'fstype': partition.fstype, - 'total_space': self._convert_bytes(usage.total), - 'used_space': self._convert_bytes(usage.used), - 'free_space': self._convert_bytes(usage.free), - 'usage_percent': usage.percent - } - drive_report['partitions'].append(part_info) - except Exception as e: - logger.debug(f"Error getting partition usage for {partition.device}: {e}") - - # Calculate overall drive usage percentage - if total_space > 0: - drive_report['usage_percent'] = (total_used / total_space) * 100 - - # Check SMART health - smart_health = self._check_smart_health(disk) - drive_report.update({ - 'smart_status': smart_health['status'], - 'smart_issues': smart_health['issues'], - 'temperature': smart_health['temp'], - 'smart_attributes': smart_health['attributes'] - }) - - # Only report issues for drives that should be monitored - if smart_health['status'] == 'UNHEALTHY': - overall_status = 'CRITICAL' - elif smart_health['status'] == 'ERROR': - # Don't escalate overall status for ERROR drives (might be virtual) - logger.debug(f"Drive {disk} returned ERROR status, skipping from issue detection") - elif smart_health['issues'] and smart_health['status'] not in ['ERROR', 'NOT_SUPPORTED']: - if overall_status != 'CRITICAL': - overall_status = 'WARNING' - - drives_health['drives'].append(drive_report) - - drives_health['overall_status'] = overall_status - - except Exception as e: - logger.error(f"Error checking drives health: {str(e)}") - - return drives_health - - @staticmethod - def _convert_bytes(bytes_value: int, suffix: str = 'B') -> str: - """ - Convert bytes to a human-readable format. - - :param bytes_value: Number of bytes to convert. - :param suffix: Suffix to append (default is 'B' for bytes). - :return: Formatted string with the size in human-readable form. - """ - for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']: - if abs(bytes_value) < 1024.0: - return f"{bytes_value:.1f}{unit}{suffix}" - bytes_value /= 1024.0 - return f"{bytes_value:.1f}Y{suffix}" - - def _convert_size_to_bytes(self, size_str: str) -> float: - """Convert size string with units to bytes""" - units = {'B': 1, 'K': 1024, 'M': 1024**2, 'G': 1024**3, 'T': 1024**4} - size = float(size_str[:-1]) - unit = size_str[-1].upper() - return size * units[unit] - - def _check_memory_usage(self) -> Dict[str, Any]: - """ - Check for ECC memory errors if ECC memory is present. 
- """ - memory_health = { - 'has_ecc': False, - 'ecc_errors': [], - 'status': 'OK', - 'total_memory': self._convert_bytes(psutil.virtual_memory().total), - 'used_memory': self._convert_bytes(psutil.virtual_memory().used), - 'memory_percent': psutil.virtual_memory().percent - } - - try: - # First check using dmidecode - result = subprocess.run( - ['dmidecode', '--type', 'memory'], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True - ) - if 'Error Correction Type: Multi-bit ECC' in result.stdout: - memory_health['has_ecc'] = True - - # If dmidecode didn't find ECC, try the edac method as backup - if not memory_health['has_ecc']: - edac_path = '/sys/devices/system/edac/mc' - if os.path.exists(edac_path) and os.listdir(edac_path): - for mc_dir in glob.glob('/sys/devices/system/edac/mc/mc[0-9]*'): - if os.path.exists(f"{mc_dir}/csrow0"): - memory_health['has_ecc'] = True - break - - # If ECC is present, check for errors - if memory_health['has_ecc']: - for mc_dir in glob.glob('/sys/devices/system/edac/mc/mc[0-9]*'): - if os.path.exists(f"{mc_dir}/csrow0"): - ue_count = self._read_ecc_count(f"{mc_dir}/csrow0/ue_count") - if ue_count > 0: - memory_health['status'] = 'CRITICAL' - memory_health['ecc_errors'].append( - f"Uncorrectable ECC errors detected in {os.path.basename(mc_dir)}: {ue_count}" - ) - - ce_count = self._read_ecc_count(f"{mc_dir}/csrow0/ce_count") - if ce_count > 0: - if memory_health['status'] != 'CRITICAL': - memory_health['status'] = 'WARNING' - memory_health['ecc_errors'].append( - f"Correctable ECC errors detected in {os.path.basename(mc_dir)}: {ce_count}" - ) - - except Exception as e: - memory_health['status'] = 'ERROR' - memory_health['ecc_errors'].append(f"Error checking ECC status: {str(e)}") - - return memory_health - - def _read_ecc_count(self, filepath: str) -> int: - """ - Read ECC error count from a file. - - :param filepath: Path to the ECC count file - :return: Number of ECC errors - """ - try: - with open(filepath, 'r') as f: - return int(f.read().strip()) - except: - return 0 - - def _check_cpu_usage(self) -> Dict[str, Any]: - """ - Check CPU usage and return health metrics. - - :return: Dictionary with CPU health metrics. - """ - cpu_usage_percent = psutil.cpu_percent(interval=1) - cpu_health = { - 'cpu_usage_percent': cpu_usage_percent, - 'status': 'OK' if cpu_usage_percent < self.CONFIG['THRESHOLDS']['CPU_WARNING'] else 'WARNING' - } - return cpu_health - - def _check_network_status(self) -> Dict[str, Any]: - """ - Check the status of network interfaces and report any issues. - - :return: Dictionary containing network health metrics and any issues found. 
- """ - network_health = { - 'management_network': { - 'issues': [], - 'status': 'OK', - 'latency': None - }, - 'ceph_network': { - 'issues': [], - 'status': 'OK', - 'latency': None - } - } - - try: - # Check management network connectivity - mgmt_result = subprocess.run( - [ - "ping", - "-c", str(self.CONFIG['NETWORKS']['PING_COUNT']), - "-W", str(self.CONFIG['NETWORKS']['PING_TIMEOUT']), - self.CONFIG['NETWORKS']['MANAGEMENT'] - ], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True - ) - - if mgmt_result.returncode != 0: - network_health['management_network']['status'] = 'CRITICAL' - network_health['management_network']['issues'].append( - "Management network is unreachable" - ) - - # Check Ceph network connectivity - ceph_result = subprocess.run( - [ - "ping", - "-c", str(self.CONFIG['NETWORKS']['PING_COUNT']), - "-W", str(self.CONFIG['NETWORKS']['PING_TIMEOUT']), - self.CONFIG['NETWORKS']['CEPH'] - ], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True - ) - - if ceph_result.returncode != 0: - network_health['ceph_network']['status'] = 'CRITICAL' - network_health['ceph_network']['issues'].append( - "Ceph network is unreachable" - ) - - return network_health - - except Exception as e: - logger.error(f"Network health check failed: {e}") - return { - 'status': 'ERROR', - 'error': str(e) - } - - def _check_lxc_storage(self) -> Dict[str, Any]: - """ - Check storage utilization for all running LXC containers - """ - logger.debug("Starting LXC storage check") - lxc_health = { - 'status': 'OK', - 'containers': [], - 'issues': [] - } - - try: - result = subprocess.run( - ['pct', 'list'], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True - ) - logger.debug(f"pct list output:\n{result.stdout}") - - for line in result.stdout.split('\n')[1:]: - if not line.strip(): - continue - - parts = line.split() - if len(parts) < 2: - logger.debug(f"Skipping invalid line: {line}") - continue - - vmid, status = parts[0], parts[1] - - if status.lower() == 'running': - logger.debug(f"Checking container {vmid} disk usage") - disk_info = subprocess.run( - ['pct', 'df', vmid], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True - ) - - container_info = { - 'vmid': vmid, - 'filesystems': [] - } - - for fs_line in disk_info.stdout.split('\n')[1:]: - if not fs_line.strip() or 'MP' in fs_line: - continue - - columns = line.split() - logger.debug(f"Split parts: {parts}") - if len(columns) >= 6: - try: - # Skip excluded mounts - if parts[0].startswith('appPool:') or '/mnt/pve/mediaf' in parts[0]: - continue - - # Get the mountpoint (last column) - if len(parts) > 5: - # The mountpoint is the last column - mountpoint = columns[-1] - else: - mountpoint = "/" - - # Skip excluded mountpoints - if self._is_excluded_mount(mountpoint): - logger.debug(f"Skipping excluded mount: {mountpoint}") - continue - - # Parse size values safely - total_space = self._parse_size(columns[-5]) - used_space = self._parse_size(columns[-4]) - available_space = self._parse_size(columns[-3]) - - # Parse percentage safely - try: - usage_percent = float(columns[-2].rstrip('%')) - except (ValueError, IndexError): - # Calculate percentage if parsing fails - usage_percent = (used_space / total_space * 100) if total_space > 0 else 0 - - filesystem = { - 'mountpoint': mountpoint, - 'total_space': total_space, - 'used_space': used_space, - 'available': available_space, - 'usage_percent': usage_percent - } - container_info['filesystems'].append(filesystem) - - # Check thresholds - if usage_percent >= 
self.CONFIG['THRESHOLDS']['LXC_CRITICAL']: - lxc_health['status'] = 'CRITICAL' - issue = f"LXC {vmid} critical storage usage: {usage_percent:.1f}% on {mountpoint}" - lxc_health['issues'].append(issue) - elif usage_percent >= self.CONFIG['THRESHOLDS']['LXC_WARNING']: - if lxc_health['status'] != 'CRITICAL': - lxc_health['status'] = 'WARNING' - issue = f"LXC {vmid} high storage usage: {usage_percent:.1f}% on {mountpoint}" - lxc_health['issues'].append(issue) - - logger.debug(f"Filesystem details: {filesystem}") - except Exception as e: - logger.debug(f"Error processing line: {str(e)}") - logger.debug(f"Full exception: {repr(e)}") - continue - - # Only add container info if we have filesystem data - if container_info['filesystems']: - lxc_health['containers'].append(container_info) - logger.debug(f"Added container info for VMID {vmid}") - - logger.debug("=== LXC Storage Check Summary ===") - logger.debug(f"Status: {lxc_health['status']}") - logger.debug(f"Total containers checked: {len(lxc_health['containers'])}") - logger.debug(f"Issues found: {len(lxc_health['issues'])}") - logger.debug("=== End LXC Storage Check ===") - - except Exception as e: - logger.debug(f"Critical error during LXC storage check: {str(e)}") - lxc_health['status'] = 'ERROR' - error_msg = f"Error checking LXC storage: {str(e)}" - lxc_health['issues'].append(error_msg) - - return lxc_health - -def main(): - parser = argparse.ArgumentParser(description="System Health Monitor") - parser.add_argument( - "--dry-run", - action="store_true", - help="Enable dry-run mode (simulate ticket creation without actual API calls)." - ) - args = parser.parse_args() - - monitor = SystemHealthMonitor( - ticket_api_url=SystemHealthMonitor.CONFIG['TICKET_API_URL'], - dry_run=args.dry_run - ) - monitor.run() - -if __name__ == "__main__": - main() + for attr, value in smart_health['attributes']. \ No newline at end of file
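
For reviewers: a minimal standalone sketch of the lookup order this patch establishes, where firmware substring patterns are consulted before model/manufacturer aliases, so an OEM drive that reports only a generic model string ("SSD 512GB") can still be identified by its vendor firmware. get_profile() is a hypothetical free-standing stand-in for _get_manufacturer_profile(), and the profile table is abbreviated from the patch:

# Hypothetical stand-in for _get_manufacturer_profile(); the profile
# table is abbreviated from the patch above.
from typing import Any, Dict, Optional

MANUFACTURER_SMART_PROFILES: Dict[str, Dict[str, Any]] = {
    'Ridata': {
        'aliases': ['Ridata', 'Ritek', 'RIDATA', 'RITEK', 'SSD 512GB'],
        'firmware_patterns': ['HT3618B7', 'HT36'],
    },
}

def get_profile(model: str, manufacturer: Optional[str] = None,
                firmware: Optional[str] = None) -> Optional[Dict[str, Any]]:
    for mfg, profile in MANUFACTURER_SMART_PROFILES.items():
        # Firmware patterns first: the most specific signal for OEM drives.
        if firmware and 'firmware_patterns' in profile:
            if any(pattern in firmware for pattern in profile['firmware_patterns']):
                return profile
        # Fall back to model/manufacturer alias matching.
        for alias in profile['aliases']:
            if alias.lower() in model.lower() or (
                    manufacturer and alias.lower() in manufacturer.lower()):
                return profile
    return None

if __name__ == '__main__':
    # A generic model string that only the firmware pattern can identify.
    assert get_profile('Generic Model', firmware='HT3618B7C') is not None
    # The alias path still matches drives that report a real model name.
    assert get_profile('RITEK R5 512GB') is not None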
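
A note on the 'countup' wear-leveling profile: the behavior flag and the unusually large 'warning' threshold only matter to whatever code later evaluates the wear-leveling attribute, and that evaluation is outside this patch. The sketch below is therefore only an assumption about how such a profile might be consumed; classify_wear, the 'critical' key, and the sample value are illustrative, not part of the daemon:

# Assumed consumer of the profile fields; not taken from hwmonDaemon.py.
def classify_wear(raw_value: int, profile: dict) -> str:
    thresholds = profile['wear_leveling_thresholds']
    if profile.get('wear_leveling_behavior') == 'countup':
        # Countup firmware (per the patch, Ridata/Ritek) reports an
        # ever-increasing counter, so thresholds are absolute counts.
        if raw_value >= thresholds.get('critical', float('inf')):
            return 'CRITICAL'
        if raw_value >= thresholds['warning']:
            return 'WARNING'
        return 'OK'
    # Countdown style (the common case): the value starts near 100 and
    # decreases, so lower readings indicate more wear.
    if raw_value <= thresholds.get('critical', 0):
        return 'CRITICAL'
    if raw_value <= thresholds.get('warning', 0):
        return 'WARNING'
    return 'OK'

print(classify_wear(12000, {
    'wear_leveling_behavior': 'countup',
    'wear_leveling_thresholds': {'warning': 500000},
}))  # -> OK: well under the countup warning threshold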