diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..c0b2536 --- /dev/null +++ b/.flake8 @@ -0,0 +1,6 @@ +[flake8] +max-line-length = 120 +# F841: local variable assigned but never used — many are intentional debug/future-use assignments +# E501: line too long — URLs and log messages in monitoring code are exempt +extend-ignore = F841, E501 +exclude = __pycache__, .git diff --git a/.gitea/workflows/lint.yml b/.gitea/workflows/lint.yml new file mode 100644 index 0000000..b4c5095 --- /dev/null +++ b/.gitea/workflows/lint.yml @@ -0,0 +1,20 @@ +name: Lint + +on: + push: + branches: ["**"] + pull_request: + branches: ["**"] + +jobs: + python-lint: + name: Python (flake8) + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Install flake8 + run: pip install flake8 + + - name: Run flake8 + run: flake8 . diff --git a/hwmonDaemon.py b/hwmonDaemon.py index 2cce8a4..28f0e7f 100644 --- a/hwmonDaemon.py +++ b/hwmonDaemon.py @@ -1,5 +1,18 @@ #!/usr/bin/env python3 -import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re, glob, datetime, fcntl, textwrap, shutil +import os +import json +import requests +import psutil +import socket +import subprocess +import logging +import argparse +import re +import glob +import datetime +import fcntl +import textwrap +import shutil from concurrent.futures import ThreadPoolExecutor, as_completed from typing import Dict, Any, List @@ -22,7 +35,7 @@ class SystemHealthMonitor: # CLASS CONSTANTS AND CONFIGURATION # ============================================================================= STANDARD_WIDTH = 80 - + PRIORITIES = { 'CRITICAL': '1', # P1 - Cluster outages, total system failure 'HIGH': '2', # P2 - Hardware failures, same-day response @@ -83,14 +96,14 @@ class SystemHealthMonitor: # PBS (Proxmox Backup Server) issues 'PBS_ZFS_DEGRADED': PRIORITIES['CRITICAL'], # P1 - ZFS pool degraded - 'PBS_ZFS_USAGE_CRITICAL': PRIORITIES['HIGH'], # P2 - ZFS pool near full - 'PBS_ZFS_USAGE_WARNING': PRIORITIES['MEDIUM'],# P3 - ZFS pool usage high + 'PBS_ZFS_USAGE_CRITICAL': PRIORITIES['HIGH'], # P2 - ZFS pool near full + 'PBS_ZFS_USAGE_WARNING': PRIORITIES['MEDIUM'], # P3 - ZFS pool usage high 'PBS_ZFS_ERRORS': PRIORITIES['HIGH'], # P2 - ZFS pool has errors 'PBS_BACKUP_FAILED': PRIORITIES['HIGH'], # P2 - Backup job failed 'PBS_GC_FAILED': PRIORITIES['MEDIUM'], # P3 - Garbage collection failed 'PBS_SYNC_FAILED': PRIORITIES['MEDIUM'] # P3 - Sync job failed } - + CONFIG = { 'TICKET_API_URL': 'http://10.10.10.45/create_ticket_api.php', 'TICKET_API_KEY': None, # Will be loaded from .env file @@ -259,7 +272,7 @@ class SystemHealthMonitor: api_key = cls.CONFIG.get('TICKET_API_KEY') if not api_key or api_key == 'your_api_key_here': logger.warning("TICKET_API_KEY is not configured - ticket creation will fail (dry-run will still work)") - + TICKET_TEMPLATES = { 'ACTION_TYPE': { 'AUTO': '[auto]', @@ -272,7 +285,7 @@ class SystemHealthMonitor: 'ISSUE': '[issue]', # General issue (replaces invalid 'incident') 'PROBLEM': '[problem]', # Root cause investigation 'TASK': '[task]', # Planned work item - 'MAINTENANCE': '[maintenance]', # Scheduled/preventive work + 'MAINTENANCE': '[maintenance]', # Scheduled/preventive work 'UPGRADE': '[upgrade]' # Hardware/software upgrade }, 'HARDWARE_TYPE': { @@ -300,12 +313,12 @@ class SystemHealthMonitor: 'ISSUE': 'Issue', # General issue/incident 'PROBLEM': 'Problem', # Root cause investigation needed 'TASK': 'Task', # Planned work item - 'MAINTENANCE': 'Maintenance', # 
Scheduled/preventive work + 'MAINTENANCE': 'Maintenance', # Scheduled/preventive work 'UPGRADE': 'Upgrade', # Hardware/software upgrade 'INSTALL': 'Install', # New installation 'REQUEST': 'Request' # Service or information request } - + PROBLEMATIC_FIRMWARE = { 'Samsung': { 'EVO860': ['RVT01B6Q', 'RVT02B6Q'], # Known issues with sudden performance drops @@ -321,7 +334,7 @@ class SystemHealthMonitor: 'WD141KRYZ': ['02.01A02'] } } - + MANUFACTURER_SMART_PROFILES = { 'Western Digital': { 'aliases': ['WDC', 'Western Digital', 'HGST', 'Ultrastar'], @@ -410,7 +423,7 @@ class SystemHealthMonitor: 'description': 'OOS drives report high values normally' }, 'Seek_Error_Rate': { - 'monitor': False, # Skip monitoring - seems to be a counter + 'monitor': False, # Skip monitoring - seems to be a counter 'description': 'OOS drives report high values normally' }, 'Command_Timeout': { @@ -494,14 +507,14 @@ class SystemHealthMonitor: } } } - + SEVERITY_INDICATORS = { 'CRITICAL': '[CRIT]', 'WARNING': '[WARN]', 'HEALTHY': '[ OK ]', 'UNKNOWN': '[ ?? ]' } - + SMART_DESCRIPTIONS = { 'Reported_Uncorrect': """ Number of errors that could not be recovered using hardware ECC. @@ -509,168 +522,168 @@ class SystemHealthMonitor: - Indicates permanent data loss in affected sectors - High correlation with drive hardware failure - Critical reliability indicator - + Recommended Actions: 1. Backup critical data immediately 2. Check drive logs for related errors 3. Plan for drive replacement 4. Monitor for error count increases """, - + 'Reallocated_Sector_Ct': """ Number of sectors that have been reallocated due to errors. Impact: - High counts indicate degrading media - Each reallocation uses one of the drive's limited spare sectors - Rapid increases suggest accelerating drive wear - + Recommended Actions: 1. Monitor rate of increase 2. Check drive temperature 3. Plan replacement if count grows rapidly """, - + 'Current_Pending_Sector': """ Sectors waiting to be reallocated due to read/write errors. Impact: - Indicates potentially unstable sectors - May result in data loss if unrecoverable - Should be monitored for increases - + Recommended Actions: 1. Backup affected files 2. Run extended SMART tests 3. Monitor for conversion to reallocated sectors """, - + 'Offline_Uncorrectable': """ Count of uncorrectable errors detected during offline data collection. Impact: - Direct indicator of media reliability issues - May affect data integrity - High values suggest drive replacement needed - + Recommended Actions: 1. Run extended SMART tests 2. Check drive logs 3. Plan replacement if count is increasing """, - + 'Spin_Retry_Count': """ Number of spin start retry attempts. Impact: - Indicates potential motor or bearing issues - May predict imminent mechanical failure - Increasing values suggest degrading drive health - + Recommended Actions: 1. Monitor for rapid increases 2. Check drive temperature 3. Plan replacement if count grows rapidly """, - + 'Power_On_Hours': """ Total number of hours the device has been powered on. Impact: - Normal aging metric - Used to gauge overall drive lifetime - Compare against manufacturer's MTBF rating - + Recommended Actions: 1. Compare to warranty period 2. Plan replacement if approaching rated lifetime """, - + 'Media_Wearout_Indicator': """ Percentage of drive's rated life remaining (SSDs). Impact: - 100 indicates new drive - 0 indicates exceeded rated writes - Critical for SSD lifecycle management - + Recommended Actions: 1. Plan replacement below 20% 2. Monitor write workload 3. 
Consider workload redistribution """, - + 'Temperature_Celsius': """ Current drive temperature. Impact: - High temperatures accelerate wear - Optimal range: 20-45°C - Sustained high temps reduce lifespan - + Recommended Actions: 1. Check system cooling 2. Verify airflow 3. Monitor for sustained high temperatures """, - + 'Available_Spare': """ Percentage of spare blocks remaining (SSDs). Impact: - Critical for SSD endurance - Low values indicate approaching end-of-life - Rapid decreases suggest excessive writes - + Recommended Actions: 1. Plan replacement if below 20% 2. Monitor write patterns 3. Consider workload changes """, - + 'Program_Fail_Count': """ Number of flash program operation failures. Impact: - Indicates NAND cell reliability - Important for SSD health assessment - Increasing values suggest flash degradation - + Recommended Actions: 1. Monitor rate of increase 2. Check firmware updates 3. Plan replacement if rapidly increasing """, - + 'Erase_Fail_Count': """ Number of flash erase operation failures. Impact: - Related to NAND block health - Critical for SSD reliability - High counts suggest failing flash blocks - + Recommended Actions: 1. Monitor count increases 2. Check firmware version 3. Plan replacement if count is high """, - + 'Load_Cycle_Count': """ Number of power cycles and head load/unload events. Impact: - Normal operation metric - High counts may indicate power management issues - Compare against rated cycles (typically 600k-1M) - + Recommended Actions: 1. Review power management settings 2. Monitor rate of increase 3. Plan replacement near rated limit """, - + 'Wear_Leveling_Count': """ SSD block erase distribution metric. Impact: - Indicates wear pattern uniformity - Interpretation varies by manufacturer - Critical for SSD longevity - + Recommended Actions: 1. Monitor trend over time 2. Compare with manufacturer baseline 3. Check workload distribution - + Note: Different manufacturers use different counting methods: - Some count up from 0 (Samsung, etc.) - Others count down from baseline (Ridata, etc.) 
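The count-up versus count-down note above is why the manufacturer profiles later in this patch carry a 'behavior' field next to their warning/critical thresholds. A minimal sketch of a behavior-aware check, assuming that dict shape; the classify_wear name and the countdown comparisons are illustrative only, since the patch itself shows just the countup branch:

```python
def classify_wear(raw_value: int, thresholds: dict) -> str:
    """Hedged illustration: evaluate a wear counter against countup/countdown thresholds."""
    warn, crit = thresholds['warning'], thresholds['critical']
    if thresholds.get('behavior', 'countup') == 'countup':
        # e.g. Samsung-style counters that grow from 0 as the drive wears
        if raw_value >= crit:
            return 'CRITICAL'
        if raw_value >= warn:
            return 'WARNING'
    else:
        # assumed 'countdown' handling, e.g. Ridata-style values shrinking toward 0
        if raw_value <= crit:
            return 'CRITICAL'
        if raw_value <= warn:
            return 'WARNING'
    return 'HEALTHY'

# classify_wear(3, {'warning': 10, 'critical': 5, 'behavior': 'countdown'}) -> 'CRITICAL'
```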
@@ -858,7 +871,7 @@ class SystemHealthMonitor: import traceback logger.error(f"Unexpected error during health check: {e}") logger.error(traceback.format_exc()) - + def perform_health_checks(self) -> Dict[str, Any]: """Perform comprehensive system health checks and return a report.""" health_report = { @@ -873,24 +886,24 @@ class SystemHealthMonitor: 'system_health': self._check_system_drive_indicators(), 'pbs_health': self._check_pbs_health() } - + if self.dry_run: logger.info("\n=== System Health Summary ===") logger.info(f"Overall Drive Health: {health_report['drives_health']['overall_status']}") - + # Summarized drive information with usage logger.info("\nDrive Status:") for drive in health_report['drives_health']['drives']: issues = drive.get('smart_issues', []) temp = f", {drive.get('temperature')}°C" if drive.get('temperature') else "" status = "⚠️ " if issues else "✓ " - + # Disk usage information usage_info = "" if drive.get('partitions'): for partition in drive['partitions']: usage_info += f"\n └─ {partition['mountpoint']}: {partition['used_space']}/{partition['total_space']} ({partition['usage_percent']}% used)" - + logger.info(f"{status}{drive['device']}{temp} - SMART: {drive['smart_status']}{usage_info}") if issues: logger.info(f" Issues: {', '.join(issues)}") @@ -939,7 +952,7 @@ class SystemHealthMonitor: logger.info(f" Issues: {len(pbs['issues'])}") logger.info("\n=== End Summary ===") - + return health_report # ============================================================================= @@ -991,7 +1004,7 @@ class SystemHealthMonitor: # Analyze trends for critical attributes if len(history) >= 3: # Need at least 3 data points for trend analysis critical_attrs = ['Reallocated_Sector_Ct', 'Current_Pending_Sector', 'Reported_Uncorrect', - 'Offline_Uncorrectable', 'Program_Fail_Count', 'Erase_Fail_Count'] + 'Offline_Uncorrectable', 'Program_Fail_Count', 'Erase_Fail_Count'] for attr in critical_attrs: if attr in current_attributes: @@ -1036,34 +1049,34 @@ class SystemHealthMonitor: def _check_thermal_health(self, device: str, temperature: int, drive_type: str = 'HDD') -> List[str]: """Enhanced thermal health checking with drive-type specific thresholds.""" issues = [] - + if temperature is None: return issues - + # Drive-type specific temperature thresholds - ADJUSTED TO BE LESS SENSITIVE if drive_type == 'SSD': temp_thresholds = {'warning': 70, 'critical': 85, 'optimal_max': 65} else: # HDD temp_thresholds = {'warning': 65, 'critical': 75, 'optimal_max': 60} - + if temperature >= temp_thresholds['critical']: issues.append(f"CRITICAL: Drive temperature {temperature}°C exceeds safe operating limit for {drive_type}") elif temperature >= temp_thresholds['warning']: issues.append(f"WARNING: Drive temperature {temperature}°C approaching thermal limit for {drive_type}") elif temperature > temp_thresholds['optimal_max']: issues.append(f"INFO: Drive temperature {temperature}°C above optimal range for {drive_type}") - + return issues def _analyze_error_patterns(self, device: str, smart_output: str) -> List[str]: """Analyze SMART error logs for failure patterns.""" issues = [] - + # Pattern matching for different error types error_patterns = { 'media_errors': [ r'UNC_ERR', - r'ABRT_ERR', + r'ABRT_ERR', r'read error', r'write error', r'medium error' @@ -1081,13 +1094,13 @@ class SystemHealthMonitor: r'reset required' ] } - + for error_type, patterns in error_patterns.items(): error_count = 0 for pattern in patterns: matches = re.findall(pattern, smart_output, re.IGNORECASE) error_count += 
len(matches) - + if error_count > 0: if error_count >= 10: issues.append(f"CRITICAL: Multiple {error_type} detected ({error_count} occurrences)") @@ -1095,26 +1108,26 @@ class SystemHealthMonitor: issues.append(f"WARNING: {error_type} detected ({error_count} occurrences)") elif error_count >= 1: issues.append(f"INFO: {error_type} detected ({error_count} occurrences)") - + return issues def _check_ssd_health(self, device: str, smart_attributes: dict) -> List[str]: """SSD-specific health checks for wear and endurance.""" issues = [] - + # Check wear leveling and endurance indicators wear_indicators = [ 'Media_Wearout_Indicator', - 'SSD_Life_Left', + 'SSD_Life_Left', 'Percent_Lifetime_Remain', 'Available_Spare', 'Available_Spare_Threshold' ] - + for indicator in wear_indicators: if indicator in smart_attributes: value = smart_attributes[indicator] - + # Handle percentage-based indicators (countdown from 100) if indicator in ['Media_Wearout_Indicator', 'SSD_Life_Left', 'Percent_Lifetime_Remain', 'Available_Spare']: if value <= 5: @@ -1123,7 +1136,7 @@ class SystemHealthMonitor: issues.append(f"WARNING: {indicator} at {value}% - SSD showing significant wear") elif value <= 30: issues.append(f"INFO: {indicator} at {value}% - SSD wear monitoring recommended") - + # Check for excessive bad blocks bad_block_indicators = [ 'Runtime_Bad_Block', @@ -1131,7 +1144,7 @@ class SystemHealthMonitor: 'Grown_Failing_Block_Ct', 'End-to-End_Error' ] - + for indicator in bad_block_indicators: if indicator in smart_attributes: value = smart_attributes[indicator] @@ -1139,7 +1152,7 @@ class SystemHealthMonitor: issues.append(f"WARNING: High {indicator}: {value}") elif value > 10: issues.append(f"INFO: Elevated {indicator}: {value}") - + # Check write amplification and endurance metrics endurance_indicators = [ 'Total_LBAs_Written', @@ -1147,18 +1160,18 @@ class SystemHealthMonitor: 'Host_Program_NAND_Pages_Count', 'FTL_Program_NAND_Pages_Count' ] - + # Calculate write amplification if both host and FTL write counts are available host_writes = smart_attributes.get('Host_Program_NAND_Pages_Count', 0) ftl_writes = smart_attributes.get('FTL_Program_NAND_Pages_Count', 0) - + if host_writes > 0 and ftl_writes > 0: write_amplification = ftl_writes / host_writes if write_amplification > 5.0: issues.append(f"WARNING: High write amplification factor: {write_amplification:.2f}") elif write_amplification > 3.0: issues.append(f"INFO: Elevated write amplification factor: {write_amplification:.2f}") - + return issues def _check_system_drive_indicators(self) -> Dict[str, Any]: @@ -1167,12 +1180,12 @@ class SystemHealthMonitor: 'status': 'OK', 'issues': [] } - + try: # Check dmesg for drive-related errors (last 1000 lines to avoid overwhelming output) - result = subprocess.run(['dmesg', '-T', '--level=err,warn'], - stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, timeout=10) - + result = subprocess.run(['dmesg', '-T', '--level=err,warn'], + stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, timeout=10) + if result.returncode == 0: error_patterns = [ (r'ata\d+.*failed command', 'ATA command failures'), @@ -1184,7 +1197,7 @@ class SystemHealthMonitor: (r'ata\d+.*SError:', 'SATA errors'), (r'nvme\d+.*I/O error', 'NVMe I/O errors') ] - + for pattern, description in error_patterns: matches = re.findall(pattern, result.stdout, re.IGNORECASE) if matches: @@ -1198,13 +1211,13 @@ class SystemHealthMonitor: system_health['issues'].append(f"WARNING: {description} in system logs ({count} occurrences)") else: 
system_health['issues'].append(f"INFO: {description} in system logs ({count} occurrences)") - + except subprocess.TimeoutExpired: system_health['issues'].append("WARNING: System log check timed out") except Exception as e: logger.debug(f"Error checking system drive indicators: {e}") system_health['issues'].append(f"ERROR: Failed to check system logs: {str(e)}") - + return system_health # ============================================================================= @@ -1223,7 +1236,7 @@ class SystemHealthMonitor: 'type': None, # SSD or HDD 'smart_capable': False } - + try: # First check if device supports SMART capability_result = subprocess.run( @@ -1233,21 +1246,21 @@ class SystemHealthMonitor: text=True, timeout=30 ) - + # Check if smartctl failed completely if capability_result.returncode not in [0, 4]: # 0 = success, 4 = some SMART errors but readable logger.debug(f"smartctl failed for {device}: return code {capability_result.returncode}") return drive_details - + output = capability_result.stdout - + # Check if SMART is supported if "SMART support is: Enabled" in output or "SMART support is: Available" in output: drive_details['smart_capable'] = True elif "SMART support is: Unavailable" in output or "does not support SMART" in output: logger.debug(f"Device {device} does not support SMART") return drive_details - + for line in output.split('\n'): if 'Device Model' in line or 'Model Number' in line: drive_details['model'] = line.split(':')[1].strip() @@ -1265,14 +1278,13 @@ class SystemHealthMonitor: drive_details['type'] = 'SSD' else: drive_details['type'] = 'HDD' - + except Exception as e: logger.debug(f"Error getting drive details for {device}: {e}") self._drive_details_cache[device] = drive_details return drive_details - def _get_issue_type(self, issue: str) -> str: """Determine issue type from issue description.""" if "SMART" in issue: @@ -1423,13 +1435,13 @@ class SystemHealthMonitor: temp_value = drive_info.get('temperature') temp_safe = f"{temp_value}°C" if temp_value is not None else 'N/A' description += f""" ┏━ SMART STATUS {'━' * (box_width - 15)}┓ ┃ Status │ {smart_status_safe:<62}┃ ┃ Temperature │ {temp_safe:<62}┃ ┗{'━' * box_width}┛ """ - + if drive_info.get('smart_attributes'): description += f"\n┏━ SMART ATTRIBUTES {'━' * (box_width - 19)}┓\n" for attr, value in drive_info['smart_attributes'].items():
@@ -1474,7 +1486,7 @@ class SystemHealthMonitor: description += f"┗{'━' * box_width}┛\n" except Exception as e: description += f"\nError generating drive details: {str(e)}\n" - + if "Temperature" in issue: description += "\n" + textwrap.dedent(""" High drive temperatures can: @@ -1610,11 +1622,11 @@ class SystemHealthMonitor: ┃ OSDs │ {osd_summary:<61}┃ ┗{'━' * box_width}┛ """ - + if "Disk" in issue: for partition in health_report.get('drives_health', {}).get('drives', []): if partition.get('mountpoint') in issue: - description += f"\n=== Disk Metrics ===\n" + description += "\n=== Disk Metrics ===\n" description += f"Disk Device: {partition['device']}\n" description += f"Mount Point: {partition['mountpoint']}\n" description += f"Total Space: {partition['total_space']}\n" @@ -1973,7 +1985,7 @@ class SystemHealthMonitor: response = requests.post( self.ticket_api_url, json=ticket_payload, - headers = { + headers={ 'Content-Type': 'application/json', 'Authorization': f'Bearer {self.CONFIG["TICKET_API_KEY"]}' }, @@ -1985,7 +1997,7 @@ class SystemHealthMonitor: except json.JSONDecodeError as e: logger.error(f"Invalid JSON response from ticket API: {e}") continue - + if response_data.get('success'): logger.info(f"Ticket created successfully: {ticket_title}") logger.info(f"Ticket ID: {response_data.get('ticket_id')}") @@ -1994,26 +2006,26 @@ class SystemHealthMonitor: continue else: logger.error(f"Failed to create ticket: {response_data.get('error')}") - + except Exception as e: logger.error(f"Error creating ticket: {e}") def _detect_issues(self, health_report: Dict[str, Any]) -> List[str]: """ Detect issues in the health report including non-critical issues. - + :param health_report: The comprehensive health report from the checks. :return: List of issue descriptions detected during checks. 
""" issues = [] - + # Check for drive-related issues for drive in health_report.get('drives_health', {}).get('drives', []): # Skip drives with ERROR or NOT_SUPPORTED status - these are likely virtual/unsupported devices if drive.get('smart_status') in ['ERROR', 'NOT_SUPPORTED']: logger.debug(f"Skipping issue detection for drive {drive['device']} with status {drive.get('smart_status')}") continue - + # Only report issues for drives with valid SMART status if drive.get('smart_issues') and drive.get('smart_status') in ['HEALTHY', 'UNHEALTHY', 'UNKNOWN', 'REPLACEMENT_NEEDED']: # Filter out generic error messages and manufacturer-specific false positives @@ -2050,7 +2062,7 @@ class SystemHealthMonitor: drive_details = self._get_drive_details(drive['device']) drive_id = drive_details.get('serial') or drive['device'] issues.append(f"Drive {drive_id} temperature is high: {drive['temperature']}°C") - + # Check for ECC memory errors memory_health = health_report.get('memory_health', {}) if memory_health.get('has_ecc') and memory_health.get('ecc_errors'): @@ -2162,13 +2174,13 @@ class SystemHealthMonitor: # Check exact matches if mountpoint in self.CONFIG['EXCLUDED_MOUNTS']: return True - + # Check patterns for pattern in self.CONFIG['EXCLUDED_PATTERNS']: if re.match(pattern, mountpoint): return True return False - + def _format_bytes_human(self, num_bytes): """Format a byte count into a human-readable string.""" for unit in ['B', 'KB', 'MB', 'GB', 'TB', 'PB']: @@ -2180,16 +2192,16 @@ class SystemHealthMonitor: def _parse_size(self, size_str: str) -> float: """ Parse size string with units to bytes. - + :param size_str: String containing size with unit (e.g. '15.7G', '21.8T') :return: Size in bytes as float - """ + """ try: # Skip non-size strings if not isinstance(size_str, str): logger.debug(f"Not a string: {size_str}") return 0.0 - + if not any(unit in size_str.upper() for unit in ['B', 'K', 'M', 'G', 'T']): logger.debug(f"No valid size unit found in: {size_str}") return 0.0 @@ -2198,7 +2210,7 @@ class SystemHealthMonitor: multipliers = { 'B': 1, 'K': 1024, - 'M': 1024**2, + 'M': 1024**2, 'G': 1024**3, 'T': 1024**4 } @@ -2208,19 +2220,19 @@ class SystemHealthMonitor: if not match: logger.debug(f"Could not extract numeric value from: {size_str}") return 0.0 - + value = float(match.group(1)) - + unit_match = re.search(r'([BKMGT])', size_str.upper()) if not unit_match: logger.debug(f"Could not extract unit from: {size_str}") return 0.0 - + unit = unit_match.group(1) - + # Convert to bytes bytes_value = value * multipliers.get(unit, 0) - + return bytes_value except (ValueError, AttributeError, TypeError) as e: @@ -2231,12 +2243,12 @@ class SystemHealthMonitor: def _is_physical_disk(self, device_path): """ Check if the device is a physical disk, excluding logical volumes and special devices. 
- + :param device_path: Path to the device :return: Boolean indicating if it's a relevant physical disk """ logger.debug(f"Checking device: {device_path}") - + # Exclude known non-physical or special devices excluded_patterns = [ r'/dev/mapper/', # LVM devices @@ -2247,11 +2259,11 @@ class SystemHealthMonitor: r'/boot/efi', # EFI partitions r'[0-9]+$' # Partition numbers ] - + if any(re.search(pattern, device_path) for pattern in excluded_patterns): logger.debug(f"Device {device_path} excluded due to pattern match") return False - + # Match physical devices physical_patterns = [ r'/dev/sd[a-z]+$', # SATA/SAS drives @@ -2259,10 +2271,10 @@ class SystemHealthMonitor: r'/dev/mmcblk\d+$', # MMC/SD cards r'/dev/hd[a-z]+$' # IDE drives (legacy) ] - + is_physical = any(re.match(pattern, device_path) for pattern in physical_patterns) logger.debug(f"Device {device_path} physical disk check result: {is_physical}") - + return is_physical def _check_disk_firmware(self, device: str) -> Dict[str, Any]: @@ -2283,7 +2295,7 @@ class SystemHealthMonitor: 'Micron': ['Micron', 'Crucial'], 'Toshiba': ['Toshiba', 'TOSHIBA'] } - + try: result = subprocess.run( ['smartctl', '-i', device], @@ -2303,14 +2315,14 @@ class SystemHealthMonitor: elif 'Device Model:' in line and not firmware_info['model']: model_line = line firmware_info['model'] = line.split(':')[1].strip() - + # Determine manufacturer if model_line: for manufacturer, patterns in MANUFACTURER_PATTERNS.items(): if any(pattern in model_line for pattern in patterns): firmware_info['manufacturer'] = manufacturer break - + # Check against known problematic versions if firmware_info['manufacturer'] and firmware_info['model']: # Check if manufacturer exists in our problematic firmware database @@ -2364,52 +2376,52 @@ class SystemHealthMonitor: """Enhanced manufacturer detection based on model and serial patterns.""" if not model: return 'Unknown' - + model_upper = model.upper() - + # Western Digital patterns (including HGST which WD acquired) if any(pattern in model_upper for pattern in ['WDC', 'WD-', 'HGST', 'WESTERN DIGITAL']): return 'Western Digital' - + # Seagate patterns elif any(pattern in model_upper for pattern in ['ST', 'SEAGATE']): return 'Seagate' - + # Samsung patterns elif 'SAMSUNG' in model_upper: return 'Samsung' - + # Intel patterns elif any(pattern in model_upper for pattern in ['INTEL', 'SSDSC']): return 'Intel' - + # Micron/Crucial patterns elif any(pattern in model_upper for pattern in ['CRUCIAL', 'MICRON', 'CT']): return 'Micron' - + # Toshiba patterns elif 'TOSHIBA' in model_upper: return 'Toshiba' - + # Ridata/Ritek patterns (for your existing special handling) elif any(pattern in model_upper for pattern in ['RIDATA', 'RITEK']): return 'Ridata' - + # OOS patterns (for your existing special handling) elif 'OOS' in model_upper: return 'OOS' - + return 'Unknown' def _get_manufacturer_profile(self, model: str, manufacturer: str = None, firmware: str = None) -> Dict[str, Any]: """Get manufacturer-specific SMART profile based on drive model/manufacturer/firmware.""" logger.debug(f"Looking for profile - Model: '{model}', Manufacturer: '{manufacturer}', Firmware: '{firmware}'") - + # First, try to detect manufacturer if not provided if not manufacturer: manufacturer = self._detect_manufacturer(model) logger.debug(f"Auto-detected manufacturer: {manufacturer}") - + # Check each manufacturer profile for mfg, profile in self.MANUFACTURER_SMART_PROFILES.items(): # Check firmware patterns first (most specific for OEM drives like RiData) @@ -2418,18 
+2430,18 @@ class SystemHealthMonitor: if firmware.startswith(pattern) or pattern in firmware: logger.debug(f"Matched manufacturer profile: {mfg} for firmware pattern '{pattern}' in '{firmware}'") return profile - + # Check if detected manufacturer matches this profile if manufacturer and manufacturer in profile['aliases']: logger.debug(f"Matched manufacturer profile: {mfg} for detected manufacturer '{manufacturer}'") return profile - + # Check model/manufacturer aliases (fallback) for alias in profile['aliases']: if alias.lower() in model.lower() or (manufacturer and alias.lower() in manufacturer.lower()): logger.debug(f"Matched manufacturer profile: {mfg} for model alias '{alias}' in '{model}'") return profile - + # Return generic profile if no match logger.debug(f"No specific profile found for Model: '{model}', Manufacturer: '{manufacturer}', Firmware: '{firmware}', using Generic profile") return self.MANUFACTURER_SMART_PROFILES['Generic'] @@ -2438,14 +2450,14 @@ class SystemHealthMonitor: """Check if an attribute should be monitored based on manufacturer profile.""" if not manufacturer_profile: return True # Default: monitor everything - + attr_config = manufacturer_profile.get('attributes', {}).get(attr_name, {}) - + # Check if explicitly set to not monitor if attr_config.get('monitor') is False: logger.debug(f"Skipping monitoring for {attr_name} - explicitly disabled") return False - + return True # Default: monitor unless explicitly disabled def _get_attribute_thresholds(self, attr_name: str, manufacturer_profile: dict) -> dict: @@ -2459,7 +2471,7 @@ class SystemHealthMonitor: 'critical': attr_config['critical_threshold'], 'behavior': attr_config.get('behavior', 'countup') } - + # Enhanced BASE_SMART_THRESHOLDS with manufacturer-specific handling BASE_SMART_THRESHOLDS = { 'Reallocated_Sector_Ct': {'warning': 5, 'critical': 10}, @@ -2479,7 +2491,7 @@ class SystemHealthMonitor: 'Erase_Fail_Count_Total': {'warning': 1, 'critical': 5}, # ADJUSTED: More lenient thresholds for error rates on unknown drives 'Raw_Read_Error_Rate': {'warning': 10000000, 'critical': 100000000}, # Raised significantly - 'Seek_Error_Rate': {'warning': 10000000, 'critical': 100000000}, # Raised significantly + 'Seek_Error_Rate': {'warning': 10000000, 'critical': 100000000}, # Raised significantly 'Command_Timeout': {'warning': 100, 'critical': 1000}, # Raised significantly 'High_Fly_Writes': {'warning': 1, 'critical': 5}, 'Airflow_Temperature_Cel': {'warning': 65, 'critical': 75}, @@ -2491,14 +2503,14 @@ class SystemHealthMonitor: 'Grown_Failing_Block_Ct': {'warning': 10, 'critical': 50}, 'End-to-End_Error': {'warning': 1, 'critical': 5} } - + if attr_name in BASE_SMART_THRESHOLDS: return { 'warning': BASE_SMART_THRESHOLDS[attr_name]['warning'], 'critical': BASE_SMART_THRESHOLDS[attr_name]['critical'], 'behavior': 'countup' } - + return None # No thresholds defined def _is_new_drive(self, power_on_hours: int) -> bool: @@ -2552,12 +2564,12 @@ class SystemHealthMonitor: logger.debug(f"Drive details for {device}: {drive_details}") manufacturer_profile = self._get_manufacturer_profile( - drive_details.get('model', ''), + drive_details.get('model', ''), drive_details.get('manufacturer', ''), drive_details.get('firmware', '') ) smart_health['manufacturer_profile'] = manufacturer_profile - + logger.debug(f"Selected manufacturer profile for {device}: {manufacturer_profile.get('aliases', ['Unknown'])[0] if manufacturer_profile else 'None'}") # Get firmware information @@ -2574,7 +2586,7 @@ class SystemHealthMonitor: 
text=True, timeout=30 ) - + output = result.stdout # Check overall health status @@ -2589,10 +2601,10 @@ class SystemHealthMonitor: # Parse SMART attributes with manufacturer-specific handling power_on_hours = 0 - + # First pass: collect all SMART attributes with priority for _Total versions smart_attributes_raw = {} - + for line in output.split('\n'): # Extract Power_On_Hours first to determine if drive is new if 'Power_On_Hours' in line: @@ -2617,7 +2629,7 @@ class SystemHealthMonitor: if manufacturer_profile and manufacturer_profile.get('aliases', [{}])[0] == 'Ridata': logger.debug(f"Skipping {attr} for Ridata drive - using _Total version only") continue - + parts = line.split() if len(parts) >= 10: raw_value = self._parse_smart_value(parts[9]) @@ -2638,21 +2650,21 @@ class SystemHealthMonitor: if len(parts) >= 10: raw_value = self._parse_smart_value(parts[9]) smart_health['attributes']['Wear_Leveling_Count'] = raw_value - + # Get manufacturer-specific thresholds wear_attr = manufacturer_profile.get('attributes', {}).get('Wear_Leveling_Count', {}) - + # Skip evaluation if this is a new drive and manufacturer profile says to ignore if is_new_drive and wear_attr.get('ignore_on_new_drive', False): logger.debug(f"Skipping Wear_Leveling_Count evaluation for new drive: {raw_value}") continue - + warning_threshold = wear_attr.get('warning_threshold') critical_threshold = wear_attr.get('critical_threshold') - + if warning_threshold and critical_threshold: behavior = wear_attr.get('behavior', 'countup') - + if behavior == 'countup': if raw_value >= critical_threshold: smart_health['severity'] = 'CRITICAL' @@ -2682,7 +2694,7 @@ class SystemHealthMonitor: 'Head_Flying_Hours', 'Runtime_Bad_Block', 'Factory_Bad_Block_Ct', 'Grown_Failing_Block_Ct', 'End-to-End_Error' ] - + for line in output.split('\n'): for attr in ALL_SMART_ATTRIBUTES: if attr in line and attr not in ['Wear_Leveling_Count']: # Wear_Leveling handled separately above @@ -2690,7 +2702,7 @@ class SystemHealthMonitor: if not self._should_monitor_attribute(attr, manufacturer_profile): logger.debug(f"Skipping {attr} - disabled for this manufacturer") continue - + parts = line.split() if len(parts) >= 10: raw_value = self._parse_smart_value(parts[9]) @@ -2700,7 +2712,7 @@ class SystemHealthMonitor: attr_thresholds = self._get_attribute_thresholds(attr, manufacturer_profile) if not attr_thresholds: continue - + # Apply thresholds based on behavior if attr == 'Temperature_Celsius': smart_health['temp'] = raw_value @@ -2735,7 +2747,7 @@ class SystemHealthMonitor: error_log_pattern = r"Error \d+ occurred at disk power-on lifetime: (\d+) hours" error_matches = re.finditer(error_log_pattern, output) recent_errors = [] - + for match in error_matches: error_hour = int(match.group(1)) current_hours = smart_health['attributes'].get('Power_On_Hours', 0) @@ -2751,19 +2763,19 @@ class SystemHealthMonitor: # Trend analysis for predictive failure detection trend_issues = self._analyze_smart_trends(device, smart_health['attributes']) smart_health['issues'].extend(trend_issues) - + # SSD-specific checks drive_type = drive_details.get('type', 'HDD') if drive_type == 'SSD': ssd_issues = self._check_ssd_health(device, smart_health['attributes']) smart_health['issues'].extend(ssd_issues) - + # Enhanced temperature analysis if smart_health['temp']: drive_type = drive_details.get('type', 'HDD') thermal_issues = self._check_thermal_health(device, smart_health['temp'], drive_type) smart_health['issues'].extend(thermal_issues) - + # Error pattern analysis 
error_pattern_issues = self._analyze_error_patterns(device, output) smart_health['issues'].extend(error_pattern_issues) @@ -2790,28 +2802,28 @@ class SystemHealthMonitor: ) logger.debug(f"NVMe smart-log raw output for {device}:") logger.debug(nvme_result.stdout) - + # Initialize the temperature attribute if smart_health['temp'] is None: smart_health['attributes']['Temperature_Celsius'] = None - + for line in nvme_result.stdout.split('\n'): # Fix the NoneType error by checking if line exists and has content if line and line.strip() and 'temperature' in line.lower(): try: temp_str = line.split(':')[1].strip() if ':' in line else line.strip() logger.debug(f"Raw temperature string: {temp_str}") - + # Extract the first complete number from temperature string temp_match = re.search(r'(\d+)', temp_str) if temp_match: temp_value = int(temp_match.group(1)) logger.debug(f"Parsed temperature value: {temp_value}") - + # Set both temperature fields smart_health['temp'] = temp_value smart_health['attributes']['Temperature_Celsius'] = temp_value - + logger.debug(f"Final temperature recorded: {smart_health['temp']}") break except (ValueError, IndexError, AttributeError) as e: @@ -2859,10 +2871,10 @@ class SystemHealthMonitor: text=True, timeout=30 ) - + if result.returncode == 0: smart_health['status'] = 'HEALTHY' - + # Parse NVMe smart log output for line in result.stdout.split('\n'): if 'temperature' in line.lower(): @@ -2871,7 +2883,7 @@ class SystemHealthMonitor: if temp_match: smart_health['temp'] = int(temp_match.group(1)) smart_health['attributes']['Temperature_Celsius'] = smart_health['temp'] - + elif 'available_spare' in line.lower(): spare_match = re.search(r'(\d+)%', line) if spare_match: @@ -2883,26 +2895,26 @@ class SystemHealthMonitor: elif spare_pct < 30: smart_health['severity'] = 'WARNING' smart_health['issues'].append(f"Low Available_Spare: {spare_pct}%") - + # Enhanced NVMe analysis if smart_health['attributes']: # Trend analysis for NVMe devices trend_issues = self._analyze_smart_trends(device, smart_health['attributes']) smart_health['issues'].extend(trend_issues) - + # SSD-specific checks for NVMe ssd_issues = self._check_ssd_health(device, smart_health['attributes']) smart_health['issues'].extend(ssd_issues) - + # Enhanced temperature analysis for NVMe if smart_health['temp']: thermal_issues = self._check_thermal_health(device, smart_health['temp'], 'SSD') smart_health['issues'].extend(thermal_issues) - + else: smart_health['status'] = 'ERROR' smart_health['issues'].append("Failed to read NVMe SMART data") - + except subprocess.TimeoutExpired: smart_health['status'] = 'ERROR' smart_health['issues'].append("NVMe SMART check timed out") @@ -2925,15 +2937,15 @@ class SystemHealthMonitor: # Get only valid physical disks physical_disks = self._get_all_disks() logger.debug(f"Checking physical disks: {physical_disks}") - + if not physical_disks: logger.warning("No valid physical disks found for monitoring") drives_health['overall_status'] = 'WARNING' return drives_health - + # Get ALL partition information including device mapper partitions = psutil.disk_partitions(all=True) - + # Create mapping of base devices to their partitions device_partitions = {} for part in partitions: @@ -3016,10 +3028,10 @@ class SystemHealthMonitor: drives_health['drives'].append(drive_report) drives_health['overall_status'] = overall_status - + except Exception as e: logger.error(f"Error checking drives health: {str(e)}") - + return drives_health # 
============================================================================= @@ -3057,7 +3069,7 @@ class SystemHealthMonitor: 'used_memory': self._convert_bytes(psutil.virtual_memory().used), 'memory_percent': psutil.virtual_memory().percent } - + try: # First check using dmidecode (if available) if self._available_tools.get('dmidecode'): @@ -3079,7 +3091,7 @@ class SystemHealthMonitor: if os.path.exists(f"{mc_dir}/csrow0"): memory_health['has_ecc'] = True break - + # If ECC is present, check for errors if memory_health['has_ecc']: for mc_dir in glob.glob('/sys/devices/system/edac/mc/mc[0-9]*'): @@ -3090,7 +3102,7 @@ class SystemHealthMonitor: memory_health['ecc_errors'].append( f"Uncorrectable ECC errors detected in {os.path.basename(mc_dir)}: {ue_count}" ) - + ce_count = self._read_ecc_count(f"{mc_dir}/csrow0/ce_count") if ce_count > 0: if memory_health['status'] != 'CRITICAL': @@ -3098,11 +3110,11 @@ class SystemHealthMonitor: memory_health['ecc_errors'].append( f"Correctable ECC errors detected in {os.path.basename(mc_dir)}: {ce_count}" ) - + except Exception as e: memory_health['status'] = 'ERROR' memory_health['ecc_errors'].append(f"Error checking ECC status: {str(e)}") - + return memory_health def _read_ecc_count(self, filepath: str) -> int: @@ -3122,7 +3134,7 @@ class SystemHealthMonitor: def _check_cpu_usage(self) -> Dict[str, Any]: """ Check CPU usage and return health metrics. - + :return: Dictionary with CPU health metrics. """ cpu_usage_percent = psutil.cpu_percent(interval=1) @@ -3131,7 +3143,7 @@ class SystemHealthMonitor: 'status': 'OK' if cpu_usage_percent < self.CONFIG['THRESHOLDS']['CPU_WARNING'] else 'WARNING' } return cpu_health - + def _check_network_status(self) -> Dict[str, Any]: """ Check the status of network interfaces and report any issues. 
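The ECC checks above read correctable and uncorrectable counters out of the EDAC sysfs tree. A standalone sketch of that read path, assuming the same /sys/devices/system/edac/mc/mc*/csrow0/{ce,ue}_count layout; read_ecc_counts is a hypothetical helper, not the daemon's _read_ecc_count:

```python
import glob

def read_ecc_counts() -> dict:
    """Collect ce/ue counters per memory controller; unreadable files count as 0."""
    counts = {}
    for mc_dir in glob.glob('/sys/devices/system/edac/mc/mc[0-9]*'):
        for name in ('ce_count', 'ue_count'):
            path = f"{mc_dir}/csrow0/{name}"
            try:
                with open(path) as fh:
                    counts[path] = int(fh.read().strip())
            except (OSError, ValueError):
                counts[path] = 0
    return counts
```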
@@ -3373,7 +3385,7 @@ class SystemHealthMonitor: except json.JSONDecodeError as e: logger.warning(f"Failed to parse ceph mon stat JSON: {e}") - logger.debug(f"=== Ceph Health Check ===") + logger.debug("=== Ceph Health Check ===") logger.debug(f"Is Ceph node: {ceph_health['is_ceph_node']}") logger.debug(f"Cluster health: {ceph_health['cluster_health']}") logger.debug(f"Cluster usage: {ceph_health['cluster_usage']}") @@ -3597,22 +3609,22 @@ class SystemHealthMonitor: return '{' + ','.join(pairs) + '}' if pairs else '' # === System Info === - metrics.append(f'# HELP hwmon_info System information') - metrics.append(f'# TYPE hwmon_info gauge') + metrics.append('# HELP hwmon_info System information') + metrics.append('# TYPE hwmon_info gauge') metrics.append(f'hwmon_info{labels(hostname=hostname)} 1') # === Drive Metrics === - metrics.append(f'# HELP hwmon_drive_smart_healthy SMART health status (1=healthy, 0=unhealthy)') - metrics.append(f'# TYPE hwmon_drive_smart_healthy gauge') + metrics.append('# HELP hwmon_drive_smart_healthy SMART health status (1=healthy, 0=unhealthy)') + metrics.append('# TYPE hwmon_drive_smart_healthy gauge') - metrics.append(f'# HELP hwmon_drive_temperature_celsius Drive temperature in Celsius') - metrics.append(f'# TYPE hwmon_drive_temperature_celsius gauge') + metrics.append('# HELP hwmon_drive_temperature_celsius Drive temperature in Celsius') + metrics.append('# TYPE hwmon_drive_temperature_celsius gauge') - metrics.append(f'# HELP hwmon_drive_size_bytes Drive total size in bytes') - metrics.append(f'# TYPE hwmon_drive_size_bytes gauge') + metrics.append('# HELP hwmon_drive_size_bytes Drive total size in bytes') + metrics.append('# TYPE hwmon_drive_size_bytes gauge') - metrics.append(f'# HELP hwmon_drive_smart_issues_total Number of SMART issues detected') - metrics.append(f'# TYPE hwmon_drive_smart_issues_total gauge') + metrics.append('# HELP hwmon_drive_smart_issues_total Number of SMART issues detected') + metrics.append('# TYPE hwmon_drive_smart_issues_total gauge') for drive in health_report.get('drives_health', {}).get('drives', []): device = drive.get('device', 'unknown') @@ -3639,33 +3651,33 @@ class SystemHealthMonitor: # === CPU Metrics === cpu = health_report.get('cpu_health', {}) - metrics.append(f'# HELP hwmon_cpu_usage_percent CPU usage percentage') - metrics.append(f'# TYPE hwmon_cpu_usage_percent gauge') + metrics.append('# HELP hwmon_cpu_usage_percent CPU usage percentage') + metrics.append('# TYPE hwmon_cpu_usage_percent gauge') if cpu.get('cpu_usage_percent') is not None: metrics.append(f'hwmon_cpu_usage_percent{labels(hostname=hostname)} {cpu["cpu_usage_percent"]}') # === Memory Metrics === mem = health_report.get('memory_health', {}) - metrics.append(f'# HELP hwmon_memory_usage_percent Memory usage percentage') - metrics.append(f'# TYPE hwmon_memory_usage_percent gauge') + metrics.append('# HELP hwmon_memory_usage_percent Memory usage percentage') + metrics.append('# TYPE hwmon_memory_usage_percent gauge') if mem.get('memory_percent') is not None: metrics.append(f'hwmon_memory_usage_percent{labels(hostname=hostname)} {mem["memory_percent"]}') - metrics.append(f'# HELP hwmon_memory_has_ecc Whether ECC memory is present (1=yes, 0=no)') - metrics.append(f'# TYPE hwmon_memory_has_ecc gauge') + metrics.append('# HELP hwmon_memory_has_ecc Whether ECC memory is present (1=yes, 0=no)') + metrics.append('# TYPE hwmon_memory_has_ecc gauge') has_ecc = 1 if mem.get('has_ecc') else 0 metrics.append(f'hwmon_memory_has_ecc{labels(hostname=hostname)} 
{has_ecc}') if mem.get('has_ecc'): - metrics.append(f'# HELP hwmon_memory_ecc_errors_total Total ECC errors detected') - metrics.append(f'# TYPE hwmon_memory_ecc_errors_total gauge') + metrics.append('# HELP hwmon_memory_ecc_errors_total Total ECC errors detected') + metrics.append('# TYPE hwmon_memory_ecc_errors_total gauge') ecc_errors = len(mem.get('ecc_errors', [])) metrics.append(f'hwmon_memory_ecc_errors_total{labels(hostname=hostname)} {ecc_errors}') # === Network Metrics === net = health_report.get('network_health', {}) - metrics.append(f'# HELP hwmon_network_status Network status (1=OK, 0=issue)') - metrics.append(f'# TYPE hwmon_network_status gauge') + metrics.append('# HELP hwmon_network_status Network status (1=OK, 0=issue)') + metrics.append('# TYPE hwmon_network_status gauge') for net_type in ['management_network', 'ceph_network']: net_info = net.get(net_type, {}) @@ -3676,40 +3688,40 @@ class SystemHealthMonitor: # === Ceph Metrics === ceph = health_report.get('ceph_health', {}) if ceph.get('is_ceph_node'): - metrics.append(f'# HELP hwmon_ceph_cluster_healthy Ceph cluster health (1=healthy, 0=warning/error)') - metrics.append(f'# TYPE hwmon_ceph_cluster_healthy gauge') + metrics.append('# HELP hwmon_ceph_cluster_healthy Ceph cluster health (1=healthy, 0=warning/error)') + metrics.append('# TYPE hwmon_ceph_cluster_healthy gauge') ceph_healthy = 1 if ceph.get('cluster_health') == 'HEALTH_OK' else 0 metrics.append(f'hwmon_ceph_cluster_healthy{labels(hostname=hostname)} {ceph_healthy}') if ceph.get('cluster_usage'): usage = ceph['cluster_usage'] - metrics.append(f'# HELP hwmon_ceph_cluster_usage_percent Ceph cluster usage percentage') - metrics.append(f'# TYPE hwmon_ceph_cluster_usage_percent gauge') + metrics.append('# HELP hwmon_ceph_cluster_usage_percent Ceph cluster usage percentage') + metrics.append('# TYPE hwmon_ceph_cluster_usage_percent gauge') metrics.append(f'hwmon_ceph_cluster_usage_percent{labels(hostname=hostname)} {usage.get("usage_percent", 0)}') - metrics.append(f'# HELP hwmon_ceph_cluster_bytes_total Ceph cluster total bytes') - metrics.append(f'# TYPE hwmon_ceph_cluster_bytes_total gauge') + metrics.append('# HELP hwmon_ceph_cluster_bytes_total Ceph cluster total bytes') + metrics.append('# TYPE hwmon_ceph_cluster_bytes_total gauge') metrics.append(f'hwmon_ceph_cluster_bytes_total{labels(hostname=hostname)} {usage.get("total_bytes", 0)}') - metrics.append(f'# HELP hwmon_ceph_cluster_bytes_used Ceph cluster used bytes') - metrics.append(f'# TYPE hwmon_ceph_cluster_bytes_used gauge') + metrics.append('# HELP hwmon_ceph_cluster_bytes_used Ceph cluster used bytes') + metrics.append('# TYPE hwmon_ceph_cluster_bytes_used gauge') metrics.append(f'hwmon_ceph_cluster_bytes_used{labels(hostname=hostname)} {usage.get("used_bytes", 0)}') - metrics.append(f'# HELP hwmon_ceph_osd_total Total number of OSDs') - metrics.append(f'# TYPE hwmon_ceph_osd_total gauge') + metrics.append('# HELP hwmon_ceph_osd_total Total number of OSDs') + metrics.append('# TYPE hwmon_ceph_osd_total gauge') osd_count = len(ceph.get('osd_status', [])) metrics.append(f'hwmon_ceph_osd_total{labels(hostname=hostname)} {osd_count}') - metrics.append(f'# HELP hwmon_ceph_osd_down Number of down OSDs') - metrics.append(f'# TYPE hwmon_ceph_osd_down gauge') + metrics.append('# HELP hwmon_ceph_osd_down Number of down OSDs') + metrics.append('# TYPE hwmon_ceph_osd_down gauge') down_osds = len([o for o in ceph.get('osd_status', []) if o.get('status') == 'down']) 
metrics.append(f'hwmon_ceph_osd_down{labels(hostname=hostname)} {down_osds}') # === LXC Metrics === lxc = health_report.get('lxc_health', {}) if lxc.get('containers'): - metrics.append(f'# HELP hwmon_lxc_storage_usage_percent LXC container storage usage percentage') - metrics.append(f'# TYPE hwmon_lxc_storage_usage_percent gauge') + metrics.append('# HELP hwmon_lxc_storage_usage_percent LXC container storage usage percentage') + metrics.append('# TYPE hwmon_lxc_storage_usage_percent gauge') for container in lxc['containers']: vmid = container.get('vmid', 'unknown') @@ -3721,18 +3733,18 @@ class SystemHealthMonitor: # === PBS Metrics === pbs = health_report.get('pbs_health', {}) if pbs.get('is_pbs_node'): - metrics.append(f'# HELP hwmon_pbs_zfs_usage_percent PBS ZFS pool usage percentage') - metrics.append(f'# TYPE hwmon_pbs_zfs_usage_percent gauge') + metrics.append('# HELP hwmon_pbs_zfs_usage_percent PBS ZFS pool usage percentage') + metrics.append('# TYPE hwmon_pbs_zfs_usage_percent gauge') for pool in pbs.get('zfs_pools', []): metrics.append(f'hwmon_pbs_zfs_usage_percent{labels(hostname=hostname, pool=pool["name"])} {pool["usage_percent"]}') - metrics.append(f'# HELP hwmon_pbs_failed_tasks_total PBS failed task count') - metrics.append(f'# TYPE hwmon_pbs_failed_tasks_total gauge') + metrics.append('# HELP hwmon_pbs_failed_tasks_total PBS failed task count') + metrics.append('# TYPE hwmon_pbs_failed_tasks_total gauge') metrics.append(f'hwmon_pbs_failed_tasks_total{labels(hostname=hostname)} {len(pbs.get("failed_tasks", []))}') # === Issue Summary Metrics === - metrics.append(f'# HELP hwmon_issues_total Total number of issues detected') - metrics.append(f'# TYPE hwmon_issues_total gauge') + metrics.append('# HELP hwmon_issues_total Total number of issues detected') + metrics.append('# TYPE hwmon_issues_total gauge') system_issues = len(health_report.get('system_health', {}).get('issues', [])) ceph_issues = len(ceph.get('issues', [])) + len(ceph.get('cluster_wide_issues', [])) @@ -3859,16 +3871,16 @@ class SystemHealthMonitor: text=True, timeout=30 # 30 second timeout per container ) - + container_info = { 'vmid': vmid, 'filesystems': [] } - + for fs_line in disk_info.stdout.split('\n')[1:]: if not fs_line.strip() or 'MP' in fs_line: continue - + # Parse df output using regex for reliable column extraction match = re.match( r'(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+\.?\d*)%?\s+(.*)', @@ -3881,74 +3893,75 @@ class SystemHealthMonitor: pool, device_col, total_str, used_str, avail_str, percent_str, mountpoint = match.groups() try: - # Skip excluded mounts - if pool.startswith('appPool:') or '/mnt/pve/mediaf' in device_col: - continue + # Skip excluded mounts + if pool.startswith('appPool:') or '/mnt/pve/mediaf' in device_col: + continue - mountpoint = mountpoint.strip() + mountpoint = mountpoint.strip() - # Skip excluded mountpoints - if self._is_excluded_mount(mountpoint): - logger.debug(f"Skipping excluded mount: {mountpoint}") - continue + # Skip excluded mountpoints + if self._is_excluded_mount(mountpoint): + logger.debug(f"Skipping excluded mount: {mountpoint}") + continue - # Parse size values from named regex groups - total_space = self._parse_size(total_str) - used_space = self._parse_size(used_str) - available_space = self._parse_size(avail_str) + # Parse size values from named regex groups + total_space = self._parse_size(total_str) + used_space = self._parse_size(used_str) + available_space = self._parse_size(avail_str) - # Parse percentage from regex group - try: - 
usage_percent = float(percent_str) - except ValueError: - # Calculate percentage if parsing fails - usage_percent = (used_space / total_space * 100) if total_space > 0 else 0 - - filesystem = { - 'mountpoint': mountpoint, - 'total_space': total_space, - 'used_space': used_space, - 'available': available_space, - 'usage_percent': usage_percent - } - container_info['filesystems'].append(filesystem) + # Parse percentage from regex group + try: + usage_percent = float(percent_str) + except ValueError: + # Calculate percentage if parsing fails + usage_percent = (used_space / total_space * 100) if total_space > 0 else 0 - # Check thresholds - if usage_percent >= self.CONFIG['THRESHOLDS']['LXC_CRITICAL']: - lxc_health['status'] = 'CRITICAL' - issue = f"LXC {vmid} critical storage usage: {usage_percent:.1f}% on {mountpoint}" - lxc_health['issues'].append(issue) - elif usage_percent >= self.CONFIG['THRESHOLDS']['LXC_WARNING']: - if lxc_health['status'] != 'CRITICAL': - lxc_health['status'] = 'WARNING' - issue = f"LXC {vmid} high storage usage: {usage_percent:.1f}% on {mountpoint}" - lxc_health['issues'].append(issue) + filesystem = { + 'mountpoint': mountpoint, + 'total_space': total_space, + 'used_space': used_space, + 'available': available_space, + 'usage_percent': usage_percent + } + container_info['filesystems'].append(filesystem) - logger.debug(f"Filesystem details: {filesystem}") + # Check thresholds + if usage_percent >= self.CONFIG['THRESHOLDS']['LXC_CRITICAL']: + lxc_health['status'] = 'CRITICAL' + issue = f"LXC {vmid} critical storage usage: {usage_percent:.1f}% on {mountpoint}" + lxc_health['issues'].append(issue) + elif usage_percent >= self.CONFIG['THRESHOLDS']['LXC_WARNING']: + if lxc_health['status'] != 'CRITICAL': + lxc_health['status'] = 'WARNING' + issue = f"LXC {vmid} high storage usage: {usage_percent:.1f}% on {mountpoint}" + lxc_health['issues'].append(issue) + + logger.debug(f"Filesystem details: {filesystem}") except Exception as e: logger.debug(f"Error processing line: {str(e)}") logger.debug(f"Full exception: {repr(e)}") continue - + # Only add container info if we have filesystem data if container_info['filesystems']: lxc_health['containers'].append(container_info) logger.debug(f"Added container info for VMID {vmid}") - + logger.debug("=== LXC Storage Check Summary ===") logger.debug(f"Status: {lxc_health['status']}") logger.debug(f"Total containers checked: {len(lxc_health['containers'])}") logger.debug(f"Issues found: {len(lxc_health['issues'])}") logger.debug("=== End LXC Storage Check ===") - + except Exception as e: logger.debug(f"Critical error during LXC storage check: {str(e)}") lxc_health['status'] = 'ERROR' error_msg = f"Error checking LXC storage: {str(e)}" lxc_health['issues'].append(error_msg) - + return lxc_health + def main(): parser = argparse.ArgumentParser(description="System Health Monitor") parser.add_argument( @@ -4003,5 +4016,6 @@ def main(): else: monitor.run() + if __name__ == "__main__": - main() \ No newline at end of file + main()
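Context for the metrics changes above: the '# HELP'/'# TYPE' header lines contain no placeholders, so dropping their f prefix is safe, while sample lines still interpolate a label set. A minimal sketch of that exposition pattern, reusing the labels() shape shown in the patch; the hostname value is made up:

```python
def labels(**kv) -> str:
    pairs = [f'{k}="{v}"' for k, v in kv.items()]
    return '{' + ','.join(pairs) + '}' if pairs else ''

metrics = []
metrics.append('# HELP hwmon_cpu_usage_percent CPU usage percentage')  # constant text: plain string
metrics.append('# TYPE hwmon_cpu_usage_percent gauge')                 # constant text: plain string
metrics.append(f'hwmon_cpu_usage_percent{labels(hostname="node01")} 12.5')  # interpolated: f-string
print('\n'.join(metrics))
# last line printed: hwmon_cpu_usage_percent{hostname="node01"} 12.5
```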