From a9b0fb77f2818e18d29e535223e9b6512707221a Mon Sep 17 00:00:00 2001 From: Jared Vititoe Date: Fri, 13 Dec 2024 18:36:01 -0500 Subject: [PATCH] Added ECC support and sorted code --- hwmon.timer | 1 + hwmonDaemon.py | 225 ++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 178 insertions(+), 48 deletions(-) diff --git a/hwmon.timer b/hwmon.timer index a43f36c..d4ac957 100644 --- a/hwmon.timer +++ b/hwmon.timer @@ -3,6 +3,7 @@ Description=Run System Health Monitoring Daemon Daily [Timer] OnCalendar=daily +RandomizedDelaySec=1h Persistent=true [Install] diff --git a/hwmonDaemon.py b/hwmonDaemon.py index 8045c40..e171367 100644 --- a/hwmonDaemon.py +++ b/hwmonDaemon.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re +import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re, glob from typing import Dict, Any, List # Create a logger @@ -20,6 +20,49 @@ console_handler.setFormatter(formatter) logger.addHandler(console_handler) class SystemHealthMonitor: + PRIORITIES = { + 'CRITICAL': '1', + 'HIGH': '2', + 'MEDIUM': '3', + 'LOW': '4' + } + ISSUE_PRIORITIES = { + 'SMART_FAILURE': PRIORITIES['HIGH'], + 'DISK_CRITICAL': PRIORITIES['HIGH'], + 'DISK_WARNING': PRIORITIES['MEDIUM'], + 'UNCORRECTABLE_ECC': PRIORITIES['HIGH'], + 'CORRECTABLE_ECC': PRIORITIES['MEDIUM'], + 'CPU_HIGH': PRIORITIES['MEDIUM'], + 'NETWORK_FAILURE': PRIORITIES['HIGH'] + } + CONFIG = { + 'TICKET_API_URL': 'http://10.10.10.45/create_ticket_api.php', + 'STATE_FILE': '/tmp/last_health_check.json', + 'THRESHOLDS': { + 'DISK_CRITICAL': 90, + 'DISK_WARNING': 80, + 'CPU_WARNING': 80, + 'TEMPERATURE_WARNING': 65 + }, + 'NETWORKS': { + 'MANAGEMENT': '10.10.10.1', + 'CEPH': '10.10.90.1', + 'PING_TIMEOUT': 1, # seconds + 'PING_COUNT': 1 + } + } + TICKET_TEMPLATES = { + 'ACTION_TYPE': '[auto]', + 'ENVIRONMENT': '[production]', + 'TICKET_TYPE': '[maintenance]', + 'HARDWARE_TYPE': '[hardware]', + 'NETWORK_TYPE': '[network]', + 'SCOPE_SINGLE': '[single-node]', + 'SCOPE_CLUSTER': '[cluster-wide]', + 'DEFAULT_CATEGORY': 'Hardware', + 'DEFAULT_ISSUE_TYPE': 'Problem' + } + def __init__(self, ticket_api_url: str = 'http://10.10.10.45/create_ticket_api.php', state_file: str = '/tmp/last_health_check.json', @@ -34,7 +77,7 @@ class SystemHealthMonitor: self.ticket_api_url = ticket_api_url self.state_file = state_file self.dry_run = dry_run - + def run(self): """ Perform a one-shot health check of the system. @@ -127,33 +170,35 @@ class SystemHealthMonitor: return hostname = socket.gethostname() - action_type = "[auto]" - environment = "[production]" - ticket_type = "[maintenance]" + action_type = self.TICKET_TEMPLATES['ACTION_TYPE'] + environment = self.TICKET_TEMPLATES['ENVIRONMENT'] + ticket_type = self.TICKET_TEMPLATES['TICKET_TYPE'] for issue in issues: - # Set default values - priority = "" - category = "Hardware" - issue_type = "Problem" - scope = "[single-node]" + priority = self.PRIORITIES['MEDIUM'] # default + category = self.TICKET_TEMPLATES['DEFAULT_CATEGORY'] + issue_type = self.TICKET_TEMPLATES['DEFAULT_ISSUE_TYPE'] + scope = self.TICKET_TEMPLATES['SCOPE_SINGLE'] if "Disk" in issue: - hardware_type = "[hardware]" + hardware_type = self.TICKET_TEMPLATES['HARDWARE_TYPE'] if "CRITICAL" in issue or "SMART failure" in issue: - priority = "2" + priority = self.ISSUE_PRIORITIES['DISK_CRITICAL'] elif "WARNING" in issue: - priority = "3" - elif "Memory" in issue: + priority = self.ISSUE_PRIORITIES['DISK_WARNING'] + elif "Network" in issue: + hardware_type = self.TICKET_TEMPLATES['NETWORK_TYPE'] + priority = self.ISSUE_PRIORITIES['NETWORK_FAILURE'] + scope = self.TICKET_TEMPLATES['SCOPE_CLUSTER'] + elif "Uncorrectable ECC" in issue: + hardware_type = "[hardware]" + priority = "2" + elif "Correctable ECC" in issue: hardware_type = "[hardware]" priority = "3" elif "CPU" in issue: hardware_type = "[hardware]" priority = "3" - elif "Network" in issue: - hardware_type = "[network]" - priority = "2" - scope = "[cluster-wide]" # Create standardized ticket title ticket_title = f"[{hostname}]{action_type}{hardware_type} {issue} {scope}{environment}{ticket_type}" @@ -212,15 +257,15 @@ class SystemHealthMonitor: if partition.get('smart_status') == 'UNHEALTHY': issues.append(f"Disk {partition['mountpoint']} has an unhealthy SMART status") - # Check for memory-related issues + # Check for ECC memory errors memory_health = health_report.get('memory_health', {}) - if memory_health and memory_health.get('memory_percent', 0) > 80: - issues.append("Memory usage is above 80%") + if memory_health.get('has_ecc') and memory_health.get('ecc_errors'): + issues.extend(memory_health['ecc_errors']) # Check for CPU-related issues cpu_health = health_report.get('cpu_health', {}) - if cpu_health and cpu_health.get('cpu_usage_percent', 0) > 80: - issues.append("CPU usage is above 80%") + if cpu_health and cpu_health.get('cpu_usage_percent', 0) > self.CONFIG['THRESHOLDS']['CPU_WARNING']: + issues.append("CPU usage is above threshold") # Check for network-related issues network_health = health_report.get('network_health', {}) @@ -291,7 +336,7 @@ class SystemHealthMonitor: if len(parts) >= 10: temp = int(parts[9]) smart_health['temp'] = temp - if temp > 65: + if temp > self.CONFIG['THRESHOLDS']['TEMPERATURE_WARNING']: smart_health['issues'].append(f"High drive temperature: {temp}°C") except Exception as e: @@ -318,9 +363,9 @@ class SystemHealthMonitor: # Check disk usage usage = psutil.disk_usage(partition.mountpoint) disk_usage_status = 'NORMAL' - if usage.percent > 90: + if usage.percent > self.CONFIG['THRESHOLDS']['DISK_CRITICAL']: disk_usage_status = 'CRITICAL_HIGH_USAGE' - elif usage.percent > 80: + elif usage.percent > self.CONFIG['THRESHOLDS']['DISK_WARNING']: disk_usage_status = 'WARNING_HIGH_USAGE' drive_report.update({ @@ -372,19 +417,68 @@ class SystemHealthMonitor: def _check_memory_usage(self) -> Dict[str, Any]: """ - Check memory usage and return health metrics. + Check for ECC memory errors if ECC memory is present. - :return: Dictionary with memory health metrics. + :return: Dictionary with memory health metrics and ECC status. """ - memory_info = psutil.virtual_memory() memory_health = { - 'total_memory': self._convert_bytes(memory_info.total), - 'used_memory': self._convert_bytes(memory_info.used), - 'memory_percent': memory_info.percent, - 'status': 'OK' if memory_info.percent < 90 else 'WARNING' + 'has_ecc': False, + 'ecc_errors': [], + 'status': 'OK' } + + try: + # Check if ECC memory is present by looking at edac_mc + result = subprocess.run( + ['ls', '/sys/devices/system/edac/mc'], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True + ) + + if result.returncode == 0: + memory_health['has_ecc'] = True + + # Check for ECC errors in mcX/csrowY/ue_count and ce_count files + for mc_dir in glob.glob('/sys/devices/system/edac/mc/mc[0-9]*'): + mc_name = os.path.basename(mc_dir) + + # Check uncorrectable errors + ue_count = self._read_ecc_count(f"{mc_dir}/csrow0/ue_count") + if ue_count > 0: + memory_health['status'] = 'CRITICAL' + memory_health['ecc_errors'].append( + f"Uncorrectable ECC errors detected in {mc_name}: {ue_count}" + ) + + # Check correctable errors + ce_count = self._read_ecc_count(f"{mc_dir}/csrow0/ce_count") + if ce_count > 0: + if memory_health['status'] != 'CRITICAL': + memory_health['status'] = 'WARNING' + memory_health['ecc_errors'].append( + f"Correctable ECC errors detected in {mc_name}: {ce_count}" + ) + + except Exception as e: + memory_health['status'] = 'ERROR' + memory_health['ecc_errors'].append(f"Error checking ECC status: {str(e)}") + return memory_health + def _read_ecc_count(self, filepath: str) -> int: + """ + Read ECC error count from a file. + + :param filepath: Path to the ECC count file + :return: Number of ECC errors + """ + try: + with open(filepath, 'r') as f: + return int(f.read().strip()) + except: + return 0 + def _check_cpu_usage(self) -> Dict[str, Any]: """ Check CPU usage and return health metrics. @@ -394,7 +488,7 @@ class SystemHealthMonitor: cpu_usage_percent = psutil.cpu_percent(interval=1) cpu_health = { 'cpu_usage_percent': cpu_usage_percent, - 'status': 'OK' if cpu_usage_percent < 90 else 'WARNING' + 'status': 'OK' if cpu_usage_percent < self.CONFIG['THRESHOLDS']['CPU_WARNING'] else 'WARNING' } return cpu_health @@ -405,30 +499,65 @@ class SystemHealthMonitor: :return: Dictionary containing network health metrics and any issues found. """ network_health = { - 'management_network': {'issues': []}, - 'ceph_network': {'issues': []} + 'management_network': { + 'issues': [], + 'status': 'OK', + 'latency': None + }, + 'ceph_network': { + 'issues': [], + 'status': 'OK', + 'latency': None + } } try: # Check management network connectivity - proc = subprocess.run(["ping", "-c", "1", "10.10.10.1"], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - if proc.returncode != 0: + mgmt_result = subprocess.run( + [ + "ping", + "-c", str(self.CONFIG['NETWORKS']['PING_COUNT']), + "-W", str(self.CONFIG['NETWORKS']['PING_TIMEOUT']), + self.CONFIG['NETWORKS']['MANAGEMENT'] + ], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True + ) + + if mgmt_result.returncode != 0: + network_health['management_network']['status'] = 'CRITICAL' network_health['management_network']['issues'].append( - "Management network is unreachable." + "Management network is unreachable" ) # Check Ceph network connectivity - proc = subprocess.run(["ping", "-c", "1", "10.10.90.1"], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - if proc.returncode != 0: + ceph_result = subprocess.run( + [ + "ping", + "-c", str(self.CONFIG['NETWORKS']['PING_COUNT']), + "-W", str(self.CONFIG['NETWORKS']['PING_TIMEOUT']), + self.CONFIG['NETWORKS']['CEPH'] + ], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True + ) + + if ceph_result.returncode != 0: + network_health['ceph_network']['status'] = 'CRITICAL' network_health['ceph_network']['issues'].append( - "Ceph network is unreachable." + "Ceph network is unreachable" ) return network_health except Exception as e: - print(f"Network health check failed: {e}") - return {'error': str(e)} + logger.error(f"Network health check failed: {e}") + return { + 'status': 'ERROR', + 'error': str(e) + } def main(): try: @@ -447,9 +576,9 @@ def main(): # Instantiate the SystemHealthMonitor class monitor = SystemHealthMonitor( - ticket_api_url=ticket_api_url, - state_file=state_file, - dry_run=args.dry_run # Pass the dry-run flag + ticket_api_url=SystemHealthMonitor.CONFIG['TICKET_API_URL'], + state_file=SystemHealthMonitor.CONFIG['STATE_FILE'], + dry_run=args.dry_run ) # Run the health checks @@ -476,4 +605,4 @@ if __name__ == "__main__": # Set dry-run mode if specified dry_run_mode = args.dry_run - main() \ No newline at end of file + main()