Added ECC support and sorted code

2024-12-13 18:36:01 -05:00
parent 9a4a2dadc9
commit a9b0fb77f2
2 changed files with 178 additions and 48 deletions

(File 1 of 2 — the systemd timer unit; filename not captured in this view)

@@ -3,6 +3,7 @@ Description=Run System Health Monitoring Daemon Daily
 [Timer]
 OnCalendar=daily
+RandomizedDelaySec=1h
 Persistent=true
 [Install]
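For context on the one-line change above: RandomizedDelaySec=1h makes systemd delay each activation by a random interval of up to one hour, so a fleet of nodes running this timer will not all hit the ticket API at the same instant; the existing Persistent=true means a run missed while the machine was off fires at the next boot.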

(File 2 of 2 — the Python health-monitor script; filename not captured in this view)

@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re
+import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re, glob
 from typing import Dict, Any, List
 # Create a logger
@@ -20,6 +20,49 @@ console_handler.setFormatter(formatter)
 logger.addHandler(console_handler)
 class SystemHealthMonitor:
+    PRIORITIES = {
+        'CRITICAL': '1',
+        'HIGH': '2',
+        'MEDIUM': '3',
+        'LOW': '4'
+    }
+    ISSUE_PRIORITIES = {
+        'SMART_FAILURE': PRIORITIES['HIGH'],
+        'DISK_CRITICAL': PRIORITIES['HIGH'],
+        'DISK_WARNING': PRIORITIES['MEDIUM'],
+        'UNCORRECTABLE_ECC': PRIORITIES['HIGH'],
+        'CORRECTABLE_ECC': PRIORITIES['MEDIUM'],
+        'CPU_HIGH': PRIORITIES['MEDIUM'],
+        'NETWORK_FAILURE': PRIORITIES['HIGH']
+    }
+    CONFIG = {
+        'TICKET_API_URL': 'http://10.10.10.45/create_ticket_api.php',
+        'STATE_FILE': '/tmp/last_health_check.json',
+        'THRESHOLDS': {
+            'DISK_CRITICAL': 90,
+            'DISK_WARNING': 80,
+            'CPU_WARNING': 80,
+            'TEMPERATURE_WARNING': 65
+        },
+        'NETWORKS': {
+            'MANAGEMENT': '10.10.10.1',
+            'CEPH': '10.10.90.1',
+            'PING_TIMEOUT': 1,  # seconds
+            'PING_COUNT': 1
+        }
+    }
+    TICKET_TEMPLATES = {
+        'ACTION_TYPE': '[auto]',
+        'ENVIRONMENT': '[production]',
+        'TICKET_TYPE': '[maintenance]',
+        'HARDWARE_TYPE': '[hardware]',
+        'NETWORK_TYPE': '[network]',
+        'SCOPE_SINGLE': '[single-node]',
+        'SCOPE_CLUSTER': '[cluster-wide]',
+        'DEFAULT_CATEGORY': 'Hardware',
+        'DEFAULT_ISSUE_TYPE': 'Problem'
+    }
     def __init__(self,
                  ticket_api_url: str = 'http://10.10.10.45/create_ticket_api.php',
                  state_file: str = '/tmp/last_health_check.json',
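A quick standalone sketch (editor's illustration, not part of the commit) of how the new class-level tables resolve: the class body executes top to bottom, so each ISSUE_PRIORITIES entry is already a concrete priority string by the time the class exists.

    # Minimal sketch of the lookup pattern above; values copied from the diff.
    PRIORITIES = {'CRITICAL': '1', 'HIGH': '2', 'MEDIUM': '3', 'LOW': '4'}
    ISSUE_PRIORITIES = {
        'UNCORRECTABLE_ECC': PRIORITIES['HIGH'],    # resolves to '2'
        'CORRECTABLE_ECC': PRIORITIES['MEDIUM'],    # resolves to '3'
    }
    print(ISSUE_PRIORITIES['UNCORRECTABLE_ECC'])  # -> 2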
@@ -127,33 +170,35 @@ class SystemHealthMonitor:
             return
         hostname = socket.gethostname()
-        action_type = "[auto]"
-        environment = "[production]"
-        ticket_type = "[maintenance]"
+        action_type = self.TICKET_TEMPLATES['ACTION_TYPE']
+        environment = self.TICKET_TEMPLATES['ENVIRONMENT']
+        ticket_type = self.TICKET_TEMPLATES['TICKET_TYPE']
         for issue in issues:
-            # Set default values
-            priority = ""
-            category = "Hardware"
-            issue_type = "Problem"
-            scope = "[single-node]"
+            priority = self.PRIORITIES['MEDIUM']  # default
+            category = self.TICKET_TEMPLATES['DEFAULT_CATEGORY']
+            issue_type = self.TICKET_TEMPLATES['DEFAULT_ISSUE_TYPE']
+            scope = self.TICKET_TEMPLATES['SCOPE_SINGLE']
             if "Disk" in issue:
-                hardware_type = "[hardware]"
+                hardware_type = self.TICKET_TEMPLATES['HARDWARE_TYPE']
                 if "CRITICAL" in issue or "SMART failure" in issue:
-                    priority = "2"
+                    priority = self.ISSUE_PRIORITIES['DISK_CRITICAL']
                 elif "WARNING" in issue:
-                    priority = "3"
-            elif "Memory" in issue:
+                    priority = self.ISSUE_PRIORITIES['DISK_WARNING']
+            elif "Network" in issue:
+                hardware_type = self.TICKET_TEMPLATES['NETWORK_TYPE']
+                priority = self.ISSUE_PRIORITIES['NETWORK_FAILURE']
+                scope = self.TICKET_TEMPLATES['SCOPE_CLUSTER']
+            elif "Uncorrectable ECC" in issue:
+                hardware_type = "[hardware]"
+                priority = "2"
+            elif "Correctable ECC" in issue:
                 hardware_type = "[hardware]"
                 priority = "3"
             elif "CPU" in issue:
                 hardware_type = "[hardware]"
                 priority = "3"
-            elif "Network" in issue:
-                hardware_type = "[network]"
-                priority = "2"
-                scope = "[cluster-wide]"
             # Create standardized ticket title
             ticket_title = f"[{hostname}]{action_type}{hardware_type} {issue} {scope}{environment}{ticket_type}"
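To make the title format concrete, here is what the f-string above produces for a hypothetical host named node01 (hostname assumed for illustration) reporting a correctable-ECC issue:

    hostname = "node01"  # assumed example value
    action_type, hardware_type = "[auto]", "[hardware]"
    issue = "Correctable ECC errors detected in mc0: 3"
    scope, environment, ticket_type = "[single-node]", "[production]", "[maintenance]"
    print(f"[{hostname}]{action_type}{hardware_type} {issue} {scope}{environment}{ticket_type}")
    # [node01][auto][hardware] Correctable ECC errors detected in mc0: 3 [single-node][production][maintenance]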
@@ -212,15 +257,15 @@ class SystemHealthMonitor:
         if partition.get('smart_status') == 'UNHEALTHY':
             issues.append(f"Disk {partition['mountpoint']} has an unhealthy SMART status")
-        # Check for memory-related issues
+        # Check for ECC memory errors
         memory_health = health_report.get('memory_health', {})
-        if memory_health and memory_health.get('memory_percent', 0) > 80:
-            issues.append("Memory usage is above 80%")
+        if memory_health.get('has_ecc') and memory_health.get('ecc_errors'):
+            issues.extend(memory_health['ecc_errors'])
         # Check for CPU-related issues
         cpu_health = health_report.get('cpu_health', {})
-        if cpu_health and cpu_health.get('cpu_usage_percent', 0) > 80:
-            issues.append("CPU usage is above 80%")
+        if cpu_health and cpu_health.get('cpu_usage_percent', 0) > self.CONFIG['THRESHOLDS']['CPU_WARNING']:
+            issues.append("CPU usage is above threshold")
         # Check for network-related issues
         network_health = health_report.get('network_health', {})
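The new memory branch above expects memory_health in the shape that _check_memory_usage (further down in this diff) returns. A sketch of that flow, with assumed example values:

    # Assumed example report; each ECC message becomes its own ticketable issue.
    memory_health = {
        'has_ecc': True,
        'status': 'WARNING',
        'ecc_errors': ["Correctable ECC errors detected in mc0: 12"],
    }
    issues = []
    if memory_health.get('has_ecc') and memory_health.get('ecc_errors'):
        issues.extend(memory_health['ecc_errors'])  # extend, not append: one issue per message
    print(issues)  # ['Correctable ECC errors detected in mc0: 12']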
@@ -291,7 +336,7 @@ class SystemHealthMonitor:
                 if len(parts) >= 10:
                     temp = int(parts[9])
                     smart_health['temp'] = temp
-                    if temp > 65:
+                    if temp > self.CONFIG['THRESHOLDS']['TEMPERATURE_WARNING']:
                         smart_health['issues'].append(f"High drive temperature: {temp}°C")
         except Exception as e:
@@ -318,9 +363,9 @@ class SystemHealthMonitor:
             # Check disk usage
             usage = psutil.disk_usage(partition.mountpoint)
             disk_usage_status = 'NORMAL'
-            if usage.percent > 90:
+            if usage.percent > self.CONFIG['THRESHOLDS']['DISK_CRITICAL']:
                 disk_usage_status = 'CRITICAL_HIGH_USAGE'
-            elif usage.percent > 80:
+            elif usage.percent > self.CONFIG['THRESHOLDS']['DISK_WARNING']:
                 disk_usage_status = 'WARNING_HIGH_USAGE'
             drive_report.update({
@@ -372,19 +417,68 @@ class SystemHealthMonitor:
     def _check_memory_usage(self) -> Dict[str, Any]:
         """
-        Check memory usage and return health metrics.
-        :return: Dictionary with memory health metrics.
+        Check for ECC memory errors if ECC memory is present.
+        :return: Dictionary with memory health metrics and ECC status.
         """
-        memory_info = psutil.virtual_memory()
         memory_health = {
-            'total_memory': self._convert_bytes(memory_info.total),
-            'used_memory': self._convert_bytes(memory_info.used),
-            'memory_percent': memory_info.percent,
-            'status': 'OK' if memory_info.percent < 90 else 'WARNING'
+            'has_ecc': False,
+            'ecc_errors': [],
+            'status': 'OK'
         }
+        try:
+            # Check if ECC memory is present by looking at edac_mc
+            result = subprocess.run(
+                ['ls', '/sys/devices/system/edac/mc'],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True
+            )
+            if result.returncode == 0:
+                memory_health['has_ecc'] = True
+                # Check for ECC errors in mcX/csrowY/ue_count and ce_count files
+                for mc_dir in glob.glob('/sys/devices/system/edac/mc/mc[0-9]*'):
+                    mc_name = os.path.basename(mc_dir)
+                    # Check uncorrectable errors
+                    ue_count = self._read_ecc_count(f"{mc_dir}/csrow0/ue_count")
+                    if ue_count > 0:
+                        memory_health['status'] = 'CRITICAL'
+                        memory_health['ecc_errors'].append(
+                            f"Uncorrectable ECC errors detected in {mc_name}: {ue_count}"
+                        )
+                    # Check correctable errors
+                    ce_count = self._read_ecc_count(f"{mc_dir}/csrow0/ce_count")
+                    if ce_count > 0:
+                        if memory_health['status'] != 'CRITICAL':
+                            memory_health['status'] = 'WARNING'
+                        memory_health['ecc_errors'].append(
+                            f"Correctable ECC errors detected in {mc_name}: {ce_count}"
+                        )
+        except Exception as e:
+            memory_health['status'] = 'ERROR'
+            memory_health['ecc_errors'].append(f"Error checking ECC status: {str(e)}")
         return memory_health
+    def _read_ecc_count(self, filepath: str) -> int:
+        """
+        Read ECC error count from a file.
+        :param filepath: Path to the ECC count file
+        :return: Number of ECC errors
+        """
+        try:
+            with open(filepath, 'r') as f:
+                return int(f.read().strip())
+        except:
+            return 0
     def _check_cpu_usage(self) -> Dict[str, Any]:
         """
         Check CPU usage and return health metrics.
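One caveat on the ECC check above: it reads counters only from csrow0 of each memory controller. EDAC typically exposes one csrowN directory per chip-select row (and newer kernels report per-DIMM dimm*/rank* entries instead), so rows beyond the first are not inspected. A standalone sketch of a fuller sweep, under that sysfs-layout assumption (not part of the commit):

    import glob

    def read_count(path: str) -> int:
        # Counter files hold a single integer; treat unreadable files as zero.
        try:
            with open(path) as f:
                return int(f.read().strip())
        except (OSError, ValueError):
            return 0

    # [uc]e_count matches both ue_count (uncorrectable) and ce_count (correctable).
    for counter in glob.glob('/sys/devices/system/edac/mc/mc*/csrow*/[uc]e_count'):
        print(counter, read_count(counter))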
@@ -394,7 +488,7 @@ class SystemHealthMonitor:
         cpu_usage_percent = psutil.cpu_percent(interval=1)
         cpu_health = {
             'cpu_usage_percent': cpu_usage_percent,
-            'status': 'OK' if cpu_usage_percent < 90 else 'WARNING'
+            'status': 'OK' if cpu_usage_percent < self.CONFIG['THRESHOLDS']['CPU_WARNING'] else 'WARNING'
         }
         return cpu_health
@@ -405,30 +499,65 @@ class SystemHealthMonitor:
         :return: Dictionary containing network health metrics and any issues found.
         """
         network_health = {
-            'management_network': {'issues': []},
-            'ceph_network': {'issues': []}
+            'management_network': {
+                'issues': [],
+                'status': 'OK',
+                'latency': None
+            },
+            'ceph_network': {
+                'issues': [],
+                'status': 'OK',
+                'latency': None
+            }
         }
         try:
             # Check management network connectivity
-            proc = subprocess.run(["ping", "-c", "1", "10.10.10.1"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-            if proc.returncode != 0:
+            mgmt_result = subprocess.run(
+                [
+                    "ping",
+                    "-c", str(self.CONFIG['NETWORKS']['PING_COUNT']),
+                    "-W", str(self.CONFIG['NETWORKS']['PING_TIMEOUT']),
+                    self.CONFIG['NETWORKS']['MANAGEMENT']
+                ],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True
+            )
+            if mgmt_result.returncode != 0:
+                network_health['management_network']['status'] = 'CRITICAL'
                 network_health['management_network']['issues'].append(
-                    "Management network is unreachable."
+                    "Management network is unreachable"
                 )
             # Check Ceph network connectivity
-            proc = subprocess.run(["ping", "-c", "1", "10.10.90.1"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-            if proc.returncode != 0:
+            ceph_result = subprocess.run(
+                [
+                    "ping",
+                    "-c", str(self.CONFIG['NETWORKS']['PING_COUNT']),
+                    "-W", str(self.CONFIG['NETWORKS']['PING_TIMEOUT']),
+                    self.CONFIG['NETWORKS']['CEPH']
+                ],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True
+            )
+            if ceph_result.returncode != 0:
+                network_health['ceph_network']['status'] = 'CRITICAL'
                 network_health['ceph_network']['issues'].append(
-                    "Ceph network is unreachable."
+                    "Ceph network is unreachable"
                 )
             return network_health
         except Exception as e:
-            print(f"Network health check failed: {e}")
-            return {'error': str(e)}
+            logger.error(f"Network health check failed: {e}")
+            return {
+                'status': 'ERROR',
+                'error': str(e)
+            }
 def main():
     try:
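On the ping flags used above: with Linux iputils ping, -c sets the probe count and -W the per-reply timeout in seconds, so PING_COUNT=1 with PING_TIMEOUT=1 fails fast instead of hanging on a dead network. A minimal standalone version of the same check (editor's sketch, not part of the commit):

    import subprocess

    def host_reachable(addr: str, timeout_s: int = 1) -> bool:
        # True when a single ICMP echo gets a reply within timeout_s seconds.
        result = subprocess.run(
            ["ping", "-c", "1", "-W", str(timeout_s), addr],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        return result.returncode == 0

    print(host_reachable("10.10.10.1"))  # management gateway from CONFIG above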
@@ -447,9 +576,9 @@ def main():
         # Instantiate the SystemHealthMonitor class
         monitor = SystemHealthMonitor(
-            ticket_api_url=ticket_api_url,
-            state_file=state_file,
-            dry_run=args.dry_run  # Pass the dry-run flag
+            ticket_api_url=SystemHealthMonitor.CONFIG['TICKET_API_URL'],
+            state_file=SystemHealthMonitor.CONFIG['STATE_FILE'],
+            dry_run=args.dry_run
         )
         # Run the health checks
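After this change, main() sources the API URL and state-file path from SystemHealthMonitor.CONFIG rather than local variables, so those values are fixed at class level while --dry-run remains a runtime flag. A dry run (script filename assumed for illustration; it is not shown in this diff) would look like: python3 system_health_monitor.py --dry-run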