Added ECC memory error monitoring and tidied hard-coded settings into class constants

2024-12-13 18:36:01 -05:00
parent 9a4a2dadc9
commit a9b0fb77f2
2 changed files with 178 additions and 48 deletions


@@ -1,5 +1,5 @@
#!/usr/bin/env python3
import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re
import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re, glob
from typing import Dict, Any, List
# Create a logger
@@ -20,6 +20,49 @@ console_handler.setFormatter(formatter)
logger.addHandler(console_handler)
class SystemHealthMonitor:
PRIORITIES = {
'CRITICAL': '1',
'HIGH': '2',
'MEDIUM': '3',
'LOW': '4'
}
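    # Numeric strings, '1' = most urgent; assumed to be the priority codes
    # that create_ticket_api.php expects (not verified against the API).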
ISSUE_PRIORITIES = {
'SMART_FAILURE': PRIORITIES['HIGH'],
'DISK_CRITICAL': PRIORITIES['HIGH'],
'DISK_WARNING': PRIORITIES['MEDIUM'],
'UNCORRECTABLE_ECC': PRIORITIES['HIGH'],
'CORRECTABLE_ECC': PRIORITIES['MEDIUM'],
'CPU_HIGH': PRIORITIES['MEDIUM'],
'NETWORK_FAILURE': PRIORITIES['HIGH']
}
CONFIG = {
'TICKET_API_URL': 'http://10.10.10.45/create_ticket_api.php',
'STATE_FILE': '/tmp/last_health_check.json',
        'THRESHOLDS': {
            'DISK_CRITICAL': 90,        # percent of filesystem used
            'DISK_WARNING': 80,         # percent of filesystem used
            'CPU_WARNING': 80,          # percent CPU utilization
            'TEMPERATURE_WARNING': 65   # drive temperature in °C
        },
'NETWORKS': {
'MANAGEMENT': '10.10.10.1',
'CEPH': '10.10.90.1',
'PING_TIMEOUT': 1, # seconds
'PING_COUNT': 1
}
}
TICKET_TEMPLATES = {
'ACTION_TYPE': '[auto]',
'ENVIRONMENT': '[production]',
'TICKET_TYPE': '[maintenance]',
'HARDWARE_TYPE': '[hardware]',
'NETWORK_TYPE': '[network]',
'SCOPE_SINGLE': '[single-node]',
'SCOPE_CLUSTER': '[cluster-wide]',
'DEFAULT_CATEGORY': 'Hardware',
'DEFAULT_ISSUE_TYPE': 'Problem'
}
def __init__(self,
                 ticket_api_url: str = CONFIG['TICKET_API_URL'],
                 state_file: str = CONFIG['STATE_FILE'],
@@ -34,7 +77,7 @@ class SystemHealthMonitor:
self.ticket_api_url = ticket_api_url
self.state_file = state_file
self.dry_run = dry_run
def run(self):
"""
Perform a one-shot health check of the system.
@@ -127,33 +170,35 @@ class SystemHealthMonitor:
return
hostname = socket.gethostname()
action_type = "[auto]"
environment = "[production]"
ticket_type = "[maintenance]"
action_type = self.TICKET_TEMPLATES['ACTION_TYPE']
environment = self.TICKET_TEMPLATES['ENVIRONMENT']
ticket_type = self.TICKET_TEMPLATES['TICKET_TYPE']
for issue in issues:
# Set default values
priority = ""
category = "Hardware"
issue_type = "Problem"
scope = "[single-node]"
            priority = self.PRIORITIES['MEDIUM']  # default
            category = self.TICKET_TEMPLATES['DEFAULT_CATEGORY']
            issue_type = self.TICKET_TEMPLATES['DEFAULT_ISSUE_TYPE']
            scope = self.TICKET_TEMPLATES['SCOPE_SINGLE']
            hardware_type = self.TICKET_TEMPLATES['HARDWARE_TYPE']  # default, so it is always bound
if "Disk" in issue:
hardware_type = "[hardware]"
hardware_type = self.TICKET_TEMPLATES['HARDWARE_TYPE']
if "CRITICAL" in issue or "SMART failure" in issue:
priority = "2"
priority = self.ISSUE_PRIORITIES['DISK_CRITICAL']
elif "WARNING" in issue:
priority = "3"
elif "Memory" in issue:
priority = self.ISSUE_PRIORITIES['DISK_WARNING']
elif "Network" in issue:
hardware_type = self.TICKET_TEMPLATES['NETWORK_TYPE']
priority = self.ISSUE_PRIORITIES['NETWORK_FAILURE']
scope = self.TICKET_TEMPLATES['SCOPE_CLUSTER']
elif "Uncorrectable ECC" in issue:
hardware_type = "[hardware]"
priority = "2"
elif "Correctable ECC" in issue:
hardware_type = "[hardware]"
priority = "3"
elif "CPU" in issue:
hardware_type = "[hardware]"
priority = "3"
elif "Network" in issue:
hardware_type = "[network]"
priority = "2"
scope = "[cluster-wide]"
# Create standardized ticket title
ticket_title = f"[{hostname}]{action_type}{hardware_type} {issue} {scope}{environment}{ticket_type}"
@@ -212,15 +257,15 @@ class SystemHealthMonitor:
if partition.get('smart_status') == 'UNHEALTHY':
issues.append(f"Disk {partition['mountpoint']} has an unhealthy SMART status")
# Check for memory-related issues
# Check for ECC memory errors
memory_health = health_report.get('memory_health', {})
if memory_health and memory_health.get('memory_percent', 0) > 80:
issues.append("Memory usage is above 80%")
if memory_health.get('has_ecc') and memory_health.get('ecc_errors'):
issues.extend(memory_health['ecc_errors'])
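        # The ECC strings contain "Uncorrectable ECC"/"Correctable ECC", which is
        # exactly what the ticket-creation logic keys on to assign a priority.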
# Check for CPU-related issues
cpu_health = health_report.get('cpu_health', {})
if cpu_health and cpu_health.get('cpu_usage_percent', 0) > 80:
issues.append("CPU usage is above 80%")
        if cpu_health and cpu_health.get('cpu_usage_percent', 0) > self.CONFIG['THRESHOLDS']['CPU_WARNING']:
            issues.append(f"CPU usage is above {self.CONFIG['THRESHOLDS']['CPU_WARNING']}%")
# Check for network-related issues
network_health = health_report.get('network_health', {})
@@ -291,7 +336,7 @@ class SystemHealthMonitor:
if len(parts) >= 10:
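                        # parts[9] is presumably the RAW_VALUE column of a
                        # `smartctl -A` attribute row (e.g. 194 Temperature_Celsius)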
temp = int(parts[9])
smart_health['temp'] = temp
if temp > 65:
if temp > self.CONFIG['THRESHOLDS']['TEMPERATURE_WARNING']:
smart_health['issues'].append(f"High drive temperature: {temp}°C")
except Exception as e:
@@ -318,9 +363,9 @@ class SystemHealthMonitor:
# Check disk usage
usage = psutil.disk_usage(partition.mountpoint)
disk_usage_status = 'NORMAL'
if usage.percent > 90:
if usage.percent > self.CONFIG['THRESHOLDS']['DISK_CRITICAL']:
disk_usage_status = 'CRITICAL_HIGH_USAGE'
elif usage.percent > 80:
elif usage.percent > self.CONFIG['THRESHOLDS']['DISK_WARNING']:
disk_usage_status = 'WARNING_HIGH_USAGE'
drive_report.update({
@@ -372,19 +417,68 @@ class SystemHealthMonitor:
def _check_memory_usage(self) -> Dict[str, Any]:
"""
Check memory usage and return health metrics.
Check for ECC memory errors if ECC memory is present.
:return: Dictionary with memory health metrics.
:return: Dictionary with memory health metrics and ECC status.
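        Illustrative shape of the return value (exact size strings depend on
        _convert_bytes):
            {'total_memory': '64.0 GB', 'used_memory': '12.3 GB',
             'memory_percent': 19.2, 'has_ecc': True,
             'ecc_errors': [], 'status': 'OK'}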
"""
memory_info = psutil.virtual_memory()
memory_health = {
'total_memory': self._convert_bytes(memory_info.total),
'used_memory': self._convert_bytes(memory_info.used),
'memory_percent': memory_info.percent,
'status': 'OK' if memory_info.percent < 90 else 'WARNING'
'has_ecc': False,
'ecc_errors': [],
'status': 'OK'
}
        try:
            # ECC presence is reported through the kernel EDAC subsystem: each
            # memory controller appears as /sys/devices/system/edac/mc/mc0, mc1, ...
            mc_dirs = glob.glob('/sys/devices/system/edac/mc/mc[0-9]*')
            if mc_dirs:
                memory_health['has_ecc'] = True
                # Check ue_count/ce_count under every csrow of each controller
                for mc_dir in mc_dirs:
mc_name = os.path.basename(mc_dir)
                    # Sum uncorrectable errors across every csrow of this controller
                    ue_count = sum(self._read_ecc_count(f"{csrow}/ue_count")
                                   for csrow in glob.glob(f"{mc_dir}/csrow[0-9]*"))
if ue_count > 0:
memory_health['status'] = 'CRITICAL'
memory_health['ecc_errors'].append(
f"Uncorrectable ECC errors detected in {mc_name}: {ue_count}"
)
                    # Sum correctable errors across every csrow of this controller
                    ce_count = sum(self._read_ecc_count(f"{csrow}/ce_count")
                                   for csrow in glob.glob(f"{mc_dir}/csrow[0-9]*"))
if ce_count > 0:
if memory_health['status'] != 'CRITICAL':
memory_health['status'] = 'WARNING'
memory_health['ecc_errors'].append(
f"Correctable ECC errors detected in {mc_name}: {ce_count}"
)
except Exception as e:
memory_health['status'] = 'ERROR'
memory_health['ecc_errors'].append(f"Error checking ECC status: {str(e)}")
return memory_health
def _read_ecc_count(self, filepath: str) -> int:
"""
Read ECC error count from a file.
:param filepath: Path to the ECC count file
:return: Number of ECC errors
"""
try:
with open(filepath, 'r') as f:
return int(f.read().strip())
        except (OSError, ValueError):
return 0
def _check_cpu_usage(self) -> Dict[str, Any]:
"""
Check CPU usage and return health metrics.
@@ -394,7 +488,7 @@ class SystemHealthMonitor:
cpu_usage_percent = psutil.cpu_percent(interval=1)
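        # interval=1 blocks for one second and compares CPU times across that
        # window, which is more reliable than a non-blocking instantaneous sample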
cpu_health = {
'cpu_usage_percent': cpu_usage_percent,
'status': 'OK' if cpu_usage_percent < 90 else 'WARNING'
'status': 'OK' if cpu_usage_percent < self.CONFIG['THRESHOLDS']['CPU_WARNING'] else 'WARNING'
}
return cpu_health
@@ -405,30 +499,65 @@ class SystemHealthMonitor:
:return: Dictionary containing network health metrics and any issues found.
"""
network_health = {
'management_network': {'issues': []},
'ceph_network': {'issues': []}
'management_network': {
'issues': [],
'status': 'OK',
'latency': None
},
'ceph_network': {
'issues': [],
'status': 'OK',
'latency': None
}
}
try:
# Check management network connectivity
proc = subprocess.run(["ping", "-c", "1", "10.10.10.1"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if proc.returncode != 0:
mgmt_result = subprocess.run(
[
"ping",
"-c", str(self.CONFIG['NETWORKS']['PING_COUNT']),
"-W", str(self.CONFIG['NETWORKS']['PING_TIMEOUT']),
self.CONFIG['NETWORKS']['MANAGEMENT']
],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True
)
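            # NB: ping's -W is the reply timeout in whole seconds with Linux
            # iputils; other ping implementations interpret -W differently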
            if mgmt_result.returncode != 0:
                network_health['management_network']['status'] = 'CRITICAL'
                network_health['management_network']['issues'].append(
                    "Management network is unreachable"
                )
            else:
                # Populate the declared 'latency' field from the ping output
                match = re.search(r'time=([\d.]+) ms', mgmt_result.stdout)
                if match:
                    network_health['management_network']['latency'] = float(match.group(1))
# Check Ceph network connectivity
proc = subprocess.run(["ping", "-c", "1", "10.10.90.1"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if proc.returncode != 0:
ceph_result = subprocess.run(
[
"ping",
"-c", str(self.CONFIG['NETWORKS']['PING_COUNT']),
"-W", str(self.CONFIG['NETWORKS']['PING_TIMEOUT']),
self.CONFIG['NETWORKS']['CEPH']
],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True
)
            if ceph_result.returncode != 0:
                network_health['ceph_network']['status'] = 'CRITICAL'
                network_health['ceph_network']['issues'].append(
                    "Ceph network is unreachable"
                )
            else:
                match = re.search(r'time=([\d.]+) ms', ceph_result.stdout)
                if match:
                    network_health['ceph_network']['latency'] = float(match.group(1))
return network_health
except Exception as e:
print(f"Network health check failed: {e}")
return {'error': str(e)}
logger.error(f"Network health check failed: {e}")
return {
'status': 'ERROR',
'error': str(e)
}
def main():
try:
@@ -447,9 +576,9 @@ def main():
# Instantiate the SystemHealthMonitor class
monitor = SystemHealthMonitor(
ticket_api_url=ticket_api_url,
state_file=state_file,
dry_run=args.dry_run # Pass the dry-run flag
ticket_api_url=SystemHealthMonitor.CONFIG['TICKET_API_URL'],
state_file=SystemHealthMonitor.CONFIG['STATE_FILE'],
dry_run=args.dry_run
)
# Run the health checks
@@ -476,4 +605,4 @@ if __name__ == "__main__":
# Set dry-run mode if specified
dry_run_mode = args.dry_run
    main()