From a9b0fb77f2818e18d29e535223e9b6512707221a Mon Sep 17 00:00:00 2001
From: Jared Vititoe <jjvititoe1@gmail.com>
Date: Fri, 13 Dec 2024 18:36:01 -0500
Subject: [PATCH] Replace memory usage check with ECC error check, centralize config

---
 hwmon.timer    |   1 +
 hwmonDaemon.py | 225 ++++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 178 insertions(+), 48 deletions(-)

diff --git a/hwmon.timer b/hwmon.timer
index a43f36c..d4ac957 100644
--- a/hwmon.timer
+++ b/hwmon.timer
@@ -3,6 +3,7 @@ Description=Run System Health Monitoring Daemon Daily
 
 [Timer]
 OnCalendar=daily
+RandomizedDelaySec=1h
 Persistent=true
 
 [Install]
diff --git a/hwmonDaemon.py b/hwmonDaemon.py
index 8045c40..e171367 100644
--- a/hwmonDaemon.py
+++ b/hwmonDaemon.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re
+import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re, glob
 from typing import Dict, Any, List
 
 # Create a logger
@@ -20,6 +20,49 @@ console_handler.setFormatter(formatter)
 logger.addHandler(console_handler)
 
 class SystemHealthMonitor:
+    PRIORITIES = {  # ticket priority codes, stored as strings
+        'CRITICAL': '1',
+        'HIGH': '2',
+        'MEDIUM': '3',
+        'LOW': '4'
+    }
+    ISSUE_PRIORITIES = {  # priority code assigned to each kind of detected issue
+        'SMART_FAILURE': PRIORITIES['HIGH'],
+        'DISK_CRITICAL': PRIORITIES['HIGH'],
+        'DISK_WARNING': PRIORITIES['MEDIUM'],
+        'UNCORRECTABLE_ECC': PRIORITIES['HIGH'],  # NOTE(review): ticket loop still hard-codes "2" for this case
+        'CORRECTABLE_ECC': PRIORITIES['MEDIUM'],  # NOTE(review): ticket loop still hard-codes "3" for this case
+        'CPU_HIGH': PRIORITIES['MEDIUM'],  # NOTE(review): ticket loop still hard-codes "3" for this case
+        'NETWORK_FAILURE': PRIORITIES['HIGH']
+    }
+    CONFIG = {  # class-level settings read by the health checks and by main()
+        'TICKET_API_URL': 'http://10.10.10.45/create_ticket_api.php',  # same value as the __init__ default
+        'STATE_FILE': '/tmp/last_health_check.json',  # same value as the __init__ default
+        'THRESHOLDS': {
+            'DISK_CRITICAL': 90,  # percent of partition used
+            'DISK_WARNING': 80,  # percent of partition used
+            'CPU_WARNING': 80,  # percent CPU usage
+            'TEMPERATURE_WARNING': 65  # drive temperature in degrees C
+        },
+        'NETWORKS': {
+            'MANAGEMENT': '10.10.10.1',  # address pinged to test the management network
+            'CEPH': '10.10.90.1',  # address pinged to test the Ceph network
+            'PING_TIMEOUT': 1,  # seconds
+            'PING_COUNT': 1  # passed to ping -c
+        }
+    }
+    TICKET_TEMPLATES = {  # bracketed tags for the ticket title plus default ticket fields
+        'ACTION_TYPE': '[auto]',
+        'ENVIRONMENT': '[production]',
+        'TICKET_TYPE': '[maintenance]',
+        'HARDWARE_TYPE': '[hardware]',
+        'NETWORK_TYPE': '[network]',
+        'SCOPE_SINGLE': '[single-node]',
+        'SCOPE_CLUSTER': '[cluster-wide]',
+        'DEFAULT_CATEGORY': 'Hardware',
+        'DEFAULT_ISSUE_TYPE': 'Problem'
+    }
+
     def __init__(self, 
                  ticket_api_url: str = 'http://10.10.10.45/create_ticket_api.php',
                  state_file: str = '/tmp/last_health_check.json',
@@ -34,7 +77,7 @@ class SystemHealthMonitor:
         self.ticket_api_url = ticket_api_url
         self.state_file = state_file
         self.dry_run = dry_run
-    
+
     def run(self):
         """
         Perform a one-shot health check of the system.
@@ -127,33 +170,35 @@ class SystemHealthMonitor:
             return
 
         hostname = socket.gethostname()
-        action_type = "[auto]"
-        environment = "[production]"
-        ticket_type = "[maintenance]"
+        action_type = self.TICKET_TEMPLATES['ACTION_TYPE']
+        environment = self.TICKET_TEMPLATES['ENVIRONMENT']
+        ticket_type = self.TICKET_TEMPLATES['TICKET_TYPE']
 
         for issue in issues:
-            # Set default values
-            priority = ""
-            category = "Hardware"
-            issue_type = "Problem"
-            scope = "[single-node]"
+            priority = self.PRIORITIES['MEDIUM']  # default
+            category = self.TICKET_TEMPLATES['DEFAULT_CATEGORY']
+            issue_type = self.TICKET_TEMPLATES['DEFAULT_ISSUE_TYPE']
+            scope = self.TICKET_TEMPLATES['SCOPE_SINGLE']
 
             if "Disk" in issue:
-                hardware_type = "[hardware]"
+                hardware_type = self.TICKET_TEMPLATES['HARDWARE_TYPE']
                 if "CRITICAL" in issue or "SMART failure" in issue:
-                    priority = "2"
+                    priority = self.ISSUE_PRIORITIES['DISK_CRITICAL']
                 elif "WARNING" in issue:
-                    priority = "3"
-            elif "Memory" in issue:
+                    priority = self.ISSUE_PRIORITIES['DISK_WARNING']
+            elif "Network" in issue:
+                hardware_type = self.TICKET_TEMPLATES['NETWORK_TYPE']
+                priority = self.ISSUE_PRIORITIES['NETWORK_FAILURE']
+                scope = self.TICKET_TEMPLATES['SCOPE_CLUSTER']
+            elif "Uncorrectable ECC" in issue:
+                hardware_type = "[hardware]"
+                priority = "2"
+            elif "Correctable ECC" in issue:
                 hardware_type = "[hardware]"
                 priority = "3"
             elif "CPU" in issue:
                 hardware_type = "[hardware]"
                 priority = "3"
-            elif "Network" in issue:
-                hardware_type = "[network]"
-                priority = "2"
-                scope = "[cluster-wide]"
 
             # Create standardized ticket title
             ticket_title = f"[{hostname}]{action_type}{hardware_type} {issue} {scope}{environment}{ticket_type}"
@@ -212,15 +257,15 @@ class SystemHealthMonitor:
             if partition.get('smart_status') == 'UNHEALTHY':
                 issues.append(f"Disk {partition['mountpoint']} has an unhealthy SMART status")
 
-        # Check for memory-related issues
+        # Check for ECC memory errors
         memory_health = health_report.get('memory_health', {})
-        if memory_health and memory_health.get('memory_percent', 0) > 80:
-            issues.append("Memory usage is above 80%")
+        if memory_health.get('has_ecc') and memory_health.get('ecc_errors'):
+            issues.extend(memory_health['ecc_errors'])
 
         # Check for CPU-related issues
         cpu_health = health_report.get('cpu_health', {})
-        if cpu_health and cpu_health.get('cpu_usage_percent', 0) > 80:
-            issues.append("CPU usage is above 80%")
+        if cpu_health and cpu_health.get('cpu_usage_percent', 0) > self.CONFIG['THRESHOLDS']['CPU_WARNING']:
+            issues.append("CPU usage is above threshold")
 
         # Check for network-related issues
         network_health = health_report.get('network_health', {})
@@ -291,7 +336,7 @@ class SystemHealthMonitor:
                     if len(parts) >= 10:
                         temp = int(parts[9])
                         smart_health['temp'] = temp
-                        if temp > 65:
+                        if temp > self.CONFIG['THRESHOLDS']['TEMPERATURE_WARNING']:
                             smart_health['issues'].append(f"High drive temperature: {temp}°C")
                         
         except Exception as e:
@@ -318,9 +363,9 @@ class SystemHealthMonitor:
                 # Check disk usage
                 usage = psutil.disk_usage(partition.mountpoint)
                 disk_usage_status = 'NORMAL'
-                if usage.percent > 90:
+                if usage.percent > self.CONFIG['THRESHOLDS']['DISK_CRITICAL']:
                     disk_usage_status = 'CRITICAL_HIGH_USAGE'
-                elif usage.percent > 80:
+                elif usage.percent > self.CONFIG['THRESHOLDS']['DISK_WARNING']:
                     disk_usage_status = 'WARNING_HIGH_USAGE'
                     
                 drive_report.update({
@@ -372,19 +417,68 @@ class SystemHealthMonitor:
 
     def _check_memory_usage(self) -> Dict[str, Any]:
         """
-        Check memory usage and return health metrics.
+        Check the EDAC sysfs counters for ECC memory errors.
         
-        :return: Dictionary with memory health metrics.
+        :return: Dictionary with 'has_ecc', 'ecc_errors' (issue strings) and 'status'.
         """
-        memory_info = psutil.virtual_memory()
         memory_health = {
-            'total_memory': self._convert_bytes(memory_info.total),
-            'used_memory': self._convert_bytes(memory_info.used),
-            'memory_percent': memory_info.percent,
-            'status': 'OK' if memory_info.percent < 90 else 'WARNING'
+            'has_ecc': False,
+            'ecc_errors': [],
+            'status': 'OK'
         }
+
+        # Presence of the EDAC memory-controller directory is used as the
+        # indicator that ECC reporting is available; no subprocess is needed.
+        edac_root = '/sys/devices/system/edac/mc'
+
+        try:
+            if os.path.isdir(edac_root):
+                memory_health['has_ecc'] = True
+
+                # Read the per-controller totals (mcX/ue_count, mcX/ce_count)
+                # rather than csrow0 only: the totals cover every row/DIMM, so
+                # errors outside csrow0 are no longer missed. sorted() keeps the
+                # order of reported issues stable between runs.
+                for mc_dir in sorted(glob.glob(f"{edac_root}/mc[0-9]*")):
+                    mc_name = os.path.basename(mc_dir)
+
+                    # Check uncorrectable errors
+                    ue_count = self._read_ecc_count(f"{mc_dir}/ue_count")
+                    if ue_count > 0:
+                        memory_health['status'] = 'CRITICAL'
+                        memory_health['ecc_errors'].append(
+                            f"Uncorrectable ECC errors detected in {mc_name}: {ue_count}"
+                        )
+
+                    # Check correctable errors
+                    ce_count = self._read_ecc_count(f"{mc_dir}/ce_count")
+                    if ce_count > 0:
+                        if memory_health['status'] != 'CRITICAL':
+                            memory_health['status'] = 'WARNING'
+                        memory_health['ecc_errors'].append(
+                            f"Correctable ECC errors detected in {mc_name}: {ce_count}"
+                        )
+
+        except Exception as e:
+            # Record the failure in the report instead of aborting the health check
+            memory_health['status'] = 'ERROR'
+            memory_health['ecc_errors'].append(f"Error checking ECC status: {str(e)}")
+
         return memory_health
 
+    def _read_ecc_count(self, filepath: str) -> int:
+        """
+        Read an ECC error counter from a file.
+
+        :param filepath: Path to the ECC count file
+        :return: Counter value, or 0 if the file is missing, unreadable or not an integer
+        """
+        try:
+            with open(filepath, 'r') as f:
+                return int(f.read().strip())
+        except (OSError, ValueError):  # open/read failure, or content that is not an integer
+            return 0
+
     def _check_cpu_usage(self) -> Dict[str, Any]:
         """
         Check CPU usage and return health metrics.
@@ -394,7 +488,7 @@ class SystemHealthMonitor:
         cpu_usage_percent = psutil.cpu_percent(interval=1)
         cpu_health = {
             'cpu_usage_percent': cpu_usage_percent,
-            'status': 'OK' if cpu_usage_percent < 90 else 'WARNING'
+            'status': 'OK' if cpu_usage_percent < self.CONFIG['THRESHOLDS']['CPU_WARNING'] else 'WARNING'
         }
         return cpu_health
         
@@ -405,30 +499,65 @@ class SystemHealthMonitor:
         :return: Dictionary containing network health metrics and any issues found.
         """
         network_health = {
-            'management_network': {'issues': []},
-            'ceph_network': {'issues': []}
+            'management_network': {
+                'issues': [],
+                'status': 'OK',
+                'latency': None
+            },
+            'ceph_network': {
+                'issues': [],
+                'status': 'OK',
+                'latency': None
+            }
         }
 
         try:
             # Check management network connectivity
-            proc = subprocess.run(["ping", "-c", "1", "10.10.10.1"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-            if proc.returncode != 0:
+            mgmt_result = subprocess.run(
+                [
+                    "ping", 
+                    "-c", str(self.CONFIG['NETWORKS']['PING_COUNT']),
+                    "-W", str(self.CONFIG['NETWORKS']['PING_TIMEOUT']),
+                    self.CONFIG['NETWORKS']['MANAGEMENT']
+                ],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True
+            )
+            
+            if mgmt_result.returncode != 0:
+                network_health['management_network']['status'] = 'CRITICAL'
                 network_health['management_network']['issues'].append(
-                    "Management network is unreachable."
+                    "Management network is unreachable"
                 )
 
             # Check Ceph network connectivity
-            proc = subprocess.run(["ping", "-c", "1", "10.10.90.1"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-            if proc.returncode != 0:
+            ceph_result = subprocess.run(
+                [
+                    "ping",
+                    "-c", str(self.CONFIG['NETWORKS']['PING_COUNT']),
+                    "-W", str(self.CONFIG['NETWORKS']['PING_TIMEOUT']),
+                    self.CONFIG['NETWORKS']['CEPH']
+                ],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True
+            )
+
+            if ceph_result.returncode != 0:
+                network_health['ceph_network']['status'] = 'CRITICAL'
                 network_health['ceph_network']['issues'].append(
-                    "Ceph network is unreachable."
+                    "Ceph network is unreachable"
                 )
 
             return network_health
 
         except Exception as e:
-            print(f"Network health check failed: {e}")
-            return {'error': str(e)}
+            logger.error(f"Network health check failed: {e}")
+            return {
+                'status': 'ERROR',
+                'error': str(e)
+            }
 
 def main():
     try:
@@ -447,9 +576,9 @@ def main():
 
         # Instantiate the SystemHealthMonitor class
         monitor = SystemHealthMonitor(
-            ticket_api_url=ticket_api_url,
-            state_file=state_file,
-            dry_run=args.dry_run  # Pass the dry-run flag
+            ticket_api_url=SystemHealthMonitor.CONFIG['TICKET_API_URL'],
+            state_file=SystemHealthMonitor.CONFIG['STATE_FILE'],
+            dry_run=args.dry_run
         )
 
         # Run the health checks
@@ -476,4 +605,4 @@ if __name__ == "__main__":
     # Set dry-run mode if specified
     dry_run_mode = args.dry_run
 
-    main()
\ No newline at end of file
+    main()