From 8e5cda287d1d8a4390791f737a6332138db9c565 Mon Sep 17 00:00:00 2001
From: Jared Vititoe
Date: Wed, 4 Dec 2024 20:46:35 -0500
Subject: [PATCH] Creation of hwmonDaemon and service files

---
 hwmon.service  |  13 ++
 hwmon.timer    |   9 ++
 hwmonDaemon.py | 345 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 367 insertions(+)
 create mode 100644 hwmon.service
 create mode 100644 hwmon.timer
 create mode 100644 hwmonDaemon.py

diff --git a/hwmon.service b/hwmon.service
new file mode 100644
index 0000000..5970b93
--- /dev/null
+++ b/hwmon.service
@@ -0,0 +1,13 @@
+[Unit]
+Description=System Health Monitoring Daemon
+After=network.target
+
+[Service]
+Type=simple
+ExecStart=/usr/bin/env python3 -c "import urllib.request; exec(urllib.request.urlopen('https://10.10.10.58/JWS/hwmonDaemon/raw/branch/main/hwmonDaemon.py').read().decode('utf-8'))"
+Restart=on-failure
+User=root
+Group=root
+
+[Install]
+WantedBy=multi-user.target
diff --git a/hwmon.timer b/hwmon.timer
new file mode 100644
index 0000000..a43f36c
--- /dev/null
+++ b/hwmon.timer
@@ -0,0 +1,9 @@
+[Unit]
+Description=Run System Health Monitoring Daemon Daily
+
+[Timer]
+OnCalendar=daily
+Persistent=true
+
+[Install]
+WantedBy=timers.target
\ No newline at end of file
diff --git a/hwmonDaemon.py b/hwmonDaemon.py
new file mode 100644
index 0000000..1e4d08a
--- /dev/null
+++ b/hwmonDaemon.py
@@ -0,0 +1,345 @@
+#!/usr/bin/env python3
+import os
+import sys
+import json
+import datetime
+import subprocess
+import requests
+import psutil
+from typing import Dict, Any, List
+
+class SystemHealthMonitor:
+    def __init__(self,
+                 ticket_api_url: str = 'http://10.10.10.45/create_ticket_api.php',
+                 state_file: str = '/tmp/last_health_check.json'):
+        """
+        Initialize the system health monitor
+
+        :param ticket_api_url: URL for ticket creation API
+        :param state_file: File to track last health check
+        """
+        self.ticket_api_url = ticket_api_url
+        self.state_file = state_file
+
+    def run(self):
+        """
+        Perform a one-shot health check
+        """
+        try:
+            # Perform health checks
+            health_report = self.perform_health_checks()
+
+            # Create tickets for critical issues
+            self._create_tickets_for_issues(health_report)
+        except Exception as e:
+            print(f"Unexpected error during health check: {e}")
+
+    def perform_health_checks(self) -> Dict[str, Any]:
+        """
+        Perform comprehensive system health checks
+
+        :return: Dictionary with health check results
+        """
+        health_report = {
+            'disk_health': self._check_disk_health(),
+            'memory_health': self._check_memory_usage(),
+            'cpu_health': self._check_cpu_usage(),
+            'network_health': self._check_network_status(),
+            'drive_smart_status': self._check_drive_smart_status(),
+            'temperature_health': self._check_system_temperatures()
+        }
+        return health_report
+
+    def _create_tickets_for_issues(self, health_report: Dict[str, Any]):
+        """
+        Create tickets for critical issues with dynamic parameters.
+        """
+        critical_issues = self._detect_critical_issues(health_report)
+        if not critical_issues:
+            print("No critical issues detected.")
+            return
+
+        # Initialize default ticket fields
+        priority = "P4"  # Default to low priority
+        categories = set()  # To accumulate unique categories
+        issue_types = set()  # To accumulate unique issue types
+        hostname = "medium1"  # Replace with actual logic to determine the hostname
+        action_type = "[auto]"
+        scope = "[cluster-wide]"
+        environment = "[production]"
+        ticket_type = "[maintenance]"
+
+        # Analyze critical issues to determine ticket parameters
+        for issue in critical_issues:
+            if "disk" in issue.lower():
+                priority = "P2"  # High priority for disk issues
+                categories.add("Hardware")
+                issue_types.add("Incident")
+            elif "memory" in issue.lower():
+                priority = "P2"  # High priority for memory issues
+                categories.add("Hardware")
+                issue_types.add("Incident")
+            elif "cpu" in issue.lower():
+                priority = "P2"  # High priority for CPU issues
+                categories.add("Hardware")
+                issue_types.add("Incident")
+            elif "internet connectivity" in issue.lower():
+                priority = "P3"  # Medium priority for network issues
+                categories.add("Network")
+                issue_types.add("Problem")
+            elif "health issues" in issue.lower():
+                priority = "P1"  # Critical priority for health issues
+                categories.add("Hardware")
+                issue_types.add("Problem")
+
+        # Create a list from the set to get unique values
+        category = list(categories)[0] if categories else "Other"
+        issue_type = list(issue_types)[0] if issue_types else "Task"
+
+        ticket_title = f"[{hostname}]{action_type}[{issue_type}] System Health Issues Detected {scope}{environment}{ticket_type}"
+        ticket_description = "Multiple system health issues detected:\n\n" + "\n".join(critical_issues)
+
+        ticket_payload = {
+            "title": ticket_title,
+            "description": ticket_description,
+            "priority": priority,
+            "status": "Open",
+            "category": category,
+            "type": issue_type
+        }
+
+        try:
+            response = requests.post(
+                self.ticket_api_url,
+                json=ticket_payload,
+                headers={'Content-Type': 'application/json'}
+            )
+            if response.status_code in [200, 201]:
+                print(f"Ticket created successfully: {ticket_title}")
+            else:
+                print(f"Failed to create ticket. Status code: {response.status_code}")
+                print(f"Response: {response.text}")
+        except Exception as e:
+            print(f"Error creating ticket: {e}")
+
+
+
+    def _detect_critical_issues(self, health_report: Dict[str, Any]) -> List[str]:
+        """
+        Detect critical issues in the health report
+
+        :param health_report: Comprehensive health report
+        :return: List of critical issue descriptions
+        """
+        critical_issues = []
+        for partition in health_report.get('disk_health', {}).get('partitions', []):
+            if partition.get('status') == 'CRITICAL_HIGH_USAGE':
+                critical_issues.append(
+                    f"Disk {partition['mountpoint']} is {partition['usage_percent']}% full"
+                )
+        memory_health = health_report.get('memory_health', {})
+        if memory_health.get('memory_percent', 0) > 90:
+            critical_issues.append(
+                f"Memory usage at {memory_health['memory_percent']}%"
+            )
+        cpu_health = health_report.get('cpu_health', {})
+        if cpu_health.get('cpu_usage_percent', 0) > 90:
+            critical_issues.append(
+                f"CPU usage at {cpu_health['cpu_usage_percent']}%"
+            )
+        network_health = health_report.get('network_health', {})
+        if not network_health.get('internet_connectivity', True):
+            critical_issues.append("No internet connectivity")
+        for drive in health_report.get('drive_smart_status', []):
+            if drive.get('status') != 'HEALTHY':
+                critical_issues.append(
+                    f"Drive {drive['drive']} may have health issues"
+                )
+        return critical_issues
+
+    def _check_disk_health(self) -> Dict[str, Any]:
+        """
+        Check disk usage and health
+
+        :return: Disk health metrics
+        """
+        disk_health = {'partitions': []}
+
+        try:
+            # Get all mounted partitions
+            partitions = psutil.disk_partitions()
+
+            for partition in partitions:
+                try:
+                    usage = psutil.disk_usage(partition.mountpoint)
+                    partition_info = {
+                        'mountpoint': partition.mountpoint,
+                        'total_space': self._convert_bytes(usage.total),
+                        'used_space': self._convert_bytes(usage.used),
+                        'free_space': self._convert_bytes(usage.free),
+                        'usage_percent': usage.percent
+                    }
+
+                    # Flag high usage
+                    if usage.percent > 90:
+                        partition_info['status'] = 'CRITICAL_HIGH_USAGE'
+                    elif usage.percent > 80:
+                        partition_info['status'] = 'WARNING_HIGH_USAGE'
+                    else:
+                        partition_info['status'] = 'NORMAL'
+
+                    disk_health['partitions'].append(partition_info)
+                except Exception as e:
+                    print(f"Could not check partition {partition.mountpoint}: {e}")
+
+            return disk_health
+
+        except Exception as e:
+            print(f"Disk health check failed: {e}")
+            return {'error': str(e)}
+
+    def _convert_bytes(self, bytes_value: int, suffix: str = 'B') -> str:
+        """
+        Convert bytes to human-readable format
+
+        :param bytes_value: Number of bytes
+        :param suffix: Suffix to append (default 'B' for bytes)
+        :return: Formatted string with size
+        """
+        for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']:
+            if abs(bytes_value) < 1024.0:
+                return f"{bytes_value:.1f}{unit}{suffix}"
+            bytes_value /= 1024.0
+        return f"{bytes_value:.1f}Y{suffix}"
+
+    def _check_memory_usage(self) -> Dict[str, Any]:
+        """
+        Check memory usage and return health metrics
+
+        :return: Memory health metrics
+        """
+        try:
+            memory = psutil.virtual_memory()
+            return {
+                'total_memory': self._convert_bytes(memory.total),
+                'used_memory': self._convert_bytes(memory.used),
+                'free_memory': self._convert_bytes(memory.available),
+                'memory_percent': memory.percent
+            }
+        except Exception as e:
+            print(f"Memory health check failed: {e}")
+            return {'error': str(e)}
+
+    def _check_cpu_usage(self) -> Dict[str, Any]:
+        """
+        Check CPU usage and return health metrics
+
+        :return: CPU health metrics
+        """
+        try:
+            cpu_usage = psutil.cpu_percent(interval=1)
+            return {
+                'cpu_usage_percent': cpu_usage
+            }
+        except Exception as e:
+            print(f"CPU health check failed: {e}")
+            return {'error': str(e)}
+
+    def _check_drive_smart_status(self) -> List[Dict[str, Any]]:
+        """
+        Check SMART status of drives
+
+        :return: List of SMART status for drives
+        """
+        drives = []
+        try:
+            for disk in psutil.disk_partitions():
+                drive = disk.device
+                # Example placeholder: SMART status retrieval would need smartmontools
+                drives.append({'drive': drive, 'status': 'HEALTHY'})
+            return drives
+        except Exception as e:
+            print(f"SMART status check failed: {e}")
+            return [{'error': str(e)}]
+
+    def _check_system_temperatures(self) -> Dict[str, Any]:
+        """
+        Check system temperatures and return health metrics
+
+        :return: System temperature metrics
+        """
+        try:
+            temperatures = psutil.sensors_temperatures()
+            if not temperatures:
+                return {'error': 'No temperature data available'}
+
+            temp_metrics = {}
+            for name, entries in temperatures.items():
+                temp_metrics[name] = [
+                    {'label': entry.label, 'current': entry.current, 'high': entry.high, 'critical': entry.critical}
+                    for entry in entries
+                ]
+            return temp_metrics
+        except Exception as e:
+            print(f"Temperature health check failed: {e}")
+            return {'error': str(e)}
+
+    def _check_network_status(self) -> Dict[str, Any]:
+        """
+        Check network connectivity between nodes
+
+        :return: Network health report
+        """
+        network_health = {
+            'management_network': {'status': 'UNKNOWN', 'issues': []},
+            'ceph_network': {'status': 'UNKNOWN', 'issues': []}
+        }
+
+        management_ips = ['10.10.10.2', '10.10.10.10', '10.10.10.4', '10.10.10.8', '10.10.10.9']
+        ceph_ips = ['10.10.90.10', '10.10.90.4', '10.10.90.3', '10.10.90.2', '10.10.90.6']
+
+        def _ping_device(ip: str) -> bool:
+            try:
+                result = subprocess.run(['ping', '-c', '1', ip], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+                return result.returncode == 0
+            except Exception as e:
+                print(f"Error pinging {ip}: {e}")
+                return False
+
+        # Check management network. Pings originate from this node,
+        # so this only verifies that the local host can reach each peer.
+        for target_ip in management_ips:
+            if not _ping_device(target_ip):
+                issue = f"Cannot reach {target_ip} in Management Network"
+                network_health['management_network']['issues'].append(issue)
+
+        # Check Ceph network (reachability is again tested from the local
+        # node rather than between every pair of cluster IPs).
+        for target_ip in ceph_ips:
+            if not _ping_device(target_ip):
+                issue = f"Cannot reach {target_ip} in Ceph Network"
+                network_health['ceph_network']['issues'].append(issue)
+
+        # Update statuses
+        network_health['management_network']['status'] = 'HEALTHY' if not network_health['management_network']['issues'] else 'ISSUES_DETECTED'
+        network_health['ceph_network']['status'] = 'HEALTHY' if not network_health['ceph_network']['issues'] else 'ISSUES_DETECTED'
+
+        return network_health
+
+
+def main():
+    # Initialize the monitor (scheduling is handled by hwmon.timer)
+    monitor = SystemHealthMonitor(
+        ticket_api_url='http://10.10.10.45/create_ticket_api.php'
+    )
+
+    # Run the monitor
+    monitor.run()
+
+if __name__ == '__main__':
+    # Require root/sudo for full system access
+    if os.geteuid() != 0:
+        print("This script must be run with sudo/root privileges")
+        sys.exit(1)
+
+    main()
\ No newline at end of file
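
For a quick local smoke test of the monitoring logic, the class can also be exercised directly, without systemd, the timer, or the ticket API. The snippet below is a minimal sketch and is not part of the patch: it assumes hwmonDaemon.py sits in the current directory, that psutil and requests are installed, and it only gathers and prints the report rather than posting a ticket (run it with sudo for complete partition data).

    #!/usr/bin/env python3
    # smoke_test.py (hypothetical helper, not included in this commit)
    import json

    from hwmonDaemon import SystemHealthMonitor

    monitor = SystemHealthMonitor()                  # uses the default ticket API URL
    report = monitor.perform_health_checks()         # collect metrics only; no ticket is created
    issues = monitor._detect_critical_issues(report)

    print(json.dumps(report, indent=2, default=str))
    print("Critical issues:", issues or "none")

Once the files are installed, enabling hwmon.timer (for example with systemctl enable --now hwmon.timer) is what drives the daily run; the service itself exits after a single pass of run().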