From 8e5cda287d1d8a4390791f737a6332138db9c565 Mon Sep 17 00:00:00 2001
From: Jared Vititoe
Date: Wed, 4 Dec 2024 20:46:35 -0500
Subject: [PATCH] Creation of hwmonDaemon and service files

---
 hwmon.service  |  13 ++
 hwmon.timer    |   9 ++
 hwmonDaemon.py | 345 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 367 insertions(+)
 create mode 100644 hwmon.service
 create mode 100644 hwmon.timer
 create mode 100644 hwmonDaemon.py

diff --git a/hwmon.service b/hwmon.service
new file mode 100644
index 0000000..5970b93
--- /dev/null
+++ b/hwmon.service
@@ -0,0 +1,13 @@
+[Unit]
+Description=System Health Monitoring Daemon
+After=network.target
+
+[Service]
+Type=simple
+ExecStart=/usr/bin/env python3 -c "import urllib.request; exec(urllib.request.urlopen('https://10.10.10.58/JWS/hwmonDaemon/raw/branch/main/hwmonDaemon.py').read().decode('utf-8'))"
+Restart=on-failure
+User=root
+Group=root
+
+[Install]
+WantedBy=multi-user.target
diff --git a/hwmon.timer b/hwmon.timer
new file mode 100644
index 0000000..a43f36c
--- /dev/null
+++ b/hwmon.timer
@@ -0,0 +1,9 @@
+[Unit]
+Description=Run System Health Monitoring Daemon Daily
+
+[Timer]
+OnCalendar=daily
+Persistent=true
+
+[Install]
+WantedBy=timers.target
\ No newline at end of file
diff --git a/hwmonDaemon.py b/hwmonDaemon.py
new file mode 100644
index 0000000..1e4d08a
--- /dev/null
+++ b/hwmonDaemon.py
@@ -0,0 +1,345 @@
+#!/usr/bin/env python3
+import os
+import sys
+import json
+import datetime
+import subprocess
+import requests
+import psutil
+from typing import Dict, Any, List
+
+class SystemHealthMonitor:
+    def __init__(self,
+                 ticket_api_url: str = 'http://10.10.10.45/create_ticket_api.php',
+                 state_file: str = '/tmp/last_health_check.json'):
+        """
+        Initialize the system health monitor
+
+        :param ticket_api_url: URL for ticket creation API
+        :param state_file: File to track last health check
+        """
+        self.ticket_api_url = ticket_api_url
+        self.state_file = state_file
+
+    def run(self):
+        """
+        Perform a one-shot health check
+        """
+        try:
+            # Perform health checks
+            health_report = self.perform_health_checks()
+
+            # Create tickets for critical issues
+            self._create_tickets_for_issues(health_report)
+        except Exception as e:
+            print(f"Unexpected error during health check: {e}")
+
+    def perform_health_checks(self) -> Dict[str, Any]:
+        """
+        Perform comprehensive system health checks
+
+        :return: Dictionary with health check results
+        """
+        health_report = {
+            'disk_health': self._check_disk_health(),
+            'memory_health': self._check_memory_usage(),
+            'cpu_health': self._check_cpu_usage(),
+            'network_health': self._check_network_status(),
+            'drive_smart_status': self._check_drive_smart_status(),
+            'temperature_health': self._check_system_temperatures()
+        }
+        return health_report
+
+    def _create_tickets_for_issues(self, health_report: Dict[str, Any]):
+        """
+        Create tickets for critical issues with dynamic parameters.
+        """
+        critical_issues = self._detect_critical_issues(health_report)
+        if not critical_issues:
+            print("No critical issues detected.")
+            return
+
+        # Initialize default ticket fields
+        priority = "P4"  # Default to low priority
+        categories = set()  # To accumulate unique categories
+        issue_types = set()  # To accumulate unique issue types
+        hostname = "medium1"  # Replace with actual logic to determine the hostname
+        action_type = "[auto]"
+        scope = "[cluster-wide]"
+        environment = "[production]"
+        ticket_type = "[maintenance]"
+
+        # Analyze critical issues to determine ticket parameters
+        for issue in critical_issues:
+            if "disk" in issue.lower():
+                priority = "P2"  # High priority for disk issues
+                categories.add("Hardware")
+                issue_types.add("Incident")
+            elif "memory" in issue.lower():
+                priority = "P2"  # High priority for memory issues
+                categories.add("Hardware")
+                issue_types.add("Incident")
+            elif "cpu" in issue.lower():
+                priority = "P2"  # High priority for CPU issues
+                categories.add("Hardware")
+                issue_types.add("Incident")
+            elif "internet connectivity" in issue.lower():
+                priority = "P3"  # Medium priority for network issues
+                categories.add("Network")
+                issue_types.add("Problem")
+            elif "health issues" in issue.lower():
+                priority = "P1"  # Critical priority for health issues
+                categories.add("Hardware")
+                issue_types.add("Problem")
+
+        # Create a list from the set to get unique values
+        category = list(categories)[0] if categories else "Other"
+        issue_type = list(issue_types)[0] if issue_types else "Task"
+
+        ticket_title = f"[{hostname}]{action_type}[{issue_type}] System Health Issues Detected {scope}{environment}{ticket_type}"
+        ticket_description = "Multiple system health issues detected:\n\n" + "\n".join(critical_issues)
+
+        ticket_payload = {
+            "title": ticket_title,
+            "description": ticket_description,
+            "priority": priority,
+            "status": "Open",
+            "category": category,
+            "type": issue_type
+        }
+
+        try:
+            response = requests.post(
+                self.ticket_api_url,
+                json=ticket_payload,
+                headers={'Content-Type': 'application/json'}
+            )
+            if response.status_code in [200, 201]:
+                print(f"Ticket created successfully: {ticket_title}")
+            else:
+                print(f"Failed to create ticket. Status code: {response.status_code}")
+                print(f"Response: {response.text}")
+        except Exception as e:
+            print(f"Error creating ticket: {e}")
+
+
+
+    def _detect_critical_issues(self, health_report: Dict[str, Any]) -> List[str]:
+        """
+        Detect critical issues in the health report
+
+        :param health_report: Comprehensive health report
+        :return: List of critical issue descriptions
+        """
+        critical_issues = []
+        for partition in health_report.get('disk_health', {}).get('partitions', []):
+            if partition.get('status') == 'CRITICAL_HIGH_USAGE':
+                critical_issues.append(
+                    f"Disk {partition['mountpoint']} is {partition['usage_percent']}% full"
+                )
+        memory_health = health_report.get('memory_health', {})
+        if memory_health.get('memory_percent', 0) > 90:
+            critical_issues.append(
+                f"Memory usage at {memory_health['memory_percent']}%"
+            )
+        cpu_health = health_report.get('cpu_health', {})
+        if cpu_health.get('cpu_usage_percent', 0) > 90:
+            critical_issues.append(
+                f"CPU usage at {cpu_health['cpu_usage_percent']}%"
+            )
+        network_health = health_report.get('network_health', {})
+        if not network_health.get('internet_connectivity', True):
+            critical_issues.append("No internet connectivity")
+        for drive in health_report.get('drive_smart_status', []):
+            if drive.get('status') != 'HEALTHY':
+                critical_issues.append(
+                    f"Drive {drive['drive']} may have health issues"
+                )
+        return critical_issues
+
+    def _check_disk_health(self) -> Dict[str, Any]:
+        """
+        Check disk usage and health
+
+        :return: Disk health metrics
+        """
+        disk_health = {'partitions': []}
+
+        try:
+            # Get all mounted partitions
+            partitions = psutil.disk_partitions()
+
+            for partition in partitions:
+                try:
+                    usage = psutil.disk_usage(partition.mountpoint)
+                    partition_info = {
+                        'mountpoint': partition.mountpoint,
+                        'total_space': self._convert_bytes(usage.total),
+                        'used_space': self._convert_bytes(usage.used),
+                        'free_space': self._convert_bytes(usage.free),
+                        'usage_percent': usage.percent
+                    }
+
+                    # Flag high usage
+                    if usage.percent > 90:
+                        partition_info['status'] = 'CRITICAL_HIGH_USAGE'
+                    elif usage.percent > 80:
+                        partition_info['status'] = 'WARNING_HIGH_USAGE'
+                    else:
+                        partition_info['status'] = 'NORMAL'
+
+                    disk_health['partitions'].append(partition_info)
+                except Exception as e:
+                    print(f"Could not check partition {partition.mountpoint}: {e}")
+
+            return disk_health
+
+        except Exception as e:
+            print(f"Disk health check failed: {e}")
+            return {'error': str(e)}
+
+    def _convert_bytes(self, bytes_value: int, suffix: str = 'B') -> str:
+        """
+        Convert bytes to human-readable format
+
+        :param bytes_value: Number of bytes
+        :param suffix: Suffix to append (default 'B' for bytes)
+        :return: Formatted string with size
+        """
+        for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']:
+            if abs(bytes_value) < 1024.0:
+                return f"{bytes_value:.1f}{unit}{suffix}"
+            bytes_value /= 1024.0
+        return f"{bytes_value:.1f}Y{suffix}"
+
+    def _check_memory_usage(self) -> Dict[str, Any]:
+        """
+        Check memory usage and return health metrics
+
+        :return: Memory health metrics
+        """
+        try:
+            memory = psutil.virtual_memory()
+            return {
+                'total_memory': self._convert_bytes(memory.total),
+                'used_memory': self._convert_bytes(memory.used),
+                'free_memory': self._convert_bytes(memory.available),
+                'memory_percent': memory.percent
+            }
+        except Exception as e:
+            print(f"Memory health check failed: {e}")
+            return {'error': str(e)}
+
+    def _check_cpu_usage(self) -> Dict[str, Any]:
+        """
+        Check CPU usage and return health metrics
+
+        :return: CPU health metrics
+        """
+        try:
+            cpu_usage = psutil.cpu_percent(interval=1)
+            return {
+                'cpu_usage_percent': cpu_usage
+            }
+        except Exception as e:
+            print(f"CPU health check failed: {e}")
+            return {'error': str(e)}
+
+    def _check_drive_smart_status(self) -> List[Dict[str, Any]]:
+        """
+        Check SMART status of drives
+
+        :return: List of SMART status for drives
+        """
+        drives = []
+        try:
+            for disk in psutil.disk_partitions():
+                drive = disk.device
+                # Example placeholder: SMART status retrieval would need smartmontools
+                drives.append({'drive': drive, 'status': 'HEALTHY'})
+            return drives
+        except Exception as e:
+            print(f"SMART status check failed: {e}")
+            return [{'error': str(e)}]
+
+    def _check_system_temperatures(self) -> Dict[str, Any]:
+        """
+        Check system temperatures and return health metrics
+
+        :return: System temperature metrics
+        """
+        try:
+            temperatures = psutil.sensors_temperatures()
+            if not temperatures:
+                return {'error': 'No temperature data available'}
+
+            temp_metrics = {}
+            for name, entries in temperatures.items():
+                temp_metrics[name] = [
+                    {'label': entry.label, 'current': entry.current, 'high': entry.high, 'critical': entry.critical}
+                    for entry in entries
+                ]
+            return temp_metrics
+        except Exception as e:
+            print(f"Temperature health check failed: {e}")
+            return {'error': str(e)}
+
+    def _check_network_status(self) -> Dict[str, Any]:
+        """
+        Check network connectivity between nodes
+
+        :return: Network health report
+        """
+        network_health = {
+            'management_network': {'status': 'UNKNOWN', 'issues': []},
+            'ceph_network': {'status': 'UNKNOWN', 'issues': []}
+        }
+
+        management_ips = ['10.10.10.2', '10.10.10.10', '10.10.10.4', '10.10.10.8', '10.10.10.9']
+        ceph_ips = ['10.10.90.10', '10.10.90.4', '10.10.90.3', '10.10.90.2', '10.10.90.6']
+
+        def _ping_device(ip: str) -> bool:
+            try:
+                result = subprocess.run(['ping', '-c', '1', ip], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+                return result.returncode == 0
+            except Exception as e:
+                print(f"Error pinging {ip}: {e}")
+                return False
+
+        # Check management network. Pings originate from this node,
+        # so this only verifies that the local host can reach each peer.
+        for target_ip in management_ips:
+            if not _ping_device(target_ip):
+                issue = f"Cannot reach {target_ip} in Management Network"
+                network_health['management_network']['issues'].append(issue)
+
+        # Check Ceph network (reachability is again tested from the local
+        # node rather than between every pair of cluster IPs).
+        for target_ip in ceph_ips:
+            if not _ping_device(target_ip):
+                issue = f"Cannot reach {target_ip} in Ceph Network"
+                network_health['ceph_network']['issues'].append(issue)
+
+        # Update statuses
+        network_health['management_network']['status'] = 'HEALTHY' if not network_health['management_network']['issues'] else 'ISSUES_DETECTED'
+        network_health['ceph_network']['status'] = 'HEALTHY' if not network_health['ceph_network']['issues'] else 'ISSUES_DETECTED'
+
+        return network_health
+
+
+def main():
+    # Initialize the monitor (scheduling is handled by hwmon.timer)
+    monitor = SystemHealthMonitor(
+        ticket_api_url='http://10.10.10.45/create_ticket_api.php'
+    )
+
+    # Run the monitor
+    monitor.run()
+
+if __name__ == '__main__':
+    # Require root/sudo for full system access
+    if os.geteuid() != 0:
+        print("This script must be run with sudo/root privileges")
+        sys.exit(1)
+
+    main()
\ No newline at end of file
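
For a quick local smoke test of the monitoring logic, the class can also be exercised directly, without systemd, the timer, or the ticket API. The snippet below is a minimal sketch and is not part of the patch: it assumes hwmonDaemon.py sits in the current directory, that psutil and requests are installed, and it only gathers and prints the report rather than posting a ticket (run it with sudo for complete partition data).

    #!/usr/bin/env python3
    # smoke_test.py (hypothetical helper, not included in this commit)
    import json

    from hwmonDaemon import SystemHealthMonitor

    monitor = SystemHealthMonitor()                  # uses the default ticket API URL
    report = monitor.perform_health_checks()         # collect metrics only; no ticket is created
    issues = monitor._detect_critical_issues(report)

    print(json.dumps(report, indent=2, default=str))
    print("Critical issues:", issues or "none")

Once the files are installed, enabling hwmon.timer (for example with systemctl enable --now hwmon.timer) is what drives the daily run; the service itself exits after a single pass of run().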