diff --git a/hwmonDaemon.py b/hwmonDaemon.py index c56c187..d980c90 100644 --- a/hwmonDaemon.py +++ b/hwmonDaemon.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re, glob, datetime, fcntl +import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re, glob, datetime, fcntl, textwrap from typing import Dict, Any, List # ============================================================================= @@ -422,10 +422,10 @@ class SystemHealthMonitor: } SEVERITY_INDICATORS = { - 'CRITICAL': '🔴', - 'WARNING': '🟡', - 'HEALTHY': '🟢', - 'UNKNOWN': '⚪' + 'CRITICAL': '[CRIT]', + 'WARNING': '[WARN]', + 'HEALTHY': '[ OK ]', + 'UNKNOWN': '[ ?? ]' } SMART_DESCRIPTIONS = { @@ -1096,22 +1096,31 @@ class SystemHealthMonitor: return "Performance Issue" elif "Network" in issue: return "Network Issue" + elif any(kw in issue for kw in ["LXC", "storage usage", "container"]): + return "Container Storage Issue" return "Hardware Issue" def _get_impact_level(self, issue: str) -> str: """Determine impact level from issue description.""" issue_upper = issue.upper() + # Check storage/CPU warnings first so "critical storage" isn't caught as Critical + if any(kw in issue_upper for kw in ["STORAGE USAGE", "THRESHOLD", "CPU USAGE"]): + return "[WARN] Warning - Action Needed Soon" if "CRITICAL" in issue_upper or "UNHEALTHY" in issue_upper or "HEALTH_ERR" in issue_upper: - return "🔴 Critical - Immediate Action Required" + return "[CRIT] Critical - Immediate Action Required" elif "WARNING" in issue_upper or "HEALTH_WARN" in issue_upper or "DOWN" in issue_upper: - return "🟡 Warning - Action Needed Soon" - return "🟢 Low - Monitor Only" + return "[WARN] Warning - Action Needed Soon" + return "[LOW] Low - Monitor Only" - def _generate_detailed_description(self, issue: str, health_report: Dict[str, Any]) -> str: + def _generate_detailed_description(self, issue: str, health_report: Dict[str, Any], priority: str = '3') -> str: """Generate detailed ticket description with properly formatted ASCII art.""" hostname = socket.gethostname() timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") - priority = "⚠ HIGH" if "CRITICAL" in issue else "● MEDIUM" + priority_labels = { + '1': '⚠ P1 - CRITICAL', '2': '⚠ P2 - HIGH', + '3': '● P3 - MEDIUM', '4': '● P4 - NORMAL', '5': '● P5 - LOW', + } + priority_display = priority_labels.get(priority, '● P3 - MEDIUM') # Box width: all lines are exactly 80 chars # border lines: ┏ + 78 ━ + ┓ = 80 @@ -1124,7 +1133,7 @@ class SystemHealthMonitor: ┣{'━' * box_width}┫ ┃ Host : {hostname:<{box_width - 14}}┃ ┃ Generated : {timestamp:<{box_width - 14}}┃ -┃ Priority : {priority:<{box_width - 14}}┃ +┃ Priority : {priority_display:<{box_width - 14}}┃ ┗{'━' * box_width}┛""" issue_type = self._get_issue_type(issue) @@ -1141,13 +1150,13 @@ class SystemHealthMonitor: # Add relevant SMART descriptions for attr in self.SMART_DESCRIPTIONS: if attr in issue: - description += f"\n{attr}:\n{self.SMART_DESCRIPTIONS[attr]}\n" + description += f"\n{attr}:\n{textwrap.dedent(self.SMART_DESCRIPTIONS[attr]).strip()}\n" if "SMART" in issue: - description += """ + description += "\n" + textwrap.dedent(""" SMART (Self-Monitoring, Analysis, and Reporting Technology) Attribute Details: - Possible drive failure! - """ + """).strip() + "\n" if "Drive" in issue and "/dev/" in issue: try: @@ -1165,7 +1174,18 @@ class SystemHealthMonitor: power_on_hours = smart_data['attributes'].get('Power_On_Hours', 'N/A') last_test_date = smart_data.get('last_test_date', 'N/A') - age = f"{int(power_on_hours/24/365) if isinstance(power_on_hours, (int, float)) else 'N/A'} years" if power_on_hours != 'N/A' else 'N/A' + if power_on_hours != 'N/A' and isinstance(power_on_hours, (int, float)): + total_days = power_on_hours / 24 + years = int(total_days / 365) + months = int((total_days % 365) / 30) + if years >= 1: + age = f"{years} year{'s' if years != 1 else ''}, {months} month{'s' if months != 1 else ''}" + elif months >= 1: + age = f"{months} month{'s' if months != 1 else ''}" + else: + age = "< 1 month" + else: + age = 'N/A' # Ensure all values are properly formatted strings device_safe = device or 'N/A' @@ -1186,14 +1206,6 @@ class SystemHealthMonitor: ┗{'━' * box_width}┛ """ - if drive_info: - perf_metrics = { - 'read_speed': drive_info.get('performance_metrics', {}).get('read_speed', 'N/A'), - 'write_speed': drive_info.get('performance_metrics', {}).get('write_speed', 'N/A'), - 'access_time': drive_info.get('performance_metrics', {}).get('access_time', 'N/A'), - 'iops': drive_info.get('performance_metrics', {}).get('iops', 'N/A') - } - power_on_safe = f"{power_on_hours} hours" if power_on_hours != 'N/A' else 'N/A' last_test_safe = last_test_date or 'N/A' age_safe = age or 'N/A' @@ -1264,39 +1276,140 @@ class SystemHealthMonitor: description += f"\nError generating drive details: {str(e)}\n" if "Temperature" in issue: - description += """ + description += "\n" + textwrap.dedent(""" High drive temperatures can: - Reduce drive lifespan - Cause performance degradation - Lead to data corruption in extreme cases Optimal temperature range: 20-45°C - """ - + """).strip() + "\n" + if "ECC" in issue: - description += """ + description += "\n" + textwrap.dedent(""" ECC (Error Correction Code) Memory Issues: - Correctable: Memory errors that were successfully fixed - Uncorrectable: Serious memory errors that could not be corrected Frequent ECC corrections may indicate degrading memory modules - """ - + """).strip() + "\n" + if "CPU" in issue: - description += """ + description += "\n" + textwrap.dedent(""" High CPU usage sustained over time can indicate: - Resource constraints - Runaway processes - Need for performance optimization - Potential cooling issues - """ - + """).strip() + "\n" + + # Add CPU STATUS box + cpu_health = health_report.get('cpu_health', {}) + cpu_usage = cpu_health.get('cpu_usage_percent', 'N/A') + cpu_threshold = self.CONFIG['THRESHOLDS']['CPU_WARNING'] + cpu_status = cpu_health.get('status', 'N/A') + cpu_usage_str = f"{cpu_usage}%" if isinstance(cpu_usage, (int, float)) else cpu_usage + + description += f""" +┏━ CPU STATUS {'━' * (box_width - 13)}┓ +┃ Usage │ {cpu_usage_str:<61}┃ +┃ Threshold │ {str(cpu_threshold) + '%':<61}┃ +┃ Status │ {cpu_status:<61}┃ +┗{'━' * box_width}┛ +""" + if "Network" in issue: - description += """ + description += "\n" + textwrap.dedent(""" Network connectivity issues can impact: - Cluster communication - Data replication - Service availability - Management access - """ + """).strip() + "\n" + + # Add NETWORK STATUS box + net_health = health_report.get('network_health', {}) + mgmt = net_health.get('management_network', {}) + ceph_net = net_health.get('ceph_network', {}) + mgmt_status = mgmt.get('status', 'N/A') + ceph_status = ceph_net.get('status', 'N/A') + mgmt_latency = mgmt.get('latency') + mgmt_latency_str = f"{mgmt_latency}ms" if mgmt_latency is not None else 'N/A' + mgmt_issues = mgmt.get('issues', []) + ceph_issues = ceph_net.get('issues', []) + all_net_issues = mgmt_issues + ceph_issues + issues_str = '; '.join(all_net_issues) if all_net_issues else 'None' + # Truncate issues string to fit in box + if len(issues_str) > 61: + issues_str = issues_str[:58] + '...' + + description += f""" +┏━ NETWORK STATUS {'━' * (box_width - 17)}┓ +┃ Management │ {mgmt_status:<61}┃ +┃ Ceph Network │ {ceph_status:<61}┃ +┃ Latency │ {mgmt_latency_str:<61}┃ +┃ Issues │ {issues_str:<61}┃ +┗{'━' * box_width}┛ +""" + + if any(kw in issue for kw in ["LXC", "storage usage", "container"]): + # Add CONTAINER STORAGE box + lxc_health = health_report.get('lxc_health', {}) + containers = lxc_health.get('containers', []) + for container in containers: + vmid = container.get('vmid', 'N/A') + for fs in container.get('filesystems', []): + mountpoint = fs.get('mountpoint', 'N/A') + usage_pct = fs.get('usage_percent', 0) + total_bytes = fs.get('total_space', 0) + used_bytes = fs.get('used_space', 0) + avail_bytes = fs.get('available', 0) + # Only show filesystems relevant to this issue + if mountpoint not in issue and vmid not in issue: + continue + total_str = self._format_bytes_human(total_bytes) if isinstance(total_bytes, (int, float)) else str(total_bytes) + used_str = self._format_bytes_human(used_bytes) if isinstance(used_bytes, (int, float)) else str(used_bytes) + free_str = self._format_bytes_human(avail_bytes) if isinstance(avail_bytes, (int, float)) else str(avail_bytes) + # Create 50-char usage meter (2% per block) + blocks = int(usage_pct / 2) + usage_meter = '█' * blocks + '░' * (50 - blocks) + usage_pct_str = f"{usage_pct:.1f}%" + + description += f""" +┏━ CONTAINER STORAGE {'━' * (box_width - 20)}┓ +┃ VMID │ {vmid:<61}┃ +┃ Mountpoint │ {mountpoint:<61}┃ +┃ Usage Meter │ {usage_meter} {usage_pct_str:>10}┃ +┃ Total │ {total_str:<61}┃ +┃ Used │ {used_str:<61}┃ +┃ Free │ {free_str:<61}┃ +┗{'━' * box_width}┛ +""" + + if any(kw in issue for kw in ["Ceph", "OSD", "ceph", "HEALTH_ERR", "HEALTH_WARN"]): + # Add CEPH CLUSTER STATUS box + ceph_health = health_report.get('ceph_health', {}) + if ceph_health.get('is_ceph_node'): + cluster_health = ceph_health.get('cluster_health', 'N/A') + cluster_usage = ceph_health.get('cluster_usage', {}) + usage_pct = cluster_usage.get('usage_percent', 'N/A') if cluster_usage else 'N/A' + total_bytes = cluster_usage.get('total_bytes', 0) if cluster_usage else 0 + used_bytes = cluster_usage.get('used_bytes', 0) if cluster_usage else 0 + total_str = self._format_bytes_human(total_bytes) if total_bytes else 'N/A' + used_str = self._format_bytes_human(used_bytes) if used_bytes else 'N/A' + usage_pct_str = f"{usage_pct}%" if isinstance(usage_pct, (int, float)) else usage_pct + osd_list = ceph_health.get('osd_status', []) + osd_total = len(osd_list) + osd_up = sum(1 for o in osd_list if o.get('status') == 'up') + osd_summary = f"{osd_up}/{osd_total} up" if osd_total > 0 else 'N/A' + + description += f""" +┏━ CEPH CLUSTER STATUS {'━' * (box_width - 22)}┓ +┃ Health │ {cluster_health:<61}┃ +┃ Usage │ {usage_pct_str:<61}┃ +┃ Total │ {total_str:<61}┃ +┃ Used │ {used_str:<61}┃ +┃ OSDs │ {osd_summary:<61}┃ +┗{'━' * box_width}┛ +""" if "Disk" in issue: for partition in health_report.get('drives_health', {}).get('drives', []): @@ -1602,7 +1715,7 @@ class SystemHealthMonitor: f"{environment['PRODUCTION']}" f"{ticket_type_tag}" ) - description = self._generate_detailed_description(issue, health_report) + description = self._generate_detailed_description(issue, health_report, priority) ticket_payload = { "title": ticket_title, @@ -1822,6 +1935,14 @@ class SystemHealthMonitor: return True return False + def _format_bytes_human(self, num_bytes): + """Format a byte count into a human-readable string.""" + for unit in ['B', 'KB', 'MB', 'GB', 'TB', 'PB']: + if abs(num_bytes) < 1024.0: + return f"{num_bytes:.1f} {unit}" + num_bytes /= 1024.0 + return f"{num_bytes:.1f} EB" + def _parse_size(self, size_str: str) -> float: """ Parse size string with units to bytes.