Fix ticket overview: priority display, impact indicators, non-drive detail boxes

- Replace emoji severity indicators (🔴🟡🟢⚪) with ASCII tags ([CRIT]/[WARN]/[ OK ]/[ ?? ]); impact levels use [CRIT]/[WARN]/[LOW]
- Fix banner priority to show the actual P1-P5 level instead of a hardcoded HIGH/MEDIUM
- Add LXC/container keyword detection to _get_issue_type()
- Rewrite _get_impact_level() with storage/CPU awareness to avoid false Critical
- Fix SMART description indentation with textwrap.dedent()
- Fix drive age showing "0 years" for drives < 1 year old (now shows months)
- Remove unused perf_metrics block
- Add structured boxed sections for CPU, Network, Container, and Ceph tickets
- Add _format_bytes_human() helper for LXC storage display

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-06 19:52:51 -05:00
parent 70b02de104
commit 058ea5ad06
1 changed files with 156 additions and 35 deletions
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re, glob, datetime, fcntl
+import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re, glob, datetime, fcntl, textwrap
 from typing import Dict, Any, List

 # =============================================================================
@@ -422,10 +422,10 @@ class SystemHealthMonitor:
    }
    
    SEVERITY_INDICATORS = {
-        'CRITICAL': '🔴',
-        'WARNING': '🟡',
-        'HEALTHY': '🟢',
-        'UNKNOWN': '⚪'
+        'CRITICAL': '[CRIT]',
+        'WARNING': '[WARN]',
+        'HEALTHY': '[ OK ]',
+        'UNKNOWN': '[ ?? ]'
    }
    
    SMART_DESCRIPTIONS = {
@@ -1096,22 +1096,31 @@ class SystemHealthMonitor:
            return "Performance Issue"
        elif "Network" in issue:
            return "Network Issue"
+        elif any(kw in issue for kw in ["LXC", "storage usage", "container"]):
+            return "Container Storage Issue"
        return "Hardware Issue"

    def _get_impact_level(self, issue: str) -> str:
        """Determine impact level from issue description."""
        issue_upper = issue.upper()
+        # Check storage/CPU warnings first so "critical storage" isn't caught as Critical
+        if any(kw in issue_upper for kw in ["STORAGE USAGE", "THRESHOLD", "CPU USAGE"]):
+            return "[WARN] Warning - Action Needed Soon"
        if "CRITICAL" in issue_upper or "UNHEALTHY" in issue_upper or "HEALTH_ERR" in issue_upper:
-            return "🔴 Critical - Immediate Action Required"
+            return "[CRIT] Critical - Immediate Action Required"
        elif "WARNING" in issue_upper or "HEALTH_WARN" in issue_upper or "DOWN" in issue_upper:
-            return "🟡 Warning - Action Needed Soon"
-        return "🟢 Low - Monitor Only"
+            return "[WARN] Warning - Action Needed Soon"
+        return "[LOW]  Low - Monitor Only"

-    def _generate_detailed_description(self, issue: str, health_report: Dict[str, Any]) -> str:
+    def _generate_detailed_description(self, issue: str, health_report: Dict[str, Any], priority: str = '3') -> str:
        """Generate detailed ticket description with properly formatted ASCII art."""
        hostname = socket.gethostname()
        timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-        priority = "⚠ HIGH" if "CRITICAL" in issue else "● MEDIUM"
+        priority_labels = {
+            '1': '⚠ P1 - CRITICAL', '2': '⚠ P2 - HIGH',
+            '3': '● P3 - MEDIUM', '4': '● P4 - NORMAL', '5': '● P5 - LOW',
+        }
+        priority_display = priority_labels.get(priority, '● P3 - MEDIUM')

        # Box width: all lines are exactly 80 chars
        # border lines: ┏ + 78 ━ + ┓ = 80
@@ -1124,7 +1133,7 @@ class SystemHealthMonitor:
 ┣{'━' * box_width}┫
 ┃  Host      : {hostname:<{box_width - 14}}┃
 ┃  Generated : {timestamp:<{box_width - 14}}┃
-┃  Priority  : {priority:<{box_width - 14}}┃
+┃  Priority  : {priority_display:<{box_width - 14}}┃
 ┗{'━' * box_width}┛"""

        issue_type = self._get_issue_type(issue)
@@ -1141,13 +1150,13 @@ class SystemHealthMonitor:
        # Add relevant SMART descriptions
        for attr in self.SMART_DESCRIPTIONS:
            if attr in issue:
-                description += f"\n{attr}:\n{self.SMART_DESCRIPTIONS[attr]}\n"
+                description += f"\n{attr}:\n{textwrap.dedent(self.SMART_DESCRIPTIONS[attr]).strip()}\n"

        if "SMART" in issue:
-            description += """
+            description += "\n" + textwrap.dedent("""
            SMART (Self-Monitoring, Analysis, and Reporting Technology) Attribute Details:
            - Possible drive failure!
-            """
+            """).strip() + "\n"

        if "Drive" in issue and "/dev/" in issue:
            try:
@@ -1165,7 +1174,18 @@ class SystemHealthMonitor:

                    power_on_hours = smart_data['attributes'].get('Power_On_Hours', 'N/A')
                    last_test_date = smart_data.get('last_test_date', 'N/A')
-                    age = f"{int(power_on_hours/24/365) if isinstance(power_on_hours, (int, float)) else 'N/A'} years" if power_on_hours != 'N/A' else 'N/A'
+                    if power_on_hours != 'N/A' and isinstance(power_on_hours, (int, float)):
+                        total_days = power_on_hours / 24
+                        years = int(total_days / 365)
+                        months = int((total_days % 365) / 30)
+                        if years >= 1:
+                            age = f"{years} year{'s' if years != 1 else ''}, {months} month{'s' if months != 1 else ''}"
+                        elif months >= 1:
+                            age = f"{months} month{'s' if months != 1 else ''}"
+                        else:
+                            age = "< 1 month"
+                    else:
+                        age = 'N/A'

                    # Ensure all values are properly formatted strings
                    device_safe = device or 'N/A'
@@ -1186,14 +1206,6 @@ class SystemHealthMonitor:
 ┗{'━' * box_width}┛
 """

-                    if drive_info:
-                        perf_metrics = {
-                            'read_speed': drive_info.get('performance_metrics', {}).get('read_speed', 'N/A'),
-                            'write_speed': drive_info.get('performance_metrics', {}).get('write_speed', 'N/A'),
-                            'access_time': drive_info.get('performance_metrics', {}).get('access_time', 'N/A'),
-                            'iops': drive_info.get('performance_metrics', {}).get('iops', 'N/A')
-                        }
-
                    power_on_safe = f"{power_on_hours} hours" if power_on_hours != 'N/A' else 'N/A'
                    last_test_safe = last_test_date or 'N/A'
                    age_safe = age or 'N/A'
@@ -1264,39 +1276,140 @@ class SystemHealthMonitor:
                description += f"\nError generating drive details: {str(e)}\n"
        
        if "Temperature" in issue:
-            description += """
+            description += "\n" + textwrap.dedent("""
            High drive temperatures can:
            - Reduce drive lifespan
            - Cause performance degradation
            - Lead to data corruption in extreme cases
            Optimal temperature range: 20-45°C
-            """
-        
+            """).strip() + "\n"
+
        if "ECC" in issue:
-            description += """
+            description += "\n" + textwrap.dedent("""
            ECC (Error Correction Code) Memory Issues:
            - Correctable: Memory errors that were successfully fixed
            - Uncorrectable: Serious memory errors that could not be corrected
            Frequent ECC corrections may indicate degrading memory modules
-            """
-        
+            """).strip() + "\n"
+
        if "CPU" in issue:
-            description += """
+            description += "\n" + textwrap.dedent("""
            High CPU usage sustained over time can indicate:
            - Resource constraints
            - Runaway processes
            - Need for performance optimization
            - Potential cooling issues
-            """
-        
+            """).strip() + "\n"
+
+            # Add CPU STATUS box
+            cpu_health = health_report.get('cpu_health', {})
+            cpu_usage = cpu_health.get('cpu_usage_percent', 'N/A')
+            cpu_threshold = self.CONFIG['THRESHOLDS']['CPU_WARNING']
+            cpu_status = cpu_health.get('status', 'N/A')
+            cpu_usage_str = f"{cpu_usage}%" if isinstance(cpu_usage, (int, float)) else cpu_usage
+
+            description += f"""
+┏━ CPU STATUS {'━' * (box_width - 13)}┓
+┃  Usage        │ {cpu_usage_str:<61}┃
+┃  Threshold    │ {str(cpu_threshold) + '%':<61}┃
+┃  Status       │ {cpu_status:<61}┃
+┗{'━' * box_width}┛
+"""
+
        if "Network" in issue:
-            description += """
+            description += "\n" + textwrap.dedent("""
            Network connectivity issues can impact:
            - Cluster communication
            - Data replication
            - Service availability
            - Management access
-            """
+            """).strip() + "\n"
+
+            # Add NETWORK STATUS box
+            net_health = health_report.get('network_health', {})
+            mgmt = net_health.get('management_network', {})
+            ceph_net = net_health.get('ceph_network', {})
+            mgmt_status = mgmt.get('status', 'N/A')
+            ceph_status = ceph_net.get('status', 'N/A')
+            mgmt_latency = mgmt.get('latency')
+            mgmt_latency_str = f"{mgmt_latency}ms" if mgmt_latency is not None else 'N/A'
+            mgmt_issues = mgmt.get('issues', [])
+            ceph_issues = ceph_net.get('issues', [])
+            all_net_issues = mgmt_issues + ceph_issues
+            issues_str = '; '.join(all_net_issues) if all_net_issues else 'None'
+            # Truncate issues string to fit in box
+            if len(issues_str) > 61:
+                issues_str = issues_str[:58] + '...'
+
+            description += f"""
+┏━ NETWORK STATUS {'━' * (box_width - 17)}┓
+┃  Management   │ {mgmt_status:<61}┃
+┃  Ceph Network │ {ceph_status:<61}┃
+┃  Latency      │ {mgmt_latency_str:<61}┃
+┃  Issues       │ {issues_str:<61}┃
+┗{'━' * box_width}┛
+"""
+
+        if any(kw in issue for kw in ["LXC", "storage usage", "container"]):
+            # Add CONTAINER STORAGE box
+            lxc_health = health_report.get('lxc_health', {})
+            containers = lxc_health.get('containers', [])
+            for container in containers:
+                vmid = container.get('vmid', 'N/A')
+                for fs in container.get('filesystems', []):
+                    mountpoint = fs.get('mountpoint', 'N/A')
+                    usage_pct = fs.get('usage_percent', 0)
+                    total_bytes = fs.get('total_space', 0)
+                    used_bytes = fs.get('used_space', 0)
+                    avail_bytes = fs.get('available', 0)
+                    # Only show filesystems relevant to this issue
+                    if mountpoint not in issue and vmid not in issue:
+                        continue
+                    total_str = self._format_bytes_human(total_bytes) if isinstance(total_bytes, (int, float)) else str(total_bytes)
+                    used_str = self._format_bytes_human(used_bytes) if isinstance(used_bytes, (int, float)) else str(used_bytes)
+                    free_str = self._format_bytes_human(avail_bytes) if isinstance(avail_bytes, (int, float)) else str(avail_bytes)
+                    # Create 50-char usage meter (2% per block)
+                    blocks = int(usage_pct / 2)
+                    usage_meter = '█' * blocks + '░' * (50 - blocks)
+                    usage_pct_str = f"{usage_pct:.1f}%"
+
+                    description += f"""
+┏━ CONTAINER STORAGE {'━' * (box_width - 20)}┓
+┃  VMID         │ {vmid:<61}┃
+┃  Mountpoint   │ {mountpoint:<61}┃
+┃  Usage Meter  │ {usage_meter} {usage_pct_str:>10}┃
+┃  Total        │ {total_str:<61}┃
+┃  Used         │ {used_str:<61}┃
+┃  Free         │ {free_str:<61}┃
+┗{'━' * box_width}┛
+"""
+
+        if any(kw in issue for kw in ["Ceph", "OSD", "ceph", "HEALTH_ERR", "HEALTH_WARN"]):
+            # Add CEPH CLUSTER STATUS box
+            ceph_health = health_report.get('ceph_health', {})
+            if ceph_health.get('is_ceph_node'):
+                cluster_health = ceph_health.get('cluster_health', 'N/A')
+                cluster_usage = ceph_health.get('cluster_usage', {})
+                usage_pct = cluster_usage.get('usage_percent', 'N/A') if cluster_usage else 'N/A'
+                total_bytes = cluster_usage.get('total_bytes', 0) if cluster_usage else 0
+                used_bytes = cluster_usage.get('used_bytes', 0) if cluster_usage else 0
+                total_str = self._format_bytes_human(total_bytes) if total_bytes else 'N/A'
+                used_str = self._format_bytes_human(used_bytes) if used_bytes else 'N/A'
+                usage_pct_str = f"{usage_pct}%" if isinstance(usage_pct, (int, float)) else usage_pct
+                osd_list = ceph_health.get('osd_status', [])
+                osd_total = len(osd_list)
+                osd_up = sum(1 for o in osd_list if o.get('status') == 'up')
+                osd_summary = f"{osd_up}/{osd_total} up" if osd_total > 0 else 'N/A'
+
+                description += f"""
+┏━ CEPH CLUSTER STATUS {'━' * (box_width - 22)}┓
+┃  Health       │ {cluster_health:<61}┃
+┃  Usage        │ {usage_pct_str:<61}┃
+┃  Total        │ {total_str:<61}┃
+┃  Used         │ {used_str:<61}┃
+┃  OSDs         │ {osd_summary:<61}┃
+┗{'━' * box_width}┛
+"""
        
        if "Disk" in issue:
            for partition in health_report.get('drives_health', {}).get('drives', []):
@@ -1602,7 +1715,7 @@ class SystemHealthMonitor:
                f"{environment['PRODUCTION']}"
                f"{ticket_type_tag}"
            )
-            description = self._generate_detailed_description(issue, health_report)
+            description = self._generate_detailed_description(issue, health_report, priority)

            ticket_payload = {
                "title": ticket_title,
@@ -1822,6 +1935,14 @@ class SystemHealthMonitor:
                return True
        return False
    
+    def _format_bytes_human(self, num_bytes):
+        """Format a byte count into a human-readable string."""
+        for unit in ['B', 'KB', 'MB', 'GB', 'TB', 'PB']:
+            if abs(num_bytes) < 1024.0:
+                return f"{num_bytes:.1f} {unit}"
+            num_bytes /= 1024.0
+        return f"{num_bytes:.1f} EB"
+
    def _parse_size(self, size_str: str) -> float:
        """
        Parse size string with units to bytes.