Fix ticket overview: priority display, impact indicators, non-drive detail boxes

- Replace emoji severity indicators (🔴🟡🟢) with ASCII tags: [CRIT]/[WARN]/[ OK ]/[ ?? ] in SEVERITY_INDICATORS, [CRIT]/[WARN]/[LOW] in _get_impact_level()
- Fix banner priority to show actual P1-P5 level instead of hardcoded HIGH/MEDIUM
- Add LXC/container keyword detection to _get_issue_type()
- Rewrite _get_impact_level() with storage/CPU awareness to avoid false Critical
- Fix indentation of the SMART, Temperature, ECC, CPU, and Network description text with textwrap.dedent()
- Fix drive age showing "0 years" for drives < 1 year old (now shows months)
- Remove unused perf_metrics block
- Add structured boxed sections for CPU, Network, Container, and Ceph tickets
- Add _format_bytes_human() helper for LXC storage display

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-06 19:52:51 -05:00
parent 70b02de104
commit 058ea5ad06

View File

@@ -1,5 +1,5 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re, glob, datetime, fcntl import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re, glob, datetime, fcntl, textwrap
from typing import Dict, Any, List from typing import Dict, Any, List
# ============================================================================= # =============================================================================
@@ -422,10 +422,10 @@ class SystemHealthMonitor:
} }
SEVERITY_INDICATORS = { SEVERITY_INDICATORS = {
'CRITICAL': '🔴', 'CRITICAL': '[CRIT]',
'WARNING': '🟡', 'WARNING': '[WARN]',
'HEALTHY': '🟢', 'HEALTHY': '[ OK ]',
'UNKNOWN': '' 'UNKNOWN': '[ ?? ]'
} }
SMART_DESCRIPTIONS = { SMART_DESCRIPTIONS = {
@@ -1096,22 +1096,31 @@ class SystemHealthMonitor:
return "Performance Issue" return "Performance Issue"
elif "Network" in issue: elif "Network" in issue:
return "Network Issue" return "Network Issue"
elif any(kw in issue for kw in ["LXC", "storage usage", "container"]):
return "Container Storage Issue"
return "Hardware Issue" return "Hardware Issue"
def _get_impact_level(self, issue: str) -> str: def _get_impact_level(self, issue: str) -> str:
"""Determine impact level from issue description.""" """Determine impact level from issue description."""
issue_upper = issue.upper() issue_upper = issue.upper()
# Check storage/CPU warnings first so "critical storage" isn't caught as Critical
if any(kw in issue_upper for kw in ["STORAGE USAGE", "THRESHOLD", "CPU USAGE"]):
return "[WARN] Warning - Action Needed Soon"
if "CRITICAL" in issue_upper or "UNHEALTHY" in issue_upper or "HEALTH_ERR" in issue_upper: if "CRITICAL" in issue_upper or "UNHEALTHY" in issue_upper or "HEALTH_ERR" in issue_upper:
return "🔴 Critical - Immediate Action Required" return "[CRIT] Critical - Immediate Action Required"
elif "WARNING" in issue_upper or "HEALTH_WARN" in issue_upper or "DOWN" in issue_upper: elif "WARNING" in issue_upper or "HEALTH_WARN" in issue_upper or "DOWN" in issue_upper:
return "🟡 Warning - Action Needed Soon" return "[WARN] Warning - Action Needed Soon"
return "🟢 Low - Monitor Only" return "[LOW] Low - Monitor Only"
def _generate_detailed_description(self, issue: str, health_report: Dict[str, Any]) -> str: def _generate_detailed_description(self, issue: str, health_report: Dict[str, Any], priority: str = '3') -> str:
"""Generate detailed ticket description with properly formatted ASCII art.""" """Generate detailed ticket description with properly formatted ASCII art."""
hostname = socket.gethostname() hostname = socket.gethostname()
timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
priority = "⚠ HIGH" if "CRITICAL" in issue else "● MEDIUM" priority_labels = {
'1': '⚠ P1 - CRITICAL', '2': '⚠ P2 - HIGH',
'3': '● P3 - MEDIUM', '4': '● P4 - NORMAL', '5': '● P5 - LOW',
}
priority_display = priority_labels.get(priority, '● P3 - MEDIUM')
# Box width: all lines are exactly 80 chars # Box width: all lines are exactly 80 chars
# border lines: ┏ + 78 ━ + ┓ = 80 # border lines: ┏ + 78 ━ + ┓ = 80
@@ -1124,7 +1133,7 @@ class SystemHealthMonitor:
{'' * box_width} {'' * box_width}
┃ Host : {hostname:<{box_width - 14}} ┃ Host : {hostname:<{box_width - 14}}
┃ Generated : {timestamp:<{box_width - 14}} ┃ Generated : {timestamp:<{box_width - 14}}
┃ Priority : {priority:<{box_width - 14}} ┃ Priority : {priority_display:<{box_width - 14}}
{'' * box_width}""" {'' * box_width}"""
issue_type = self._get_issue_type(issue) issue_type = self._get_issue_type(issue)
@@ -1141,13 +1150,13 @@ class SystemHealthMonitor:
# Add relevant SMART descriptions # Add relevant SMART descriptions
for attr in self.SMART_DESCRIPTIONS: for attr in self.SMART_DESCRIPTIONS:
if attr in issue: if attr in issue:
description += f"\n{attr}:\n{self.SMART_DESCRIPTIONS[attr]}\n" description += f"\n{attr}:\n{textwrap.dedent(self.SMART_DESCRIPTIONS[attr]).strip()}\n"
if "SMART" in issue: if "SMART" in issue:
description += """ description += "\n" + textwrap.dedent("""
SMART (Self-Monitoring, Analysis, and Reporting Technology) Attribute Details: SMART (Self-Monitoring, Analysis, and Reporting Technology) Attribute Details:
- Possible drive failure! - Possible drive failure!
""" """).strip() + "\n"
if "Drive" in issue and "/dev/" in issue: if "Drive" in issue and "/dev/" in issue:
try: try:
@@ -1165,7 +1174,18 @@ class SystemHealthMonitor:
power_on_hours = smart_data['attributes'].get('Power_On_Hours', 'N/A') power_on_hours = smart_data['attributes'].get('Power_On_Hours', 'N/A')
last_test_date = smart_data.get('last_test_date', 'N/A') last_test_date = smart_data.get('last_test_date', 'N/A')
age = f"{int(power_on_hours/24/365) if isinstance(power_on_hours, (int, float)) else 'N/A'} years" if power_on_hours != 'N/A' else 'N/A' if power_on_hours != 'N/A' and isinstance(power_on_hours, (int, float)):
total_days = power_on_hours / 24
years = int(total_days / 365)
months = int((total_days % 365) / 30)
if years >= 1:
age = f"{years} year{'s' if years != 1 else ''}, {months} month{'s' if months != 1 else ''}"
elif months >= 1:
age = f"{months} month{'s' if months != 1 else ''}"
else:
age = "< 1 month"
else:
age = 'N/A'
# Ensure all values are properly formatted strings # Ensure all values are properly formatted strings
device_safe = device or 'N/A' device_safe = device or 'N/A'
@@ -1186,14 +1206,6 @@ class SystemHealthMonitor:
{'' * box_width} {'' * box_width}
""" """
if drive_info:
perf_metrics = {
'read_speed': drive_info.get('performance_metrics', {}).get('read_speed', 'N/A'),
'write_speed': drive_info.get('performance_metrics', {}).get('write_speed', 'N/A'),
'access_time': drive_info.get('performance_metrics', {}).get('access_time', 'N/A'),
'iops': drive_info.get('performance_metrics', {}).get('iops', 'N/A')
}
power_on_safe = f"{power_on_hours} hours" if power_on_hours != 'N/A' else 'N/A' power_on_safe = f"{power_on_hours} hours" if power_on_hours != 'N/A' else 'N/A'
last_test_safe = last_test_date or 'N/A' last_test_safe = last_test_date or 'N/A'
age_safe = age or 'N/A' age_safe = age or 'N/A'
@@ -1264,39 +1276,140 @@ class SystemHealthMonitor:
description += f"\nError generating drive details: {str(e)}\n" description += f"\nError generating drive details: {str(e)}\n"
if "Temperature" in issue: if "Temperature" in issue:
description += """ description += "\n" + textwrap.dedent("""
High drive temperatures can: High drive temperatures can:
- Reduce drive lifespan - Reduce drive lifespan
- Cause performance degradation - Cause performance degradation
- Lead to data corruption in extreme cases - Lead to data corruption in extreme cases
Optimal temperature range: 20-45°C Optimal temperature range: 20-45°C
""" """).strip() + "\n"
if "ECC" in issue: if "ECC" in issue:
description += """ description += "\n" + textwrap.dedent("""
ECC (Error Correction Code) Memory Issues: ECC (Error Correction Code) Memory Issues:
- Correctable: Memory errors that were successfully fixed - Correctable: Memory errors that were successfully fixed
- Uncorrectable: Serious memory errors that could not be corrected - Uncorrectable: Serious memory errors that could not be corrected
Frequent ECC corrections may indicate degrading memory modules Frequent ECC corrections may indicate degrading memory modules
""" """).strip() + "\n"
if "CPU" in issue: if "CPU" in issue:
description += """ description += "\n" + textwrap.dedent("""
High CPU usage sustained over time can indicate: High CPU usage sustained over time can indicate:
- Resource constraints - Resource constraints
- Runaway processes - Runaway processes
- Need for performance optimization - Need for performance optimization
- Potential cooling issues - Potential cooling issues
""" """).strip() + "\n"
# Add CPU STATUS box
cpu_health = health_report.get('cpu_health', {})
cpu_usage = cpu_health.get('cpu_usage_percent', 'N/A')
cpu_threshold = self.CONFIG['THRESHOLDS']['CPU_WARNING']
cpu_status = cpu_health.get('status', 'N/A')
cpu_usage_str = f"{cpu_usage}%" if isinstance(cpu_usage, (int, float)) else cpu_usage
description += f"""
┏━ CPU STATUS {'' * (box_width - 13)}
┃ Usage │ {cpu_usage_str:<61}
┃ Threshold │ {str(cpu_threshold) + '%':<61}
┃ Status │ {cpu_status:<61}
{'' * box_width}
"""
if "Network" in issue: if "Network" in issue:
description += """ description += "\n" + textwrap.dedent("""
Network connectivity issues can impact: Network connectivity issues can impact:
- Cluster communication - Cluster communication
- Data replication - Data replication
- Service availability - Service availability
- Management access - Management access
""" """).strip() + "\n"
# Add NETWORK STATUS box
net_health = health_report.get('network_health', {})
mgmt = net_health.get('management_network', {})
ceph_net = net_health.get('ceph_network', {})
mgmt_status = mgmt.get('status', 'N/A')
ceph_status = ceph_net.get('status', 'N/A')
mgmt_latency = mgmt.get('latency')
mgmt_latency_str = f"{mgmt_latency}ms" if mgmt_latency is not None else 'N/A'
mgmt_issues = mgmt.get('issues', [])
ceph_issues = ceph_net.get('issues', [])
all_net_issues = mgmt_issues + ceph_issues
issues_str = '; '.join(all_net_issues) if all_net_issues else 'None'
# Truncate issues string to fit in box
if len(issues_str) > 61:
issues_str = issues_str[:58] + '...'
description += f"""
┏━ NETWORK STATUS {'' * (box_width - 17)}
┃ Management │ {mgmt_status:<61}
┃ Ceph Network │ {ceph_status:<61}
┃ Latency │ {mgmt_latency_str:<61}
┃ Issues │ {issues_str:<61}
{'' * box_width}
"""
if any(kw in issue for kw in ["LXC", "storage usage", "container"]):
# Add CONTAINER STORAGE box
lxc_health = health_report.get('lxc_health', {})
containers = lxc_health.get('containers', [])
for container in containers:
vmid = container.get('vmid', 'N/A')
for fs in container.get('filesystems', []):
mountpoint = fs.get('mountpoint', 'N/A')
usage_pct = fs.get('usage_percent', 0)
total_bytes = fs.get('total_space', 0)
used_bytes = fs.get('used_space', 0)
avail_bytes = fs.get('available', 0)
# Only show filesystems relevant to this issue
if mountpoint not in issue and vmid not in issue:
continue
total_str = self._format_bytes_human(total_bytes) if isinstance(total_bytes, (int, float)) else str(total_bytes)
used_str = self._format_bytes_human(used_bytes) if isinstance(used_bytes, (int, float)) else str(used_bytes)
free_str = self._format_bytes_human(avail_bytes) if isinstance(avail_bytes, (int, float)) else str(avail_bytes)
# Create 50-char usage meter (2% per block)
blocks = int(usage_pct / 2)
usage_meter = '' * blocks + '' * (50 - blocks)
usage_pct_str = f"{usage_pct:.1f}%"
description += f"""
┏━ CONTAINER STORAGE {'' * (box_width - 20)}
┃ VMID │ {vmid:<61}
┃ Mountpoint │ {mountpoint:<61}
┃ Usage Meter │ {usage_meter} {usage_pct_str:>10}
┃ Total │ {total_str:<61}
┃ Used │ {used_str:<61}
┃ Free │ {free_str:<61}
{'' * box_width}
"""
if any(kw in issue for kw in ["Ceph", "OSD", "ceph", "HEALTH_ERR", "HEALTH_WARN"]):
# Add CEPH CLUSTER STATUS box
ceph_health = health_report.get('ceph_health', {})
if ceph_health.get('is_ceph_node'):
cluster_health = ceph_health.get('cluster_health', 'N/A')
cluster_usage = ceph_health.get('cluster_usage', {})
usage_pct = cluster_usage.get('usage_percent', 'N/A') if cluster_usage else 'N/A'
total_bytes = cluster_usage.get('total_bytes', 0) if cluster_usage else 0
used_bytes = cluster_usage.get('used_bytes', 0) if cluster_usage else 0
total_str = self._format_bytes_human(total_bytes) if total_bytes else 'N/A'
used_str = self._format_bytes_human(used_bytes) if used_bytes else 'N/A'
usage_pct_str = f"{usage_pct}%" if isinstance(usage_pct, (int, float)) else usage_pct
osd_list = ceph_health.get('osd_status', [])
osd_total = len(osd_list)
osd_up = sum(1 for o in osd_list if o.get('status') == 'up')
osd_summary = f"{osd_up}/{osd_total} up" if osd_total > 0 else 'N/A'
description += f"""
┏━ CEPH CLUSTER STATUS {'' * (box_width - 22)}
┃ Health │ {cluster_health:<61}
┃ Usage │ {usage_pct_str:<61}
┃ Total │ {total_str:<61}
┃ Used │ {used_str:<61}
┃ OSDs │ {osd_summary:<61}
{'' * box_width}
"""
if "Disk" in issue: if "Disk" in issue:
for partition in health_report.get('drives_health', {}).get('drives', []): for partition in health_report.get('drives_health', {}).get('drives', []):
@@ -1602,7 +1715,7 @@ class SystemHealthMonitor:
f"{environment['PRODUCTION']}" f"{environment['PRODUCTION']}"
f"{ticket_type_tag}" f"{ticket_type_tag}"
) )
description = self._generate_detailed_description(issue, health_report) description = self._generate_detailed_description(issue, health_report, priority)
ticket_payload = { ticket_payload = {
"title": ticket_title, "title": ticket_title,
@@ -1822,6 +1935,14 @@ class SystemHealthMonitor:
return True return True
return False return False
def _format_bytes_human(self, num_bytes: Any) -> str:
    """Format a byte count into a human-readable string.

    The value is divided by 1024 until its magnitude drops below 1024,
    then rendered with one decimal place and the matching unit
    (B, KB, MB, GB, TB, PB). Anything still >= 1024 PB is reported in EB.
    The sign of the input is preserved.

    Args:
        num_bytes: Byte count as an int or float. Non-numeric values are
            tolerated so callers do not have to pre-validate report data.

    Returns:
        A string such as "1.5 KB". ``None`` yields "N/A"; any other
        non-numeric value is returned as ``str(value)`` instead of raising.
    """
    # Missing data is shown as 'N/A', the placeholder the ticket boxes use.
    if num_bytes is None:
        return 'N/A'
    # Mirror the callers' own fallback (str(value)) for unexpected types.
    if not isinstance(num_bytes, (int, float)):
        return str(num_bytes)

    # Scale a local copy rather than rebinding the parameter.
    scaled = num_bytes
    for unit in ('B', 'KB', 'MB', 'GB', 'TB', 'PB'):
        if abs(scaled) < 1024.0:
            return f"{scaled:.1f} {unit}"
        scaled /= 1024.0
    return f"{scaled:.1f} EB"
def _parse_size(self, size_str: str) -> float: def _parse_size(self, size_str: str) -> float:
""" """
Parse size string with units to bytes. Parse size string with units to bytes.