ci: add flake8 lint workflow; fix unused imports and f-string issues
Lint / Python (flake8) (push) Failing after 4s

Adds .gitea/workflows/lint.yml running flake8 with .flake8 config.
Removes unused sys/urllib.request imports (F401).
Removes f prefix from 52 f-strings that had no placeholders (F541).
Auto-fixes trailing whitespace in blank lines (W293) via autopep8.
Fixes over-indentation in LXC storage check try block (E117).
Config ignores F841 (unused locals) and E501 (long lines).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-13 22:27:15 -04:00
parent 03320c0ece
commit cbbafa05c2
3 changed files with 312 additions and 272 deletions
+6
View File
@@ -0,0 +1,6 @@
[flake8]
max-line-length = 120
# F841: local variable assigned but never used — many are intentional debug/future-use assignments
# E501: line too long — URLs and log messages in monitoring code are exempt
extend-ignore = F841, E501
exclude = __pycache__, .git
+20
View File
@@ -0,0 +1,20 @@
name: Lint
on:
push:
branches: ["**"]
pull_request:
branches: ["**"]
jobs:
python-lint:
name: Python (flake8)
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Install flake8
run: pip install flake8
- name: Run flake8
run: flake8 .
+112 -98
View File
@@ -1,5 +1,18 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re, glob, datetime, fcntl, textwrap, shutil import os
import json
import requests
import psutil
import socket
import subprocess
import logging
import argparse
import re
import glob
import datetime
import fcntl
import textwrap
import shutil
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, Any, List from typing import Dict, Any, List
@@ -83,8 +96,8 @@ class SystemHealthMonitor:
# PBS (Proxmox Backup Server) issues # PBS (Proxmox Backup Server) issues
'PBS_ZFS_DEGRADED': PRIORITIES['CRITICAL'], # P1 - ZFS pool degraded 'PBS_ZFS_DEGRADED': PRIORITIES['CRITICAL'], # P1 - ZFS pool degraded
'PBS_ZFS_USAGE_CRITICAL': PRIORITIES['HIGH'], # P2 - ZFS pool near full 'PBS_ZFS_USAGE_CRITICAL': PRIORITIES['HIGH'], # P2 - ZFS pool near full
'PBS_ZFS_USAGE_WARNING': PRIORITIES['MEDIUM'],# P3 - ZFS pool usage high 'PBS_ZFS_USAGE_WARNING': PRIORITIES['MEDIUM'], # P3 - ZFS pool usage high
'PBS_ZFS_ERRORS': PRIORITIES['HIGH'], # P2 - ZFS pool has errors 'PBS_ZFS_ERRORS': PRIORITIES['HIGH'], # P2 - ZFS pool has errors
'PBS_BACKUP_FAILED': PRIORITIES['HIGH'], # P2 - Backup job failed 'PBS_BACKUP_FAILED': PRIORITIES['HIGH'], # P2 - Backup job failed
'PBS_GC_FAILED': PRIORITIES['MEDIUM'], # P3 - Garbage collection failed 'PBS_GC_FAILED': PRIORITIES['MEDIUM'], # P3 - Garbage collection failed
@@ -272,7 +285,7 @@ class SystemHealthMonitor:
'ISSUE': '[issue]', # General issue (replaces invalid 'incident') 'ISSUE': '[issue]', # General issue (replaces invalid 'incident')
'PROBLEM': '[problem]', # Root cause investigation 'PROBLEM': '[problem]', # Root cause investigation
'TASK': '[task]', # Planned work item 'TASK': '[task]', # Planned work item
'MAINTENANCE': '[maintenance]', # Scheduled/preventive work 'MAINTENANCE': '[maintenance]', # Scheduled/preventive work
'UPGRADE': '[upgrade]' # Hardware/software upgrade 'UPGRADE': '[upgrade]' # Hardware/software upgrade
}, },
'HARDWARE_TYPE': { 'HARDWARE_TYPE': {
@@ -300,7 +313,7 @@ class SystemHealthMonitor:
'ISSUE': 'Issue', # General issue/incident 'ISSUE': 'Issue', # General issue/incident
'PROBLEM': 'Problem', # Root cause investigation needed 'PROBLEM': 'Problem', # Root cause investigation needed
'TASK': 'Task', # Planned work item 'TASK': 'Task', # Planned work item
'MAINTENANCE': 'Maintenance', # Scheduled/preventive work 'MAINTENANCE': 'Maintenance', # Scheduled/preventive work
'UPGRADE': 'Upgrade', # Hardware/software upgrade 'UPGRADE': 'Upgrade', # Hardware/software upgrade
'INSTALL': 'Install', # New installation 'INSTALL': 'Install', # New installation
'REQUEST': 'Request' # Service or information request 'REQUEST': 'Request' # Service or information request
@@ -991,7 +1004,7 @@ class SystemHealthMonitor:
# Analyze trends for critical attributes # Analyze trends for critical attributes
if len(history) >= 3: # Need at least 3 data points for trend analysis if len(history) >= 3: # Need at least 3 data points for trend analysis
critical_attrs = ['Reallocated_Sector_Ct', 'Current_Pending_Sector', 'Reported_Uncorrect', critical_attrs = ['Reallocated_Sector_Ct', 'Current_Pending_Sector', 'Reported_Uncorrect',
'Offline_Uncorrectable', 'Program_Fail_Count', 'Erase_Fail_Count'] 'Offline_Uncorrectable', 'Program_Fail_Count', 'Erase_Fail_Count']
for attr in critical_attrs: for attr in critical_attrs:
if attr in current_attributes: if attr in current_attributes:
@@ -1171,7 +1184,7 @@ class SystemHealthMonitor:
try: try:
# Check dmesg for drive-related errors (last 1000 lines to avoid overwhelming output) # Check dmesg for drive-related errors (last 1000 lines to avoid overwhelming output)
result = subprocess.run(['dmesg', '-T', '--level=err,warn'], result = subprocess.run(['dmesg', '-T', '--level=err,warn'],
stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, timeout=10) stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, timeout=10)
if result.returncode == 0: if result.returncode == 0:
error_patterns = [ error_patterns = [
@@ -1272,7 +1285,6 @@ class SystemHealthMonitor:
self._drive_details_cache[device] = drive_details self._drive_details_cache[device] = drive_details
return drive_details return drive_details
def _get_issue_type(self, issue: str) -> str: def _get_issue_type(self, issue: str) -> str:
"""Determine issue type from issue description.""" """Determine issue type from issue description."""
if "SMART" in issue: if "SMART" in issue:
@@ -1318,7 +1330,7 @@ class SystemHealthMonitor:
# content lines: prefix + field_width + ┃ = 80 # content lines: prefix + field_width + ┃ = 80
box_width = 78 box_width = 78
banner = f""" banner = """
{'' * box_width}┓ {'' * box_width}┓
{' HARDWARE MONITORING ALERT TICKET '.center(box_width)}┃ {' HARDWARE MONITORING ALERT TICKET '.center(box_width)}┃
{'' * box_width}┫ {'' * box_width}┫
@@ -1330,7 +1342,7 @@ class SystemHealthMonitor:
issue_type = self._get_issue_type(issue) issue_type = self._get_issue_type(issue)
impact_level = self._get_impact_level(issue) impact_level = self._get_impact_level(issue)
executive_summary = f""" executive_summary = """
┏━ EXECUTIVE SUMMARY {'' * (box_width - 20)}┓ ┏━ EXECUTIVE SUMMARY {'' * (box_width - 20)}┓
┃ Issue Type │ {issue_type:<60} ┃ Issue Type │ {issue_type:<60}
┃ Impact Level │ {impact_level:<60} ┃ Impact Level │ {impact_level:<60}
@@ -1395,7 +1407,7 @@ class SystemHealthMonitor:
type_safe = drive_details.get('type') or 'N/A' type_safe = drive_details.get('type') or 'N/A'
firmware_safe = drive_details.get('firmware') or 'N/A' firmware_safe = drive_details.get('firmware') or 'N/A'
description += f""" description += """
┏━ DRIVE SPECIFICATIONS {'' * (box_width - 23)}┓ ┏━ DRIVE SPECIFICATIONS {'' * (box_width - 23)}┓
┃ Device Path │ {device_safe:<61} ┃ Device Path │ {device_safe:<61}
┃ Model │ {model_safe:<61} ┃ Model │ {model_safe:<61}
@@ -1410,7 +1422,7 @@ class SystemHealthMonitor:
last_test_safe = last_test_date or 'N/A' last_test_safe = last_test_date or 'N/A'
age_safe = age or 'N/A' age_safe = age or 'N/A'
description += f""" description += """
┏━ DRIVE TIMELINE {'' * (box_width - 17)}┓ ┏━ DRIVE TIMELINE {'' * (box_width - 17)}┓
┃ Power-On Hours │ {power_on_safe:<56} ┃ Power-On Hours │ {power_on_safe:<56}
┃ Last SMART Test │ {last_test_safe:<56} ┃ Last SMART Test │ {last_test_safe:<56}
@@ -1423,7 +1435,7 @@ class SystemHealthMonitor:
temp_value = drive_info.get('temperature') temp_value = drive_info.get('temperature')
temp_safe = f"{temp_value}°C" if temp_value is not None else 'N/A' temp_safe = f"{temp_value}°C" if temp_value is not None else 'N/A'
description += f""" description += """
┏━ SMART STATUS {'' * (box_width - 15)}┓ ┏━ SMART STATUS {'' * (box_width - 15)}┓
┃ Status │ {smart_status_safe:<62} ┃ Status │ {smart_status_safe:<62}
┃ Temperature │ {temp_safe:<62} ┃ Temperature │ {temp_safe:<62}
@@ -1455,7 +1467,7 @@ class SystemHealthMonitor:
# Truncate mountpoint if too long for header # Truncate mountpoint if too long for header
mountpoint_display = mountpoint_safe[:50] if len(mountpoint_safe) > 50 else mountpoint_safe mountpoint_display = mountpoint_safe[:50] if len(mountpoint_safe) > 50 else mountpoint_safe
description += f""" description += """
┏━ PARTITION: {mountpoint_display} {'' * (box_width - 14 - len(mountpoint_display))}┓ ┏━ PARTITION: {mountpoint_display} {'' * (box_width - 14 - len(mountpoint_display))}┓
┃ Filesystem │ {fstype_safe:<61} ┃ Filesystem │ {fstype_safe:<61}
┃ Usage Meter │ {usage_meter} {usage_pct_str:>10} ┃ Usage Meter │ {usage_meter} {usage_pct_str:>10}
@@ -1508,7 +1520,7 @@ class SystemHealthMonitor:
cpu_status = cpu_health.get('status', 'N/A') cpu_status = cpu_health.get('status', 'N/A')
cpu_usage_str = f"{cpu_usage}%" if isinstance(cpu_usage, (int, float)) else cpu_usage cpu_usage_str = f"{cpu_usage}%" if isinstance(cpu_usage, (int, float)) else cpu_usage
description += f""" description += """
┏━ CPU STATUS {'' * (box_width - 13)}┓ ┏━ CPU STATUS {'' * (box_width - 13)}┓
┃ Usage │ {cpu_usage_str:<61} ┃ Usage │ {cpu_usage_str:<61}
┃ Threshold │ {str(cpu_threshold) + '%':<61}┃ ┃ Threshold │ {str(cpu_threshold) + '%':<61}┃
@@ -1541,7 +1553,7 @@ class SystemHealthMonitor:
if len(issues_str) > 61: if len(issues_str) > 61:
issues_str = issues_str[:58] + '...' issues_str = issues_str[:58] + '...'
description += f""" description += """
┏━ NETWORK STATUS {'' * (box_width - 17)}┓ ┏━ NETWORK STATUS {'' * (box_width - 17)}┓
┃ Management │ {mgmt_status:<61} ┃ Management │ {mgmt_status:<61}
┃ Ceph Network │ {ceph_status:<61} ┃ Ceph Network │ {ceph_status:<61}
@@ -1573,7 +1585,7 @@ class SystemHealthMonitor:
usage_meter = '' * blocks + '' * (50 - blocks) usage_meter = '' * blocks + '' * (50 - blocks)
usage_pct_str = f"{usage_pct:.1f}%" usage_pct_str = f"{usage_pct:.1f}%"
description += f""" description += """
┏━ CONTAINER STORAGE {'' * (box_width - 20)}┓ ┏━ CONTAINER STORAGE {'' * (box_width - 20)}┓
┃ VMID │ {vmid:<61} ┃ VMID │ {vmid:<61}
┃ Mountpoint │ {mountpoint:<61} ┃ Mountpoint │ {mountpoint:<61}
@@ -1601,7 +1613,7 @@ class SystemHealthMonitor:
osd_up = sum(1 for o in osd_list if o.get('status') == 'up') osd_up = sum(1 for o in osd_list if o.get('status') == 'up')
osd_summary = f"{osd_up}/{osd_total} up" if osd_total > 0 else 'N/A' osd_summary = f"{osd_up}/{osd_total} up" if osd_total > 0 else 'N/A'
description += f""" description += """
┏━ CEPH CLUSTER STATUS {'' * (box_width - 22)}┓ ┏━ CEPH CLUSTER STATUS {'' * (box_width - 22)}┓
┃ Health │ {cluster_health:<61} ┃ Health │ {cluster_health:<61}
┃ Usage │ {usage_pct_str:<61} ┃ Usage │ {usage_pct_str:<61}
@@ -1614,7 +1626,7 @@ class SystemHealthMonitor:
if "Disk" in issue: if "Disk" in issue:
for partition in health_report.get('drives_health', {}).get('drives', []): for partition in health_report.get('drives_health', {}).get('drives', []):
if partition.get('mountpoint') in issue: if partition.get('mountpoint') in issue:
description += f"\n=== Disk Metrics ===\n" description += "\n=== Disk Metrics ===\n"
description += f"Disk Device: {partition['device']}\n" description += f"Disk Device: {partition['device']}\n"
description += f"Mount Point: {partition['mountpoint']}\n" description += f"Mount Point: {partition['mountpoint']}\n"
description += f"Total Space: {partition['total_space']}\n" description += f"Total Space: {partition['total_space']}\n"
@@ -1973,7 +1985,7 @@ class SystemHealthMonitor:
response = requests.post( response = requests.post(
self.ticket_api_url, self.ticket_api_url,
json=ticket_payload, json=ticket_payload,
headers = { headers={
'Content-Type': 'application/json', 'Content-Type': 'application/json',
'Authorization': f'Bearer {self.CONFIG["TICKET_API_KEY"]}' 'Authorization': f'Bearer {self.CONFIG["TICKET_API_KEY"]}'
}, },
@@ -3373,7 +3385,7 @@ class SystemHealthMonitor:
except json.JSONDecodeError as e: except json.JSONDecodeError as e:
logger.warning(f"Failed to parse ceph mon stat JSON: {e}") logger.warning(f"Failed to parse ceph mon stat JSON: {e}")
logger.debug(f"=== Ceph Health Check ===") logger.debug("=== Ceph Health Check ===")
logger.debug(f"Is Ceph node: {ceph_health['is_ceph_node']}") logger.debug(f"Is Ceph node: {ceph_health['is_ceph_node']}")
logger.debug(f"Cluster health: {ceph_health['cluster_health']}") logger.debug(f"Cluster health: {ceph_health['cluster_health']}")
logger.debug(f"Cluster usage: {ceph_health['cluster_usage']}") logger.debug(f"Cluster usage: {ceph_health['cluster_usage']}")
@@ -3597,22 +3609,22 @@ class SystemHealthMonitor:
return '{' + ','.join(pairs) + '}' if pairs else '' return '{' + ','.join(pairs) + '}' if pairs else ''
# === System Info === # === System Info ===
metrics.append(f'# HELP hwmon_info System information') metrics.append('# HELP hwmon_info System information')
metrics.append(f'# TYPE hwmon_info gauge') metrics.append('# TYPE hwmon_info gauge')
metrics.append(f'hwmon_info{labels(hostname=hostname)} 1') metrics.append(f'hwmon_info{labels(hostname=hostname)} 1')
# === Drive Metrics === # === Drive Metrics ===
metrics.append(f'# HELP hwmon_drive_smart_healthy SMART health status (1=healthy, 0=unhealthy)') metrics.append('# HELP hwmon_drive_smart_healthy SMART health status (1=healthy, 0=unhealthy)')
metrics.append(f'# TYPE hwmon_drive_smart_healthy gauge') metrics.append('# TYPE hwmon_drive_smart_healthy gauge')
metrics.append(f'# HELP hwmon_drive_temperature_celsius Drive temperature in Celsius') metrics.append('# HELP hwmon_drive_temperature_celsius Drive temperature in Celsius')
metrics.append(f'# TYPE hwmon_drive_temperature_celsius gauge') metrics.append('# TYPE hwmon_drive_temperature_celsius gauge')
metrics.append(f'# HELP hwmon_drive_size_bytes Drive total size in bytes') metrics.append('# HELP hwmon_drive_size_bytes Drive total size in bytes')
metrics.append(f'# TYPE hwmon_drive_size_bytes gauge') metrics.append('# TYPE hwmon_drive_size_bytes gauge')
metrics.append(f'# HELP hwmon_drive_smart_issues_total Number of SMART issues detected') metrics.append('# HELP hwmon_drive_smart_issues_total Number of SMART issues detected')
metrics.append(f'# TYPE hwmon_drive_smart_issues_total gauge') metrics.append('# TYPE hwmon_drive_smart_issues_total gauge')
for drive in health_report.get('drives_health', {}).get('drives', []): for drive in health_report.get('drives_health', {}).get('drives', []):
device = drive.get('device', 'unknown') device = drive.get('device', 'unknown')
@@ -3639,33 +3651,33 @@ class SystemHealthMonitor:
# === CPU Metrics === # === CPU Metrics ===
cpu = health_report.get('cpu_health', {}) cpu = health_report.get('cpu_health', {})
metrics.append(f'# HELP hwmon_cpu_usage_percent CPU usage percentage') metrics.append('# HELP hwmon_cpu_usage_percent CPU usage percentage')
metrics.append(f'# TYPE hwmon_cpu_usage_percent gauge') metrics.append('# TYPE hwmon_cpu_usage_percent gauge')
if cpu.get('cpu_usage_percent') is not None: if cpu.get('cpu_usage_percent') is not None:
metrics.append(f'hwmon_cpu_usage_percent{labels(hostname=hostname)} {cpu["cpu_usage_percent"]}') metrics.append(f'hwmon_cpu_usage_percent{labels(hostname=hostname)} {cpu["cpu_usage_percent"]}')
# === Memory Metrics === # === Memory Metrics ===
mem = health_report.get('memory_health', {}) mem = health_report.get('memory_health', {})
metrics.append(f'# HELP hwmon_memory_usage_percent Memory usage percentage') metrics.append('# HELP hwmon_memory_usage_percent Memory usage percentage')
metrics.append(f'# TYPE hwmon_memory_usage_percent gauge') metrics.append('# TYPE hwmon_memory_usage_percent gauge')
if mem.get('memory_percent') is not None: if mem.get('memory_percent') is not None:
metrics.append(f'hwmon_memory_usage_percent{labels(hostname=hostname)} {mem["memory_percent"]}') metrics.append(f'hwmon_memory_usage_percent{labels(hostname=hostname)} {mem["memory_percent"]}')
metrics.append(f'# HELP hwmon_memory_has_ecc Whether ECC memory is present (1=yes, 0=no)') metrics.append('# HELP hwmon_memory_has_ecc Whether ECC memory is present (1=yes, 0=no)')
metrics.append(f'# TYPE hwmon_memory_has_ecc gauge') metrics.append('# TYPE hwmon_memory_has_ecc gauge')
has_ecc = 1 if mem.get('has_ecc') else 0 has_ecc = 1 if mem.get('has_ecc') else 0
metrics.append(f'hwmon_memory_has_ecc{labels(hostname=hostname)} {has_ecc}') metrics.append(f'hwmon_memory_has_ecc{labels(hostname=hostname)} {has_ecc}')
if mem.get('has_ecc'): if mem.get('has_ecc'):
metrics.append(f'# HELP hwmon_memory_ecc_errors_total Total ECC errors detected') metrics.append('# HELP hwmon_memory_ecc_errors_total Total ECC errors detected')
metrics.append(f'# TYPE hwmon_memory_ecc_errors_total gauge') metrics.append('# TYPE hwmon_memory_ecc_errors_total gauge')
ecc_errors = len(mem.get('ecc_errors', [])) ecc_errors = len(mem.get('ecc_errors', []))
metrics.append(f'hwmon_memory_ecc_errors_total{labels(hostname=hostname)} {ecc_errors}') metrics.append(f'hwmon_memory_ecc_errors_total{labels(hostname=hostname)} {ecc_errors}')
# === Network Metrics === # === Network Metrics ===
net = health_report.get('network_health', {}) net = health_report.get('network_health', {})
metrics.append(f'# HELP hwmon_network_status Network status (1=OK, 0=issue)') metrics.append('# HELP hwmon_network_status Network status (1=OK, 0=issue)')
metrics.append(f'# TYPE hwmon_network_status gauge') metrics.append('# TYPE hwmon_network_status gauge')
for net_type in ['management_network', 'ceph_network']: for net_type in ['management_network', 'ceph_network']:
net_info = net.get(net_type, {}) net_info = net.get(net_type, {})
@@ -3676,40 +3688,40 @@ class SystemHealthMonitor:
# === Ceph Metrics === # === Ceph Metrics ===
ceph = health_report.get('ceph_health', {}) ceph = health_report.get('ceph_health', {})
if ceph.get('is_ceph_node'): if ceph.get('is_ceph_node'):
metrics.append(f'# HELP hwmon_ceph_cluster_healthy Ceph cluster health (1=healthy, 0=warning/error)') metrics.append('# HELP hwmon_ceph_cluster_healthy Ceph cluster health (1=healthy, 0=warning/error)')
metrics.append(f'# TYPE hwmon_ceph_cluster_healthy gauge') metrics.append('# TYPE hwmon_ceph_cluster_healthy gauge')
ceph_healthy = 1 if ceph.get('cluster_health') == 'HEALTH_OK' else 0 ceph_healthy = 1 if ceph.get('cluster_health') == 'HEALTH_OK' else 0
metrics.append(f'hwmon_ceph_cluster_healthy{labels(hostname=hostname)} {ceph_healthy}') metrics.append(f'hwmon_ceph_cluster_healthy{labels(hostname=hostname)} {ceph_healthy}')
if ceph.get('cluster_usage'): if ceph.get('cluster_usage'):
usage = ceph['cluster_usage'] usage = ceph['cluster_usage']
metrics.append(f'# HELP hwmon_ceph_cluster_usage_percent Ceph cluster usage percentage') metrics.append('# HELP hwmon_ceph_cluster_usage_percent Ceph cluster usage percentage')
metrics.append(f'# TYPE hwmon_ceph_cluster_usage_percent gauge') metrics.append('# TYPE hwmon_ceph_cluster_usage_percent gauge')
metrics.append(f'hwmon_ceph_cluster_usage_percent{labels(hostname=hostname)} {usage.get("usage_percent", 0)}') metrics.append(f'hwmon_ceph_cluster_usage_percent{labels(hostname=hostname)} {usage.get("usage_percent", 0)}')
metrics.append(f'# HELP hwmon_ceph_cluster_bytes_total Ceph cluster total bytes') metrics.append('# HELP hwmon_ceph_cluster_bytes_total Ceph cluster total bytes')
metrics.append(f'# TYPE hwmon_ceph_cluster_bytes_total gauge') metrics.append('# TYPE hwmon_ceph_cluster_bytes_total gauge')
metrics.append(f'hwmon_ceph_cluster_bytes_total{labels(hostname=hostname)} {usage.get("total_bytes", 0)}') metrics.append(f'hwmon_ceph_cluster_bytes_total{labels(hostname=hostname)} {usage.get("total_bytes", 0)}')
metrics.append(f'# HELP hwmon_ceph_cluster_bytes_used Ceph cluster used bytes') metrics.append('# HELP hwmon_ceph_cluster_bytes_used Ceph cluster used bytes')
metrics.append(f'# TYPE hwmon_ceph_cluster_bytes_used gauge') metrics.append('# TYPE hwmon_ceph_cluster_bytes_used gauge')
metrics.append(f'hwmon_ceph_cluster_bytes_used{labels(hostname=hostname)} {usage.get("used_bytes", 0)}') metrics.append(f'hwmon_ceph_cluster_bytes_used{labels(hostname=hostname)} {usage.get("used_bytes", 0)}')
metrics.append(f'# HELP hwmon_ceph_osd_total Total number of OSDs') metrics.append('# HELP hwmon_ceph_osd_total Total number of OSDs')
metrics.append(f'# TYPE hwmon_ceph_osd_total gauge') metrics.append('# TYPE hwmon_ceph_osd_total gauge')
osd_count = len(ceph.get('osd_status', [])) osd_count = len(ceph.get('osd_status', []))
metrics.append(f'hwmon_ceph_osd_total{labels(hostname=hostname)} {osd_count}') metrics.append(f'hwmon_ceph_osd_total{labels(hostname=hostname)} {osd_count}')
metrics.append(f'# HELP hwmon_ceph_osd_down Number of down OSDs') metrics.append('# HELP hwmon_ceph_osd_down Number of down OSDs')
metrics.append(f'# TYPE hwmon_ceph_osd_down gauge') metrics.append('# TYPE hwmon_ceph_osd_down gauge')
down_osds = len([o for o in ceph.get('osd_status', []) if o.get('status') == 'down']) down_osds = len([o for o in ceph.get('osd_status', []) if o.get('status') == 'down'])
metrics.append(f'hwmon_ceph_osd_down{labels(hostname=hostname)} {down_osds}') metrics.append(f'hwmon_ceph_osd_down{labels(hostname=hostname)} {down_osds}')
# === LXC Metrics === # === LXC Metrics ===
lxc = health_report.get('lxc_health', {}) lxc = health_report.get('lxc_health', {})
if lxc.get('containers'): if lxc.get('containers'):
metrics.append(f'# HELP hwmon_lxc_storage_usage_percent LXC container storage usage percentage') metrics.append('# HELP hwmon_lxc_storage_usage_percent LXC container storage usage percentage')
metrics.append(f'# TYPE hwmon_lxc_storage_usage_percent gauge') metrics.append('# TYPE hwmon_lxc_storage_usage_percent gauge')
for container in lxc['containers']: for container in lxc['containers']:
vmid = container.get('vmid', 'unknown') vmid = container.get('vmid', 'unknown')
@@ -3721,18 +3733,18 @@ class SystemHealthMonitor:
# === PBS Metrics === # === PBS Metrics ===
pbs = health_report.get('pbs_health', {}) pbs = health_report.get('pbs_health', {})
if pbs.get('is_pbs_node'): if pbs.get('is_pbs_node'):
metrics.append(f'# HELP hwmon_pbs_zfs_usage_percent PBS ZFS pool usage percentage') metrics.append('# HELP hwmon_pbs_zfs_usage_percent PBS ZFS pool usage percentage')
metrics.append(f'# TYPE hwmon_pbs_zfs_usage_percent gauge') metrics.append('# TYPE hwmon_pbs_zfs_usage_percent gauge')
for pool in pbs.get('zfs_pools', []): for pool in pbs.get('zfs_pools', []):
metrics.append(f'hwmon_pbs_zfs_usage_percent{labels(hostname=hostname, pool=pool["name"])} {pool["usage_percent"]}') metrics.append(f'hwmon_pbs_zfs_usage_percent{labels(hostname=hostname, pool=pool["name"])} {pool["usage_percent"]}')
metrics.append(f'# HELP hwmon_pbs_failed_tasks_total PBS failed task count') metrics.append('# HELP hwmon_pbs_failed_tasks_total PBS failed task count')
metrics.append(f'# TYPE hwmon_pbs_failed_tasks_total gauge') metrics.append('# TYPE hwmon_pbs_failed_tasks_total gauge')
metrics.append(f'hwmon_pbs_failed_tasks_total{labels(hostname=hostname)} {len(pbs.get("failed_tasks", []))}') metrics.append(f'hwmon_pbs_failed_tasks_total{labels(hostname=hostname)} {len(pbs.get("failed_tasks", []))}')
# === Issue Summary Metrics === # === Issue Summary Metrics ===
metrics.append(f'# HELP hwmon_issues_total Total number of issues detected') metrics.append('# HELP hwmon_issues_total Total number of issues detected')
metrics.append(f'# TYPE hwmon_issues_total gauge') metrics.append('# TYPE hwmon_issues_total gauge')
system_issues = len(health_report.get('system_health', {}).get('issues', [])) system_issues = len(health_report.get('system_health', {}).get('issues', []))
ceph_issues = len(ceph.get('issues', [])) + len(ceph.get('cluster_wide_issues', [])) ceph_issues = len(ceph.get('issues', [])) + len(ceph.get('cluster_wide_issues', []))
@@ -3881,50 +3893,50 @@ class SystemHealthMonitor:
pool, device_col, total_str, used_str, avail_str, percent_str, mountpoint = match.groups() pool, device_col, total_str, used_str, avail_str, percent_str, mountpoint = match.groups()
try: try:
# Skip excluded mounts # Skip excluded mounts
if pool.startswith('appPool:') or '/mnt/pve/mediaf' in device_col: if pool.startswith('appPool:') or '/mnt/pve/mediaf' in device_col:
continue continue
mountpoint = mountpoint.strip() mountpoint = mountpoint.strip()
# Skip excluded mountpoints # Skip excluded mountpoints
if self._is_excluded_mount(mountpoint): if self._is_excluded_mount(mountpoint):
logger.debug(f"Skipping excluded mount: {mountpoint}") logger.debug(f"Skipping excluded mount: {mountpoint}")
continue continue
# Parse size values from named regex groups # Parse size values from named regex groups
total_space = self._parse_size(total_str) total_space = self._parse_size(total_str)
used_space = self._parse_size(used_str) used_space = self._parse_size(used_str)
available_space = self._parse_size(avail_str) available_space = self._parse_size(avail_str)
# Parse percentage from regex group # Parse percentage from regex group
try: try:
usage_percent = float(percent_str) usage_percent = float(percent_str)
except ValueError: except ValueError:
# Calculate percentage if parsing fails # Calculate percentage if parsing fails
usage_percent = (used_space / total_space * 100) if total_space > 0 else 0 usage_percent = (used_space / total_space * 100) if total_space > 0 else 0
filesystem = { filesystem = {
'mountpoint': mountpoint, 'mountpoint': mountpoint,
'total_space': total_space, 'total_space': total_space,
'used_space': used_space, 'used_space': used_space,
'available': available_space, 'available': available_space,
'usage_percent': usage_percent 'usage_percent': usage_percent
} }
container_info['filesystems'].append(filesystem) container_info['filesystems'].append(filesystem)
# Check thresholds # Check thresholds
if usage_percent >= self.CONFIG['THRESHOLDS']['LXC_CRITICAL']: if usage_percent >= self.CONFIG['THRESHOLDS']['LXC_CRITICAL']:
lxc_health['status'] = 'CRITICAL' lxc_health['status'] = 'CRITICAL'
issue = f"LXC {vmid} critical storage usage: {usage_percent:.1f}% on {mountpoint}" issue = f"LXC {vmid} critical storage usage: {usage_percent:.1f}% on {mountpoint}"
lxc_health['issues'].append(issue) lxc_health['issues'].append(issue)
elif usage_percent >= self.CONFIG['THRESHOLDS']['LXC_WARNING']: elif usage_percent >= self.CONFIG['THRESHOLDS']['LXC_WARNING']:
if lxc_health['status'] != 'CRITICAL': if lxc_health['status'] != 'CRITICAL':
lxc_health['status'] = 'WARNING' lxc_health['status'] = 'WARNING'
issue = f"LXC {vmid} high storage usage: {usage_percent:.1f}% on {mountpoint}" issue = f"LXC {vmid} high storage usage: {usage_percent:.1f}% on {mountpoint}"
lxc_health['issues'].append(issue) lxc_health['issues'].append(issue)
logger.debug(f"Filesystem details: {filesystem}") logger.debug(f"Filesystem details: {filesystem}")
except Exception as e: except Exception as e:
logger.debug(f"Error processing line: {str(e)}") logger.debug(f"Error processing line: {str(e)}")
logger.debug(f"Full exception: {repr(e)}") logger.debug(f"Full exception: {repr(e)}")
@@ -3949,6 +3961,7 @@ class SystemHealthMonitor:
return lxc_health return lxc_health
def main(): def main():
parser = argparse.ArgumentParser(description="System Health Monitor") parser = argparse.ArgumentParser(description="System Health Monitor")
parser.add_argument( parser.add_argument(
@@ -4003,5 +4016,6 @@ def main():
else: else:
monitor.run() monitor.run()
if __name__ == "__main__": if __name__ == "__main__":
main() main()