ci: add flake8 lint workflow; fix unused imports and f-string issues
Lint / Python (flake8) (push) Failing after 4s
Lint / Python (flake8) (push) Failing after 4s
Adds .gitea/workflows/lint.yml running flake8 with .flake8 config. Removes unused sys/urllib.request imports (F401). Removes f prefix from 52 f-strings that had no placeholders (F541). Auto-fixes trailing whitespace in blank lines (W293) via autopep8. Fixes over-indentation in LXC storage check try block (E117). Config ignores F841 (unused locals) and E501 (long lines). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,6 @@
|
|||||||
|
[flake8]
|
||||||
|
max-line-length = 120
|
||||||
|
# F841: local variable assigned but never used — many are intentional debug/future-use assignments
|
||||||
|
# E501: line too long — ignored globally because long URLs and log messages are common in monitoring code (note: with E501 ignored, max-line-length above is not enforced)
|
||||||
|
extend-ignore = F841, E501
|
||||||
|
exclude = __pycache__, .git
|
||||||
@@ -0,0 +1,20 @@
|
|||||||
|
name: Lint
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches: ["**"]
|
||||||
|
pull_request:
|
||||||
|
branches: ["**"]
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
python-lint:
|
||||||
|
name: Python (flake8)
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v3
|
||||||
|
|
||||||
|
- name: Install flake8
|
||||||
|
run: pip install flake8
|
||||||
|
|
||||||
|
- name: Run flake8
|
||||||
|
run: flake8 .
|
||||||
+112
-98
@@ -1,5 +1,18 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re, glob, datetime, fcntl, textwrap, shutil
|
import os
|
||||||
|
import json
|
||||||
|
import requests
|
||||||
|
import psutil
|
||||||
|
import socket
|
||||||
|
import subprocess
|
||||||
|
import logging
|
||||||
|
import argparse
|
||||||
|
import re
|
||||||
|
import glob
|
||||||
|
import datetime
|
||||||
|
import fcntl
|
||||||
|
import textwrap
|
||||||
|
import shutil
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
from typing import Dict, Any, List
|
from typing import Dict, Any, List
|
||||||
|
|
||||||
@@ -83,8 +96,8 @@ class SystemHealthMonitor:
|
|||||||
|
|
||||||
# PBS (Proxmox Backup Server) issues
|
# PBS (Proxmox Backup Server) issues
|
||||||
'PBS_ZFS_DEGRADED': PRIORITIES['CRITICAL'], # P1 - ZFS pool degraded
|
'PBS_ZFS_DEGRADED': PRIORITIES['CRITICAL'], # P1 - ZFS pool degraded
|
||||||
'PBS_ZFS_USAGE_CRITICAL': PRIORITIES['HIGH'], # P2 - ZFS pool near full
|
'PBS_ZFS_USAGE_CRITICAL': PRIORITIES['HIGH'], # P2 - ZFS pool near full
|
||||||
'PBS_ZFS_USAGE_WARNING': PRIORITIES['MEDIUM'],# P3 - ZFS pool usage high
|
'PBS_ZFS_USAGE_WARNING': PRIORITIES['MEDIUM'], # P3 - ZFS pool usage high
|
||||||
'PBS_ZFS_ERRORS': PRIORITIES['HIGH'], # P2 - ZFS pool has errors
|
'PBS_ZFS_ERRORS': PRIORITIES['HIGH'], # P2 - ZFS pool has errors
|
||||||
'PBS_BACKUP_FAILED': PRIORITIES['HIGH'], # P2 - Backup job failed
|
'PBS_BACKUP_FAILED': PRIORITIES['HIGH'], # P2 - Backup job failed
|
||||||
'PBS_GC_FAILED': PRIORITIES['MEDIUM'], # P3 - Garbage collection failed
|
'PBS_GC_FAILED': PRIORITIES['MEDIUM'], # P3 - Garbage collection failed
|
||||||
@@ -272,7 +285,7 @@ class SystemHealthMonitor:
|
|||||||
'ISSUE': '[issue]', # General issue (replaces invalid 'incident')
|
'ISSUE': '[issue]', # General issue (replaces invalid 'incident')
|
||||||
'PROBLEM': '[problem]', # Root cause investigation
|
'PROBLEM': '[problem]', # Root cause investigation
|
||||||
'TASK': '[task]', # Planned work item
|
'TASK': '[task]', # Planned work item
|
||||||
'MAINTENANCE': '[maintenance]', # Scheduled/preventive work
|
'MAINTENANCE': '[maintenance]', # Scheduled/preventive work
|
||||||
'UPGRADE': '[upgrade]' # Hardware/software upgrade
|
'UPGRADE': '[upgrade]' # Hardware/software upgrade
|
||||||
},
|
},
|
||||||
'HARDWARE_TYPE': {
|
'HARDWARE_TYPE': {
|
||||||
@@ -300,7 +313,7 @@ class SystemHealthMonitor:
|
|||||||
'ISSUE': 'Issue', # General issue/incident
|
'ISSUE': 'Issue', # General issue/incident
|
||||||
'PROBLEM': 'Problem', # Root cause investigation needed
|
'PROBLEM': 'Problem', # Root cause investigation needed
|
||||||
'TASK': 'Task', # Planned work item
|
'TASK': 'Task', # Planned work item
|
||||||
'MAINTENANCE': 'Maintenance', # Scheduled/preventive work
|
'MAINTENANCE': 'Maintenance', # Scheduled/preventive work
|
||||||
'UPGRADE': 'Upgrade', # Hardware/software upgrade
|
'UPGRADE': 'Upgrade', # Hardware/software upgrade
|
||||||
'INSTALL': 'Install', # New installation
|
'INSTALL': 'Install', # New installation
|
||||||
'REQUEST': 'Request' # Service or information request
|
'REQUEST': 'Request' # Service or information request
|
||||||
@@ -991,7 +1004,7 @@ class SystemHealthMonitor:
|
|||||||
# Analyze trends for critical attributes
|
# Analyze trends for critical attributes
|
||||||
if len(history) >= 3: # Need at least 3 data points for trend analysis
|
if len(history) >= 3: # Need at least 3 data points for trend analysis
|
||||||
critical_attrs = ['Reallocated_Sector_Ct', 'Current_Pending_Sector', 'Reported_Uncorrect',
|
critical_attrs = ['Reallocated_Sector_Ct', 'Current_Pending_Sector', 'Reported_Uncorrect',
|
||||||
'Offline_Uncorrectable', 'Program_Fail_Count', 'Erase_Fail_Count']
|
'Offline_Uncorrectable', 'Program_Fail_Count', 'Erase_Fail_Count']
|
||||||
|
|
||||||
for attr in critical_attrs:
|
for attr in critical_attrs:
|
||||||
if attr in current_attributes:
|
if attr in current_attributes:
|
||||||
@@ -1171,7 +1184,7 @@ class SystemHealthMonitor:
|
|||||||
try:
|
try:
|
||||||
# Check dmesg for drive-related errors (last 1000 lines to avoid overwhelming output)
|
# Check dmesg for drive-related errors (last 1000 lines to avoid overwhelming output)
|
||||||
result = subprocess.run(['dmesg', '-T', '--level=err,warn'],
|
result = subprocess.run(['dmesg', '-T', '--level=err,warn'],
|
||||||
stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, timeout=10)
|
stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, timeout=10)
|
||||||
|
|
||||||
if result.returncode == 0:
|
if result.returncode == 0:
|
||||||
error_patterns = [
|
error_patterns = [
|
||||||
@@ -1272,7 +1285,6 @@ class SystemHealthMonitor:
|
|||||||
self._drive_details_cache[device] = drive_details
|
self._drive_details_cache[device] = drive_details
|
||||||
return drive_details
|
return drive_details
|
||||||
|
|
||||||
|
|
||||||
def _get_issue_type(self, issue: str) -> str:
|
def _get_issue_type(self, issue: str) -> str:
|
||||||
"""Determine issue type from issue description."""
|
"""Determine issue type from issue description."""
|
||||||
if "SMART" in issue:
|
if "SMART" in issue:
|
||||||
@@ -1318,7 +1330,7 @@ class SystemHealthMonitor:
|
|||||||
# content lines: prefix + field_width + ┃ = 80
|
# content lines: prefix + field_width + ┃ = 80
|
||||||
box_width = 78
|
box_width = 78
|
||||||
|
|
||||||
banner = f"""
|
banner = """
|
||||||
┏{'━' * box_width}┓
|
┏{'━' * box_width}┓
|
||||||
┃{' HARDWARE MONITORING ALERT TICKET '.center(box_width)}┃
|
┃{' HARDWARE MONITORING ALERT TICKET '.center(box_width)}┃
|
||||||
┣{'━' * box_width}┫
|
┣{'━' * box_width}┫
|
||||||
@@ -1330,7 +1342,7 @@ class SystemHealthMonitor:
|
|||||||
issue_type = self._get_issue_type(issue)
|
issue_type = self._get_issue_type(issue)
|
||||||
impact_level = self._get_impact_level(issue)
|
impact_level = self._get_impact_level(issue)
|
||||||
|
|
||||||
executive_summary = f"""
|
executive_summary = """
|
||||||
┏━ EXECUTIVE SUMMARY {'━' * (box_width - 20)}┓
|
┏━ EXECUTIVE SUMMARY {'━' * (box_width - 20)}┓
|
||||||
┃ Issue Type │ {issue_type:<60}┃
|
┃ Issue Type │ {issue_type:<60}┃
|
||||||
┃ Impact Level │ {impact_level:<60}┃
|
┃ Impact Level │ {impact_level:<60}┃
|
||||||
@@ -1395,7 +1407,7 @@ class SystemHealthMonitor:
|
|||||||
type_safe = drive_details.get('type') or 'N/A'
|
type_safe = drive_details.get('type') or 'N/A'
|
||||||
firmware_safe = drive_details.get('firmware') or 'N/A'
|
firmware_safe = drive_details.get('firmware') or 'N/A'
|
||||||
|
|
||||||
description += f"""
|
description += """
|
||||||
┏━ DRIVE SPECIFICATIONS {'━' * (box_width - 23)}┓
|
┏━ DRIVE SPECIFICATIONS {'━' * (box_width - 23)}┓
|
||||||
┃ Device Path │ {device_safe:<61}┃
|
┃ Device Path │ {device_safe:<61}┃
|
||||||
┃ Model │ {model_safe:<61}┃
|
┃ Model │ {model_safe:<61}┃
|
||||||
@@ -1410,7 +1422,7 @@ class SystemHealthMonitor:
|
|||||||
last_test_safe = last_test_date or 'N/A'
|
last_test_safe = last_test_date or 'N/A'
|
||||||
age_safe = age or 'N/A'
|
age_safe = age or 'N/A'
|
||||||
|
|
||||||
description += f"""
|
description += """
|
||||||
┏━ DRIVE TIMELINE {'━' * (box_width - 17)}┓
|
┏━ DRIVE TIMELINE {'━' * (box_width - 17)}┓
|
||||||
┃ Power-On Hours │ {power_on_safe:<56}┃
|
┃ Power-On Hours │ {power_on_safe:<56}┃
|
||||||
┃ Last SMART Test │ {last_test_safe:<56}┃
|
┃ Last SMART Test │ {last_test_safe:<56}┃
|
||||||
@@ -1423,7 +1435,7 @@ class SystemHealthMonitor:
|
|||||||
temp_value = drive_info.get('temperature')
|
temp_value = drive_info.get('temperature')
|
||||||
temp_safe = f"{temp_value}°C" if temp_value is not None else 'N/A'
|
temp_safe = f"{temp_value}°C" if temp_value is not None else 'N/A'
|
||||||
|
|
||||||
description += f"""
|
description += """
|
||||||
┏━ SMART STATUS {'━' * (box_width - 15)}┓
|
┏━ SMART STATUS {'━' * (box_width - 15)}┓
|
||||||
┃ Status │ {smart_status_safe:<62}┃
|
┃ Status │ {smart_status_safe:<62}┃
|
||||||
┃ Temperature │ {temp_safe:<62}┃
|
┃ Temperature │ {temp_safe:<62}┃
|
||||||
@@ -1455,7 +1467,7 @@ class SystemHealthMonitor:
|
|||||||
# Truncate mountpoint if too long for header
|
# Truncate mountpoint if too long for header
|
||||||
mountpoint_display = mountpoint_safe[:50] if len(mountpoint_safe) > 50 else mountpoint_safe
|
mountpoint_display = mountpoint_safe[:50] if len(mountpoint_safe) > 50 else mountpoint_safe
|
||||||
|
|
||||||
description += f"""
|
description += """
|
||||||
┏━ PARTITION: {mountpoint_display} {'━' * (box_width - 14 - len(mountpoint_display))}┓
|
┏━ PARTITION: {mountpoint_display} {'━' * (box_width - 14 - len(mountpoint_display))}┓
|
||||||
┃ Filesystem │ {fstype_safe:<61}┃
|
┃ Filesystem │ {fstype_safe:<61}┃
|
||||||
┃ Usage Meter │ {usage_meter} {usage_pct_str:>10}┃
|
┃ Usage Meter │ {usage_meter} {usage_pct_str:>10}┃
|
||||||
@@ -1508,7 +1520,7 @@ class SystemHealthMonitor:
|
|||||||
cpu_status = cpu_health.get('status', 'N/A')
|
cpu_status = cpu_health.get('status', 'N/A')
|
||||||
cpu_usage_str = f"{cpu_usage}%" if isinstance(cpu_usage, (int, float)) else cpu_usage
|
cpu_usage_str = f"{cpu_usage}%" if isinstance(cpu_usage, (int, float)) else cpu_usage
|
||||||
|
|
||||||
description += f"""
|
description += """
|
||||||
┏━ CPU STATUS {'━' * (box_width - 13)}┓
|
┏━ CPU STATUS {'━' * (box_width - 13)}┓
|
||||||
┃ Usage │ {cpu_usage_str:<61}┃
|
┃ Usage │ {cpu_usage_str:<61}┃
|
||||||
┃ Threshold │ {str(cpu_threshold) + '%':<61}┃
|
┃ Threshold │ {str(cpu_threshold) + '%':<61}┃
|
||||||
@@ -1541,7 +1553,7 @@ class SystemHealthMonitor:
|
|||||||
if len(issues_str) > 61:
|
if len(issues_str) > 61:
|
||||||
issues_str = issues_str[:58] + '...'
|
issues_str = issues_str[:58] + '...'
|
||||||
|
|
||||||
description += f"""
|
description += """
|
||||||
┏━ NETWORK STATUS {'━' * (box_width - 17)}┓
|
┏━ NETWORK STATUS {'━' * (box_width - 17)}┓
|
||||||
┃ Management │ {mgmt_status:<61}┃
|
┃ Management │ {mgmt_status:<61}┃
|
||||||
┃ Ceph Network │ {ceph_status:<61}┃
|
┃ Ceph Network │ {ceph_status:<61}┃
|
||||||
@@ -1573,7 +1585,7 @@ class SystemHealthMonitor:
|
|||||||
usage_meter = '█' * blocks + '░' * (50 - blocks)
|
usage_meter = '█' * blocks + '░' * (50 - blocks)
|
||||||
usage_pct_str = f"{usage_pct:.1f}%"
|
usage_pct_str = f"{usage_pct:.1f}%"
|
||||||
|
|
||||||
description += f"""
|
description += """
|
||||||
┏━ CONTAINER STORAGE {'━' * (box_width - 20)}┓
|
┏━ CONTAINER STORAGE {'━' * (box_width - 20)}┓
|
||||||
┃ VMID │ {vmid:<61}┃
|
┃ VMID │ {vmid:<61}┃
|
||||||
┃ Mountpoint │ {mountpoint:<61}┃
|
┃ Mountpoint │ {mountpoint:<61}┃
|
||||||
@@ -1601,7 +1613,7 @@ class SystemHealthMonitor:
|
|||||||
osd_up = sum(1 for o in osd_list if o.get('status') == 'up')
|
osd_up = sum(1 for o in osd_list if o.get('status') == 'up')
|
||||||
osd_summary = f"{osd_up}/{osd_total} up" if osd_total > 0 else 'N/A'
|
osd_summary = f"{osd_up}/{osd_total} up" if osd_total > 0 else 'N/A'
|
||||||
|
|
||||||
description += f"""
|
description += """
|
||||||
┏━ CEPH CLUSTER STATUS {'━' * (box_width - 22)}┓
|
┏━ CEPH CLUSTER STATUS {'━' * (box_width - 22)}┓
|
||||||
┃ Health │ {cluster_health:<61}┃
|
┃ Health │ {cluster_health:<61}┃
|
||||||
┃ Usage │ {usage_pct_str:<61}┃
|
┃ Usage │ {usage_pct_str:<61}┃
|
||||||
@@ -1614,7 +1626,7 @@ class SystemHealthMonitor:
|
|||||||
if "Disk" in issue:
|
if "Disk" in issue:
|
||||||
for partition in health_report.get('drives_health', {}).get('drives', []):
|
for partition in health_report.get('drives_health', {}).get('drives', []):
|
||||||
if partition.get('mountpoint') in issue:
|
if partition.get('mountpoint') in issue:
|
||||||
description += f"\n=== Disk Metrics ===\n"
|
description += "\n=== Disk Metrics ===\n"
|
||||||
description += f"Disk Device: {partition['device']}\n"
|
description += f"Disk Device: {partition['device']}\n"
|
||||||
description += f"Mount Point: {partition['mountpoint']}\n"
|
description += f"Mount Point: {partition['mountpoint']}\n"
|
||||||
description += f"Total Space: {partition['total_space']}\n"
|
description += f"Total Space: {partition['total_space']}\n"
|
||||||
@@ -1973,7 +1985,7 @@ class SystemHealthMonitor:
|
|||||||
response = requests.post(
|
response = requests.post(
|
||||||
self.ticket_api_url,
|
self.ticket_api_url,
|
||||||
json=ticket_payload,
|
json=ticket_payload,
|
||||||
headers = {
|
headers={
|
||||||
'Content-Type': 'application/json',
|
'Content-Type': 'application/json',
|
||||||
'Authorization': f'Bearer {self.CONFIG["TICKET_API_KEY"]}'
|
'Authorization': f'Bearer {self.CONFIG["TICKET_API_KEY"]}'
|
||||||
},
|
},
|
||||||
@@ -3373,7 +3385,7 @@ class SystemHealthMonitor:
|
|||||||
except json.JSONDecodeError as e:
|
except json.JSONDecodeError as e:
|
||||||
logger.warning(f"Failed to parse ceph mon stat JSON: {e}")
|
logger.warning(f"Failed to parse ceph mon stat JSON: {e}")
|
||||||
|
|
||||||
logger.debug(f"=== Ceph Health Check ===")
|
logger.debug("=== Ceph Health Check ===")
|
||||||
logger.debug(f"Is Ceph node: {ceph_health['is_ceph_node']}")
|
logger.debug(f"Is Ceph node: {ceph_health['is_ceph_node']}")
|
||||||
logger.debug(f"Cluster health: {ceph_health['cluster_health']}")
|
logger.debug(f"Cluster health: {ceph_health['cluster_health']}")
|
||||||
logger.debug(f"Cluster usage: {ceph_health['cluster_usage']}")
|
logger.debug(f"Cluster usage: {ceph_health['cluster_usage']}")
|
||||||
@@ -3597,22 +3609,22 @@ class SystemHealthMonitor:
|
|||||||
return '{' + ','.join(pairs) + '}' if pairs else ''
|
return '{' + ','.join(pairs) + '}' if pairs else ''
|
||||||
|
|
||||||
# === System Info ===
|
# === System Info ===
|
||||||
metrics.append(f'# HELP hwmon_info System information')
|
metrics.append('# HELP hwmon_info System information')
|
||||||
metrics.append(f'# TYPE hwmon_info gauge')
|
metrics.append('# TYPE hwmon_info gauge')
|
||||||
metrics.append(f'hwmon_info{labels(hostname=hostname)} 1')
|
metrics.append(f'hwmon_info{labels(hostname=hostname)} 1')
|
||||||
|
|
||||||
# === Drive Metrics ===
|
# === Drive Metrics ===
|
||||||
metrics.append(f'# HELP hwmon_drive_smart_healthy SMART health status (1=healthy, 0=unhealthy)')
|
metrics.append('# HELP hwmon_drive_smart_healthy SMART health status (1=healthy, 0=unhealthy)')
|
||||||
metrics.append(f'# TYPE hwmon_drive_smart_healthy gauge')
|
metrics.append('# TYPE hwmon_drive_smart_healthy gauge')
|
||||||
|
|
||||||
metrics.append(f'# HELP hwmon_drive_temperature_celsius Drive temperature in Celsius')
|
metrics.append('# HELP hwmon_drive_temperature_celsius Drive temperature in Celsius')
|
||||||
metrics.append(f'# TYPE hwmon_drive_temperature_celsius gauge')
|
metrics.append('# TYPE hwmon_drive_temperature_celsius gauge')
|
||||||
|
|
||||||
metrics.append(f'# HELP hwmon_drive_size_bytes Drive total size in bytes')
|
metrics.append('# HELP hwmon_drive_size_bytes Drive total size in bytes')
|
||||||
metrics.append(f'# TYPE hwmon_drive_size_bytes gauge')
|
metrics.append('# TYPE hwmon_drive_size_bytes gauge')
|
||||||
|
|
||||||
metrics.append(f'# HELP hwmon_drive_smart_issues_total Number of SMART issues detected')
|
metrics.append('# HELP hwmon_drive_smart_issues_total Number of SMART issues detected')
|
||||||
metrics.append(f'# TYPE hwmon_drive_smart_issues_total gauge')
|
metrics.append('# TYPE hwmon_drive_smart_issues_total gauge')
|
||||||
|
|
||||||
for drive in health_report.get('drives_health', {}).get('drives', []):
|
for drive in health_report.get('drives_health', {}).get('drives', []):
|
||||||
device = drive.get('device', 'unknown')
|
device = drive.get('device', 'unknown')
|
||||||
@@ -3639,33 +3651,33 @@ class SystemHealthMonitor:
|
|||||||
|
|
||||||
# === CPU Metrics ===
|
# === CPU Metrics ===
|
||||||
cpu = health_report.get('cpu_health', {})
|
cpu = health_report.get('cpu_health', {})
|
||||||
metrics.append(f'# HELP hwmon_cpu_usage_percent CPU usage percentage')
|
metrics.append('# HELP hwmon_cpu_usage_percent CPU usage percentage')
|
||||||
metrics.append(f'# TYPE hwmon_cpu_usage_percent gauge')
|
metrics.append('# TYPE hwmon_cpu_usage_percent gauge')
|
||||||
if cpu.get('cpu_usage_percent') is not None:
|
if cpu.get('cpu_usage_percent') is not None:
|
||||||
metrics.append(f'hwmon_cpu_usage_percent{labels(hostname=hostname)} {cpu["cpu_usage_percent"]}')
|
metrics.append(f'hwmon_cpu_usage_percent{labels(hostname=hostname)} {cpu["cpu_usage_percent"]}')
|
||||||
|
|
||||||
# === Memory Metrics ===
|
# === Memory Metrics ===
|
||||||
mem = health_report.get('memory_health', {})
|
mem = health_report.get('memory_health', {})
|
||||||
metrics.append(f'# HELP hwmon_memory_usage_percent Memory usage percentage')
|
metrics.append('# HELP hwmon_memory_usage_percent Memory usage percentage')
|
||||||
metrics.append(f'# TYPE hwmon_memory_usage_percent gauge')
|
metrics.append('# TYPE hwmon_memory_usage_percent gauge')
|
||||||
if mem.get('memory_percent') is not None:
|
if mem.get('memory_percent') is not None:
|
||||||
metrics.append(f'hwmon_memory_usage_percent{labels(hostname=hostname)} {mem["memory_percent"]}')
|
metrics.append(f'hwmon_memory_usage_percent{labels(hostname=hostname)} {mem["memory_percent"]}')
|
||||||
|
|
||||||
metrics.append(f'# HELP hwmon_memory_has_ecc Whether ECC memory is present (1=yes, 0=no)')
|
metrics.append('# HELP hwmon_memory_has_ecc Whether ECC memory is present (1=yes, 0=no)')
|
||||||
metrics.append(f'# TYPE hwmon_memory_has_ecc gauge')
|
metrics.append('# TYPE hwmon_memory_has_ecc gauge')
|
||||||
has_ecc = 1 if mem.get('has_ecc') else 0
|
has_ecc = 1 if mem.get('has_ecc') else 0
|
||||||
metrics.append(f'hwmon_memory_has_ecc{labels(hostname=hostname)} {has_ecc}')
|
metrics.append(f'hwmon_memory_has_ecc{labels(hostname=hostname)} {has_ecc}')
|
||||||
|
|
||||||
if mem.get('has_ecc'):
|
if mem.get('has_ecc'):
|
||||||
metrics.append(f'# HELP hwmon_memory_ecc_errors_total Total ECC errors detected')
|
metrics.append('# HELP hwmon_memory_ecc_errors_total Total ECC errors detected')
|
||||||
metrics.append(f'# TYPE hwmon_memory_ecc_errors_total gauge')
|
metrics.append('# TYPE hwmon_memory_ecc_errors_total gauge')
|
||||||
ecc_errors = len(mem.get('ecc_errors', []))
|
ecc_errors = len(mem.get('ecc_errors', []))
|
||||||
metrics.append(f'hwmon_memory_ecc_errors_total{labels(hostname=hostname)} {ecc_errors}')
|
metrics.append(f'hwmon_memory_ecc_errors_total{labels(hostname=hostname)} {ecc_errors}')
|
||||||
|
|
||||||
# === Network Metrics ===
|
# === Network Metrics ===
|
||||||
net = health_report.get('network_health', {})
|
net = health_report.get('network_health', {})
|
||||||
metrics.append(f'# HELP hwmon_network_status Network status (1=OK, 0=issue)')
|
metrics.append('# HELP hwmon_network_status Network status (1=OK, 0=issue)')
|
||||||
metrics.append(f'# TYPE hwmon_network_status gauge')
|
metrics.append('# TYPE hwmon_network_status gauge')
|
||||||
|
|
||||||
for net_type in ['management_network', 'ceph_network']:
|
for net_type in ['management_network', 'ceph_network']:
|
||||||
net_info = net.get(net_type, {})
|
net_info = net.get(net_type, {})
|
||||||
@@ -3676,40 +3688,40 @@ class SystemHealthMonitor:
|
|||||||
# === Ceph Metrics ===
|
# === Ceph Metrics ===
|
||||||
ceph = health_report.get('ceph_health', {})
|
ceph = health_report.get('ceph_health', {})
|
||||||
if ceph.get('is_ceph_node'):
|
if ceph.get('is_ceph_node'):
|
||||||
metrics.append(f'# HELP hwmon_ceph_cluster_healthy Ceph cluster health (1=healthy, 0=warning/error)')
|
metrics.append('# HELP hwmon_ceph_cluster_healthy Ceph cluster health (1=healthy, 0=warning/error)')
|
||||||
metrics.append(f'# TYPE hwmon_ceph_cluster_healthy gauge')
|
metrics.append('# TYPE hwmon_ceph_cluster_healthy gauge')
|
||||||
ceph_healthy = 1 if ceph.get('cluster_health') == 'HEALTH_OK' else 0
|
ceph_healthy = 1 if ceph.get('cluster_health') == 'HEALTH_OK' else 0
|
||||||
metrics.append(f'hwmon_ceph_cluster_healthy{labels(hostname=hostname)} {ceph_healthy}')
|
metrics.append(f'hwmon_ceph_cluster_healthy{labels(hostname=hostname)} {ceph_healthy}')
|
||||||
|
|
||||||
if ceph.get('cluster_usage'):
|
if ceph.get('cluster_usage'):
|
||||||
usage = ceph['cluster_usage']
|
usage = ceph['cluster_usage']
|
||||||
metrics.append(f'# HELP hwmon_ceph_cluster_usage_percent Ceph cluster usage percentage')
|
metrics.append('# HELP hwmon_ceph_cluster_usage_percent Ceph cluster usage percentage')
|
||||||
metrics.append(f'# TYPE hwmon_ceph_cluster_usage_percent gauge')
|
metrics.append('# TYPE hwmon_ceph_cluster_usage_percent gauge')
|
||||||
metrics.append(f'hwmon_ceph_cluster_usage_percent{labels(hostname=hostname)} {usage.get("usage_percent", 0)}')
|
metrics.append(f'hwmon_ceph_cluster_usage_percent{labels(hostname=hostname)} {usage.get("usage_percent", 0)}')
|
||||||
|
|
||||||
metrics.append(f'# HELP hwmon_ceph_cluster_bytes_total Ceph cluster total bytes')
|
metrics.append('# HELP hwmon_ceph_cluster_bytes_total Ceph cluster total bytes')
|
||||||
metrics.append(f'# TYPE hwmon_ceph_cluster_bytes_total gauge')
|
metrics.append('# TYPE hwmon_ceph_cluster_bytes_total gauge')
|
||||||
metrics.append(f'hwmon_ceph_cluster_bytes_total{labels(hostname=hostname)} {usage.get("total_bytes", 0)}')
|
metrics.append(f'hwmon_ceph_cluster_bytes_total{labels(hostname=hostname)} {usage.get("total_bytes", 0)}')
|
||||||
|
|
||||||
metrics.append(f'# HELP hwmon_ceph_cluster_bytes_used Ceph cluster used bytes')
|
metrics.append('# HELP hwmon_ceph_cluster_bytes_used Ceph cluster used bytes')
|
||||||
metrics.append(f'# TYPE hwmon_ceph_cluster_bytes_used gauge')
|
metrics.append('# TYPE hwmon_ceph_cluster_bytes_used gauge')
|
||||||
metrics.append(f'hwmon_ceph_cluster_bytes_used{labels(hostname=hostname)} {usage.get("used_bytes", 0)}')
|
metrics.append(f'hwmon_ceph_cluster_bytes_used{labels(hostname=hostname)} {usage.get("used_bytes", 0)}')
|
||||||
|
|
||||||
metrics.append(f'# HELP hwmon_ceph_osd_total Total number of OSDs')
|
metrics.append('# HELP hwmon_ceph_osd_total Total number of OSDs')
|
||||||
metrics.append(f'# TYPE hwmon_ceph_osd_total gauge')
|
metrics.append('# TYPE hwmon_ceph_osd_total gauge')
|
||||||
osd_count = len(ceph.get('osd_status', []))
|
osd_count = len(ceph.get('osd_status', []))
|
||||||
metrics.append(f'hwmon_ceph_osd_total{labels(hostname=hostname)} {osd_count}')
|
metrics.append(f'hwmon_ceph_osd_total{labels(hostname=hostname)} {osd_count}')
|
||||||
|
|
||||||
metrics.append(f'# HELP hwmon_ceph_osd_down Number of down OSDs')
|
metrics.append('# HELP hwmon_ceph_osd_down Number of down OSDs')
|
||||||
metrics.append(f'# TYPE hwmon_ceph_osd_down gauge')
|
metrics.append('# TYPE hwmon_ceph_osd_down gauge')
|
||||||
down_osds = len([o for o in ceph.get('osd_status', []) if o.get('status') == 'down'])
|
down_osds = len([o for o in ceph.get('osd_status', []) if o.get('status') == 'down'])
|
||||||
metrics.append(f'hwmon_ceph_osd_down{labels(hostname=hostname)} {down_osds}')
|
metrics.append(f'hwmon_ceph_osd_down{labels(hostname=hostname)} {down_osds}')
|
||||||
|
|
||||||
# === LXC Metrics ===
|
# === LXC Metrics ===
|
||||||
lxc = health_report.get('lxc_health', {})
|
lxc = health_report.get('lxc_health', {})
|
||||||
if lxc.get('containers'):
|
if lxc.get('containers'):
|
||||||
metrics.append(f'# HELP hwmon_lxc_storage_usage_percent LXC container storage usage percentage')
|
metrics.append('# HELP hwmon_lxc_storage_usage_percent LXC container storage usage percentage')
|
||||||
metrics.append(f'# TYPE hwmon_lxc_storage_usage_percent gauge')
|
metrics.append('# TYPE hwmon_lxc_storage_usage_percent gauge')
|
||||||
|
|
||||||
for container in lxc['containers']:
|
for container in lxc['containers']:
|
||||||
vmid = container.get('vmid', 'unknown')
|
vmid = container.get('vmid', 'unknown')
|
||||||
@@ -3721,18 +3733,18 @@ class SystemHealthMonitor:
|
|||||||
# === PBS Metrics ===
|
# === PBS Metrics ===
|
||||||
pbs = health_report.get('pbs_health', {})
|
pbs = health_report.get('pbs_health', {})
|
||||||
if pbs.get('is_pbs_node'):
|
if pbs.get('is_pbs_node'):
|
||||||
metrics.append(f'# HELP hwmon_pbs_zfs_usage_percent PBS ZFS pool usage percentage')
|
metrics.append('# HELP hwmon_pbs_zfs_usage_percent PBS ZFS pool usage percentage')
|
||||||
metrics.append(f'# TYPE hwmon_pbs_zfs_usage_percent gauge')
|
metrics.append('# TYPE hwmon_pbs_zfs_usage_percent gauge')
|
||||||
for pool in pbs.get('zfs_pools', []):
|
for pool in pbs.get('zfs_pools', []):
|
||||||
metrics.append(f'hwmon_pbs_zfs_usage_percent{labels(hostname=hostname, pool=pool["name"])} {pool["usage_percent"]}')
|
metrics.append(f'hwmon_pbs_zfs_usage_percent{labels(hostname=hostname, pool=pool["name"])} {pool["usage_percent"]}')
|
||||||
|
|
||||||
metrics.append(f'# HELP hwmon_pbs_failed_tasks_total PBS failed task count')
|
metrics.append('# HELP hwmon_pbs_failed_tasks_total PBS failed task count')
|
||||||
metrics.append(f'# TYPE hwmon_pbs_failed_tasks_total gauge')
|
metrics.append('# TYPE hwmon_pbs_failed_tasks_total gauge')
|
||||||
metrics.append(f'hwmon_pbs_failed_tasks_total{labels(hostname=hostname)} {len(pbs.get("failed_tasks", []))}')
|
metrics.append(f'hwmon_pbs_failed_tasks_total{labels(hostname=hostname)} {len(pbs.get("failed_tasks", []))}')
|
||||||
|
|
||||||
# === Issue Summary Metrics ===
|
# === Issue Summary Metrics ===
|
||||||
metrics.append(f'# HELP hwmon_issues_total Total number of issues detected')
|
metrics.append('# HELP hwmon_issues_total Total number of issues detected')
|
||||||
metrics.append(f'# TYPE hwmon_issues_total gauge')
|
metrics.append('# TYPE hwmon_issues_total gauge')
|
||||||
|
|
||||||
system_issues = len(health_report.get('system_health', {}).get('issues', []))
|
system_issues = len(health_report.get('system_health', {}).get('issues', []))
|
||||||
ceph_issues = len(ceph.get('issues', [])) + len(ceph.get('cluster_wide_issues', []))
|
ceph_issues = len(ceph.get('issues', [])) + len(ceph.get('cluster_wide_issues', []))
|
||||||
@@ -3881,50 +3893,50 @@ class SystemHealthMonitor:
|
|||||||
pool, device_col, total_str, used_str, avail_str, percent_str, mountpoint = match.groups()
|
pool, device_col, total_str, used_str, avail_str, percent_str, mountpoint = match.groups()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Skip excluded mounts
|
# Skip excluded mounts
|
||||||
if pool.startswith('appPool:') or '/mnt/pve/mediaf' in device_col:
|
if pool.startswith('appPool:') or '/mnt/pve/mediaf' in device_col:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
mountpoint = mountpoint.strip()
|
mountpoint = mountpoint.strip()
|
||||||
|
|
||||||
# Skip excluded mountpoints
|
# Skip excluded mountpoints
|
||||||
if self._is_excluded_mount(mountpoint):
|
if self._is_excluded_mount(mountpoint):
|
||||||
logger.debug(f"Skipping excluded mount: {mountpoint}")
|
logger.debug(f"Skipping excluded mount: {mountpoint}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Parse size values from named regex groups
|
# Parse size values from named regex groups
|
||||||
total_space = self._parse_size(total_str)
|
total_space = self._parse_size(total_str)
|
||||||
used_space = self._parse_size(used_str)
|
used_space = self._parse_size(used_str)
|
||||||
available_space = self._parse_size(avail_str)
|
available_space = self._parse_size(avail_str)
|
||||||
|
|
||||||
# Parse percentage from regex group
|
# Parse percentage from regex group
|
||||||
try:
|
try:
|
||||||
usage_percent = float(percent_str)
|
usage_percent = float(percent_str)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
# Calculate percentage if parsing fails
|
# Calculate percentage if parsing fails
|
||||||
usage_percent = (used_space / total_space * 100) if total_space > 0 else 0
|
usage_percent = (used_space / total_space * 100) if total_space > 0 else 0
|
||||||
|
|
||||||
filesystem = {
|
filesystem = {
|
||||||
'mountpoint': mountpoint,
|
'mountpoint': mountpoint,
|
||||||
'total_space': total_space,
|
'total_space': total_space,
|
||||||
'used_space': used_space,
|
'used_space': used_space,
|
||||||
'available': available_space,
|
'available': available_space,
|
||||||
'usage_percent': usage_percent
|
'usage_percent': usage_percent
|
||||||
}
|
}
|
||||||
container_info['filesystems'].append(filesystem)
|
container_info['filesystems'].append(filesystem)
|
||||||
|
|
||||||
# Check thresholds
|
# Check thresholds
|
||||||
if usage_percent >= self.CONFIG['THRESHOLDS']['LXC_CRITICAL']:
|
if usage_percent >= self.CONFIG['THRESHOLDS']['LXC_CRITICAL']:
|
||||||
lxc_health['status'] = 'CRITICAL'
|
lxc_health['status'] = 'CRITICAL'
|
||||||
issue = f"LXC {vmid} critical storage usage: {usage_percent:.1f}% on {mountpoint}"
|
issue = f"LXC {vmid} critical storage usage: {usage_percent:.1f}% on {mountpoint}"
|
||||||
lxc_health['issues'].append(issue)
|
lxc_health['issues'].append(issue)
|
||||||
elif usage_percent >= self.CONFIG['THRESHOLDS']['LXC_WARNING']:
|
elif usage_percent >= self.CONFIG['THRESHOLDS']['LXC_WARNING']:
|
||||||
if lxc_health['status'] != 'CRITICAL':
|
if lxc_health['status'] != 'CRITICAL':
|
||||||
lxc_health['status'] = 'WARNING'
|
lxc_health['status'] = 'WARNING'
|
||||||
issue = f"LXC {vmid} high storage usage: {usage_percent:.1f}% on {mountpoint}"
|
issue = f"LXC {vmid} high storage usage: {usage_percent:.1f}% on {mountpoint}"
|
||||||
lxc_health['issues'].append(issue)
|
lxc_health['issues'].append(issue)
|
||||||
|
|
||||||
logger.debug(f"Filesystem details: {filesystem}")
|
logger.debug(f"Filesystem details: {filesystem}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.debug(f"Error processing line: {str(e)}")
|
logger.debug(f"Error processing line: {str(e)}")
|
||||||
logger.debug(f"Full exception: {repr(e)}")
|
logger.debug(f"Full exception: {repr(e)}")
|
||||||
@@ -3949,6 +3961,7 @@ class SystemHealthMonitor:
|
|||||||
|
|
||||||
return lxc_health
|
return lxc_health
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser(description="System Health Monitor")
|
parser = argparse.ArgumentParser(description="System Health Monitor")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@@ -4003,5 +4016,6 @@ def main():
|
|||||||
else:
|
else:
|
||||||
monitor.run()
|
monitor.run()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
Reference in New Issue
Block a user