Compare commits
17 Commits
f44fce2ba7
...
d1750ea6cf
| Author | SHA1 | Date | |
|---|---|---|---|
| d1750ea6cf | |||
| 07782da7b6 | |||
| b02e416117 | |||
| 7b36255fb4 | |||
| 92bca248ac | |||
| 4a186fb6d6 | |||
| 90346a2da1 | |||
| 308a8d5c5c | |||
| 9f9cc1b763 | |||
| ab67d786ce | |||
| da2de4375e | |||
| 38dd120da2 | |||
| 7383a0c674 | |||
| a3cf5a698f | |||
| c7309663de | |||
| 0559f2d668 | |||
| d79005eb42 |
629
hwmonDaemon.py
629
hwmonDaemon.py
@@ -1,15 +1,16 @@
|
||||
#!/usr/bin/env python3
|
||||
import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re, glob, datetime, fcntl, textwrap
|
||||
import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re, glob, datetime, fcntl, textwrap, shutil
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from typing import Dict, Any, List
|
||||
|
||||
# =============================================================================
|
||||
# LOGGING SETUP
|
||||
# =============================================================================
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.setLevel(logging.DEBUG)
|
||||
logger.setLevel(logging.INFO)
|
||||
|
||||
console_handler = logging.StreamHandler()
|
||||
console_handler.setLevel(logging.DEBUG)
|
||||
console_handler.setLevel(logging.INFO)
|
||||
|
||||
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
|
||||
console_handler.setFormatter(formatter)
|
||||
@@ -78,7 +79,16 @@ class SystemHealthMonitor:
|
||||
'CEPH_USAGE_CRITICAL': PRIORITIES['HIGH'], # P2 - Cluster near full
|
||||
'CEPH_USAGE_WARNING': PRIORITIES['MEDIUM'], # P3 - Cluster usage high
|
||||
'CEPH_PG_DEGRADED': PRIORITIES['HIGH'], # P2 - PGs degraded
|
||||
'CEPH_MON_DOWN': PRIORITIES['HIGH'] # P2 - Monitor down
|
||||
'CEPH_MON_DOWN': PRIORITIES['HIGH'], # P2 - Monitor down
|
||||
|
||||
# PBS (Proxmox Backup Server) issues
|
||||
'PBS_ZFS_DEGRADED': PRIORITIES['CRITICAL'], # P1 - ZFS pool degraded
|
||||
'PBS_ZFS_USAGE_CRITICAL': PRIORITIES['HIGH'], # P2 - ZFS pool near full
|
||||
'PBS_ZFS_USAGE_WARNING': PRIORITIES['MEDIUM'],# P3 - ZFS pool usage high
|
||||
'PBS_ZFS_ERRORS': PRIORITIES['HIGH'], # P2 - ZFS pool has errors
|
||||
'PBS_BACKUP_FAILED': PRIORITIES['HIGH'], # P2 - Backup job failed
|
||||
'PBS_GC_FAILED': PRIORITIES['MEDIUM'], # P3 - Garbage collection failed
|
||||
'PBS_SYNC_FAILED': PRIORITIES['MEDIUM'] # P3 - Sync job failed
|
||||
}
|
||||
|
||||
CONFIG = {
|
||||
@@ -124,7 +134,19 @@ class SystemHealthMonitor:
|
||||
# Prometheus metrics settings
|
||||
'PROMETHEUS_ENABLED': False, # Enable Prometheus metrics export
|
||||
'PROMETHEUS_PORT': 9101, # Port for Prometheus metrics HTTP server
|
||||
'PROMETHEUS_TEXTFILE_PATH': None # Path for textfile collector (alternative to HTTP)
|
||||
'PROMETHEUS_TEXTFILE_PATH': None, # Path for textfile collector (alternative to HTTP)
|
||||
# SMART analysis thresholds
|
||||
'NEW_DRIVE_HOURS_THRESHOLD': 720, # Hours to consider a drive "new" (~30 days)
|
||||
'SMART_ERROR_RECENT_HOURS': 168, # Hours window for recent SMART errors (~1 week)
|
||||
# Storage limits
|
||||
'HISTORY_MAX_BYTES': 52428800, # 50MB max storage for history files
|
||||
# Health check endpoint
|
||||
'HEALTH_SERVER_ENABLED': False, # Enable HTTP health check endpoint
|
||||
'HEALTH_SERVER_PORT': 9102, # Port for health check endpoint
|
||||
# PBS (Proxmox Backup Server) monitoring
|
||||
'PBS_ENABLED': False, # Enable PBS health monitoring
|
||||
'PBS_ZFS_WARNING': 80, # ZFS pool usage warning threshold %
|
||||
'PBS_ZFS_CRITICAL': 90 # ZFS pool usage critical threshold %
|
||||
}
|
||||
|
||||
@classmethod
|
||||
@@ -166,15 +188,24 @@ class SystemHealthMonitor:
|
||||
cls.CONFIG['CEPH_TICKET_NODE'] = value if value else None
|
||||
logger.info(f"✓ Loaded CEPH_TICKET_NODE: {value}")
|
||||
elif key == 'CEPH_USAGE_WARNING':
|
||||
try:
|
||||
cls.CONFIG['CEPH_USAGE_WARNING'] = int(value)
|
||||
except ValueError:
|
||||
logger.warning(f"Invalid CEPH_USAGE_WARNING value: {value}")
|
||||
elif key == 'CEPH_USAGE_CRITICAL':
|
||||
try:
|
||||
cls.CONFIG['CEPH_USAGE_CRITICAL'] = int(value)
|
||||
except ValueError:
|
||||
logger.warning(f"Invalid CEPH_USAGE_CRITICAL value: {value}")
|
||||
# Prometheus settings
|
||||
elif key == 'PROMETHEUS_ENABLED':
|
||||
cls.CONFIG['PROMETHEUS_ENABLED'] = value.lower() in ('true', '1', 'yes')
|
||||
logger.info(f"✓ Loaded PROMETHEUS_ENABLED: {cls.CONFIG['PROMETHEUS_ENABLED']}")
|
||||
elif key == 'PROMETHEUS_PORT':
|
||||
try:
|
||||
cls.CONFIG['PROMETHEUS_PORT'] = int(value)
|
||||
except ValueError:
|
||||
logger.warning(f"Invalid PROMETHEUS_PORT value: {value}")
|
||||
elif key == 'PROMETHEUS_TEXTFILE_PATH':
|
||||
cls.CONFIG['PROMETHEUS_TEXTFILE_PATH'] = value if value else None
|
||||
logger.info(f"✓ Loaded PROMETHEUS_TEXTFILE_PATH: {value}")
|
||||
@@ -182,10 +213,53 @@ class SystemHealthMonitor:
|
||||
elif key == 'CLUSTER_NAME':
|
||||
cls.CONFIG['CLUSTER_NAME'] = value if value else 'proxmox-cluster'
|
||||
logger.info(f"✓ Loaded CLUSTER_NAME: {value}")
|
||||
elif key == 'NEW_DRIVE_HOURS_THRESHOLD':
|
||||
try:
|
||||
cls.CONFIG['NEW_DRIVE_HOURS_THRESHOLD'] = int(value)
|
||||
except ValueError:
|
||||
logger.warning(f"Invalid NEW_DRIVE_HOURS_THRESHOLD value: {value}")
|
||||
elif key == 'SMART_ERROR_RECENT_HOURS':
|
||||
try:
|
||||
cls.CONFIG['SMART_ERROR_RECENT_HOURS'] = int(value)
|
||||
except ValueError:
|
||||
logger.warning(f"Invalid SMART_ERROR_RECENT_HOURS value: {value}")
|
||||
elif key == 'HISTORY_MAX_BYTES':
|
||||
try:
|
||||
cls.CONFIG['HISTORY_MAX_BYTES'] = int(value)
|
||||
except ValueError:
|
||||
logger.warning(f"Invalid HISTORY_MAX_BYTES value: {value}")
|
||||
# PBS settings
|
||||
elif key == 'PBS_ENABLED':
|
||||
cls.CONFIG['PBS_ENABLED'] = value.lower() in ('true', '1', 'yes')
|
||||
logger.info(f"✓ Loaded PBS_ENABLED: {cls.CONFIG['PBS_ENABLED']}")
|
||||
elif key == 'PBS_ZFS_WARNING':
|
||||
try:
|
||||
cls.CONFIG['PBS_ZFS_WARNING'] = int(value)
|
||||
except ValueError:
|
||||
logger.warning(f"Invalid PBS_ZFS_WARNING value: {value}")
|
||||
elif key == 'PBS_ZFS_CRITICAL':
|
||||
try:
|
||||
cls.CONFIG['PBS_ZFS_CRITICAL'] = int(value)
|
||||
except ValueError:
|
||||
logger.warning(f"Invalid PBS_ZFS_CRITICAL value: {value}")
|
||||
# Health server settings
|
||||
elif key == 'HEALTH_SERVER_ENABLED':
|
||||
cls.CONFIG['HEALTH_SERVER_ENABLED'] = value.lower() in ('true', '1', 'yes')
|
||||
logger.info(f"✓ Loaded HEALTH_SERVER_ENABLED: {cls.CONFIG['HEALTH_SERVER_ENABLED']}")
|
||||
elif key == 'HEALTH_SERVER_PORT':
|
||||
try:
|
||||
cls.CONFIG['HEALTH_SERVER_PORT'] = int(value)
|
||||
except ValueError:
|
||||
logger.warning(f"Invalid HEALTH_SERVER_PORT value: {value}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to load .env file: {e}")
|
||||
|
||||
# Validate critical configuration
|
||||
api_key = cls.CONFIG.get('TICKET_API_KEY')
|
||||
if not api_key or api_key == 'your_api_key_here':
|
||||
logger.warning("TICKET_API_KEY is not configured - ticket creation will fail (dry-run will still work)")
|
||||
|
||||
TICKET_TEMPLATES = {
|
||||
'ACTION_TYPE': {
|
||||
'AUTO': '[auto]',
|
||||
@@ -607,13 +681,21 @@ class SystemHealthMonitor:
|
||||
# =============================================================================
|
||||
# INITIALIZATION
|
||||
# =============================================================================
|
||||
def __init__(self, ticket_api_url: str = 'http://10.10.10.45/create_ticket_api.php', dry_run: bool = False):
|
||||
def __init__(self, ticket_api_url: str = 'http://10.10.10.45/create_ticket_api.php', dry_run: bool = False, verbose: bool = False):
|
||||
"""
|
||||
Initialize the system health monitor.
|
||||
|
||||
:param ticket_api_url: URL for the ticket creation API.
|
||||
:param dry_run: If True, simulate API calls without sending requests.
|
||||
:param verbose: If True, enable DEBUG-level logging output.
|
||||
"""
|
||||
# Set log verbosity
|
||||
if verbose:
|
||||
logger.setLevel(logging.DEBUG)
|
||||
for handler in logger.handlers:
|
||||
handler.setLevel(logging.DEBUG)
|
||||
logger.debug("Verbose logging enabled")
|
||||
|
||||
# Load environment configuration first (API keys, etc.)
|
||||
self.load_env_config()
|
||||
|
||||
@@ -623,13 +705,59 @@ class SystemHealthMonitor:
|
||||
# Ensure history directory exists
|
||||
os.makedirs(self.CONFIG['HISTORY_DIR'], exist_ok=True)
|
||||
|
||||
def _enforce_storage_limit(self, history_dir: str, max_bytes: int = 10485760):
|
||||
# Drive details cache (per-run, cleared on next execution)
|
||||
self._drive_details_cache = {}
|
||||
|
||||
# Health check tracking
|
||||
self._last_check_timestamp = None
|
||||
self._last_check_status = 'unknown'
|
||||
|
||||
# Check tool availability at startup
|
||||
self._available_tools = self._check_tool_availability()
|
||||
|
||||
def _check_tool_availability(self) -> Dict[str, bool]:
    """Check which external tools are available on this system.

    Each tool is looked up on PATH with ``shutil.which``. A missing
    required tool is logged at WARNING level; a missing optional tool is
    logged at DEBUG level. Nothing is raised for a missing tool.

    :return: Dict mapping every probed tool name to True if it was found
             on PATH, False otherwise.
    """
    # Maps executable name -> package name shown in the install hint.
    required_tools = {
        'smartctl': 'smartmontools',
        'lsblk': 'util-linux',
    }
    optional_tools = {
        'nvme': 'nvme-cli',
        'ceph': 'ceph-common',
        'pct': 'pve-container',
        'dmidecode': 'dmidecode',
        'proxmox-backup-manager': 'proxmox-backup-server',
        'zpool': 'zfsutils-linux',
    }

    availability = {}
    # The probe is identical for both groups; only the log level and the
    # leading word of the message differ.
    for label, log, tools in (
        ('Required', logger.warning, required_tools),
        ('Optional', logger.debug, optional_tools),
    ):
        for tool, package in tools.items():
            available = shutil.which(tool) is not None
            availability[tool] = available
            if not available:
                log(f"{label} tool '{tool}' not found (install: apt install {package})")

    return availability
|
||||
|
||||
def _enforce_storage_limit(self, history_dir: str, max_bytes: int = None):
|
||||
"""
|
||||
Delete oldest history files if directory exceeds size limit.
|
||||
|
||||
:param history_dir: Directory containing history files
|
||||
:param max_bytes: Maximum directory size in bytes (default 10MB)
|
||||
:param max_bytes: Maximum directory size in bytes (default from CONFIG)
|
||||
"""
|
||||
if max_bytes is None:
|
||||
max_bytes = self.CONFIG.get('HISTORY_MAX_BYTES', 52428800)
|
||||
if not os.path.exists(history_dir):
|
||||
return
|
||||
|
||||
@@ -668,6 +796,45 @@ class SystemHealthMonitor:
|
||||
except Exception as e:
|
||||
logger.error(f"Error enforcing storage limit: {e}")
|
||||
|
||||
# =============================================================================
|
||||
# HEALTH CHECK ENDPOINT
|
||||
# =============================================================================
|
||||
def _start_health_server(self):
    """Start a lightweight HTTP health check endpoint as a daemon thread.

    Serves ``GET /health`` with a JSON document built from the monitor's
    ``_last_check_status`` and ``_last_check_timestamp``; every other path
    gets an empty 404. The port is read from ``CONFIG['HEALTH_SERVER_PORT']``
    (default 9102) and the server listens on all interfaces.

    An ``OSError`` while creating or starting the server (for example the
    port is already in use) is logged as a warning and swallowed, so the
    caller continues without a health endpoint.
    """
    from http.server import HTTPServer, BaseHTTPRequestHandler
    import threading

    # Inside the handler ``self`` is the request handler, so keep a
    # separate reference to the monitor instance.
    monitor = self
    started_at = datetime.datetime.now()

    class HealthHandler(BaseHTTPRequestHandler):
        def do_GET(self):
            # Ignore any query string so probes such as /health?ts=1 match.
            if self.path.split('?', 1)[0] != '/health':
                self.send_response(404)
                self.end_headers()
                return

            now = datetime.datetime.now()
            response = {
                'status': monitor._last_check_status,
                'hostname': socket.gethostname(),
                'last_check': monitor._last_check_timestamp,
                # NOTE: despite its name this is the current server time,
                # not an uptime. The key is kept unchanged so existing
                # consumers keep working; see 'uptime_seconds' below.
                'uptime': now.isoformat(),
                # Seconds elapsed since this health server was started.
                'uptime_seconds': int((now - started_at).total_seconds()),
            }
            body = json.dumps(response).encode()
            self.send_response(200)
            self.send_header('Content-Type', 'application/json')
            self.send_header('Content-Length', str(len(body)))
            self.end_headers()
            self.wfile.write(body)

        def log_message(self, format, *args):
            # Route the default per-request stderr logging through the
            # module logger at DEBUG level instead.
            logger.debug(f"Health server: {format % args}")

    port = self.CONFIG.get('HEALTH_SERVER_PORT', 9102)
    try:
        server = HTTPServer(('', port), HealthHandler)
        # Daemon thread: it must not keep the process alive on exit.
        thread = threading.Thread(target=server.serve_forever, daemon=True)
        thread.start()
        logger.info(f"Health check endpoint started on port {port}")
    except OSError as e:
        logger.warning(f"Could not start health server on port {port}: {e}")
|
||||
|
||||
# =============================================================================
|
||||
# MAIN EXECUTION METHODS
|
||||
# =============================================================================
|
||||
@@ -677,6 +844,10 @@ class SystemHealthMonitor:
|
||||
# Perform health checks and gather the report
|
||||
health_report = self.perform_health_checks()
|
||||
|
||||
# Track last check for health endpoint
|
||||
self._last_check_timestamp = datetime.datetime.now().isoformat()
|
||||
self._last_check_status = health_report.get('drives_health', {}).get('overall_status', 'unknown')
|
||||
|
||||
# Create tickets for any detected critical issues
|
||||
self._create_tickets_for_issues(health_report)
|
||||
|
||||
@@ -699,7 +870,8 @@ class SystemHealthMonitor:
|
||||
'network_health': self._check_network_status(),
|
||||
'ceph_health': self._check_ceph_health(),
|
||||
'lxc_health': self._check_lxc_storage(),
|
||||
'system_health': self._check_system_drive_indicators()
|
||||
'system_health': self._check_system_drive_indicators(),
|
||||
'pbs_health': self._check_pbs_health()
|
||||
}
|
||||
|
||||
if self.dry_run:
|
||||
@@ -755,6 +927,17 @@ class SystemHealthMonitor:
|
||||
if health_report['system_health']['issues']:
|
||||
logger.info(f"\nSystem Issues: {len(health_report['system_health']['issues'])} found")
|
||||
|
||||
# PBS status
|
||||
pbs = health_report.get('pbs_health', {})
|
||||
if pbs.get('is_pbs_node'):
|
||||
logger.info("\nPBS Status:")
|
||||
for pool in pbs.get('zfs_pools', []):
|
||||
logger.info(f" ZFS Pool '{pool['name']}': {pool['usage_percent']}% used ({pool['used']}/{pool['total']})")
|
||||
if pbs.get('failed_tasks'):
|
||||
logger.info(f" Failed tasks: {len(pbs['failed_tasks'])}")
|
||||
if pbs.get('issues'):
|
||||
logger.info(f" Issues: {len(pbs['issues'])}")
|
||||
|
||||
logger.info("\n=== End Summary ===")
|
||||
|
||||
return health_report
|
||||
@@ -776,7 +959,10 @@ class SystemHealthMonitor:
|
||||
|
||||
# Load historical data with file locking
|
||||
history = []
|
||||
file_mode = 'r+' if os.path.exists(historical_file) else 'w+'
|
||||
if os.path.exists(historical_file) and os.path.getsize(historical_file) > 0:
|
||||
file_mode = 'r+'
|
||||
else:
|
||||
file_mode = 'w+'
|
||||
|
||||
with open(historical_file, file_mode) as f:
|
||||
# Acquire exclusive lock
|
||||
@@ -1025,7 +1211,10 @@ class SystemHealthMonitor:
|
||||
# DRIVE HEALTH CHECKING METHODS
|
||||
# =============================================================================
|
||||
def _get_drive_details(self, device: str) -> Dict[str, str]:
|
||||
"""Get detailed drive information using smartctl."""
|
||||
"""Get detailed drive information using smartctl (cached per run)."""
|
||||
if device in self._drive_details_cache:
|
||||
return self._drive_details_cache[device]
|
||||
|
||||
drive_details = {
|
||||
'model': None,
|
||||
'serial': None,
|
||||
@@ -1041,7 +1230,8 @@ class SystemHealthMonitor:
|
||||
['smartctl', '-i', device],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True
|
||||
text=True,
|
||||
timeout=30
|
||||
)
|
||||
|
||||
# Check if smartctl failed completely
|
||||
@@ -1079,6 +1269,7 @@ class SystemHealthMonitor:
|
||||
except Exception as e:
|
||||
logger.debug(f"Error getting drive details for {device}: {e}")
|
||||
|
||||
self._drive_details_cache[device] = drive_details
|
||||
return drive_details
|
||||
|
||||
|
||||
@@ -1501,7 +1692,9 @@ class SystemHealthMonitor:
|
||||
'critical reallocated', 'critical current_pending',
|
||||
'network is unreachable',
|
||||
'osd is down', 'osd down', # Ceph OSD down
|
||||
'cluster usage critical' # Ceph usage critical
|
||||
'cluster usage critical', # Ceph usage critical
|
||||
'zfs pool', 'backup failed', # PBS critical issues
|
||||
'usage critical' # PBS ZFS critical usage
|
||||
]):
|
||||
return self.PRIORITIES['HIGH'] # P2
|
||||
|
||||
@@ -1520,7 +1713,8 @@ class SystemHealthMonitor:
|
||||
'warning', 'high temperature', 'correctable ecc',
|
||||
'trend alert', 'critical storage usage',
|
||||
'low available_spare', 'high wear',
|
||||
'health_warn', 'cluster usage warning' # Ceph warnings
|
||||
'health_warn', 'cluster usage warning', # Ceph warnings
|
||||
'gc failed', 'sync failed', 'usage high' # PBS warnings
|
||||
]):
|
||||
return self.PRIORITIES['MEDIUM'] # P3
|
||||
|
||||
@@ -1644,6 +1838,27 @@ class SystemHealthMonitor:
|
||||
self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM']
|
||||
)
|
||||
|
||||
# PBS Issues - Backup server issues (categorized as Hardware for storage, Software for tasks)
|
||||
if any(keyword in issue_lower for keyword in [
|
||||
'pbs', 'zfs pool', 'backup failed', 'gc failed', 'sync failed'
|
||||
]):
|
||||
if any(error in issue_lower for error in [
|
||||
'degraded', 'critical', 'failed', 'errors'
|
||||
]):
|
||||
return (
|
||||
self.TICKET_CATEGORIES['HARDWARE'],
|
||||
self.TICKET_TYPES['ISSUE'],
|
||||
'[pbs]',
|
||||
self.TICKET_TEMPLATES['TICKET_TYPE']['ISSUE']
|
||||
)
|
||||
else:
|
||||
return (
|
||||
self.TICKET_CATEGORIES['HARDWARE'],
|
||||
self.TICKET_TYPES['PROBLEM'],
|
||||
'[pbs]',
|
||||
self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM']
|
||||
)
|
||||
|
||||
# Default: Hardware Problem (for undefined cases)
|
||||
return (
|
||||
self.TICKET_CATEGORIES['HARDWARE'],
|
||||
@@ -1717,6 +1932,10 @@ class SystemHealthMonitor:
|
||||
)
|
||||
description = self._generate_detailed_description(issue, health_report, priority)
|
||||
|
||||
# NOTE: The ticket API (create_ticket_api.php) deduplicates using a SHA-256 hash of:
|
||||
# issue_category + environment_tags + hostname (excluded for [cluster-wide]) + device
|
||||
# Description content and timestamps are NOT included in the dedup hash.
|
||||
# The 24-hour dedup window prevents duplicate tickets from multiple nodes or runs.
|
||||
ticket_payload = {
|
||||
"title": ticket_title,
|
||||
"description": description,
|
||||
@@ -1776,7 +1995,7 @@ class SystemHealthMonitor:
|
||||
continue
|
||||
|
||||
# Only report issues for drives with valid SMART status
|
||||
if drive.get('smart_issues') and drive.get('smart_status') in ['HEALTHY', 'UNHEALTHY', 'UNKNOWN']:
|
||||
if drive.get('smart_issues') and drive.get('smart_status') in ['HEALTHY', 'UNHEALTHY', 'UNKNOWN', 'REPLACEMENT_NEEDED']:
|
||||
# Filter out generic error messages and manufacturer-specific false positives
|
||||
filtered_issues = []
|
||||
for issue in drive['smart_issues']:
|
||||
@@ -1840,7 +2059,8 @@ class SystemHealthMonitor:
|
||||
designated_node = self.CONFIG.get('CEPH_TICKET_NODE')
|
||||
|
||||
# Cluster-wide issues: only create tickets from designated node (or first node if not set)
|
||||
# The [cluster-wide] tag ensures deduplication in tinker_tickets API
|
||||
# The [cluster-wide] tag + CLUSTER_NAME in ticket title ensures cross-node deduplication
|
||||
# in the tinker_tickets API (dedup hash excludes hostname for cluster-wide issues)
|
||||
if ceph_health.get('cluster_wide_issues'):
|
||||
# If no designated node, all nodes can report (API deduplicates)
|
||||
# If designated node is set, only that node creates tickets
|
||||
@@ -1856,6 +2076,12 @@ class SystemHealthMonitor:
|
||||
for issue in ceph_health['issues']:
|
||||
issues.append(f"[ceph] {issue}")
|
||||
|
||||
# Check for PBS issues
|
||||
pbs_health = health_report.get('pbs_health', {})
|
||||
if pbs_health.get('is_pbs_node') and pbs_health.get('issues'):
|
||||
for issue in pbs_health['issues']:
|
||||
issues.append(f"[pbs] {issue.get('issue', str(issue))}")
|
||||
|
||||
logger.info("=== Issue Detection Started ===")
|
||||
logger.info(f"Checking drives: {len(health_report['drives_health']['drives'])} found")
|
||||
logger.info(f"Memory status: {health_report['memory_health']['status']}")
|
||||
@@ -1886,42 +2112,26 @@ class SystemHealthMonitor:
|
||||
# DISK AND STORAGE UTILITY METHODS
|
||||
# =============================================================================
|
||||
def _get_all_disks(self) -> List[str]:
    """Get all physical disks using lsblk with full device paths.

    Runs ``lsblk -d -n -o NAME,TYPE -p`` (10 second timeout) and keeps
    every row whose TYPE column is ``disk``. Devices whose path starts
    with ``/dev/rbd`` are excluded as virtual devices.

    :return: Sorted list of device paths as printed by lsblk. The list is
             empty when lsblk times out or fails; both cases are logged
             at ERROR level and no exception is raised.
    """
    disks = set()
    try:
        result = subprocess.run(
            ['lsblk', '-d', '-n', '-o', 'NAME,TYPE', '-p'],
            stdout=subprocess.PIPE,
            text=True,
            timeout=10
        )
        for line in result.stdout.splitlines():
            parts = line.split()
            # Only include actual disks, exclude virtual devices.
            if len(parts) >= 2 and parts[1] == 'disk' and not parts[0].startswith('/dev/rbd'):
                disks.add(parts[0])
        logger.debug(f"Physical disks found: {disks}")
    except subprocess.TimeoutExpired:
        logger.error("lsblk timed out during disk detection")
    except Exception as e:
        logger.error(f"Failed to detect disks: {e}")

    # Sorted so callers see a stable, reproducible device order.
    return sorted(disks)
|
||||
|
||||
def _is_excluded_mount(self, mountpoint: str) -> bool:
|
||||
"""Check if a mountpoint should be excluded from monitoring."""
|
||||
@@ -2055,7 +2265,8 @@ class SystemHealthMonitor:
|
||||
['smartctl', '-i', device],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True
|
||||
text=True,
|
||||
timeout=30
|
||||
)
|
||||
|
||||
model_line = None
|
||||
@@ -2095,6 +2306,8 @@ class SystemHealthMonitor:
|
||||
logger.debug(f"Known issues: {firmware_info['known_issues']}")
|
||||
logger.debug("=== End Firmware Check ===\n")
|
||||
|
||||
except subprocess.TimeoutExpired:
|
||||
logger.warning(f"smartctl -i timed out for {device}")
|
||||
except Exception as e:
|
||||
firmware_info['known_issues'].append(f"Error checking firmware: {str(e)}")
|
||||
|
||||
@@ -2266,7 +2479,7 @@ class SystemHealthMonitor:
|
||||
|
||||
def _is_new_drive(self, power_on_hours: int) -> bool:
|
||||
"""Determine if a drive is considered "new" based on power-on hours."""
|
||||
return power_on_hours < 720 # Less than 1 week of runtime
|
||||
return power_on_hours < self.CONFIG['NEW_DRIVE_HOURS_THRESHOLD']
|
||||
|
||||
def _check_smart_health(self, device: str) -> Dict[str, Any]:
|
||||
"""Enhanced SMART health check with better error handling and predictive analysis."""
|
||||
@@ -2303,12 +2516,13 @@ class SystemHealthMonitor:
|
||||
smart_health['issues'].append("Unable to read device information")
|
||||
return smart_health
|
||||
|
||||
# Skip Ridata drives entirely - unreliable and being replaced
|
||||
# Ridata drives - known unreliable hardware, flag for replacement
|
||||
manufacturer = self._detect_manufacturer(drive_details.get('model', ''))
|
||||
if manufacturer == 'Ridata':
|
||||
smart_health['status'] = 'SKIPPED'
|
||||
smart_health['issues'].append("Ridata drive - monitoring disabled (unreliable hardware)")
|
||||
logger.debug(f"Skipping SMART monitoring for Ridata drive {device}")
|
||||
smart_health['status'] = 'REPLACEMENT_NEEDED'
|
||||
smart_health['severity'] = 'WARNING'
|
||||
smart_health['issues'].append("Ridata drive detected - known unreliable hardware, replacement recommended")
|
||||
logger.info(f"Ridata drive {device} flagged for replacement")
|
||||
return smart_health
|
||||
|
||||
logger.debug(f"Drive details for {device}: {drive_details}")
|
||||
@@ -2501,7 +2715,7 @@ class SystemHealthMonitor:
|
||||
for match in error_matches:
|
||||
error_hour = int(match.group(1))
|
||||
current_hours = smart_health['attributes'].get('Power_On_Hours', 0)
|
||||
if current_hours - error_hour < 168: # Errors within last week
|
||||
if current_hours - error_hour < self.CONFIG['SMART_ERROR_RECENT_HOURS']:
|
||||
recent_errors.append(match.group(0))
|
||||
|
||||
if recent_errors:
|
||||
@@ -2540,8 +2754,8 @@ class SystemHealthMonitor:
|
||||
logger.debug(f"Detected Issues: {smart_health['issues']}")
|
||||
logger.debug("=== End SMART Check ===\n")
|
||||
|
||||
# Special handling for NVMe drives
|
||||
if 'nvme' in device:
|
||||
# Special handling for NVMe drives (requires nvme-cli)
|
||||
if 'nvme' in device and self._available_tools.get('nvme'):
|
||||
try:
|
||||
nvme_result = subprocess.run(
|
||||
['nvme', 'smart-log', device],
|
||||
@@ -2564,10 +2778,10 @@ class SystemHealthMonitor:
|
||||
temp_str = line.split(':')[1].strip() if ':' in line else line.strip()
|
||||
logger.debug(f"Raw temperature string: {temp_str}")
|
||||
|
||||
# Extract first temperature value more safely
|
||||
digits = ''.join(c for c in temp_str if c.isdigit())
|
||||
if len(digits) >= 2:
|
||||
temp_value = int(digits[:2])
|
||||
# Extract the first complete number from temperature string
|
||||
temp_match = re.search(r'(\d+)', temp_str)
|
||||
if temp_match:
|
||||
temp_value = int(temp_match.group(1))
|
||||
logger.debug(f"Parsed temperature value: {temp_value}")
|
||||
|
||||
# Set both temperature fields
|
||||
@@ -2608,6 +2822,10 @@ class SystemHealthMonitor:
|
||||
'manufacturer_profile': None
|
||||
}
|
||||
|
||||
if not self._available_tools.get('nvme'):
|
||||
logger.debug(f"nvme-cli not available, skipping NVMe health check for {device}")
|
||||
return smart_health
|
||||
|
||||
try:
|
||||
# Use nvme-cli for NVMe devices
|
||||
result = subprocess.run(
|
||||
@@ -2674,6 +2892,11 @@ class SystemHealthMonitor:
|
||||
"""Check health of all drives in the system."""
|
||||
drives_health = {'overall_status': 'NORMAL', 'drives': []}
|
||||
|
||||
if not self._available_tools.get('smartctl') or not self._available_tools.get('lsblk'):
|
||||
logger.warning("Drive health checks skipped: smartctl or lsblk not available")
|
||||
drives_health['overall_status'] = 'UNKNOWN'
|
||||
return drives_health
|
||||
|
||||
try:
|
||||
# Get only valid physical disks
|
||||
physical_disks = self._get_all_disks()
|
||||
@@ -2698,6 +2921,20 @@ class SystemHealthMonitor:
|
||||
device_partitions[base_dev] = []
|
||||
device_partitions[base_dev].append(part)
|
||||
|
||||
# Run SMART checks in parallel across all drives
|
||||
smart_results = {}
|
||||
max_workers = min(8, len(physical_disks))
|
||||
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
||||
futures = {executor.submit(self._check_smart_health, disk): disk for disk in physical_disks}
|
||||
for future in as_completed(futures):
|
||||
disk = futures[future]
|
||||
try:
|
||||
smart_results[disk] = future.result()
|
||||
except Exception as e:
|
||||
logger.error(f"SMART check failed for {disk}: {e}")
|
||||
smart_results[disk] = {'status': 'ERROR', 'issues': [str(e)], 'temp': None, 'attributes': {}}
|
||||
|
||||
# Build drive reports in original disk order
|
||||
overall_status = 'NORMAL'
|
||||
for disk in physical_disks:
|
||||
drive_report = {
|
||||
@@ -2733,8 +2970,8 @@ class SystemHealthMonitor:
|
||||
if total_space > 0:
|
||||
drive_report['usage_percent'] = (total_used / total_space) * 100
|
||||
|
||||
# Check SMART health
|
||||
smart_health = self._check_smart_health(disk)
|
||||
# Use pre-fetched SMART results
|
||||
smart_health = smart_results.get(disk, {'status': 'ERROR', 'issues': [], 'temp': None, 'attributes': {}})
|
||||
drive_report.update({
|
||||
'smart_status': smart_health['status'],
|
||||
'smart_issues': smart_health['issues'],
|
||||
@@ -2798,17 +3035,19 @@ class SystemHealthMonitor:
|
||||
}
|
||||
|
||||
try:
|
||||
# First check using dmidecode
|
||||
# First check using dmidecode (if available)
|
||||
if self._available_tools.get('dmidecode'):
|
||||
result = subprocess.run(
|
||||
['dmidecode', '--type', 'memory'],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True
|
||||
text=True,
|
||||
timeout=30
|
||||
)
|
||||
if 'Error Correction Type: Multi-bit ECC' in result.stdout:
|
||||
memory_health['has_ecc'] = True
|
||||
|
||||
# If dmidecode didn't find ECC, try the edac method as backup
|
||||
# If dmidecode unavailable or didn't find ECC, try the edac method as backup
|
||||
if not memory_health['has_ecc']:
|
||||
edac_path = '/sys/devices/system/edac/mc'
|
||||
if os.path.exists(edac_path) and os.listdir(edac_path):
|
||||
@@ -2945,8 +3184,6 @@ class SystemHealthMonitor:
|
||||
Returns health status, cluster info, and any issues detected.
|
||||
Cluster-wide issues use [cluster-wide] tag for cross-node deduplication.
|
||||
"""
|
||||
import shutil
|
||||
|
||||
ceph_health = {
|
||||
'status': 'OK',
|
||||
'is_ceph_node': False,
|
||||
@@ -2955,7 +3192,7 @@ class SystemHealthMonitor:
|
||||
'osd_status': [],
|
||||
'mon_status': [],
|
||||
'issues': [],
|
||||
'cluster_wide_issues': [] # Issues that apply to entire cluster
|
||||
'cluster_wide_issues': [] # Issues affecting entire cluster; use CLUSTER_NAME for dedup
|
||||
}
|
||||
|
||||
# Check if Ceph monitoring is enabled
|
||||
@@ -2964,7 +3201,7 @@ class SystemHealthMonitor:
|
||||
return ceph_health
|
||||
|
||||
# Check if ceph CLI is available
|
||||
if not shutil.which('ceph'):
|
||||
if not self._available_tools.get('ceph'):
|
||||
logger.debug("Ceph CLI not found - not a Ceph node")
|
||||
return ceph_health
|
||||
|
||||
@@ -3131,6 +3368,186 @@ class SystemHealthMonitor:
|
||||
|
||||
return ceph_health
|
||||
|
||||
# =============================================================================
|
||||
# PBS (PROXMOX BACKUP SERVER) HEALTH CHECKS
|
||||
# =============================================================================
|
||||
def _check_pbs_health(self) -> Dict[str, Any]:
    """
    Check Proxmox Backup Server health including ZFS pools and task status.

    Runs three independent, best-effort probes (a probe that fails is
    logged and the remaining probes still run):

    1. ``zpool status -p`` - pool state and data-error summary.
    2. ``zpool list -Hp -o name,size,allocated,health`` - pool capacity,
       compared against CONFIG['PBS_ZFS_WARNING'] / CONFIG['PBS_ZFS_CRITICAL'].
    3. ``proxmox-backup-manager task list --output-format json`` - tasks
       whose status is set and is not 'OK' (only when the tool is available).

    Only active when PBS_ENABLED is truthy in CONFIG and ``zpool`` is
    reported as available; otherwise the default (empty, 'OK') result is
    returned with ``is_pbs_node`` left False.

    Returns:
        Dict with keys:
            status:       'OK', 'WARNING' or 'CRITICAL' (worst severity recorded)
            is_pbs_node:  True once the config and zpool availability checks pass
            zfs_pools:    list of dicts (name, total, used, usage_percent, health)
            failed_tasks: list of dicts (type, id, status, starttime, endtime)
            issues:       list of dicts (type, severity, device, issue)
    """
    pbs_health = {
        'status': 'OK',
        'is_pbs_node': False,
        'zfs_pools': [],
        'failed_tasks': [],
        'issues': []
    }

    if not self.CONFIG.get('PBS_ENABLED', False):
        logger.debug("PBS monitoring disabled in config")
        return pbs_health

    if not self._available_tools.get('zpool'):
        logger.debug("zpool not available - skipping PBS ZFS checks")
        return pbs_health

    pbs_health['is_pbs_node'] = True

    def add_issue(issue_type: str, severity: str, device: str, message: str) -> None:
        """Record an issue and raise the overall status to match its severity."""
        pbs_health['issues'].append({
            'type': issue_type,
            'severity': severity,
            'device': device,
            'issue': message
        })
        # Status only ever escalates: CRITICAL always wins, WARNING never
        # downgrades an existing CRITICAL.
        if severity == 'CRITICAL':
            pbs_health['status'] = 'CRITICAL'
        elif pbs_health['status'] == 'OK':
            pbs_health['status'] = 'WARNING'

    # Check ZFS pool status
    try:
        result = subprocess.run(
            ['zpool', 'status', '-p'],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=30
        )
        if result.returncode == 0:
            current_pool = None
            for line in result.stdout.splitlines():
                line_stripped = line.strip()
                if line_stripped.startswith('pool:'):
                    current_pool = line_stripped.split(':', 1)[1].strip()
                elif line_stripped.startswith('state:') and current_pool:
                    state = line_stripped.split(':', 1)[1].strip()
                    if state != 'ONLINE':
                        add_issue(
                            'PBS_ZFS_DEGRADED', 'CRITICAL', current_pool,
                            f"ZFS pool '{current_pool}' state: {state}"
                        )
                elif line_stripped.startswith('errors:') and current_pool:
                    if 'No known data errors' not in line_stripped:
                        # Previously this issue was recorded without touching
                        # the overall status, so a pool with data errors could
                        # still report 'OK'. add_issue() escalates to WARNING.
                        add_issue(
                            'PBS_ZFS_ERRORS', 'WARNING', current_pool,
                            f"ZFS pool '{current_pool}' has errors: {line_stripped}"
                        )
        else:
            logger.warning(
                f"zpool status exited with code {result.returncode}: {result.stderr.strip()}"
            )
    except subprocess.TimeoutExpired:
        logger.warning("zpool status timed out")
    except Exception as e:
        logger.error(f"Error checking ZFS pool status: {e}")

    # Check ZFS pool usage
    try:
        # Request explicit columns so parsing does not depend on the default
        # `zpool list` layout, which differs between ZFS releases.
        result = subprocess.run(
            ['zpool', 'list', '-Hp', '-o', 'name,size,allocated,health'],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=30
        )
        if result.returncode == 0:
            for line in result.stdout.splitlines():
                parts = line.split('\t')
                if len(parts) < 3:
                    continue

                pool_name = parts[0]
                try:
                    total_bytes = int(parts[1])
                    used_bytes = int(parts[2])
                except ValueError:
                    # Non-numeric size column; skip this pool rather than guess.
                    continue
                usage_pct = (used_bytes / total_bytes * 100) if total_bytes > 0 else 0

                pool_info = {
                    'name': pool_name,
                    'total': self._convert_bytes(total_bytes),
                    'used': self._convert_bytes(used_bytes),
                    'usage_percent': round(usage_pct, 1),
                    'health': parts[3].strip() if len(parts) > 3 and parts[3].strip() else 'UNKNOWN'
                }
                pbs_health['zfs_pools'].append(pool_info)

                if usage_pct >= self.CONFIG['PBS_ZFS_CRITICAL']:
                    add_issue(
                        'PBS_ZFS_USAGE_CRITICAL', 'CRITICAL', pool_name,
                        f"ZFS pool '{pool_name}' usage critical: {usage_pct:.1f}%"
                    )
                elif usage_pct >= self.CONFIG['PBS_ZFS_WARNING']:
                    add_issue(
                        'PBS_ZFS_USAGE_WARNING', 'WARNING', pool_name,
                        f"ZFS pool '{pool_name}' usage high: {usage_pct:.1f}%"
                    )
        else:
            logger.warning(
                f"zpool list exited with code {result.returncode}: {result.stderr.strip()}"
            )
    except subprocess.TimeoutExpired:
        logger.warning("zpool list timed out")
    except Exception as e:
        logger.error(f"Error checking ZFS pool usage: {e}")

    # Check failed PBS tasks (requires proxmox-backup-manager)
    if self._available_tools.get('proxmox-backup-manager'):
        try:
            result = subprocess.run(
                ['proxmox-backup-manager', 'task', 'list', '--output-format', 'json'],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                timeout=30
            )
            if result.returncode == 0:
                try:
                    tasks = json.loads(result.stdout)
                except json.JSONDecodeError as e:
                    logger.warning(f"Failed to parse PBS task list JSON: {e}")
                    tasks = []

                if not isinstance(tasks, list):
                    logger.warning("Unexpected PBS task list format (expected a JSON array)")
                    tasks = []

                for task in tasks:
                    if not isinstance(task, dict):
                        continue

                    # `or ''` guards against explicit JSON nulls, which
                    # .get(key, '') would pass through as None.
                    task_status = task.get('status') or ''
                    task_type = task.get('worker_type') or ''
                    task_id = task.get('worker_id') or ''

                    # An empty status is skipped; any other non-'OK' status
                    # is treated as a failure.
                    # NOTE(review): a finished-with-warnings status would also
                    # land here - confirm that is the intended behaviour.
                    if not task_status or task_status == 'OK':
                        continue

                    pbs_health['failed_tasks'].append({
                        'type': task_type,
                        'id': task_id,
                        'status': task_status,
                        'starttime': task.get('starttime', ''),
                        'endtime': task.get('endtime', '')
                    })

                    # Categorize by task type
                    type_lower = str(task_type).lower()
                    if 'backup' in type_lower:
                        issue_type, severity = 'PBS_BACKUP_FAILED', 'CRITICAL'
                    elif 'gc' in type_lower or 'garbage' in type_lower:
                        issue_type, severity = 'PBS_GC_FAILED', 'WARNING'
                    elif 'sync' in type_lower:
                        issue_type, severity = 'PBS_SYNC_FAILED', 'WARNING'
                    else:
                        # Unrecognized task types are reported under the
                        # backup category, at WARNING rather than CRITICAL.
                        issue_type, severity = 'PBS_BACKUP_FAILED', 'WARNING'

                    add_issue(
                        issue_type, severity, f"task-{task_type}",
                        f"PBS {task_type} failed: {task_id} - {task_status}"
                    )
            else:
                logger.warning(
                    f"proxmox-backup-manager task list exited with code "
                    f"{result.returncode}: {result.stderr.strip()}"
                )
        except subprocess.TimeoutExpired:
            logger.warning("proxmox-backup-manager task list timed out")
        except Exception as e:
            logger.error(f"Error checking PBS tasks: {e}")

    return pbs_health
|
||||
|
||||
# =============================================================================
|
||||
# PROMETHEUS METRICS EXPORT
|
||||
# =============================================================================
|
||||
@@ -3148,9 +3565,11 @@ class SystemHealthMonitor:
|
||||
hostname = health_report.get('hostname', socket.gethostname())
|
||||
metrics = []
|
||||
|
||||
# Helper to format labels
|
||||
# Helper to format labels with proper Prometheus escaping
|
||||
def labels(**kwargs) -> str:
|
||||
pairs = [f'{k}="{v}"' for k, v in kwargs.items() if v is not None]
|
||||
def escape(value):
|
||||
return str(value).replace('\\', '\\\\').replace('"', '\\"').replace('\n', '\\n')
|
||||
pairs = [f'{k}="{escape(v)}"' for k, v in kwargs.items() if v is not None]
|
||||
return '{' + ','.join(pairs) + '}' if pairs else ''
|
||||
|
||||
# === System Info ===
|
||||
@@ -3275,6 +3694,18 @@ class SystemHealthMonitor:
|
||||
usage = fs.get('usage_percent', 0)
|
||||
metrics.append(f'hwmon_lxc_storage_usage_percent{labels(hostname=hostname, vmid=vmid, mountpoint=mountpoint)} {usage}')
|
||||
|
||||
# === PBS Metrics ===
|
||||
pbs = health_report.get('pbs_health', {})
|
||||
if pbs.get('is_pbs_node'):
|
||||
metrics.append(f'# HELP hwmon_pbs_zfs_usage_percent PBS ZFS pool usage percentage')
|
||||
metrics.append(f'# TYPE hwmon_pbs_zfs_usage_percent gauge')
|
||||
for pool in pbs.get('zfs_pools', []):
|
||||
metrics.append(f'hwmon_pbs_zfs_usage_percent{labels(hostname=hostname, pool=pool["name"])} {pool["usage_percent"]}')
|
||||
|
||||
metrics.append(f'# HELP hwmon_pbs_failed_tasks_total PBS failed task count')
|
||||
metrics.append(f'# TYPE hwmon_pbs_failed_tasks_total gauge')
|
||||
metrics.append(f'hwmon_pbs_failed_tasks_total{labels(hostname=hostname)} {len(pbs.get("failed_tasks", []))}')
|
||||
|
||||
# === Issue Summary Metrics ===
|
||||
metrics.append(f'# HELP hwmon_issues_total Total number of issues detected')
|
||||
metrics.append(f'# TYPE hwmon_issues_total gauge')
|
||||
@@ -3282,7 +3713,8 @@ class SystemHealthMonitor:
|
||||
system_issues = len(health_report.get('system_health', {}).get('issues', []))
|
||||
ceph_issues = len(ceph.get('issues', [])) + len(ceph.get('cluster_wide_issues', []))
|
||||
lxc_issues = len(lxc.get('issues', []))
|
||||
total_issues = system_issues + ceph_issues + lxc_issues
|
||||
pbs_issues = len(pbs.get('issues', []))
|
||||
total_issues = system_issues + ceph_issues + lxc_issues + pbs_issues
|
||||
metrics.append(f'hwmon_issues_total{labels(hostname=hostname)} {total_issues}')
|
||||
|
||||
return '\n'.join(metrics) + '\n'
|
||||
@@ -3369,6 +3801,10 @@ class SystemHealthMonitor:
|
||||
'issues': []
|
||||
}
|
||||
|
||||
if not self._available_tools.get('pct'):
|
||||
logger.debug("pct not available - not a PVE node or pve-container not installed")
|
||||
return lxc_health
|
||||
|
||||
try:
|
||||
result = subprocess.run(
|
||||
['pct', 'list'],
|
||||
@@ -3409,32 +3845,38 @@ class SystemHealthMonitor:
|
||||
if not fs_line.strip() or 'MP' in fs_line:
|
||||
continue
|
||||
|
||||
# Fix: Use fs_line instead of line, and columns consistently
|
||||
columns = fs_line.split()
|
||||
|
||||
if len(columns) >= 6:
|
||||
try:
|
||||
# Skip excluded mounts by checking the first column
|
||||
if columns[0].startswith('appPool:') or '/mnt/pve/mediaf' in columns[1]:
|
||||
# Parse df output using regex for reliable column extraction
|
||||
match = re.match(
|
||||
r'(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\d+\.?\d*)%?\s+(.*)',
|
||||
fs_line.strip()
|
||||
)
|
||||
if not match:
|
||||
logger.debug(f"Could not parse filesystem line: {fs_line}")
|
||||
continue
|
||||
|
||||
# Get the mountpoint (last column)
|
||||
mountpoint = columns[-1]
|
||||
pool, device_col, total_str, used_str, avail_str, percent_str, mountpoint = match.groups()
|
||||
|
||||
try:
|
||||
# Skip excluded mounts
|
||||
if pool.startswith('appPool:') or '/mnt/pve/mediaf' in device_col:
|
||||
continue
|
||||
|
||||
mountpoint = mountpoint.strip()
|
||||
|
||||
# Skip excluded mountpoints
|
||||
if self._is_excluded_mount(mountpoint):
|
||||
logger.debug(f"Skipping excluded mount: {mountpoint}")
|
||||
continue
|
||||
|
||||
# Parse size values safely - use correct column indices
|
||||
total_space = self._parse_size(columns[2]) # 3rd column
|
||||
used_space = self._parse_size(columns[3]) # 4th column
|
||||
available_space = self._parse_size(columns[4]) # 5th column
|
||||
# Parse size values from named regex groups
|
||||
total_space = self._parse_size(total_str)
|
||||
used_space = self._parse_size(used_str)
|
||||
available_space = self._parse_size(avail_str)
|
||||
|
||||
# Parse percentage safely
|
||||
# Parse percentage from regex group
|
||||
try:
|
||||
usage_percent = float(columns[5].rstrip('%')) # 6th column
|
||||
except (ValueError, IndexError):
|
||||
usage_percent = float(percent_str)
|
||||
except ValueError:
|
||||
# Calculate percentage if parsing fails
|
||||
usage_percent = (used_space / total_space * 100) if total_space > 0 else 0
|
||||
|
||||
@@ -3501,13 +3943,28 @@ def main():
|
||||
metavar="FILE",
|
||||
help="Export health report to JSON file."
|
||||
)
|
||||
parser.add_argument(
|
||||
"-v", "--verbose",
|
||||
action="store_true",
|
||||
help="Enable verbose (DEBUG) logging output."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--health-server",
|
||||
action="store_true",
|
||||
help="Start HTTP health check endpoint (default port 9102)."
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
monitor = SystemHealthMonitor(
|
||||
ticket_api_url=SystemHealthMonitor.CONFIG['TICKET_API_URL'],
|
||||
dry_run=args.dry_run
|
||||
dry_run=args.dry_run,
|
||||
verbose=args.verbose
|
||||
)
|
||||
|
||||
# Start health server if requested via CLI or .env
|
||||
if args.health_server or monitor.CONFIG.get('HEALTH_SERVER_ENABLED', False):
|
||||
monitor._start_health_server()
|
||||
|
||||
if args.metrics:
|
||||
# Just output metrics to stdout
|
||||
health_report = monitor.perform_health_checks()
|
||||
|
||||
Reference in New Issue
Block a user