Add Proxmox Backup Server (PBS) health monitoring support
Monitors ZFS pool status/usage and failed PBS tasks (backup, GC, sync). Includes configurable thresholds (PBS_ZFS_WARNING/CRITICAL), Prometheus metrics (hwmon_pbs_*), dry-run summary, issue categorization, and priority classification. Enabled via PBS_ENABLED=true in .env config.

Fixes: #5

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
278
hwmonDaemon.py
278
hwmonDaemon.py
@@ -79,7 +79,16 @@ class SystemHealthMonitor:
|
||||
'CEPH_USAGE_CRITICAL': PRIORITIES['HIGH'], # P2 - Cluster near full
|
||||
'CEPH_USAGE_WARNING': PRIORITIES['MEDIUM'], # P3 - Cluster usage high
|
||||
'CEPH_PG_DEGRADED': PRIORITIES['HIGH'], # P2 - PGs degraded
|
||||
'CEPH_MON_DOWN': PRIORITIES['HIGH'] # P2 - Monitor down
|
||||
'CEPH_MON_DOWN': PRIORITIES['HIGH'], # P2 - Monitor down
|
||||
|
||||
# PBS (Proxmox Backup Server) issues
|
||||
'PBS_ZFS_DEGRADED': PRIORITIES['CRITICAL'], # P1 - ZFS pool degraded
|
||||
'PBS_ZFS_USAGE_CRITICAL': PRIORITIES['HIGH'], # P2 - ZFS pool near full
|
||||
'PBS_ZFS_USAGE_WARNING': PRIORITIES['MEDIUM'],# P3 - ZFS pool usage high
|
||||
'PBS_ZFS_ERRORS': PRIORITIES['HIGH'], # P2 - ZFS pool has errors
|
||||
'PBS_BACKUP_FAILED': PRIORITIES['HIGH'], # P2 - Backup job failed
|
||||
'PBS_GC_FAILED': PRIORITIES['MEDIUM'], # P3 - Garbage collection failed
|
||||
'PBS_SYNC_FAILED': PRIORITIES['MEDIUM'] # P3 - Sync job failed
|
||||
}
|
||||
|
||||
CONFIG = {
|
||||
@@ -133,7 +142,11 @@ class SystemHealthMonitor:
|
||||
'HISTORY_MAX_BYTES': 52428800, # 50MB max storage for history files
|
||||
# Health check endpoint
|
||||
'HEALTH_SERVER_ENABLED': False, # Enable HTTP health check endpoint
|
||||
'HEALTH_SERVER_PORT': 9102 # Port for health check endpoint
|
||||
'HEALTH_SERVER_PORT': 9102, # Port for health check endpoint
|
||||
# PBS (Proxmox Backup Server) monitoring
|
||||
'PBS_ENABLED': False, # Enable PBS health monitoring
|
||||
'PBS_ZFS_WARNING': 80, # ZFS pool usage warning threshold %
|
||||
'PBS_ZFS_CRITICAL': 90 # ZFS pool usage critical threshold %
|
||||
}
|
||||
|
||||
@classmethod
|
||||
@@ -215,6 +228,20 @@ class SystemHealthMonitor:
|
||||
cls.CONFIG['HISTORY_MAX_BYTES'] = int(value)
|
||||
except ValueError:
|
||||
logger.warning(f"Invalid HISTORY_MAX_BYTES value: {value}")
|
||||
# PBS settings
|
||||
elif key == 'PBS_ENABLED':
|
||||
cls.CONFIG['PBS_ENABLED'] = value.lower() in ('true', '1', 'yes')
|
||||
logger.info(f"✓ Loaded PBS_ENABLED: {cls.CONFIG['PBS_ENABLED']}")
|
||||
elif key == 'PBS_ZFS_WARNING':
|
||||
try:
|
||||
cls.CONFIG['PBS_ZFS_WARNING'] = int(value)
|
||||
except ValueError:
|
||||
logger.warning(f"Invalid PBS_ZFS_WARNING value: {value}")
|
||||
elif key == 'PBS_ZFS_CRITICAL':
|
||||
try:
|
||||
cls.CONFIG['PBS_ZFS_CRITICAL'] = int(value)
|
||||
except ValueError:
|
||||
logger.warning(f"Invalid PBS_ZFS_CRITICAL value: {value}")
|
||||
# Health server settings
|
||||
elif key == 'HEALTH_SERVER_ENABLED':
|
||||
cls.CONFIG['HEALTH_SERVER_ENABLED'] = value.lower() in ('true', '1', 'yes')
|
||||
@@ -703,6 +730,8 @@ class SystemHealthMonitor:
|
||||
'ceph': 'ceph-common',
|
||||
'pct': 'pve-container',
|
||||
'dmidecode': 'dmidecode',
|
||||
'proxmox-backup-manager': 'proxmox-backup-server',
|
||||
'zpool': 'zfsutils-linux',
|
||||
}
|
||||
|
||||
availability = {}
|
||||
@@ -841,7 +870,8 @@ class SystemHealthMonitor:
|
||||
'network_health': self._check_network_status(),
|
||||
'ceph_health': self._check_ceph_health(),
|
||||
'lxc_health': self._check_lxc_storage(),
|
||||
'system_health': self._check_system_drive_indicators()
|
||||
'system_health': self._check_system_drive_indicators(),
|
||||
'pbs_health': self._check_pbs_health()
|
||||
}
|
||||
|
||||
if self.dry_run:
|
||||
@@ -896,7 +926,18 @@ class SystemHealthMonitor:
|
||||
|
||||
if health_report['system_health']['issues']:
|
||||
logger.info(f"\nSystem Issues: {len(health_report['system_health']['issues'])} found")
|
||||
|
||||
|
||||
# PBS status
|
||||
pbs = health_report.get('pbs_health', {})
|
||||
if pbs.get('is_pbs_node'):
|
||||
logger.info("\nPBS Status:")
|
||||
for pool in pbs.get('zfs_pools', []):
|
||||
logger.info(f" ZFS Pool '{pool['name']}': {pool['usage_percent']}% used ({pool['used']}/{pool['total']})")
|
||||
if pbs.get('failed_tasks'):
|
||||
logger.info(f" Failed tasks: {len(pbs['failed_tasks'])}")
|
||||
if pbs.get('issues'):
|
||||
logger.info(f" Issues: {len(pbs['issues'])}")
|
||||
|
||||
logger.info("\n=== End Summary ===")
|
||||
|
||||
return health_report
|
||||
@@ -1651,7 +1692,9 @@ class SystemHealthMonitor:
|
||||
'critical reallocated', 'critical current_pending',
|
||||
'network is unreachable',
|
||||
'osd is down', 'osd down', # Ceph OSD down
|
||||
'cluster usage critical' # Ceph usage critical
|
||||
'cluster usage critical', # Ceph usage critical
|
||||
'zfs pool', 'backup failed', # PBS critical issues
|
||||
'usage critical' # PBS ZFS critical usage
|
||||
]):
|
||||
return self.PRIORITIES['HIGH'] # P2
|
||||
|
||||
@@ -1670,7 +1713,8 @@ class SystemHealthMonitor:
|
||||
'warning', 'high temperature', 'correctable ecc',
|
||||
'trend alert', 'critical storage usage',
|
||||
'low available_spare', 'high wear',
|
||||
'health_warn', 'cluster usage warning' # Ceph warnings
|
||||
'health_warn', 'cluster usage warning', # Ceph warnings
|
||||
'gc failed', 'sync failed', 'usage high' # PBS warnings
|
||||
]):
|
||||
return self.PRIORITIES['MEDIUM'] # P3
|
||||
|
||||
@@ -1794,6 +1838,27 @@ class SystemHealthMonitor:
|
||||
self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM']
|
||||
)
|
||||
|
||||
# PBS Issues - Backup server issues (categorized as Hardware for storage, Software for tasks)
|
||||
if any(keyword in issue_lower for keyword in [
|
||||
'pbs', 'zfs pool', 'backup failed', 'gc failed', 'sync failed'
|
||||
]):
|
||||
if any(error in issue_lower for error in [
|
||||
'degraded', 'critical', 'failed', 'errors'
|
||||
]):
|
||||
return (
|
||||
self.TICKET_CATEGORIES['HARDWARE'],
|
||||
self.TICKET_TYPES['ISSUE'],
|
||||
'[pbs]',
|
||||
self.TICKET_TEMPLATES['TICKET_TYPE']['ISSUE']
|
||||
)
|
||||
else:
|
||||
return (
|
||||
self.TICKET_CATEGORIES['HARDWARE'],
|
||||
self.TICKET_TYPES['PROBLEM'],
|
||||
'[pbs]',
|
||||
self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM']
|
||||
)
|
||||
|
||||
# Default: Hardware Problem (for undefined cases)
|
||||
return (
|
||||
self.TICKET_CATEGORIES['HARDWARE'],
|
||||
@@ -2011,6 +2076,12 @@ class SystemHealthMonitor:
|
||||
for issue in ceph_health['issues']:
|
||||
issues.append(f"[ceph] {issue}")
|
||||
|
||||
# Check for PBS issues
|
||||
pbs_health = health_report.get('pbs_health', {})
|
||||
if pbs_health.get('is_pbs_node') and pbs_health.get('issues'):
|
||||
for issue in pbs_health['issues']:
|
||||
issues.append(f"[pbs] {issue.get('issue', str(issue))}")
|
||||
|
||||
logger.info("=== Issue Detection Started ===")
|
||||
logger.info(f"Checking drives: {len(health_report['drives_health']['drives'])} found")
|
||||
logger.info(f"Memory status: {health_report['memory_health']['status']}")
|
||||
@@ -3297,6 +3368,186 @@ class SystemHealthMonitor:
|
||||
|
||||
return ceph_health
|
||||
|
||||
# =============================================================================
|
||||
# PBS (PROXMOX BACKUP SERVER) HEALTH CHECKS
|
||||
# =============================================================================
|
||||
def _check_pbs_health(self) -> Dict[str, Any]:
    """
    Check Proxmox Backup Server health: ZFS pool state/usage and failed tasks.

    Returns a dict with keys:
        status       -- 'OK' | 'WARNING' | 'CRITICAL' (worst finding wins)
        is_pbs_node  -- True when PBS monitoring is enabled and zpool is usable
        zfs_pools    -- per-pool summaries (name, total, used, usage_percent, health)
        failed_tasks -- PBS task records whose status is not 'OK'
        issues       -- structured findings: {type, severity, device, issue}

    Only active when CONFIG['PBS_ENABLED'] is true and the ``zpool`` tool is
    available; otherwise the default all-OK report is returned unchanged.
    Task inspection additionally requires ``proxmox-backup-manager``.
    """
    pbs_health: Dict[str, Any] = {
        'status': 'OK',
        'is_pbs_node': False,
        'zfs_pools': [],
        'failed_tasks': [],
        'issues': []
    }

    if not self.CONFIG.get('PBS_ENABLED', False):
        logger.debug("PBS monitoring disabled in config")
        return pbs_health

    if not self._available_tools.get('zpool'):
        logger.debug("zpool not available - skipping PBS ZFS checks")
        return pbs_health

    # NOTE(review): zpool availability only proves ZFS is present, not that
    # this host actually runs PBS; combined with the PBS_ENABLED opt-in it is
    # treated as "is a PBS node" — confirm this assumption holds for mixed
    # Proxmox VE + ZFS hosts.
    pbs_health['is_pbs_node'] = True

    self._check_pbs_zfs_state(pbs_health)
    self._check_pbs_zfs_usage(pbs_health)

    if self._available_tools.get('proxmox-backup-manager'):
        self._check_pbs_failed_tasks(pbs_health)

    return pbs_health

def _check_pbs_zfs_state(self, pbs_health: Dict[str, Any]) -> None:
    """Parse `zpool status -p`; record degraded pools and data errors in place."""
    try:
        result = subprocess.run(
            ['zpool', 'status', '-p'],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=30
        )
        if result.returncode != 0:
            return
        current_pool = None
        for line in result.stdout.splitlines():
            line_stripped = line.strip()
            if line_stripped.startswith('pool:'):
                current_pool = line_stripped.split(':', 1)[1].strip()
            elif line_stripped.startswith('state:') and current_pool:
                state = line_stripped.split(':', 1)[1].strip()
                # Any non-ONLINE state (DEGRADED, FAULTED, OFFLINE, ...) is
                # treated as critical for a backup datastore.
                if state != 'ONLINE':
                    pbs_health['status'] = 'CRITICAL'
                    pbs_health['issues'].append({
                        'type': 'PBS_ZFS_DEGRADED',
                        'severity': 'CRITICAL',
                        'device': current_pool,
                        'issue': f"ZFS pool '{current_pool}' state: {state}"
                    })
            elif line_stripped.startswith('errors:') and current_pool:
                if 'No known data errors' not in line_stripped:
                    # Bug fix: data errors previously appended an issue but
                    # left the overall status at 'OK'. Escalate to WARNING
                    # without ever downgrading an existing CRITICAL.
                    if pbs_health['status'] == 'OK':
                        pbs_health['status'] = 'WARNING'
                    pbs_health['issues'].append({
                        'type': 'PBS_ZFS_ERRORS',
                        'severity': 'WARNING',
                        'device': current_pool,
                        'issue': f"ZFS pool '{current_pool}' has errors: {line_stripped}"
                    })
    except subprocess.TimeoutExpired:
        logger.warning("zpool status timed out")
    except Exception as e:
        logger.error(f"Error checking ZFS pool status: {e}")

def _check_pbs_zfs_usage(self, pbs_health: Dict[str, Any]) -> None:
    """Parse `zpool list -Hp`; record per-pool usage and threshold breaches in place."""
    try:
        result = subprocess.run(
            ['zpool', 'list', '-Hp'],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=30
        )
        if result.returncode != 0:
            return
        for line in result.stdout.splitlines():
            # -Hp output is tab-separated, parseable numbers:
            # name size alloc free ckpoint expandsz frag cap dedup health altroot
            parts = line.split('\t')
            if len(parts) < 8:
                continue
            pool_name = parts[0]
            try:
                total_bytes = int(parts[1])
                used_bytes = int(parts[2])
                usage_pct = (used_bytes / total_bytes * 100) if total_bytes > 0 else 0
            except (ValueError, ZeroDivisionError):
                continue

            pbs_health['zfs_pools'].append({
                'name': pool_name,
                'total': self._convert_bytes(total_bytes),
                'used': self._convert_bytes(used_bytes),
                'usage_percent': round(usage_pct, 1),
                'health': parts[9] if len(parts) > 9 else 'UNKNOWN'
            })

            if usage_pct >= self.CONFIG['PBS_ZFS_CRITICAL']:
                pbs_health['status'] = 'CRITICAL'
                pbs_health['issues'].append({
                    'type': 'PBS_ZFS_USAGE_CRITICAL',
                    'severity': 'CRITICAL',
                    'device': pool_name,
                    'issue': f"ZFS pool '{pool_name}' usage critical: {usage_pct:.1f}%"
                })
            elif usage_pct >= self.CONFIG['PBS_ZFS_WARNING']:
                # Never downgrade a CRITICAL set by another pool or check.
                if pbs_health['status'] != 'CRITICAL':
                    pbs_health['status'] = 'WARNING'
                pbs_health['issues'].append({
                    'type': 'PBS_ZFS_USAGE_WARNING',
                    'severity': 'WARNING',
                    'device': pool_name,
                    'issue': f"ZFS pool '{pool_name}' usage high: {usage_pct:.1f}%"
                })
    except subprocess.TimeoutExpired:
        logger.warning("zpool list timed out")
    except Exception as e:
        logger.error(f"Error checking ZFS pool usage: {e}")

def _check_pbs_failed_tasks(self, pbs_health: Dict[str, Any]) -> None:
    """Query `proxmox-backup-manager task list`; record non-OK tasks in place."""
    try:
        result = subprocess.run(
            ['proxmox-backup-manager', 'task', 'list', '--output-format', 'json'],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=30
        )
        if result.returncode != 0:
            return
        try:
            tasks = json.loads(result.stdout)
        except json.JSONDecodeError as e:
            logger.warning(f"Failed to parse PBS task list JSON: {e}")
            return

        for task in tasks:
            task_status = task.get('status', '')
            task_type = task.get('worker_type', '')
            task_id = task.get('worker_id', '')

            # An empty status means the task is still running; only a
            # present, non-'OK' status counts as a failure.
            if not task_status or task_status == 'OK':
                continue

            pbs_health['failed_tasks'].append({
                'type': task_type,
                'id': task_id,
                'status': task_status,
                'starttime': task.get('starttime', ''),
                'endtime': task.get('endtime', '')
            })

            # Classify by worker type: failed backups are critical; failed
            # GC/sync (and anything unrecognized) are warnings.
            worker = task_type.lower()
            if 'backup' in worker:
                issue_type, severity = 'PBS_BACKUP_FAILED', 'CRITICAL'
            elif 'gc' in worker or 'garbage' in worker:
                issue_type, severity = 'PBS_GC_FAILED', 'WARNING'
            elif 'sync' in worker:
                issue_type, severity = 'PBS_SYNC_FAILED', 'WARNING'
            else:
                issue_type, severity = 'PBS_BACKUP_FAILED', 'WARNING'

            pbs_health['issues'].append({
                'type': issue_type,
                'severity': severity,
                'device': f"task-{task_type}",
                'issue': f"PBS {task_type} failed: {task_id} - {task_status}"
            })

            if severity == 'CRITICAL':
                pbs_health['status'] = 'CRITICAL'
            elif pbs_health['status'] == 'OK':
                pbs_health['status'] = 'WARNING'
    except subprocess.TimeoutExpired:
        logger.warning("proxmox-backup-manager task list timed out")
    except Exception as e:
        logger.error(f"Error checking PBS tasks: {e}")
|
||||
|
||||
# =============================================================================
|
||||
# PROMETHEUS METRICS EXPORT
|
||||
# =============================================================================
|
||||
@@ -3443,6 +3694,18 @@ class SystemHealthMonitor:
|
||||
usage = fs.get('usage_percent', 0)
|
||||
metrics.append(f'hwmon_lxc_storage_usage_percent{labels(hostname=hostname, vmid=vmid, mountpoint=mountpoint)} {usage}')
|
||||
|
||||
# === PBS Metrics ===
|
||||
pbs = health_report.get('pbs_health', {})
|
||||
if pbs.get('is_pbs_node'):
|
||||
metrics.append(f'# HELP hwmon_pbs_zfs_usage_percent PBS ZFS pool usage percentage')
|
||||
metrics.append(f'# TYPE hwmon_pbs_zfs_usage_percent gauge')
|
||||
for pool in pbs.get('zfs_pools', []):
|
||||
metrics.append(f'hwmon_pbs_zfs_usage_percent{labels(hostname=hostname, pool=pool["name"])} {pool["usage_percent"]}')
|
||||
|
||||
metrics.append(f'# HELP hwmon_pbs_failed_tasks_total PBS failed task count')
|
||||
metrics.append(f'# TYPE hwmon_pbs_failed_tasks_total gauge')
|
||||
metrics.append(f'hwmon_pbs_failed_tasks_total{labels(hostname=hostname)} {len(pbs.get("failed_tasks", []))}')
|
||||
|
||||
# === Issue Summary Metrics ===
|
||||
metrics.append(f'# HELP hwmon_issues_total Total number of issues detected')
|
||||
metrics.append(f'# TYPE hwmon_issues_total gauge')
|
||||
@@ -3450,7 +3713,8 @@ class SystemHealthMonitor:
|
||||
system_issues = len(health_report.get('system_health', {}).get('issues', []))
|
||||
ceph_issues = len(ceph.get('issues', [])) + len(ceph.get('cluster_wide_issues', []))
|
||||
lxc_issues = len(lxc.get('issues', []))
|
||||
total_issues = system_issues + ceph_issues + lxc_issues
|
||||
pbs_issues = len(pbs.get('issues', []))
|
||||
total_issues = system_issues + ceph_issues + lxc_issues + pbs_issues
|
||||
metrics.append(f'hwmon_issues_total{labels(hostname=hostname)} {total_issues}')
|
||||
|
||||
return '\n'.join(metrics) + '\n'
|
||||
|
||||
Reference in New Issue
Block a user