Add Proxmox Backup Server (PBS) health monitoring support

Monitors ZFS pool status/usage and failed PBS tasks (backup, GC, sync).
Includes configurable thresholds (PBS_ZFS_WARNING/CRITICAL), Prometheus
metrics (hwmon_pbs_*), dry-run summary, issue categorization, and
priority classification. Enabled via PBS_ENABLED=true in .env config.

Fixes: #5

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-10 13:18:41 -05:00
parent 07782da7b6
commit d1750ea6cf

View File

@@ -79,7 +79,16 @@ class SystemHealthMonitor:
'CEPH_USAGE_CRITICAL': PRIORITIES['HIGH'], # P2 - Cluster near full 'CEPH_USAGE_CRITICAL': PRIORITIES['HIGH'], # P2 - Cluster near full
'CEPH_USAGE_WARNING': PRIORITIES['MEDIUM'], # P3 - Cluster usage high 'CEPH_USAGE_WARNING': PRIORITIES['MEDIUM'], # P3 - Cluster usage high
'CEPH_PG_DEGRADED': PRIORITIES['HIGH'], # P2 - PGs degraded 'CEPH_PG_DEGRADED': PRIORITIES['HIGH'], # P2 - PGs degraded
'CEPH_MON_DOWN': PRIORITIES['HIGH'] # P2 - Monitor down 'CEPH_MON_DOWN': PRIORITIES['HIGH'], # P2 - Monitor down
# PBS (Proxmox Backup Server) issues
'PBS_ZFS_DEGRADED': PRIORITIES['CRITICAL'], # P1 - ZFS pool degraded
'PBS_ZFS_USAGE_CRITICAL': PRIORITIES['HIGH'], # P2 - ZFS pool near full
'PBS_ZFS_USAGE_WARNING': PRIORITIES['MEDIUM'],# P3 - ZFS pool usage high
'PBS_ZFS_ERRORS': PRIORITIES['HIGH'], # P2 - ZFS pool has errors
'PBS_BACKUP_FAILED': PRIORITIES['HIGH'], # P2 - Backup job failed
'PBS_GC_FAILED': PRIORITIES['MEDIUM'], # P3 - Garbage collection failed
'PBS_SYNC_FAILED': PRIORITIES['MEDIUM'] # P3 - Sync job failed
} }
CONFIG = { CONFIG = {
@@ -133,7 +142,11 @@ class SystemHealthMonitor:
'HISTORY_MAX_BYTES': 52428800, # 50MB max storage for history files 'HISTORY_MAX_BYTES': 52428800, # 50MB max storage for history files
# Health check endpoint # Health check endpoint
'HEALTH_SERVER_ENABLED': False, # Enable HTTP health check endpoint 'HEALTH_SERVER_ENABLED': False, # Enable HTTP health check endpoint
'HEALTH_SERVER_PORT': 9102 # Port for health check endpoint 'HEALTH_SERVER_PORT': 9102, # Port for health check endpoint
# PBS (Proxmox Backup Server) monitoring
'PBS_ENABLED': False, # Enable PBS health monitoring
'PBS_ZFS_WARNING': 80, # ZFS pool usage warning threshold %
'PBS_ZFS_CRITICAL': 90 # ZFS pool usage critical threshold %
} }
@classmethod @classmethod
@@ -215,6 +228,20 @@ class SystemHealthMonitor:
cls.CONFIG['HISTORY_MAX_BYTES'] = int(value) cls.CONFIG['HISTORY_MAX_BYTES'] = int(value)
except ValueError: except ValueError:
logger.warning(f"Invalid HISTORY_MAX_BYTES value: {value}") logger.warning(f"Invalid HISTORY_MAX_BYTES value: {value}")
# PBS settings
elif key == 'PBS_ENABLED':
cls.CONFIG['PBS_ENABLED'] = value.lower() in ('true', '1', 'yes')
logger.info(f"✓ Loaded PBS_ENABLED: {cls.CONFIG['PBS_ENABLED']}")
elif key == 'PBS_ZFS_WARNING':
try:
cls.CONFIG['PBS_ZFS_WARNING'] = int(value)
except ValueError:
logger.warning(f"Invalid PBS_ZFS_WARNING value: {value}")
elif key == 'PBS_ZFS_CRITICAL':
try:
cls.CONFIG['PBS_ZFS_CRITICAL'] = int(value)
except ValueError:
logger.warning(f"Invalid PBS_ZFS_CRITICAL value: {value}")
# Health server settings # Health server settings
elif key == 'HEALTH_SERVER_ENABLED': elif key == 'HEALTH_SERVER_ENABLED':
cls.CONFIG['HEALTH_SERVER_ENABLED'] = value.lower() in ('true', '1', 'yes') cls.CONFIG['HEALTH_SERVER_ENABLED'] = value.lower() in ('true', '1', 'yes')
@@ -703,6 +730,8 @@ class SystemHealthMonitor:
'ceph': 'ceph-common', 'ceph': 'ceph-common',
'pct': 'pve-container', 'pct': 'pve-container',
'dmidecode': 'dmidecode', 'dmidecode': 'dmidecode',
'proxmox-backup-manager': 'proxmox-backup-server',
'zpool': 'zfsutils-linux',
} }
availability = {} availability = {}
@@ -841,7 +870,8 @@ class SystemHealthMonitor:
'network_health': self._check_network_status(), 'network_health': self._check_network_status(),
'ceph_health': self._check_ceph_health(), 'ceph_health': self._check_ceph_health(),
'lxc_health': self._check_lxc_storage(), 'lxc_health': self._check_lxc_storage(),
'system_health': self._check_system_drive_indicators() 'system_health': self._check_system_drive_indicators(),
'pbs_health': self._check_pbs_health()
} }
if self.dry_run: if self.dry_run:
@@ -897,6 +927,17 @@ class SystemHealthMonitor:
if health_report['system_health']['issues']: if health_report['system_health']['issues']:
logger.info(f"\nSystem Issues: {len(health_report['system_health']['issues'])} found") logger.info(f"\nSystem Issues: {len(health_report['system_health']['issues'])} found")
# PBS status
pbs = health_report.get('pbs_health', {})
if pbs.get('is_pbs_node'):
logger.info("\nPBS Status:")
for pool in pbs.get('zfs_pools', []):
logger.info(f" ZFS Pool '{pool['name']}': {pool['usage_percent']}% used ({pool['used']}/{pool['total']})")
if pbs.get('failed_tasks'):
logger.info(f" Failed tasks: {len(pbs['failed_tasks'])}")
if pbs.get('issues'):
logger.info(f" Issues: {len(pbs['issues'])}")
logger.info("\n=== End Summary ===") logger.info("\n=== End Summary ===")
return health_report return health_report
@@ -1651,7 +1692,9 @@ class SystemHealthMonitor:
'critical reallocated', 'critical current_pending', 'critical reallocated', 'critical current_pending',
'network is unreachable', 'network is unreachable',
'osd is down', 'osd down', # Ceph OSD down 'osd is down', 'osd down', # Ceph OSD down
'cluster usage critical' # Ceph usage critical 'cluster usage critical', # Ceph usage critical
'zfs pool', 'backup failed', # PBS critical issues
'usage critical' # PBS ZFS critical usage
]): ]):
return self.PRIORITIES['HIGH'] # P2 return self.PRIORITIES['HIGH'] # P2
@@ -1670,7 +1713,8 @@ class SystemHealthMonitor:
'warning', 'high temperature', 'correctable ecc', 'warning', 'high temperature', 'correctable ecc',
'trend alert', 'critical storage usage', 'trend alert', 'critical storage usage',
'low available_spare', 'high wear', 'low available_spare', 'high wear',
'health_warn', 'cluster usage warning' # Ceph warnings 'health_warn', 'cluster usage warning', # Ceph warnings
'gc failed', 'sync failed', 'usage high' # PBS warnings
]): ]):
return self.PRIORITIES['MEDIUM'] # P3 return self.PRIORITIES['MEDIUM'] # P3
@@ -1794,6 +1838,27 @@ class SystemHealthMonitor:
self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM'] self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM']
) )
# PBS Issues - Backup server issues (categorized as Hardware for storage, Software for tasks)
if any(keyword in issue_lower for keyword in [
'pbs', 'zfs pool', 'backup failed', 'gc failed', 'sync failed'
]):
if any(error in issue_lower for error in [
'degraded', 'critical', 'failed', 'errors'
]):
return (
self.TICKET_CATEGORIES['HARDWARE'],
self.TICKET_TYPES['ISSUE'],
'[pbs]',
self.TICKET_TEMPLATES['TICKET_TYPE']['ISSUE']
)
else:
return (
self.TICKET_CATEGORIES['HARDWARE'],
self.TICKET_TYPES['PROBLEM'],
'[pbs]',
self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM']
)
# Default: Hardware Problem (for undefined cases) # Default: Hardware Problem (for undefined cases)
return ( return (
self.TICKET_CATEGORIES['HARDWARE'], self.TICKET_CATEGORIES['HARDWARE'],
@@ -2011,6 +2076,12 @@ class SystemHealthMonitor:
for issue in ceph_health['issues']: for issue in ceph_health['issues']:
issues.append(f"[ceph] {issue}") issues.append(f"[ceph] {issue}")
# Check for PBS issues
pbs_health = health_report.get('pbs_health', {})
if pbs_health.get('is_pbs_node') and pbs_health.get('issues'):
for issue in pbs_health['issues']:
issues.append(f"[pbs] {issue.get('issue', str(issue))}")
logger.info("=== Issue Detection Started ===") logger.info("=== Issue Detection Started ===")
logger.info(f"Checking drives: {len(health_report['drives_health']['drives'])} found") logger.info(f"Checking drives: {len(health_report['drives_health']['drives'])} found")
logger.info(f"Memory status: {health_report['memory_health']['status']}") logger.info(f"Memory status: {health_report['memory_health']['status']}")
@@ -3297,6 +3368,186 @@ class SystemHealthMonitor:
return ceph_health return ceph_health
# =============================================================================
# PBS (PROXMOX BACKUP SERVER) HEALTH CHECKS
# =============================================================================
def _check_pbs_health(self) -> Dict[str, Any]:
    """
    Check Proxmox Backup Server health: ZFS pool state/usage and task results.

    Returns a dict with keys:
        status:       'OK' | 'WARNING' | 'CRITICAL' — worst condition found.
        is_pbs_node:  True only when PBS monitoring is enabled AND the
                      ``zpool`` tool is available on this host.
        zfs_pools:    list of per-pool dicts (name/total/used/usage_percent/health).
        failed_tasks: raw PBS task records whose status is not 'OK'.
        issues:       structured issue dicts (type/severity/device/issue) that
                      feed the issue-detection and ticketing pipeline.

    Inactive (returns the default OK report with ``is_pbs_node=False``) when
    PBS_ENABLED is false or ``zpool`` is missing. The task check additionally
    requires ``proxmox-backup-manager``. All subprocess calls are bounded by a
    30s timeout and failures are logged rather than raised, so this check can
    never abort the overall health run.
    """
    pbs_health: Dict[str, Any] = {
        'status': 'OK',
        'is_pbs_node': False,
        'zfs_pools': [],
        'failed_tasks': [],
        'issues': []
    }

    if not self.CONFIG.get('PBS_ENABLED', False):
        logger.debug("PBS monitoring disabled in config")
        return pbs_health

    if not self._available_tools.get('zpool'):
        logger.debug("zpool not available - skipping PBS ZFS checks")
        return pbs_health

    pbs_health['is_pbs_node'] = True

    # --- ZFS pool state (zpool status -p) ---------------------------------
    try:
        result = subprocess.run(
            ['zpool', 'status', '-p'],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=30
        )
        if result.returncode == 0:
            # zpool status emits one stanza per pool; track which pool the
            # current 'state:'/'errors:' lines belong to.
            current_pool = None
            for line in result.stdout.splitlines():
                line_stripped = line.strip()
                if line_stripped.startswith('pool:'):
                    current_pool = line_stripped.split(':', 1)[1].strip()
                elif line_stripped.startswith('state:') and current_pool:
                    state = line_stripped.split(':', 1)[1].strip()
                    # Any non-ONLINE state (DEGRADED, FAULTED, OFFLINE, ...)
                    # means redundancy or availability is compromised.
                    if state != 'ONLINE':
                        pbs_health['status'] = 'CRITICAL'
                        pbs_health['issues'].append({
                            'type': 'PBS_ZFS_DEGRADED',
                            'severity': 'CRITICAL',
                            'device': current_pool,
                            'issue': f"ZFS pool '{current_pool}' state: {state}"
                        })
                elif line_stripped.startswith('errors:') and current_pool:
                    if 'No known data errors' not in line_stripped:
                        # BUGFIX: previously the issue was recorded without
                        # escalating 'status', so a pool with data errors
                        # (but ONLINE) still reported overall 'OK'. Escalate
                        # to WARNING, never downgrading an existing CRITICAL.
                        if pbs_health['status'] == 'OK':
                            pbs_health['status'] = 'WARNING'
                        pbs_health['issues'].append({
                            'type': 'PBS_ZFS_ERRORS',
                            'severity': 'WARNING',
                            'device': current_pool,
                            'issue': f"ZFS pool '{current_pool}' has errors: {line_stripped}"
                        })
    except subprocess.TimeoutExpired:
        logger.warning("zpool status timed out")
    except Exception as e:
        logger.error(f"Error checking ZFS pool status: {e}")

    # --- ZFS pool usage (zpool list -Hp: tab-separated, exact bytes) ------
    try:
        result = subprocess.run(
            ['zpool', 'list', '-Hp'],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=30
        )
        if result.returncode == 0:
            for line in result.stdout.splitlines():
                # Default columns: name size alloc free ckpoint expandsz
                # frag cap dedup health altroot
                parts = line.split('\t')
                if len(parts) >= 8:
                    pool_name = parts[0]
                    try:
                        total_bytes = int(parts[1])
                        used_bytes = int(parts[2])
                        usage_pct = (used_bytes / total_bytes * 100) if total_bytes > 0 else 0
                    except (ValueError, ZeroDivisionError):
                        # Skip malformed rows rather than failing the check.
                        continue

                    pool_info = {
                        'name': pool_name,
                        'total': self._convert_bytes(total_bytes),
                        'used': self._convert_bytes(used_bytes),
                        'usage_percent': round(usage_pct, 1),
                        'health': parts[9] if len(parts) > 9 else 'UNKNOWN'
                    }
                    pbs_health['zfs_pools'].append(pool_info)

                    # Threshold checks against configurable limits.
                    if usage_pct >= self.CONFIG['PBS_ZFS_CRITICAL']:
                        pbs_health['status'] = 'CRITICAL'
                        pbs_health['issues'].append({
                            'type': 'PBS_ZFS_USAGE_CRITICAL',
                            'severity': 'CRITICAL',
                            'device': pool_name,
                            'issue': f"ZFS pool '{pool_name}' usage critical: {usage_pct:.1f}%"
                        })
                    elif usage_pct >= self.CONFIG['PBS_ZFS_WARNING']:
                        # Don't downgrade an already-CRITICAL overall status.
                        if pbs_health['status'] != 'CRITICAL':
                            pbs_health['status'] = 'WARNING'
                        pbs_health['issues'].append({
                            'type': 'PBS_ZFS_USAGE_WARNING',
                            'severity': 'WARNING',
                            'device': pool_name,
                            'issue': f"ZFS pool '{pool_name}' usage high: {usage_pct:.1f}%"
                        })
    except subprocess.TimeoutExpired:
        logger.warning("zpool list timed out")
    except Exception as e:
        logger.error(f"Error checking ZFS pool usage: {e}")

    # --- Failed PBS tasks (requires proxmox-backup-manager) ---------------
    if self._available_tools.get('proxmox-backup-manager'):
        try:
            result = subprocess.run(
                ['proxmox-backup-manager', 'task', 'list', '--output-format', 'json'],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                timeout=30
            )
            if result.returncode == 0:
                try:
                    tasks = json.loads(result.stdout)
                    for task in tasks:
                        task_status = task.get('status', '')
                        task_type = task.get('worker_type', '')
                        task_id = task.get('worker_id', '')

                        # Anything other than 'OK' (and non-empty) is a failure.
                        if task_status and task_status != 'OK':
                            failed_task = {
                                'type': task_type,
                                'id': task_id,
                                'status': task_status,
                                'starttime': task.get('starttime', ''),
                                'endtime': task.get('endtime', '')
                            }
                            pbs_health['failed_tasks'].append(failed_task)

                            # Categorize by task type: failed backups are
                            # critical (data at risk); GC/sync failures are
                            # warnings (degraded but recoverable).
                            if 'backup' in task_type.lower():
                                issue_type = 'PBS_BACKUP_FAILED'
                                severity = 'CRITICAL'
                            elif 'gc' in task_type.lower() or 'garbage' in task_type.lower():
                                issue_type = 'PBS_GC_FAILED'
                                severity = 'WARNING'
                            elif 'sync' in task_type.lower():
                                issue_type = 'PBS_SYNC_FAILED'
                                severity = 'WARNING'
                            else:
                                # Unknown worker type: treat as a backup-class
                                # failure but only at WARNING severity.
                                issue_type = 'PBS_BACKUP_FAILED'
                                severity = 'WARNING'

                            pbs_health['issues'].append({
                                'type': issue_type,
                                'severity': severity,
                                'device': f"task-{task_type}",
                                'issue': f"PBS {task_type} failed: {task_id} - {task_status}"
                            })

                            if severity == 'CRITICAL':
                                pbs_health['status'] = 'CRITICAL'
                            elif pbs_health['status'] == 'OK':
                                pbs_health['status'] = 'WARNING'
                except json.JSONDecodeError as e:
                    logger.warning(f"Failed to parse PBS task list JSON: {e}")
        except subprocess.TimeoutExpired:
            logger.warning("proxmox-backup-manager task list timed out")
        except Exception as e:
            logger.error(f"Error checking PBS tasks: {e}")

    return pbs_health
# ============================================================================= # =============================================================================
# PROMETHEUS METRICS EXPORT # PROMETHEUS METRICS EXPORT
# ============================================================================= # =============================================================================
@@ -3443,6 +3694,18 @@ class SystemHealthMonitor:
usage = fs.get('usage_percent', 0) usage = fs.get('usage_percent', 0)
metrics.append(f'hwmon_lxc_storage_usage_percent{labels(hostname=hostname, vmid=vmid, mountpoint=mountpoint)} {usage}') metrics.append(f'hwmon_lxc_storage_usage_percent{labels(hostname=hostname, vmid=vmid, mountpoint=mountpoint)} {usage}')
# === PBS Metrics ===
pbs = health_report.get('pbs_health', {})
if pbs.get('is_pbs_node'):
metrics.append(f'# HELP hwmon_pbs_zfs_usage_percent PBS ZFS pool usage percentage')
metrics.append(f'# TYPE hwmon_pbs_zfs_usage_percent gauge')
for pool in pbs.get('zfs_pools', []):
metrics.append(f'hwmon_pbs_zfs_usage_percent{labels(hostname=hostname, pool=pool["name"])} {pool["usage_percent"]}')
metrics.append(f'# HELP hwmon_pbs_failed_tasks_total PBS failed task count')
metrics.append(f'# TYPE hwmon_pbs_failed_tasks_total gauge')
metrics.append(f'hwmon_pbs_failed_tasks_total{labels(hostname=hostname)} {len(pbs.get("failed_tasks", []))}')
# === Issue Summary Metrics === # === Issue Summary Metrics ===
metrics.append(f'# HELP hwmon_issues_total Total number of issues detected') metrics.append(f'# HELP hwmon_issues_total Total number of issues detected')
metrics.append(f'# TYPE hwmon_issues_total gauge') metrics.append(f'# TYPE hwmon_issues_total gauge')
@@ -3450,7 +3713,8 @@ class SystemHealthMonitor:
system_issues = len(health_report.get('system_health', {}).get('issues', [])) system_issues = len(health_report.get('system_health', {}).get('issues', []))
ceph_issues = len(ceph.get('issues', [])) + len(ceph.get('cluster_wide_issues', [])) ceph_issues = len(ceph.get('issues', [])) + len(ceph.get('cluster_wide_issues', []))
lxc_issues = len(lxc.get('issues', [])) lxc_issues = len(lxc.get('issues', []))
total_issues = system_issues + ceph_issues + lxc_issues pbs_issues = len(pbs.get('issues', []))
total_issues = system_issues + ceph_issues + lxc_issues + pbs_issues
metrics.append(f'hwmon_issues_total{labels(hostname=hostname)} {total_issues}') metrics.append(f'hwmon_issues_total{labels(hostname=hostname)} {total_issues}')
return '\n'.join(metrics) + '\n' return '\n'.join(metrics) + '\n'