Add Proxmox Backup Server (PBS) health monitoring support
Monitors ZFS pool status/usage and failed PBS tasks (backup, GC, sync). Includes configurable thresholds (PBS_ZFS_WARNING/CRITICAL), Prometheus metrics (hwmon_pbs_*), dry-run summary, issue categorization, and priority classification. Enabled via PBS_ENABLED=true in .env config.

Fixes: #5

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
278
hwmonDaemon.py
278
hwmonDaemon.py
@@ -79,7 +79,16 @@ class SystemHealthMonitor:
|
||||
'CEPH_USAGE_CRITICAL': PRIORITIES['HIGH'], # P2 - Cluster near full
|
||||
'CEPH_USAGE_WARNING': PRIORITIES['MEDIUM'], # P3 - Cluster usage high
|
||||
'CEPH_PG_DEGRADED': PRIORITIES['HIGH'], # P2 - PGs degraded
|
||||
'CEPH_MON_DOWN': PRIORITIES['HIGH'] # P2 - Monitor down
|
||||
'CEPH_MON_DOWN': PRIORITIES['HIGH'], # P2 - Monitor down
|
||||
|
||||
# PBS (Proxmox Backup Server) issues
|
||||
'PBS_ZFS_DEGRADED': PRIORITIES['CRITICAL'], # P1 - ZFS pool degraded
|
||||
'PBS_ZFS_USAGE_CRITICAL': PRIORITIES['HIGH'], # P2 - ZFS pool near full
|
||||
'PBS_ZFS_USAGE_WARNING': PRIORITIES['MEDIUM'],# P3 - ZFS pool usage high
|
||||
'PBS_ZFS_ERRORS': PRIORITIES['HIGH'], # P2 - ZFS pool has errors
|
||||
'PBS_BACKUP_FAILED': PRIORITIES['HIGH'], # P2 - Backup job failed
|
||||
'PBS_GC_FAILED': PRIORITIES['MEDIUM'], # P3 - Garbage collection failed
|
||||
'PBS_SYNC_FAILED': PRIORITIES['MEDIUM'] # P3 - Sync job failed
|
||||
}
|
||||
|
||||
CONFIG = {
|
||||
@@ -133,7 +142,11 @@ class SystemHealthMonitor:
|
||||
'HISTORY_MAX_BYTES': 52428800, # 50MB max storage for history files
|
||||
# Health check endpoint
|
||||
'HEALTH_SERVER_ENABLED': False, # Enable HTTP health check endpoint
|
||||
'HEALTH_SERVER_PORT': 9102 # Port for health check endpoint
|
||||
'HEALTH_SERVER_PORT': 9102, # Port for health check endpoint
|
||||
# PBS (Proxmox Backup Server) monitoring
|
||||
'PBS_ENABLED': False, # Enable PBS health monitoring
|
||||
'PBS_ZFS_WARNING': 80, # ZFS pool usage warning threshold %
|
||||
'PBS_ZFS_CRITICAL': 90 # ZFS pool usage critical threshold %
|
||||
}
|
||||
|
||||
@classmethod
|
||||
@@ -215,6 +228,20 @@ class SystemHealthMonitor:
|
||||
cls.CONFIG['HISTORY_MAX_BYTES'] = int(value)
|
||||
except ValueError:
|
||||
logger.warning(f"Invalid HISTORY_MAX_BYTES value: {value}")
|
||||
# PBS settings
|
||||
elif key == 'PBS_ENABLED':
|
||||
cls.CONFIG['PBS_ENABLED'] = value.lower() in ('true', '1', 'yes')
|
||||
logger.info(f"✓ Loaded PBS_ENABLED: {cls.CONFIG['PBS_ENABLED']}")
|
||||
elif key == 'PBS_ZFS_WARNING':
|
||||
try:
|
||||
cls.CONFIG['PBS_ZFS_WARNING'] = int(value)
|
||||
except ValueError:
|
||||
logger.warning(f"Invalid PBS_ZFS_WARNING value: {value}")
|
||||
elif key == 'PBS_ZFS_CRITICAL':
|
||||
try:
|
||||
cls.CONFIG['PBS_ZFS_CRITICAL'] = int(value)
|
||||
except ValueError:
|
||||
logger.warning(f"Invalid PBS_ZFS_CRITICAL value: {value}")
|
||||
# Health server settings
|
||||
elif key == 'HEALTH_SERVER_ENABLED':
|
||||
cls.CONFIG['HEALTH_SERVER_ENABLED'] = value.lower() in ('true', '1', 'yes')
|
||||
@@ -703,6 +730,8 @@ class SystemHealthMonitor:
|
||||
'ceph': 'ceph-common',
|
||||
'pct': 'pve-container',
|
||||
'dmidecode': 'dmidecode',
|
||||
'proxmox-backup-manager': 'proxmox-backup-server',
|
||||
'zpool': 'zfsutils-linux',
|
||||
}
|
||||
|
||||
availability = {}
|
||||
@@ -841,7 +870,8 @@ class SystemHealthMonitor:
|
||||
'network_health': self._check_network_status(),
|
||||
'ceph_health': self._check_ceph_health(),
|
||||
'lxc_health': self._check_lxc_storage(),
|
||||
'system_health': self._check_system_drive_indicators()
|
||||
'system_health': self._check_system_drive_indicators(),
|
||||
'pbs_health': self._check_pbs_health()
|
||||
}
|
||||
|
||||
if self.dry_run:
|
||||
@@ -896,7 +926,18 @@ class SystemHealthMonitor:
|
||||
|
||||
if health_report['system_health']['issues']:
|
||||
logger.info(f"\nSystem Issues: {len(health_report['system_health']['issues'])} found")
|
||||
|
||||
|
||||
# PBS status
|
||||
pbs = health_report.get('pbs_health', {})
|
||||
if pbs.get('is_pbs_node'):
|
||||
logger.info("\nPBS Status:")
|
||||
for pool in pbs.get('zfs_pools', []):
|
||||
logger.info(f" ZFS Pool '{pool['name']}': {pool['usage_percent']}% used ({pool['used']}/{pool['total']})")
|
||||
if pbs.get('failed_tasks'):
|
||||
logger.info(f" Failed tasks: {len(pbs['failed_tasks'])}")
|
||||
if pbs.get('issues'):
|
||||
logger.info(f" Issues: {len(pbs['issues'])}")
|
||||
|
||||
logger.info("\n=== End Summary ===")
|
||||
|
||||
return health_report
|
||||
@@ -1651,7 +1692,9 @@ class SystemHealthMonitor:
|
||||
'critical reallocated', 'critical current_pending',
|
||||
'network is unreachable',
|
||||
'osd is down', 'osd down', # Ceph OSD down
|
||||
'cluster usage critical' # Ceph usage critical
|
||||
'cluster usage critical', # Ceph usage critical
|
||||
'zfs pool', 'backup failed', # PBS critical issues
|
||||
'usage critical' # PBS ZFS critical usage
|
||||
]):
|
||||
return self.PRIORITIES['HIGH'] # P2
|
||||
|
||||
@@ -1670,7 +1713,8 @@ class SystemHealthMonitor:
|
||||
'warning', 'high temperature', 'correctable ecc',
|
||||
'trend alert', 'critical storage usage',
|
||||
'low available_spare', 'high wear',
|
||||
'health_warn', 'cluster usage warning' # Ceph warnings
|
||||
'health_warn', 'cluster usage warning', # Ceph warnings
|
||||
'gc failed', 'sync failed', 'usage high' # PBS warnings
|
||||
]):
|
||||
return self.PRIORITIES['MEDIUM'] # P3
|
||||
|
||||
@@ -1794,6 +1838,27 @@ class SystemHealthMonitor:
|
||||
self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM']
|
||||
)
|
||||
|
||||
# PBS Issues - Backup server issues (categorized as Hardware for storage, Software for tasks)
|
||||
if any(keyword in issue_lower for keyword in [
|
||||
'pbs', 'zfs pool', 'backup failed', 'gc failed', 'sync failed'
|
||||
]):
|
||||
if any(error in issue_lower for error in [
|
||||
'degraded', 'critical', 'failed', 'errors'
|
||||
]):
|
||||
return (
|
||||
self.TICKET_CATEGORIES['HARDWARE'],
|
||||
self.TICKET_TYPES['ISSUE'],
|
||||
'[pbs]',
|
||||
self.TICKET_TEMPLATES['TICKET_TYPE']['ISSUE']
|
||||
)
|
||||
else:
|
||||
return (
|
||||
self.TICKET_CATEGORIES['HARDWARE'],
|
||||
self.TICKET_TYPES['PROBLEM'],
|
||||
'[pbs]',
|
||||
self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM']
|
||||
)
|
||||
|
||||
# Default: Hardware Problem (for undefined cases)
|
||||
return (
|
||||
self.TICKET_CATEGORIES['HARDWARE'],
|
||||
@@ -2011,6 +2076,12 @@ class SystemHealthMonitor:
|
||||
for issue in ceph_health['issues']:
|
||||
issues.append(f"[ceph] {issue}")
|
||||
|
||||
# Check for PBS issues
|
||||
pbs_health = health_report.get('pbs_health', {})
|
||||
if pbs_health.get('is_pbs_node') and pbs_health.get('issues'):
|
||||
for issue in pbs_health['issues']:
|
||||
issues.append(f"[pbs] {issue.get('issue', str(issue))}")
|
||||
|
||||
logger.info("=== Issue Detection Started ===")
|
||||
logger.info(f"Checking drives: {len(health_report['drives_health']['drives'])} found")
|
||||
logger.info(f"Memory status: {health_report['memory_health']['status']}")
|
||||
@@ -3297,6 +3368,186 @@ class SystemHealthMonitor:
|
||||
|
||||
return ceph_health
|
||||
|
||||
# =============================================================================
|
||||
# PBS (PROXMOX BACKUP SERVER) HEALTH CHECKS
|
||||
# =============================================================================
|
||||
def _check_pbs_health(self) -> Dict[str, Any]:
    """
    Check Proxmox Backup Server health: ZFS pool state/usage and failed tasks.

    Returns a dict with keys:
        status       -- 'OK' | 'WARNING' | 'CRITICAL' (worst finding wins)
        is_pbs_node  -- True when PBS monitoring is enabled and zpool is usable
        zfs_pools    -- per-pool summaries (name, total, used, usage_percent, health)
        failed_tasks -- PBS task records whose status is not 'OK'
        issues       -- structured findings: {type, severity, device, issue}

    Only active when CONFIG['PBS_ENABLED'] is true and the ``zpool`` tool is
    available; otherwise the default all-OK report is returned unchanged.
    Task inspection additionally requires ``proxmox-backup-manager``.
    """
    pbs_health: Dict[str, Any] = {
        'status': 'OK',
        'is_pbs_node': False,
        'zfs_pools': [],
        'failed_tasks': [],
        'issues': []
    }

    if not self.CONFIG.get('PBS_ENABLED', False):
        logger.debug("PBS monitoring disabled in config")
        return pbs_health

    if not self._available_tools.get('zpool'):
        logger.debug("zpool not available - skipping PBS ZFS checks")
        return pbs_health

    # NOTE(review): zpool availability only proves ZFS is present, not that
    # this host actually runs PBS; combined with the PBS_ENABLED opt-in it is
    # treated as "is a PBS node" — confirm this assumption holds for mixed
    # Proxmox VE + ZFS hosts.
    pbs_health['is_pbs_node'] = True

    self._check_pbs_zfs_state(pbs_health)
    self._check_pbs_zfs_usage(pbs_health)

    if self._available_tools.get('proxmox-backup-manager'):
        self._check_pbs_failed_tasks(pbs_health)

    return pbs_health

def _check_pbs_zfs_state(self, pbs_health: Dict[str, Any]) -> None:
    """Parse `zpool status -p`; record degraded pools and data errors in place."""
    try:
        result = subprocess.run(
            ['zpool', 'status', '-p'],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=30
        )
        if result.returncode != 0:
            return
        current_pool = None
        for line in result.stdout.splitlines():
            line_stripped = line.strip()
            if line_stripped.startswith('pool:'):
                current_pool = line_stripped.split(':', 1)[1].strip()
            elif line_stripped.startswith('state:') and current_pool:
                state = line_stripped.split(':', 1)[1].strip()
                # Any non-ONLINE state (DEGRADED, FAULTED, OFFLINE, ...) is
                # treated as critical for a backup datastore.
                if state != 'ONLINE':
                    pbs_health['status'] = 'CRITICAL'
                    pbs_health['issues'].append({
                        'type': 'PBS_ZFS_DEGRADED',
                        'severity': 'CRITICAL',
                        'device': current_pool,
                        'issue': f"ZFS pool '{current_pool}' state: {state}"
                    })
            elif line_stripped.startswith('errors:') and current_pool:
                if 'No known data errors' not in line_stripped:
                    # Bug fix: data errors previously appended an issue but
                    # left the overall status at 'OK'. Escalate to WARNING
                    # without ever downgrading an existing CRITICAL.
                    if pbs_health['status'] == 'OK':
                        pbs_health['status'] = 'WARNING'
                    pbs_health['issues'].append({
                        'type': 'PBS_ZFS_ERRORS',
                        'severity': 'WARNING',
                        'device': current_pool,
                        'issue': f"ZFS pool '{current_pool}' has errors: {line_stripped}"
                    })
    except subprocess.TimeoutExpired:
        logger.warning("zpool status timed out")
    except Exception as e:
        logger.error(f"Error checking ZFS pool status: {e}")

def _check_pbs_zfs_usage(self, pbs_health: Dict[str, Any]) -> None:
    """Parse `zpool list -Hp`; record per-pool usage and threshold breaches in place."""
    try:
        result = subprocess.run(
            ['zpool', 'list', '-Hp'],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=30
        )
        if result.returncode != 0:
            return
        for line in result.stdout.splitlines():
            # -Hp output is tab-separated, parseable numbers:
            # name size alloc free ckpoint expandsz frag cap dedup health altroot
            parts = line.split('\t')
            if len(parts) < 8:
                continue
            pool_name = parts[0]
            try:
                total_bytes = int(parts[1])
                used_bytes = int(parts[2])
                usage_pct = (used_bytes / total_bytes * 100) if total_bytes > 0 else 0
            except (ValueError, ZeroDivisionError):
                continue

            pbs_health['zfs_pools'].append({
                'name': pool_name,
                'total': self._convert_bytes(total_bytes),
                'used': self._convert_bytes(used_bytes),
                'usage_percent': round(usage_pct, 1),
                'health': parts[9] if len(parts) > 9 else 'UNKNOWN'
            })

            if usage_pct >= self.CONFIG['PBS_ZFS_CRITICAL']:
                pbs_health['status'] = 'CRITICAL'
                pbs_health['issues'].append({
                    'type': 'PBS_ZFS_USAGE_CRITICAL',
                    'severity': 'CRITICAL',
                    'device': pool_name,
                    'issue': f"ZFS pool '{pool_name}' usage critical: {usage_pct:.1f}%"
                })
            elif usage_pct >= self.CONFIG['PBS_ZFS_WARNING']:
                # Never downgrade a CRITICAL set by another pool or check.
                if pbs_health['status'] != 'CRITICAL':
                    pbs_health['status'] = 'WARNING'
                pbs_health['issues'].append({
                    'type': 'PBS_ZFS_USAGE_WARNING',
                    'severity': 'WARNING',
                    'device': pool_name,
                    'issue': f"ZFS pool '{pool_name}' usage high: {usage_pct:.1f}%"
                })
    except subprocess.TimeoutExpired:
        logger.warning("zpool list timed out")
    except Exception as e:
        logger.error(f"Error checking ZFS pool usage: {e}")

def _check_pbs_failed_tasks(self, pbs_health: Dict[str, Any]) -> None:
    """Query `proxmox-backup-manager task list`; record non-OK tasks in place."""
    try:
        result = subprocess.run(
            ['proxmox-backup-manager', 'task', 'list', '--output-format', 'json'],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=30
        )
        if result.returncode != 0:
            return
        try:
            tasks = json.loads(result.stdout)
        except json.JSONDecodeError as e:
            logger.warning(f"Failed to parse PBS task list JSON: {e}")
            return

        for task in tasks:
            task_status = task.get('status', '')
            task_type = task.get('worker_type', '')
            task_id = task.get('worker_id', '')

            # An empty status means the task is still running; only a
            # present, non-'OK' status counts as a failure.
            if not task_status or task_status == 'OK':
                continue

            pbs_health['failed_tasks'].append({
                'type': task_type,
                'id': task_id,
                'status': task_status,
                'starttime': task.get('starttime', ''),
                'endtime': task.get('endtime', '')
            })

            # Classify by worker type: failed backups are critical; failed
            # GC/sync (and anything unrecognized) are warnings.
            worker = task_type.lower()
            if 'backup' in worker:
                issue_type, severity = 'PBS_BACKUP_FAILED', 'CRITICAL'
            elif 'gc' in worker or 'garbage' in worker:
                issue_type, severity = 'PBS_GC_FAILED', 'WARNING'
            elif 'sync' in worker:
                issue_type, severity = 'PBS_SYNC_FAILED', 'WARNING'
            else:
                issue_type, severity = 'PBS_BACKUP_FAILED', 'WARNING'

            pbs_health['issues'].append({
                'type': issue_type,
                'severity': severity,
                'device': f"task-{task_type}",
                'issue': f"PBS {task_type} failed: {task_id} - {task_status}"
            })

            if severity == 'CRITICAL':
                pbs_health['status'] = 'CRITICAL'
            elif pbs_health['status'] == 'OK':
                pbs_health['status'] = 'WARNING'
    except subprocess.TimeoutExpired:
        logger.warning("proxmox-backup-manager task list timed out")
    except Exception as e:
        logger.error(f"Error checking PBS tasks: {e}")
|
||||
|
||||
# =============================================================================
|
||||
# PROMETHEUS METRICS EXPORT
|
||||
# =============================================================================
|
||||
@@ -3443,6 +3694,18 @@ class SystemHealthMonitor:
|
||||
usage = fs.get('usage_percent', 0)
|
||||
metrics.append(f'hwmon_lxc_storage_usage_percent{labels(hostname=hostname, vmid=vmid, mountpoint=mountpoint)} {usage}')
|
||||
|
||||
# === PBS Metrics ===
|
||||
pbs = health_report.get('pbs_health', {})
|
||||
if pbs.get('is_pbs_node'):
|
||||
metrics.append(f'# HELP hwmon_pbs_zfs_usage_percent PBS ZFS pool usage percentage')
|
||||
metrics.append(f'# TYPE hwmon_pbs_zfs_usage_percent gauge')
|
||||
for pool in pbs.get('zfs_pools', []):
|
||||
metrics.append(f'hwmon_pbs_zfs_usage_percent{labels(hostname=hostname, pool=pool["name"])} {pool["usage_percent"]}')
|
||||
|
||||
metrics.append(f'# HELP hwmon_pbs_failed_tasks_total PBS failed task count')
|
||||
metrics.append(f'# TYPE hwmon_pbs_failed_tasks_total gauge')
|
||||
metrics.append(f'hwmon_pbs_failed_tasks_total{labels(hostname=hostname)} {len(pbs.get("failed_tasks", []))}')
|
||||
|
||||
# === Issue Summary Metrics ===
|
||||
metrics.append(f'# HELP hwmon_issues_total Total number of issues detected')
|
||||
metrics.append(f'# TYPE hwmon_issues_total gauge')
|
||||
@@ -3450,7 +3713,8 @@ class SystemHealthMonitor:
|
||||
system_issues = len(health_report.get('system_health', {}).get('issues', []))
|
||||
ceph_issues = len(ceph.get('issues', [])) + len(ceph.get('cluster_wide_issues', []))
|
||||
lxc_issues = len(lxc.get('issues', []))
|
||||
total_issues = system_issues + ceph_issues + lxc_issues
|
||||
pbs_issues = len(pbs.get('issues', []))
|
||||
total_issues = system_issues + ceph_issues + lxc_issues + pbs_issues
|
||||
metrics.append(f'hwmon_issues_total{labels(hostname=hostname)} {total_issues}')
|
||||
|
||||
return '\n'.join(metrics) + '\n'
|
||||
|
||||
Reference in New Issue
Block a user