Add Proxmox Backup Server (PBS) health monitoring support

Monitors ZFS pool status/usage and failed PBS tasks (backup, GC, sync).
Includes configurable thresholds (PBS_ZFS_WARNING/CRITICAL), Prometheus
metrics (hwmon_pbs_*), dry-run summary, issue categorization, and
priority classification. Enabled via PBS_ENABLED=true in .env config.

Fixes: #5

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-10 13:18:41 -05:00
parent 07782da7b6
commit d1750ea6cf

View File

@@ -79,7 +79,16 @@ class SystemHealthMonitor:
'CEPH_USAGE_CRITICAL': PRIORITIES['HIGH'], # P2 - Cluster near full 'CEPH_USAGE_CRITICAL': PRIORITIES['HIGH'], # P2 - Cluster near full
'CEPH_USAGE_WARNING': PRIORITIES['MEDIUM'], # P3 - Cluster usage high 'CEPH_USAGE_WARNING': PRIORITIES['MEDIUM'], # P3 - Cluster usage high
'CEPH_PG_DEGRADED': PRIORITIES['HIGH'], # P2 - PGs degraded 'CEPH_PG_DEGRADED': PRIORITIES['HIGH'], # P2 - PGs degraded
'CEPH_MON_DOWN': PRIORITIES['HIGH'] # P2 - Monitor down 'CEPH_MON_DOWN': PRIORITIES['HIGH'], # P2 - Monitor down
# PBS (Proxmox Backup Server) issues
'PBS_ZFS_DEGRADED': PRIORITIES['CRITICAL'], # P1 - ZFS pool degraded
'PBS_ZFS_USAGE_CRITICAL': PRIORITIES['HIGH'], # P2 - ZFS pool near full
'PBS_ZFS_USAGE_WARNING': PRIORITIES['MEDIUM'],# P3 - ZFS pool usage high
'PBS_ZFS_ERRORS': PRIORITIES['HIGH'], # P2 - ZFS pool has errors
'PBS_BACKUP_FAILED': PRIORITIES['HIGH'], # P2 - Backup job failed
'PBS_GC_FAILED': PRIORITIES['MEDIUM'], # P3 - Garbage collection failed
'PBS_SYNC_FAILED': PRIORITIES['MEDIUM'] # P3 - Sync job failed
} }
CONFIG = { CONFIG = {
@@ -133,7 +142,11 @@ class SystemHealthMonitor:
'HISTORY_MAX_BYTES': 52428800, # 50MB max storage for history files 'HISTORY_MAX_BYTES': 52428800, # 50MB max storage for history files
# Health check endpoint # Health check endpoint
'HEALTH_SERVER_ENABLED': False, # Enable HTTP health check endpoint 'HEALTH_SERVER_ENABLED': False, # Enable HTTP health check endpoint
'HEALTH_SERVER_PORT': 9102 # Port for health check endpoint 'HEALTH_SERVER_PORT': 9102, # Port for health check endpoint
# PBS (Proxmox Backup Server) monitoring
'PBS_ENABLED': False, # Enable PBS health monitoring
'PBS_ZFS_WARNING': 80, # ZFS pool usage warning threshold %
'PBS_ZFS_CRITICAL': 90 # ZFS pool usage critical threshold %
} }
@classmethod @classmethod
@@ -215,6 +228,20 @@ class SystemHealthMonitor:
cls.CONFIG['HISTORY_MAX_BYTES'] = int(value) cls.CONFIG['HISTORY_MAX_BYTES'] = int(value)
except ValueError: except ValueError:
logger.warning(f"Invalid HISTORY_MAX_BYTES value: {value}") logger.warning(f"Invalid HISTORY_MAX_BYTES value: {value}")
# PBS settings
elif key == 'PBS_ENABLED':
cls.CONFIG['PBS_ENABLED'] = value.lower() in ('true', '1', 'yes')
logger.info(f"✓ Loaded PBS_ENABLED: {cls.CONFIG['PBS_ENABLED']}")
elif key == 'PBS_ZFS_WARNING':
try:
cls.CONFIG['PBS_ZFS_WARNING'] = int(value)
except ValueError:
logger.warning(f"Invalid PBS_ZFS_WARNING value: {value}")
elif key == 'PBS_ZFS_CRITICAL':
try:
cls.CONFIG['PBS_ZFS_CRITICAL'] = int(value)
except ValueError:
logger.warning(f"Invalid PBS_ZFS_CRITICAL value: {value}")
# Health server settings # Health server settings
elif key == 'HEALTH_SERVER_ENABLED': elif key == 'HEALTH_SERVER_ENABLED':
cls.CONFIG['HEALTH_SERVER_ENABLED'] = value.lower() in ('true', '1', 'yes') cls.CONFIG['HEALTH_SERVER_ENABLED'] = value.lower() in ('true', '1', 'yes')
@@ -703,6 +730,8 @@ class SystemHealthMonitor:
'ceph': 'ceph-common', 'ceph': 'ceph-common',
'pct': 'pve-container', 'pct': 'pve-container',
'dmidecode': 'dmidecode', 'dmidecode': 'dmidecode',
'proxmox-backup-manager': 'proxmox-backup-server',
'zpool': 'zfsutils-linux',
} }
availability = {} availability = {}
@@ -841,7 +870,8 @@ class SystemHealthMonitor:
'network_health': self._check_network_status(), 'network_health': self._check_network_status(),
'ceph_health': self._check_ceph_health(), 'ceph_health': self._check_ceph_health(),
'lxc_health': self._check_lxc_storage(), 'lxc_health': self._check_lxc_storage(),
'system_health': self._check_system_drive_indicators() 'system_health': self._check_system_drive_indicators(),
'pbs_health': self._check_pbs_health()
} }
if self.dry_run: if self.dry_run:
@@ -897,6 +927,17 @@ class SystemHealthMonitor:
if health_report['system_health']['issues']: if health_report['system_health']['issues']:
logger.info(f"\nSystem Issues: {len(health_report['system_health']['issues'])} found") logger.info(f"\nSystem Issues: {len(health_report['system_health']['issues'])} found")
# PBS status
pbs = health_report.get('pbs_health', {})
if pbs.get('is_pbs_node'):
logger.info("\nPBS Status:")
for pool in pbs.get('zfs_pools', []):
logger.info(f" ZFS Pool '{pool['name']}': {pool['usage_percent']}% used ({pool['used']}/{pool['total']})")
if pbs.get('failed_tasks'):
logger.info(f" Failed tasks: {len(pbs['failed_tasks'])}")
if pbs.get('issues'):
logger.info(f" Issues: {len(pbs['issues'])}")
logger.info("\n=== End Summary ===") logger.info("\n=== End Summary ===")
return health_report return health_report
@@ -1651,7 +1692,9 @@ class SystemHealthMonitor:
'critical reallocated', 'critical current_pending', 'critical reallocated', 'critical current_pending',
'network is unreachable', 'network is unreachable',
'osd is down', 'osd down', # Ceph OSD down 'osd is down', 'osd down', # Ceph OSD down
'cluster usage critical' # Ceph usage critical 'cluster usage critical', # Ceph usage critical
'zfs pool', 'backup failed', # PBS critical issues
'usage critical' # PBS ZFS critical usage
]): ]):
return self.PRIORITIES['HIGH'] # P2 return self.PRIORITIES['HIGH'] # P2
@@ -1670,7 +1713,8 @@ class SystemHealthMonitor:
'warning', 'high temperature', 'correctable ecc', 'warning', 'high temperature', 'correctable ecc',
'trend alert', 'critical storage usage', 'trend alert', 'critical storage usage',
'low available_spare', 'high wear', 'low available_spare', 'high wear',
'health_warn', 'cluster usage warning' # Ceph warnings 'health_warn', 'cluster usage warning', # Ceph warnings
'gc failed', 'sync failed', 'usage high' # PBS warnings
]): ]):
return self.PRIORITIES['MEDIUM'] # P3 return self.PRIORITIES['MEDIUM'] # P3
@@ -1794,6 +1838,27 @@ class SystemHealthMonitor:
self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM'] self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM']
) )
# PBS Issues - Backup server issues (categorized as Hardware for storage, Software for tasks)
if any(keyword in issue_lower for keyword in [
'pbs', 'zfs pool', 'backup failed', 'gc failed', 'sync failed'
]):
if any(error in issue_lower for error in [
'degraded', 'critical', 'failed', 'errors'
]):
return (
self.TICKET_CATEGORIES['HARDWARE'],
self.TICKET_TYPES['ISSUE'],
'[pbs]',
self.TICKET_TEMPLATES['TICKET_TYPE']['ISSUE']
)
else:
return (
self.TICKET_CATEGORIES['HARDWARE'],
self.TICKET_TYPES['PROBLEM'],
'[pbs]',
self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM']
)
# Default: Hardware Problem (for undefined cases) # Default: Hardware Problem (for undefined cases)
return ( return (
self.TICKET_CATEGORIES['HARDWARE'], self.TICKET_CATEGORIES['HARDWARE'],
@@ -2011,6 +2076,12 @@ class SystemHealthMonitor:
for issue in ceph_health['issues']: for issue in ceph_health['issues']:
issues.append(f"[ceph] {issue}") issues.append(f"[ceph] {issue}")
# Check for PBS issues
pbs_health = health_report.get('pbs_health', {})
if pbs_health.get('is_pbs_node') and pbs_health.get('issues'):
for issue in pbs_health['issues']:
issues.append(f"[pbs] {issue.get('issue', str(issue))}")
logger.info("=== Issue Detection Started ===") logger.info("=== Issue Detection Started ===")
logger.info(f"Checking drives: {len(health_report['drives_health']['drives'])} found") logger.info(f"Checking drives: {len(health_report['drives_health']['drives'])} found")
logger.info(f"Memory status: {health_report['memory_health']['status']}") logger.info(f"Memory status: {health_report['memory_health']['status']}")
@@ -3297,6 +3368,186 @@ class SystemHealthMonitor:
return ceph_health return ceph_health
# =============================================================================
# PBS (PROXMOX BACKUP SERVER) HEALTH CHECKS
# =============================================================================
def _check_pbs_health(self) -> Dict[str, Any]:
    """
    Check Proxmox Backup Server health: ZFS pool state/usage and task results.

    Returns a dict with keys:
        status:       'OK' | 'WARNING' | 'CRITICAL' — worst condition found.
        is_pbs_node:  True only when PBS monitoring is enabled AND the
                      ``zpool`` tool is available on this host.
        zfs_pools:    list of per-pool dicts (name/total/used/usage_percent/health).
        failed_tasks: raw PBS task records whose status is not 'OK'.
        issues:       structured issue dicts (type/severity/device/issue) that
                      feed the issue-detection and ticketing pipeline.

    Inactive (returns the default OK report with ``is_pbs_node=False``) when
    PBS_ENABLED is false or ``zpool`` is missing. The task check additionally
    requires ``proxmox-backup-manager``. All subprocess calls are bounded by a
    30s timeout and failures are logged rather than raised, so this check can
    never abort the overall health run.
    """
    pbs_health: Dict[str, Any] = {
        'status': 'OK',
        'is_pbs_node': False,
        'zfs_pools': [],
        'failed_tasks': [],
        'issues': []
    }

    if not self.CONFIG.get('PBS_ENABLED', False):
        logger.debug("PBS monitoring disabled in config")
        return pbs_health

    if not self._available_tools.get('zpool'):
        logger.debug("zpool not available - skipping PBS ZFS checks")
        return pbs_health

    pbs_health['is_pbs_node'] = True

    # --- ZFS pool state (zpool status -p) ---------------------------------
    try:
        result = subprocess.run(
            ['zpool', 'status', '-p'],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=30
        )
        if result.returncode == 0:
            # zpool status emits one stanza per pool; track which pool the
            # current 'state:'/'errors:' lines belong to.
            current_pool = None
            for line in result.stdout.splitlines():
                line_stripped = line.strip()
                if line_stripped.startswith('pool:'):
                    current_pool = line_stripped.split(':', 1)[1].strip()
                elif line_stripped.startswith('state:') and current_pool:
                    state = line_stripped.split(':', 1)[1].strip()
                    # Any non-ONLINE state (DEGRADED, FAULTED, OFFLINE, ...)
                    # means redundancy or availability is compromised.
                    if state != 'ONLINE':
                        pbs_health['status'] = 'CRITICAL'
                        pbs_health['issues'].append({
                            'type': 'PBS_ZFS_DEGRADED',
                            'severity': 'CRITICAL',
                            'device': current_pool,
                            'issue': f"ZFS pool '{current_pool}' state: {state}"
                        })
                elif line_stripped.startswith('errors:') and current_pool:
                    if 'No known data errors' not in line_stripped:
                        # BUGFIX: previously the issue was recorded without
                        # escalating 'status', so a pool with data errors
                        # (but ONLINE) still reported overall 'OK'. Escalate
                        # to WARNING, never downgrading an existing CRITICAL.
                        if pbs_health['status'] == 'OK':
                            pbs_health['status'] = 'WARNING'
                        pbs_health['issues'].append({
                            'type': 'PBS_ZFS_ERRORS',
                            'severity': 'WARNING',
                            'device': current_pool,
                            'issue': f"ZFS pool '{current_pool}' has errors: {line_stripped}"
                        })
    except subprocess.TimeoutExpired:
        logger.warning("zpool status timed out")
    except Exception as e:
        logger.error(f"Error checking ZFS pool status: {e}")

    # --- ZFS pool usage (zpool list -Hp: tab-separated, exact bytes) ------
    try:
        result = subprocess.run(
            ['zpool', 'list', '-Hp'],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=30
        )
        if result.returncode == 0:
            for line in result.stdout.splitlines():
                # Default columns: name size alloc free ckpoint expandsz
                # frag cap dedup health altroot
                parts = line.split('\t')
                if len(parts) >= 8:
                    pool_name = parts[0]
                    try:
                        total_bytes = int(parts[1])
                        used_bytes = int(parts[2])
                        usage_pct = (used_bytes / total_bytes * 100) if total_bytes > 0 else 0
                    except (ValueError, ZeroDivisionError):
                        # Skip malformed rows rather than failing the check.
                        continue

                    pool_info = {
                        'name': pool_name,
                        'total': self._convert_bytes(total_bytes),
                        'used': self._convert_bytes(used_bytes),
                        'usage_percent': round(usage_pct, 1),
                        'health': parts[9] if len(parts) > 9 else 'UNKNOWN'
                    }
                    pbs_health['zfs_pools'].append(pool_info)

                    # Threshold checks against configurable limits.
                    if usage_pct >= self.CONFIG['PBS_ZFS_CRITICAL']:
                        pbs_health['status'] = 'CRITICAL'
                        pbs_health['issues'].append({
                            'type': 'PBS_ZFS_USAGE_CRITICAL',
                            'severity': 'CRITICAL',
                            'device': pool_name,
                            'issue': f"ZFS pool '{pool_name}' usage critical: {usage_pct:.1f}%"
                        })
                    elif usage_pct >= self.CONFIG['PBS_ZFS_WARNING']:
                        # Don't downgrade an already-CRITICAL overall status.
                        if pbs_health['status'] != 'CRITICAL':
                            pbs_health['status'] = 'WARNING'
                        pbs_health['issues'].append({
                            'type': 'PBS_ZFS_USAGE_WARNING',
                            'severity': 'WARNING',
                            'device': pool_name,
                            'issue': f"ZFS pool '{pool_name}' usage high: {usage_pct:.1f}%"
                        })
    except subprocess.TimeoutExpired:
        logger.warning("zpool list timed out")
    except Exception as e:
        logger.error(f"Error checking ZFS pool usage: {e}")

    # --- Failed PBS tasks (requires proxmox-backup-manager) ---------------
    if self._available_tools.get('proxmox-backup-manager'):
        try:
            result = subprocess.run(
                ['proxmox-backup-manager', 'task', 'list', '--output-format', 'json'],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                timeout=30
            )
            if result.returncode == 0:
                try:
                    tasks = json.loads(result.stdout)
                    for task in tasks:
                        task_status = task.get('status', '')
                        task_type = task.get('worker_type', '')
                        task_id = task.get('worker_id', '')

                        # Anything other than 'OK' (and non-empty) is a failure.
                        if task_status and task_status != 'OK':
                            failed_task = {
                                'type': task_type,
                                'id': task_id,
                                'status': task_status,
                                'starttime': task.get('starttime', ''),
                                'endtime': task.get('endtime', '')
                            }
                            pbs_health['failed_tasks'].append(failed_task)

                            # Categorize by task type: failed backups are
                            # critical (data at risk); GC/sync failures are
                            # warnings (degraded but recoverable).
                            if 'backup' in task_type.lower():
                                issue_type = 'PBS_BACKUP_FAILED'
                                severity = 'CRITICAL'
                            elif 'gc' in task_type.lower() or 'garbage' in task_type.lower():
                                issue_type = 'PBS_GC_FAILED'
                                severity = 'WARNING'
                            elif 'sync' in task_type.lower():
                                issue_type = 'PBS_SYNC_FAILED'
                                severity = 'WARNING'
                            else:
                                # Unknown worker type: treat as a backup-class
                                # failure but only at WARNING severity.
                                issue_type = 'PBS_BACKUP_FAILED'
                                severity = 'WARNING'

                            pbs_health['issues'].append({
                                'type': issue_type,
                                'severity': severity,
                                'device': f"task-{task_type}",
                                'issue': f"PBS {task_type} failed: {task_id} - {task_status}"
                            })

                            if severity == 'CRITICAL':
                                pbs_health['status'] = 'CRITICAL'
                            elif pbs_health['status'] == 'OK':
                                pbs_health['status'] = 'WARNING'
                except json.JSONDecodeError as e:
                    logger.warning(f"Failed to parse PBS task list JSON: {e}")
        except subprocess.TimeoutExpired:
            logger.warning("proxmox-backup-manager task list timed out")
        except Exception as e:
            logger.error(f"Error checking PBS tasks: {e}")

    return pbs_health
# ============================================================================= # =============================================================================
# PROMETHEUS METRICS EXPORT # PROMETHEUS METRICS EXPORT
# ============================================================================= # =============================================================================
@@ -3443,6 +3694,18 @@ class SystemHealthMonitor:
usage = fs.get('usage_percent', 0) usage = fs.get('usage_percent', 0)
metrics.append(f'hwmon_lxc_storage_usage_percent{labels(hostname=hostname, vmid=vmid, mountpoint=mountpoint)} {usage}') metrics.append(f'hwmon_lxc_storage_usage_percent{labels(hostname=hostname, vmid=vmid, mountpoint=mountpoint)} {usage}')
# === PBS Metrics ===
pbs = health_report.get('pbs_health', {})
if pbs.get('is_pbs_node'):
metrics.append(f'# HELP hwmon_pbs_zfs_usage_percent PBS ZFS pool usage percentage')
metrics.append(f'# TYPE hwmon_pbs_zfs_usage_percent gauge')
for pool in pbs.get('zfs_pools', []):
metrics.append(f'hwmon_pbs_zfs_usage_percent{labels(hostname=hostname, pool=pool["name"])} {pool["usage_percent"]}')
metrics.append(f'# HELP hwmon_pbs_failed_tasks_total PBS failed task count')
metrics.append(f'# TYPE hwmon_pbs_failed_tasks_total gauge')
metrics.append(f'hwmon_pbs_failed_tasks_total{labels(hostname=hostname)} {len(pbs.get("failed_tasks", []))}')
# === Issue Summary Metrics === # === Issue Summary Metrics ===
metrics.append(f'# HELP hwmon_issues_total Total number of issues detected') metrics.append(f'# HELP hwmon_issues_total Total number of issues detected')
metrics.append(f'# TYPE hwmon_issues_total gauge') metrics.append(f'# TYPE hwmon_issues_total gauge')
@@ -3450,7 +3713,8 @@ class SystemHealthMonitor:
system_issues = len(health_report.get('system_health', {}).get('issues', [])) system_issues = len(health_report.get('system_health', {}).get('issues', []))
ceph_issues = len(ceph.get('issues', [])) + len(ceph.get('cluster_wide_issues', [])) ceph_issues = len(ceph.get('issues', [])) + len(ceph.get('cluster_wide_issues', []))
lxc_issues = len(lxc.get('issues', [])) lxc_issues = len(lxc.get('issues', []))
total_issues = system_issues + ceph_issues + lxc_issues pbs_issues = len(pbs.get('issues', []))
total_issues = system_issues + ceph_issues + lxc_issues + pbs_issues
metrics.append(f'hwmon_issues_total{labels(hostname=hostname)} {total_issues}') metrics.append(f'hwmon_issues_total{labels(hostname=hostname)} {total_issues}')
return '\n'.join(metrics) + '\n' return '\n'.join(metrics) + '\n'