diff --git a/hwmonDaemon.py b/hwmonDaemon.py
index afdcb02..d703ac8 100644
--- a/hwmonDaemon.py
+++ b/hwmonDaemon.py
@@ -79,7 +79,16 @@ class SystemHealthMonitor:
         'CEPH_USAGE_CRITICAL': PRIORITIES['HIGH'],    # P2 - Cluster near full
         'CEPH_USAGE_WARNING': PRIORITIES['MEDIUM'],   # P3 - Cluster usage high
         'CEPH_PG_DEGRADED': PRIORITIES['HIGH'],       # P2 - PGs degraded
-        'CEPH_MON_DOWN': PRIORITIES['HIGH']           # P2 - Monitor down
+        'CEPH_MON_DOWN': PRIORITIES['HIGH'],          # P2 - Monitor down
+
+        # PBS (Proxmox Backup Server) issues
+        'PBS_ZFS_DEGRADED': PRIORITIES['CRITICAL'],   # P1 - ZFS pool degraded
+        'PBS_ZFS_USAGE_CRITICAL': PRIORITIES['HIGH'], # P2 - ZFS pool near full
+        'PBS_ZFS_USAGE_WARNING': PRIORITIES['MEDIUM'],# P3 - ZFS pool usage high
+        'PBS_ZFS_ERRORS': PRIORITIES['HIGH'],         # P2 - ZFS pool has errors
+        'PBS_BACKUP_FAILED': PRIORITIES['HIGH'],      # P2 - Backup job failed
+        'PBS_GC_FAILED': PRIORITIES['MEDIUM'],        # P3 - Garbage collection failed
+        'PBS_SYNC_FAILED': PRIORITIES['MEDIUM']       # P3 - Sync job failed
     }
 
     CONFIG = {
@@ -133,7 +142,11 @@ class SystemHealthMonitor:
         'HISTORY_MAX_BYTES': 52428800,        # 50MB max storage for history files
         # Health check endpoint
         'HEALTH_SERVER_ENABLED': False,       # Enable HTTP health check endpoint
-        'HEALTH_SERVER_PORT': 9102            # Port for health check endpoint
+        'HEALTH_SERVER_PORT': 9102,           # Port for health check endpoint
+        # PBS (Proxmox Backup Server) monitoring
+        'PBS_ENABLED': False,                 # Enable PBS health monitoring
+        'PBS_ZFS_WARNING': 80,                # ZFS pool usage warning threshold %
+        'PBS_ZFS_CRITICAL': 90                # ZFS pool usage critical threshold %
     }
 
     @classmethod
@@ -215,6 +228,20 @@ class SystemHealthMonitor:
                         cls.CONFIG['HISTORY_MAX_BYTES'] = int(value)
                     except ValueError:
                         logger.warning(f"Invalid HISTORY_MAX_BYTES value: {value}")
+                # PBS settings
+                elif key == 'PBS_ENABLED':
+                    cls.CONFIG['PBS_ENABLED'] = value.lower() in ('true', '1', 'yes')
+                    logger.info(f"✓ Loaded PBS_ENABLED: {cls.CONFIG['PBS_ENABLED']}")
+                elif key == 'PBS_ZFS_WARNING':
+                    try:
+                        cls.CONFIG['PBS_ZFS_WARNING'] = int(value)
+                    except ValueError:
+                        logger.warning(f"Invalid PBS_ZFS_WARNING value: {value}")
+                elif key == 'PBS_ZFS_CRITICAL':
+                    try:
+                        cls.CONFIG['PBS_ZFS_CRITICAL'] = int(value)
+                    except ValueError:
+                        logger.warning(f"Invalid PBS_ZFS_CRITICAL value: {value}")
                 # Health server settings
                 elif key == 'HEALTH_SERVER_ENABLED':
                     cls.CONFIG['HEALTH_SERVER_ENABLED'] = value.lower() in ('true', '1', 'yes')
@@ -703,6 +730,8 @@ class SystemHealthMonitor:
             'ceph': 'ceph-common',
             'pct': 'pve-container',
             'dmidecode': 'dmidecode',
+            'proxmox-backup-manager': 'proxmox-backup-server',
+            'zpool': 'zfsutils-linux',
         }
 
         availability = {}
@@ -841,7 +870,8 @@ class SystemHealthMonitor:
             'network_health': self._check_network_status(),
             'ceph_health': self._check_ceph_health(),
             'lxc_health': self._check_lxc_storage(),
-            'system_health': self._check_system_drive_indicators()
+            'system_health': self._check_system_drive_indicators(),
+            'pbs_health': self._check_pbs_health()
         }
 
         if self.dry_run:
@@ -896,7 +926,18 @@ class SystemHealthMonitor:
 
         if health_report['system_health']['issues']:
             logger.info(f"\nSystem Issues: {len(health_report['system_health']['issues'])} found")
-
+
+        # PBS status
+        pbs = health_report.get('pbs_health', {})
+        if pbs.get('is_pbs_node'):
+            logger.info("\nPBS Status:")
+            for pool in pbs.get('zfs_pools', []):
+                logger.info(f"  ZFS Pool '{pool['name']}': {pool['usage_percent']}% used ({pool['used']}/{pool['total']})")
+            if pbs.get('failed_tasks'):
+                logger.info(f"  Failed tasks: {len(pbs['failed_tasks'])}")
+            if pbs.get('issues'):
+                logger.info(f"  Issues: {len(pbs['issues'])}")
+
         logger.info("\n=== End Summary ===")
 
         return health_report
@@ -1651,7 +1692,9 @@ class SystemHealthMonitor:
             'critical reallocated', 'critical current_pending',
             'network is unreachable',
             'osd is down', 'osd down',  # Ceph OSD down
-            'cluster usage critical'  # Ceph usage critical
+            'cluster usage critical',  # Ceph usage critical
+            'zfs pool', 'backup failed',  # PBS critical issues
+            'usage critical'  # PBS ZFS critical usage
         ]):
             return self.PRIORITIES['HIGH']  # P2
 
@@ -1670,7 +1713,8 @@ class SystemHealthMonitor:
             'warning', 'high temperature', 'correctable ecc',
             'trend alert', 'critical storage usage',
             'low available_spare', 'high wear',
-            'health_warn', 'cluster usage warning'  # Ceph warnings
+            'health_warn', 'cluster usage warning',  # Ceph warnings
+            'gc failed', 'sync failed', 'usage high'  # PBS warnings
         ]):
             return self.PRIORITIES['MEDIUM']  # P3
 
@@ -1794,6 +1838,27 @@ class SystemHealthMonitor:
                 self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM']
             )
 
+        # PBS Issues - Backup server issues (all categorized as Hardware; ISSUE for degraded/critical/failed/errors, PROBLEM otherwise)
+        if any(keyword in issue_lower for keyword in [
+            'pbs', 'zfs pool', 'backup failed', 'gc failed', 'sync failed'
+        ]):
+            if any(error in issue_lower for error in [
+                'degraded', 'critical', 'failed', 'errors'
+            ]):
+                return (
+                    self.TICKET_CATEGORIES['HARDWARE'],
+                    self.TICKET_TYPES['ISSUE'],
+                    '[pbs]',
+                    self.TICKET_TEMPLATES['TICKET_TYPE']['ISSUE']
+                )
+            else:
+                return (
+                    self.TICKET_CATEGORIES['HARDWARE'],
+                    self.TICKET_TYPES['PROBLEM'],
+                    '[pbs]',
+                    self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM']
+                )
+
         # Default: Hardware Problem (for undefined cases)
         return (
             self.TICKET_CATEGORIES['HARDWARE'],
@@ -2011,6 +2076,12 @@ class SystemHealthMonitor:
             for issue in ceph_health['issues']:
                 issues.append(f"[ceph] {issue}")
 
+        # Check for PBS issues
+        pbs_health = health_report.get('pbs_health', {})
+        if pbs_health.get('is_pbs_node') and pbs_health.get('issues'):
+            for issue in pbs_health['issues']:
+                issues.append(f"[pbs] {issue.get('issue', str(issue))}")
+
         logger.info("=== Issue Detection Started ===")
         logger.info(f"Checking drives: {len(health_report['drives_health']['drives'])} found")
         logger.info(f"Memory status: {health_report['memory_health']['status']}")
@@ -3297,6 +3368,186 @@ class SystemHealthMonitor:
 
         return ceph_health
 
+    # =============================================================================
+    # PBS (PROXMOX BACKUP SERVER) HEALTH CHECKS
+    # =============================================================================
+    def _check_pbs_health(self) -> Dict[str, Any]:
+        """
+        Check Proxmox Backup Server health including ZFS pools and task status.
+
+        Returns health status for ZFS pools, failed backup/GC/sync jobs.
+        Only active when PBS_ENABLED=true and relevant tools are available.
+        """
+        pbs_health = {
+            'status': 'OK',
+            'is_pbs_node': False,
+            'zfs_pools': [],
+            'failed_tasks': [],
+            'issues': []
+        }
+
+        if not self.CONFIG.get('PBS_ENABLED', False):
+            logger.debug("PBS monitoring disabled in config")
+            return pbs_health
+
+        if not self._available_tools.get('zpool'):
+            logger.debug("zpool not available - skipping PBS ZFS checks")
+            return pbs_health
+
+        pbs_health['is_pbs_node'] = True
+
+        # Check ZFS pool status
+        try:
+            result = subprocess.run(
+                ['zpool', 'status', '-p'],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+                timeout=30
+            )
+            if result.returncode == 0:
+                current_pool = None
+                for line in result.stdout.splitlines():
+                    line_stripped = line.strip()
+                    if line_stripped.startswith('pool:'):
+                        current_pool = line_stripped.split(':', 1)[1].strip()
+                    elif line_stripped.startswith('state:') and current_pool:
+                        state = line_stripped.split(':', 1)[1].strip()
+                        if state != 'ONLINE':
+                            pbs_health['status'] = 'CRITICAL'
+                            pbs_health['issues'].append({
+                                'type': 'PBS_ZFS_DEGRADED',
+                                'severity': 'CRITICAL',
+                                'device': current_pool,
+                                'issue': f"ZFS pool '{current_pool}' state: {state}"
+                            })
+                    elif line_stripped.startswith('errors:') and current_pool:
+                        if 'No known data errors' not in line_stripped:
+                            pbs_health['issues'].append({
+                                'type': 'PBS_ZFS_ERRORS',
+                                'severity': 'WARNING',
+                                'device': current_pool,
+                                'issue': f"ZFS pool '{current_pool}' has errors: {line_stripped}"
+                            })
+        except subprocess.TimeoutExpired:
+            logger.warning("zpool status timed out")
+        except Exception as e:
+            logger.error(f"Error checking ZFS pool status: {e}")
+
+        # Check ZFS pool usage
+        try:
+            result = subprocess.run(
+                ['zpool', 'list', '-Hp'],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+                timeout=30
+            )
+            if result.returncode == 0:
+                for line in result.stdout.splitlines():
+                    parts = line.split('\t')
+                    if len(parts) >= 8:
+                        pool_name = parts[0]
+                        try:
+                            total_bytes = int(parts[1])
+                            used_bytes = int(parts[2])
+                            usage_pct = (used_bytes / total_bytes * 100) if total_bytes > 0 else 0
+                        except (ValueError, ZeroDivisionError):
+                            continue
+
+                        pool_info = {
+                            'name': pool_name,
+                            'total': self._convert_bytes(total_bytes),
+                            'used': self._convert_bytes(used_bytes),
+                            'usage_percent': round(usage_pct, 1),
+                            'health': parts[9] if len(parts) > 9 else 'UNKNOWN'
+                        }
+                        pbs_health['zfs_pools'].append(pool_info)
+
+                        if usage_pct >= self.CONFIG['PBS_ZFS_CRITICAL']:
+                            pbs_health['status'] = 'CRITICAL'
+                            pbs_health['issues'].append({
+                                'type': 'PBS_ZFS_USAGE_CRITICAL',
+                                'severity': 'CRITICAL',
+                                'device': pool_name,
+                                'issue': f"ZFS pool '{pool_name}' usage critical: {usage_pct:.1f}%"
+                            })
+                        elif usage_pct >= self.CONFIG['PBS_ZFS_WARNING']:
+                            if pbs_health['status'] != 'CRITICAL':
+                                pbs_health['status'] = 'WARNING'
+                            pbs_health['issues'].append({
+                                'type': 'PBS_ZFS_USAGE_WARNING',
+                                'severity': 'WARNING',
+                                'device': pool_name,
+                                'issue': f"ZFS pool '{pool_name}' usage high: {usage_pct:.1f}%"
+                            })
+        except subprocess.TimeoutExpired:
+            logger.warning("zpool list timed out")
+        except Exception as e:
+            logger.error(f"Error checking ZFS pool usage: {e}")
+
+        # Check failed PBS tasks (requires proxmox-backup-manager)
+        if self._available_tools.get('proxmox-backup-manager'):
+            try:
+                result = subprocess.run(
+                    ['proxmox-backup-manager', 'task', 'list', '--output-format', 'json'],
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.PIPE,
+                    text=True,
+                    timeout=30
+                )
+                if result.returncode == 0:
+                    try:
+                        tasks = json.loads(result.stdout)
+                        for task in tasks:
+                            task_status = task.get('status', '')
+                            task_type = task.get('worker_type', '')
+                            task_id = task.get('worker_id', '')
+
+                            if task_status and task_status != 'OK':
+                                failed_task = {
+                                    'type': task_type,
+                                    'id': task_id,
+                                    'status': task_status,
+                                    'starttime': task.get('starttime', ''),
+                                    'endtime': task.get('endtime', '')
+                                }
+                                pbs_health['failed_tasks'].append(failed_task)
+
+                                # Categorize by task type
+                                if 'backup' in task_type.lower():
+                                    issue_type = 'PBS_BACKUP_FAILED'
+                                    severity = 'CRITICAL'
+                                elif 'gc' in task_type.lower() or 'garbage' in task_type.lower():
+                                    issue_type = 'PBS_GC_FAILED'
+                                    severity = 'WARNING'
+                                elif 'sync' in task_type.lower():
+                                    issue_type = 'PBS_SYNC_FAILED'
+                                    severity = 'WARNING'
+                                else:
+                                    issue_type = 'PBS_BACKUP_FAILED'
+                                    severity = 'WARNING'
+
+                                pbs_health['issues'].append({
+                                    'type': issue_type,
+                                    'severity': severity,
+                                    'device': f"task-{task_type}",
+                                    'issue': f"PBS {task_type} failed: {task_id} - {task_status}"
+                                })
+
+                                if severity == 'CRITICAL':
+                                    pbs_health['status'] = 'CRITICAL'
+                                elif pbs_health['status'] == 'OK':
+                                    pbs_health['status'] = 'WARNING'
+                    except json.JSONDecodeError as e:
+                        logger.warning(f"Failed to parse PBS task list JSON: {e}")
+            except subprocess.TimeoutExpired:
+                logger.warning("proxmox-backup-manager task list timed out")
+            except Exception as e:
+                logger.error(f"Error checking PBS tasks: {e}")
+
+        return pbs_health
+
     # =============================================================================
     # PROMETHEUS METRICS EXPORT
     # =============================================================================
@@ -3443,6 +3694,18 @@ class SystemHealthMonitor:
                 usage = fs.get('usage_percent', 0)
                 metrics.append(f'hwmon_lxc_storage_usage_percent{labels(hostname=hostname, vmid=vmid, mountpoint=mountpoint)} {usage}')
 
+        # === PBS Metrics ===
+        pbs = health_report.get('pbs_health', {})
+        if pbs.get('is_pbs_node'):
+            metrics.append(f'# HELP hwmon_pbs_zfs_usage_percent PBS ZFS pool usage percentage')
+            metrics.append(f'# TYPE hwmon_pbs_zfs_usage_percent gauge')
+            for pool in pbs.get('zfs_pools', []):
+                metrics.append(f'hwmon_pbs_zfs_usage_percent{labels(hostname=hostname, pool=pool["name"])} {pool["usage_percent"]}')
+
+            metrics.append(f'# HELP hwmon_pbs_failed_tasks_total PBS failed task count')
+            metrics.append(f'# TYPE hwmon_pbs_failed_tasks_total gauge')
+            metrics.append(f'hwmon_pbs_failed_tasks_total{labels(hostname=hostname)} {len(pbs.get("failed_tasks", []))}')
+
         # === Issue Summary Metrics ===
         metrics.append(f'# HELP hwmon_issues_total Total number of issues detected')
         metrics.append(f'# TYPE hwmon_issues_total gauge')
@@ -3450,7 +3713,8 @@ class SystemHealthMonitor:
         system_issues = len(health_report.get('system_health', {}).get('issues', []))
         ceph_issues = len(ceph.get('issues', [])) + len(ceph.get('cluster_wide_issues', []))
         lxc_issues = len(lxc.get('issues', []))
-        total_issues = system_issues + ceph_issues + lxc_issues
+        pbs_issues = len(pbs.get('issues', []))
+        total_issues = system_issues + ceph_issues + lxc_issues + pbs_issues
         metrics.append(f'hwmon_issues_total{labels(hostname=hostname)} {total_issues}')
 
         return '\n'.join(metrics) + '\n'