Add Proxmox Backup Server (PBS) health monitoring support
Monitors ZFS pool status/usage and failed PBS tasks (backup, GC, sync). Includes configurable thresholds (PBS_ZFS_WARNING/CRITICAL), Prometheus metrics (hwmon_pbs_*), a dry-run summary, issue categorization, and priority classification. Enabled via PBS_ENABLED=true in the .env config.

Fixes: #5

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
276
hwmonDaemon.py
276
hwmonDaemon.py
@@ -79,7 +79,16 @@ class SystemHealthMonitor:
|
|||||||
'CEPH_USAGE_CRITICAL': PRIORITIES['HIGH'], # P2 - Cluster near full
|
'CEPH_USAGE_CRITICAL': PRIORITIES['HIGH'], # P2 - Cluster near full
|
||||||
'CEPH_USAGE_WARNING': PRIORITIES['MEDIUM'], # P3 - Cluster usage high
|
'CEPH_USAGE_WARNING': PRIORITIES['MEDIUM'], # P3 - Cluster usage high
|
||||||
'CEPH_PG_DEGRADED': PRIORITIES['HIGH'], # P2 - PGs degraded
|
'CEPH_PG_DEGRADED': PRIORITIES['HIGH'], # P2 - PGs degraded
|
||||||
'CEPH_MON_DOWN': PRIORITIES['HIGH'] # P2 - Monitor down
|
'CEPH_MON_DOWN': PRIORITIES['HIGH'], # P2 - Monitor down
|
||||||
|
|
||||||
|
# PBS (Proxmox Backup Server) issues
|
||||||
|
'PBS_ZFS_DEGRADED': PRIORITIES['CRITICAL'], # P1 - ZFS pool degraded
|
||||||
|
'PBS_ZFS_USAGE_CRITICAL': PRIORITIES['HIGH'], # P2 - ZFS pool near full
|
||||||
|
'PBS_ZFS_USAGE_WARNING': PRIORITIES['MEDIUM'],# P3 - ZFS pool usage high
|
||||||
|
'PBS_ZFS_ERRORS': PRIORITIES['HIGH'], # P2 - ZFS pool has errors
|
||||||
|
'PBS_BACKUP_FAILED': PRIORITIES['HIGH'], # P2 - Backup job failed
|
||||||
|
'PBS_GC_FAILED': PRIORITIES['MEDIUM'], # P3 - Garbage collection failed
|
||||||
|
'PBS_SYNC_FAILED': PRIORITIES['MEDIUM'] # P3 - Sync job failed
|
||||||
}
|
}
|
||||||
|
|
||||||
CONFIG = {
|
CONFIG = {
|
||||||
@@ -133,7 +142,11 @@ class SystemHealthMonitor:
|
|||||||
'HISTORY_MAX_BYTES': 52428800, # 50MB max storage for history files
|
'HISTORY_MAX_BYTES': 52428800, # 50MB max storage for history files
|
||||||
# Health check endpoint
|
# Health check endpoint
|
||||||
'HEALTH_SERVER_ENABLED': False, # Enable HTTP health check endpoint
|
'HEALTH_SERVER_ENABLED': False, # Enable HTTP health check endpoint
|
||||||
'HEALTH_SERVER_PORT': 9102 # Port for health check endpoint
|
'HEALTH_SERVER_PORT': 9102, # Port for health check endpoint
|
||||||
|
# PBS (Proxmox Backup Server) monitoring
|
||||||
|
'PBS_ENABLED': False, # Enable PBS health monitoring
|
||||||
|
'PBS_ZFS_WARNING': 80, # ZFS pool usage warning threshold %
|
||||||
|
'PBS_ZFS_CRITICAL': 90 # ZFS pool usage critical threshold %
|
||||||
}
|
}
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@@ -215,6 +228,20 @@ class SystemHealthMonitor:
|
|||||||
cls.CONFIG['HISTORY_MAX_BYTES'] = int(value)
|
cls.CONFIG['HISTORY_MAX_BYTES'] = int(value)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
logger.warning(f"Invalid HISTORY_MAX_BYTES value: {value}")
|
logger.warning(f"Invalid HISTORY_MAX_BYTES value: {value}")
|
||||||
|
# PBS settings
|
||||||
|
elif key == 'PBS_ENABLED':
|
||||||
|
cls.CONFIG['PBS_ENABLED'] = value.lower() in ('true', '1', 'yes')
|
||||||
|
logger.info(f"✓ Loaded PBS_ENABLED: {cls.CONFIG['PBS_ENABLED']}")
|
||||||
|
elif key == 'PBS_ZFS_WARNING':
|
||||||
|
try:
|
||||||
|
cls.CONFIG['PBS_ZFS_WARNING'] = int(value)
|
||||||
|
except ValueError:
|
||||||
|
logger.warning(f"Invalid PBS_ZFS_WARNING value: {value}")
|
||||||
|
elif key == 'PBS_ZFS_CRITICAL':
|
||||||
|
try:
|
||||||
|
cls.CONFIG['PBS_ZFS_CRITICAL'] = int(value)
|
||||||
|
except ValueError:
|
||||||
|
logger.warning(f"Invalid PBS_ZFS_CRITICAL value: {value}")
|
||||||
# Health server settings
|
# Health server settings
|
||||||
elif key == 'HEALTH_SERVER_ENABLED':
|
elif key == 'HEALTH_SERVER_ENABLED':
|
||||||
cls.CONFIG['HEALTH_SERVER_ENABLED'] = value.lower() in ('true', '1', 'yes')
|
cls.CONFIG['HEALTH_SERVER_ENABLED'] = value.lower() in ('true', '1', 'yes')
|
||||||
@@ -703,6 +730,8 @@ class SystemHealthMonitor:
|
|||||||
'ceph': 'ceph-common',
|
'ceph': 'ceph-common',
|
||||||
'pct': 'pve-container',
|
'pct': 'pve-container',
|
||||||
'dmidecode': 'dmidecode',
|
'dmidecode': 'dmidecode',
|
||||||
|
'proxmox-backup-manager': 'proxmox-backup-server',
|
||||||
|
'zpool': 'zfsutils-linux',
|
||||||
}
|
}
|
||||||
|
|
||||||
availability = {}
|
availability = {}
|
||||||
@@ -841,7 +870,8 @@ class SystemHealthMonitor:
|
|||||||
'network_health': self._check_network_status(),
|
'network_health': self._check_network_status(),
|
||||||
'ceph_health': self._check_ceph_health(),
|
'ceph_health': self._check_ceph_health(),
|
||||||
'lxc_health': self._check_lxc_storage(),
|
'lxc_health': self._check_lxc_storage(),
|
||||||
'system_health': self._check_system_drive_indicators()
|
'system_health': self._check_system_drive_indicators(),
|
||||||
|
'pbs_health': self._check_pbs_health()
|
||||||
}
|
}
|
||||||
|
|
||||||
if self.dry_run:
|
if self.dry_run:
|
||||||
@@ -897,6 +927,17 @@ class SystemHealthMonitor:
|
|||||||
if health_report['system_health']['issues']:
|
if health_report['system_health']['issues']:
|
||||||
logger.info(f"\nSystem Issues: {len(health_report['system_health']['issues'])} found")
|
logger.info(f"\nSystem Issues: {len(health_report['system_health']['issues'])} found")
|
||||||
|
|
||||||
|
# PBS status
|
||||||
|
pbs = health_report.get('pbs_health', {})
|
||||||
|
if pbs.get('is_pbs_node'):
|
||||||
|
logger.info("\nPBS Status:")
|
||||||
|
for pool in pbs.get('zfs_pools', []):
|
||||||
|
logger.info(f" ZFS Pool '{pool['name']}': {pool['usage_percent']}% used ({pool['used']}/{pool['total']})")
|
||||||
|
if pbs.get('failed_tasks'):
|
||||||
|
logger.info(f" Failed tasks: {len(pbs['failed_tasks'])}")
|
||||||
|
if pbs.get('issues'):
|
||||||
|
logger.info(f" Issues: {len(pbs['issues'])}")
|
||||||
|
|
||||||
logger.info("\n=== End Summary ===")
|
logger.info("\n=== End Summary ===")
|
||||||
|
|
||||||
return health_report
|
return health_report
|
||||||
@@ -1651,7 +1692,9 @@ class SystemHealthMonitor:
|
|||||||
'critical reallocated', 'critical current_pending',
|
'critical reallocated', 'critical current_pending',
|
||||||
'network is unreachable',
|
'network is unreachable',
|
||||||
'osd is down', 'osd down', # Ceph OSD down
|
'osd is down', 'osd down', # Ceph OSD down
|
||||||
'cluster usage critical' # Ceph usage critical
|
'cluster usage critical', # Ceph usage critical
|
||||||
|
'zfs pool', 'backup failed', # PBS critical issues
|
||||||
|
'usage critical' # PBS ZFS critical usage
|
||||||
]):
|
]):
|
||||||
return self.PRIORITIES['HIGH'] # P2
|
return self.PRIORITIES['HIGH'] # P2
|
||||||
|
|
||||||
@@ -1670,7 +1713,8 @@ class SystemHealthMonitor:
|
|||||||
'warning', 'high temperature', 'correctable ecc',
|
'warning', 'high temperature', 'correctable ecc',
|
||||||
'trend alert', 'critical storage usage',
|
'trend alert', 'critical storage usage',
|
||||||
'low available_spare', 'high wear',
|
'low available_spare', 'high wear',
|
||||||
'health_warn', 'cluster usage warning' # Ceph warnings
|
'health_warn', 'cluster usage warning', # Ceph warnings
|
||||||
|
'gc failed', 'sync failed', 'usage high' # PBS warnings
|
||||||
]):
|
]):
|
||||||
return self.PRIORITIES['MEDIUM'] # P3
|
return self.PRIORITIES['MEDIUM'] # P3
|
||||||
|
|
||||||
@@ -1794,6 +1838,27 @@ class SystemHealthMonitor:
|
|||||||
self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM']
|
self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM']
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# PBS Issues - Backup server issues (categorized as Hardware for storage, Software for tasks)
|
||||||
|
if any(keyword in issue_lower for keyword in [
|
||||||
|
'pbs', 'zfs pool', 'backup failed', 'gc failed', 'sync failed'
|
||||||
|
]):
|
||||||
|
if any(error in issue_lower for error in [
|
||||||
|
'degraded', 'critical', 'failed', 'errors'
|
||||||
|
]):
|
||||||
|
return (
|
||||||
|
self.TICKET_CATEGORIES['HARDWARE'],
|
||||||
|
self.TICKET_TYPES['ISSUE'],
|
||||||
|
'[pbs]',
|
||||||
|
self.TICKET_TEMPLATES['TICKET_TYPE']['ISSUE']
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
return (
|
||||||
|
self.TICKET_CATEGORIES['HARDWARE'],
|
||||||
|
self.TICKET_TYPES['PROBLEM'],
|
||||||
|
'[pbs]',
|
||||||
|
self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM']
|
||||||
|
)
|
||||||
|
|
||||||
# Default: Hardware Problem (for undefined cases)
|
# Default: Hardware Problem (for undefined cases)
|
||||||
return (
|
return (
|
||||||
self.TICKET_CATEGORIES['HARDWARE'],
|
self.TICKET_CATEGORIES['HARDWARE'],
|
||||||
@@ -2011,6 +2076,12 @@ class SystemHealthMonitor:
|
|||||||
for issue in ceph_health['issues']:
|
for issue in ceph_health['issues']:
|
||||||
issues.append(f"[ceph] {issue}")
|
issues.append(f"[ceph] {issue}")
|
||||||
|
|
||||||
|
# Check for PBS issues
|
||||||
|
pbs_health = health_report.get('pbs_health', {})
|
||||||
|
if pbs_health.get('is_pbs_node') and pbs_health.get('issues'):
|
||||||
|
for issue in pbs_health['issues']:
|
||||||
|
issues.append(f"[pbs] {issue.get('issue', str(issue))}")
|
||||||
|
|
||||||
logger.info("=== Issue Detection Started ===")
|
logger.info("=== Issue Detection Started ===")
|
||||||
logger.info(f"Checking drives: {len(health_report['drives_health']['drives'])} found")
|
logger.info(f"Checking drives: {len(health_report['drives_health']['drives'])} found")
|
||||||
logger.info(f"Memory status: {health_report['memory_health']['status']}")
|
logger.info(f"Memory status: {health_report['memory_health']['status']}")
|
||||||
@@ -3297,6 +3368,186 @@ class SystemHealthMonitor:
|
|||||||
|
|
||||||
return ceph_health
|
return ceph_health
|
||||||
|
|
||||||
|
# =============================================================================
# PBS (PROXMOX BACKUP SERVER) HEALTH CHECKS
# =============================================================================
def _check_pbs_health(self) -> Dict[str, Any]:
    """
    Check Proxmox Backup Server health: ZFS pool state/usage and task results.

    Returns:
        Dict with keys:
          - 'status': aggregate 'OK' | 'WARNING' | 'CRITICAL'
          - 'is_pbs_node': True only when PBS monitoring is enabled and the
            zpool tool is available on this host
          - 'zfs_pools': per-pool dicts (name/total/used/usage_percent/health)
          - 'failed_tasks': raw failed-task records from proxmox-backup-manager
          - 'issues': structured issue dicts (type/severity/device/issue)

    Only active when CONFIG['PBS_ENABLED'] is true; task checks additionally
    require proxmox-backup-manager. Never raises: subprocess and parse
    failures are logged and the corresponding check is skipped.
    """
    pbs_health = {
        'status': 'OK',
        'is_pbs_node': False,
        'zfs_pools': [],
        'failed_tasks': [],
        'issues': []
    }

    if not self.CONFIG.get('PBS_ENABLED', False):
        logger.debug("PBS monitoring disabled in config")
        return pbs_health

    if not self._available_tools.get('zpool'):
        logger.debug("zpool not available - skipping PBS ZFS checks")
        return pbs_health

    pbs_health['is_pbs_node'] = True

    # --- ZFS pool state and data errors (zpool status) ---
    try:
        result = subprocess.run(
            ['zpool', 'status', '-p'],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=30
        )
        if result.returncode == 0:
            current_pool = None
            for line in result.stdout.splitlines():
                line_stripped = line.strip()
                if line_stripped.startswith('pool:'):
                    current_pool = line_stripped.split(':', 1)[1].strip()
                elif line_stripped.startswith('state:') and current_pool:
                    state = line_stripped.split(':', 1)[1].strip()
                    if state != 'ONLINE':
                        # Any non-ONLINE state (DEGRADED/FAULTED/...) is critical.
                        pbs_health['status'] = 'CRITICAL'
                        pbs_health['issues'].append({
                            'type': 'PBS_ZFS_DEGRADED',
                            'severity': 'CRITICAL',
                            'device': current_pool,
                            'issue': f"ZFS pool '{current_pool}' state: {state}"
                        })
                elif line_stripped.startswith('errors:') and current_pool:
                    if 'No known data errors' not in line_stripped:
                        # Bug fix: previously a WARNING issue was appended here
                        # without updating the aggregate status, so a pool with
                        # known data errors could still report status 'OK'.
                        if pbs_health['status'] != 'CRITICAL':
                            pbs_health['status'] = 'WARNING'
                        pbs_health['issues'].append({
                            'type': 'PBS_ZFS_ERRORS',
                            'severity': 'WARNING',
                            'device': current_pool,
                            'issue': f"ZFS pool '{current_pool}' has errors: {line_stripped}"
                        })
    except subprocess.TimeoutExpired:
        logger.warning("zpool status timed out")
    except Exception as e:
        logger.error(f"Error checking ZFS pool status: {e}")

    # --- ZFS pool usage vs. configured thresholds (zpool list) ---
    try:
        result = subprocess.run(
            ['zpool', 'list', '-Hp'],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=30
        )
        if result.returncode == 0:
            for line in result.stdout.splitlines():
                # -H = no header, tab-separated; -p = exact byte values.
                # Default column order: NAME SIZE ALLOC FREE CKPOINT
                # EXPANDSZ FRAG CAP DEDUP HEALTH ALTROOT
                parts = line.split('\t')
                if len(parts) >= 8:
                    pool_name = parts[0]
                    try:
                        total_bytes = int(parts[1])
                        used_bytes = int(parts[2])
                        usage_pct = (used_bytes / total_bytes * 100) if total_bytes > 0 else 0
                    except (ValueError, ZeroDivisionError):
                        continue

                    pool_info = {
                        'name': pool_name,
                        'total': self._convert_bytes(total_bytes),
                        'used': self._convert_bytes(used_bytes),
                        'usage_percent': round(usage_pct, 1),
                        # parts[9] is HEALTH in the default zpool column
                        # layout; fall back to UNKNOWN on short lines.
                        'health': parts[9] if len(parts) > 9 else 'UNKNOWN'
                    }
                    pbs_health['zfs_pools'].append(pool_info)

                    if usage_pct >= self.CONFIG['PBS_ZFS_CRITICAL']:
                        pbs_health['status'] = 'CRITICAL'
                        pbs_health['issues'].append({
                            'type': 'PBS_ZFS_USAGE_CRITICAL',
                            'severity': 'CRITICAL',
                            'device': pool_name,
                            'issue': f"ZFS pool '{pool_name}' usage critical: {usage_pct:.1f}%"
                        })
                    elif usage_pct >= self.CONFIG['PBS_ZFS_WARNING']:
                        # Never downgrade an already-critical status.
                        if pbs_health['status'] != 'CRITICAL':
                            pbs_health['status'] = 'WARNING'
                        pbs_health['issues'].append({
                            'type': 'PBS_ZFS_USAGE_WARNING',
                            'severity': 'WARNING',
                            'device': pool_name,
                            'issue': f"ZFS pool '{pool_name}' usage high: {usage_pct:.1f}%"
                        })
    except subprocess.TimeoutExpired:
        logger.warning("zpool list timed out")
    except Exception as e:
        logger.error(f"Error checking ZFS pool usage: {e}")

    # --- Failed PBS tasks (requires proxmox-backup-manager) ---
    if self._available_tools.get('proxmox-backup-manager'):
        try:
            result = subprocess.run(
                ['proxmox-backup-manager', 'task', 'list', '--output-format', 'json'],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                timeout=30
            )
            if result.returncode == 0:
                try:
                    tasks = json.loads(result.stdout)
                    for task in tasks:
                        task_status = task.get('status', '')
                        task_type = task.get('worker_type', '')
                        task_id = task.get('worker_id', '')

                        # Anything other than a clean 'OK' counts as failed.
                        if task_status and task_status != 'OK':
                            failed_task = {
                                'type': task_type,
                                'id': task_id,
                                'status': task_status,
                                'starttime': task.get('starttime', ''),
                                'endtime': task.get('endtime', '')
                            }
                            pbs_health['failed_tasks'].append(failed_task)

                            # Categorize by task type: failed backups are
                            # critical; GC/sync (and unknown types) warn.
                            if 'backup' in task_type.lower():
                                issue_type = 'PBS_BACKUP_FAILED'
                                severity = 'CRITICAL'
                            elif 'gc' in task_type.lower() or 'garbage' in task_type.lower():
                                issue_type = 'PBS_GC_FAILED'
                                severity = 'WARNING'
                            elif 'sync' in task_type.lower():
                                issue_type = 'PBS_SYNC_FAILED'
                                severity = 'WARNING'
                            else:
                                issue_type = 'PBS_BACKUP_FAILED'
                                severity = 'WARNING'

                            pbs_health['issues'].append({
                                'type': issue_type,
                                'severity': severity,
                                'device': f"task-{task_type}",
                                'issue': f"PBS {task_type} failed: {task_id} - {task_status}"
                            })

                            if severity == 'CRITICAL':
                                pbs_health['status'] = 'CRITICAL'
                            elif pbs_health['status'] == 'OK':
                                pbs_health['status'] = 'WARNING'
                except json.JSONDecodeError as e:
                    logger.warning(f"Failed to parse PBS task list JSON: {e}")
        except subprocess.TimeoutExpired:
            logger.warning("proxmox-backup-manager task list timed out")
        except Exception as e:
            logger.error(f"Error checking PBS tasks: {e}")

    return pbs_health
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# PROMETHEUS METRICS EXPORT
|
# PROMETHEUS METRICS EXPORT
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
@@ -3443,6 +3694,18 @@ class SystemHealthMonitor:
|
|||||||
usage = fs.get('usage_percent', 0)
|
usage = fs.get('usage_percent', 0)
|
||||||
metrics.append(f'hwmon_lxc_storage_usage_percent{labels(hostname=hostname, vmid=vmid, mountpoint=mountpoint)} {usage}')
|
metrics.append(f'hwmon_lxc_storage_usage_percent{labels(hostname=hostname, vmid=vmid, mountpoint=mountpoint)} {usage}')
|
||||||
|
|
||||||
|
# === PBS Metrics ===
|
||||||
|
pbs = health_report.get('pbs_health', {})
|
||||||
|
if pbs.get('is_pbs_node'):
|
||||||
|
metrics.append(f'# HELP hwmon_pbs_zfs_usage_percent PBS ZFS pool usage percentage')
|
||||||
|
metrics.append(f'# TYPE hwmon_pbs_zfs_usage_percent gauge')
|
||||||
|
for pool in pbs.get('zfs_pools', []):
|
||||||
|
metrics.append(f'hwmon_pbs_zfs_usage_percent{labels(hostname=hostname, pool=pool["name"])} {pool["usage_percent"]}')
|
||||||
|
|
||||||
|
metrics.append(f'# HELP hwmon_pbs_failed_tasks_total PBS failed task count')
|
||||||
|
metrics.append(f'# TYPE hwmon_pbs_failed_tasks_total gauge')
|
||||||
|
metrics.append(f'hwmon_pbs_failed_tasks_total{labels(hostname=hostname)} {len(pbs.get("failed_tasks", []))}')
|
||||||
|
|
||||||
# === Issue Summary Metrics ===
|
# === Issue Summary Metrics ===
|
||||||
metrics.append(f'# HELP hwmon_issues_total Total number of issues detected')
|
metrics.append(f'# HELP hwmon_issues_total Total number of issues detected')
|
||||||
metrics.append(f'# TYPE hwmon_issues_total gauge')
|
metrics.append(f'# TYPE hwmon_issues_total gauge')
|
||||||
@@ -3450,7 +3713,8 @@ class SystemHealthMonitor:
|
|||||||
system_issues = len(health_report.get('system_health', {}).get('issues', []))
|
system_issues = len(health_report.get('system_health', {}).get('issues', []))
|
||||||
ceph_issues = len(ceph.get('issues', [])) + len(ceph.get('cluster_wide_issues', []))
|
ceph_issues = len(ceph.get('issues', [])) + len(ceph.get('cluster_wide_issues', []))
|
||||||
lxc_issues = len(lxc.get('issues', []))
|
lxc_issues = len(lxc.get('issues', []))
|
||||||
total_issues = system_issues + ceph_issues + lxc_issues
|
pbs_issues = len(pbs.get('issues', []))
|
||||||
|
total_issues = system_issues + ceph_issues + lxc_issues + pbs_issues
|
||||||
metrics.append(f'hwmon_issues_total{labels(hostname=hostname)} {total_issues}')
|
metrics.append(f'hwmon_issues_total{labels(hostname=hostname)} {total_issues}')
|
||||||
|
|
||||||
return '\n'.join(metrics) + '\n'
|
return '\n'.join(metrics) + '\n'
|
||||||
|
|||||||
Reference in New Issue
Block a user