Add Ceph cluster monitoring and Prometheus metrics export

- Add comprehensive Ceph cluster health monitoring
  - Check cluster health status (HEALTH_OK/WARN/ERR)
  - Monitor cluster usage with configurable thresholds
  - Track OSD status (up/down) per node
  - Separate cluster-wide vs node-specific issues

- Cluster-wide ticket deduplication
  - Add [cluster-wide] scope tag for Ceph issues
  - Cluster-wide issues deduplicate across all nodes
  - Node-specific issues (OSD down) include hostname

- Add Prometheus metrics export
  - export_prometheus_metrics() method
  - write_prometheus_metrics() for textfile collector
  - --metrics CLI flag to output metrics to stdout
  - --export-json CLI flag to export health report as JSON

- Add Grafana dashboard template (grafana-dashboard.json)
- Add .gitignore

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-17 15:54:16 -05:00
parent 3322c5878a
commit 0f8918fb8b
3 changed files with 956 additions and 13 deletions

.gitignore (vendored, new file, 2 additions)

@@ -0,0 +1,2 @@
.claude
settings.local.json

grafana-dashboard.json (new file, 375 additions)

@@ -0,0 +1,375 @@
{
"__inputs": [
{
"name": "DS_PROMETHEUS",
"label": "Prometheus",
"description": "Prometheus data source for hwmonDaemon metrics",
"type": "datasource",
"pluginId": "prometheus",
"pluginName": "Prometheus"
}
],
"__elements": {},
"__requires": [
{
"type": "grafana",
"id": "grafana",
"name": "Grafana",
"version": "10.0.0"
},
{
"type": "datasource",
"id": "prometheus",
"name": "Prometheus",
"version": "1.0.0"
},
{
"type": "panel",
"id": "gauge",
"name": "Gauge",
"version": ""
},
{
"type": "panel",
"id": "stat",
"name": "Stat",
"version": ""
},
{
"type": "panel",
"id": "table",
"name": "Table",
"version": ""
},
{
"type": "panel",
"id": "timeseries",
"name": "Time series",
"version": ""
}
],
"annotations": {
"list": []
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"liveNow": false,
"panels": [
{
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 0},
"id": 1,
"panels": [],
"title": "Overview",
"type": "row"
},
{
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
"fieldConfig": {
"defaults": {
"color": {"mode": "thresholds"},
"mappings": [
{"options": {"0": {"color": "red", "index": 0, "text": "Issues Detected"}}, "type": "value"},
{"options": {"from": 1, "result": {"color": "green", "index": 1, "text": "Healthy"}, "to": 999999}, "type": "range"}
],
"thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}, {"color": "green", "value": 1}]}
},
"overrides": []
},
"gridPos": {"h": 4, "w": 4, "x": 0, "y": 1},
"id": 2,
"options": {"colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "textMode": "auto"},
"pluginVersion": "10.0.0",
"targets": [{"expr": "count(hwmon_info)", "legendFormat": "Hosts", "refId": "A"}],
"title": "Monitored Hosts",
"type": "stat"
},
{
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
"fieldConfig": {
"defaults": {
"color": {"mode": "thresholds"},
"mappings": [],
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 1}, {"color": "red", "value": 3}]}
},
"overrides": []
},
"gridPos": {"h": 4, "w": 4, "x": 4, "y": 1},
"id": 3,
"options": {"colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "textMode": "auto"},
"pluginVersion": "10.0.0",
"targets": [{"expr": "sum(hwmon_issues_total)", "legendFormat": "Issues", "refId": "A"}],
"title": "Total Issues",
"type": "stat"
},
{
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
"fieldConfig": {
"defaults": {
"color": {"mode": "thresholds"},
"mappings": [],
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 1}, {"color": "red", "value": 2}]}
},
"overrides": []
},
"gridPos": {"h": 4, "w": 4, "x": 8, "y": 1},
"id": 4,
"options": {"colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "textMode": "auto"},
"pluginVersion": "10.0.0",
"targets": [{"expr": "count(hwmon_drive_smart_healthy == 0)", "legendFormat": "Unhealthy", "refId": "A"}],
"title": "Unhealthy Drives",
"type": "stat"
},
{
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
"fieldConfig": {
"defaults": {
"color": {"mode": "thresholds"},
"mappings": [
{"options": {"0": {"color": "red", "index": 0, "text": "Unhealthy"}}, "type": "value"},
{"options": {"1": {"color": "green", "index": 1, "text": "Healthy"}}, "type": "value"}
],
"thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}, {"color": "green", "value": 1}]}
},
"overrides": []
},
"gridPos": {"h": 4, "w": 4, "x": 12, "y": 1},
"id": 5,
"options": {"colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "textMode": "auto"},
"pluginVersion": "10.0.0",
"targets": [{"expr": "min(hwmon_ceph_cluster_healthy)", "legendFormat": "Ceph", "refId": "A"}],
"title": "Ceph Cluster Health",
"type": "stat"
},
{
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
"fieldConfig": {
"defaults": {
"color": {"mode": "thresholds"},
"mappings": [],
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 1}, {"color": "red", "value": 2}]}
},
"overrides": []
},
"gridPos": {"h": 4, "w": 4, "x": 16, "y": 1},
"id": 6,
"options": {"colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "textMode": "auto"},
"pluginVersion": "10.0.0",
"targets": [{"expr": "sum(hwmon_ceph_osd_down)", "legendFormat": "Down OSDs", "refId": "A"}],
"title": "Ceph OSDs Down",
"type": "stat"
},
{
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 5},
"id": 10,
"panels": [],
"title": "Drive Health",
"type": "row"
},
{
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
"fieldConfig": {
"defaults": {
"color": {"mode": "palette-classic"},
"custom": {"axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": {"legend": false, "tooltip": false, "viz": false}, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": {"type": "linear"}, "showPoints": "never", "spanNulls": false, "stacking": {"group": "A", "mode": "none"}, "thresholdsStyle": {"mode": "line"}},
"mappings": [],
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 45}, {"color": "red", "value": 55}]},
"unit": "celsius"
},
"overrides": []
},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 6},
"id": 11,
"options": {"legend": {"calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "right", "showLegend": true}, "tooltip": {"mode": "multi", "sort": "desc"}},
"pluginVersion": "10.0.0",
"targets": [{"expr": "hwmon_drive_temperature_celsius", "legendFormat": "{{hostname}} {{device}}", "refId": "A"}],
"title": "Drive Temperatures",
"type": "timeseries"
},
{
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
"fieldConfig": {
"defaults": {
"color": {"mode": "thresholds"},
"custom": {"align": "auto", "cellOptions": {"type": "color-text"}, "inspect": false},
"mappings": [
{"options": {"0": {"color": "red", "index": 0, "text": "UNHEALTHY"}}, "type": "value"},
{"options": {"1": {"color": "green", "index": 1, "text": "HEALTHY"}}, "type": "value"}
],
"thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}, {"color": "green", "value": 1}]}
},
"overrides": [
{"matcher": {"id": "byName", "options": "hostname"}, "properties": [{"id": "custom.width", "value": 120}]},
{"matcher": {"id": "byName", "options": "device"}, "properties": [{"id": "custom.width", "value": 100}]},
{"matcher": {"id": "byName", "options": "Status"}, "properties": [{"id": "custom.width", "value": 100}]},
{"matcher": {"id": "byName", "options": "Issues"}, "properties": [{"id": "custom.cellOptions", "value": {"mode": "gradient", "type": "gauge"}}, {"id": "thresholds", "value": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 1}, {"color": "red", "value": 3}]}}]}
]
},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 6},
"id": 12,
"options": {"cellHeight": "sm", "footer": {"countRows": false, "fields": "", "reducer": ["sum"], "show": false}, "showHeader": true, "sortBy": [{"desc": true, "displayName": "Issues"}]},
"pluginVersion": "10.0.0",
"targets": [
{"expr": "hwmon_drive_smart_healthy", "format": "table", "instant": true, "legendFormat": "", "refId": "A"},
{"expr": "hwmon_drive_smart_issues_total", "format": "table", "instant": true, "legendFormat": "", "refId": "B"}
],
"title": "Drive Status",
"transformations": [
{"id": "merge", "options": {}},
{"id": "organize", "options": {"excludeByName": {"Time": true, "__name__": true}, "indexByName": {"Value #A": 2, "Value #B": 3, "device": 1, "hostname": 0}, "renameByName": {"Value #A": "Status", "Value #B": "Issues", "device": "Device", "hostname": "Host"}}}
],
"type": "table"
},
{
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 14},
"id": 20,
"panels": [],
"title": "System Resources",
"type": "row"
},
{
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
"fieldConfig": {
"defaults": {
"color": {"mode": "palette-classic"},
"custom": {"axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": {"legend": false, "tooltip": false, "viz": false}, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": {"type": "linear"}, "showPoints": "never", "spanNulls": false, "stacking": {"group": "A", "mode": "none"}, "thresholdsStyle": {"mode": "line"}},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 80}, {"color": "red", "value": 95}]},
"unit": "percent"
},
"overrides": []
},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 15},
"id": 21,
"options": {"legend": {"calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "right", "showLegend": true}, "tooltip": {"mode": "multi", "sort": "desc"}},
"pluginVersion": "10.0.0",
"targets": [{"expr": "hwmon_cpu_usage_percent", "legendFormat": "{{hostname}}", "refId": "A"}],
"title": "CPU Usage",
"type": "timeseries"
},
{
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
"fieldConfig": {
"defaults": {
"color": {"mode": "palette-classic"},
"custom": {"axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": {"legend": false, "tooltip": false, "viz": false}, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": {"type": "linear"}, "showPoints": "never", "spanNulls": false, "stacking": {"group": "A", "mode": "none"}, "thresholdsStyle": {"mode": "line"}},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 80}, {"color": "red", "value": 95}]},
"unit": "percent"
},
"overrides": []
},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 15},
"id": 22,
"options": {"legend": {"calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "right", "showLegend": true}, "tooltip": {"mode": "multi", "sort": "desc"}},
"pluginVersion": "10.0.0",
"targets": [{"expr": "hwmon_memory_usage_percent", "legendFormat": "{{hostname}}", "refId": "A"}],
"title": "Memory Usage",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 23},
"id": 30,
"panels": [],
"title": "Ceph Cluster",
"type": "row"
},
{
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
"fieldConfig": {
"defaults": {
"color": {"mode": "thresholds"},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 70}, {"color": "red", "value": 85}]},
"unit": "percent"
},
"overrides": []
},
"gridPos": {"h": 6, "w": 6, "x": 0, "y": 24},
"id": 31,
"options": {"orientation": "auto", "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "showThresholdLabels": false, "showThresholdMarkers": true},
"pluginVersion": "10.0.0",
"targets": [{"expr": "max(hwmon_ceph_cluster_usage_percent)", "legendFormat": "Usage", "refId": "A"}],
"title": "Ceph Cluster Usage",
"type": "gauge"
},
{
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
"fieldConfig": {
"defaults": {
"color": {"mode": "palette-classic"},
"custom": {"axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": {"legend": false, "tooltip": false, "viz": false}, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": {"type": "linear"}, "showPoints": "never", "spanNulls": false, "stacking": {"group": "A", "mode": "none"}, "thresholdsStyle": {"mode": "line"}},
"mappings": [],
"max": 100,
"min": 0,
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 70}, {"color": "red", "value": 85}]},
"unit": "percent"
},
"overrides": []
},
"gridPos": {"h": 6, "w": 18, "x": 6, "y": 24},
"id": 32,
"options": {"legend": {"calcs": ["lastNotNull"], "displayMode": "list", "placement": "right", "showLegend": true}, "tooltip": {"mode": "single", "sort": "none"}},
"pluginVersion": "10.0.0",
"targets": [{"expr": "hwmon_ceph_cluster_usage_percent", "legendFormat": "{{hostname}}", "refId": "A"}],
"title": "Ceph Usage Over Time",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 30},
"id": 40,
"panels": [],
"title": "LXC Containers",
"type": "row"
},
{
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
"fieldConfig": {
"defaults": {
"color": {"mode": "thresholds"},
"custom": {"align": "auto", "cellOptions": {"type": "auto"}, "inspect": false},
"mappings": [],
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 80}, {"color": "red", "value": 95}]}
},
"overrides": [
{"matcher": {"id": "byName", "options": "Usage"}, "properties": [{"id": "custom.cellOptions", "value": {"mode": "gradient", "type": "gauge"}}, {"id": "unit", "value": "percent"}, {"id": "max", "value": 100}, {"id": "min", "value": 0}]}
]
},
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 31},
"id": 41,
"options": {"cellHeight": "sm", "footer": {"countRows": false, "fields": "", "reducer": ["sum"], "show": false}, "showHeader": true, "sortBy": [{"desc": true, "displayName": "Usage"}]},
"pluginVersion": "10.0.0",
"targets": [{"expr": "hwmon_lxc_storage_usage_percent", "format": "table", "instant": true, "legendFormat": "", "refId": "A"}],
"title": "LXC Storage Usage",
"transformations": [
{"id": "organize", "options": {"excludeByName": {"Time": true, "__name__": true}, "indexByName": {}, "renameByName": {"Value": "Usage", "hostname": "Host", "mountpoint": "Mountpoint", "vmid": "Container ID"}}}
],
"type": "table"
}
],
"refresh": "1m",
"schemaVersion": 38,
"style": "dark",
"tags": ["hwmon", "hardware", "monitoring", "proxmox"],
"templating": {"list": []},
"time": {"from": "now-6h", "to": "now"},
"timepicker": {},
"timezone": "browser",
"title": "hwmonDaemon - Hardware Monitor",
"uid": "hwmondaemon",
"version": 1,
"weekStart": ""
}

hwmonDaemon monitor script, class SystemHealthMonitor (modified: 579 additions, 13 deletions)

@@ -69,7 +69,16 @@ class SystemHealthMonitor:
        'TEMPERATURE_INFO': PRIORITIES['LOW'],
        'DRIVE_AGE_INFO': PRIORITIES['LOW'],
        'SSD_WEAR_INFO': PRIORITIES['LOW'],
-        'SYSTEM_LOG_INFO': PRIORITIES['LOW']
+        'SYSTEM_LOG_INFO': PRIORITIES['LOW'],
        # Ceph cluster issues
        'CEPH_HEALTH_ERR': PRIORITIES['CRITICAL'],   # P1 - Cluster in error state
        'CEPH_HEALTH_WARN': PRIORITIES['MEDIUM'],    # P3 - Cluster warnings
        'CEPH_OSD_DOWN': PRIORITIES['HIGH'],         # P2 - OSD down (local node)
        'CEPH_USAGE_CRITICAL': PRIORITIES['HIGH'],   # P2 - Cluster near full
        'CEPH_USAGE_WARNING': PRIORITIES['MEDIUM'],  # P3 - Cluster usage high
        'CEPH_PG_DEGRADED': PRIORITIES['HIGH'],      # P2 - PGs degraded
        'CEPH_MON_DOWN': PRIORITIES['HIGH']          # P2 - Monitor down
    }

    CONFIG = {
@@ -104,7 +113,16 @@ class SystemHealthMonitor:
        'HISTORY_DIR': '/var/log/hwmonDaemon',
        'HISTORY_RETENTION_DAYS': 30,
        'INCLUDE_INFO_TICKETS': False,  # Set True to create P5 tickets for INFO alerts
-        'PRIORITY_ESCALATION_THRESHOLD': 3  # Number of criticals to trigger P1
+        'PRIORITY_ESCALATION_THRESHOLD': 3,  # Number of criticals to trigger P1
        # Ceph monitoring settings
        'CEPH_ENABLED': True,        # Enable/disable Ceph health monitoring
        'CEPH_TICKET_NODE': None,    # Hostname of node designated to create cluster-wide Ceph tickets
        'CEPH_USAGE_WARNING': 70,    # Ceph cluster usage warning threshold %
        'CEPH_USAGE_CRITICAL': 85,   # Ceph cluster usage critical threshold %
        # Prometheus metrics settings
        'PROMETHEUS_ENABLED': False,        # Enable Prometheus metrics export
        'PROMETHEUS_PORT': 9101,            # Port for Prometheus metrics HTTP server
        'PROMETHEUS_TEXTFILE_PATH': None    # Path for textfile collector (alternative to HTTP)
    }

    @classmethod
@@ -138,6 +156,26 @@ class SystemHealthMonitor:
                    elif key == 'TICKET_API_URL':
                        cls.CONFIG['TICKET_API_URL'] = value
                        logger.info(f"✓ Loaded TICKET_API_URL: {value}")
                    # Ceph settings
                    elif key == 'CEPH_ENABLED':
                        cls.CONFIG['CEPH_ENABLED'] = value.lower() in ('true', '1', 'yes')
                        logger.info(f"✓ Loaded CEPH_ENABLED: {cls.CONFIG['CEPH_ENABLED']}")
                    elif key == 'CEPH_TICKET_NODE':
                        cls.CONFIG['CEPH_TICKET_NODE'] = value if value else None
                        logger.info(f"✓ Loaded CEPH_TICKET_NODE: {value}")
                    elif key == 'CEPH_USAGE_WARNING':
                        cls.CONFIG['CEPH_USAGE_WARNING'] = int(value)
                    elif key == 'CEPH_USAGE_CRITICAL':
                        cls.CONFIG['CEPH_USAGE_CRITICAL'] = int(value)
                    # Prometheus settings
                    elif key == 'PROMETHEUS_ENABLED':
                        cls.CONFIG['PROMETHEUS_ENABLED'] = value.lower() in ('true', '1', 'yes')
                        logger.info(f"✓ Loaded PROMETHEUS_ENABLED: {cls.CONFIG['PROMETHEUS_ENABLED']}")
                    elif key == 'PROMETHEUS_PORT':
                        cls.CONFIG['PROMETHEUS_PORT'] = int(value)
                    elif key == 'PROMETHEUS_TEXTFILE_PATH':
                        cls.CONFIG['PROMETHEUS_TEXTFILE_PATH'] = value if value else None
                        logger.info(f"✓ Loaded PROMETHEUS_TEXTFILE_PATH: {value}")
        except Exception as e:
            logger.error(f"Failed to load .env file: {e}")
@@ -635,6 +673,10 @@ class SystemHealthMonitor:
            # Create tickets for any detected critical issues
            self._create_tickets_for_issues(health_report)

            # Export Prometheus metrics if enabled
            if self.CONFIG.get('PROMETHEUS_ENABLED', False):
                self.write_prometheus_metrics(health_report)

        except Exception as e:
            import traceback
            logger.error(f"Unexpected error during health check: {e}")
@@ -643,10 +685,13 @@ class SystemHealthMonitor:
    def perform_health_checks(self) -> Dict[str, Any]:
        """Perform comprehensive system health checks and return a report."""
        health_report = {
            'hostname': socket.gethostname(),
            'timestamp': datetime.datetime.now().isoformat(),
            'drives_health': self._check_drives_health(),
            'memory_health': self._check_memory_usage(),
            'cpu_health': self._check_cpu_usage(),
            'network_health': self._check_network_status(),
            'ceph_health': self._check_ceph_health(),
            'lxc_health': self._check_lxc_storage(),
            'system_health': self._check_system_drive_indicators()
        }
@@ -682,7 +727,24 @@ class SystemHealthMonitor:
        logger.info("\nNetwork Status:")
        logger.info(f"Management: {health_report['network_health']['management_network']['status']}")
-        logger.info(f"Ceph: {health_report['network_health']['ceph_network']['status']}")
+        logger.info(f"Ceph Network: {health_report['network_health']['ceph_network']['status']}")

        # Ceph cluster status
        ceph = health_report.get('ceph_health', {})
        if ceph.get('is_ceph_node'):
            logger.info("\nCeph Cluster Status:")
            logger.info(f" Cluster Health: {ceph.get('cluster_health', 'UNKNOWN')}")
            if ceph.get('cluster_usage'):
                usage = ceph['cluster_usage']
                logger.info(f" Cluster Usage: {usage.get('usage_percent', 0):.1f}%")
            logger.info(f" OSDs: {len(ceph.get('osd_status', []))} total")
            down_osds = [o for o in ceph.get('osd_status', []) if o.get('status') == 'down']
            if down_osds:
                logger.info(f" ⚠️ Down OSDs: {len(down_osds)}")
            if ceph.get('cluster_wide_issues'):
                logger.info(f" ⚠️ Cluster-wide issues: {len(ceph['cluster_wide_issues'])}")
            if ceph.get('issues'):
                logger.info(f" ⚠️ Node-specific issues: {len(ceph['issues'])}")

        if health_report['system_health']['issues']:
            logger.info(f"\nSystem Issues: {len(health_report['system_health']['issues'])} found")
@@ -1296,8 +1358,9 @@ class SystemHealthMonitor:
        # P1 - Specific cluster-affecting scenarios
        if any(keyword in issue_lower for keyword in [
-            'cluster', 'raid degraded', 'multiple drive',
-            'both networks unreachable'
+            'raid degraded', 'multiple drive',
+            'both networks unreachable',
+            'health_err'  # Ceph cluster error
        ]):
            return self.PRIORITIES['CRITICAL']  # P1
@@ -1310,10 +1373,16 @@ class SystemHealthMonitor:
            'reallocated_sector', 'pending_sector', 'offline_uncorrectable',
            'critical available_spare', 'critical wear',
            'critical reallocated', 'critical current_pending',
-            'network is unreachable'
+            'network is unreachable',
+            'osd is down', 'osd down',  # Ceph OSD down
+            'cluster usage critical'  # Ceph usage critical
        ]):
            return self.PRIORITIES['HIGH']  # P2

        # P2 - Ceph OSD issues (need to check explicitly since 'down' is in the issue text)
        if '[ceph]' in issue_lower and 'down' in issue_lower:
            return self.PRIORITIES['HIGH']  # P2

        # P2 - SMART issues with critical indicators
        if 'smart issues' in issue_lower and any(error_type in issue_lower for error_type in [
            'critical', 'failed', 'reallocated', 'pending', 'uncorrectable', 'offline'
@@ -1324,7 +1393,8 @@ class SystemHealthMonitor:
        if any(keyword in issue_lower for keyword in [
            'warning', 'high temperature', 'correctable ecc',
            'trend alert', 'critical storage usage',
-            'low available_spare', 'high wear'
+            'low available_spare', 'high wear',
+            'health_warn', 'cluster usage warning'  # Ceph warnings
        ]):
            return self.PRIORITIES['MEDIUM']  # P3
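
For concreteness, a sketch of how three representative Ceph issue strings land in the ladder above (matching is on the lowercased text; hostnames and values are hypothetical):

# Illustrative priority resolution:
#   '[cluster-wide] [ceph] ceph cluster health_err: ...'
#       contains 'health_err', hits the P1 list -> PRIORITIES['CRITICAL']
#   '[ceph] ceph osd osd.3 is down on pve-node2'
#       'osd.3 is down' defeats the 'osd is down' substring test, so the
#       explicit '[ceph]' + 'down' check catches it -> PRIORITIES['HIGH']
#   '[cluster-wide] [ceph] ceph cluster usage warning: 72.4%'
#       contains 'cluster usage warning', hits the P3 list -> PRIORITIES['MEDIUM']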
@@ -1425,6 +1495,29 @@ class SystemHealthMonitor:
                self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM']
            )

        # Ceph issues - storage cluster issues (categorized as Hardware)
        if any(keyword in issue_lower for keyword in [
            'ceph', 'osd', 'health_err', 'health_warn', 'cluster usage'
        ]):
            # Ceph errors are Issues (unplanned degradation)
            if any(error in issue_lower for error in [
                'health_err', 'down', 'critical', 'error'
            ]):
                return (
                    self.TICKET_CATEGORIES['HARDWARE'],
                    self.TICKET_TYPES['ISSUE'],
                    '[ceph]',
                    self.TICKET_TEMPLATES['TICKET_TYPE']['ISSUE']
                )
            # Ceph warnings are Problems (need investigation)
            else:
                return (
                    self.TICKET_CATEGORIES['HARDWARE'],
                    self.TICKET_TYPES['PROBLEM'],
                    '[ceph]',
                    self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM']
                )

        # Default: Hardware Problem (for undefined cases)
        return (
            self.TICKET_CATEGORIES['HARDWARE'],
@@ -1446,7 +1539,6 @@ class SystemHealthMonitor:
        hostname = socket.gethostname()
        action_type = self.TICKET_TEMPLATES['ACTION_TYPE']
        environment = self.TICKET_TEMPLATES['ENVIRONMENT']
-        scope = self.TICKET_TEMPLATES['SCOPE']['SINGLE_NODE']

        for issue in issues:
            # Use the comprehensive priority determination function
@@ -1455,6 +1547,15 @@ class SystemHealthMonitor:
            # Get proper categorization for this issue
            category, ticket_type, issue_tag, ticket_type_tag = self._categorize_issue(issue)

            # Determine scope: cluster-wide for Ceph cluster issues, single-node otherwise
            is_cluster_wide = '[cluster-wide]' in issue
            scope = self.TICKET_TEMPLATES['SCOPE']['CLUSTER_WIDE'] if is_cluster_wide else self.TICKET_TEMPLATES['SCOPE']['SINGLE_NODE']

            # Clean the issue text for the title (strip the [cluster-wide] marker if present)
            clean_issue = issue
            if is_cluster_wide:
                clean_issue = clean_issue.replace('[cluster-wide] ', '').replace('[cluster-wide]', '')

            # Extract drive capacity if this is a drive-related issue
            drive_size = ""
            if "Drive" in issue and "/dev/" in issue:
@@ -1473,7 +1574,7 @@ class SystemHealthMonitor:
                f"{action_type['AUTO']}"
                f"{issue_tag}"
                f"{drive_size}"
-                f"{issue}"
+                f"{clean_issue}"
                f"{scope}"
                f"{environment['PRODUCTION']}"
                f"{ticket_type_tag}"
@@ -1596,6 +1697,29 @@ class SystemHealthMonitor:
        if system_health.get('issues'):
            issues.extend(system_health['issues'])

        # Check for Ceph cluster issues
        ceph_health = health_report.get('ceph_health', {})
        if ceph_health.get('is_ceph_node'):
            hostname = socket.gethostname()
            designated_node = self.CONFIG.get('CEPH_TICKET_NODE')

            # Cluster-wide issues: create tickets only from the designated node
            # (or from every node if none is set; the [cluster-wide] tag then
            # ensures deduplication in the tinker_tickets API)
            if ceph_health.get('cluster_wide_issues'):
                if not designated_node or hostname == designated_node:
                    for issue in ceph_health['cluster_wide_issues']:
                        # Add the [cluster-wide] marker for API deduplication
                        issues.append(f"[cluster-wide] [ceph] {issue}")
                else:
                    logger.debug(f"Skipping cluster-wide Ceph issues (designated node: {designated_node})")

            # Node-specific issues: always report from the affected node
            if ceph_health.get('issues'):
                for issue in ceph_health['issues']:
                    issues.append(f"[ceph] {issue}")

        logger.info("=== Issue Detection Started ===")
        logger.info(f"Checking drives: {len(health_report['drives_health']['drives'])} found")
        logger.info(f"Memory status: {health_report['memory_health']['status']}")
@@ -2670,6 +2794,424 @@ class SystemHealthMonitor:
                'error': str(e)
            }

    def _check_ceph_health(self) -> Dict[str, Any]:
        """
        Check Ceph cluster health if this node is part of a Ceph cluster.
        Returns health status, cluster info, and any issues detected.
        Cluster-wide issues use the [cluster-wide] tag for cross-node deduplication.
        """
        import shutil

        ceph_health = {
            'status': 'OK',
            'is_ceph_node': False,
            'cluster_health': None,
            'cluster_usage': None,
            'osd_status': [],
            'mon_status': [],
            'issues': [],
            'cluster_wide_issues': []  # Issues that apply to the entire cluster
        }

        # Check if Ceph monitoring is enabled
        if not self.CONFIG.get('CEPH_ENABLED', True):
            logger.debug("Ceph monitoring disabled in config")
            return ceph_health

        # Check if the ceph CLI is available
        if not shutil.which('ceph'):
            logger.debug("Ceph CLI not found - not a Ceph node")
            return ceph_health

        ceph_health['is_ceph_node'] = True
        hostname = socket.gethostname()

        try:
            # Get cluster health status
            health_result = subprocess.run(
                ['ceph', 'health', '--format=json'],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                timeout=30
            )
            if health_result.returncode == 0:
                try:
                    health_data = json.loads(health_result.stdout)
                    ceph_health['cluster_health'] = health_data.get('status', 'UNKNOWN')

                    # Check cluster health status
                    if ceph_health['cluster_health'] == 'HEALTH_ERR':
                        ceph_health['status'] = 'CRITICAL'
                        # This is a cluster-wide issue
                        ceph_health['cluster_wide_issues'].append(
                            f"Ceph cluster HEALTH_ERR: {health_data.get('summary', {}).get('message', 'Unknown error')}"
                        )
                    elif ceph_health['cluster_health'] == 'HEALTH_WARN':
                        if ceph_health['status'] != 'CRITICAL':
                            ceph_health['status'] = 'WARNING'
                        # Extract warning messages
                        checks = health_data.get('checks', {})
                        for check_name, check_data in checks.items():
                            severity = check_data.get('severity', 'HEALTH_WARN')
                            message = check_data.get('summary', {}).get('message', check_name)
                            ceph_health['cluster_wide_issues'].append(
                                f"Ceph HEALTH_WARN: {message}"
                            )
                except json.JSONDecodeError as e:
                    logger.warning(f"Failed to parse ceph health JSON: {e}")

            # Get cluster usage (ceph df)
            df_result = subprocess.run(
                ['ceph', 'df', '--format=json'],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                timeout=30
            )
            if df_result.returncode == 0:
                try:
                    df_data = json.loads(df_result.stdout)
                    stats = df_data.get('stats', {})
                    total_bytes = stats.get('total_bytes', 0)
                    total_used = stats.get('total_used_raw_bytes', 0)
                    if total_bytes > 0:
                        usage_percent = (total_used / total_bytes) * 100
                        ceph_health['cluster_usage'] = {
                            'total_bytes': total_bytes,
                            'used_bytes': total_used,
                            'usage_percent': round(usage_percent, 2)
                        }
                        # Check usage thresholds
                        if usage_percent >= self.CONFIG.get('CEPH_USAGE_CRITICAL', 85):
                            ceph_health['status'] = 'CRITICAL'
                            ceph_health['cluster_wide_issues'].append(
                                f"Ceph cluster usage critical: {usage_percent:.1f}%"
                            )
                        elif usage_percent >= self.CONFIG.get('CEPH_USAGE_WARNING', 70):
                            if ceph_health['status'] != 'CRITICAL':
                                ceph_health['status'] = 'WARNING'
                            ceph_health['cluster_wide_issues'].append(
                                f"Ceph cluster usage warning: {usage_percent:.1f}%"
                            )
                except json.JSONDecodeError as e:
                    logger.warning(f"Failed to parse ceph df JSON: {e}")

            # Get OSD status (check for down OSDs on this node)
            osd_result = subprocess.run(
                ['ceph', 'osd', 'tree', '--format=json'],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                timeout=30
            )
            if osd_result.returncode == 0:
                try:
                    osd_data = json.loads(osd_result.stdout)
                    nodes = osd_data.get('nodes', [])

                    # Find the host entry for this node in the CRUSH tree
                    host_id = None
                    for node in nodes:
                        if node.get('type') == 'host' and node.get('name') == hostname:
                            host_id = node.get('id')
                            break

                    # Check OSD status
                    for node in nodes:
                        if node.get('type') == 'osd':
                            osd_info = {
                                'id': node.get('id'),
                                'name': node.get('name'),
                                'status': node.get('status', 'unknown'),
                                'reweight': node.get('reweight', 1.0)
                            }
                            # Checking whether the OSD belongs to this host (via its parent
                            # in the tree) is deferred; simplified to track all OSDs for now
                            ceph_health['osd_status'].append(osd_info)

                            # Check for down OSDs
                            if node.get('status') == 'down':
                                ceph_health['status'] = 'CRITICAL'
                                # Node-specific issue (includes the hostname in its hash)
                                ceph_health['issues'].append(
                                    f"Ceph OSD {node.get('name')} is DOWN on {hostname}"
                                )
                except json.JSONDecodeError as e:
                    logger.warning(f"Failed to parse ceph osd tree JSON: {e}")

            # Get monitor status
            mon_result = subprocess.run(
                ['ceph', 'mon', 'stat', '--format=json'],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                timeout=30
            )
            if mon_result.returncode == 0:
                try:
                    mon_data = json.loads(mon_result.stdout)
                    ceph_health['mon_status'] = {
                        'quorum': mon_data.get('quorum', []),
                        'quorum_names': mon_data.get('quorum_names', [])
                    }
                except json.JSONDecodeError as e:
                    logger.warning(f"Failed to parse ceph mon stat JSON: {e}")

            logger.debug("=== Ceph Health Check ===")
            logger.debug(f"Is Ceph node: {ceph_health['is_ceph_node']}")
            logger.debug(f"Cluster health: {ceph_health['cluster_health']}")
            logger.debug(f"Cluster usage: {ceph_health['cluster_usage']}")
            logger.debug(f"Status: {ceph_health['status']}")
            logger.debug(f"Issues: {ceph_health['issues']}")
            logger.debug(f"Cluster-wide issues: {ceph_health['cluster_wide_issues']}")
            logger.debug("=== End Ceph Health Check ===")

        except subprocess.TimeoutExpired:
            ceph_health['status'] = 'ERROR'
            ceph_health['issues'].append("Ceph health check timed out")
        except Exception as e:
            ceph_health['status'] = 'ERROR'
            ceph_health['issues'].append(f"Error checking Ceph health: {str(e)}")
            logger.error(f"Ceph health check failed: {e}")

        return ceph_health
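
For context, a minimal sketch of the `ceph health --format=json` payload shape the parser above assumes (field values are examples only):

# Abridged example of the JSON consumed above (illustrative values):
example_health = {
    "status": "HEALTH_WARN",              # read into ceph_health['cluster_health']
    "checks": {                           # iterated to build HEALTH_WARN messages
        "OSD_NEARFULL": {
            "severity": "HEALTH_WARN",
            "summary": {"message": "1 nearfull osd(s)"}
        }
    }
}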
    # =============================================================================
    # PROMETHEUS METRICS EXPORT
    # =============================================================================

    def export_prometheus_metrics(self, health_report: Dict[str, Any]) -> str:
        """
        Export health report as Prometheus metrics in text format.

        Metrics follow Prometheus naming conventions:
        - hwmon_* prefix for all metrics
        - Labels for dimensions (device, hostname, container, etc.)

        Returns:
            str: Prometheus text format metrics
        """
        hostname = health_report.get('hostname', socket.gethostname())
        metrics = []

        # Helper to format labels
        def labels(**kwargs) -> str:
            pairs = [f'{k}="{v}"' for k, v in kwargs.items() if v is not None]
            return '{' + ','.join(pairs) + '}' if pairs else ''

        # === System Info ===
        metrics.append('# HELP hwmon_info System information')
        metrics.append('# TYPE hwmon_info gauge')
        metrics.append(f'hwmon_info{labels(hostname=hostname)} 1')

        # === Drive Metrics ===
        metrics.append('# HELP hwmon_drive_smart_healthy SMART health status (1=healthy, 0=unhealthy)')
        metrics.append('# TYPE hwmon_drive_smart_healthy gauge')
        metrics.append('# HELP hwmon_drive_temperature_celsius Drive temperature in Celsius')
        metrics.append('# TYPE hwmon_drive_temperature_celsius gauge')
        metrics.append('# HELP hwmon_drive_size_bytes Drive total size in bytes')
        metrics.append('# TYPE hwmon_drive_size_bytes gauge')
        metrics.append('# HELP hwmon_drive_smart_issues_total Number of SMART issues detected')
        metrics.append('# TYPE hwmon_drive_smart_issues_total gauge')
        for drive in health_report.get('drives_health', {}).get('drives', []):
            device = drive.get('device', 'unknown')
            drive_labels = labels(hostname=hostname, device=device)
            # SMART health status
            smart_status = drive.get('smart_status', 'UNKNOWN')
            healthy = 1 if smart_status == 'HEALTHY' else 0
            metrics.append(f'hwmon_drive_smart_healthy{drive_labels} {healthy}')
            # Temperature
            if drive.get('temperature'):
                metrics.append(f'hwmon_drive_temperature_celsius{drive_labels} {drive["temperature"]}')
            # Drive size (convert human-readable capacity to bytes if possible)
            if drive.get('capacity'):
                capacity_bytes = self._parse_size_to_bytes(drive['capacity'])
                if capacity_bytes:
                    metrics.append(f'hwmon_drive_size_bytes{drive_labels} {capacity_bytes}')
            # Issue count
            issues_count = len(drive.get('smart_issues', []))
            metrics.append(f'hwmon_drive_smart_issues_total{drive_labels} {issues_count}')

        # === CPU Metrics ===
        cpu = health_report.get('cpu_health', {})
        metrics.append('# HELP hwmon_cpu_usage_percent CPU usage percentage')
        metrics.append('# TYPE hwmon_cpu_usage_percent gauge')
        if cpu.get('cpu_usage_percent') is not None:
            metrics.append(f'hwmon_cpu_usage_percent{labels(hostname=hostname)} {cpu["cpu_usage_percent"]}')

        # === Memory Metrics ===
        mem = health_report.get('memory_health', {})
        metrics.append('# HELP hwmon_memory_usage_percent Memory usage percentage')
        metrics.append('# TYPE hwmon_memory_usage_percent gauge')
        if mem.get('memory_percent') is not None:
            metrics.append(f'hwmon_memory_usage_percent{labels(hostname=hostname)} {mem["memory_percent"]}')
        metrics.append('# HELP hwmon_memory_has_ecc Whether ECC memory is present (1=yes, 0=no)')
        metrics.append('# TYPE hwmon_memory_has_ecc gauge')
        has_ecc = 1 if mem.get('has_ecc') else 0
        metrics.append(f'hwmon_memory_has_ecc{labels(hostname=hostname)} {has_ecc}')
        if mem.get('has_ecc'):
            metrics.append('# HELP hwmon_memory_ecc_errors_total Total ECC errors detected')
            metrics.append('# TYPE hwmon_memory_ecc_errors_total gauge')
            ecc_errors = len(mem.get('ecc_errors', []))
            metrics.append(f'hwmon_memory_ecc_errors_total{labels(hostname=hostname)} {ecc_errors}')

        # === Network Metrics ===
        net = health_report.get('network_health', {})
        metrics.append('# HELP hwmon_network_status Network status (1=OK, 0=issue)')
        metrics.append('# TYPE hwmon_network_status gauge')
        for net_type in ['management_network', 'ceph_network']:
            net_info = net.get(net_type, {})
            status = 1 if net_info.get('status') == 'OK' else 0
            net_name = net_type.replace('_network', '')
            metrics.append(f'hwmon_network_status{labels(hostname=hostname, network=net_name)} {status}')

        # === Ceph Metrics ===
        ceph = health_report.get('ceph_health', {})
        if ceph.get('is_ceph_node'):
            metrics.append('# HELP hwmon_ceph_cluster_healthy Ceph cluster health (1=healthy, 0=warning/error)')
            metrics.append('# TYPE hwmon_ceph_cluster_healthy gauge')
            ceph_healthy = 1 if ceph.get('cluster_health') == 'HEALTH_OK' else 0
            metrics.append(f'hwmon_ceph_cluster_healthy{labels(hostname=hostname)} {ceph_healthy}')
            if ceph.get('cluster_usage'):
                usage = ceph['cluster_usage']
                metrics.append('# HELP hwmon_ceph_cluster_usage_percent Ceph cluster usage percentage')
                metrics.append('# TYPE hwmon_ceph_cluster_usage_percent gauge')
                metrics.append(f'hwmon_ceph_cluster_usage_percent{labels(hostname=hostname)} {usage.get("usage_percent", 0)}')
                metrics.append('# HELP hwmon_ceph_cluster_bytes_total Ceph cluster total bytes')
                metrics.append('# TYPE hwmon_ceph_cluster_bytes_total gauge')
                metrics.append(f'hwmon_ceph_cluster_bytes_total{labels(hostname=hostname)} {usage.get("total_bytes", 0)}')
                metrics.append('# HELP hwmon_ceph_cluster_bytes_used Ceph cluster used bytes')
                metrics.append('# TYPE hwmon_ceph_cluster_bytes_used gauge')
                metrics.append(f'hwmon_ceph_cluster_bytes_used{labels(hostname=hostname)} {usage.get("used_bytes", 0)}')
            metrics.append('# HELP hwmon_ceph_osd_total Total number of OSDs')
            metrics.append('# TYPE hwmon_ceph_osd_total gauge')
            osd_count = len(ceph.get('osd_status', []))
            metrics.append(f'hwmon_ceph_osd_total{labels(hostname=hostname)} {osd_count}')
            metrics.append('# HELP hwmon_ceph_osd_down Number of down OSDs')
            metrics.append('# TYPE hwmon_ceph_osd_down gauge')
            down_osds = len([o for o in ceph.get('osd_status', []) if o.get('status') == 'down'])
            metrics.append(f'hwmon_ceph_osd_down{labels(hostname=hostname)} {down_osds}')

        # === LXC Metrics ===
        lxc = health_report.get('lxc_health', {})
        if lxc.get('containers'):
            metrics.append('# HELP hwmon_lxc_storage_usage_percent LXC container storage usage percentage')
            metrics.append('# TYPE hwmon_lxc_storage_usage_percent gauge')
            for container in lxc['containers']:
                vmid = container.get('vmid', 'unknown')
                for fs in container.get('filesystems', []):
                    mountpoint = fs.get('mountpoint', '/')
                    usage = fs.get('usage_percent', 0)
                    metrics.append(f'hwmon_lxc_storage_usage_percent{labels(hostname=hostname, vmid=vmid, mountpoint=mountpoint)} {usage}')

        # === Issue Summary Metrics ===
        metrics.append('# HELP hwmon_issues_total Total number of issues detected')
        metrics.append('# TYPE hwmon_issues_total gauge')
        system_issues = len(health_report.get('system_health', {}).get('issues', []))
        ceph_issues = len(ceph.get('issues', [])) + len(ceph.get('cluster_wide_issues', []))
        lxc_issues = len(lxc.get('issues', []))
        total_issues = system_issues + ceph_issues + lxc_issues
        metrics.append(f'hwmon_issues_total{labels(hostname=hostname)} {total_issues}')

        return '\n'.join(metrics) + '\n'
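
The generated exposition text looks roughly like this (trimmed; the hostname and values are hypothetical):

# HELP hwmon_info System information
# TYPE hwmon_info gauge
hwmon_info{hostname="pve-node1"} 1
# HELP hwmon_drive_smart_healthy SMART health status (1=healthy, 0=unhealthy)
# TYPE hwmon_drive_smart_healthy gauge
hwmon_drive_smart_healthy{hostname="pve-node1",device="/dev/sda"} 1
hwmon_drive_temperature_celsius{hostname="pve-node1",device="/dev/sda"} 34
hwmon_ceph_cluster_usage_percent{hostname="pve-node1"} 42.17
hwmon_issues_total{hostname="pve-node1"} 0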
    def _parse_size_to_bytes(self, size_str: str) -> int:
        """Parse a human-readable size string into bytes."""
        if not size_str:
            return 0
        size_str = size_str.strip().upper()
        multipliers = {
            'B': 1,
            'KB': 1024,
            'MB': 1024**2,
            'GB': 1024**3,
            'TB': 1024**4,
            'PB': 1024**5,
            'K': 1024,
            'M': 1024**2,
            'G': 1024**3,
            'T': 1024**4,
            'P': 1024**5
        }
        try:
            # Try longer suffixes first so that 'TB' wins over a bare 'B' or 'T'
            for suffix, mult in sorted(multipliers.items(), key=lambda x: -len(x[0])):
                if size_str.endswith(suffix):
                    num = float(size_str[:-len(suffix)].strip())
                    return int(num * mult)
            return int(float(size_str))
        except (ValueError, TypeError):
            return 0
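
A few illustrative conversions under the binary multipliers above:

# _parse_size_to_bytes("500 GB") -> 536870912000    (500 * 1024**3)
# _parse_size_to_bytes("3.84T")  -> 4222124650659   (matches the 'T' suffix)
# _parse_size_to_bytes("n/a")    -> 0               (unparseable input falls back to 0)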
    def write_prometheus_metrics(self, health_report: Dict[str, Any]) -> bool:
        """
        Write Prometheus metrics to the configured destination.

        If PROMETHEUS_TEXTFILE_PATH is set, writes to that file for node_exporter.
        Otherwise, logs the metrics (for debugging or other use).

        Returns:
            bool: True if metrics were written successfully
        """
        if not self.CONFIG.get('PROMETHEUS_ENABLED', False):
            return False
        try:
            metrics = self.export_prometheus_metrics(health_report)
            textfile_path = self.CONFIG.get('PROMETHEUS_TEXTFILE_PATH')
            if textfile_path:
                # Write to a temp file in the same directory first, then rename
                # atomically so the textfile collector never sees a partial file
                import tempfile
                temp_fd, temp_path = tempfile.mkstemp(
                    dir=os.path.dirname(textfile_path),
                    prefix='.hwmon_metrics_'
                )
                try:
                    with os.fdopen(temp_fd, 'w') as f:
                        f.write(metrics)
                    os.rename(temp_path, textfile_path)
                    logger.info(f"Prometheus metrics written to {textfile_path}")
                except Exception:
                    os.unlink(temp_path)
                    raise
            else:
                # Just log the metrics (for debugging)
                logger.debug("Prometheus metrics generated:\n" + metrics)
            return True
        except Exception as e:
            logger.error(f"Failed to write Prometheus metrics: {e}")
            return False
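
For the textfile route to actually be scraped, node_exporter has to watch the same directory; a sketch (the directory path is an assumption, the flag is node_exporter's standard textfile-collector option):

# node_exporter side (not part of this commit):
#   node_exporter --collector.textfile.directory=/var/lib/prometheus/node-exporter
# daemon side (.env):
#   PROMETHEUS_TEXTFILE_PATH=/var/lib/prometheus/node-exporter/hwmon.prom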
    def _check_lxc_storage(self) -> Dict[str, Any]:
        """
        Check storage utilization for all running LXC containers
@@ -2802,13 +3344,37 @@ def main():
        action="store_true",
        help="Enable dry-run mode (simulate ticket creation without actual API calls)."
    )
    parser.add_argument(
        "--metrics",
        action="store_true",
        help="Output Prometheus metrics to stdout and exit."
    )
    parser.add_argument(
        "--export-json",
        type=str,
        metavar="FILE",
        help="Export health report to JSON file."
    )
    args = parser.parse_args()

    monitor = SystemHealthMonitor(
        ticket_api_url=SystemHealthMonitor.CONFIG['TICKET_API_URL'],
        dry_run=args.dry_run
    )

-    monitor.run()
+    if args.metrics:
+        # Just output metrics to stdout
+        health_report = monitor.perform_health_checks()
+        print(monitor.export_prometheus_metrics(health_report))
+    elif args.export_json:
+        # Export the health report as JSON
+        import json
+        health_report = monitor.perform_health_checks()
+        with open(args.export_json, 'w') as f:
+            json.dump(health_report, f, indent=2, default=str)
+        logger.info(f"Health report exported to {args.export_json}")
+    else:
+        monitor.run()
if __name__ == "__main__":
    main()
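
Typical invocations of the new flags might look like this (the script name is hypothetical):

# python3 hwmon_daemon.py --metrics > /tmp/hwmon.prom      # print metrics and exit
# python3 hwmon_daemon.py --export-json /tmp/health.json   # write the health report as JSON
# python3 hwmon_daemon.py --dry-run                        # full run without real ticket API calls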