Add Ceph cluster monitoring and Prometheus metrics export
- Add comprehensive Ceph cluster health monitoring
  - Check cluster health status (HEALTH_OK/WARN/ERR)
  - Monitor cluster usage with configurable thresholds
  - Track OSD status (up/down) per node
  - Separate cluster-wide vs node-specific issues
- Cluster-wide ticket deduplication (see the sketch after this message)
  - Add [cluster-wide] scope tag for Ceph issues
  - Cluster-wide issues deduplicate across all nodes
  - Node-specific issues (OSD down) include hostname
- Add Prometheus metrics export
  - export_prometheus_metrics() method
  - write_prometheus_metrics() for textfile collector
  - --metrics CLI flag to output metrics to stdout
  - --export-json CLI flag to export health report as JSON
- Add Grafana dashboard template (grafana-dashboard.json)
- Add .gitignore

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
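A minimal illustration of the deduplication idea above, assuming nothing about the ticket API beyond what the tags imply (the helper and key scheme below are hypothetical, not code from this commit):

import hashlib
import socket

# Issues tagged [cluster-wide] hash to the same key on every node, so a backend
# can collapse them into one ticket; node-specific issues fold the hostname in.
def ticket_dedup_key(issue: str) -> str:
    basis = issue if '[cluster-wide]' in issue else f"{socket.gethostname()}:{issue}"
    return hashlib.sha256(basis.encode()).hexdigest()[:16]

print(ticket_dedup_key("[cluster-wide] [ceph] Ceph cluster usage warning: 72.0%"))
print(ticket_dedup_key("[ceph] Ceph OSD osd.3 is DOWN on pve-node1"))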
.gitignore (vendored, new file, +2 lines)
@@ -0,0 +1,2 @@
.claude
settings.local.json
grafana-dashboard.json (new file, +375 lines)
@@ -0,0 +1,375 @@
{
|
||||
"__inputs": [
|
||||
{
|
||||
"name": "DS_PROMETHEUS",
|
||||
"label": "Prometheus",
|
||||
"description": "Prometheus data source for hwmonDaemon metrics",
|
||||
"type": "datasource",
|
||||
"pluginId": "prometheus",
|
||||
"pluginName": "Prometheus"
|
||||
}
|
||||
],
|
||||
"__elements": {},
|
||||
"__requires": [
|
||||
{
|
||||
"type": "grafana",
|
||||
"id": "grafana",
|
||||
"name": "Grafana",
|
||||
"version": "10.0.0"
|
||||
},
|
||||
{
|
||||
"type": "datasource",
|
||||
"id": "prometheus",
|
||||
"name": "Prometheus",
|
||||
"version": "1.0.0"
|
||||
},
|
||||
{
|
||||
"type": "panel",
|
||||
"id": "gauge",
|
||||
"name": "Gauge",
|
||||
"version": ""
|
||||
},
|
||||
{
|
||||
"type": "panel",
|
||||
"id": "stat",
|
||||
"name": "Stat",
|
||||
"version": ""
|
||||
},
|
||||
{
|
||||
"type": "panel",
|
||||
"id": "table",
|
||||
"name": "Table",
|
||||
"version": ""
|
||||
},
|
||||
{
|
||||
"type": "panel",
|
||||
"id": "timeseries",
|
||||
"name": "Time series",
|
||||
"version": ""
|
||||
}
|
||||
],
|
||||
"annotations": {
|
||||
"list": []
|
||||
},
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 0,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"liveNow": false,
|
||||
"panels": [
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 0},
|
||||
"id": 1,
|
||||
"panels": [],
|
||||
"title": "Overview",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "thresholds"},
|
||||
"mappings": [
|
||||
{"options": {"0": {"color": "red", "index": 0, "text": "Issues Detected"}}, "type": "value"},
|
||||
{"options": {"from": 1, "result": {"color": "green", "index": 1, "text": "Healthy"}, "to": 999999}, "type": "range"}
|
||||
],
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}, {"color": "green", "value": 1}]}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {"h": 4, "w": 4, "x": 0, "y": 1},
|
||||
"id": 2,
|
||||
"options": {"colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "textMode": "auto"},
|
||||
"pluginVersion": "10.0.0",
|
||||
"targets": [{"expr": "count(hwmon_info)", "legendFormat": "Hosts", "refId": "A"}],
|
||||
"title": "Monitored Hosts",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "thresholds"},
|
||||
"mappings": [],
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 1}, {"color": "red", "value": 3}]}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {"h": 4, "w": 4, "x": 4, "y": 1},
|
||||
"id": 3,
|
||||
"options": {"colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "textMode": "auto"},
|
||||
"pluginVersion": "10.0.0",
|
||||
"targets": [{"expr": "sum(hwmon_issues_total)", "legendFormat": "Issues", "refId": "A"}],
|
||||
"title": "Total Issues",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "thresholds"},
|
||||
"mappings": [],
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 1}, {"color": "red", "value": 2}]}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {"h": 4, "w": 4, "x": 8, "y": 1},
|
||||
"id": 4,
|
||||
"options": {"colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "textMode": "auto"},
|
||||
"pluginVersion": "10.0.0",
|
||||
"targets": [{"expr": "count(hwmon_drive_smart_healthy == 0)", "legendFormat": "Unhealthy", "refId": "A"}],
|
||||
"title": "Unhealthy Drives",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "thresholds"},
|
||||
"mappings": [
|
||||
{"options": {"0": {"color": "red", "index": 0, "text": "Unhealthy"}}, "type": "value"},
|
||||
{"options": {"1": {"color": "green", "index": 1, "text": "Healthy"}}, "type": "value"}
|
||||
],
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}, {"color": "green", "value": 1}]}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {"h": 4, "w": 4, "x": 12, "y": 1},
|
||||
"id": 5,
|
||||
"options": {"colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "textMode": "auto"},
|
||||
"pluginVersion": "10.0.0",
|
||||
"targets": [{"expr": "min(hwmon_ceph_cluster_healthy)", "legendFormat": "Ceph", "refId": "A"}],
|
||||
"title": "Ceph Cluster Health",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "thresholds"},
|
||||
"mappings": [],
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 1}, {"color": "red", "value": 2}]}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {"h": 4, "w": 4, "x": 16, "y": 1},
|
||||
"id": 6,
|
||||
"options": {"colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "textMode": "auto"},
|
||||
"pluginVersion": "10.0.0",
|
||||
"targets": [{"expr": "sum(hwmon_ceph_osd_down)", "legendFormat": "Down OSDs", "refId": "A"}],
|
||||
"title": "Ceph OSDs Down",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 5},
|
||||
"id": 10,
|
||||
"panels": [],
|
||||
"title": "Drive Health",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "palette-classic"},
|
||||
"custom": {"axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": {"legend": false, "tooltip": false, "viz": false}, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": {"type": "linear"}, "showPoints": "never", "spanNulls": false, "stacking": {"group": "A", "mode": "none"}, "thresholdsStyle": {"mode": "line"}},
|
||||
"mappings": [],
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 45}, {"color": "red", "value": 55}]},
|
||||
"unit": "celsius"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 6},
|
||||
"id": 11,
|
||||
"options": {"legend": {"calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "right", "showLegend": true}, "tooltip": {"mode": "multi", "sort": "desc"}},
|
||||
"pluginVersion": "10.0.0",
|
||||
"targets": [{"expr": "hwmon_drive_temperature_celsius", "legendFormat": "{{hostname}} {{device}}", "refId": "A"}],
|
||||
"title": "Drive Temperatures",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "thresholds"},
|
||||
"custom": {"align": "auto", "cellOptions": {"type": "color-text"}, "inspect": false},
|
||||
"mappings": [
|
||||
{"options": {"0": {"color": "red", "index": 0, "text": "UNHEALTHY"}}, "type": "value"},
|
||||
{"options": {"1": {"color": "green", "index": 1, "text": "HEALTHY"}}, "type": "value"}
|
||||
],
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}, {"color": "green", "value": 1}]}
|
||||
},
|
||||
"overrides": [
|
||||
{"matcher": {"id": "byName", "options": "hostname"}, "properties": [{"id": "custom.width", "value": 120}]},
|
||||
{"matcher": {"id": "byName", "options": "device"}, "properties": [{"id": "custom.width", "value": 100}]},
|
||||
{"matcher": {"id": "byName", "options": "Status"}, "properties": [{"id": "custom.width", "value": 100}]},
|
||||
{"matcher": {"id": "byName", "options": "Issues"}, "properties": [{"id": "custom.cellOptions", "value": {"mode": "gradient", "type": "gauge"}}, {"id": "thresholds", "value": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 1}, {"color": "red", "value": 3}]}}]}
|
||||
]
|
||||
},
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 6},
|
||||
"id": 12,
|
||||
"options": {"cellHeight": "sm", "footer": {"countRows": false, "fields": "", "reducer": ["sum"], "show": false}, "showHeader": true, "sortBy": [{"desc": true, "displayName": "Issues"}]},
|
||||
"pluginVersion": "10.0.0",
|
||||
"targets": [
|
||||
{"expr": "hwmon_drive_smart_healthy", "format": "table", "instant": true, "legendFormat": "", "refId": "A"},
|
||||
{"expr": "hwmon_drive_smart_issues_total", "format": "table", "instant": true, "legendFormat": "", "refId": "B"}
|
||||
],
|
||||
"title": "Drive Status",
|
||||
"transformations": [
|
||||
{"id": "merge", "options": {}},
|
||||
{"id": "organize", "options": {"excludeByName": {"Time": true, "__name__": true}, "indexByName": {"Value #A": 2, "Value #B": 3, "device": 1, "hostname": 0}, "renameByName": {"Value #A": "Status", "Value #B": "Issues", "device": "Device", "hostname": "Host"}}}
|
||||
],
|
||||
"type": "table"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 14},
|
||||
"id": 20,
|
||||
"panels": [],
|
||||
"title": "System Resources",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "palette-classic"},
|
||||
"custom": {"axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": {"legend": false, "tooltip": false, "viz": false}, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": {"type": "linear"}, "showPoints": "never", "spanNulls": false, "stacking": {"group": "A", "mode": "none"}, "thresholdsStyle": {"mode": "line"}},
|
||||
"mappings": [],
|
||||
"max": 100,
|
||||
"min": 0,
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 80}, {"color": "red", "value": 95}]},
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 15},
|
||||
"id": 21,
|
||||
"options": {"legend": {"calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "right", "showLegend": true}, "tooltip": {"mode": "multi", "sort": "desc"}},
|
||||
"pluginVersion": "10.0.0",
|
||||
"targets": [{"expr": "hwmon_cpu_usage_percent", "legendFormat": "{{hostname}}", "refId": "A"}],
|
||||
"title": "CPU Usage",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "palette-classic"},
|
||||
"custom": {"axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": {"legend": false, "tooltip": false, "viz": false}, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": {"type": "linear"}, "showPoints": "never", "spanNulls": false, "stacking": {"group": "A", "mode": "none"}, "thresholdsStyle": {"mode": "line"}},
|
||||
"mappings": [],
|
||||
"max": 100,
|
||||
"min": 0,
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 80}, {"color": "red", "value": 95}]},
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 15},
|
||||
"id": 22,
|
||||
"options": {"legend": {"calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "right", "showLegend": true}, "tooltip": {"mode": "multi", "sort": "desc"}},
|
||||
"pluginVersion": "10.0.0",
|
||||
"targets": [{"expr": "hwmon_memory_usage_percent", "legendFormat": "{{hostname}}", "refId": "A"}],
|
||||
"title": "Memory Usage",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 23},
|
||||
"id": 30,
|
||||
"panels": [],
|
||||
"title": "Ceph Cluster",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "thresholds"},
|
||||
"mappings": [],
|
||||
"max": 100,
|
||||
"min": 0,
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 70}, {"color": "red", "value": 85}]},
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {"h": 6, "w": 6, "x": 0, "y": 24},
|
||||
"id": 31,
|
||||
"options": {"orientation": "auto", "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "showThresholdLabels": false, "showThresholdMarkers": true},
|
||||
"pluginVersion": "10.0.0",
|
||||
"targets": [{"expr": "max(hwmon_ceph_cluster_usage_percent)", "legendFormat": "Usage", "refId": "A"}],
|
||||
"title": "Ceph Cluster Usage",
|
||||
"type": "gauge"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "palette-classic"},
|
||||
"custom": {"axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": {"legend": false, "tooltip": false, "viz": false}, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": {"type": "linear"}, "showPoints": "never", "spanNulls": false, "stacking": {"group": "A", "mode": "none"}, "thresholdsStyle": {"mode": "line"}},
|
||||
"mappings": [],
|
||||
"max": 100,
|
||||
"min": 0,
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 70}, {"color": "red", "value": 85}]},
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {"h": 6, "w": 18, "x": 6, "y": 24},
|
||||
"id": 32,
|
||||
"options": {"legend": {"calcs": ["lastNotNull"], "displayMode": "list", "placement": "right", "showLegend": true}, "tooltip": {"mode": "single", "sort": "none"}},
|
||||
"pluginVersion": "10.0.0",
|
||||
"targets": [{"expr": "hwmon_ceph_cluster_usage_percent", "legendFormat": "{{hostname}}", "refId": "A"}],
|
||||
"title": "Ceph Usage Over Time",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 30},
|
||||
"id": 40,
|
||||
"panels": [],
|
||||
"title": "LXC Containers",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "thresholds"},
|
||||
"custom": {"align": "auto", "cellOptions": {"type": "auto"}, "inspect": false},
|
||||
"mappings": [],
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 80}, {"color": "red", "value": 95}]}
|
||||
},
|
||||
"overrides": [
|
||||
{"matcher": {"id": "byName", "options": "Usage"}, "properties": [{"id": "custom.cellOptions", "value": {"mode": "gradient", "type": "gauge"}}, {"id": "unit", "value": "percent"}, {"id": "max", "value": 100}, {"id": "min", "value": 0}]}
|
||||
]
|
||||
},
|
||||
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 31},
|
||||
"id": 41,
|
||||
"options": {"cellHeight": "sm", "footer": {"countRows": false, "fields": "", "reducer": ["sum"], "show": false}, "showHeader": true, "sortBy": [{"desc": true, "displayName": "Usage"}]},
|
||||
"pluginVersion": "10.0.0",
|
||||
"targets": [{"expr": "hwmon_lxc_storage_usage_percent", "format": "table", "instant": true, "legendFormat": "", "refId": "A"}],
|
||||
"title": "LXC Storage Usage",
|
||||
"transformations": [
|
||||
{"id": "organize", "options": {"excludeByName": {"Time": true, "__name__": true}, "indexByName": {}, "renameByName": {"Value": "Usage", "hostname": "Host", "mountpoint": "Mountpoint", "vmid": "Container ID"}}}
|
||||
],
|
||||
"type": "table"
|
||||
}
|
||||
],
|
||||
"refresh": "1m",
|
||||
"schemaVersion": 38,
|
||||
"style": "dark",
|
||||
"tags": ["hwmon", "hardware", "monitoring", "proxmox"],
|
||||
"templating": {"list": []},
|
||||
"time": {"from": "now-6h", "to": "now"},
|
||||
"timepicker": {},
|
||||
"timezone": "browser",
|
||||
"title": "hwmonDaemon - Hardware Monitor",
|
||||
"uid": "hwmondaemon",
|
||||
"version": 1,
|
||||
"weekStart": ""
|
||||
}
|
||||
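Not part of the commit, but a quick way to sanity-check the template above, assuming it is saved as grafana-dashboard.json in the working directory: list each panel and the PromQL expressions it queries.

import json

with open('grafana-dashboard.json') as f:
    dashboard = json.load(f)

# Rows carry no targets; the stat/gauge/table/timeseries panels hold the hwmon_* queries.
for panel in dashboard.get('panels', []):
    exprs = [t.get('expr', '') for t in panel.get('targets', [])]
    print(f"{panel.get('type', '?'):12s} {panel.get('title', '')!r} -> {exprs}")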
hwmonDaemon.py (592 lines changed)
@@ -69,7 +69,16 @@ class SystemHealthMonitor:
|
||||
'TEMPERATURE_INFO': PRIORITIES['LOW'],
|
||||
'DRIVE_AGE_INFO': PRIORITIES['LOW'],
|
||||
'SSD_WEAR_INFO': PRIORITIES['LOW'],
|
||||
- 'SYSTEM_LOG_INFO': PRIORITIES['LOW']
+ 'SYSTEM_LOG_INFO': PRIORITIES['LOW'],
|
||||
|
||||
# Ceph cluster issues
|
||||
'CEPH_HEALTH_ERR': PRIORITIES['CRITICAL'], # P1 - Cluster in error state
|
||||
'CEPH_HEALTH_WARN': PRIORITIES['MEDIUM'], # P3 - Cluster warnings
|
||||
'CEPH_OSD_DOWN': PRIORITIES['HIGH'], # P2 - OSD down (local node)
|
||||
'CEPH_USAGE_CRITICAL': PRIORITIES['HIGH'], # P2 - Cluster near full
|
||||
'CEPH_USAGE_WARNING': PRIORITIES['MEDIUM'], # P3 - Cluster usage high
|
||||
'CEPH_PG_DEGRADED': PRIORITIES['HIGH'], # P2 - PGs degraded
|
||||
'CEPH_MON_DOWN': PRIORITIES['HIGH'] # P2 - Monitor down
|
||||
}
|
||||
|
||||
CONFIG = {
|
||||
@@ -104,7 +113,16 @@ class SystemHealthMonitor:
|
||||
'HISTORY_DIR': '/var/log/hwmonDaemon',
|
||||
'HISTORY_RETENTION_DAYS': 30,
|
||||
'INCLUDE_INFO_TICKETS': False, # Set True to create P5 tickets for INFO alerts
|
||||
- 'PRIORITY_ESCALATION_THRESHOLD': 3 # Number of criticals to trigger P1
+ 'PRIORITY_ESCALATION_THRESHOLD': 3, # Number of criticals to trigger P1
|
||||
# Ceph monitoring settings
|
||||
'CEPH_ENABLED': True, # Enable/disable Ceph health monitoring
|
||||
'CEPH_TICKET_NODE': None, # Hostname of node designated to create cluster-wide Ceph tickets
|
||||
'CEPH_USAGE_WARNING': 70, # Ceph cluster usage warning threshold %
|
||||
'CEPH_USAGE_CRITICAL': 85, # Ceph cluster usage critical threshold %
|
||||
# Prometheus metrics settings
|
||||
'PROMETHEUS_ENABLED': False, # Enable Prometheus metrics export
|
||||
'PROMETHEUS_PORT': 9101, # Port for Prometheus metrics HTTP server
|
||||
'PROMETHEUS_TEXTFILE_PATH': None # Path for textfile collector (alternative to HTTP)
|
||||
}
|
||||
|
||||
@classmethod
|
||||
@@ -138,6 +156,26 @@ class SystemHealthMonitor:
|
||||
elif key == 'TICKET_API_URL':
|
||||
cls.CONFIG['TICKET_API_URL'] = value
|
||||
logger.info(f"✓ Loaded TICKET_API_URL: {value}")
|
||||
# Ceph settings
|
||||
elif key == 'CEPH_ENABLED':
|
||||
cls.CONFIG['CEPH_ENABLED'] = value.lower() in ('true', '1', 'yes')
|
||||
logger.info(f"✓ Loaded CEPH_ENABLED: {cls.CONFIG['CEPH_ENABLED']}")
|
||||
elif key == 'CEPH_TICKET_NODE':
|
||||
cls.CONFIG['CEPH_TICKET_NODE'] = value if value else None
|
||||
logger.info(f"✓ Loaded CEPH_TICKET_NODE: {value}")
|
||||
elif key == 'CEPH_USAGE_WARNING':
|
||||
cls.CONFIG['CEPH_USAGE_WARNING'] = int(value)
|
||||
elif key == 'CEPH_USAGE_CRITICAL':
|
||||
cls.CONFIG['CEPH_USAGE_CRITICAL'] = int(value)
|
||||
# Prometheus settings
|
||||
elif key == 'PROMETHEUS_ENABLED':
|
||||
cls.CONFIG['PROMETHEUS_ENABLED'] = value.lower() in ('true', '1', 'yes')
|
||||
logger.info(f"✓ Loaded PROMETHEUS_ENABLED: {cls.CONFIG['PROMETHEUS_ENABLED']}")
|
||||
elif key == 'PROMETHEUS_PORT':
|
||||
cls.CONFIG['PROMETHEUS_PORT'] = int(value)
|
||||
elif key == 'PROMETHEUS_TEXTFILE_PATH':
|
||||
cls.CONFIG['PROMETHEUS_TEXTFILE_PATH'] = value if value else None
|
||||
logger.info(f"✓ Loaded PROMETHEUS_TEXTFILE_PATH: {value}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to load .env file: {e}")
|
||||
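# Illustrative .env entries the branches above would parse (the threshold values
# match the shipped defaults; the node name and textfile path are examples only):
#   CEPH_ENABLED=true
#   CEPH_TICKET_NODE=pve-node1
#   CEPH_USAGE_WARNING=70
#   CEPH_USAGE_CRITICAL=85
#   PROMETHEUS_ENABLED=true
#   PROMETHEUS_TEXTFILE_PATH=/var/lib/node_exporter/textfile_collector/hwmon.prom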
@@ -632,9 +670,13 @@ class SystemHealthMonitor:
|
||||
try:
|
||||
# Perform health checks and gather the report
|
||||
health_report = self.perform_health_checks()
|
||||
|
||||
|
||||
# Create tickets for any detected critical issues
|
||||
self._create_tickets_for_issues(health_report)
|
||||
|
||||
# Export Prometheus metrics if enabled
|
||||
if self.CONFIG.get('PROMETHEUS_ENABLED', False):
|
||||
self.write_prometheus_metrics(health_report)
|
||||
except Exception as e:
|
||||
import traceback
|
||||
logger.error(f"Unexpected error during health check: {e}")
|
||||
@@ -643,10 +685,13 @@ class SystemHealthMonitor:
|
||||
def perform_health_checks(self) -> Dict[str, Any]:
|
||||
"""Perform comprehensive system health checks and return a report."""
|
||||
health_report = {
|
||||
'hostname': socket.gethostname(),
|
||||
'timestamp': datetime.datetime.now().isoformat(),
|
||||
'drives_health': self._check_drives_health(),
|
||||
'memory_health': self._check_memory_usage(),
|
||||
'cpu_health': self._check_cpu_usage(),
|
||||
'network_health': self._check_network_status(),
|
||||
'ceph_health': self._check_ceph_health(),
|
||||
'lxc_health': self._check_lxc_storage(),
|
||||
'system_health': self._check_system_drive_indicators()
|
||||
}
|
||||
@@ -682,8 +727,25 @@ class SystemHealthMonitor:
|
||||
|
||||
logger.info("\nNetwork Status:")
|
||||
logger.info(f"Management: {health_report['network_health']['management_network']['status']}")
|
||||
logger.info(f"Ceph: {health_report['network_health']['ceph_network']['status']}")
|
||||
|
||||
logger.info(f"Ceph Network: {health_report['network_health']['ceph_network']['status']}")
|
||||
|
||||
# Ceph cluster status
|
||||
ceph = health_report.get('ceph_health', {})
|
||||
if ceph.get('is_ceph_node'):
|
||||
logger.info("\nCeph Cluster Status:")
|
||||
logger.info(f" Cluster Health: {ceph.get('cluster_health', 'UNKNOWN')}")
|
||||
if ceph.get('cluster_usage'):
|
||||
usage = ceph['cluster_usage']
|
||||
logger.info(f" Cluster Usage: {usage.get('usage_percent', 0):.1f}%")
|
||||
logger.info(f" OSDs: {len(ceph.get('osd_status', []))} total")
|
||||
down_osds = [o for o in ceph.get('osd_status', []) if o.get('status') == 'down']
|
||||
if down_osds:
|
||||
logger.info(f" ⚠️ Down OSDs: {len(down_osds)}")
|
||||
if ceph.get('cluster_wide_issues'):
|
||||
logger.info(f" ⚠️ Cluster-wide issues: {len(ceph['cluster_wide_issues'])}")
|
||||
if ceph.get('issues'):
|
||||
logger.info(f" ⚠️ Node-specific issues: {len(ceph['issues'])}")
|
||||
|
||||
if health_report['system_health']['issues']:
|
||||
logger.info(f"\nSystem Issues: {len(health_report['system_health']['issues'])} found")
|
||||
|
||||
@@ -1296,8 +1358,9 @@ class SystemHealthMonitor:
|
||||
|
||||
# P1 - Specific cluster-affecting scenarios
|
||||
if any(keyword in issue_lower for keyword in [
|
||||
- 'cluster', 'raid degraded', 'multiple drive',
- 'both networks unreachable'
+ 'raid degraded', 'multiple drive',
+ 'both networks unreachable',
+ 'health_err' # Ceph cluster error
|
||||
]):
|
||||
return self.PRIORITIES['CRITICAL'] # P1
|
||||
|
||||
@@ -1310,10 +1373,16 @@ class SystemHealthMonitor:
|
||||
'reallocated_sector', 'pending_sector', 'offline_uncorrectable',
|
||||
'critical available_spare', 'critical wear',
|
||||
'critical reallocated', 'critical current_pending',
|
||||
- 'network is unreachable'
+ 'network is unreachable',
+ 'osd is down', 'osd down', # Ceph OSD down
+ 'cluster usage critical' # Ceph usage critical
|
||||
]):
|
||||
return self.PRIORITIES['HIGH'] # P2
|
||||
|
||||
# P2 - Ceph OSD issues (need to check explicitly since 'down' is in issue text)
|
||||
if '[ceph]' in issue_lower and 'down' in issue_lower:
|
||||
return self.PRIORITIES['HIGH'] # P2
|
||||
|
||||
# P2 - SMART issues with critical indicators
|
||||
if 'smart issues' in issue_lower and any(error_type in issue_lower for error_type in [
|
||||
'critical', 'failed', 'reallocated', 'pending', 'uncorrectable', 'offline'
|
||||
@@ -1324,7 +1393,8 @@ class SystemHealthMonitor:
|
||||
if any(keyword in issue_lower for keyword in [
|
||||
'warning', 'high temperature', 'correctable ecc',
|
||||
'trend alert', 'critical storage usage',
|
||||
- 'low available_spare', 'high wear'
+ 'low available_spare', 'high wear',
+ 'health_warn', 'cluster usage warning' # Ceph warnings
|
||||
]):
|
||||
return self.PRIORITIES['MEDIUM'] # P3
|
||||
|
||||
@@ -1425,6 +1495,29 @@ class SystemHealthMonitor:
|
||||
self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM']
|
||||
)
|
||||
|
||||
# Ceph Issues - Storage cluster issues (categorized as Hardware)
|
||||
if any(keyword in issue_lower for keyword in [
|
||||
'ceph', 'osd', 'health_err', 'health_warn', 'cluster usage'
|
||||
]):
|
||||
# Ceph errors are issues (unplanned degradation)
|
||||
if any(error in issue_lower for error in [
|
||||
'health_err', 'down', 'critical', 'error'
|
||||
]):
|
||||
return (
|
||||
self.TICKET_CATEGORIES['HARDWARE'],
|
||||
self.TICKET_TYPES['ISSUE'],
|
||||
'[ceph]',
|
||||
self.TICKET_TEMPLATES['TICKET_TYPE']['ISSUE']
|
||||
)
|
||||
# Ceph warnings are problems (need investigation)
|
||||
else:
|
||||
return (
|
||||
self.TICKET_CATEGORIES['HARDWARE'],
|
||||
self.TICKET_TYPES['PROBLEM'],
|
||||
'[ceph]',
|
||||
self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM']
|
||||
)
|
||||
|
||||
# Default: Hardware Problem (for undefined cases)
|
||||
return (
|
||||
self.TICKET_CATEGORIES['HARDWARE'],
|
||||
@@ -1446,7 +1539,6 @@ class SystemHealthMonitor:
|
||||
hostname = socket.gethostname()
|
||||
action_type = self.TICKET_TEMPLATES['ACTION_TYPE']
|
||||
environment = self.TICKET_TEMPLATES['ENVIRONMENT']
|
||||
- scope = self.TICKET_TEMPLATES['SCOPE']['SINGLE_NODE']
|
||||
|
||||
for issue in issues:
|
||||
# Use the comprehensive priority determination function
|
||||
@@ -1455,6 +1547,15 @@ class SystemHealthMonitor:
|
||||
# Get proper categorization for this issue
|
||||
category, ticket_type, issue_tag, ticket_type_tag = self._categorize_issue(issue)
|
||||
|
||||
# Determine scope: cluster-wide for Ceph cluster issues, single-node otherwise
|
||||
is_cluster_wide = '[cluster-wide]' in issue
|
||||
scope = self.TICKET_TEMPLATES['SCOPE']['CLUSTER_WIDE'] if is_cluster_wide else self.TICKET_TEMPLATES['SCOPE']['SINGLE_NODE']
|
||||
|
||||
# Clean issue text for title (remove [cluster-wide] and [ceph] markers if present)
|
||||
clean_issue = issue
|
||||
if is_cluster_wide:
|
||||
clean_issue = clean_issue.replace('[cluster-wide] ', '').replace('[cluster-wide]', '')
|
||||
|
||||
# Extract drive capacity if this is a drive-related issue
|
||||
drive_size = ""
|
||||
if "Drive" in issue and "/dev/" in issue:
|
||||
@@ -1473,7 +1574,7 @@ class SystemHealthMonitor:
|
||||
f"{action_type['AUTO']}"
|
||||
f"{issue_tag}"
|
||||
f"{drive_size}"
|
||||
f"{issue}"
|
||||
f"{clean_issue}"
|
||||
f"{scope}"
|
||||
f"{environment['PRODUCTION']}"
|
||||
f"{ticket_type_tag}"
|
||||
@@ -1596,6 +1697,29 @@ class SystemHealthMonitor:
|
||||
if system_health.get('issues'):
|
||||
issues.extend(system_health['issues'])
|
||||
|
||||
# Check for Ceph cluster issues
|
||||
ceph_health = health_report.get('ceph_health', {})
|
||||
if ceph_health.get('is_ceph_node'):
|
||||
hostname = socket.gethostname()
|
||||
designated_node = self.CONFIG.get('CEPH_TICKET_NODE')
|
||||
|
||||
# Cluster-wide issues: only create tickets from designated node (or first node if not set)
|
||||
# The [cluster-wide] tag ensures deduplication in tinker_tickets API
|
||||
if ceph_health.get('cluster_wide_issues'):
|
||||
# If no designated node, all nodes can report (API deduplicates)
|
||||
# If designated node is set, only that node creates tickets
|
||||
if not designated_node or hostname == designated_node:
|
||||
for issue in ceph_health['cluster_wide_issues']:
|
||||
# Add [cluster-wide] marker for API deduplication
|
||||
issues.append(f"[cluster-wide] [ceph] {issue}")
|
||||
else:
|
||||
logger.debug(f"Skipping cluster-wide Ceph issues (designated node: {designated_node})")
|
||||
|
||||
# Node-specific issues: always report from the affected node
|
||||
if ceph_health.get('issues'):
|
||||
for issue in ceph_health['issues']:
|
||||
issues.append(f"[ceph] {issue}")
|
||||
|
||||
logger.info("=== Issue Detection Started ===")
|
||||
logger.info(f"Checking drives: {len(health_report['drives_health']['drives'])} found")
|
||||
logger.info(f"Memory status: {health_report['memory_health']['status']}")
|
||||
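# Hedged restatement of the gating above (the helper below is illustrative, not part
# of this file): cluster-wide Ceph issues are ticketed only from the designated node,
# or from every node (relying on API deduplication) when CEPH_TICKET_NODE is unset.
def _reports_cluster_issues(hostname, designated_node):
    return not designated_node or hostname == designated_node

assert _reports_cluster_issues("pve1", None)          # unset: every node may report
assert _reports_cluster_issues("pve1", "pve1")        # designated node reports
assert not _reports_cluster_issues("pve2", "pve1")    # other nodes skip cluster-wide tickets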
@@ -2669,7 +2793,425 @@ class SystemHealthMonitor:
|
||||
'status': 'ERROR',
|
||||
'error': str(e)
|
||||
}
|
||||
|
||||
|
||||
def _check_ceph_health(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Check Ceph cluster health if this node is part of a Ceph cluster.
|
||||
|
||||
Returns health status, cluster info, and any issues detected.
|
||||
Cluster-wide issues use [cluster-wide] tag for cross-node deduplication.
|
||||
"""
|
||||
import shutil
|
||||
|
||||
ceph_health = {
|
||||
'status': 'OK',
|
||||
'is_ceph_node': False,
|
||||
'cluster_health': None,
|
||||
'cluster_usage': None,
|
||||
'osd_status': [],
|
||||
'mon_status': [],
|
||||
'issues': [],
|
||||
'cluster_wide_issues': [] # Issues that apply to entire cluster
|
||||
}
|
||||
|
||||
# Check if Ceph monitoring is enabled
|
||||
if not self.CONFIG.get('CEPH_ENABLED', True):
|
||||
logger.debug("Ceph monitoring disabled in config")
|
||||
return ceph_health
|
||||
|
||||
# Check if ceph CLI is available
|
||||
if not shutil.which('ceph'):
|
||||
logger.debug("Ceph CLI not found - not a Ceph node")
|
||||
return ceph_health
|
||||
|
||||
ceph_health['is_ceph_node'] = True
|
||||
hostname = socket.gethostname()
|
||||
|
||||
try:
|
||||
# Get cluster health status
|
||||
health_result = subprocess.run(
|
||||
['ceph', 'health', '--format=json'],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
timeout=30
|
||||
)
|
||||
|
||||
if health_result.returncode == 0:
|
||||
try:
|
||||
health_data = json.loads(health_result.stdout)
|
||||
ceph_health['cluster_health'] = health_data.get('status', 'UNKNOWN')
|
||||
|
||||
# Check cluster health status
|
||||
if ceph_health['cluster_health'] == 'HEALTH_ERR':
|
||||
ceph_health['status'] = 'CRITICAL'
|
||||
# This is a cluster-wide issue
|
||||
ceph_health['cluster_wide_issues'].append(
|
||||
f"Ceph cluster HEALTH_ERR: {health_data.get('summary', {}).get('message', 'Unknown error')}"
|
||||
)
|
||||
elif ceph_health['cluster_health'] == 'HEALTH_WARN':
|
||||
if ceph_health['status'] != 'CRITICAL':
|
||||
ceph_health['status'] = 'WARNING'
|
||||
# Extract warning messages
|
||||
checks = health_data.get('checks', {})
|
||||
for check_name, check_data in checks.items():
|
||||
severity = check_data.get('severity', 'HEALTH_WARN')
|
||||
message = check_data.get('summary', {}).get('message', check_name)
|
||||
ceph_health['cluster_wide_issues'].append(
|
||||
f"Ceph HEALTH_WARN: {message}"
|
||||
)
|
||||
except json.JSONDecodeError as e:
|
||||
logger.warning(f"Failed to parse ceph health JSON: {e}")
|
||||
|
||||
# Get cluster usage (ceph df)
|
||||
df_result = subprocess.run(
|
||||
['ceph', 'df', '--format=json'],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
timeout=30
|
||||
)
|
||||
|
||||
if df_result.returncode == 0:
|
||||
try:
|
||||
df_data = json.loads(df_result.stdout)
|
||||
stats = df_data.get('stats', {})
|
||||
total_bytes = stats.get('total_bytes', 0)
|
||||
total_used = stats.get('total_used_raw_bytes', 0)
|
||||
|
||||
if total_bytes > 0:
|
||||
usage_percent = (total_used / total_bytes) * 100
|
||||
ceph_health['cluster_usage'] = {
|
||||
'total_bytes': total_bytes,
|
||||
'used_bytes': total_used,
|
||||
'usage_percent': round(usage_percent, 2)
|
||||
}
|
||||
|
||||
# Check usage thresholds
|
||||
if usage_percent >= self.CONFIG.get('CEPH_USAGE_CRITICAL', 85):
|
||||
ceph_health['status'] = 'CRITICAL'
|
||||
ceph_health['cluster_wide_issues'].append(
|
||||
f"Ceph cluster usage critical: {usage_percent:.1f}%"
|
||||
)
|
||||
elif usage_percent >= self.CONFIG.get('CEPH_USAGE_WARNING', 70):
|
||||
if ceph_health['status'] != 'CRITICAL':
|
||||
ceph_health['status'] = 'WARNING'
|
||||
ceph_health['cluster_wide_issues'].append(
|
||||
f"Ceph cluster usage warning: {usage_percent:.1f}%"
|
||||
)
|
||||
except json.JSONDecodeError as e:
|
||||
logger.warning(f"Failed to parse ceph df JSON: {e}")
|
||||
|
||||
# Get OSD status (check for down OSDs on this node)
|
||||
osd_result = subprocess.run(
|
||||
['ceph', 'osd', 'tree', '--format=json'],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
timeout=30
|
||||
)
|
||||
|
||||
if osd_result.returncode == 0:
|
||||
try:
|
||||
osd_data = json.loads(osd_result.stdout)
|
||||
nodes = osd_data.get('nodes', [])
|
||||
|
||||
# Find OSDs on this host
|
||||
host_id = None
|
||||
for node in nodes:
|
||||
if node.get('type') == 'host' and node.get('name') == hostname:
|
||||
host_id = node.get('id')
|
||||
break
|
||||
|
||||
# Check OSD status for this host
|
||||
for node in nodes:
|
||||
if node.get('type') == 'osd':
|
||||
osd_info = {
|
||||
'id': node.get('id'),
|
||||
'name': node.get('name'),
|
||||
'status': node.get('status', 'unknown'),
|
||||
'reweight': node.get('reweight', 1.0)
|
||||
}
|
||||
|
||||
# Check if OSD belongs to this host (by checking parent in tree)
|
||||
# Simplified: just track all OSDs for now
|
||||
ceph_health['osd_status'].append(osd_info)
|
||||
|
||||
# Check for down OSDs
|
||||
if node.get('status') == 'down':
|
||||
ceph_health['status'] = 'CRITICAL'
|
||||
# Node-specific issue (will include hostname in hash)
|
||||
ceph_health['issues'].append(
|
||||
f"Ceph OSD {node.get('name')} is DOWN on {hostname}"
|
||||
)
|
||||
except json.JSONDecodeError as e:
|
||||
logger.warning(f"Failed to parse ceph osd tree JSON: {e}")
|
||||
|
||||
# Get monitor status
|
||||
mon_result = subprocess.run(
|
||||
['ceph', 'mon', 'stat', '--format=json'],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
timeout=30
|
||||
)
|
||||
|
||||
if mon_result.returncode == 0:
|
||||
try:
|
||||
mon_data = json.loads(mon_result.stdout)
|
||||
ceph_health['mon_status'] = {
|
||||
'quorum': mon_data.get('quorum', []),
|
||||
'quorum_names': mon_data.get('quorum_names', [])
|
||||
}
|
||||
except json.JSONDecodeError as e:
|
||||
logger.warning(f"Failed to parse ceph mon stat JSON: {e}")
|
||||
|
||||
logger.debug(f"=== Ceph Health Check ===")
|
||||
logger.debug(f"Is Ceph node: {ceph_health['is_ceph_node']}")
|
||||
logger.debug(f"Cluster health: {ceph_health['cluster_health']}")
|
||||
logger.debug(f"Cluster usage: {ceph_health['cluster_usage']}")
|
||||
logger.debug(f"Status: {ceph_health['status']}")
|
||||
logger.debug(f"Issues: {ceph_health['issues']}")
|
||||
logger.debug(f"Cluster-wide issues: {ceph_health['cluster_wide_issues']}")
|
||||
logger.debug("=== End Ceph Health Check ===")
|
||||
|
||||
except subprocess.TimeoutExpired:
|
||||
ceph_health['status'] = 'ERROR'
|
||||
ceph_health['issues'].append("Ceph health check timed out")
|
||||
except Exception as e:
|
||||
ceph_health['status'] = 'ERROR'
|
||||
ceph_health['issues'].append(f"Error checking Ceph health: {str(e)}")
|
||||
logger.error(f"Ceph health check failed: {e}")
|
||||
|
||||
return ceph_health
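# For reference, a hedged sketch of the JSON shape parsed above ('status' plus
# per-check summaries); the exact payload varies with the Ceph release:
import json

_sample = json.loads(
    '{"status": "HEALTH_WARN",'
    ' "checks": {"OSDMAP_FLAGS": {"severity": "HEALTH_WARN",'
    ' "summary": {"message": "noout flag(s) set"}}}}'
)
print(_sample["status"])                                   # HEALTH_WARN
for _name, _check in _sample.get("checks", {}).items():
    print(_name, _check["summary"]["message"])             # OSDMAP_FLAGS noout flag(s) set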
|
||||
|
||||
# =============================================================================
|
||||
# PROMETHEUS METRICS EXPORT
|
||||
# =============================================================================
|
||||
def export_prometheus_metrics(self, health_report: Dict[str, Any]) -> str:
|
||||
"""
|
||||
Export health report as Prometheus metrics in text format.
|
||||
|
||||
Metrics follow Prometheus naming conventions:
|
||||
- hwmon_* prefix for all metrics
|
||||
- Labels for dimensions (device, hostname, container, etc.)
|
||||
|
||||
Returns:
|
||||
str: Prometheus text format metrics
|
||||
"""
|
||||
hostname = health_report.get('hostname', socket.gethostname())
|
||||
metrics = []
|
||||
|
||||
# Helper to format labels
|
||||
def labels(**kwargs) -> str:
|
||||
pairs = [f'{k}="{v}"' for k, v in kwargs.items() if v is not None]
|
||||
return '{' + ','.join(pairs) + '}' if pairs else ''
|
||||
|
||||
# === System Info ===
|
||||
metrics.append(f'# HELP hwmon_info System information')
|
||||
metrics.append(f'# TYPE hwmon_info gauge')
|
||||
metrics.append(f'hwmon_info{labels(hostname=hostname)} 1')
|
||||
|
||||
# === Drive Metrics ===
|
||||
metrics.append(f'# HELP hwmon_drive_smart_healthy SMART health status (1=healthy, 0=unhealthy)')
|
||||
metrics.append(f'# TYPE hwmon_drive_smart_healthy gauge')
|
||||
|
||||
metrics.append(f'# HELP hwmon_drive_temperature_celsius Drive temperature in Celsius')
|
||||
metrics.append(f'# TYPE hwmon_drive_temperature_celsius gauge')
|
||||
|
||||
metrics.append(f'# HELP hwmon_drive_size_bytes Drive total size in bytes')
|
||||
metrics.append(f'# TYPE hwmon_drive_size_bytes gauge')
|
||||
|
||||
metrics.append(f'# HELP hwmon_drive_smart_issues_total Number of SMART issues detected')
|
||||
metrics.append(f'# TYPE hwmon_drive_smart_issues_total gauge')
|
||||
|
||||
for drive in health_report.get('drives_health', {}).get('drives', []):
|
||||
device = drive.get('device', 'unknown')
|
||||
drive_labels = labels(hostname=hostname, device=device)
|
||||
|
||||
# SMART health status
|
||||
smart_status = drive.get('smart_status', 'UNKNOWN')
|
||||
healthy = 1 if smart_status == 'HEALTHY' else 0
|
||||
metrics.append(f'hwmon_drive_smart_healthy{drive_labels} {healthy}')
|
||||
|
||||
# Temperature
|
||||
if drive.get('temperature'):
|
||||
metrics.append(f'hwmon_drive_temperature_celsius{drive_labels} {drive["temperature"]}')
|
||||
|
||||
# Drive size (convert human-readable to bytes if possible)
|
||||
if drive.get('capacity'):
|
||||
capacity_bytes = self._parse_size_to_bytes(drive['capacity'])
|
||||
if capacity_bytes:
|
||||
metrics.append(f'hwmon_drive_size_bytes{drive_labels} {capacity_bytes}')
|
||||
|
||||
# Issue count
|
||||
issues_count = len(drive.get('smart_issues', []))
|
||||
metrics.append(f'hwmon_drive_smart_issues_total{drive_labels} {issues_count}')
|
||||
|
||||
# === CPU Metrics ===
|
||||
cpu = health_report.get('cpu_health', {})
|
||||
metrics.append(f'# HELP hwmon_cpu_usage_percent CPU usage percentage')
|
||||
metrics.append(f'# TYPE hwmon_cpu_usage_percent gauge')
|
||||
if cpu.get('cpu_usage_percent') is not None:
|
||||
metrics.append(f'hwmon_cpu_usage_percent{labels(hostname=hostname)} {cpu["cpu_usage_percent"]}')
|
||||
|
||||
# === Memory Metrics ===
|
||||
mem = health_report.get('memory_health', {})
|
||||
metrics.append(f'# HELP hwmon_memory_usage_percent Memory usage percentage')
|
||||
metrics.append(f'# TYPE hwmon_memory_usage_percent gauge')
|
||||
if mem.get('memory_percent') is not None:
|
||||
metrics.append(f'hwmon_memory_usage_percent{labels(hostname=hostname)} {mem["memory_percent"]}')
|
||||
|
||||
metrics.append(f'# HELP hwmon_memory_has_ecc Whether ECC memory is present (1=yes, 0=no)')
|
||||
metrics.append(f'# TYPE hwmon_memory_has_ecc gauge')
|
||||
has_ecc = 1 if mem.get('has_ecc') else 0
|
||||
metrics.append(f'hwmon_memory_has_ecc{labels(hostname=hostname)} {has_ecc}')
|
||||
|
||||
if mem.get('has_ecc'):
|
||||
metrics.append(f'# HELP hwmon_memory_ecc_errors_total Total ECC errors detected')
|
||||
metrics.append(f'# TYPE hwmon_memory_ecc_errors_total gauge')
|
||||
ecc_errors = len(mem.get('ecc_errors', []))
|
||||
metrics.append(f'hwmon_memory_ecc_errors_total{labels(hostname=hostname)} {ecc_errors}')
|
||||
|
||||
# === Network Metrics ===
|
||||
net = health_report.get('network_health', {})
|
||||
metrics.append(f'# HELP hwmon_network_status Network status (1=OK, 0=issue)')
|
||||
metrics.append(f'# TYPE hwmon_network_status gauge')
|
||||
|
||||
for net_type in ['management_network', 'ceph_network']:
|
||||
net_info = net.get(net_type, {})
|
||||
status = 1 if net_info.get('status') == 'OK' else 0
|
||||
net_name = net_type.replace('_network', '')
|
||||
metrics.append(f'hwmon_network_status{labels(hostname=hostname, network=net_name)} {status}')
|
||||
|
||||
# === Ceph Metrics ===
|
||||
ceph = health_report.get('ceph_health', {})
|
||||
if ceph.get('is_ceph_node'):
|
||||
metrics.append(f'# HELP hwmon_ceph_cluster_healthy Ceph cluster health (1=healthy, 0=warning/error)')
|
||||
metrics.append(f'# TYPE hwmon_ceph_cluster_healthy gauge')
|
||||
ceph_healthy = 1 if ceph.get('cluster_health') == 'HEALTH_OK' else 0
|
||||
metrics.append(f'hwmon_ceph_cluster_healthy{labels(hostname=hostname)} {ceph_healthy}')
|
||||
|
||||
if ceph.get('cluster_usage'):
|
||||
usage = ceph['cluster_usage']
|
||||
metrics.append(f'# HELP hwmon_ceph_cluster_usage_percent Ceph cluster usage percentage')
|
||||
metrics.append(f'# TYPE hwmon_ceph_cluster_usage_percent gauge')
|
||||
metrics.append(f'hwmon_ceph_cluster_usage_percent{labels(hostname=hostname)} {usage.get("usage_percent", 0)}')
|
||||
|
||||
metrics.append(f'# HELP hwmon_ceph_cluster_bytes_total Ceph cluster total bytes')
|
||||
metrics.append(f'# TYPE hwmon_ceph_cluster_bytes_total gauge')
|
||||
metrics.append(f'hwmon_ceph_cluster_bytes_total{labels(hostname=hostname)} {usage.get("total_bytes", 0)}')
|
||||
|
||||
metrics.append(f'# HELP hwmon_ceph_cluster_bytes_used Ceph cluster used bytes')
|
||||
metrics.append(f'# TYPE hwmon_ceph_cluster_bytes_used gauge')
|
||||
metrics.append(f'hwmon_ceph_cluster_bytes_used{labels(hostname=hostname)} {usage.get("used_bytes", 0)}')
|
||||
|
||||
metrics.append(f'# HELP hwmon_ceph_osd_total Total number of OSDs')
|
||||
metrics.append(f'# TYPE hwmon_ceph_osd_total gauge')
|
||||
osd_count = len(ceph.get('osd_status', []))
|
||||
metrics.append(f'hwmon_ceph_osd_total{labels(hostname=hostname)} {osd_count}')
|
||||
|
||||
metrics.append(f'# HELP hwmon_ceph_osd_down Number of down OSDs')
|
||||
metrics.append(f'# TYPE hwmon_ceph_osd_down gauge')
|
||||
down_osds = len([o for o in ceph.get('osd_status', []) if o.get('status') == 'down'])
|
||||
metrics.append(f'hwmon_ceph_osd_down{labels(hostname=hostname)} {down_osds}')
|
||||
|
||||
# === LXC Metrics ===
|
||||
lxc = health_report.get('lxc_health', {})
|
||||
if lxc.get('containers'):
|
||||
metrics.append(f'# HELP hwmon_lxc_storage_usage_percent LXC container storage usage percentage')
|
||||
metrics.append(f'# TYPE hwmon_lxc_storage_usage_percent gauge')
|
||||
|
||||
for container in lxc['containers']:
|
||||
vmid = container.get('vmid', 'unknown')
|
||||
for fs in container.get('filesystems', []):
|
||||
mountpoint = fs.get('mountpoint', '/')
|
||||
usage = fs.get('usage_percent', 0)
|
||||
metrics.append(f'hwmon_lxc_storage_usage_percent{labels(hostname=hostname, vmid=vmid, mountpoint=mountpoint)} {usage}')
|
||||
|
||||
# === Issue Summary Metrics ===
|
||||
metrics.append(f'# HELP hwmon_issues_total Total number of issues detected')
|
||||
metrics.append(f'# TYPE hwmon_issues_total gauge')
|
||||
|
||||
system_issues = len(health_report.get('system_health', {}).get('issues', []))
|
||||
ceph_issues = len(ceph.get('issues', [])) + len(ceph.get('cluster_wide_issues', []))
|
||||
lxc_issues = len(lxc.get('issues', []))
|
||||
total_issues = system_issues + ceph_issues + lxc_issues
|
||||
metrics.append(f'hwmon_issues_total{labels(hostname=hostname)} {total_issues}')
|
||||
|
||||
return '\n'.join(metrics) + '\n'
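# The string returned above is Prometheus exposition-format text; illustrative
# sample (hostnames, devices and values are made up):
#   # HELP hwmon_drive_temperature_celsius Drive temperature in Celsius
#   # TYPE hwmon_drive_temperature_celsius gauge
#   hwmon_drive_temperature_celsius{hostname="pve1",device="/dev/sda"} 34
#   hwmon_ceph_cluster_usage_percent{hostname="pve1"} 42.7
#   hwmon_issues_total{hostname="pve1"} 0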
|
||||
|
||||
def _parse_size_to_bytes(self, size_str: str) -> int:
|
||||
"""Parse human-readable size string to bytes."""
|
||||
if not size_str:
|
||||
return 0
|
||||
|
||||
size_str = size_str.strip().upper()
|
||||
multipliers = {
|
||||
'B': 1,
|
||||
'KB': 1024,
|
||||
'MB': 1024**2,
|
||||
'GB': 1024**3,
|
||||
'TB': 1024**4,
|
||||
'PB': 1024**5,
|
||||
'K': 1024,
|
||||
'M': 1024**2,
|
||||
'G': 1024**3,
|
||||
'T': 1024**4,
|
||||
'P': 1024**5
|
||||
}
|
||||
|
||||
try:
|
||||
for suffix, mult in sorted(multipliers.items(), key=lambda x: -len(x[0])):
|
||||
if size_str.endswith(suffix):
|
||||
num = float(size_str[:-len(suffix)].strip())
|
||||
return int(num * mult)
|
||||
return int(float(size_str))
|
||||
except (ValueError, TypeError):
|
||||
return 0
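# Hedged examples of what _parse_size_to_bytes yields (binary multipliers per the
# table above; values worked by hand):
#   _parse_size_to_bytes("500GB") -> 536870912000   (500 * 1024**3)
#   _parse_size_to_bytes("1.8T")  -> 1979120929996  (int(1.8 * 1024**4))
#   _parse_size_to_bytes("n/a")   -> 0              (unparseable input falls back to 0)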
|
||||
|
||||
def write_prometheus_metrics(self, health_report: Dict[str, Any]) -> bool:
|
||||
"""
|
||||
Write Prometheus metrics to configured destination.
|
||||
|
||||
If PROMETHEUS_TEXTFILE_PATH is set, writes to that file for node_exporter.
|
||||
Otherwise, logs the metrics (for debugging or other use).
|
||||
|
||||
Returns:
|
||||
bool: True if metrics were written successfully
|
||||
"""
|
||||
if not self.CONFIG.get('PROMETHEUS_ENABLED', False):
|
||||
return False
|
||||
|
||||
try:
|
||||
metrics = self.export_prometheus_metrics(health_report)
|
||||
textfile_path = self.CONFIG.get('PROMETHEUS_TEXTFILE_PATH')
|
||||
|
||||
if textfile_path:
|
||||
# Write to textfile for node_exporter textfile collector
|
||||
# Write to temp file first, then atomic rename
|
||||
import tempfile
|
||||
temp_fd, temp_path = tempfile.mkstemp(
|
||||
dir=os.path.dirname(textfile_path),
|
||||
prefix='.hwmon_metrics_'
|
||||
)
|
||||
try:
|
||||
with os.fdopen(temp_fd, 'w') as f:
|
||||
f.write(metrics)
|
||||
os.rename(temp_path, textfile_path)
|
||||
logger.info(f"Prometheus metrics written to {textfile_path}")
|
||||
except Exception:
|
||||
os.unlink(temp_path)
|
||||
raise
|
||||
else:
|
||||
# Just log metrics (for debugging)
|
||||
logger.debug("Prometheus metrics generated:\n" + metrics)
|
||||
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to write Prometheus metrics: {e}")
|
||||
return False
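# Deployment assumption (not configured by this commit): PROMETHEUS_TEXTFILE_PATH
# should point inside node_exporter's textfile collector directory and end in .prom,
# e.g. node_exporter --collector.textfile.directory=/var/lib/node_exporter/textfile_collector
# with PROMETHEUS_TEXTFILE_PATH=/var/lib/node_exporter/textfile_collector/hwmon.prom,
# so the metrics written here are scraped alongside the standard node metrics.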
|
||||
|
||||
def _check_lxc_storage(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Check storage utilization for all running LXC containers
|
||||
@@ -2802,13 +3344,37 @@ def main():
|
||||
action="store_true",
|
||||
help="Enable dry-run mode (simulate ticket creation without actual API calls)."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--metrics",
|
||||
action="store_true",
|
||||
help="Output Prometheus metrics to stdout and exit."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--export-json",
|
||||
type=str,
|
||||
metavar="FILE",
|
||||
help="Export health report to JSON file."
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
monitor = SystemHealthMonitor(
|
||||
ticket_api_url=SystemHealthMonitor.CONFIG['TICKET_API_URL'],
|
||||
dry_run=args.dry_run
|
||||
)
|
||||
- monitor.run()
|
||||
|
||||
if args.metrics:
|
||||
# Just output metrics to stdout
|
||||
health_report = monitor.perform_health_checks()
|
||||
print(monitor.export_prometheus_metrics(health_report))
|
||||
elif args.export_json:
|
||||
# Export health report as JSON
|
||||
import json
|
||||
health_report = monitor.perform_health_checks()
|
||||
with open(args.export_json, 'w') as f:
|
||||
json.dump(health_report, f, indent=2, default=str)
|
||||
logger.info(f"Health report exported to {args.export_json}")
|
||||
else:
|
||||
monitor.run()
|
||||
|
||||
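# Illustrative invocations of the new flags (paths are examples):
#   python3 hwmonDaemon.py --metrics                        # print Prometheus metrics to stdout and exit
#   python3 hwmonDaemon.py --export-json /tmp/health.json   # write the health report as JSON and exit
#   python3 hwmonDaemon.py --dry-run                        # normal run with ticket creation simulated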
if __name__ == "__main__":
|
||||
main()