Add Ceph cluster monitoring and Prometheus metrics export
- Add comprehensive Ceph cluster health monitoring - Check cluster health status (HEALTH_OK/WARN/ERR) - Monitor cluster usage with configurable thresholds - Track OSD status (up/down) per node - Separate cluster-wide vs node-specific issues - Cluster-wide ticket deduplication - Add [cluster-wide] scope tag for Ceph issues - Cluster-wide issues deduplicate across all nodes - Node-specific issues (OSD down) include hostname - Add Prometheus metrics export - export_prometheus_metrics() method - write_prometheus_metrics() for textfile collector - --metrics CLI flag to output metrics to stdout - --export-json CLI flag to export health report as JSON - Add Grafana dashboard template (grafana-dashboard.json) - Add .gitignore Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
375
grafana-dashboard.json
Normal file
375
grafana-dashboard.json
Normal file
@@ -0,0 +1,375 @@
|
||||
{
|
||||
"__inputs": [
|
||||
{
|
||||
"name": "DS_PROMETHEUS",
|
||||
"label": "Prometheus",
|
||||
"description": "Prometheus data source for hwmonDaemon metrics",
|
||||
"type": "datasource",
|
||||
"pluginId": "prometheus",
|
||||
"pluginName": "Prometheus"
|
||||
}
|
||||
],
|
||||
"__elements": {},
|
||||
"__requires": [
|
||||
{
|
||||
"type": "grafana",
|
||||
"id": "grafana",
|
||||
"name": "Grafana",
|
||||
"version": "10.0.0"
|
||||
},
|
||||
{
|
||||
"type": "datasource",
|
||||
"id": "prometheus",
|
||||
"name": "Prometheus",
|
||||
"version": "1.0.0"
|
||||
},
|
||||
{
|
||||
"type": "panel",
|
||||
"id": "gauge",
|
||||
"name": "Gauge",
|
||||
"version": ""
|
||||
},
|
||||
{
|
||||
"type": "panel",
|
||||
"id": "stat",
|
||||
"name": "Stat",
|
||||
"version": ""
|
||||
},
|
||||
{
|
||||
"type": "panel",
|
||||
"id": "table",
|
||||
"name": "Table",
|
||||
"version": ""
|
||||
},
|
||||
{
|
||||
"type": "panel",
|
||||
"id": "timeseries",
|
||||
"name": "Time series",
|
||||
"version": ""
|
||||
}
|
||||
],
|
||||
"annotations": {
|
||||
"list": []
|
||||
},
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 0,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"liveNow": false,
|
||||
"panels": [
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 0},
|
||||
"id": 1,
|
||||
"panels": [],
|
||||
"title": "Overview",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "thresholds"},
|
||||
"mappings": [
|
||||
{"options": {"0": {"color": "red", "index": 0, "text": "Issues Detected"}}, "type": "value"},
|
||||
{"options": {"from": 1, "result": {"color": "green", "index": 1, "text": "Healthy"}, "to": 999999}, "type": "range"}
|
||||
],
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}, {"color": "green", "value": 1}]}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {"h": 4, "w": 4, "x": 0, "y": 1},
|
||||
"id": 2,
|
||||
"options": {"colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "textMode": "auto"},
|
||||
"pluginVersion": "10.0.0",
|
||||
"targets": [{"expr": "count(hwmon_info)", "legendFormat": "Hosts", "refId": "A"}],
|
||||
"title": "Monitored Hosts",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "thresholds"},
|
||||
"mappings": [],
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 1}, {"color": "red", "value": 3}]}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {"h": 4, "w": 4, "x": 4, "y": 1},
|
||||
"id": 3,
|
||||
"options": {"colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "textMode": "auto"},
|
||||
"pluginVersion": "10.0.0",
|
||||
"targets": [{"expr": "sum(hwmon_issues_total)", "legendFormat": "Issues", "refId": "A"}],
|
||||
"title": "Total Issues",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "thresholds"},
|
||||
"mappings": [],
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 1}, {"color": "red", "value": 2}]}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {"h": 4, "w": 4, "x": 8, "y": 1},
|
||||
"id": 4,
|
||||
"options": {"colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "textMode": "auto"},
|
||||
"pluginVersion": "10.0.0",
|
||||
"targets": [{"expr": "count(hwmon_drive_smart_healthy == 0)", "legendFormat": "Unhealthy", "refId": "A"}],
|
||||
"title": "Unhealthy Drives",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "thresholds"},
|
||||
"mappings": [
|
||||
{"options": {"0": {"color": "red", "index": 0, "text": "Unhealthy"}}, "type": "value"},
|
||||
{"options": {"1": {"color": "green", "index": 1, "text": "Healthy"}}, "type": "value"}
|
||||
],
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}, {"color": "green", "value": 1}]}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {"h": 4, "w": 4, "x": 12, "y": 1},
|
||||
"id": 5,
|
||||
"options": {"colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "textMode": "auto"},
|
||||
"pluginVersion": "10.0.0",
|
||||
"targets": [{"expr": "min(hwmon_ceph_cluster_healthy)", "legendFormat": "Ceph", "refId": "A"}],
|
||||
"title": "Ceph Cluster Health",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "thresholds"},
|
||||
"mappings": [],
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 1}, {"color": "red", "value": 2}]}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {"h": 4, "w": 4, "x": 16, "y": 1},
|
||||
"id": 6,
|
||||
"options": {"colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "textMode": "auto"},
|
||||
"pluginVersion": "10.0.0",
|
||||
"targets": [{"expr": "sum(hwmon_ceph_osd_down)", "legendFormat": "Down OSDs", "refId": "A"}],
|
||||
"title": "Ceph OSDs Down",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 5},
|
||||
"id": 10,
|
||||
"panels": [],
|
||||
"title": "Drive Health",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "palette-classic"},
|
||||
"custom": {"axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": {"legend": false, "tooltip": false, "viz": false}, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": {"type": "linear"}, "showPoints": "never", "spanNulls": false, "stacking": {"group": "A", "mode": "none"}, "thresholdsStyle": {"mode": "line"}},
|
||||
"mappings": [],
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 45}, {"color": "red", "value": 55}]},
|
||||
"unit": "celsius"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 6},
|
||||
"id": 11,
|
||||
"options": {"legend": {"calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "right", "showLegend": true}, "tooltip": {"mode": "multi", "sort": "desc"}},
|
||||
"pluginVersion": "10.0.0",
|
||||
"targets": [{"expr": "hwmon_drive_temperature_celsius", "legendFormat": "{{hostname}} {{device}}", "refId": "A"}],
|
||||
"title": "Drive Temperatures",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "thresholds"},
|
||||
"custom": {"align": "auto", "cellOptions": {"type": "color-text"}, "inspect": false},
|
||||
"mappings": [
|
||||
{"options": {"0": {"color": "red", "index": 0, "text": "UNHEALTHY"}}, "type": "value"},
|
||||
{"options": {"1": {"color": "green", "index": 1, "text": "HEALTHY"}}, "type": "value"}
|
||||
],
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}, {"color": "green", "value": 1}]}
|
||||
},
|
||||
"overrides": [
|
||||
{"matcher": {"id": "byName", "options": "hostname"}, "properties": [{"id": "custom.width", "value": 120}]},
|
||||
{"matcher": {"id": "byName", "options": "device"}, "properties": [{"id": "custom.width", "value": 100}]},
|
||||
{"matcher": {"id": "byName", "options": "Status"}, "properties": [{"id": "custom.width", "value": 100}]},
|
||||
{"matcher": {"id": "byName", "options": "Issues"}, "properties": [{"id": "custom.cellOptions", "value": {"mode": "gradient", "type": "gauge"}}, {"id": "thresholds", "value": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 1}, {"color": "red", "value": 3}]}}]}
|
||||
]
|
||||
},
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 6},
|
||||
"id": 12,
|
||||
"options": {"cellHeight": "sm", "footer": {"countRows": false, "fields": "", "reducer": ["sum"], "show": false}, "showHeader": true, "sortBy": [{"desc": true, "displayName": "Issues"}]},
|
||||
"pluginVersion": "10.0.0",
|
||||
"targets": [
|
||||
{"expr": "hwmon_drive_smart_healthy", "format": "table", "instant": true, "legendFormat": "", "refId": "A"},
|
||||
{"expr": "hwmon_drive_smart_issues_total", "format": "table", "instant": true, "legendFormat": "", "refId": "B"}
|
||||
],
|
||||
"title": "Drive Status",
|
||||
"transformations": [
|
||||
{"id": "merge", "options": {}},
|
||||
{"id": "organize", "options": {"excludeByName": {"Time": true, "__name__": true}, "indexByName": {"Value #A": 2, "Value #B": 3, "device": 1, "hostname": 0}, "renameByName": {"Value #A": "Status", "Value #B": "Issues", "device": "Device", "hostname": "Host"}}}
|
||||
],
|
||||
"type": "table"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 14},
|
||||
"id": 20,
|
||||
"panels": [],
|
||||
"title": "System Resources",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "palette-classic"},
|
||||
"custom": {"axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": {"legend": false, "tooltip": false, "viz": false}, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": {"type": "linear"}, "showPoints": "never", "spanNulls": false, "stacking": {"group": "A", "mode": "none"}, "thresholdsStyle": {"mode": "line"}},
|
||||
"mappings": [],
|
||||
"max": 100,
|
||||
"min": 0,
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 80}, {"color": "red", "value": 95}]},
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 15},
|
||||
"id": 21,
|
||||
"options": {"legend": {"calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "right", "showLegend": true}, "tooltip": {"mode": "multi", "sort": "desc"}},
|
||||
"pluginVersion": "10.0.0",
|
||||
"targets": [{"expr": "hwmon_cpu_usage_percent", "legendFormat": "{{hostname}}", "refId": "A"}],
|
||||
"title": "CPU Usage",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "palette-classic"},
|
||||
"custom": {"axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": {"legend": false, "tooltip": false, "viz": false}, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": {"type": "linear"}, "showPoints": "never", "spanNulls": false, "stacking": {"group": "A", "mode": "none"}, "thresholdsStyle": {"mode": "line"}},
|
||||
"mappings": [],
|
||||
"max": 100,
|
||||
"min": 0,
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 80}, {"color": "red", "value": 95}]},
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 15},
|
||||
"id": 22,
|
||||
"options": {"legend": {"calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "right", "showLegend": true}, "tooltip": {"mode": "multi", "sort": "desc"}},
|
||||
"pluginVersion": "10.0.0",
|
||||
"targets": [{"expr": "hwmon_memory_usage_percent", "legendFormat": "{{hostname}}", "refId": "A"}],
|
||||
"title": "Memory Usage",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 23},
|
||||
"id": 30,
|
||||
"panels": [],
|
||||
"title": "Ceph Cluster",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "thresholds"},
|
||||
"mappings": [],
|
||||
"max": 100,
|
||||
"min": 0,
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 70}, {"color": "red", "value": 85}]},
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {"h": 6, "w": 6, "x": 0, "y": 24},
|
||||
"id": 31,
|
||||
"options": {"orientation": "auto", "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "showThresholdLabels": false, "showThresholdMarkers": true},
|
||||
"pluginVersion": "10.0.0",
|
||||
"targets": [{"expr": "max(hwmon_ceph_cluster_usage_percent)", "legendFormat": "Usage", "refId": "A"}],
|
||||
"title": "Ceph Cluster Usage",
|
||||
"type": "gauge"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "palette-classic"},
|
||||
"custom": {"axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": {"legend": false, "tooltip": false, "viz": false}, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": {"type": "linear"}, "showPoints": "never", "spanNulls": false, "stacking": {"group": "A", "mode": "none"}, "thresholdsStyle": {"mode": "line"}},
|
||||
"mappings": [],
|
||||
"max": 100,
|
||||
"min": 0,
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 70}, {"color": "red", "value": 85}]},
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {"h": 6, "w": 18, "x": 6, "y": 24},
|
||||
"id": 32,
|
||||
"options": {"legend": {"calcs": ["lastNotNull"], "displayMode": "list", "placement": "right", "showLegend": true}, "tooltip": {"mode": "single", "sort": "none"}},
|
||||
"pluginVersion": "10.0.0",
|
||||
"targets": [{"expr": "hwmon_ceph_cluster_usage_percent", "legendFormat": "{{hostname}}", "refId": "A"}],
|
||||
"title": "Ceph Usage Over Time",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 30},
|
||||
"id": 40,
|
||||
"panels": [],
|
||||
"title": "LXC Containers",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "thresholds"},
|
||||
"custom": {"align": "auto", "cellOptions": {"type": "auto"}, "inspect": false},
|
||||
"mappings": [],
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 80}, {"color": "red", "value": 95}]}
|
||||
},
|
||||
"overrides": [
|
||||
{"matcher": {"id": "byName", "options": "Usage"}, "properties": [{"id": "custom.cellOptions", "value": {"mode": "gradient", "type": "gauge"}}, {"id": "unit", "value": "percent"}, {"id": "max", "value": 100}, {"id": "min", "value": 0}]}
|
||||
]
|
||||
},
|
||||
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 31},
|
||||
"id": 41,
|
||||
"options": {"cellHeight": "sm", "footer": {"countRows": false, "fields": "", "reducer": ["sum"], "show": false}, "showHeader": true, "sortBy": [{"desc": true, "displayName": "Usage"}]},
|
||||
"pluginVersion": "10.0.0",
|
||||
"targets": [{"expr": "hwmon_lxc_storage_usage_percent", "format": "table", "instant": true, "legendFormat": "", "refId": "A"}],
|
||||
"title": "LXC Storage Usage",
|
||||
"transformations": [
|
||||
{"id": "organize", "options": {"excludeByName": {"Time": true, "__name__": true}, "indexByName": {}, "renameByName": {"Value": "Usage", "hostname": "Host", "mountpoint": "Mountpoint", "vmid": "Container ID"}}}
|
||||
],
|
||||
"type": "table"
|
||||
}
|
||||
],
|
||||
"refresh": "1m",
|
||||
"schemaVersion": 38,
|
||||
"style": "dark",
|
||||
"tags": ["hwmon", "hardware", "monitoring", "proxmox"],
|
||||
"templating": {"list": []},
|
||||
"time": {"from": "now-6h", "to": "now"},
|
||||
"timepicker": {},
|
||||
"timezone": "browser",
|
||||
"title": "hwmonDaemon - Hardware Monitor",
|
||||
"uid": "hwmondaemon",
|
||||
"version": 1,
|
||||
"weekStart": ""
|
||||
}
|
||||
Reference in New Issue
Block a user