From 0f8918fb8baceb028d18e3225d0945467d522f87 Mon Sep 17 00:00:00 2001 From: Jared Vititoe Date: Sat, 17 Jan 2026 15:54:16 -0500 Subject: [PATCH] Add Ceph cluster monitoring and Prometheus metrics export - Add comprehensive Ceph cluster health monitoring - Check cluster health status (HEALTH_OK/WARN/ERR) - Monitor cluster usage with configurable thresholds - Track OSD status (up/down) per node - Separate cluster-wide vs node-specific issues - Cluster-wide ticket deduplication - Add [cluster-wide] scope tag for Ceph issues - Cluster-wide issues deduplicate across all nodes - Node-specific issues (OSD down) include hostname - Add Prometheus metrics export - export_prometheus_metrics() method - write_prometheus_metrics() for textfile collector - --metrics CLI flag to output metrics to stdout - --export-json CLI flag to export health report as JSON - Add Grafana dashboard template (grafana-dashboard.json) - Add .gitignore Co-Authored-By: Claude Opus 4.5 --- .gitignore | 2 + grafana-dashboard.json | 375 ++++++++++++++++++++++++++ hwmonDaemon.py | 592 ++++++++++++++++++++++++++++++++++++++++- 3 files changed, 956 insertions(+), 13 deletions(-) create mode 100644 .gitignore create mode 100644 grafana-dashboard.json diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9c68ef2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.claude +settings.local.json \ No newline at end of file diff --git a/grafana-dashboard.json b/grafana-dashboard.json new file mode 100644 index 0000000..e4085f8 --- /dev/null +++ b/grafana-dashboard.json @@ -0,0 +1,375 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "Prometheus data source for hwmonDaemon metrics", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "10.0.0" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "gauge", + "name": "Gauge", + "version": "" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + }, + { + "type": "panel", + "id": "table", + "name": "Table", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}, + "id": 1, + "panels": [], + "title": "Overview", + "type": "row" + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "fieldConfig": { + "defaults": { + "color": {"mode": "thresholds"}, + "mappings": [ + {"options": {"0": {"color": "red", "index": 0, "text": "Issues Detected"}}, "type": "value"}, + {"options": {"from": 1, "result": {"color": "green", "index": 1, "text": "Healthy"}, "to": 999999}, "type": "range"} + ], + "thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}, {"color": "green", "value": 1}]} + }, + "overrides": [] + }, + "gridPos": {"h": 4, "w": 4, "x": 0, "y": 1}, + "id": 2, + "options": {"colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "textMode": "auto"}, + "pluginVersion": "10.0.0", + "targets": [{"expr": "count(hwmon_info)", "legendFormat": 
"Hosts", "refId": "A"}], + "title": "Monitored Hosts", + "type": "stat" + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "fieldConfig": { + "defaults": { + "color": {"mode": "thresholds"}, + "mappings": [], + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 1}, {"color": "red", "value": 3}]} + }, + "overrides": [] + }, + "gridPos": {"h": 4, "w": 4, "x": 4, "y": 1}, + "id": 3, + "options": {"colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "textMode": "auto"}, + "pluginVersion": "10.0.0", + "targets": [{"expr": "sum(hwmon_issues_total)", "legendFormat": "Issues", "refId": "A"}], + "title": "Total Issues", + "type": "stat" + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "fieldConfig": { + "defaults": { + "color": {"mode": "thresholds"}, + "mappings": [], + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 1}, {"color": "red", "value": 2}]} + }, + "overrides": [] + }, + "gridPos": {"h": 4, "w": 4, "x": 8, "y": 1}, + "id": 4, + "options": {"colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "textMode": "auto"}, + "pluginVersion": "10.0.0", + "targets": [{"expr": "count(hwmon_drive_smart_healthy == 0)", "legendFormat": "Unhealthy", "refId": "A"}], + "title": "Unhealthy Drives", + "type": "stat" + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "fieldConfig": { + "defaults": { + "color": {"mode": "thresholds"}, + "mappings": [ + {"options": {"0": {"color": "red", "index": 0, "text": "Unhealthy"}}, "type": "value"}, + {"options": {"1": {"color": "green", "index": 1, "text": "Healthy"}}, "type": "value"} + ], + "thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}, {"color": "green", "value": 1}]} + }, + "overrides": [] + }, + "gridPos": {"h": 4, "w": 4, "x": 12, "y": 1}, + "id": 5, + "options": {"colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "textMode": "auto"}, + "pluginVersion": "10.0.0", + "targets": [{"expr": "min(hwmon_ceph_cluster_healthy)", "legendFormat": "Ceph", "refId": "A"}], + "title": "Ceph Cluster Health", + "type": "stat" + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "fieldConfig": { + "defaults": { + "color": {"mode": "thresholds"}, + "mappings": [], + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 1}, {"color": "red", "value": 2}]} + }, + "overrides": [] + }, + "gridPos": {"h": 4, "w": 4, "x": 16, "y": 1}, + "id": 6, + "options": {"colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "textMode": "auto"}, + "pluginVersion": "10.0.0", + "targets": [{"expr": "sum(hwmon_ceph_osd_down)", "legendFormat": "Down OSDs", "refId": "A"}], + "title": "Ceph OSDs Down", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 5}, + "id": 10, + "panels": [], + "title": "Drive Health", + "type": "row" + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "fieldConfig": { + 
"defaults": { + "color": {"mode": "palette-classic"}, + "custom": {"axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": {"legend": false, "tooltip": false, "viz": false}, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": {"type": "linear"}, "showPoints": "never", "spanNulls": false, "stacking": {"group": "A", "mode": "none"}, "thresholdsStyle": {"mode": "line"}}, + "mappings": [], + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 45}, {"color": "red", "value": 55}]}, + "unit": "celsius" + }, + "overrides": [] + }, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 6}, + "id": 11, + "options": {"legend": {"calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "right", "showLegend": true}, "tooltip": {"mode": "multi", "sort": "desc"}}, + "pluginVersion": "10.0.0", + "targets": [{"expr": "hwmon_drive_temperature_celsius", "legendFormat": "{{hostname}} {{device}}", "refId": "A"}], + "title": "Drive Temperatures", + "type": "timeseries" + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "fieldConfig": { + "defaults": { + "color": {"mode": "thresholds"}, + "custom": {"align": "auto", "cellOptions": {"type": "color-text"}, "inspect": false}, + "mappings": [ + {"options": {"0": {"color": "red", "index": 0, "text": "UNHEALTHY"}}, "type": "value"}, + {"options": {"1": {"color": "green", "index": 1, "text": "HEALTHY"}}, "type": "value"} + ], + "thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}, {"color": "green", "value": 1}]} + }, + "overrides": [ + {"matcher": {"id": "byName", "options": "hostname"}, "properties": [{"id": "custom.width", "value": 120}]}, + {"matcher": {"id": "byName", "options": "device"}, "properties": [{"id": "custom.width", "value": 100}]}, + {"matcher": {"id": "byName", "options": "Status"}, "properties": [{"id": "custom.width", "value": 100}]}, + {"matcher": {"id": "byName", "options": "Issues"}, "properties": [{"id": "custom.cellOptions", "value": {"mode": "gradient", "type": "gauge"}}, {"id": "thresholds", "value": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 1}, {"color": "red", "value": 3}]}}]} + ] + }, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 6}, + "id": 12, + "options": {"cellHeight": "sm", "footer": {"countRows": false, "fields": "", "reducer": ["sum"], "show": false}, "showHeader": true, "sortBy": [{"desc": true, "displayName": "Issues"}]}, + "pluginVersion": "10.0.0", + "targets": [ + {"expr": "hwmon_drive_smart_healthy", "format": "table", "instant": true, "legendFormat": "", "refId": "A"}, + {"expr": "hwmon_drive_smart_issues_total", "format": "table", "instant": true, "legendFormat": "", "refId": "B"} + ], + "title": "Drive Status", + "transformations": [ + {"id": "merge", "options": {}}, + {"id": "organize", "options": {"excludeByName": {"Time": true, "__name__": true}, "indexByName": {"Value #A": 2, "Value #B": 3, "device": 1, "hostname": 0}, "renameByName": {"Value #A": "Status", "Value #B": "Issues", "device": "Device", "hostname": "Host"}}} + ], + "type": "table" + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 14}, + "id": 20, + "panels": [], + "title": "System Resources", + "type": "row" + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "fieldConfig": { + 
"defaults": { + "color": {"mode": "palette-classic"}, + "custom": {"axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": {"legend": false, "tooltip": false, "viz": false}, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": {"type": "linear"}, "showPoints": "never", "spanNulls": false, "stacking": {"group": "A", "mode": "none"}, "thresholdsStyle": {"mode": "line"}}, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 80}, {"color": "red", "value": 95}]}, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 15}, + "id": 21, + "options": {"legend": {"calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "right", "showLegend": true}, "tooltip": {"mode": "multi", "sort": "desc"}}, + "pluginVersion": "10.0.0", + "targets": [{"expr": "hwmon_cpu_usage_percent", "legendFormat": "{{hostname}}", "refId": "A"}], + "title": "CPU Usage", + "type": "timeseries" + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "fieldConfig": { + "defaults": { + "color": {"mode": "palette-classic"}, + "custom": {"axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": {"legend": false, "tooltip": false, "viz": false}, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": {"type": "linear"}, "showPoints": "never", "spanNulls": false, "stacking": {"group": "A", "mode": "none"}, "thresholdsStyle": {"mode": "line"}}, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 80}, {"color": "red", "value": 95}]}, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 15}, + "id": 22, + "options": {"legend": {"calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "right", "showLegend": true}, "tooltip": {"mode": "multi", "sort": "desc"}}, + "pluginVersion": "10.0.0", + "targets": [{"expr": "hwmon_memory_usage_percent", "legendFormat": "{{hostname}}", "refId": "A"}], + "title": "Memory Usage", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 23}, + "id": 30, + "panels": [], + "title": "Ceph Cluster", + "type": "row" + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "fieldConfig": { + "defaults": { + "color": {"mode": "thresholds"}, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 70}, {"color": "red", "value": 85}]}, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": {"h": 6, "w": 6, "x": 0, "y": 24}, + "id": 31, + "options": {"orientation": "auto", "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "showThresholdLabels": false, "showThresholdMarkers": true}, + "pluginVersion": "10.0.0", + "targets": [{"expr": "max(hwmon_ceph_cluster_usage_percent)", "legendFormat": "Usage", "refId": "A"}], + "title": "Ceph Cluster Usage", + "type": "gauge" + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "fieldConfig": { + "defaults": { + "color": 
{"mode": "palette-classic"}, + "custom": {"axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": {"legend": false, "tooltip": false, "viz": false}, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": {"type": "linear"}, "showPoints": "never", "spanNulls": false, "stacking": {"group": "A", "mode": "none"}, "thresholdsStyle": {"mode": "line"}}, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 70}, {"color": "red", "value": 85}]}, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": {"h": 6, "w": 18, "x": 6, "y": 24}, + "id": 32, + "options": {"legend": {"calcs": ["lastNotNull"], "displayMode": "list", "placement": "right", "showLegend": true}, "tooltip": {"mode": "single", "sort": "none"}}, + "pluginVersion": "10.0.0", + "targets": [{"expr": "hwmon_ceph_cluster_usage_percent", "legendFormat": "{{hostname}}", "refId": "A"}], + "title": "Ceph Usage Over Time", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 30}, + "id": 40, + "panels": [], + "title": "LXC Containers", + "type": "row" + }, + { + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "fieldConfig": { + "defaults": { + "color": {"mode": "thresholds"}, + "custom": {"align": "auto", "cellOptions": {"type": "auto"}, "inspect": false}, + "mappings": [], + "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 80}, {"color": "red", "value": 95}]} + }, + "overrides": [ + {"matcher": {"id": "byName", "options": "Usage"}, "properties": [{"id": "custom.cellOptions", "value": {"mode": "gradient", "type": "gauge"}}, {"id": "unit", "value": "percent"}, {"id": "max", "value": 100}, {"id": "min", "value": 0}]} + ] + }, + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 31}, + "id": 41, + "options": {"cellHeight": "sm", "footer": {"countRows": false, "fields": "", "reducer": ["sum"], "show": false}, "showHeader": true, "sortBy": [{"desc": true, "displayName": "Usage"}]}, + "pluginVersion": "10.0.0", + "targets": [{"expr": "hwmon_lxc_storage_usage_percent", "format": "table", "instant": true, "legendFormat": "", "refId": "A"}], + "title": "LXC Storage Usage", + "transformations": [ + {"id": "organize", "options": {"excludeByName": {"Time": true, "__name__": true}, "indexByName": {}, "renameByName": {"Value": "Usage", "hostname": "Host", "mountpoint": "Mountpoint", "vmid": "Container ID"}}} + ], + "type": "table" + } + ], + "refresh": "1m", + "schemaVersion": 38, + "style": "dark", + "tags": ["hwmon", "hardware", "monitoring", "proxmox"], + "templating": {"list": []}, + "time": {"from": "now-6h", "to": "now"}, + "timepicker": {}, + "timezone": "browser", + "title": "hwmonDaemon - Hardware Monitor", + "uid": "hwmondaemon", + "version": 1, + "weekStart": "" +} diff --git a/hwmonDaemon.py b/hwmonDaemon.py index f7c7f36..968da95 100644 --- a/hwmonDaemon.py +++ b/hwmonDaemon.py @@ -69,7 +69,16 @@ class SystemHealthMonitor: 'TEMPERATURE_INFO': PRIORITIES['LOW'], 'DRIVE_AGE_INFO': PRIORITIES['LOW'], 'SSD_WEAR_INFO': PRIORITIES['LOW'], - 'SYSTEM_LOG_INFO': PRIORITIES['LOW'] + 'SYSTEM_LOG_INFO': PRIORITIES['LOW'], + + # Ceph cluster issues + 'CEPH_HEALTH_ERR': PRIORITIES['CRITICAL'], # P1 - Cluster in error state + 'CEPH_HEALTH_WARN': PRIORITIES['MEDIUM'], # P3 - Cluster 
warnings + 'CEPH_OSD_DOWN': PRIORITIES['HIGH'], # P2 - OSD down (local node) + 'CEPH_USAGE_CRITICAL': PRIORITIES['HIGH'], # P2 - Cluster near full + 'CEPH_USAGE_WARNING': PRIORITIES['MEDIUM'], # P3 - Cluster usage high + 'CEPH_PG_DEGRADED': PRIORITIES['HIGH'], # P2 - PGs degraded + 'CEPH_MON_DOWN': PRIORITIES['HIGH'] # P2 - Monitor down } CONFIG = { @@ -104,7 +113,16 @@ class SystemHealthMonitor: 'HISTORY_DIR': '/var/log/hwmonDaemon', 'HISTORY_RETENTION_DAYS': 30, 'INCLUDE_INFO_TICKETS': False, # Set True to create P5 tickets for INFO alerts - 'PRIORITY_ESCALATION_THRESHOLD': 3 # Number of criticals to trigger P1 + 'PRIORITY_ESCALATION_THRESHOLD': 3, # Number of criticals to trigger P1 + # Ceph monitoring settings + 'CEPH_ENABLED': True, # Enable/disable Ceph health monitoring + 'CEPH_TICKET_NODE': None, # Hostname of node designated to create cluster-wide Ceph tickets + 'CEPH_USAGE_WARNING': 70, # Ceph cluster usage warning threshold % + 'CEPH_USAGE_CRITICAL': 85, # Ceph cluster usage critical threshold % + # Prometheus metrics settings + 'PROMETHEUS_ENABLED': False, # Enable Prometheus metrics export + 'PROMETHEUS_PORT': 9101, # Port for Prometheus metrics HTTP server + 'PROMETHEUS_TEXTFILE_PATH': None # Path for textfile collector (alternative to HTTP) } @classmethod @@ -138,6 +156,26 @@ class SystemHealthMonitor: elif key == 'TICKET_API_URL': cls.CONFIG['TICKET_API_URL'] = value logger.info(f"✓ Loaded TICKET_API_URL: {value}") + # Ceph settings + elif key == 'CEPH_ENABLED': + cls.CONFIG['CEPH_ENABLED'] = value.lower() in ('true', '1', 'yes') + logger.info(f"✓ Loaded CEPH_ENABLED: {cls.CONFIG['CEPH_ENABLED']}") + elif key == 'CEPH_TICKET_NODE': + cls.CONFIG['CEPH_TICKET_NODE'] = value if value else None + logger.info(f"✓ Loaded CEPH_TICKET_NODE: {value}") + elif key == 'CEPH_USAGE_WARNING': + cls.CONFIG['CEPH_USAGE_WARNING'] = int(value) + elif key == 'CEPH_USAGE_CRITICAL': + cls.CONFIG['CEPH_USAGE_CRITICAL'] = int(value) + # Prometheus settings + elif key == 'PROMETHEUS_ENABLED': + cls.CONFIG['PROMETHEUS_ENABLED'] = value.lower() in ('true', '1', 'yes') + logger.info(f"✓ Loaded PROMETHEUS_ENABLED: {cls.CONFIG['PROMETHEUS_ENABLED']}") + elif key == 'PROMETHEUS_PORT': + cls.CONFIG['PROMETHEUS_PORT'] = int(value) + elif key == 'PROMETHEUS_TEXTFILE_PATH': + cls.CONFIG['PROMETHEUS_TEXTFILE_PATH'] = value if value else None + logger.info(f"✓ Loaded PROMETHEUS_TEXTFILE_PATH: {value}") except Exception as e: logger.error(f"Failed to load .env file: {e}") @@ -632,9 +670,13 @@ class SystemHealthMonitor: try: # Perform health checks and gather the report health_report = self.perform_health_checks() - + # Create tickets for any detected critical issues self._create_tickets_for_issues(health_report) + + # Export Prometheus metrics if enabled + if self.CONFIG.get('PROMETHEUS_ENABLED', False): + self.write_prometheus_metrics(health_report) except Exception as e: import traceback logger.error(f"Unexpected error during health check: {e}") @@ -643,10 +685,13 @@ class SystemHealthMonitor: def perform_health_checks(self) -> Dict[str, Any]: """Perform comprehensive system health checks and return a report.""" health_report = { + 'hostname': socket.gethostname(), + 'timestamp': datetime.datetime.now().isoformat(), 'drives_health': self._check_drives_health(), 'memory_health': self._check_memory_usage(), 'cpu_health': self._check_cpu_usage(), 'network_health': self._check_network_status(), + 'ceph_health': self._check_ceph_health(), 'lxc_health': self._check_lxc_storage(), 'system_health': 
self._check_system_drive_indicators() } @@ -682,8 +727,25 @@ class SystemHealthMonitor: logger.info("\nNetwork Status:") logger.info(f"Management: {health_report['network_health']['management_network']['status']}") - logger.info(f"Ceph: {health_report['network_health']['ceph_network']['status']}") - + logger.info(f"Ceph Network: {health_report['network_health']['ceph_network']['status']}") + + # Ceph cluster status + ceph = health_report.get('ceph_health', {}) + if ceph.get('is_ceph_node'): + logger.info("\nCeph Cluster Status:") + logger.info(f" Cluster Health: {ceph.get('cluster_health', 'UNKNOWN')}") + if ceph.get('cluster_usage'): + usage = ceph['cluster_usage'] + logger.info(f" Cluster Usage: {usage.get('usage_percent', 0):.1f}%") + logger.info(f" OSDs: {len(ceph.get('osd_status', []))} total") + down_osds = [o for o in ceph.get('osd_status', []) if o.get('status') == 'down'] + if down_osds: + logger.info(f" ⚠️ Down OSDs: {len(down_osds)}") + if ceph.get('cluster_wide_issues'): + logger.info(f" ⚠️ Cluster-wide issues: {len(ceph['cluster_wide_issues'])}") + if ceph.get('issues'): + logger.info(f" ⚠️ Node-specific issues: {len(ceph['issues'])}") + if health_report['system_health']['issues']: logger.info(f"\nSystem Issues: {len(health_report['system_health']['issues'])} found") @@ -1296,8 +1358,9 @@ class SystemHealthMonitor: # P1 - Specific cluster-affecting scenarios if any(keyword in issue_lower for keyword in [ - 'cluster', 'raid degraded', 'multiple drive', - 'both networks unreachable' + 'raid degraded', 'multiple drive', + 'both networks unreachable', + 'health_err' # Ceph cluster error ]): return self.PRIORITIES['CRITICAL'] # P1 @@ -1310,10 +1373,16 @@ class SystemHealthMonitor: 'reallocated_sector', 'pending_sector', 'offline_uncorrectable', 'critical available_spare', 'critical wear', 'critical reallocated', 'critical current_pending', - 'network is unreachable' + 'network is unreachable', + 'osd is down', 'osd down', # Ceph OSD down + 'cluster usage critical' # Ceph usage critical ]): return self.PRIORITIES['HIGH'] # P2 + # P2 - Ceph OSD issues (need to check explicitly since 'down' is in issue text) + if '[ceph]' in issue_lower and 'down' in issue_lower: + return self.PRIORITIES['HIGH'] # P2 + # P2 - SMART issues with critical indicators if 'smart issues' in issue_lower and any(error_type in issue_lower for error_type in [ 'critical', 'failed', 'reallocated', 'pending', 'uncorrectable', 'offline' @@ -1324,7 +1393,8 @@ class SystemHealthMonitor: if any(keyword in issue_lower for keyword in [ 'warning', 'high temperature', 'correctable ecc', 'trend alert', 'critical storage usage', - 'low available_spare', 'high wear' + 'low available_spare', 'high wear', + 'health_warn', 'cluster usage warning' # Ceph warnings ]): return self.PRIORITIES['MEDIUM'] # P3 @@ -1425,6 +1495,29 @@ class SystemHealthMonitor: self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM'] ) + # Ceph Issues - Storage cluster issues (categorized as Hardware) + if any(keyword in issue_lower for keyword in [ + 'ceph', 'osd', 'health_err', 'health_warn', 'cluster usage' + ]): + # Ceph errors are issues (unplanned degradation) + if any(error in issue_lower for error in [ + 'health_err', 'down', 'critical', 'error' + ]): + return ( + self.TICKET_CATEGORIES['HARDWARE'], + self.TICKET_TYPES['ISSUE'], + '[ceph]', + self.TICKET_TEMPLATES['TICKET_TYPE']['ISSUE'] + ) + # Ceph warnings are problems (need investigation) + else: + return ( + self.TICKET_CATEGORIES['HARDWARE'], + self.TICKET_TYPES['PROBLEM'], + '[ceph]', + 
self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM'] + ) + # Default: Hardware Problem (for undefined cases) return ( self.TICKET_CATEGORIES['HARDWARE'], @@ -1446,7 +1539,6 @@ class SystemHealthMonitor: hostname = socket.gethostname() action_type = self.TICKET_TEMPLATES['ACTION_TYPE'] environment = self.TICKET_TEMPLATES['ENVIRONMENT'] - scope = self.TICKET_TEMPLATES['SCOPE']['SINGLE_NODE'] for issue in issues: # Use the comprehensive priority determination function @@ -1455,6 +1547,15 @@ class SystemHealthMonitor: # Get proper categorization for this issue category, ticket_type, issue_tag, ticket_type_tag = self._categorize_issue(issue) + # Determine scope: cluster-wide for Ceph cluster issues, single-node otherwise + is_cluster_wide = '[cluster-wide]' in issue + scope = self.TICKET_TEMPLATES['SCOPE']['CLUSTER_WIDE'] if is_cluster_wide else self.TICKET_TEMPLATES['SCOPE']['SINGLE_NODE'] + + # Clean issue text for title (remove [cluster-wide] and [ceph] markers if present) + clean_issue = issue + if is_cluster_wide: + clean_issue = clean_issue.replace('[cluster-wide] ', '').replace('[cluster-wide]', '') + # Extract drive capacity if this is a drive-related issue drive_size = "" if "Drive" in issue and "/dev/" in issue: @@ -1473,7 +1574,7 @@ class SystemHealthMonitor: f"{action_type['AUTO']}" f"{issue_tag}" f"{drive_size}" - f"{issue}" + f"{clean_issue}" f"{scope}" f"{environment['PRODUCTION']}" f"{ticket_type_tag}" @@ -1596,6 +1697,29 @@ class SystemHealthMonitor: if system_health.get('issues'): issues.extend(system_health['issues']) + # Check for Ceph cluster issues + ceph_health = health_report.get('ceph_health', {}) + if ceph_health.get('is_ceph_node'): + hostname = socket.gethostname() + designated_node = self.CONFIG.get('CEPH_TICKET_NODE') + + # Cluster-wide issues: only create tickets from designated node (or first node if not set) + # The [cluster-wide] tag ensures deduplication in tinker_tickets API + if ceph_health.get('cluster_wide_issues'): + # If no designated node, all nodes can report (API deduplicates) + # If designated node is set, only that node creates tickets + if not designated_node or hostname == designated_node: + for issue in ceph_health['cluster_wide_issues']: + # Add [cluster-wide] marker for API deduplication + issues.append(f"[cluster-wide] [ceph] {issue}") + else: + logger.debug(f"Skipping cluster-wide Ceph issues (designated node: {designated_node})") + + # Node-specific issues: always report from the affected node + if ceph_health.get('issues'): + for issue in ceph_health['issues']: + issues.append(f"[ceph] {issue}") + logger.info("=== Issue Detection Started ===") logger.info(f"Checking drives: {len(health_report['drives_health']['drives'])} found") logger.info(f"Memory status: {health_report['memory_health']['status']}") @@ -2669,7 +2793,425 @@ class SystemHealthMonitor: 'status': 'ERROR', 'error': str(e) } - + + def _check_ceph_health(self) -> Dict[str, Any]: + """ + Check Ceph cluster health if this node is part of a Ceph cluster. + + Returns health status, cluster info, and any issues detected. + Cluster-wide issues use [cluster-wide] tag for cross-node deduplication. 
+ """ + import shutil + + ceph_health = { + 'status': 'OK', + 'is_ceph_node': False, + 'cluster_health': None, + 'cluster_usage': None, + 'osd_status': [], + 'mon_status': [], + 'issues': [], + 'cluster_wide_issues': [] # Issues that apply to entire cluster + } + + # Check if Ceph monitoring is enabled + if not self.CONFIG.get('CEPH_ENABLED', True): + logger.debug("Ceph monitoring disabled in config") + return ceph_health + + # Check if ceph CLI is available + if not shutil.which('ceph'): + logger.debug("Ceph CLI not found - not a Ceph node") + return ceph_health + + ceph_health['is_ceph_node'] = True + hostname = socket.gethostname() + + try: + # Get cluster health status + health_result = subprocess.run( + ['ceph', 'health', '--format=json'], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + timeout=30 + ) + + if health_result.returncode == 0: + try: + health_data = json.loads(health_result.stdout) + ceph_health['cluster_health'] = health_data.get('status', 'UNKNOWN') + + # Check cluster health status + if ceph_health['cluster_health'] == 'HEALTH_ERR': + ceph_health['status'] = 'CRITICAL' + # This is a cluster-wide issue + ceph_health['cluster_wide_issues'].append( + f"Ceph cluster HEALTH_ERR: {health_data.get('summary', {}).get('message', 'Unknown error')}" + ) + elif ceph_health['cluster_health'] == 'HEALTH_WARN': + if ceph_health['status'] != 'CRITICAL': + ceph_health['status'] = 'WARNING' + # Extract warning messages + checks = health_data.get('checks', {}) + for check_name, check_data in checks.items(): + severity = check_data.get('severity', 'HEALTH_WARN') + message = check_data.get('summary', {}).get('message', check_name) + ceph_health['cluster_wide_issues'].append( + f"Ceph HEALTH_WARN: {message}" + ) + except json.JSONDecodeError as e: + logger.warning(f"Failed to parse ceph health JSON: {e}") + + # Get cluster usage (ceph df) + df_result = subprocess.run( + ['ceph', 'df', '--format=json'], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + timeout=30 + ) + + if df_result.returncode == 0: + try: + df_data = json.loads(df_result.stdout) + stats = df_data.get('stats', {}) + total_bytes = stats.get('total_bytes', 0) + total_used = stats.get('total_used_raw_bytes', 0) + + if total_bytes > 0: + usage_percent = (total_used / total_bytes) * 100 + ceph_health['cluster_usage'] = { + 'total_bytes': total_bytes, + 'used_bytes': total_used, + 'usage_percent': round(usage_percent, 2) + } + + # Check usage thresholds + if usage_percent >= self.CONFIG.get('CEPH_USAGE_CRITICAL', 85): + ceph_health['status'] = 'CRITICAL' + ceph_health['cluster_wide_issues'].append( + f"Ceph cluster usage critical: {usage_percent:.1f}%" + ) + elif usage_percent >= self.CONFIG.get('CEPH_USAGE_WARNING', 70): + if ceph_health['status'] != 'CRITICAL': + ceph_health['status'] = 'WARNING' + ceph_health['cluster_wide_issues'].append( + f"Ceph cluster usage warning: {usage_percent:.1f}%" + ) + except json.JSONDecodeError as e: + logger.warning(f"Failed to parse ceph df JSON: {e}") + + # Get OSD status (check for down OSDs on this node) + osd_result = subprocess.run( + ['ceph', 'osd', 'tree', '--format=json'], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + timeout=30 + ) + + if osd_result.returncode == 0: + try: + osd_data = json.loads(osd_result.stdout) + nodes = osd_data.get('nodes', []) + + # Find OSDs on this host + host_id = None + for node in nodes: + if node.get('type') == 'host' and node.get('name') == hostname: + host_id = node.get('id') + break + + # Check 
OSD status for this host + for node in nodes: + if node.get('type') == 'osd': + osd_info = { + 'id': node.get('id'), + 'name': node.get('name'), + 'status': node.get('status', 'unknown'), + 'reweight': node.get('reweight', 1.0) + } + + # Check if OSD belongs to this host (by checking parent in tree) + # Simplified: just track all OSDs for now + ceph_health['osd_status'].append(osd_info) + + # Check for down OSDs + if node.get('status') == 'down': + ceph_health['status'] = 'CRITICAL' + # Node-specific issue (will include hostname in hash) + ceph_health['issues'].append( + f"Ceph OSD {node.get('name')} is DOWN on {hostname}" + ) + except json.JSONDecodeError as e: + logger.warning(f"Failed to parse ceph osd tree JSON: {e}") + + # Get monitor status + mon_result = subprocess.run( + ['ceph', 'mon', 'stat', '--format=json'], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + timeout=30 + ) + + if mon_result.returncode == 0: + try: + mon_data = json.loads(mon_result.stdout) + ceph_health['mon_status'] = { + 'quorum': mon_data.get('quorum', []), + 'quorum_names': mon_data.get('quorum_names', []) + } + except json.JSONDecodeError as e: + logger.warning(f"Failed to parse ceph mon stat JSON: {e}") + + logger.debug(f"=== Ceph Health Check ===") + logger.debug(f"Is Ceph node: {ceph_health['is_ceph_node']}") + logger.debug(f"Cluster health: {ceph_health['cluster_health']}") + logger.debug(f"Cluster usage: {ceph_health['cluster_usage']}") + logger.debug(f"Status: {ceph_health['status']}") + logger.debug(f"Issues: {ceph_health['issues']}") + logger.debug(f"Cluster-wide issues: {ceph_health['cluster_wide_issues']}") + logger.debug("=== End Ceph Health Check ===") + + except subprocess.TimeoutExpired: + ceph_health['status'] = 'ERROR' + ceph_health['issues'].append("Ceph health check timed out") + except Exception as e: + ceph_health['status'] = 'ERROR' + ceph_health['issues'].append(f"Error checking Ceph health: {str(e)}") + logger.error(f"Ceph health check failed: {e}") + + return ceph_health + + # ============================================================================= + # PROMETHEUS METRICS EXPORT + # ============================================================================= + def export_prometheus_metrics(self, health_report: Dict[str, Any]) -> str: + """ + Export health report as Prometheus metrics in text format. + + Metrics follow Prometheus naming conventions: + - hwmon_* prefix for all metrics + - Labels for dimensions (device, hostname, container, etc.) 
+ + Returns: + str: Prometheus text format metrics + """ + hostname = health_report.get('hostname', socket.gethostname()) + metrics = [] + + # Helper to format labels + def labels(**kwargs) -> str: + pairs = [f'{k}="{v}"' for k, v in kwargs.items() if v is not None] + return '{' + ','.join(pairs) + '}' if pairs else '' + + # === System Info === + metrics.append(f'# HELP hwmon_info System information') + metrics.append(f'# TYPE hwmon_info gauge') + metrics.append(f'hwmon_info{labels(hostname=hostname)} 1') + + # === Drive Metrics === + metrics.append(f'# HELP hwmon_drive_smart_healthy SMART health status (1=healthy, 0=unhealthy)') + metrics.append(f'# TYPE hwmon_drive_smart_healthy gauge') + + metrics.append(f'# HELP hwmon_drive_temperature_celsius Drive temperature in Celsius') + metrics.append(f'# TYPE hwmon_drive_temperature_celsius gauge') + + metrics.append(f'# HELP hwmon_drive_size_bytes Drive total size in bytes') + metrics.append(f'# TYPE hwmon_drive_size_bytes gauge') + + metrics.append(f'# HELP hwmon_drive_smart_issues_total Number of SMART issues detected') + metrics.append(f'# TYPE hwmon_drive_smart_issues_total gauge') + + for drive in health_report.get('drives_health', {}).get('drives', []): + device = drive.get('device', 'unknown') + drive_labels = labels(hostname=hostname, device=device) + + # SMART health status + smart_status = drive.get('smart_status', 'UNKNOWN') + healthy = 1 if smart_status == 'HEALTHY' else 0 + metrics.append(f'hwmon_drive_smart_healthy{drive_labels} {healthy}') + + # Temperature + if drive.get('temperature'): + metrics.append(f'hwmon_drive_temperature_celsius{drive_labels} {drive["temperature"]}') + + # Drive size (convert human-readable to bytes if possible) + if drive.get('capacity'): + capacity_bytes = self._parse_size_to_bytes(drive['capacity']) + if capacity_bytes: + metrics.append(f'hwmon_drive_size_bytes{drive_labels} {capacity_bytes}') + + # Issue count + issues_count = len(drive.get('smart_issues', [])) + metrics.append(f'hwmon_drive_smart_issues_total{drive_labels} {issues_count}') + + # === CPU Metrics === + cpu = health_report.get('cpu_health', {}) + metrics.append(f'# HELP hwmon_cpu_usage_percent CPU usage percentage') + metrics.append(f'# TYPE hwmon_cpu_usage_percent gauge') + if cpu.get('cpu_usage_percent') is not None: + metrics.append(f'hwmon_cpu_usage_percent{labels(hostname=hostname)} {cpu["cpu_usage_percent"]}') + + # === Memory Metrics === + mem = health_report.get('memory_health', {}) + metrics.append(f'# HELP hwmon_memory_usage_percent Memory usage percentage') + metrics.append(f'# TYPE hwmon_memory_usage_percent gauge') + if mem.get('memory_percent') is not None: + metrics.append(f'hwmon_memory_usage_percent{labels(hostname=hostname)} {mem["memory_percent"]}') + + metrics.append(f'# HELP hwmon_memory_has_ecc Whether ECC memory is present (1=yes, 0=no)') + metrics.append(f'# TYPE hwmon_memory_has_ecc gauge') + has_ecc = 1 if mem.get('has_ecc') else 0 + metrics.append(f'hwmon_memory_has_ecc{labels(hostname=hostname)} {has_ecc}') + + if mem.get('has_ecc'): + metrics.append(f'# HELP hwmon_memory_ecc_errors_total Total ECC errors detected') + metrics.append(f'# TYPE hwmon_memory_ecc_errors_total gauge') + ecc_errors = len(mem.get('ecc_errors', [])) + metrics.append(f'hwmon_memory_ecc_errors_total{labels(hostname=hostname)} {ecc_errors}') + + # === Network Metrics === + net = health_report.get('network_health', {}) + metrics.append(f'# HELP hwmon_network_status Network status (1=OK, 0=issue)') + metrics.append(f'# TYPE 
hwmon_network_status gauge') + + for net_type in ['management_network', 'ceph_network']: + net_info = net.get(net_type, {}) + status = 1 if net_info.get('status') == 'OK' else 0 + net_name = net_type.replace('_network', '') + metrics.append(f'hwmon_network_status{labels(hostname=hostname, network=net_name)} {status}') + + # === Ceph Metrics === + ceph = health_report.get('ceph_health', {}) + if ceph.get('is_ceph_node'): + metrics.append(f'# HELP hwmon_ceph_cluster_healthy Ceph cluster health (1=healthy, 0=warning/error)') + metrics.append(f'# TYPE hwmon_ceph_cluster_healthy gauge') + ceph_healthy = 1 if ceph.get('cluster_health') == 'HEALTH_OK' else 0 + metrics.append(f'hwmon_ceph_cluster_healthy{labels(hostname=hostname)} {ceph_healthy}') + + if ceph.get('cluster_usage'): + usage = ceph['cluster_usage'] + metrics.append(f'# HELP hwmon_ceph_cluster_usage_percent Ceph cluster usage percentage') + metrics.append(f'# TYPE hwmon_ceph_cluster_usage_percent gauge') + metrics.append(f'hwmon_ceph_cluster_usage_percent{labels(hostname=hostname)} {usage.get("usage_percent", 0)}') + + metrics.append(f'# HELP hwmon_ceph_cluster_bytes_total Ceph cluster total bytes') + metrics.append(f'# TYPE hwmon_ceph_cluster_bytes_total gauge') + metrics.append(f'hwmon_ceph_cluster_bytes_total{labels(hostname=hostname)} {usage.get("total_bytes", 0)}') + + metrics.append(f'# HELP hwmon_ceph_cluster_bytes_used Ceph cluster used bytes') + metrics.append(f'# TYPE hwmon_ceph_cluster_bytes_used gauge') + metrics.append(f'hwmon_ceph_cluster_bytes_used{labels(hostname=hostname)} {usage.get("used_bytes", 0)}') + + metrics.append(f'# HELP hwmon_ceph_osd_total Total number of OSDs') + metrics.append(f'# TYPE hwmon_ceph_osd_total gauge') + osd_count = len(ceph.get('osd_status', [])) + metrics.append(f'hwmon_ceph_osd_total{labels(hostname=hostname)} {osd_count}') + + metrics.append(f'# HELP hwmon_ceph_osd_down Number of down OSDs') + metrics.append(f'# TYPE hwmon_ceph_osd_down gauge') + down_osds = len([o for o in ceph.get('osd_status', []) if o.get('status') == 'down']) + metrics.append(f'hwmon_ceph_osd_down{labels(hostname=hostname)} {down_osds}') + + # === LXC Metrics === + lxc = health_report.get('lxc_health', {}) + if lxc.get('containers'): + metrics.append(f'# HELP hwmon_lxc_storage_usage_percent LXC container storage usage percentage') + metrics.append(f'# TYPE hwmon_lxc_storage_usage_percent gauge') + + for container in lxc['containers']: + vmid = container.get('vmid', 'unknown') + for fs in container.get('filesystems', []): + mountpoint = fs.get('mountpoint', '/') + usage = fs.get('usage_percent', 0) + metrics.append(f'hwmon_lxc_storage_usage_percent{labels(hostname=hostname, vmid=vmid, mountpoint=mountpoint)} {usage}') + + # === Issue Summary Metrics === + metrics.append(f'# HELP hwmon_issues_total Total number of issues detected') + metrics.append(f'# TYPE hwmon_issues_total gauge') + + system_issues = len(health_report.get('system_health', {}).get('issues', [])) + ceph_issues = len(ceph.get('issues', [])) + len(ceph.get('cluster_wide_issues', [])) + lxc_issues = len(lxc.get('issues', [])) + total_issues = system_issues + ceph_issues + lxc_issues + metrics.append(f'hwmon_issues_total{labels(hostname=hostname)} {total_issues}') + + return '\n'.join(metrics) + '\n' + + def _parse_size_to_bytes(self, size_str: str) -> int: + """Parse human-readable size string to bytes.""" + if not size_str: + return 0 + + size_str = size_str.strip().upper() + multipliers = { + 'B': 1, + 'KB': 1024, + 'MB': 1024**2, + 'GB': 1024**3, + 
'TB': 1024**4, + 'PB': 1024**5, + 'K': 1024, + 'M': 1024**2, + 'G': 1024**3, + 'T': 1024**4, + 'P': 1024**5 + } + + try: + for suffix, mult in sorted(multipliers.items(), key=lambda x: -len(x[0])): + if size_str.endswith(suffix): + num = float(size_str[:-len(suffix)].strip()) + return int(num * mult) + return int(float(size_str)) + except (ValueError, TypeError): + return 0 + + def write_prometheus_metrics(self, health_report: Dict[str, Any]) -> bool: + """ + Write Prometheus metrics to configured destination. + + If PROMETHEUS_TEXTFILE_PATH is set, writes to that file for node_exporter. + Otherwise, logs the metrics (for debugging or other use). + + Returns: + bool: True if metrics were written successfully + """ + if not self.CONFIG.get('PROMETHEUS_ENABLED', False): + return False + + try: + metrics = self.export_prometheus_metrics(health_report) + textfile_path = self.CONFIG.get('PROMETHEUS_TEXTFILE_PATH') + + if textfile_path: + # Write to textfile for node_exporter textfile collector + # Write to temp file first, then atomic rename + import tempfile + temp_fd, temp_path = tempfile.mkstemp( + dir=os.path.dirname(textfile_path), + prefix='.hwmon_metrics_' + ) + try: + with os.fdopen(temp_fd, 'w') as f: + f.write(metrics) + os.rename(temp_path, textfile_path) + logger.info(f"Prometheus metrics written to {textfile_path}") + except Exception: + os.unlink(temp_path) + raise + else: + # Just log metrics (for debugging) + logger.debug("Prometheus metrics generated:\n" + metrics) + + return True + except Exception as e: + logger.error(f"Failed to write Prometheus metrics: {e}") + return False + def _check_lxc_storage(self) -> Dict[str, Any]: """ Check storage utilization for all running LXC containers @@ -2802,13 +3344,37 @@ def main(): action="store_true", help="Enable dry-run mode (simulate ticket creation without actual API calls)." ) + parser.add_argument( + "--metrics", + action="store_true", + help="Output Prometheus metrics to stdout and exit." + ) + parser.add_argument( + "--export-json", + type=str, + metavar="FILE", + help="Export health report to JSON file." + ) args = parser.parse_args() monitor = SystemHealthMonitor( ticket_api_url=SystemHealthMonitor.CONFIG['TICKET_API_URL'], dry_run=args.dry_run ) - monitor.run() + + if args.metrics: + # Just output metrics to stdout + health_report = monitor.perform_health_checks() + print(monitor.export_prometheus_metrics(health_report)) + elif args.export_json: + # Export health report as JSON + import json + health_report = monitor.perform_health_checks() + with open(args.export_json, 'w') as f: + json.dump(health_report, f, indent=2, default=str) + logger.info(f"Health report exported to {args.export_json}") + else: + monitor.run() if __name__ == "__main__": main() \ No newline at end of file
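
Usage sketch (not part of the patch): the new settings are plain KEY=VALUE entries in the daemon's existing .env file, handled by the .env loader additions above. The values below are illustrative; the textfile path is a placeholder, and CEPH_TICKET_NODE may be left empty so every node reports cluster-wide issues and the ticket API deduplicates them.

    CEPH_ENABLED=true
    CEPH_TICKET_NODE=
    CEPH_USAGE_WARNING=70
    CEPH_USAGE_CRITICAL=85
    PROMETHEUS_ENABLED=true
    PROMETHEUS_TEXTFILE_PATH=/var/lib/node_exporter/textfile_collector/hwmon.prom

With PROMETHEUS_TEXTFILE_PATH set, write_prometheus_metrics() writes the metrics atomically to that file on each check cycle; node_exporter can expose them when started with --collector.textfile.directory pointing at the same directory. For ad-hoc inspection, python3 hwmonDaemon.py --metrics prints the same text format to stdout.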
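
A minimal sketch of exercising the new exporter outside the daemon, assuming hwmonDaemon.py is importable from the working directory and that constructing SystemHealthMonitor has no side effects beyond what main() already does. The health report below is hand-built with made-up values, but its keys mirror the ones export_prometheus_metrics() reads:

    # prometheus_export_demo.py -- illustrative only, not part of the patch
    from hwmonDaemon import SystemHealthMonitor

    monitor = SystemHealthMonitor(
        ticket_api_url=SystemHealthMonitor.CONFIG['TICKET_API_URL'],
        dry_run=True  # never touch the ticket API from this demo
    )

    fake_report = {
        'hostname': 'pve-node1',  # hypothetical host
        'drives_health': {'drives': [
            {'device': '/dev/sda', 'smart_status': 'HEALTHY',
             'temperature': 34, 'capacity': '4TB', 'smart_issues': []},
        ]},
        'cpu_health': {'cpu_usage_percent': 12.5},
        'memory_health': {'memory_percent': 41.0, 'has_ecc': False},
        'network_health': {'management_network': {'status': 'OK'},
                           'ceph_network': {'status': 'OK'}},
        'ceph_health': {'is_ceph_node': False},
        'lxc_health': {'containers': [], 'issues': []},
        'system_health': {'issues': []},
    }

    print(monitor.export_prometheus_metrics(fake_report))
    # Expected output includes lines such as:
    #   hwmon_info{hostname="pve-node1"} 1
    #   hwmon_drive_smart_healthy{hostname="pve-node1",device="/dev/sda"} 1
    #   hwmon_drive_temperature_celsius{hostname="pve-node1",device="/dev/sda"} 34
    #   hwmon_drive_size_bytes{hostname="pve-node1",device="/dev/sda"} 4398046511104
    #   hwmon_issues_total{hostname="pve-node1"} 0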
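
_check_ceph_health() currently records every OSD in the tree, and its comment notes that per-host filtering is simplified for now. If down-OSD reports should stay strictly node-specific, one possible approach is sketched below; it assumes the `children` id lists that `ceph osd tree --format=json` attaches to host nodes, and the function name is hypothetical:

    def local_down_osds(nodes, hostname):
        """Return names of down OSDs belonging to `hostname`, based on the
        parent/children ids in `ceph osd tree --format=json` output."""
        host_node = next((n for n in nodes
                          if n.get('type') == 'host' and n.get('name') == hostname),
                         None)
        local_ids = set(host_node.get('children', [])) if host_node else set()
        return [n.get('name') for n in nodes
                if n.get('type') == 'osd'
                and n.get('id') in local_ids
                and n.get('status') == 'down']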