Add Ceph cluster monitoring and Prometheus metrics export
- Add comprehensive Ceph cluster health monitoring
  - Check cluster health status (HEALTH_OK/WARN/ERR)
  - Monitor cluster usage with configurable thresholds
  - Track OSD status (up/down) per node
  - Separate cluster-wide vs node-specific issues
- Cluster-wide ticket deduplication (see the sketch after this message)
  - Add [cluster-wide] scope tag for Ceph issues
  - Cluster-wide issues deduplicate across all nodes
  - Node-specific issues (OSD down) include hostname
- Add Prometheus metrics export
  - export_prometheus_metrics() method
  - write_prometheus_metrics() for textfile collector
  - --metrics CLI flag to output metrics to stdout
  - --export-json CLI flag to export health report as JSON
- Add Grafana dashboard template (grafana-dashboard.json)
- Add .gitignore

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
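A minimal illustration of the deduplication idea above, assuming nothing about the ticket API beyond what the tags imply (the helper and key scheme below are hypothetical, not code from this commit):

import hashlib
import socket

# Issues tagged [cluster-wide] hash to the same key on every node, so a backend
# can collapse them into one ticket; node-specific issues fold the hostname in.
def ticket_dedup_key(issue: str) -> str:
    basis = issue if '[cluster-wide]' in issue else f"{socket.gethostname()}:{issue}"
    return hashlib.sha256(basis.encode()).hexdigest()[:16]

print(ticket_dedup_key("[cluster-wide] [ceph] Ceph cluster usage warning: 72.0%"))
print(ticket_dedup_key("[ceph] Ceph OSD osd.3 is DOWN on pve-node1"))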
.gitignore (vendored, new file, +2 lines)
@@ -0,0 +1,2 @@
.claude
settings.local.json
grafana-dashboard.json (new file, +375 lines)
@@ -0,0 +1,375 @@
{
|
||||
"__inputs": [
|
||||
{
|
||||
"name": "DS_PROMETHEUS",
|
||||
"label": "Prometheus",
|
||||
"description": "Prometheus data source for hwmonDaemon metrics",
|
||||
"type": "datasource",
|
||||
"pluginId": "prometheus",
|
||||
"pluginName": "Prometheus"
|
||||
}
|
||||
],
|
||||
"__elements": {},
|
||||
"__requires": [
|
||||
{
|
||||
"type": "grafana",
|
||||
"id": "grafana",
|
||||
"name": "Grafana",
|
||||
"version": "10.0.0"
|
||||
},
|
||||
{
|
||||
"type": "datasource",
|
||||
"id": "prometheus",
|
||||
"name": "Prometheus",
|
||||
"version": "1.0.0"
|
||||
},
|
||||
{
|
||||
"type": "panel",
|
||||
"id": "gauge",
|
||||
"name": "Gauge",
|
||||
"version": ""
|
||||
},
|
||||
{
|
||||
"type": "panel",
|
||||
"id": "stat",
|
||||
"name": "Stat",
|
||||
"version": ""
|
||||
},
|
||||
{
|
||||
"type": "panel",
|
||||
"id": "table",
|
||||
"name": "Table",
|
||||
"version": ""
|
||||
},
|
||||
{
|
||||
"type": "panel",
|
||||
"id": "timeseries",
|
||||
"name": "Time series",
|
||||
"version": ""
|
||||
}
|
||||
],
|
||||
"annotations": {
|
||||
"list": []
|
||||
},
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 0,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"liveNow": false,
|
||||
"panels": [
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 0},
|
||||
"id": 1,
|
||||
"panels": [],
|
||||
"title": "Overview",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "thresholds"},
|
||||
"mappings": [
|
||||
{"options": {"0": {"color": "red", "index": 0, "text": "Issues Detected"}}, "type": "value"},
|
||||
{"options": {"from": 1, "result": {"color": "green", "index": 1, "text": "Healthy"}, "to": 999999}, "type": "range"}
|
||||
],
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}, {"color": "green", "value": 1}]}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {"h": 4, "w": 4, "x": 0, "y": 1},
|
||||
"id": 2,
|
||||
"options": {"colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "textMode": "auto"},
|
||||
"pluginVersion": "10.0.0",
|
||||
"targets": [{"expr": "count(hwmon_info)", "legendFormat": "Hosts", "refId": "A"}],
|
||||
"title": "Monitored Hosts",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "thresholds"},
|
||||
"mappings": [],
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 1}, {"color": "red", "value": 3}]}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {"h": 4, "w": 4, "x": 4, "y": 1},
|
||||
"id": 3,
|
||||
"options": {"colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "textMode": "auto"},
|
||||
"pluginVersion": "10.0.0",
|
||||
"targets": [{"expr": "sum(hwmon_issues_total)", "legendFormat": "Issues", "refId": "A"}],
|
||||
"title": "Total Issues",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "thresholds"},
|
||||
"mappings": [],
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 1}, {"color": "red", "value": 2}]}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {"h": 4, "w": 4, "x": 8, "y": 1},
|
||||
"id": 4,
|
||||
"options": {"colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "textMode": "auto"},
|
||||
"pluginVersion": "10.0.0",
|
||||
"targets": [{"expr": "count(hwmon_drive_smart_healthy == 0)", "legendFormat": "Unhealthy", "refId": "A"}],
|
||||
"title": "Unhealthy Drives",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "thresholds"},
|
||||
"mappings": [
|
||||
{"options": {"0": {"color": "red", "index": 0, "text": "Unhealthy"}}, "type": "value"},
|
||||
{"options": {"1": {"color": "green", "index": 1, "text": "Healthy"}}, "type": "value"}
|
||||
],
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}, {"color": "green", "value": 1}]}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {"h": 4, "w": 4, "x": 12, "y": 1},
|
||||
"id": 5,
|
||||
"options": {"colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "textMode": "auto"},
|
||||
"pluginVersion": "10.0.0",
|
||||
"targets": [{"expr": "min(hwmon_ceph_cluster_healthy)", "legendFormat": "Ceph", "refId": "A"}],
|
||||
"title": "Ceph Cluster Health",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "thresholds"},
|
||||
"mappings": [],
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 1}, {"color": "red", "value": 2}]}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {"h": 4, "w": 4, "x": 16, "y": 1},
|
||||
"id": 6,
|
||||
"options": {"colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "textMode": "auto"},
|
||||
"pluginVersion": "10.0.0",
|
||||
"targets": [{"expr": "sum(hwmon_ceph_osd_down)", "legendFormat": "Down OSDs", "refId": "A"}],
|
||||
"title": "Ceph OSDs Down",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 5},
|
||||
"id": 10,
|
||||
"panels": [],
|
||||
"title": "Drive Health",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "palette-classic"},
|
||||
"custom": {"axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": {"legend": false, "tooltip": false, "viz": false}, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": {"type": "linear"}, "showPoints": "never", "spanNulls": false, "stacking": {"group": "A", "mode": "none"}, "thresholdsStyle": {"mode": "line"}},
|
||||
"mappings": [],
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 45}, {"color": "red", "value": 55}]},
|
||||
"unit": "celsius"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 6},
|
||||
"id": 11,
|
||||
"options": {"legend": {"calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "right", "showLegend": true}, "tooltip": {"mode": "multi", "sort": "desc"}},
|
||||
"pluginVersion": "10.0.0",
|
||||
"targets": [{"expr": "hwmon_drive_temperature_celsius", "legendFormat": "{{hostname}} {{device}}", "refId": "A"}],
|
||||
"title": "Drive Temperatures",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "thresholds"},
|
||||
"custom": {"align": "auto", "cellOptions": {"type": "color-text"}, "inspect": false},
|
||||
"mappings": [
|
||||
{"options": {"0": {"color": "red", "index": 0, "text": "UNHEALTHY"}}, "type": "value"},
|
||||
{"options": {"1": {"color": "green", "index": 1, "text": "HEALTHY"}}, "type": "value"}
|
||||
],
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}, {"color": "green", "value": 1}]}
|
||||
},
|
||||
"overrides": [
|
||||
{"matcher": {"id": "byName", "options": "hostname"}, "properties": [{"id": "custom.width", "value": 120}]},
|
||||
{"matcher": {"id": "byName", "options": "device"}, "properties": [{"id": "custom.width", "value": 100}]},
|
||||
{"matcher": {"id": "byName", "options": "Status"}, "properties": [{"id": "custom.width", "value": 100}]},
|
||||
{"matcher": {"id": "byName", "options": "Issues"}, "properties": [{"id": "custom.cellOptions", "value": {"mode": "gradient", "type": "gauge"}}, {"id": "thresholds", "value": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 1}, {"color": "red", "value": 3}]}}]}
|
||||
]
|
||||
},
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 6},
|
||||
"id": 12,
|
||||
"options": {"cellHeight": "sm", "footer": {"countRows": false, "fields": "", "reducer": ["sum"], "show": false}, "showHeader": true, "sortBy": [{"desc": true, "displayName": "Issues"}]},
|
||||
"pluginVersion": "10.0.0",
|
||||
"targets": [
|
||||
{"expr": "hwmon_drive_smart_healthy", "format": "table", "instant": true, "legendFormat": "", "refId": "A"},
|
||||
{"expr": "hwmon_drive_smart_issues_total", "format": "table", "instant": true, "legendFormat": "", "refId": "B"}
|
||||
],
|
||||
"title": "Drive Status",
|
||||
"transformations": [
|
||||
{"id": "merge", "options": {}},
|
||||
{"id": "organize", "options": {"excludeByName": {"Time": true, "__name__": true}, "indexByName": {"Value #A": 2, "Value #B": 3, "device": 1, "hostname": 0}, "renameByName": {"Value #A": "Status", "Value #B": "Issues", "device": "Device", "hostname": "Host"}}}
|
||||
],
|
||||
"type": "table"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 14},
|
||||
"id": 20,
|
||||
"panels": [],
|
||||
"title": "System Resources",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "palette-classic"},
|
||||
"custom": {"axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": {"legend": false, "tooltip": false, "viz": false}, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": {"type": "linear"}, "showPoints": "never", "spanNulls": false, "stacking": {"group": "A", "mode": "none"}, "thresholdsStyle": {"mode": "line"}},
|
||||
"mappings": [],
|
||||
"max": 100,
|
||||
"min": 0,
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 80}, {"color": "red", "value": 95}]},
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 15},
|
||||
"id": 21,
|
||||
"options": {"legend": {"calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "right", "showLegend": true}, "tooltip": {"mode": "multi", "sort": "desc"}},
|
||||
"pluginVersion": "10.0.0",
|
||||
"targets": [{"expr": "hwmon_cpu_usage_percent", "legendFormat": "{{hostname}}", "refId": "A"}],
|
||||
"title": "CPU Usage",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "palette-classic"},
|
||||
"custom": {"axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": {"legend": false, "tooltip": false, "viz": false}, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": {"type": "linear"}, "showPoints": "never", "spanNulls": false, "stacking": {"group": "A", "mode": "none"}, "thresholdsStyle": {"mode": "line"}},
|
||||
"mappings": [],
|
||||
"max": 100,
|
||||
"min": 0,
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 80}, {"color": "red", "value": 95}]},
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 15},
|
||||
"id": 22,
|
||||
"options": {"legend": {"calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "right", "showLegend": true}, "tooltip": {"mode": "multi", "sort": "desc"}},
|
||||
"pluginVersion": "10.0.0",
|
||||
"targets": [{"expr": "hwmon_memory_usage_percent", "legendFormat": "{{hostname}}", "refId": "A"}],
|
||||
"title": "Memory Usage",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 23},
|
||||
"id": 30,
|
||||
"panels": [],
|
||||
"title": "Ceph Cluster",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "thresholds"},
|
||||
"mappings": [],
|
||||
"max": 100,
|
||||
"min": 0,
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 70}, {"color": "red", "value": 85}]},
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {"h": 6, "w": 6, "x": 0, "y": 24},
|
||||
"id": 31,
|
||||
"options": {"orientation": "auto", "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "showThresholdLabels": false, "showThresholdMarkers": true},
|
||||
"pluginVersion": "10.0.0",
|
||||
"targets": [{"expr": "max(hwmon_ceph_cluster_usage_percent)", "legendFormat": "Usage", "refId": "A"}],
|
||||
"title": "Ceph Cluster Usage",
|
||||
"type": "gauge"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "palette-classic"},
|
||||
"custom": {"axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": {"legend": false, "tooltip": false, "viz": false}, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": {"type": "linear"}, "showPoints": "never", "spanNulls": false, "stacking": {"group": "A", "mode": "none"}, "thresholdsStyle": {"mode": "line"}},
|
||||
"mappings": [],
|
||||
"max": 100,
|
||||
"min": 0,
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 70}, {"color": "red", "value": 85}]},
|
||||
"unit": "percent"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {"h": 6, "w": 18, "x": 6, "y": 24},
|
||||
"id": 32,
|
||||
"options": {"legend": {"calcs": ["lastNotNull"], "displayMode": "list", "placement": "right", "showLegend": true}, "tooltip": {"mode": "single", "sort": "none"}},
|
||||
"pluginVersion": "10.0.0",
|
||||
"targets": [{"expr": "hwmon_ceph_cluster_usage_percent", "legendFormat": "{{hostname}}", "refId": "A"}],
|
||||
"title": "Ceph Usage Over Time",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {"h": 1, "w": 24, "x": 0, "y": 30},
|
||||
"id": 40,
|
||||
"panels": [],
|
||||
"title": "LXC Containers",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "thresholds"},
|
||||
"custom": {"align": "auto", "cellOptions": {"type": "auto"}, "inspect": false},
|
||||
"mappings": [],
|
||||
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 80}, {"color": "red", "value": 95}]}
|
||||
},
|
||||
"overrides": [
|
||||
{"matcher": {"id": "byName", "options": "Usage"}, "properties": [{"id": "custom.cellOptions", "value": {"mode": "gradient", "type": "gauge"}}, {"id": "unit", "value": "percent"}, {"id": "max", "value": 100}, {"id": "min", "value": 0}]}
|
||||
]
|
||||
},
|
||||
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 31},
|
||||
"id": 41,
|
||||
"options": {"cellHeight": "sm", "footer": {"countRows": false, "fields": "", "reducer": ["sum"], "show": false}, "showHeader": true, "sortBy": [{"desc": true, "displayName": "Usage"}]},
|
||||
"pluginVersion": "10.0.0",
|
||||
"targets": [{"expr": "hwmon_lxc_storage_usage_percent", "format": "table", "instant": true, "legendFormat": "", "refId": "A"}],
|
||||
"title": "LXC Storage Usage",
|
||||
"transformations": [
|
||||
{"id": "organize", "options": {"excludeByName": {"Time": true, "__name__": true}, "indexByName": {}, "renameByName": {"Value": "Usage", "hostname": "Host", "mountpoint": "Mountpoint", "vmid": "Container ID"}}}
|
||||
],
|
||||
"type": "table"
|
||||
}
|
||||
],
|
||||
"refresh": "1m",
|
||||
"schemaVersion": 38,
|
||||
"style": "dark",
|
||||
"tags": ["hwmon", "hardware", "monitoring", "proxmox"],
|
||||
"templating": {"list": []},
|
||||
"time": {"from": "now-6h", "to": "now"},
|
||||
"timepicker": {},
|
||||
"timezone": "browser",
|
||||
"title": "hwmonDaemon - Hardware Monitor",
|
||||
"uid": "hwmondaemon",
|
||||
"version": 1,
|
||||
"weekStart": ""
|
||||
}
|
||||
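Not part of the commit, but a quick way to sanity-check the template above, assuming it is saved as grafana-dashboard.json in the working directory: list each panel and the PromQL expressions it queries.

import json

with open('grafana-dashboard.json') as f:
    dashboard = json.load(f)

# Rows carry no targets; the stat/gauge/table/timeseries panels hold the hwmon_* queries.
for panel in dashboard.get('panels', []):
    exprs = [t.get('expr', '') for t in panel.get('targets', [])]
    print(f"{panel.get('type', '?'):12s} {panel.get('title', '')!r} -> {exprs}")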
hwmonDaemon.py (592 lines changed)
@@ -69,7 +69,16 @@ class SystemHealthMonitor:
|
||||
'TEMPERATURE_INFO': PRIORITIES['LOW'],
|
||||
'DRIVE_AGE_INFO': PRIORITIES['LOW'],
|
||||
'SSD_WEAR_INFO': PRIORITIES['LOW'],
|
||||
- 'SYSTEM_LOG_INFO': PRIORITIES['LOW']
+ 'SYSTEM_LOG_INFO': PRIORITIES['LOW'],
|
||||
|
||||
# Ceph cluster issues
|
||||
'CEPH_HEALTH_ERR': PRIORITIES['CRITICAL'], # P1 - Cluster in error state
|
||||
'CEPH_HEALTH_WARN': PRIORITIES['MEDIUM'], # P3 - Cluster warnings
|
||||
'CEPH_OSD_DOWN': PRIORITIES['HIGH'], # P2 - OSD down (local node)
|
||||
'CEPH_USAGE_CRITICAL': PRIORITIES['HIGH'], # P2 - Cluster near full
|
||||
'CEPH_USAGE_WARNING': PRIORITIES['MEDIUM'], # P3 - Cluster usage high
|
||||
'CEPH_PG_DEGRADED': PRIORITIES['HIGH'], # P2 - PGs degraded
|
||||
'CEPH_MON_DOWN': PRIORITIES['HIGH'] # P2 - Monitor down
|
||||
}
|
||||
|
||||
CONFIG = {
|
||||
@@ -104,7 +113,16 @@ class SystemHealthMonitor:
|
||||
'HISTORY_DIR': '/var/log/hwmonDaemon',
|
||||
'HISTORY_RETENTION_DAYS': 30,
|
||||
'INCLUDE_INFO_TICKETS': False, # Set True to create P5 tickets for INFO alerts
|
||||
- 'PRIORITY_ESCALATION_THRESHOLD': 3 # Number of criticals to trigger P1
+ 'PRIORITY_ESCALATION_THRESHOLD': 3, # Number of criticals to trigger P1
|
||||
# Ceph monitoring settings
|
||||
'CEPH_ENABLED': True, # Enable/disable Ceph health monitoring
|
||||
'CEPH_TICKET_NODE': None, # Hostname of node designated to create cluster-wide Ceph tickets
|
||||
'CEPH_USAGE_WARNING': 70, # Ceph cluster usage warning threshold %
|
||||
'CEPH_USAGE_CRITICAL': 85, # Ceph cluster usage critical threshold %
|
||||
# Prometheus metrics settings
|
||||
'PROMETHEUS_ENABLED': False, # Enable Prometheus metrics export
|
||||
'PROMETHEUS_PORT': 9101, # Port for Prometheus metrics HTTP server
|
||||
'PROMETHEUS_TEXTFILE_PATH': None # Path for textfile collector (alternative to HTTP)
|
||||
}
|
||||
|
||||
@classmethod
|
||||
@@ -138,6 +156,26 @@ class SystemHealthMonitor:
|
||||
elif key == 'TICKET_API_URL':
|
||||
cls.CONFIG['TICKET_API_URL'] = value
|
||||
logger.info(f"✓ Loaded TICKET_API_URL: {value}")
|
||||
# Ceph settings
|
||||
elif key == 'CEPH_ENABLED':
|
||||
cls.CONFIG['CEPH_ENABLED'] = value.lower() in ('true', '1', 'yes')
|
||||
logger.info(f"✓ Loaded CEPH_ENABLED: {cls.CONFIG['CEPH_ENABLED']}")
|
||||
elif key == 'CEPH_TICKET_NODE':
|
||||
cls.CONFIG['CEPH_TICKET_NODE'] = value if value else None
|
||||
logger.info(f"✓ Loaded CEPH_TICKET_NODE: {value}")
|
||||
elif key == 'CEPH_USAGE_WARNING':
|
||||
cls.CONFIG['CEPH_USAGE_WARNING'] = int(value)
|
||||
elif key == 'CEPH_USAGE_CRITICAL':
|
||||
cls.CONFIG['CEPH_USAGE_CRITICAL'] = int(value)
|
||||
# Prometheus settings
|
||||
elif key == 'PROMETHEUS_ENABLED':
|
||||
cls.CONFIG['PROMETHEUS_ENABLED'] = value.lower() in ('true', '1', 'yes')
|
||||
logger.info(f"✓ Loaded PROMETHEUS_ENABLED: {cls.CONFIG['PROMETHEUS_ENABLED']}")
|
||||
elif key == 'PROMETHEUS_PORT':
|
||||
cls.CONFIG['PROMETHEUS_PORT'] = int(value)
|
||||
elif key == 'PROMETHEUS_TEXTFILE_PATH':
|
||||
cls.CONFIG['PROMETHEUS_TEXTFILE_PATH'] = value if value else None
|
||||
logger.info(f"✓ Loaded PROMETHEUS_TEXTFILE_PATH: {value}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to load .env file: {e}")
|
||||
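# Illustrative .env entries the branches above would parse (the threshold values
# match the shipped defaults; the node name and textfile path are examples only):
#   CEPH_ENABLED=true
#   CEPH_TICKET_NODE=pve-node1
#   CEPH_USAGE_WARNING=70
#   CEPH_USAGE_CRITICAL=85
#   PROMETHEUS_ENABLED=true
#   PROMETHEUS_TEXTFILE_PATH=/var/lib/node_exporter/textfile_collector/hwmon.prom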
@@ -632,9 +670,13 @@ class SystemHealthMonitor:
|
||||
try:
|
||||
# Perform health checks and gather the report
|
||||
health_report = self.perform_health_checks()
|
||||
|
||||
|
||||
# Create tickets for any detected critical issues
|
||||
self._create_tickets_for_issues(health_report)
|
||||
|
||||
# Export Prometheus metrics if enabled
|
||||
if self.CONFIG.get('PROMETHEUS_ENABLED', False):
|
||||
self.write_prometheus_metrics(health_report)
|
||||
except Exception as e:
|
||||
import traceback
|
||||
logger.error(f"Unexpected error during health check: {e}")
|
||||
@@ -643,10 +685,13 @@ class SystemHealthMonitor:
|
||||
def perform_health_checks(self) -> Dict[str, Any]:
|
||||
"""Perform comprehensive system health checks and return a report."""
|
||||
health_report = {
|
||||
'hostname': socket.gethostname(),
|
||||
'timestamp': datetime.datetime.now().isoformat(),
|
||||
'drives_health': self._check_drives_health(),
|
||||
'memory_health': self._check_memory_usage(),
|
||||
'cpu_health': self._check_cpu_usage(),
|
||||
'network_health': self._check_network_status(),
|
||||
'ceph_health': self._check_ceph_health(),
|
||||
'lxc_health': self._check_lxc_storage(),
|
||||
'system_health': self._check_system_drive_indicators()
|
||||
}
|
||||
@@ -682,8 +727,25 @@ class SystemHealthMonitor:
|
||||
|
||||
logger.info("\nNetwork Status:")
|
||||
logger.info(f"Management: {health_report['network_health']['management_network']['status']}")
|
||||
logger.info(f"Ceph: {health_report['network_health']['ceph_network']['status']}")
|
||||
|
||||
logger.info(f"Ceph Network: {health_report['network_health']['ceph_network']['status']}")
|
||||
|
||||
# Ceph cluster status
|
||||
ceph = health_report.get('ceph_health', {})
|
||||
if ceph.get('is_ceph_node'):
|
||||
logger.info("\nCeph Cluster Status:")
|
||||
logger.info(f" Cluster Health: {ceph.get('cluster_health', 'UNKNOWN')}")
|
||||
if ceph.get('cluster_usage'):
|
||||
usage = ceph['cluster_usage']
|
||||
logger.info(f" Cluster Usage: {usage.get('usage_percent', 0):.1f}%")
|
||||
logger.info(f" OSDs: {len(ceph.get('osd_status', []))} total")
|
||||
down_osds = [o for o in ceph.get('osd_status', []) if o.get('status') == 'down']
|
||||
if down_osds:
|
||||
logger.info(f" ⚠️ Down OSDs: {len(down_osds)}")
|
||||
if ceph.get('cluster_wide_issues'):
|
||||
logger.info(f" ⚠️ Cluster-wide issues: {len(ceph['cluster_wide_issues'])}")
|
||||
if ceph.get('issues'):
|
||||
logger.info(f" ⚠️ Node-specific issues: {len(ceph['issues'])}")
|
||||
|
||||
if health_report['system_health']['issues']:
|
||||
logger.info(f"\nSystem Issues: {len(health_report['system_health']['issues'])} found")
|
||||
|
||||
@@ -1296,8 +1358,9 @@ class SystemHealthMonitor:
|
||||
|
||||
# P1 - Specific cluster-affecting scenarios
|
||||
if any(keyword in issue_lower for keyword in [
|
||||
- 'cluster', 'raid degraded', 'multiple drive',
- 'both networks unreachable'
+ 'raid degraded', 'multiple drive',
+ 'both networks unreachable',
+ 'health_err' # Ceph cluster error
|
||||
]):
|
||||
return self.PRIORITIES['CRITICAL'] # P1
|
||||
|
||||
@@ -1310,10 +1373,16 @@ class SystemHealthMonitor:
|
||||
'reallocated_sector', 'pending_sector', 'offline_uncorrectable',
|
||||
'critical available_spare', 'critical wear',
|
||||
'critical reallocated', 'critical current_pending',
|
||||
- 'network is unreachable'
+ 'network is unreachable',
+ 'osd is down', 'osd down', # Ceph OSD down
+ 'cluster usage critical' # Ceph usage critical
|
||||
]):
|
||||
return self.PRIORITIES['HIGH'] # P2
|
||||
|
||||
# P2 - Ceph OSD issues (need to check explicitly since 'down' is in issue text)
|
||||
if '[ceph]' in issue_lower and 'down' in issue_lower:
|
||||
return self.PRIORITIES['HIGH'] # P2
|
||||
|
||||
# P2 - SMART issues with critical indicators
|
||||
if 'smart issues' in issue_lower and any(error_type in issue_lower for error_type in [
|
||||
'critical', 'failed', 'reallocated', 'pending', 'uncorrectable', 'offline'
|
||||
@@ -1324,7 +1393,8 @@ class SystemHealthMonitor:
|
||||
if any(keyword in issue_lower for keyword in [
|
||||
'warning', 'high temperature', 'correctable ecc',
|
||||
'trend alert', 'critical storage usage',
|
||||
- 'low available_spare', 'high wear'
+ 'low available_spare', 'high wear',
+ 'health_warn', 'cluster usage warning' # Ceph warnings
|
||||
]):
|
||||
return self.PRIORITIES['MEDIUM'] # P3
|
||||
|
||||
@@ -1425,6 +1495,29 @@ class SystemHealthMonitor:
|
||||
self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM']
|
||||
)
|
||||
|
||||
# Ceph Issues - Storage cluster issues (categorized as Hardware)
|
||||
if any(keyword in issue_lower for keyword in [
|
||||
'ceph', 'osd', 'health_err', 'health_warn', 'cluster usage'
|
||||
]):
|
||||
# Ceph errors are issues (unplanned degradation)
|
||||
if any(error in issue_lower for error in [
|
||||
'health_err', 'down', 'critical', 'error'
|
||||
]):
|
||||
return (
|
||||
self.TICKET_CATEGORIES['HARDWARE'],
|
||||
self.TICKET_TYPES['ISSUE'],
|
||||
'[ceph]',
|
||||
self.TICKET_TEMPLATES['TICKET_TYPE']['ISSUE']
|
||||
)
|
||||
# Ceph warnings are problems (need investigation)
|
||||
else:
|
||||
return (
|
||||
self.TICKET_CATEGORIES['HARDWARE'],
|
||||
self.TICKET_TYPES['PROBLEM'],
|
||||
'[ceph]',
|
||||
self.TICKET_TEMPLATES['TICKET_TYPE']['PROBLEM']
|
||||
)
|
||||
|
||||
# Default: Hardware Problem (for undefined cases)
|
||||
return (
|
||||
self.TICKET_CATEGORIES['HARDWARE'],
|
||||
@@ -1446,7 +1539,6 @@ class SystemHealthMonitor:
|
||||
hostname = socket.gethostname()
|
||||
action_type = self.TICKET_TEMPLATES['ACTION_TYPE']
|
||||
environment = self.TICKET_TEMPLATES['ENVIRONMENT']
|
||||
- scope = self.TICKET_TEMPLATES['SCOPE']['SINGLE_NODE']
|
||||
|
||||
for issue in issues:
|
||||
# Use the comprehensive priority determination function
|
||||
@@ -1455,6 +1547,15 @@ class SystemHealthMonitor:
|
||||
# Get proper categorization for this issue
|
||||
category, ticket_type, issue_tag, ticket_type_tag = self._categorize_issue(issue)
|
||||
|
||||
# Determine scope: cluster-wide for Ceph cluster issues, single-node otherwise
|
||||
is_cluster_wide = '[cluster-wide]' in issue
|
||||
scope = self.TICKET_TEMPLATES['SCOPE']['CLUSTER_WIDE'] if is_cluster_wide else self.TICKET_TEMPLATES['SCOPE']['SINGLE_NODE']
|
||||
|
||||
# Clean issue text for title (remove [cluster-wide] and [ceph] markers if present)
|
||||
clean_issue = issue
|
||||
if is_cluster_wide:
|
||||
clean_issue = clean_issue.replace('[cluster-wide] ', '').replace('[cluster-wide]', '')
|
||||
|
||||
# Extract drive capacity if this is a drive-related issue
|
||||
drive_size = ""
|
||||
if "Drive" in issue and "/dev/" in issue:
|
||||
@@ -1473,7 +1574,7 @@ class SystemHealthMonitor:
|
||||
f"{action_type['AUTO']}"
|
||||
f"{issue_tag}"
|
||||
f"{drive_size}"
|
||||
f"{issue}"
|
||||
f"{clean_issue}"
|
||||
f"{scope}"
|
||||
f"{environment['PRODUCTION']}"
|
||||
f"{ticket_type_tag}"
|
||||
@@ -1596,6 +1697,29 @@ class SystemHealthMonitor:
|
||||
if system_health.get('issues'):
|
||||
issues.extend(system_health['issues'])
|
||||
|
||||
# Check for Ceph cluster issues
|
||||
ceph_health = health_report.get('ceph_health', {})
|
||||
if ceph_health.get('is_ceph_node'):
|
||||
hostname = socket.gethostname()
|
||||
designated_node = self.CONFIG.get('CEPH_TICKET_NODE')
|
||||
|
||||
# Cluster-wide issues: only create tickets from designated node (or first node if not set)
|
||||
# The [cluster-wide] tag ensures deduplication in tinker_tickets API
|
||||
if ceph_health.get('cluster_wide_issues'):
|
||||
# If no designated node, all nodes can report (API deduplicates)
|
||||
# If designated node is set, only that node creates tickets
|
||||
if not designated_node or hostname == designated_node:
|
||||
for issue in ceph_health['cluster_wide_issues']:
|
||||
# Add [cluster-wide] marker for API deduplication
|
||||
issues.append(f"[cluster-wide] [ceph] {issue}")
|
||||
else:
|
||||
logger.debug(f"Skipping cluster-wide Ceph issues (designated node: {designated_node})")
|
||||
|
||||
# Node-specific issues: always report from the affected node
|
||||
if ceph_health.get('issues'):
|
||||
for issue in ceph_health['issues']:
|
||||
issues.append(f"[ceph] {issue}")
|
||||
|
||||
logger.info("=== Issue Detection Started ===")
|
||||
logger.info(f"Checking drives: {len(health_report['drives_health']['drives'])} found")
|
||||
logger.info(f"Memory status: {health_report['memory_health']['status']}")
|
||||
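# Hedged restatement of the gating above (the helper below is illustrative, not part
# of this file): cluster-wide Ceph issues are ticketed only from the designated node,
# or from every node (relying on API deduplication) when CEPH_TICKET_NODE is unset.
def _reports_cluster_issues(hostname, designated_node):
    return not designated_node or hostname == designated_node

assert _reports_cluster_issues("pve1", None)          # unset: every node may report
assert _reports_cluster_issues("pve1", "pve1")        # designated node reports
assert not _reports_cluster_issues("pve2", "pve1")    # other nodes skip cluster-wide tickets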
@@ -2669,7 +2793,425 @@ class SystemHealthMonitor:
|
||||
'status': 'ERROR',
|
||||
'error': str(e)
|
||||
}
|
||||
|
||||
|
||||
def _check_ceph_health(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Check Ceph cluster health if this node is part of a Ceph cluster.
|
||||
|
||||
Returns health status, cluster info, and any issues detected.
|
||||
Cluster-wide issues use [cluster-wide] tag for cross-node deduplication.
|
||||
"""
|
||||
import shutil
|
||||
|
||||
ceph_health = {
|
||||
'status': 'OK',
|
||||
'is_ceph_node': False,
|
||||
'cluster_health': None,
|
||||
'cluster_usage': None,
|
||||
'osd_status': [],
|
||||
'mon_status': [],
|
||||
'issues': [],
|
||||
'cluster_wide_issues': [] # Issues that apply to entire cluster
|
||||
}
|
||||
|
||||
# Check if Ceph monitoring is enabled
|
||||
if not self.CONFIG.get('CEPH_ENABLED', True):
|
||||
logger.debug("Ceph monitoring disabled in config")
|
||||
return ceph_health
|
||||
|
||||
# Check if ceph CLI is available
|
||||
if not shutil.which('ceph'):
|
||||
logger.debug("Ceph CLI not found - not a Ceph node")
|
||||
return ceph_health
|
||||
|
||||
ceph_health['is_ceph_node'] = True
|
||||
hostname = socket.gethostname()
|
||||
|
||||
try:
|
||||
# Get cluster health status
|
||||
health_result = subprocess.run(
|
||||
['ceph', 'health', '--format=json'],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
timeout=30
|
||||
)
|
||||
|
||||
if health_result.returncode == 0:
|
||||
try:
|
||||
health_data = json.loads(health_result.stdout)
|
||||
ceph_health['cluster_health'] = health_data.get('status', 'UNKNOWN')
|
||||
|
||||
# Check cluster health status
|
||||
if ceph_health['cluster_health'] == 'HEALTH_ERR':
|
||||
ceph_health['status'] = 'CRITICAL'
|
||||
# This is a cluster-wide issue
|
||||
ceph_health['cluster_wide_issues'].append(
|
||||
f"Ceph cluster HEALTH_ERR: {health_data.get('summary', {}).get('message', 'Unknown error')}"
|
||||
)
|
||||
elif ceph_health['cluster_health'] == 'HEALTH_WARN':
|
||||
if ceph_health['status'] != 'CRITICAL':
|
||||
ceph_health['status'] = 'WARNING'
|
||||
# Extract warning messages
|
||||
checks = health_data.get('checks', {})
|
||||
for check_name, check_data in checks.items():
|
||||
severity = check_data.get('severity', 'HEALTH_WARN')
|
||||
message = check_data.get('summary', {}).get('message', check_name)
|
||||
ceph_health['cluster_wide_issues'].append(
|
||||
f"Ceph HEALTH_WARN: {message}"
|
||||
)
|
||||
except json.JSONDecodeError as e:
|
||||
logger.warning(f"Failed to parse ceph health JSON: {e}")
|
||||
|
||||
# Get cluster usage (ceph df)
|
||||
df_result = subprocess.run(
|
||||
['ceph', 'df', '--format=json'],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
timeout=30
|
||||
)
|
||||
|
||||
if df_result.returncode == 0:
|
||||
try:
|
||||
df_data = json.loads(df_result.stdout)
|
||||
stats = df_data.get('stats', {})
|
||||
total_bytes = stats.get('total_bytes', 0)
|
||||
total_used = stats.get('total_used_raw_bytes', 0)
|
||||
|
||||
if total_bytes > 0:
|
||||
usage_percent = (total_used / total_bytes) * 100
|
||||
ceph_health['cluster_usage'] = {
|
||||
'total_bytes': total_bytes,
|
||||
'used_bytes': total_used,
|
||||
'usage_percent': round(usage_percent, 2)
|
||||
}
|
||||
|
||||
# Check usage thresholds
|
||||
if usage_percent >= self.CONFIG.get('CEPH_USAGE_CRITICAL', 85):
|
||||
ceph_health['status'] = 'CRITICAL'
|
||||
ceph_health['cluster_wide_issues'].append(
|
||||
f"Ceph cluster usage critical: {usage_percent:.1f}%"
|
||||
)
|
||||
elif usage_percent >= self.CONFIG.get('CEPH_USAGE_WARNING', 70):
|
||||
if ceph_health['status'] != 'CRITICAL':
|
||||
ceph_health['status'] = 'WARNING'
|
||||
ceph_health['cluster_wide_issues'].append(
|
||||
f"Ceph cluster usage warning: {usage_percent:.1f}%"
|
||||
)
|
||||
except json.JSONDecodeError as e:
|
||||
logger.warning(f"Failed to parse ceph df JSON: {e}")
|
||||
|
||||
# Get OSD status (check for down OSDs on this node)
|
||||
osd_result = subprocess.run(
|
||||
['ceph', 'osd', 'tree', '--format=json'],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
timeout=30
|
||||
)
|
||||
|
||||
if osd_result.returncode == 0:
|
||||
try:
|
||||
osd_data = json.loads(osd_result.stdout)
|
||||
nodes = osd_data.get('nodes', [])
|
||||
|
||||
# Find OSDs on this host
|
||||
host_id = None
|
||||
for node in nodes:
|
||||
if node.get('type') == 'host' and node.get('name') == hostname:
|
||||
host_id = node.get('id')
|
||||
break
|
||||
|
||||
# Check OSD status for this host
|
||||
for node in nodes:
|
||||
if node.get('type') == 'osd':
|
||||
osd_info = {
|
||||
'id': node.get('id'),
|
||||
'name': node.get('name'),
|
||||
'status': node.get('status', 'unknown'),
|
||||
'reweight': node.get('reweight', 1.0)
|
||||
}
|
||||
|
||||
# Check if OSD belongs to this host (by checking parent in tree)
|
||||
# Simplified: just track all OSDs for now
|
||||
ceph_health['osd_status'].append(osd_info)
|
||||
|
||||
# Check for down OSDs
|
||||
if node.get('status') == 'down':
|
||||
ceph_health['status'] = 'CRITICAL'
|
||||
# Node-specific issue (will include hostname in hash)
|
||||
ceph_health['issues'].append(
|
||||
f"Ceph OSD {node.get('name')} is DOWN on {hostname}"
|
||||
)
|
||||
except json.JSONDecodeError as e:
|
||||
logger.warning(f"Failed to parse ceph osd tree JSON: {e}")
|
||||
|
||||
# Get monitor status
|
||||
mon_result = subprocess.run(
|
||||
['ceph', 'mon', 'stat', '--format=json'],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
timeout=30
|
||||
)
|
||||
|
||||
if mon_result.returncode == 0:
|
||||
try:
|
||||
mon_data = json.loads(mon_result.stdout)
|
||||
ceph_health['mon_status'] = {
|
||||
'quorum': mon_data.get('quorum', []),
|
||||
'quorum_names': mon_data.get('quorum_names', [])
|
||||
}
|
||||
except json.JSONDecodeError as e:
|
||||
logger.warning(f"Failed to parse ceph mon stat JSON: {e}")
|
||||
|
||||
logger.debug(f"=== Ceph Health Check ===")
|
||||
logger.debug(f"Is Ceph node: {ceph_health['is_ceph_node']}")
|
||||
logger.debug(f"Cluster health: {ceph_health['cluster_health']}")
|
||||
logger.debug(f"Cluster usage: {ceph_health['cluster_usage']}")
|
||||
logger.debug(f"Status: {ceph_health['status']}")
|
||||
logger.debug(f"Issues: {ceph_health['issues']}")
|
||||
logger.debug(f"Cluster-wide issues: {ceph_health['cluster_wide_issues']}")
|
||||
logger.debug("=== End Ceph Health Check ===")
|
||||
|
||||
except subprocess.TimeoutExpired:
|
||||
ceph_health['status'] = 'ERROR'
|
||||
ceph_health['issues'].append("Ceph health check timed out")
|
||||
except Exception as e:
|
||||
ceph_health['status'] = 'ERROR'
|
||||
ceph_health['issues'].append(f"Error checking Ceph health: {str(e)}")
|
||||
logger.error(f"Ceph health check failed: {e}")
|
||||
|
||||
return ceph_health
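# For reference, a hedged sketch of the JSON shape parsed above ('status' plus
# per-check summaries); the exact payload varies with the Ceph release:
import json

_sample = json.loads(
    '{"status": "HEALTH_WARN",'
    ' "checks": {"OSDMAP_FLAGS": {"severity": "HEALTH_WARN",'
    ' "summary": {"message": "noout flag(s) set"}}}}'
)
print(_sample["status"])                                   # HEALTH_WARN
for _name, _check in _sample.get("checks", {}).items():
    print(_name, _check["summary"]["message"])             # OSDMAP_FLAGS noout flag(s) set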
|
||||
|
||||
# =============================================================================
|
||||
# PROMETHEUS METRICS EXPORT
|
||||
# =============================================================================
|
||||
def export_prometheus_metrics(self, health_report: Dict[str, Any]) -> str:
|
||||
"""
|
||||
Export health report as Prometheus metrics in text format.
|
||||
|
||||
Metrics follow Prometheus naming conventions:
|
||||
- hwmon_* prefix for all metrics
|
||||
- Labels for dimensions (device, hostname, container, etc.)
|
||||
|
||||
Returns:
|
||||
str: Prometheus text format metrics
|
||||
"""
|
||||
hostname = health_report.get('hostname', socket.gethostname())
|
||||
metrics = []
|
||||
|
||||
# Helper to format labels
|
||||
def labels(**kwargs) -> str:
|
||||
pairs = [f'{k}="{v}"' for k, v in kwargs.items() if v is not None]
|
||||
return '{' + ','.join(pairs) + '}' if pairs else ''
|
||||
|
||||
# === System Info ===
|
||||
metrics.append(f'# HELP hwmon_info System information')
|
||||
metrics.append(f'# TYPE hwmon_info gauge')
|
||||
metrics.append(f'hwmon_info{labels(hostname=hostname)} 1')
|
||||
|
||||
# === Drive Metrics ===
|
||||
metrics.append(f'# HELP hwmon_drive_smart_healthy SMART health status (1=healthy, 0=unhealthy)')
|
||||
metrics.append(f'# TYPE hwmon_drive_smart_healthy gauge')
|
||||
|
||||
metrics.append(f'# HELP hwmon_drive_temperature_celsius Drive temperature in Celsius')
|
||||
metrics.append(f'# TYPE hwmon_drive_temperature_celsius gauge')
|
||||
|
||||
metrics.append(f'# HELP hwmon_drive_size_bytes Drive total size in bytes')
|
||||
metrics.append(f'# TYPE hwmon_drive_size_bytes gauge')
|
||||
|
||||
metrics.append(f'# HELP hwmon_drive_smart_issues_total Number of SMART issues detected')
|
||||
metrics.append(f'# TYPE hwmon_drive_smart_issues_total gauge')
|
||||
|
||||
for drive in health_report.get('drives_health', {}).get('drives', []):
|
||||
device = drive.get('device', 'unknown')
|
||||
drive_labels = labels(hostname=hostname, device=device)
|
||||
|
||||
# SMART health status
|
||||
smart_status = drive.get('smart_status', 'UNKNOWN')
|
||||
healthy = 1 if smart_status == 'HEALTHY' else 0
|
||||
metrics.append(f'hwmon_drive_smart_healthy{drive_labels} {healthy}')
|
||||
|
||||
# Temperature
|
||||
if drive.get('temperature'):
|
||||
metrics.append(f'hwmon_drive_temperature_celsius{drive_labels} {drive["temperature"]}')
|
||||
|
||||
# Drive size (convert human-readable to bytes if possible)
|
||||
if drive.get('capacity'):
|
||||
capacity_bytes = self._parse_size_to_bytes(drive['capacity'])
|
||||
if capacity_bytes:
|
||||
metrics.append(f'hwmon_drive_size_bytes{drive_labels} {capacity_bytes}')
|
||||
|
||||
# Issue count
|
||||
issues_count = len(drive.get('smart_issues', []))
|
||||
metrics.append(f'hwmon_drive_smart_issues_total{drive_labels} {issues_count}')
|
||||
|
||||
# === CPU Metrics ===
|
||||
cpu = health_report.get('cpu_health', {})
|
||||
metrics.append(f'# HELP hwmon_cpu_usage_percent CPU usage percentage')
|
||||
metrics.append(f'# TYPE hwmon_cpu_usage_percent gauge')
|
||||
if cpu.get('cpu_usage_percent') is not None:
|
||||
metrics.append(f'hwmon_cpu_usage_percent{labels(hostname=hostname)} {cpu["cpu_usage_percent"]}')
|
||||
|
||||
# === Memory Metrics ===
|
||||
mem = health_report.get('memory_health', {})
|
||||
metrics.append(f'# HELP hwmon_memory_usage_percent Memory usage percentage')
|
||||
metrics.append(f'# TYPE hwmon_memory_usage_percent gauge')
|
||||
if mem.get('memory_percent') is not None:
|
||||
metrics.append(f'hwmon_memory_usage_percent{labels(hostname=hostname)} {mem["memory_percent"]}')
|
||||
|
||||
metrics.append(f'# HELP hwmon_memory_has_ecc Whether ECC memory is present (1=yes, 0=no)')
|
||||
metrics.append(f'# TYPE hwmon_memory_has_ecc gauge')
|
||||
has_ecc = 1 if mem.get('has_ecc') else 0
|
||||
metrics.append(f'hwmon_memory_has_ecc{labels(hostname=hostname)} {has_ecc}')
|
||||
|
||||
if mem.get('has_ecc'):
|
||||
metrics.append(f'# HELP hwmon_memory_ecc_errors_total Total ECC errors detected')
|
||||
metrics.append(f'# TYPE hwmon_memory_ecc_errors_total gauge')
|
||||
ecc_errors = len(mem.get('ecc_errors', []))
|
||||
metrics.append(f'hwmon_memory_ecc_errors_total{labels(hostname=hostname)} {ecc_errors}')
|
||||
|
||||
# === Network Metrics ===
|
||||
net = health_report.get('network_health', {})
|
||||
metrics.append(f'# HELP hwmon_network_status Network status (1=OK, 0=issue)')
|
||||
metrics.append(f'# TYPE hwmon_network_status gauge')
|
||||
|
||||
for net_type in ['management_network', 'ceph_network']:
|
||||
net_info = net.get(net_type, {})
|
||||
status = 1 if net_info.get('status') == 'OK' else 0
|
||||
net_name = net_type.replace('_network', '')
|
||||
metrics.append(f'hwmon_network_status{labels(hostname=hostname, network=net_name)} {status}')
|
||||
|
||||
# === Ceph Metrics ===
|
||||
ceph = health_report.get('ceph_health', {})
|
||||
if ceph.get('is_ceph_node'):
|
||||
metrics.append(f'# HELP hwmon_ceph_cluster_healthy Ceph cluster health (1=healthy, 0=warning/error)')
|
||||
metrics.append(f'# TYPE hwmon_ceph_cluster_healthy gauge')
|
||||
ceph_healthy = 1 if ceph.get('cluster_health') == 'HEALTH_OK' else 0
|
||||
metrics.append(f'hwmon_ceph_cluster_healthy{labels(hostname=hostname)} {ceph_healthy}')
|
||||
|
||||
if ceph.get('cluster_usage'):
|
||||
usage = ceph['cluster_usage']
|
||||
metrics.append(f'# HELP hwmon_ceph_cluster_usage_percent Ceph cluster usage percentage')
|
||||
metrics.append(f'# TYPE hwmon_ceph_cluster_usage_percent gauge')
|
||||
metrics.append(f'hwmon_ceph_cluster_usage_percent{labels(hostname=hostname)} {usage.get("usage_percent", 0)}')
|
||||
|
||||
metrics.append(f'# HELP hwmon_ceph_cluster_bytes_total Ceph cluster total bytes')
|
||||
metrics.append(f'# TYPE hwmon_ceph_cluster_bytes_total gauge')
|
||||
metrics.append(f'hwmon_ceph_cluster_bytes_total{labels(hostname=hostname)} {usage.get("total_bytes", 0)}')
|
||||
|
||||
metrics.append(f'# HELP hwmon_ceph_cluster_bytes_used Ceph cluster used bytes')
|
||||
metrics.append(f'# TYPE hwmon_ceph_cluster_bytes_used gauge')
|
||||
metrics.append(f'hwmon_ceph_cluster_bytes_used{labels(hostname=hostname)} {usage.get("used_bytes", 0)}')
|
||||
|
||||
metrics.append(f'# HELP hwmon_ceph_osd_total Total number of OSDs')
|
||||
metrics.append(f'# TYPE hwmon_ceph_osd_total gauge')
|
||||
osd_count = len(ceph.get('osd_status', []))
|
||||
metrics.append(f'hwmon_ceph_osd_total{labels(hostname=hostname)} {osd_count}')
|
||||
|
||||
metrics.append(f'# HELP hwmon_ceph_osd_down Number of down OSDs')
|
||||
metrics.append(f'# TYPE hwmon_ceph_osd_down gauge')
|
||||
down_osds = len([o for o in ceph.get('osd_status', []) if o.get('status') == 'down'])
|
||||
metrics.append(f'hwmon_ceph_osd_down{labels(hostname=hostname)} {down_osds}')
|
||||
|
||||
# === LXC Metrics ===
|
||||
lxc = health_report.get('lxc_health', {})
|
||||
if lxc.get('containers'):
|
||||
metrics.append(f'# HELP hwmon_lxc_storage_usage_percent LXC container storage usage percentage')
|
||||
metrics.append(f'# TYPE hwmon_lxc_storage_usage_percent gauge')
|
||||
|
||||
for container in lxc['containers']:
|
||||
vmid = container.get('vmid', 'unknown')
|
||||
for fs in container.get('filesystems', []):
|
||||
mountpoint = fs.get('mountpoint', '/')
|
||||
usage = fs.get('usage_percent', 0)
|
||||
metrics.append(f'hwmon_lxc_storage_usage_percent{labels(hostname=hostname, vmid=vmid, mountpoint=mountpoint)} {usage}')
|
||||
|
||||
# === Issue Summary Metrics ===
|
||||
metrics.append(f'# HELP hwmon_issues_total Total number of issues detected')
|
||||
metrics.append(f'# TYPE hwmon_issues_total gauge')
|
||||
|
||||
system_issues = len(health_report.get('system_health', {}).get('issues', []))
|
||||
ceph_issues = len(ceph.get('issues', [])) + len(ceph.get('cluster_wide_issues', []))
|
||||
lxc_issues = len(lxc.get('issues', []))
|
||||
total_issues = system_issues + ceph_issues + lxc_issues
|
||||
metrics.append(f'hwmon_issues_total{labels(hostname=hostname)} {total_issues}')
|
||||
|
||||
return '\n'.join(metrics) + '\n'
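# The string returned above is Prometheus exposition-format text; illustrative
# sample (hostnames, devices and values are made up):
#   # HELP hwmon_drive_temperature_celsius Drive temperature in Celsius
#   # TYPE hwmon_drive_temperature_celsius gauge
#   hwmon_drive_temperature_celsius{hostname="pve1",device="/dev/sda"} 34
#   hwmon_ceph_cluster_usage_percent{hostname="pve1"} 42.7
#   hwmon_issues_total{hostname="pve1"} 0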
|
||||
|
||||
def _parse_size_to_bytes(self, size_str: str) -> int:
|
||||
"""Parse human-readable size string to bytes."""
|
||||
if not size_str:
|
||||
return 0
|
||||
|
||||
size_str = size_str.strip().upper()
|
||||
multipliers = {
|
||||
'B': 1,
|
||||
'KB': 1024,
|
||||
'MB': 1024**2,
|
||||
'GB': 1024**3,
|
||||
'TB': 1024**4,
|
||||
'PB': 1024**5,
|
||||
'K': 1024,
|
||||
'M': 1024**2,
|
||||
'G': 1024**3,
|
||||
'T': 1024**4,
|
||||
'P': 1024**5
|
||||
}
|
||||
|
||||
try:
|
||||
for suffix, mult in sorted(multipliers.items(), key=lambda x: -len(x[0])):
|
||||
if size_str.endswith(suffix):
|
||||
num = float(size_str[:-len(suffix)].strip())
|
||||
return int(num * mult)
|
||||
return int(float(size_str))
|
||||
except (ValueError, TypeError):
|
||||
return 0
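# Hedged examples of what _parse_size_to_bytes yields (binary multipliers per the
# table above; values worked by hand):
#   _parse_size_to_bytes("500GB") -> 536870912000   (500 * 1024**3)
#   _parse_size_to_bytes("1.8T")  -> 1979120929996  (int(1.8 * 1024**4))
#   _parse_size_to_bytes("n/a")   -> 0              (unparseable input falls back to 0)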
|
||||
|
||||
def write_prometheus_metrics(self, health_report: Dict[str, Any]) -> bool:
|
||||
"""
|
||||
Write Prometheus metrics to configured destination.
|
||||
|
||||
If PROMETHEUS_TEXTFILE_PATH is set, writes to that file for node_exporter.
|
||||
Otherwise, logs the metrics (for debugging or other use).
|
||||
|
||||
Returns:
|
||||
bool: True if metrics were written successfully
|
||||
"""
|
||||
if not self.CONFIG.get('PROMETHEUS_ENABLED', False):
|
||||
return False
|
||||
|
||||
try:
|
||||
metrics = self.export_prometheus_metrics(health_report)
|
||||
textfile_path = self.CONFIG.get('PROMETHEUS_TEXTFILE_PATH')
|
||||
|
||||
if textfile_path:
|
||||
# Write to textfile for node_exporter textfile collector
|
||||
# Write to temp file first, then atomic rename
|
||||
import tempfile
|
||||
temp_fd, temp_path = tempfile.mkstemp(
|
||||
dir=os.path.dirname(textfile_path),
|
||||
prefix='.hwmon_metrics_'
|
||||
)
|
||||
try:
|
||||
with os.fdopen(temp_fd, 'w') as f:
|
||||
f.write(metrics)
|
||||
os.rename(temp_path, textfile_path)
|
||||
logger.info(f"Prometheus metrics written to {textfile_path}")
|
||||
except Exception:
|
||||
os.unlink(temp_path)
|
||||
raise
|
||||
else:
|
||||
# Just log metrics (for debugging)
|
||||
logger.debug("Prometheus metrics generated:\n" + metrics)
|
||||
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to write Prometheus metrics: {e}")
|
||||
return False
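# Deployment assumption (not configured by this commit): PROMETHEUS_TEXTFILE_PATH
# should point inside node_exporter's textfile collector directory and end in .prom,
# e.g. node_exporter --collector.textfile.directory=/var/lib/node_exporter/textfile_collector
# with PROMETHEUS_TEXTFILE_PATH=/var/lib/node_exporter/textfile_collector/hwmon.prom,
# so the metrics written here are scraped alongside the standard node metrics.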
|
||||
|
||||
def _check_lxc_storage(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Check storage utilization for all running LXC containers
|
||||
@@ -2802,13 +3344,37 @@ def main():
|
||||
action="store_true",
|
||||
help="Enable dry-run mode (simulate ticket creation without actual API calls)."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--metrics",
|
||||
action="store_true",
|
||||
help="Output Prometheus metrics to stdout and exit."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--export-json",
|
||||
type=str,
|
||||
metavar="FILE",
|
||||
help="Export health report to JSON file."
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
monitor = SystemHealthMonitor(
|
||||
ticket_api_url=SystemHealthMonitor.CONFIG['TICKET_API_URL'],
|
||||
dry_run=args.dry_run
|
||||
)
|
||||
- monitor.run()
|
||||
|
||||
if args.metrics:
|
||||
# Just output metrics to stdout
|
||||
health_report = monitor.perform_health_checks()
|
||||
print(monitor.export_prometheus_metrics(health_report))
|
||||
elif args.export_json:
|
||||
# Export health report as JSON
|
||||
import json
|
||||
health_report = monitor.perform_health_checks()
|
||||
with open(args.export_json, 'w') as f:
|
||||
json.dump(health_report, f, indent=2, default=str)
|
||||
logger.info(f"Health report exported to {args.export_json}")
|
||||
else:
|
||||
monitor.run()
|
||||
|
||||
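# Illustrative invocations of the new flags (paths are examples):
#   python3 hwmonDaemon.py --metrics                        # print Prometheus metrics to stdout and exit
#   python3 hwmonDaemon.py --export-json /tmp/health.json   # write the health report as JSON and exit
#   python3 hwmonDaemon.py --dry-run                        # normal run with ticket creation simulated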
if __name__ == "__main__":
|
||||
main()