changed osd down events to be cluster-wide and deduplicated

2026-01-26 11:03:55 -05:00
parent 1e84144e29
commit 509603843b
2 changed files with 19 additions and 5 deletions

Binary file not shown.


@@ -119,6 +119,8 @@ class SystemHealthMonitor:
         'CEPH_TICKET_NODE': None,  # Hostname of node designated to create cluster-wide Ceph tickets
         'CEPH_USAGE_WARNING': 70,  # Ceph cluster usage warning threshold %
         'CEPH_USAGE_CRITICAL': 85,  # Ceph cluster usage critical threshold %
+        # Cluster identification for tickets
+        'CLUSTER_NAME': 'proxmox-cluster',  # Name used in cluster-wide ticket titles instead of hostname
         # Prometheus metrics settings
         'PROMETHEUS_ENABLED': False,  # Enable Prometheus metrics export
         'PROMETHEUS_PORT': 9101,  # Port for Prometheus metrics HTTP server
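
For reference, overriding the new default takes a single line in the monitor's .env file. The key name comes from this commit; the value below is an invented example:

# .env — illustrative override (value is made up)
CLUSTER_NAME=pve-prod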
@@ -176,6 +178,10 @@ class SystemHealthMonitor:
                         elif key == 'PROMETHEUS_TEXTFILE_PATH':
                             cls.CONFIG['PROMETHEUS_TEXTFILE_PATH'] = value if value else None
                             logger.info(f"✓ Loaded PROMETHEUS_TEXTFILE_PATH: {value}")
+                        # Cluster identification
+                        elif key == 'CLUSTER_NAME':
+                            cls.CONFIG['CLUSTER_NAME'] = value if value else 'proxmox-cluster'
+                            logger.info(f"✓ Loaded CLUSTER_NAME: {value}")
         except Exception as e:
             logger.error(f"Failed to load .env file: {e}")
@@ -1582,8 +1588,14 @@ class SystemHealthMonitor:
         # Build ticket title with proper categorization
         # Add space after issue_tag if drive_size is empty (for non-drive issues)
         issue_separator = drive_size if drive_size else " "
+
+        # Use cluster name for cluster-wide issues instead of individual hostname
+        # This ensures all nodes generate the same ticket title for deduplication
+        cluster_name = self.CONFIG.get('CLUSTER_NAME', 'proxmox-cluster')
+        ticket_source = f"[{cluster_name}]" if is_cluster_wide else f"[{hostname}]"
+
         ticket_title = (
-            f"[{hostname}]"
+            f"{ticket_source}"
             f"{action_type['AUTO']}"
             f"{issue_tag}"
             f"{issue_separator}"
@@ -2950,12 +2962,14 @@ class SystemHealthMonitor:
                     # Simplified: just track all OSDs for now
                     ceph_health['osd_status'].append(osd_info)

-                    # Check for down OSDs
+                    # Check for down OSDs - this is a cluster-wide issue
+                    # All nodes see the same OSD down, so treat as cluster-wide
                     if node.get('status') == 'down':
                         ceph_health['status'] = 'CRITICAL'
-                        # Node-specific issue (will include hostname in hash)
-                        ceph_health['issues'].append(
-                            f"Ceph OSD {node.get('name')} is DOWN on {hostname}"
+                        # Cluster-wide issue - OSD down affects entire cluster
+                        # Do NOT include detecting hostname in message to enable deduplication
+                        ceph_health['cluster_wide_issues'].append(
+                            f"Ceph OSD {node.get('name')} is DOWN"
                         )
         except json.JSONDecodeError as e:
             logger.warning(f"Failed to parse ceph osd tree JSON: {e}")