diff --git a/__pycache__/hwmonDaemon.cpython-311.pyc b/__pycache__/hwmonDaemon.cpython-311.pyc
new file mode 100644
index 0000000..d4f0ae9
Binary files /dev/null and b/__pycache__/hwmonDaemon.cpython-311.pyc differ
diff --git a/hwmonDaemon.py b/hwmonDaemon.py
index 0263ad1..174f85a 100644
--- a/hwmonDaemon.py
+++ b/hwmonDaemon.py
@@ -119,6 +119,8 @@ class SystemHealthMonitor:
         'CEPH_TICKET_NODE': None,  # Hostname of node designated to create cluster-wide Ceph tickets
         'CEPH_USAGE_WARNING': 70,  # Ceph cluster usage warning threshold %
         'CEPH_USAGE_CRITICAL': 85,  # Ceph cluster usage critical threshold %
+        # Cluster identification for tickets
+        'CLUSTER_NAME': 'proxmox-cluster',  # Name used in cluster-wide ticket titles instead of hostname
         # Prometheus metrics settings
         'PROMETHEUS_ENABLED': False,  # Enable Prometheus metrics export
         'PROMETHEUS_PORT': 9101,  # Port for Prometheus metrics HTTP server
@@ -176,6 +178,10 @@ class SystemHealthMonitor:
                     elif key == 'PROMETHEUS_TEXTFILE_PATH':
                         cls.CONFIG['PROMETHEUS_TEXTFILE_PATH'] = value if value else None
                         logger.info(f"✓ Loaded PROMETHEUS_TEXTFILE_PATH: {value}")
+                    # Cluster identification
+                    elif key == 'CLUSTER_NAME':
+                        cls.CONFIG['CLUSTER_NAME'] = value if value else 'proxmox-cluster'
+                        logger.info(f"✓ Loaded CLUSTER_NAME: {value}")
         except Exception as e:
             logger.error(f"Failed to load .env file: {e}")
 
@@ -1582,8 +1588,14 @@ class SystemHealthMonitor:
         # Build ticket title with proper categorization
         # Add space after issue_tag if drive_size is empty (for non-drive issues)
         issue_separator = drive_size if drive_size else " "
+
+        # Use cluster name for cluster-wide issues instead of individual hostname
+        # This ensures all nodes generate the same ticket title for deduplication
+        cluster_name = self.CONFIG.get('CLUSTER_NAME', 'proxmox-cluster')
+        ticket_source = f"[{cluster_name}]" if is_cluster_wide else f"[{hostname}]"
+
         ticket_title = (
-            f"[{hostname}]"
+            f"{ticket_source}"
             f"{action_type['AUTO']}"
             f"{issue_tag}"
             f"{issue_separator}"
@@ -2950,12 +2962,14 @@ class SystemHealthMonitor:
                     # Simplified: just track all OSDs for now
                     ceph_health['osd_status'].append(osd_info)
 
-                    # Check for down OSDs
+                    # Check for down OSDs - this is a cluster-wide issue
+                    # All nodes see the same OSD down, so treat as cluster-wide
                     if node.get('status') == 'down':
                         ceph_health['status'] = 'CRITICAL'
-                        # Node-specific issue (will include hostname in hash)
-                        ceph_health['issues'].append(
-                            f"Ceph OSD {node.get('name')} is DOWN on {hostname}"
+                        # Cluster-wide issue - OSD down affects entire cluster
+                        # Do NOT include detecting hostname in message to enable deduplication
+                        ceph_health['cluster_wide_issues'].append(
+                            f"Ceph OSD {node.get('name')} is DOWN"
                         )
             except json.JSONDecodeError as e:
                 logger.warning(f"Failed to parse ceph osd tree JSON: {e}")
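
For reference, the new key is read through the same .env loader branch as the existing Prometheus settings, so opting in is one line in .env (the value shown here is illustrative, not a default):

CLUSTER_NAME=homelab-pve

Leaving the value empty, or omitting the key entirely, falls back to the built-in default of proxmox-cluster via the `value if value else 'proxmox-cluster'` branch above.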
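
To make the deduplication mechanism concrete, here is a minimal, self-contained sketch of the title construction above. The names hostname, is_cluster_wide, action_type, issue_tag, and drive_size exist in the real method, but the values assigned below are invented for the example:

# Minimal sketch of the ticket_source logic; all values below are
# illustrative stand-ins, not taken from hwmonDaemon.py.
CONFIG = {'CLUSTER_NAME': 'proxmox-cluster'}
hostname = 'pve-node-2'           # whichever node happened to detect the issue
is_cluster_wide = True            # e.g. the Ceph OSD-down path in the last hunk
action_type = {'AUTO': '[AUTO]'}  # hypothetical tag values
issue_tag = '[CEPH]'
drive_size = ''                   # empty for non-drive issues
issue_separator = drive_size if drive_size else " "

cluster_name = CONFIG.get('CLUSTER_NAME', 'proxmox-cluster')
ticket_source = f"[{cluster_name}]" if is_cluster_wide else f"[{hostname}]"
ticket_title = (
    f"{ticket_source}"
    f"{action_type['AUTO']}"
    f"{issue_tag}"
    f"{issue_separator}"
)
# Every node renders '[proxmox-cluster][AUTO][CEPH] ' for the same
# cluster-wide event, so identical titles can be collapsed into one ticket.
print(ticket_title)

With is_cluster_wide = False the title would begin with [pve-node-2] instead, preserving the old per-node behaviour for local issues.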
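
One assumption worth calling out: the OSD-down branch now appends to ceph_health['cluster_wide_issues'] rather than ceph_health['issues'], so the ceph_health dict must be initialized with that key before the OSD loop runs, or the append raises KeyError. A sketch of the shape the new code expects (the actual initializer in hwmonDaemon.py may differ):

# Hypothetical initializer; only the keys referenced in the diff are
# shown, and the rest of the real dict may differ.
ceph_health = {
    'status': 'OK',
    'osd_status': [],
    'issues': [],               # node-specific messages (hostname included in hash)
    'cluster_wide_issues': [],  # shared messages (no hostname, so titles deduplicate)
}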