changed osd down events to be cluster-wide and deduplicated

2026-01-26 11:03:55 -05:00
parent 1e84144e29
commit 509603843b
2 changed files with 19 additions and 5 deletions

Binary file not shown.


@@ -119,6 +119,8 @@ class SystemHealthMonitor:
         'CEPH_TICKET_NODE': None,  # Hostname of node designated to create cluster-wide Ceph tickets
         'CEPH_USAGE_WARNING': 70,  # Ceph cluster usage warning threshold %
         'CEPH_USAGE_CRITICAL': 85,  # Ceph cluster usage critical threshold %
+        # Cluster identification for tickets
+        'CLUSTER_NAME': 'proxmox-cluster',  # Name used in cluster-wide ticket titles instead of hostname
         # Prometheus metrics settings
         'PROMETHEUS_ENABLED': False,  # Enable Prometheus metrics export
         'PROMETHEUS_PORT': 9101,  # Port for Prometheus metrics HTTP server
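
For reference, overriding the new default takes a single line in the monitor's .env file. The key name comes from this commit; the value below is an invented example:

# .env — illustrative override (value is made up)
CLUSTER_NAME=pve-prod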
@@ -176,6 +178,10 @@ class SystemHealthMonitor:
                         elif key == 'PROMETHEUS_TEXTFILE_PATH':
                             cls.CONFIG['PROMETHEUS_TEXTFILE_PATH'] = value if value else None
                             logger.info(f"✓ Loaded PROMETHEUS_TEXTFILE_PATH: {value}")
+                        # Cluster identification
+                        elif key == 'CLUSTER_NAME':
+                            cls.CONFIG['CLUSTER_NAME'] = value if value else 'proxmox-cluster'
+                            logger.info(f"✓ Loaded CLUSTER_NAME: {value}")
         except Exception as e:
             logger.error(f"Failed to load .env file: {e}")
@@ -1582,8 +1588,14 @@ class SystemHealthMonitor:
         # Build ticket title with proper categorization
         # Add space after issue_tag if drive_size is empty (for non-drive issues)
         issue_separator = drive_size if drive_size else " "
+
+        # Use cluster name for cluster-wide issues instead of individual hostname
+        # This ensures all nodes generate the same ticket title for deduplication
+        cluster_name = self.CONFIG.get('CLUSTER_NAME', 'proxmox-cluster')
+        ticket_source = f"[{cluster_name}]" if is_cluster_wide else f"[{hostname}]"
+
         ticket_title = (
-            f"[{hostname}]"
+            f"{ticket_source}"
             f"{action_type['AUTO']}"
             f"{issue_tag}"
             f"{issue_separator}"
@@ -2950,12 +2962,14 @@ class SystemHealthMonitor:
                     # Simplified: just track all OSDs for now
                     ceph_health['osd_status'].append(osd_info)

-                    # Check for down OSDs
+                    # Check for down OSDs - this is a cluster-wide issue
+                    # All nodes see the same OSD down, so treat as cluster-wide
                     if node.get('status') == 'down':
                         ceph_health['status'] = 'CRITICAL'
-                        # Node-specific issue (will include hostname in hash)
-                        ceph_health['issues'].append(
-                            f"Ceph OSD {node.get('name')} is DOWN on {hostname}"
+                        # Cluster-wide issue - OSD down affects entire cluster
+                        # Do NOT include detecting hostname in message to enable deduplication
+                        ceph_health['cluster_wide_issues'].append(
+                            f"Ceph OSD {node.get('name')} is DOWN"
                         )
         except json.JSONDecodeError as e:
             logger.warning(f"Failed to parse ceph osd tree JSON: {e}")