Changed OSD down events to be cluster-wide and deduplicated
This commit is contained in:
BIN
__pycache__/hwmonDaemon.cpython-311.pyc
Normal file
BIN
__pycache__/hwmonDaemon.cpython-311.pyc
Normal file
Binary file not shown.
@@ -119,6 +119,8 @@ class SystemHealthMonitor:
|
|||||||
'CEPH_TICKET_NODE': None, # Hostname of node designated to create cluster-wide Ceph tickets
|
'CEPH_TICKET_NODE': None, # Hostname of node designated to create cluster-wide Ceph tickets
|
||||||
'CEPH_USAGE_WARNING': 70, # Ceph cluster usage warning threshold %
|
'CEPH_USAGE_WARNING': 70, # Ceph cluster usage warning threshold %
|
||||||
'CEPH_USAGE_CRITICAL': 85, # Ceph cluster usage critical threshold %
|
'CEPH_USAGE_CRITICAL': 85, # Ceph cluster usage critical threshold %
|
||||||
|
# Cluster identification for tickets
|
||||||
|
'CLUSTER_NAME': 'proxmox-cluster', # Name used in cluster-wide ticket titles instead of hostname
|
||||||
# Prometheus metrics settings
|
# Prometheus metrics settings
|
||||||
'PROMETHEUS_ENABLED': False, # Enable Prometheus metrics export
|
'PROMETHEUS_ENABLED': False, # Enable Prometheus metrics export
|
||||||
'PROMETHEUS_PORT': 9101, # Port for Prometheus metrics HTTP server
|
'PROMETHEUS_PORT': 9101, # Port for Prometheus metrics HTTP server
|
||||||
@@ -176,6 +178,10 @@ class SystemHealthMonitor:
|
|||||||
elif key == 'PROMETHEUS_TEXTFILE_PATH':
|
elif key == 'PROMETHEUS_TEXTFILE_PATH':
|
||||||
cls.CONFIG['PROMETHEUS_TEXTFILE_PATH'] = value if value else None
|
cls.CONFIG['PROMETHEUS_TEXTFILE_PATH'] = value if value else None
|
||||||
logger.info(f"✓ Loaded PROMETHEUS_TEXTFILE_PATH: {value}")
|
logger.info(f"✓ Loaded PROMETHEUS_TEXTFILE_PATH: {value}")
|
||||||
|
# Cluster identification
|
||||||
|
elif key == 'CLUSTER_NAME':
|
||||||
|
cls.CONFIG['CLUSTER_NAME'] = value if value else 'proxmox-cluster'
|
||||||
|
logger.info(f"✓ Loaded CLUSTER_NAME: {value}")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to load .env file: {e}")
|
logger.error(f"Failed to load .env file: {e}")
|
||||||
@@ -1582,8 +1588,14 @@ class SystemHealthMonitor:
|
|||||||
# Build ticket title with proper categorization
|
# Build ticket title with proper categorization
|
||||||
# Add space after issue_tag if drive_size is empty (for non-drive issues)
|
# Add space after issue_tag if drive_size is empty (for non-drive issues)
|
||||||
issue_separator = drive_size if drive_size else " "
|
issue_separator = drive_size if drive_size else " "
|
||||||
|
|
||||||
|
# Use cluster name for cluster-wide issues instead of individual hostname
|
||||||
|
# This ensures all nodes generate the same ticket title for deduplication
|
||||||
|
cluster_name = self.CONFIG.get('CLUSTER_NAME', 'proxmox-cluster')
|
||||||
|
ticket_source = f"[{cluster_name}]" if is_cluster_wide else f"[{hostname}]"
|
||||||
|
|
||||||
ticket_title = (
|
ticket_title = (
|
||||||
f"[{hostname}]"
|
f"{ticket_source}"
|
||||||
f"{action_type['AUTO']}"
|
f"{action_type['AUTO']}"
|
||||||
f"{issue_tag}"
|
f"{issue_tag}"
|
||||||
f"{issue_separator}"
|
f"{issue_separator}"
|
||||||
@@ -2950,12 +2962,14 @@ class SystemHealthMonitor:
|
|||||||
# Simplified: just track all OSDs for now
|
# Simplified: just track all OSDs for now
|
||||||
ceph_health['osd_status'].append(osd_info)
|
ceph_health['osd_status'].append(osd_info)
|
||||||
|
|
||||||
# Check for down OSDs
|
# Check for down OSDs - this is a cluster-wide issue
|
||||||
|
# All nodes see the same OSD down, so treat as cluster-wide
|
||||||
if node.get('status') == 'down':
|
if node.get('status') == 'down':
|
||||||
ceph_health['status'] = 'CRITICAL'
|
ceph_health['status'] = 'CRITICAL'
|
||||||
# Node-specific issue (will include hostname in hash)
|
# Cluster-wide issue - OSD down affects entire cluster
|
||||||
ceph_health['issues'].append(
|
# Do NOT include detecting hostname in message to enable deduplication
|
||||||
f"Ceph OSD {node.get('name')} is DOWN on {hostname}"
|
ceph_health['cluster_wide_issues'].append(
|
||||||
|
f"Ceph OSD {node.get('name')} is DOWN"
|
||||||
)
|
)
|
||||||
except json.JSONDecodeError as e:
|
except json.JSONDecodeError as e:
|
||||||
logger.warning(f"Failed to parse ceph osd tree JSON: {e}")
|
logger.warning(f"Failed to parse ceph osd tree JSON: {e}")
|
||||||
|
|||||||
Reference in New Issue
Block a user