Optimize suppression checks: load once per cycle, add error logging
db.py: - Add check_suppressed(suppressions, ...) for in-memory suppression lookups against pre-loaded list (eliminates N*M DB queries per monitoring cycle) - get_baseline(): log error instead of silently swallowing JSON parse failure monitor.py: - Load active suppressions once per cycle at the top of the alert loop - Pass suppressions list to _process_interfaces, _process_unifi, _process_ping_hosts - Replace all db.is_suppressed() calls with db.check_suppressed(suppressions, ...) - Reduces DB queries from 100-600+ per cycle down to 1 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
15
db.py
15
db.py
@@ -81,7 +81,7 @@ def get_baseline() -> dict:
|
||||
try:
|
||||
return json.loads(raw)
|
||||
except Exception:
|
||||
pass
|
||||
logger.error('Failed to parse interface_baseline JSON; resetting baseline')
|
||||
return {}
|
||||
|
||||
|
||||
@@ -269,6 +269,19 @@ def deactivate_suppression(sup_id: int) -> None:
|
||||
)
|
||||
|
||||
|
||||
def check_suppressed(suppressions: list, target_type: str, target_name: str, target_detail: str = '') -> bool:
    """Return True if any pre-loaded suppression covers the given target.

    Works entirely against the in-memory ``suppressions`` list so callers can
    load active suppressions once per monitoring cycle instead of issuing a
    DB query per check.

    Matching rules (first hit wins):
      * a suppression of type ``'all'`` covers every target;
      * a suppression whose type and name both match covers the target when
        it carries no detail of its own, or when its detail equals a
        non-empty ``target_detail``.
    """
    for entry in suppressions:
        kind = entry['target_type']
        if kind == 'all':
            return True
        if kind != target_type or entry['target_name'] != target_name:
            continue
        # An empty/missing detail on the suppression means "match any detail".
        detail = entry.get('target_detail') or ''
        if not detail:
            return True
        if target_detail and detail == target_detail:
            return True
    return False
|
||||
|
||||
|
||||
def is_suppressed(target_type: str, target_name: str, target_detail: str = '') -> bool:
|
||||
with get_conn() as conn:
|
||||
with conn.cursor() as cur:
|
||||
|
||||
25
monitor.py
25
monitor.py
@@ -655,7 +655,7 @@ class NetworkMonitor:
|
||||
# ------------------------------------------------------------------
|
||||
# Interface monitoring (Prometheus)
|
||||
# ------------------------------------------------------------------
|
||||
def _process_interfaces(self, states: Dict[str, Dict[str, bool]]) -> None:
|
||||
def _process_interfaces(self, states: Dict[str, Dict[str, bool]], suppressions: list) -> None:
|
||||
baseline = db.get_baseline()
|
||||
new_baseline = {k: dict(v) for k, v in baseline.items()}
|
||||
# Only count hosts with genuine regressions (UP→DOWN) toward cluster threshold
|
||||
@@ -685,8 +685,8 @@ class NetworkMonitor:
|
||||
# Regression: was UP, now DOWN
|
||||
host_has_regression = True
|
||||
sup = (
|
||||
db.is_suppressed('interface', host, iface) or
|
||||
db.is_suppressed('host', host)
|
||||
db.check_suppressed(suppressions, 'interface', host, iface) or
|
||||
db.check_suppressed(suppressions, 'host', host)
|
||||
)
|
||||
event_id, is_new, consec = db.upsert_event(
|
||||
'interface_down', 'critical', 'prometheus',
|
||||
@@ -703,7 +703,7 @@ class NetworkMonitor:
|
||||
|
||||
# Cluster-wide check – only genuine regressions count
|
||||
if len(hosts_with_regression) >= self.cluster_thresh:
|
||||
sup = db.is_suppressed('all', '')
|
||||
sup = db.check_suppressed(suppressions, 'all', '')
|
||||
event_id, is_new, consec = db.upsert_event(
|
||||
'cluster_network_issue', 'critical', 'prometheus',
|
||||
CLUSTER_NAME, '',
|
||||
@@ -755,7 +755,7 @@ class NetworkMonitor:
|
||||
# ------------------------------------------------------------------
|
||||
# UniFi device monitoring
|
||||
# ------------------------------------------------------------------
|
||||
def _process_unifi(self, devices: Optional[List[dict]]) -> None:
|
||||
def _process_unifi(self, devices: Optional[List[dict]], suppressions: list) -> None:
|
||||
if devices is None:
|
||||
logger.warning('UniFi API unreachable this cycle')
|
||||
return
|
||||
@@ -763,7 +763,7 @@ class NetworkMonitor:
|
||||
for d in devices:
|
||||
name = d['name']
|
||||
if not d['connected']:
|
||||
sup = db.is_suppressed('unifi_device', name)
|
||||
sup = db.check_suppressed(suppressions, 'unifi_device', name)
|
||||
event_id, is_new, consec = db.upsert_event(
|
||||
'unifi_device_offline', 'critical', 'unifi',
|
||||
name, d.get('type', ''),
|
||||
@@ -797,13 +797,13 @@ class NetworkMonitor:
|
||||
# ------------------------------------------------------------------
|
||||
# Ping-only hosts (no node_exporter)
|
||||
# ------------------------------------------------------------------
|
||||
def _process_ping_hosts(self) -> None:
|
||||
def _process_ping_hosts(self, suppressions: list) -> None:
|
||||
for h in self.cfg.get('monitor', {}).get('ping_hosts', []):
|
||||
name, ip = h['name'], h['ip']
|
||||
reachable = ping(ip)
|
||||
|
||||
if not reachable:
|
||||
sup = db.is_suppressed('host', name)
|
||||
sup = db.check_suppressed(suppressions, 'host', name)
|
||||
event_id, is_new, consec = db.upsert_event(
|
||||
'host_unreachable', 'critical', 'ping',
|
||||
name, ip,
|
||||
@@ -904,13 +904,16 @@ class NetworkMonitor:
|
||||
logger.error(f'Link stats collection failed: {e}', exc_info=True)
|
||||
|
||||
# 3. Process alerts (separate Prometheus call for fresh data)
|
||||
# Load suppressions once per cycle to avoid N*M DB queries
|
||||
suppressions = db.get_active_suppressions()
|
||||
|
||||
iface_states = self.prom.get_interface_states()
|
||||
self._process_interfaces(iface_states)
|
||||
self._process_interfaces(iface_states, suppressions)
|
||||
|
||||
unifi_devices = self.unifi.get_devices()
|
||||
self._process_unifi(unifi_devices)
|
||||
self._process_unifi(unifi_devices, suppressions)
|
||||
|
||||
self._process_ping_hosts()
|
||||
self._process_ping_hosts(suppressions)
|
||||
|
||||
logger.info('Network check cycle complete')
|
||||
|
||||
|
||||
Reference in New Issue
Block a user