From 85a018ff6cc85ce0e328b73dc0888c544bf8156e Mon Sep 17 00:00:00 2001 From: Jared Vititoe Date: Sat, 14 Mar 2026 14:13:54 -0400 Subject: [PATCH] Optimize suppression checks: load once per cycle, add error logging db.py: - Add check_suppressed(suppressions, ...) for in-memory suppression lookups against pre-loaded list (eliminates N*M DB queries per monitoring cycle) - get_baseline(): log error instead of silently swallowing JSON parse failure monitor.py: - Load active suppressions once per cycle at the top of the alert loop - Pass suppressions list to _process_interfaces, _process_unifi, _process_ping_hosts - Replace all db.is_suppressed() calls with db.check_suppressed(suppressions, ...) - Reduces DB queries from 100-600+ per cycle down to 1 Co-Authored-By: Claude Sonnet 4.6 --- db.py | 15 ++++++++++++++- monitor.py | 25 ++++++++++++++----------- 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/db.py b/db.py index 03a32bd..0847fed 100644 --- a/db.py +++ b/db.py @@ -81,7 +81,7 @@ def get_baseline() -> dict: try: return json.loads(raw) except Exception: - pass + logger.error('Failed to parse interface_baseline JSON; resetting baseline') return {} @@ -269,6 +269,19 @@ def deactivate_suppression(sup_id: int) -> None: ) +def check_suppressed(suppressions: list, target_type: str, target_name: str, target_detail: str = '') -> bool: + """Check suppression against a pre-loaded list (avoids per-call DB queries).""" + for s in suppressions: + if s['target_type'] == 'all': + return True + if s['target_type'] == target_type and s['target_name'] == target_name: + if not (s.get('target_detail') or ''): + return True + if target_detail and s.get('target_detail') == target_detail: + return True + return False + + def is_suppressed(target_type: str, target_name: str, target_detail: str = '') -> bool: with get_conn() as conn: with conn.cursor() as cur: diff --git a/monitor.py b/monitor.py index e4deed6..dab544e 100644 --- a/monitor.py +++ b/monitor.py @@ -655,7 +655,7 @@ class NetworkMonitor: # ------------------------------------------------------------------ # Interface monitoring (Prometheus) # ------------------------------------------------------------------ - def _process_interfaces(self, states: Dict[str, Dict[str, bool]]) -> None: + def _process_interfaces(self, states: Dict[str, Dict[str, bool]], suppressions: list) -> None: baseline = db.get_baseline() new_baseline = {k: dict(v) for k, v in baseline.items()} # Only count hosts with genuine regressions (UP→DOWN) toward cluster threshold @@ -685,8 +685,8 @@ class NetworkMonitor: # Regression: was UP, now DOWN host_has_regression = True sup = ( - db.is_suppressed('interface', host, iface) or - db.is_suppressed('host', host) + db.check_suppressed(suppressions, 'interface', host, iface) or + db.check_suppressed(suppressions, 'host', host) ) event_id, is_new, consec = db.upsert_event( 'interface_down', 'critical', 'prometheus', @@ -703,7 +703,7 @@ class NetworkMonitor: # Cluster-wide check – only genuine regressions count if len(hosts_with_regression) >= self.cluster_thresh: - sup = db.is_suppressed('all', '') + sup = db.check_suppressed(suppressions, 'all', '') event_id, is_new, consec = db.upsert_event( 'cluster_network_issue', 'critical', 'prometheus', CLUSTER_NAME, '', @@ -755,7 +755,7 @@ class NetworkMonitor: # ------------------------------------------------------------------ # UniFi device monitoring # ------------------------------------------------------------------ - def _process_unifi(self, devices: Optional[List[dict]]) -> None: + def _process_unifi(self, devices: Optional[List[dict]], suppressions: list) -> None: if devices is None: logger.warning('UniFi API unreachable this cycle') return @@ -763,7 +763,7 @@ class NetworkMonitor: for d in devices: name = d['name'] if not d['connected']: - sup = db.is_suppressed('unifi_device', name) + sup = db.check_suppressed(suppressions, 'unifi_device', name) event_id, is_new, consec = db.upsert_event( 'unifi_device_offline', 'critical', 'unifi', name, d.get('type', ''), @@ -797,13 +797,13 @@ class NetworkMonitor: # ------------------------------------------------------------------ # Ping-only hosts (no node_exporter) # ------------------------------------------------------------------ - def _process_ping_hosts(self) -> None: + def _process_ping_hosts(self, suppressions: list) -> None: for h in self.cfg.get('monitor', {}).get('ping_hosts', []): name, ip = h['name'], h['ip'] reachable = ping(ip) if not reachable: - sup = db.is_suppressed('host', name) + sup = db.check_suppressed(suppressions, 'host', name) event_id, is_new, consec = db.upsert_event( 'host_unreachable', 'critical', 'ping', name, ip, @@ -904,13 +904,16 @@ class NetworkMonitor: logger.error(f'Link stats collection failed: {e}', exc_info=True) # 3. Process alerts (separate Prometheus call for fresh data) + # Load suppressions once per cycle to avoid N*M DB queries + suppressions = db.get_active_suppressions() + iface_states = self.prom.get_interface_states() - self._process_interfaces(iface_states) + self._process_interfaces(iface_states, suppressions) unifi_devices = self.unifi.get_devices() - self._process_unifi(unifi_devices) + self._process_unifi(unifi_devices, suppressions) - self._process_ping_hosts() + self._process_ping_hosts(suppressions) logger.info('Network check cycle complete')