Optimize suppression checks: load once per cycle, add error logging

db.py:
- Add check_suppressed(suppressions, ...) for in-memory suppression lookups
  against pre-loaded list (eliminates N*M DB queries per monitoring cycle)
- get_baseline(): log error instead of silently swallowing JSON parse failure

monitor.py:
- Load active suppressions once per cycle at the top of the alert loop
- Pass suppressions list to _process_interfaces, _process_unifi, _process_ping_hosts
- Replace all db.is_suppressed() calls with db.check_suppressed(suppressions, ...)
- Reduces DB queries from 100-600+ per cycle down to 1

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-14 14:13:54 -04:00
parent af26407363
commit 85a018ff6c
2 changed files with 28 additions and 12 deletions

View File

@@ -655,7 +655,7 @@ class NetworkMonitor:
# ------------------------------------------------------------------
# Interface monitoring (Prometheus)
# ------------------------------------------------------------------
def _process_interfaces(self, states: Dict[str, Dict[str, bool]]) -> None:
def _process_interfaces(self, states: Dict[str, Dict[str, bool]], suppressions: list) -> None:
baseline = db.get_baseline()
new_baseline = {k: dict(v) for k, v in baseline.items()}
# Only count hosts with genuine regressions (UP→DOWN) toward cluster threshold
@@ -685,8 +685,8 @@ class NetworkMonitor:
# Regression: was UP, now DOWN
host_has_regression = True
sup = (
db.is_suppressed('interface', host, iface) or
db.is_suppressed('host', host)
db.check_suppressed(suppressions, 'interface', host, iface) or
db.check_suppressed(suppressions, 'host', host)
)
event_id, is_new, consec = db.upsert_event(
'interface_down', 'critical', 'prometheus',
@@ -703,7 +703,7 @@ class NetworkMonitor:
# Cluster-wide check: only genuine regressions count
if len(hosts_with_regression) >= self.cluster_thresh:
sup = db.is_suppressed('all', '')
sup = db.check_suppressed(suppressions, 'all', '')
event_id, is_new, consec = db.upsert_event(
'cluster_network_issue', 'critical', 'prometheus',
CLUSTER_NAME, '',
@@ -755,7 +755,7 @@ class NetworkMonitor:
# ------------------------------------------------------------------
# UniFi device monitoring
# ------------------------------------------------------------------
def _process_unifi(self, devices: Optional[List[dict]]) -> None:
def _process_unifi(self, devices: Optional[List[dict]], suppressions: list) -> None:
if devices is None:
logger.warning('UniFi API unreachable this cycle')
return
@@ -763,7 +763,7 @@ class NetworkMonitor:
for d in devices:
name = d['name']
if not d['connected']:
sup = db.is_suppressed('unifi_device', name)
sup = db.check_suppressed(suppressions, 'unifi_device', name)
event_id, is_new, consec = db.upsert_event(
'unifi_device_offline', 'critical', 'unifi',
name, d.get('type', ''),
@@ -797,13 +797,13 @@ class NetworkMonitor:
# ------------------------------------------------------------------
# Ping-only hosts (no node_exporter)
# ------------------------------------------------------------------
def _process_ping_hosts(self) -> None:
def _process_ping_hosts(self, suppressions: list) -> None:
for h in self.cfg.get('monitor', {}).get('ping_hosts', []):
name, ip = h['name'], h['ip']
reachable = ping(ip)
if not reachable:
sup = db.is_suppressed('host', name)
sup = db.check_suppressed(suppressions, 'host', name)
event_id, is_new, consec = db.upsert_event(
'host_unreachable', 'critical', 'ping',
name, ip,
@@ -904,13 +904,16 @@ class NetworkMonitor:
logger.error(f'Link stats collection failed: {e}', exc_info=True)
# 3. Process alerts (separate Prometheus call for fresh data)
# Load suppressions once per cycle to avoid N*M DB queries
suppressions = db.get_active_suppressions()
iface_states = self.prom.get_interface_states()
self._process_interfaces(iface_states)
self._process_interfaces(iface_states, suppressions)
unifi_devices = self.unifi.get_devices()
self._process_unifi(unifi_devices)
self._process_unifi(unifi_devices, suppressions)
self._process_ping_hosts()
self._process_ping_hosts(suppressions)
logger.info('Network check cycle complete')