Optimize suppression checks: load once per cycle, add error logging

db.py:
- Add check_suppressed(suppressions, ...) for in-memory suppression lookups
  against pre-loaded list (eliminates N*M DB queries per monitoring cycle)
- get_baseline(): log error instead of silently swallowing JSON parse failure

monitor.py:
- Load active suppressions once per cycle at the top of the alert loop
- Pass suppressions list to _process_interfaces, _process_unifi, _process_ping_hosts
- Replace all db.is_suppressed() calls with db.check_suppressed(suppressions, ...)
- Reduces DB queries from 100-600+ per cycle down to 1

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-14 14:13:54 -04:00
parent af26407363
commit 85a018ff6c
2 changed files with 28 additions and 12 deletions

View File

@@ -655,7 +655,7 @@ class NetworkMonitor:
# ------------------------------------------------------------------
# Interface monitoring (Prometheus)
# ------------------------------------------------------------------
def _process_interfaces(self, states: Dict[str, Dict[str, bool]]) -> None:
def _process_interfaces(self, states: Dict[str, Dict[str, bool]], suppressions: list) -> None:
baseline = db.get_baseline()
new_baseline = {k: dict(v) for k, v in baseline.items()}
# Only count hosts with genuine regressions (UP→DOWN) toward cluster threshold
@@ -685,8 +685,8 @@ class NetworkMonitor:
# Regression: was UP, now DOWN
host_has_regression = True
sup = (
db.is_suppressed('interface', host, iface) or
db.is_suppressed('host', host)
db.check_suppressed(suppressions, 'interface', host, iface) or
db.check_suppressed(suppressions, 'host', host)
)
event_id, is_new, consec = db.upsert_event(
'interface_down', 'critical', 'prometheus',
@@ -703,7 +703,7 @@ class NetworkMonitor:
# Cluster-wide check: only genuine regressions count
if len(hosts_with_regression) >= self.cluster_thresh:
sup = db.is_suppressed('all', '')
sup = db.check_suppressed(suppressions, 'all', '')
event_id, is_new, consec = db.upsert_event(
'cluster_network_issue', 'critical', 'prometheus',
CLUSTER_NAME, '',
@@ -755,7 +755,7 @@ class NetworkMonitor:
# ------------------------------------------------------------------
# UniFi device monitoring
# ------------------------------------------------------------------
def _process_unifi(self, devices: Optional[List[dict]]) -> None:
def _process_unifi(self, devices: Optional[List[dict]], suppressions: list) -> None:
if devices is None:
logger.warning('UniFi API unreachable this cycle')
return
@@ -763,7 +763,7 @@ class NetworkMonitor:
for d in devices:
name = d['name']
if not d['connected']:
sup = db.is_suppressed('unifi_device', name)
sup = db.check_suppressed(suppressions, 'unifi_device', name)
event_id, is_new, consec = db.upsert_event(
'unifi_device_offline', 'critical', 'unifi',
name, d.get('type', ''),
@@ -797,13 +797,13 @@ class NetworkMonitor:
# ------------------------------------------------------------------
# Ping-only hosts (no node_exporter)
# ------------------------------------------------------------------
def _process_ping_hosts(self) -> None:
def _process_ping_hosts(self, suppressions: list) -> None:
for h in self.cfg.get('monitor', {}).get('ping_hosts', []):
name, ip = h['name'], h['ip']
reachable = ping(ip)
if not reachable:
sup = db.is_suppressed('host', name)
sup = db.check_suppressed(suppressions, 'host', name)
event_id, is_new, consec = db.upsert_event(
'host_unreachable', 'critical', 'ping',
name, ip,
@@ -904,13 +904,16 @@ class NetworkMonitor:
logger.error(f'Link stats collection failed: {e}', exc_info=True)
# 3. Process alerts (separate Prometheus call for fresh data)
# Load suppressions once per cycle to avoid N*M DB queries
suppressions = db.get_active_suppressions()
iface_states = self.prom.get_interface_states()
self._process_interfaces(iface_states)
self._process_interfaces(iface_states, suppressions)
unifi_devices = self.unifi.get_devices()
self._process_unifi(unifi_devices)
self._process_unifi(unifi_devices, suppressions)
self._process_ping_hosts()
self._process_ping_hosts(suppressions)
logger.info('Network check cycle complete')