Optimize suppression checks: load once per cycle, add error logging

db.py:
- Add check_suppressed(suppressions, ...) for in-memory suppression lookups
  against pre-loaded list (eliminates N*M DB queries per monitoring cycle)
- get_baseline(): log error instead of silently swallowing JSON parse failure

monitor.py:
- Load active suppressions once per cycle at the top of the alert loop
- Pass suppressions list to _process_interfaces, _process_unifi, _process_ping_hosts
- Replace all db.is_suppressed() calls with db.check_suppressed(suppressions, ...)
- Reduces DB queries from 100-600+ per cycle down to 1

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-14 14:13:54 -04:00
parent af26407363
commit 85a018ff6c
2 changed files with 28 additions and 12 deletions

15
db.py
View File

@@ -81,7 +81,7 @@ def get_baseline() -> dict:
try:
return json.loads(raw)
except Exception:
pass
logger.error('Failed to parse interface_baseline JSON; resetting baseline')
return {}
@@ -269,6 +269,19 @@ def deactivate_suppression(sup_id: int) -> None:
)
def check_suppressed(suppressions: list, target_type: str, target_name: str, target_detail: str = '') -> bool:
    """Return True if the (type, name, detail) target matches any pre-loaded suppression.

    Works against an in-memory list of suppression rows so callers avoid
    issuing a DB query per check.
    """
    for entry in suppressions:
        # A blanket 'all' rule mutes every target unconditionally.
        if entry['target_type'] == 'all':
            return True
        # Otherwise the rule must match both the type and the name.
        if entry['target_type'] != target_type or entry['target_name'] != target_name:
            continue
        detail = entry.get('target_detail') or ''
        # An empty detail on the rule suppresses every detail of this target;
        # a non-empty detail suppresses only an exact match.
        if not detail or (target_detail and detail == target_detail):
            return True
    return False
def is_suppressed(target_type: str, target_name: str, target_detail: str = '') -> bool:
with get_conn() as conn:
with conn.cursor() as cur:

View File

@@ -655,7 +655,7 @@ class NetworkMonitor:
# ------------------------------------------------------------------
# Interface monitoring (Prometheus)
# ------------------------------------------------------------------
def _process_interfaces(self, states: Dict[str, Dict[str, bool]]) -> None:
def _process_interfaces(self, states: Dict[str, Dict[str, bool]], suppressions: list) -> None:
baseline = db.get_baseline()
new_baseline = {k: dict(v) for k, v in baseline.items()}
# Only count hosts with genuine regressions (UP→DOWN) toward cluster threshold
@@ -685,8 +685,8 @@ class NetworkMonitor:
# Regression: was UP, now DOWN
host_has_regression = True
sup = (
db.is_suppressed('interface', host, iface) or
db.is_suppressed('host', host)
db.check_suppressed(suppressions, 'interface', host, iface) or
db.check_suppressed(suppressions, 'host', host)
)
event_id, is_new, consec = db.upsert_event(
'interface_down', 'critical', 'prometheus',
@@ -703,7 +703,7 @@ class NetworkMonitor:
# Cluster-wide check only genuine regressions count
if len(hosts_with_regression) >= self.cluster_thresh:
sup = db.is_suppressed('all', '')
sup = db.check_suppressed(suppressions, 'all', '')
event_id, is_new, consec = db.upsert_event(
'cluster_network_issue', 'critical', 'prometheus',
CLUSTER_NAME, '',
@@ -755,7 +755,7 @@ class NetworkMonitor:
# ------------------------------------------------------------------
# UniFi device monitoring
# ------------------------------------------------------------------
def _process_unifi(self, devices: Optional[List[dict]]) -> None:
def _process_unifi(self, devices: Optional[List[dict]], suppressions: list) -> None:
if devices is None:
logger.warning('UniFi API unreachable this cycle')
return
@@ -763,7 +763,7 @@ class NetworkMonitor:
for d in devices:
name = d['name']
if not d['connected']:
sup = db.is_suppressed('unifi_device', name)
sup = db.check_suppressed(suppressions, 'unifi_device', name)
event_id, is_new, consec = db.upsert_event(
'unifi_device_offline', 'critical', 'unifi',
name, d.get('type', ''),
@@ -797,13 +797,13 @@ class NetworkMonitor:
# ------------------------------------------------------------------
# Ping-only hosts (no node_exporter)
# ------------------------------------------------------------------
def _process_ping_hosts(self) -> None:
def _process_ping_hosts(self, suppressions: list) -> None:
for h in self.cfg.get('monitor', {}).get('ping_hosts', []):
name, ip = h['name'], h['ip']
reachable = ping(ip)
if not reachable:
sup = db.is_suppressed('host', name)
sup = db.check_suppressed(suppressions, 'host', name)
event_id, is_new, consec = db.upsert_event(
'host_unreachable', 'critical', 'ping',
name, ip,
@@ -904,13 +904,16 @@ class NetworkMonitor:
logger.error(f'Link stats collection failed: {e}', exc_info=True)
# 3. Process alerts (separate Prometheus call for fresh data)
# Load suppressions once per cycle to avoid N*M DB queries
suppressions = db.get_active_suppressions()
iface_states = self.prom.get_interface_states()
self._process_interfaces(iface_states)
self._process_interfaces(iface_states, suppressions)
unifi_devices = self.unifi.get_devices()
self._process_unifi(unifi_devices)
self._process_unifi(unifi_devices, suppressions)
self._process_ping_hosts()
self._process_ping_hosts(suppressions)
logger.info('Network check cycle complete')