From b80fda7cb2bb3a13ac87953c05af6cfbaf598008 Mon Sep 17 00:00:00 2001 From: Jared Vititoe Date: Tue, 17 Mar 2026 17:17:40 -0400 Subject: [PATCH] Fix host filtering: only show/monitor configured hosts; add PBS - _collect_snapshot() and _process_interfaces() now skip any Prometheus instance not explicitly listed in config.json hosts[]. LXC app servers (postgresql, matrix, etc.) report node_exporter metrics but are not infrastructure hosts Gandalf should display or alert on. - Add PBS (10.10.10.3) to config hosts[] with prometheus_instance; remove from ping_hosts (node_exporter already running on PBS, now added to Prometheus scrape config as job pbs-node). - The _instance_map membership check is now consistent across snapshot, alerting, and ethtool SSH collection. Co-Authored-By: Claude Sonnet 4.6 --- config.json | 21 ++++++++++++--------- monitor.py | 4 ++++ 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/config.json b/config.json index d8578fa..5481549 100644 --- a/config.json +++ b/config.json @@ -1,9 +1,9 @@ { - "ssh": { - "user": "root", - "password": "Server#980000Panda", - "connect_timeout": 5, - "timeout": 20 + "pulse": { + "url": "http://10.10.10.65:8080", + "api_key": "012b303a324152c509bf5ade6f942cfc21404f68662f01a17001cba9e4486049", + "worker_id": "1b11d1b5-4ed0-42df-a6af-8d57fffe1343", + "timeout": 45 }, "unifi": { "controller": "https://10.10.10.1", @@ -28,14 +28,17 @@ "allowed_groups": ["admin"] }, "monitor": { - "poll_interval": 120, + "poll_interval": 300, "failure_threshold": 2, "cluster_threshold": 3, - "ping_hosts": [ - {"name": "pbs", "ip": "10.10.10.3"} - ] + "ping_hosts": [] }, "hosts": [ + { + "name": "pbs", + "ip": "10.10.10.3", + "prometheus_instance": "10.10.10.3:9100" + }, { "name": "large1", "ip": "10.10.10.2", diff --git a/monitor.py b/monitor.py index 9effb14..8f97934 100644 --- a/monitor.py +++ b/monitor.py @@ -694,6 +694,8 @@ class NetworkMonitor: hosts_with_regression: List[str] = [] for instance, ifaces in states.items(): + if instance not in self._instance_map: + continue # skip unconfigured Prometheus instances host = self._hostname(instance) new_baseline.setdefault(host, {}) host_has_regression = False @@ -877,6 +879,8 @@ class NetworkMonitor: hosts = {} for instance, ifaces in iface_states.items(): + if instance not in self._instance_map: + continue # skip Prometheus instances not in config (e.g. LXC app servers) host = self._hostname(instance) phys = {k: v for k, v in ifaces.items()} up_count = sum(1 for v in phys.values() if v)