Fix host filtering: only show/monitor configured hosts; add PBS

- _collect_snapshot() and _process_interfaces() now skip any Prometheus
  instance not explicitly listed in config.json hosts[]. LXC app servers
  (postgresql, matrix, etc.) report node_exporter metrics but are not
  infrastructure hosts Gandalf should display or alert on.
- Add PBS (10.10.10.3) to config hosts[] with prometheus_instance; remove
  from ping_hosts (node_exporter already running on PBS, now added to
  Prometheus scrape config as job pbs-node).
- The _instance_map membership check is now consistent across snapshot,
  alerting, and ethtool SSH collection.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-17 17:17:40 -04:00
parent eb8c0ded5e
commit b80fda7cb2
2 changed files with 16 additions and 9 deletions
@@ -1,9 +1,9 @@
 {
-  "ssh": {
+  "pulse": {
-    "user": "root",
+    "url": "http://10.10.10.65:8080",
-    "password": "Server#980000Panda",
+    "api_key": "012b303a324152c509bf5ade6f942cfc21404f68662f01a17001cba9e4486049",
-    "connect_timeout": 5,
+    "worker_id": "1b11d1b5-4ed0-42df-a6af-8d57fffe1343",
-    "timeout": 20
+    "timeout": 45
  },
  "unifi": {
    "controller": "https://10.10.10.1",
@@ -28,14 +28,17 @@
    "allowed_groups": ["admin"]
  },
  "monitor": {
-    "poll_interval": 120,
+    "poll_interval": 300,
    "failure_threshold": 2,
    "cluster_threshold": 3,
-    "ping_hosts": [
+    "ping_hosts": []
-      {"name": "pbs", "ip": "10.10.10.3"}
-    ]
  },
  "hosts": [
+    {
+      "name": "pbs",
+      "ip": "10.10.10.3",
+      "prometheus_instance": "10.10.10.3:9100"
+    },
    {
      "name": "large1",
      "ip": "10.10.10.2",
@@ -694,6 +694,8 @@ class NetworkMonitor:
        hosts_with_regression: List[str] = []
        for instance, ifaces in states.items():
            if instance not in self._instance_map:
                continue  # skip unconfigured Prometheus instances
            host = self._hostname(instance)
            new_baseline.setdefault(host, {})
            host_has_regression = False
@@ -877,6 +879,8 @@ class NetworkMonitor:
        hosts = {}
        for instance, ifaces in iface_states.items():
            if instance not in self._instance_map:
                continue  # skip Prometheus instances not in config (e.g. LXC app servers)
            host = self._hostname(instance)
            phys = {k: v for k, v in ifaces.items()}
            up_count = sum(1 for v in phys.values() if v)