Fix host filtering: only show/monitor configured hosts; add PBS

- _collect_snapshot() and _process_interfaces() now skip any Prometheus instance not explicitly listed in config.json hosts[]. LXC app servers (postgresql, matrix, etc.) report node_exporter metrics but are not infrastructure hosts that Gandalf should display or alert on.
- Add PBS (10.10.10.3) to config hosts[] with prometheus_instance; remove it from ping_hosts (node_exporter is already running on PBS and is now added to the Prometheus scrape config as job pbs-node).
- The _instance_map membership check is now consistent across snapshot, alerting, and ethtool SSH collection.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
21
config.json
21
config.json
@@ -1,9 +1,9 @@
|
|||||||
{
|
{
|
||||||
"ssh": {
|
"pulse": {
|
||||||
"user": "root",
|
"url": "http://10.10.10.65:8080",
|
||||||
"password": "Server#980000Panda",
|
"api_key": "012b303a324152c509bf5ade6f942cfc21404f68662f01a17001cba9e4486049",
|
||||||
"connect_timeout": 5,
|
"worker_id": "1b11d1b5-4ed0-42df-a6af-8d57fffe1343",
|
||||||
"timeout": 20
|
"timeout": 45
|
||||||
},
|
},
|
||||||
"unifi": {
|
"unifi": {
|
||||||
"controller": "https://10.10.10.1",
|
"controller": "https://10.10.10.1",
|
||||||
@@ -28,14 +28,17 @@
|
|||||||
"allowed_groups": ["admin"]
|
"allowed_groups": ["admin"]
|
||||||
},
|
},
|
||||||
"monitor": {
|
"monitor": {
|
||||||
"poll_interval": 120,
|
"poll_interval": 300,
|
||||||
"failure_threshold": 2,
|
"failure_threshold": 2,
|
||||||
"cluster_threshold": 3,
|
"cluster_threshold": 3,
|
||||||
"ping_hosts": [
|
"ping_hosts": []
|
||||||
{"name": "pbs", "ip": "10.10.10.3"}
|
|
||||||
]
|
|
||||||
},
|
},
|
||||||
"hosts": [
|
"hosts": [
|
||||||
|
{
|
||||||
|
"name": "pbs",
|
||||||
|
"ip": "10.10.10.3",
|
||||||
|
"prometheus_instance": "10.10.10.3:9100"
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"name": "large1",
|
"name": "large1",
|
||||||
"ip": "10.10.10.2",
|
"ip": "10.10.10.2",
|
||||||
|
|||||||
@@ -694,6 +694,8 @@ class NetworkMonitor:
|
|||||||
hosts_with_regression: List[str] = []
|
hosts_with_regression: List[str] = []
|
||||||
|
|
||||||
for instance, ifaces in states.items():
|
for instance, ifaces in states.items():
|
||||||
|
if instance not in self._instance_map:
|
||||||
|
continue # skip unconfigured Prometheus instances
|
||||||
host = self._hostname(instance)
|
host = self._hostname(instance)
|
||||||
new_baseline.setdefault(host, {})
|
new_baseline.setdefault(host, {})
|
||||||
host_has_regression = False
|
host_has_regression = False
|
||||||
@@ -877,6 +879,8 @@ class NetworkMonitor:
|
|||||||
|
|
||||||
hosts = {}
|
hosts = {}
|
||||||
for instance, ifaces in iface_states.items():
|
for instance, ifaces in iface_states.items():
|
||||||
|
if instance not in self._instance_map:
|
||||||
|
continue # skip Prometheus instances not in config (e.g. LXC app servers)
|
||||||
host = self._hostname(instance)
|
host = self._hostname(instance)
|
||||||
phys = {k: v for k, v in ifaces.items()}
|
phys = {k: v for k, v in ifaces.items()}
|
||||||
up_count = sum(1 for v in phys.values() if v)
|
up_count = sum(1 for v in phys.values() if v)
|
||||||
|
|||||||
Reference in New Issue
Block a user