- Two-service architecture: Flask web app (gandalf.service) + background polling daemon (gandalf-monitor.service)
- Monitor polls Prometheus node_network_up for physical NIC states on all 6 hypervisors (added storage-01 at 10.10.10.11:9100)
- UniFi API monitoring for switches, APs, and gateway device status
- Ping reachability for hosts without node_exporter (currently only pbs)
- Smart baseline: interfaces first seen as down are never alerted on; only UP→DOWN regressions trigger tickets
- Cluster-wide P1 ticket when 3+ hosts have genuine simultaneous interface regressions (guards against false positives on startup)
- Tinker Tickets integration with 24-hour hash-based deduplication
- Alert suppression: manual toggle or timed windows (30m/1h/4h/8h)
- Authelia SSO via forward-auth headers, admin group required
- Network topology: Internet → UDM-Pro → Agg Switch (10G DAC) → PoE Switch (10G DAC) → Hosts
- MariaDB schema, suppression management UI, host/interface cards

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
65 lines
1.4 KiB
JSON
{
  "unifi": {
    "controller": "https://10.10.10.1",
"api_key": "kyPfIsAVie3hwMD4Bc1MjAu8N7HVPIb8",
    "site_id": "default"
  },
  "prometheus": {
    "url": "http://10.10.10.48:9090"
  },
  "database": {
    "host": "10.10.10.50",
    "port": 3306,
    "user": "gandalf",
"password": "Gandalf2026Lotus",
    "name": "gandalf"
  },
  "ticket_api": {
    "url": "http://10.10.10.45/create_ticket_api.php",
"api_key": "5acc5d3c647b84f7c6f59082ce4450ee772e2d1633238b960136f653d20c93af"
  },
  "auth": {
    "allowed_groups": ["admin"]
  },
  "monitor": {
    "poll_interval": 120,
    "failure_threshold": 2,
    "cluster_threshold": 3,
    "ping_hosts": [
      {"name": "pbs", "ip": "10.10.10.3"}
    ]
  },
  "hosts": [
    {
      "name": "large1",
      "ip": "10.10.10.2",
      "prometheus_instance": "10.10.10.2:9100"
    },
    {
      "name": "compute-storage-01",
      "ip": "10.10.10.4",
      "prometheus_instance": "10.10.10.4:9100"
    },
    {
      "name": "micro1",
      "ip": "10.10.10.8",
      "prometheus_instance": "10.10.10.8:9100"
    },
    {
      "name": "monitor-02",
      "ip": "10.10.10.9",
      "prometheus_instance": "10.10.10.9:9100"
    },
    {
      "name": "compute-storage-gpu-01",
      "ip": "10.10.10.10",
      "prometheus_instance": "10.10.10.10:9100"
    },
    {
      "name": "storage-01",
      "ip": "10.10.10.11",
      "prometheus_instance": "10.10.10.11:9100"
    }
  ]
}