Complete rewrite: full-featured network monitoring dashboard
- Two-service architecture: Flask web app (gandalf.service) + background polling daemon (gandalf-monitor.service) - Monitor polls Prometheus node_network_up for physical NIC states on all 6 hypervisors (added storage-01 at 10.10.10.11:9100) - UniFi API monitoring for switches, APs, and gateway device status - Ping reachability for hosts without node_exporter (pbs only now) - Smart baseline: interfaces first seen as down are never alerted on; only UP→DOWN regressions trigger tickets - Cluster-wide P1 ticket when 3+ hosts have genuine simultaneous interface regressions (guards against false positives on startup) - Tinker Tickets integration with 24-hour hash-based deduplication - Alert suppression: manual toggle or timed windows (30m/1h/4h/8h) - Authelia SSO via forward-auth headers, admin group required - Network topology: Internet → UDM-Pro → Agg Switch (10G DAC) → PoE Switch (10G DAC) → Hosts - MariaDB schema, suppression management UI, host/interface cards Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
479
monitor.py
Normal file
479
monitor.py
Normal file
@@ -0,0 +1,479 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Gandalf network monitor daemon.
|
||||
|
||||
Polls Prometheus (node_exporter) and the UniFi controller for network
|
||||
interface and device state. Creates tickets in Tinker Tickets when issues
|
||||
are detected, with deduplication and suppression support.
|
||||
|
||||
Run as a separate systemd service alongside the Flask web app.
|
||||
"""
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import subprocess
|
||||
import time
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
import requests
|
||||
from urllib3.exceptions import InsecureRequestWarning
|
||||
|
||||
import db
|
||||
|
||||
# The UniFi controller presents a self-signed certificate (the client below
# sets session.verify = False), so silence the per-request
# InsecureRequestWarning noise in the logs.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
|
||||
|
||||
# Log to stdout/stderr in single-line timestamped records; when run under
# systemd (gandalf-monitor.service) journald captures this output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s %(name)s %(message)s',
)
logger = logging.getLogger('gandalf.monitor')
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# Interface filtering
|
||||
# --------------------------------------------------------------------------
|
||||
# Name prefixes that denote virtual/bridge/container plumbing (loopback,
# veth pairs, Proxmox firewall bridges, Docker/libvirt bridges, ...) rather
# than physical NICs.
_SKIP_PREFIXES = (
    'lo', 'veth', 'tap', 'fwbr', 'fwln', 'fwpr',
    'docker', 'dummy', 'br-', 'virbr', 'vmbr',
)
# VLAN sub-interfaces look like 'eth0.100' — a trailing '.<digits>'.
_VLAN_SUFFIX = re.compile(r'\.\d+$')


def is_physical_interface(name: str) -> bool:
    """Return True for physical/bond interfaces worth monitoring.

    Virtual interfaces (any ``_SKIP_PREFIXES`` prefix) and VLAN
    sub-interfaces are excluded; everything else — including bonds —
    is considered physical.
    """
    if name.startswith(_SKIP_PREFIXES):
        return False
    return _VLAN_SUFFIX.search(name) is None
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# Prometheus client
|
||||
# --------------------------------------------------------------------------
|
||||
class PrometheusClient:
    """Minimal read-only client for the Prometheus HTTP query API."""

    def __init__(self, url: str):
        # Normalise so the path join in query() never produces '//'.
        self.url = url.rstrip('/')

    def query(self, promql: str) -> list:
        """Run an instant query and return the result vector.

        Returns [] on any failure (network error, HTTP error, malformed
        payload) so the monitor loop keeps running.  Fix: a response whose
        status is not 'success' was previously dropped silently and was
        indistinguishable from an empty result — it is now logged.
        """
        try:
            resp = requests.get(
                f'{self.url}/api/v1/query',
                params={'query': promql},
                timeout=15,
            )
            resp.raise_for_status()
            data = resp.json()
            if data.get('status') == 'success':
                return data['data']['result']
            logger.warning(
                f'Prometheus returned non-success status for {promql!r}: '
                f'{data.get("status")!r}'
            )
        except Exception as e:
            logger.error(f'Prometheus query failed ({promql!r}): {e}')
        return []

    def get_interface_states(self) -> Dict[str, Dict[str, bool]]:
        """Return {instance: {device: is_up}} for physical interfaces."""
        results = self.query('node_network_up')
        hosts: Dict[str, Dict[str, bool]] = {}
        for r in results:
            instance = r['metric'].get('instance', '')
            device = r['metric'].get('device', '')
            if not is_physical_interface(device):
                continue
            # Prometheus sample values arrive as strings: '1' means link up.
            hosts.setdefault(instance, {})[device] = (r['value'][1] == '1')
        return hosts
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# UniFi client
|
||||
# --------------------------------------------------------------------------
|
||||
class UnifiClient:
    """Thin wrapper around the UniFi controller's v2 device endpoint."""

    def __init__(self, cfg: dict):
        self.base_url = cfg['controller']
        self.site_id = cfg.get('site_id', 'default')
        self.session = requests.Session()
        # Controller uses a self-signed certificate; warnings are silenced
        # at module import time.
        self.session.verify = False
        self.headers = {
            'X-API-KEY': cfg['api_key'],
            'Accept': 'application/json',
        }

    def get_devices(self) -> Optional[List[dict]]:
        """Return list of UniFi devices, or None if the controller is unreachable."""
        endpoint = f'{self.base_url}/proxy/network/v2/api/site/{self.site_id}/device'
        try:
            resp = self.session.get(endpoint, headers=self.headers, timeout=15)
            resp.raise_for_status()
            payload = resp.json()
            # state == 1 is the controller's "connected" value; anything
            # else (or a missing field, defaulted to 1) counts as offline.
            return [
                {
                    'name': entry.get('name') or entry.get('mac', 'unknown'),
                    'mac': entry.get('mac', ''),
                    'ip': entry.get('ip', ''),
                    'type': entry.get('type', 'unknown'),
                    'model': entry.get('model', ''),
                    'state': entry.get('state', 1),
                    'connected': entry.get('state', 1) == 1,
                }
                for entry in payload.get('network_devices', [])
            ]
        except Exception as e:
            logger.error(f'UniFi API error: {e}')
            return None
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# Ticket client
|
||||
# --------------------------------------------------------------------------
|
||||
class TicketClient:
    """Client for the Tinker Tickets creation endpoint.

    The ticket API performs its own deduplication and may answer with an
    ``existing_ticket_id`` instead of creating a new ticket.
    """

    def __init__(self, cfg: dict):
        self.url = cfg.get('url', '')
        self.api_key = cfg.get('api_key', '')

    def create(self, title: str, description: str, priority: str = '2') -> Optional[str]:
        """Create a ticket and return its id (new or existing), else None."""
        if not (self.api_key and self.url):
            logger.warning('Ticket API not configured – skipping ticket creation')
            return None

        payload = {
            'title': title,
            'description': description,
            'status': 'Open',
            'priority': priority,
            'category': 'Network',
            'type': 'Issue',
        }
        try:
            resp = requests.post(
                self.url,
                json=payload,
                headers={'Authorization': f'Bearer {self.api_key}'},
                timeout=15,
            )
            resp.raise_for_status()
            data = resp.json()
            if data.get('success'):
                tid = data['ticket_id']
                logger.info(f'Created ticket #{tid}: {title}')
                return tid
            if data.get('existing_ticket_id'):
                logger.info(f'Duplicate suppressed by API – existing #{data["existing_ticket_id"]}')
                return data['existing_ticket_id']
            logger.warning(f'Unexpected ticket API response: {data}')
        except Exception as e:
            logger.error(f'Ticket creation failed: {e}')
        return None
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# --------------------------------------------------------------------------
|
||||
def ping(ip: str, count: int = 3, timeout: int = 2) -> bool:
    """Return True if *ip* answers ICMP echo (``ping`` exits with status 0).

    Any failure — non-zero exit, the 30s subprocess timeout, or a missing
    ``ping`` binary — is reported as unreachable (False).
    """
    cmd = ['ping', '-c', str(count), '-W', str(timeout), ip]
    try:
        proc = subprocess.run(
            cmd,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            timeout=30,
        )
    except Exception:
        return False
    return proc.returncode == 0
|
||||
|
||||
|
||||
def _now_utc() -> str:
|
||||
return datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S UTC')
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# Monitor
|
||||
# --------------------------------------------------------------------------
|
||||
# Synthetic "host" label under which cluster-wide events and their P1
# tickets are recorded.
CLUSTER_NAME = 'proxmox-cluster'
|
||||
|
||||
|
||||
class NetworkMonitor:
    """Background polling daemon: collects network state and raises tickets.

    Each cycle (see run()) stores a dashboard snapshot in the DB, then
    evaluates Prometheus interface states, UniFi device states, and
    ping-only hosts, opening/resolving events and creating tickets via
    the Tinker Tickets API.
    """

    def __init__(self):
        # Config is read once at startup; restart the service to pick up changes.
        with open('config.json') as f:
            self.cfg = json.load(f)

        prom_url = self.cfg['prometheus']['url']
        self.prom = PrometheusClient(prom_url)
        self.unifi = UnifiClient(self.cfg['unifi'])
        self.tickets = TicketClient(self.cfg.get('ticket_api', {}))

        mon = self.cfg.get('monitor', {})
        # Seconds between check cycles.
        self.poll_interval = mon.get('poll_interval', 120)
        # Consecutive failing checks required before a ticket is raised.
        self.fail_thresh = mon.get('failure_threshold', 2)
        # Hosts with simultaneous regressions needed to escalate cluster-wide.
        self.cluster_thresh = mon.get('cluster_threshold', 3)

        # Build Prometheus instance → hostname lookup
        # (e.g. '10.10.10.11:9100' → 'storage-01'); hosts without a
        # 'prometheus_instance' key are skipped.
        self._instance_map: Dict[str, str] = {
            h['prometheus_instance']: h['name']
            for h in self.cfg.get('hosts', [])
            if 'prometheus_instance' in h
        }

    def _hostname(self, instance: str) -> str:
        """Map a Prometheus instance label to a friendly hostname.

        Falls back to the instance's address part (before ':') when the
        instance is not in the configured map.
        """
        return self._instance_map.get(instance, instance.split(':')[0])

    # ------------------------------------------------------------------
    # Interface monitoring (Prometheus)
    # ------------------------------------------------------------------
    def _process_interfaces(self, states: Dict[str, Dict[str, bool]]) -> None:
        """Diff interface states against the stored baseline and alert.

        Baseline values per host/interface are 'up' or 'initial_down'.
        Interfaces first observed down are recorded as 'initial_down' and
        never alerted on; only an UP→DOWN transition counts as a regression
        and can open an event/ticket.  3+ hosts regressing in the same
        cycle escalates to a single cluster-wide P1 ticket instead.
        """
        baseline = db.get_baseline()
        # Deep-ish copy (one level) so the stored baseline is not mutated
        # while we iterate.
        new_baseline = {k: dict(v) for k, v in baseline.items()}
        # Only count hosts with genuine regressions (UP→DOWN) toward cluster threshold
        hosts_with_regression: List[str] = []

        for instance, ifaces in states.items():
            host = self._hostname(instance)
            new_baseline.setdefault(host, {})
            host_has_regression = False

            for iface, is_up in ifaces.items():
                prev = baseline.get(host, {}).get(iface)  # 'up', 'initial_down', or None

                if is_up:
                    # Healthy: promote baseline to 'up' and auto-resolve any
                    # open interface_down event for this host/interface.
                    new_baseline[host][iface] = 'up'
                    db.resolve_event('interface_down', host, iface)
                else:
                    if prev is None:
                        # First observation is down – could be unused port, don't alert
                        new_baseline[host][iface] = 'initial_down'

                    elif prev == 'initial_down':
                        # Persistently down since first observation – no alert
                        pass

                    else:  # prev == 'up'
                        # Regression: was UP, now DOWN
                        host_has_regression = True
                        # Suppression applies at interface or whole-host scope.
                        sup = (
                            db.is_suppressed('interface', host, iface) or
                            db.is_suppressed('host', host)
                        )
                        # upsert_event returns (event_id, is_new, consecutive
                        # failure count) — NOTE(review): inferred from usage
                        # here; confirm against db.py.
                        event_id, is_new, consec = db.upsert_event(
                            'interface_down', 'critical', 'prometheus',
                            host, iface,
                            f'Interface {iface} on {host} went link-down ({_now_utc()})',
                        )
                        if not sup and consec >= self.fail_thresh:
                            self._ticket_interface(event_id, is_new, host, iface, consec)

            if host_has_regression:
                hosts_with_regression.append(host)

        db.set_baseline(new_baseline)

        # Cluster-wide check – only genuine regressions count
        if len(hosts_with_regression) >= self.cluster_thresh:
            sup = db.is_suppressed('all', '')
            event_id, is_new, consec = db.upsert_event(
                'cluster_network_issue', 'critical', 'prometheus',
                CLUSTER_NAME, '',
                f'{len(hosts_with_regression)} hosts reporting simultaneous interface failures: '
                f'{", ".join(hosts_with_regression)}',
            )
            # Cluster tickets are created only once per event (is_new), not
            # per-cycle like single-node tickets.
            if not sup and is_new:
                title = (
                    f'[{CLUSTER_NAME}][auto][production][issue][network][cluster-wide] '
                    f'Multiple hosts reporting interface failures'
                )
                desc = (
                    f'Cluster Network Alert\n{"=" * 40}\n\n'
                    f'Affected hosts: {", ".join(hosts_with_regression)}\n'
                    f'Detected: {_now_utc()}\n\n'
                    f'{len(hosts_with_regression)} Proxmox hosts simultaneously reported '
                    f'interface regressions (link-down on interfaces previously known UP).\n'
                    f'This likely indicates a switch or upstream network failure.\n\n'
                    f'Please check the core and management switches immediately.'
                )
                tid = self.tickets.create(title, desc, priority='1')
                if tid:
                    db.set_ticket_id(event_id, tid)
        else:
            # Below threshold this cycle: close any open cluster-wide event.
            db.resolve_event('cluster_network_issue', CLUSTER_NAME, '')

    def _ticket_interface(
        self, event_id: int, is_new: bool, host: str, iface: str, consec: int
    ) -> None:
        """Create a P2 ticket for a single interface regression.

        create() is attempted each qualifying cycle; duplicate suppression
        is handled by the ticket API (see TicketClient's existing_ticket_id
        path).  The ticket id is stored only for newly-opened events.
        """
        title = (
            f'[{host}][auto][production][issue][network][single-node] '
            f'Interface {iface} link-down'
        )
        desc = (
            f'Network Interface Alert\n{"=" * 40}\n\n'
            f'Host: {host}\n'
            f'Interface: {iface}\n'
            f'Detected: {_now_utc()}\n'
            f'Consecutive check failures: {consec}\n\n'
            f'Interface {iface} on {host} is reporting link-down state via '
            f'Prometheus node_exporter.\n\n'
            f'Note: {host} may still be reachable via its other network interface.\n'
            f'Please inspect the cable/SFP/switch port for {host}/{iface}.'
        )
        tid = self.tickets.create(title, desc, priority='2')
        if tid and is_new:
            db.set_ticket_id(event_id, tid)

    # ------------------------------------------------------------------
    # UniFi device monitoring
    # ------------------------------------------------------------------
    def _process_unifi(self, devices: Optional[List[dict]]) -> None:
        """Open/resolve offline events for UniFi switches, APs and gateway.

        ``devices is None`` means the controller itself was unreachable;
        that is logged but deliberately not ticketed here.
        """
        if devices is None:
            logger.warning('UniFi API unreachable this cycle')
            return

        for d in devices:
            name = d['name']
            if not d['connected']:
                sup = db.is_suppressed('unifi_device', name)
                event_id, is_new, consec = db.upsert_event(
                    'unifi_device_offline', 'critical', 'unifi',
                    name, d.get('type', ''),
                    f'UniFi {name} ({d.get("ip","")}) offline ({_now_utc()})',
                )
                if not sup and consec >= self.fail_thresh:
                    self._ticket_unifi(event_id, is_new, d)
            else:
                # Device is back: auto-resolve any open offline event.
                db.resolve_event('unifi_device_offline', name, d.get('type', ''))

    def _ticket_unifi(self, event_id: int, is_new: bool, device: dict) -> None:
        """Create a P2 ticket for an offline UniFi device."""
        name = device['name']
        title = (
            f'[{name}][auto][production][issue][network][single-node] '
            f'UniFi device offline'
        )
        desc = (
            f'UniFi Device Alert\n{"=" * 40}\n\n'
            f'Device: {name}\n'
            f'Type: {device.get("type","unknown")}\n'
            f'Model: {device.get("model","")}\n'
            f'Last Known IP: {device.get("ip","unknown")}\n'
            f'Detected: {_now_utc()}\n\n'
            f'The UniFi device {name} is offline per the UniFi controller.\n'
            f'Please check power and cable connectivity.'
        )
        tid = self.tickets.create(title, desc, priority='2')
        if tid and is_new:
            db.set_ticket_id(event_id, tid)

    # ------------------------------------------------------------------
    # Ping-only hosts (no node_exporter)
    # ------------------------------------------------------------------
    def _process_ping_hosts(self) -> None:
        """Check reachability of hosts configured under monitor.ping_hosts.

        These hosts expose no node_exporter, so only whole-host
        reachability (ICMP) can be monitored.
        """
        for h in self.cfg.get('monitor', {}).get('ping_hosts', []):
            name, ip = h['name'], h['ip']
            reachable = ping(ip)

            if not reachable:
                sup = db.is_suppressed('host', name)
                event_id, is_new, consec = db.upsert_event(
                    'host_unreachable', 'critical', 'ping',
                    name, ip,
                    f'Host {name} ({ip}) unreachable via ping ({_now_utc()})',
                )
                if not sup and consec >= self.fail_thresh:
                    self._ticket_unreachable(event_id, is_new, name, ip, consec)
            else:
                db.resolve_event('host_unreachable', name, ip)

    def _ticket_unreachable(
        self, event_id: int, is_new: bool, name: str, ip: str, consec: int
    ) -> None:
        """Create a P2 ticket for a ping-only host that stopped responding."""
        title = (
            f'[{name}][auto][production][issue][network][single-node] '
            f'Host unreachable'
        )
        desc = (
            f'Host Reachability Alert\n{"=" * 40}\n\n'
            f'Host: {name}\n'
            f'IP: {ip}\n'
            f'Detected: {_now_utc()}\n'
            f'Consecutive check failures: {consec}\n\n'
            f'Host {name} ({ip}) is not responding to ping from the Gandalf monitor.\n'
            f'This host does not have a Prometheus node_exporter, so interface-level '
            f'detail is unavailable.\n\n'
            f'Please check the host power, management interface, and network connectivity.'
        )
        tid = self.tickets.create(title, desc, priority='2')
        if tid and is_new:
            db.set_ticket_id(event_id, tid)

    # ------------------------------------------------------------------
    # Snapshot collection (for dashboard)
    # ------------------------------------------------------------------
    def _collect_snapshot(self) -> dict:
        """Build the dashboard state dict: per-host status plus UniFi devices.

        Host status is 'up' when all physical interfaces are up (or none
        are reported), 'down' when all are down, 'degraded' otherwise.
        """
        iface_states = self.prom.get_interface_states()
        unifi_devices = self.unifi.get_devices() or []

        hosts = {}
        for instance, ifaces in iface_states.items():
            host = self._hostname(instance)
            phys = {k: v for k, v in ifaces.items()}
            up_count = sum(1 for v in phys.values() if v)
            total = len(phys)
            if total == 0 or up_count == total:
                status = 'up'
            elif up_count == 0:
                status = 'down'
            else:
                status = 'degraded'

            hosts[host] = {
                'ip': instance.split(':')[0],
                'interfaces': {k: ('up' if v else 'down') for k, v in phys.items()},
                'status': status,
                'source': 'prometheus',
            }

        # Ping-only hosts: single quick probe for the dashboard (the
        # alerting path in _process_ping_hosts uses the full 3-packet check).
        for h in self.cfg.get('monitor', {}).get('ping_hosts', []):
            name, ip = h['name'], h['ip']
            reachable = ping(ip, count=1, timeout=2)
            hosts[name] = {
                'ip': ip,
                'interfaces': {},
                'status': 'up' if reachable else 'down',
                'source': 'ping',
            }

        return {
            'hosts': hosts,
            'unifi': unifi_devices,
            # NOTE(review): utcnow() is naive and deprecated on 3.12+;
            # consider datetime.now(timezone.utc) here.
            'updated': datetime.utcnow().isoformat(),
        }

    # ------------------------------------------------------------------
    # Main loop
    # ------------------------------------------------------------------
    def run(self) -> None:
        """Poll forever: snapshot → alert processing → sleep.

        Any exception in a cycle is logged with a traceback and the loop
        continues after the normal sleep, so one bad cycle cannot kill
        the daemon.
        """
        logger.info(
            f'Gandalf monitor started – poll_interval={self.poll_interval}s '
            f'fail_thresh={self.fail_thresh}'
        )
        while True:
            try:
                logger.info('Starting network check cycle')

                # 1. Collect and store snapshot for dashboard
                snapshot = self._collect_snapshot()
                db.set_state('network_snapshot', snapshot)
                db.set_state('last_check', _now_utc())

                # 2. Process alerts (separate Prometheus call for fresh data)
                iface_states = self.prom.get_interface_states()
                self._process_interfaces(iface_states)

                unifi_devices = self.unifi.get_devices()
                self._process_unifi(unifi_devices)

                self._process_ping_hosts()

                logger.info('Network check cycle complete')

            except Exception as e:
                logger.error(f'Monitor loop error: {e}', exc_info=True)

            time.sleep(self.poll_interval)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Entry point for the gandalf-monitor systemd service.
    NetworkMonitor().run()
|
||||
Reference in New Issue
Block a user