',
+ 401,
+ )
+ allowed = _config().get('auth', {}).get('allowed_groups', ['admin'])
+ if not any(g in allowed for g in user['groups']):
+        return (
+            f'<h1>403 – Access denied</h1>'
+            f'<p>Your account ({user["username"]}) is not in an allowed group '
+            f'({", ".join(allowed)}).</p>',
+            403,
+        )
+ return f(*args, **kwargs)
+ return wrapper
+
+
+# ---------------------------------------------------------------------------
+# Page routes
+# ---------------------------------------------------------------------------
@app.route('/')
@require_auth
def index():
    """Dashboard page: active events, status summary, and network snapshot."""
    user = _get_user()
    events = db.get_active_events()
    summary = db.get_status_summary()
    snapshot_raw = db.get_state('network_snapshot')
    last_check = db.get_state('last_check', 'Never')
    # Best-effort parse: a corrupt stored snapshot must not 500 the dashboard
    # (same policy as the /api/network endpoint).
    try:
        snapshot = json.loads(snapshot_raw) if snapshot_raw else {}
    except (TypeError, ValueError):
        snapshot = {}
    suppressions = db.get_active_suppressions()
    return render_template(
        'index.html',
        user=user,
        events=events,
        summary=summary,
        snapshot=snapshot,
        last_check=last_check,
        suppressions=suppressions,
    )
+
+
@app.route('/suppressions')
@require_auth
def suppressions_page():
    """Suppression management page: active rules, history, and snapshot."""
    user = _get_user()
    active = db.get_active_suppressions()
    history = db.get_suppression_history(limit=50)
    snapshot_raw = db.get_state('network_snapshot')
    # Best-effort parse: a corrupt stored snapshot must not 500 this page
    # (same policy as the /api/network endpoint).
    try:
        snapshot = json.loads(snapshot_raw) if snapshot_raw else {}
    except (TypeError, ValueError):
        snapshot = {}
    return render_template(
        'suppressions.html',
        user=user,
        active=active,
        history=history,
        snapshot=snapshot,
    )
+
+
+# ---------------------------------------------------------------------------
+# API routes
+# ---------------------------------------------------------------------------
@app.route('/api/status')
@require_auth
def api_status():
    """JSON: per-severity summary counts, last check time, active events."""
    payload = {
        'summary': db.get_status_summary(),
        'last_check': db.get_state('last_check', 'Never'),
        'events': db.get_active_events(),
    }
    return jsonify(payload)
+
+
@app.route('/api/network')
@require_auth
def api_network():
    """JSON network snapshot as written by the monitor daemon.

    Falls back to an empty snapshot when the stored state is missing or
    corrupt, so the dashboard polling never breaks on bad data.
    """
    raw = db.get_state('network_snapshot')
    if raw:
        try:
            return jsonify(json.loads(raw))
        except (TypeError, ValueError) as e:
            # Keep the best-effort fallback, but don't swallow it silently.
            logger.warning(f'Corrupt network_snapshot state: {e}')
    return jsonify({'hosts': {}, 'unifi': [], 'updated': None})
+
+
@app.route('/api/events')
@require_auth
def api_events():
    """JSON: currently-active events plus those resolved in the last 24h."""
    active = db.get_active_events()
    resolved = db.get_recent_resolved(hours=24, limit=30)
    return jsonify({'active': active, 'resolved': resolved})
+
+
@app.route('/api/suppressions', methods=['GET'])
@require_auth
def api_get_suppressions():
    """JSON list of currently-active suppression rules."""
    rules = db.get_active_suppressions()
    return jsonify(rules)
+
+
@app.route('/api/suppressions', methods=['POST'])
@require_auth
def api_create_suppression():
    """Create a suppression rule from a JSON body.

    Expected fields: target_type, target_name, target_detail (optional),
    reason, expires_minutes (optional; missing/None = permanent/manual).
    Returns 400 with an error message on any invalid input.
    """
    user = _get_user()
    data = request.get_json(silent=True) or {}

    target_type = data.get('target_type', 'host')
    target_name = (data.get('target_name') or '').strip()
    target_detail = (data.get('target_detail') or '').strip()
    reason = (data.get('reason') or '').strip()
    expires_minutes = data.get('expires_minutes')  # None = manual/permanent

    if target_type not in ('host', 'interface', 'unifi_device', 'all'):
        return jsonify({'error': 'Invalid target_type'}), 400
    if target_type != 'all' and not target_name:
        return jsonify({'error': 'target_name required'}), 400
    if not reason:
        return jsonify({'error': 'reason required'}), 400
    if expires_minutes:
        # Validate here instead of letting int() raise an unhandled 500 below.
        try:
            expires_minutes = int(expires_minutes)
        except (TypeError, ValueError):
            return jsonify({'error': 'expires_minutes must be an integer'}), 400
        if expires_minutes <= 0:
            return jsonify({'error': 'expires_minutes must be positive'}), 400
    else:
        expires_minutes = None

    sup_id = db.create_suppression(
        target_type=target_type,
        target_name=target_name,
        target_detail=target_detail,
        reason=reason,
        suppressed_by=user['username'],
        expires_minutes=expires_minutes,
    )
    logger.info(
        f'Suppression #{sup_id} created by {user["username"]}: '
        f'{target_type}/{target_name}/{target_detail} – {reason}'
    )
    return jsonify({'success': True, 'id': sup_id})
+
+
# BUG FIX: the route was '/api/suppressions/' with no URL parameter, so Flask
# could never supply sup_id and every DELETE raised a TypeError. The
# <int:sup_id> converter matches the function signature.
@app.route('/api/suppressions/<int:sup_id>', methods=['DELETE'])
@require_auth
def api_delete_suppression(sup_id: int):
    """Deactivate suppression rule *sup_id* and log who removed it."""
    user = _get_user()
    db.deactivate_suppression(sup_id)
    logger.info(f'Suppression #{sup_id} removed by {user["username"]}')
    return jsonify({'success': True})
+
+
@app.route('/health')
def health():
    """Health check endpoint (no auth) — used as a liveness probe."""
    return jsonify({'status': 'ok', 'service': 'gandalf'})
-@app.route('/api/diagnostics')
-def get_diagnostics():
- config = load_config()
- unifi = UnifiAPI(config)
- devices = unifi.get_devices()
- diagnostics = {}
- for device in devices:
- diagnostics[device['name']] = unifi.get_device_diagnostics(device)
- return jsonify(diagnostics)
if __name__ == '__main__':
    # Development entry point only — production runs under gunicorn.
    # The Werkzeug debugger must never be exposed on 0.0.0.0 (it allows
    # arbitrary code execution), so debug is now opt-in via FLASK_DEBUG=1.
    import os
    app.run(
        debug=os.environ.get('FLASK_DEBUG') == '1',
        host='0.0.0.0',
        port=5000,
    )
diff --git a/config.json b/config.json
index a2f1313..4a3e966 100644
--- a/config.json
+++ b/config.json
@@ -4,5 +4,61 @@
"api_key": "kyPfIsAVie3hwMD4Bc1MjAu8N7HVPIb8",
"site_id": "default"
},
- "check_interval": 30
-}
\ No newline at end of file
+ "prometheus": {
+ "url": "http://10.10.10.48:9090"
+ },
+ "database": {
+ "host": "10.10.10.50",
+ "port": 3306,
+ "user": "gandalf",
+ "password": "Gandalf2026Lotus",
+ "name": "gandalf"
+ },
+ "ticket_api": {
+ "url": "http://10.10.10.45/create_ticket_api.php",
+ "api_key": "5acc5d3c647b84f7c6f59082ce4450ee772e2d1633238b960136f653d20c93af"
+ },
+ "auth": {
+ "allowed_groups": ["admin"]
+ },
+ "monitor": {
+ "poll_interval": 120,
+ "failure_threshold": 2,
+ "cluster_threshold": 3,
+ "ping_hosts": [
+ {"name": "pbs", "ip": "10.10.10.3"}
+ ]
+ },
+ "hosts": [
+ {
+ "name": "large1",
+ "ip": "10.10.10.2",
+ "prometheus_instance": "10.10.10.2:9100"
+ },
+ {
+ "name": "compute-storage-01",
+ "ip": "10.10.10.4",
+ "prometheus_instance": "10.10.10.4:9100"
+ },
+ {
+ "name": "micro1",
+ "ip": "10.10.10.8",
+ "prometheus_instance": "10.10.10.8:9100"
+ },
+ {
+ "name": "monitor-02",
+ "ip": "10.10.10.9",
+ "prometheus_instance": "10.10.10.9:9100"
+ },
+ {
+ "name": "compute-storage-gpu-01",
+ "ip": "10.10.10.10",
+ "prometheus_instance": "10.10.10.10:9100"
+ },
+ {
+ "name": "storage-01",
+ "ip": "10.10.10.11",
+ "prometheus_instance": "10.10.10.11:9100"
+ }
+ ]
+}
diff --git a/db.py b/db.py
new file mode 100644
index 0000000..5cd638b
--- /dev/null
+++ b/db.py
@@ -0,0 +1,304 @@
+"""Database operations for Gandalf network monitor."""
+import json
+import logging
+from contextlib import contextmanager
+from datetime import datetime, timedelta
+from typing import Optional
+
+import pymysql
+import pymysql.cursors
+
+logger = logging.getLogger(__name__)
+
# Cached 'database' section of config.json (loaded once per process).
_config_cache = None


def _config() -> dict:
    """Return the 'database' section of config.json, cached after first read."""
    global _config_cache
    if _config_cache is not None:
        return _config_cache
    with open('config.json') as fh:
        _config_cache = json.load(fh)['database']
    return _config_cache
+
+
@contextmanager
def get_conn():
    """Yield a short-lived PyMySQL connection (autocommit, DictCursor).

    Opened per call and always closed on exit — no pooling.
    """
    cfg = _config()
    params = dict(
        host=cfg['host'],
        port=cfg.get('port', 3306),
        user=cfg['user'],
        password=cfg['password'],
        database=cfg['name'],
        autocommit=True,
        cursorclass=pymysql.cursors.DictCursor,
        connect_timeout=10,
        charset='utf8mb4',
    )
    conn = pymysql.connect(**params)
    try:
        yield conn
    finally:
        conn.close()
+
+
+# ---------------------------------------------------------------------------
+# Monitor state (key/value store)
+# ---------------------------------------------------------------------------
+
def set_state(key: str, value) -> None:
    """Upsert *value* under *key* in monitor_state.

    Non-string values are JSON-encoded (falling back to str() for
    non-serializable members).
    """
    payload = value if isinstance(value, str) else json.dumps(value, default=str)
    with get_conn() as conn, conn.cursor() as cur:
        cur.execute(
            """INSERT INTO monitor_state (key_name, value)
               VALUES (%s, %s)
               ON DUPLICATE KEY UPDATE value=VALUES(value), updated_at=NOW()""",
            (key, payload),
        )
+
+
def get_state(key: str, default=None):
    """Return the stored string value for *key*, or *default* when absent."""
    with get_conn() as conn, conn.cursor() as cur:
        cur.execute('SELECT value FROM monitor_state WHERE key_name=%s', (key,))
        row = cur.fetchone()
    return row['value'] if row else default
+
+
+# ---------------------------------------------------------------------------
+# Interface baseline tracking
+# ---------------------------------------------------------------------------
+
def get_baseline() -> dict:
    """Return the persisted interface baseline, or {} when absent/corrupt."""
    raw = get_state('interface_baseline')
    if raw:
        try:
            return json.loads(raw)
        except (TypeError, ValueError) as e:
            # A corrupt baseline is recoverable (the monitor rebuilds it), but
            # log it instead of silently discarding interface history.
            logger.warning(f'Discarding corrupt interface_baseline: {e}')
    return {}
+
+
def set_baseline(baseline: dict) -> None:
    """Persist the interface baseline dict as JSON in monitor_state."""
    encoded = json.dumps(baseline)
    set_state('interface_baseline', encoded)
+
+
+# ---------------------------------------------------------------------------
+# Network events
+# ---------------------------------------------------------------------------
+
def upsert_event(
    event_type: str,
    severity: str,
    source_type: str,
    target_name: str,
    target_detail: str,
    description: str,
) -> tuple:
    """Insert a new open event, or bump an existing unresolved one.

    Returns (event_id, is_new, consecutive_failures).
    """
    detail = target_detail or ''
    with get_conn() as conn, conn.cursor() as cur:
        cur.execute(
            """SELECT id, consecutive_failures FROM network_events
               WHERE event_type=%s AND target_name=%s AND target_detail=%s
               AND resolved_at IS NULL LIMIT 1""",
            (event_type, target_name, detail),
        )
        row = cur.fetchone()

        if row is None:
            # First observation of this failure: open a new event.
            cur.execute(
                """INSERT INTO network_events
                   (event_type, severity, source_type, target_name, target_detail, description)
                   VALUES (%s, %s, %s, %s, %s, %s)""",
                (event_type, severity, source_type, target_name, detail, description),
            )
            return cur.lastrowid, True, 1

        # Already open: refresh last_seen/description and bump the streak.
        streak = row['consecutive_failures'] + 1
        cur.execute(
            """UPDATE network_events
               SET last_seen=NOW(), consecutive_failures=%s, description=%s
               WHERE id=%s""",
            (streak, description, row['id']),
        )
        return row['id'], False, streak
+
+
def resolve_event(event_type: str, target_name: str, target_detail: str = '') -> None:
    """Mark every matching unresolved event as resolved now (no-op if none)."""
    with get_conn() as conn, conn.cursor() as cur:
        cur.execute(
            """UPDATE network_events SET resolved_at=NOW()
               WHERE event_type=%s AND target_name=%s AND target_detail=%s
               AND resolved_at IS NULL""",
            (event_type, target_name, target_detail or ''),
        )
+
+
def set_ticket_id(event_id: int, ticket_id: str) -> None:
    """Attach an external ticket id to an event row."""
    with get_conn() as conn, conn.cursor() as cur:
        cur.execute(
            'UPDATE network_events SET ticket_id=%s WHERE id=%s',
            (ticket_id, event_id),
        )
+
+
def get_active_events() -> list:
    """Return all unresolved events, most severe first, then newest first."""
    with get_conn() as conn, conn.cursor() as cur:
        cur.execute(
            """SELECT * FROM network_events
               WHERE resolved_at IS NULL
               ORDER BY
                 FIELD(severity,'critical','warning','info'),
                 first_seen DESC"""
        )
        events = cur.fetchall()
    # Make timestamps JSON-friendly for the API/templates.
    for ev in events:
        for field in ('first_seen', 'last_seen'):
            ts = ev.get(field)
            if ts is not None and hasattr(ts, 'isoformat'):
                ev[field] = ts.isoformat()
    return events
+
+
def get_recent_resolved(hours: int = 24, limit: int = 50) -> list:
    """Return events resolved within the last *hours*, newest first."""
    with get_conn() as conn, conn.cursor() as cur:
        cur.execute(
            """SELECT * FROM network_events
               WHERE resolved_at IS NOT NULL
               AND resolved_at > DATE_SUB(NOW(), INTERVAL %s HOUR)
               ORDER BY resolved_at DESC LIMIT %s""",
            (hours, limit),
        )
        events = cur.fetchall()
    # Make timestamps JSON-friendly for the API/templates.
    for ev in events:
        for field in ('first_seen', 'last_seen', 'resolved_at'):
            ts = ev.get(field)
            if ts is not None and hasattr(ts, 'isoformat'):
                ev[field] = ts.isoformat()
    return events
+
+
def get_status_summary() -> dict:
    """Return open-event counts per severity; all three keys always present."""
    with get_conn() as conn, conn.cursor() as cur:
        cur.execute(
            """SELECT severity, COUNT(*) as cnt FROM network_events
               WHERE resolved_at IS NULL GROUP BY severity"""
        )
        rows = cur.fetchall()
    counts = {row['severity']: row['cnt'] for row in rows}
    return {sev: counts.get(sev, 0) for sev in ('critical', 'warning', 'info')}
+
+
+# ---------------------------------------------------------------------------
+# Suppression rules
+# ---------------------------------------------------------------------------
+
def get_active_suppressions() -> list:
    """Return active, unexpired suppression rules, newest first."""
    with get_conn() as conn, conn.cursor() as cur:
        cur.execute(
            """SELECT * FROM suppression_rules
               WHERE active=TRUE AND (expires_at IS NULL OR expires_at > NOW())
               ORDER BY created_at DESC"""
        )
        rules = cur.fetchall()
    # Make timestamps JSON-friendly for the API/templates.
    for rule in rules:
        for field in ('created_at', 'expires_at'):
            ts = rule.get(field)
            if ts is not None and hasattr(ts, 'isoformat'):
                rule[field] = ts.isoformat()
    return rules
+
+
def get_suppression_history(limit: int = 50) -> list:
    """Return the most recent *limit* suppression rules (any state)."""
    with get_conn() as conn, conn.cursor() as cur:
        cur.execute(
            'SELECT * FROM suppression_rules ORDER BY created_at DESC LIMIT %s',
            (limit,),
        )
        rules = cur.fetchall()
    # Make timestamps JSON-friendly for the API/templates.
    for rule in rules:
        for field in ('created_at', 'expires_at'):
            ts = rule.get(field)
            if ts is not None and hasattr(ts, 'isoformat'):
                rule[field] = ts.isoformat()
    return rules
+
+
def create_suppression(
    target_type: str,
    target_name: str,
    target_detail: str,
    reason: str,
    suppressed_by: str,
    expires_minutes: Optional[int] = None,
) -> int:
    """Insert an active suppression rule and return its id.

    *expires_minutes* of None/0 means a manual rule that never auto-expires.

    FIX: the expiry is now computed on the database server
    (DATE_ADD(NOW(), ...)) so it is compared against the same clock the
    NOW()-based expiry checks use. The original used Python's utcnow(),
    which silently drifts whenever the app host and the DB server disagree
    on clock or timezone.
    """
    minutes = int(expires_minutes) if expires_minutes else None
    with get_conn() as conn, conn.cursor() as cur:
        if minutes is None:
            cur.execute(
                """INSERT INTO suppression_rules
                   (target_type, target_name, target_detail, reason, suppressed_by, expires_at, active)
                   VALUES (%s, %s, %s, %s, %s, NULL, TRUE)""",
                (target_type, target_name or '', target_detail or '', reason, suppressed_by),
            )
        else:
            cur.execute(
                """INSERT INTO suppression_rules
                   (target_type, target_name, target_detail, reason, suppressed_by, expires_at, active)
                   VALUES (%s, %s, %s, %s, %s, DATE_ADD(NOW(), INTERVAL %s MINUTE), TRUE)""",
                (target_type, target_name or '', target_detail or '', reason, suppressed_by, minutes),
            )
        return cur.lastrowid
+
+
def deactivate_suppression(sup_id: int) -> None:
    """Mark suppression rule *sup_id* inactive (no-op for unknown ids)."""
    with get_conn() as conn, conn.cursor() as cur:
        cur.execute('UPDATE suppression_rules SET active=FALSE WHERE id=%s', (sup_id,))
+
+
def is_suppressed(target_type: str, target_name: str, target_detail: str = '') -> bool:
    """True if an active, unexpired suppression rule covers the target.

    Checked broadest-first: global ('all') rules, then name-level rules with
    no detail (e.g. a host rule covering all its interfaces), then
    detail-specific rules.
    """
    base = """SELECT id FROM suppression_rules
              WHERE active=TRUE AND (expires_at IS NULL OR expires_at > NOW())"""

    with get_conn() as conn, conn.cursor() as cur:
        def _match(clause: str, params=None) -> bool:
            cur.execute(base + clause, params)
            return cur.fetchone() is not None

        # Global suppression covers everything.
        if _match(" AND target_type='all' LIMIT 1"):
            return True

        if not target_name:
            return False

        # Name-level rule with empty detail covers all details on the target.
        if _match(
            """ AND target_type=%s AND target_name=%s
                AND (target_detail IS NULL OR target_detail='') LIMIT 1""",
            (target_type, target_name),
        ):
            return True

        # Detail-specific rule (interface/device type).
        if target_detail and _match(
            ' AND target_type=%s AND target_name=%s AND target_detail=%s LIMIT 1',
            (target_type, target_name, target_detail),
        ):
            return True

    return False
diff --git a/gandalf-monitor.service b/gandalf-monitor.service
new file mode 100644
index 0000000..aa3296d
--- /dev/null
+++ b/gandalf-monitor.service
@@ -0,0 +1,22 @@
+[Unit]
+Description=Gandalf Network Monitor Daemon
+Documentation=https://gitea.lotusguild.org/LotusGuild/gandalf
+After=network.target network-online.target
+Wants=network-online.target
+
+[Service]
+Type=simple
+User=www-data
+WorkingDirectory=/var/www/html/prod
+ExecStart=/usr/bin/python3 /var/www/html/prod/monitor.py
+Restart=on-failure
+RestartSec=30
+TimeoutStopSec=10
+
+# Logging
+StandardOutput=journal
+StandardError=journal
+SyslogIdentifier=gandalf-monitor
+
+[Install]
+WantedBy=multi-user.target
diff --git a/monitor.py b/monitor.py
new file mode 100644
index 0000000..8d09607
--- /dev/null
+++ b/monitor.py
@@ -0,0 +1,479 @@
+#!/usr/bin/env python3
+"""Gandalf network monitor daemon.
+
+Polls Prometheus (node_exporter) and the UniFi controller for network
+interface and device state. Creates tickets in Tinker Tickets when issues
+are detected, with deduplication and suppression support.
+
+Run as a separate systemd service alongside the Flask web app.
+"""
+import json
+import logging
+import re
+import subprocess
+import time
+from datetime import datetime
+from typing import Dict, List, Optional
+
+import requests
+from urllib3.exceptions import InsecureRequestWarning
+
+import db
+
+requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
+
+logging.basicConfig(
+ level=logging.INFO,
+ format='%(asctime)s %(levelname)s %(name)s %(message)s',
+)
+logger = logging.getLogger('gandalf.monitor')
+
+# --------------------------------------------------------------------------
+# Interface filtering
+# --------------------------------------------------------------------------
# Interface name prefixes that denote virtual devices (loopback, veth pairs,
# Proxmox firewall bridges, Docker, Linux bridges, etc.).
_SKIP_PREFIXES = (
    'lo', 'veth', 'tap', 'fwbr', 'fwln', 'fwpr',
    'docker', 'dummy', 'br-', 'virbr', 'vmbr',
)
# VLAN sub-interfaces look like 'eno1.20'.
_VLAN_SUFFIX = re.compile(r'\.\d+$')


def is_physical_interface(name: str) -> bool:
    """Return True for physical/bond interfaces worth monitoring.

    Virtual devices (see _SKIP_PREFIXES) and VLAN sub-interfaces are skipped.
    """
    if name.startswith(_SKIP_PREFIXES):
        return False
    return _VLAN_SUFFIX.search(name) is None
+
+
+# --------------------------------------------------------------------------
+# Prometheus client
+# --------------------------------------------------------------------------
class PrometheusClient:
    """Minimal Prometheus HTTP API client for instant queries."""

    def __init__(self, url: str):
        # Normalize so path joins below never double the slash.
        self.url = url.rstrip('/')

    def query(self, promql: str) -> list:
        """Run an instant query; return the result vector ([] on any error)."""
        try:
            resp = requests.get(
                f'{self.url}/api/v1/query',
                params={'query': promql},
                timeout=15,
            )
            resp.raise_for_status()
            body = resp.json()
            if body.get('status') == 'success':
                return body['data']['result']
        except Exception as e:
            logger.error(f'Prometheus query failed ({promql!r}): {e}')
        return []

    def get_interface_states(self) -> Dict[str, Dict[str, bool]]:
        """Return {instance: {device: is_up}} for physical interfaces only."""
        states: Dict[str, Dict[str, bool]] = {}
        for sample in self.query('node_network_up'):
            labels = sample['metric']
            device = labels.get('device', '')
            if not is_physical_interface(device):
                continue
            instance = labels.get('instance', '')
            states.setdefault(instance, {})[device] = (sample['value'][1] == '1')
        return states
+
+
+# --------------------------------------------------------------------------
+# UniFi client
+# --------------------------------------------------------------------------
class UnifiClient:
    """Thin wrapper around the UniFi controller's v2 device API."""

    def __init__(self, cfg: dict):
        self.base_url = cfg['controller']
        self.site_id = cfg.get('site_id', 'default')
        self.session = requests.Session()
        self.session.verify = False  # controller typically uses a self-signed cert
        self.headers = {
            'X-API-KEY': cfg['api_key'],
            'Accept': 'application/json',
        }

    def get_devices(self) -> Optional[List[dict]]:
        """Return a normalized device list, or None if the controller is unreachable.

        None (controller down) is deliberately distinct from [] (no devices)
        so the caller can skip offline-detection for that cycle.
        """
        try:
            url = f'{self.base_url}/proxy/network/v2/api/site/{self.site_id}/device'
            resp = self.session.get(url, headers=self.headers, timeout=15)
            resp.raise_for_status()
            payload = resp.json()
            normalized = []
            for dev in payload.get('network_devices', []):
                state = dev.get('state', 1)
                normalized.append({
                    'name': dev.get('name') or dev.get('mac', 'unknown'),
                    'mac': dev.get('mac', ''),
                    'ip': dev.get('ip', ''),
                    'type': dev.get('type', 'unknown'),
                    'model': dev.get('model', ''),
                    'state': state,
                    'connected': state == 1,
                })
            return normalized
        except Exception as e:
            logger.error(f'UniFi API error: {e}')
            return None
+
+
+# --------------------------------------------------------------------------
+# Ticket client
+# --------------------------------------------------------------------------
class TicketClient:
    """Client for the Tinker Tickets creation API."""

    def __init__(self, cfg: dict):
        self.url = cfg.get('url', '')
        self.api_key = cfg.get('api_key', '')

    def create(self, title: str, description: str, priority: str = '2') -> Optional[str]:
        """Create a ticket; return its id, or None on failure/not-configured.

        The API deduplicates on its side: when it reports an existing ticket,
        that id is returned instead of treating the call as a failure.
        """
        if not (self.api_key and self.url):
            logger.warning('Ticket API not configured – skipping ticket creation')
            return None
        payload = {
            'title': title,
            'description': description,
            'status': 'Open',
            'priority': priority,
            'category': 'Network',
            'type': 'Issue',
        }
        try:
            resp = requests.post(
                self.url,
                json=payload,
                headers={'Authorization': f'Bearer {self.api_key}'},
                timeout=15,
            )
            resp.raise_for_status()
            data = resp.json()
            if data.get('success'):
                tid = data['ticket_id']
                logger.info(f'Created ticket #{tid}: {title}')
                return tid
            if data.get('existing_ticket_id'):
                logger.info(f'Duplicate suppressed by API – existing #{data["existing_ticket_id"]}')
                return data['existing_ticket_id']
            logger.warning(f'Unexpected ticket API response: {data}')
        except Exception as e:
            logger.error(f'Ticket creation failed: {e}')
        return None
+
+
+# --------------------------------------------------------------------------
+# Helpers
+# --------------------------------------------------------------------------
def ping(ip: str, count: int = 3, timeout: int = 2) -> bool:
    """Return True when *ip* answers ICMP ping; False on any failure.

    Any subprocess problem (missing binary, overall timeout, resolution
    failure) counts as unreachable rather than raising.
    """
    cmd = ['ping', '-c', str(count), '-W', str(timeout), ip]
    try:
        result = subprocess.run(
            cmd,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            timeout=30,
        )
    except Exception:
        return False
    return result.returncode == 0
+
+
def _now_utc() -> str:
    """Current UTC time formatted as 'YYYY-MM-DD HH:MM:SS UTC'."""
    stamp = datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
    return stamp + ' UTC'
+
+
+# --------------------------------------------------------------------------
+# Monitor
+# --------------------------------------------------------------------------
+CLUSTER_NAME = 'proxmox-cluster'
+
+
+class NetworkMonitor:
+ def __init__(self):
+ with open('config.json') as f:
+ self.cfg = json.load(f)
+
+ prom_url = self.cfg['prometheus']['url']
+ self.prom = PrometheusClient(prom_url)
+ self.unifi = UnifiClient(self.cfg['unifi'])
+ self.tickets = TicketClient(self.cfg.get('ticket_api', {}))
+
+ mon = self.cfg.get('monitor', {})
+ self.poll_interval = mon.get('poll_interval', 120)
+ self.fail_thresh = mon.get('failure_threshold', 2)
+ self.cluster_thresh = mon.get('cluster_threshold', 3)
+
+ # Build Prometheus instance → hostname lookup
+ self._instance_map: Dict[str, str] = {
+ h['prometheus_instance']: h['name']
+ for h in self.cfg.get('hosts', [])
+ if 'prometheus_instance' in h
+ }
+
+ def _hostname(self, instance: str) -> str:
+ return self._instance_map.get(instance, instance.split(':')[0])
+
+ # ------------------------------------------------------------------
+ # Interface monitoring (Prometheus)
+ # ------------------------------------------------------------------
+ def _process_interfaces(self, states: Dict[str, Dict[str, bool]]) -> None:
+ baseline = db.get_baseline()
+ new_baseline = {k: dict(v) for k, v in baseline.items()}
+ # Only count hosts with genuine regressions (UP→DOWN) toward cluster threshold
+ hosts_with_regression: List[str] = []
+
+ for instance, ifaces in states.items():
+ host = self._hostname(instance)
+ new_baseline.setdefault(host, {})
+ host_has_regression = False
+
+ for iface, is_up in ifaces.items():
+ prev = baseline.get(host, {}).get(iface) # 'up', 'initial_down', or None
+
+ if is_up:
+ new_baseline[host][iface] = 'up'
+ db.resolve_event('interface_down', host, iface)
+ else:
+ if prev is None:
+ # First observation is down – could be unused port, don't alert
+ new_baseline[host][iface] = 'initial_down'
+
+ elif prev == 'initial_down':
+ # Persistently down since first observation – no alert
+ pass
+
+ else: # prev == 'up'
+ # Regression: was UP, now DOWN
+ host_has_regression = True
+ sup = (
+ db.is_suppressed('interface', host, iface) or
+ db.is_suppressed('host', host)
+ )
+ event_id, is_new, consec = db.upsert_event(
+ 'interface_down', 'critical', 'prometheus',
+ host, iface,
+ f'Interface {iface} on {host} went link-down ({_now_utc()})',
+ )
+ if not sup and consec >= self.fail_thresh:
+ self._ticket_interface(event_id, is_new, host, iface, consec)
+
+ if host_has_regression:
+ hosts_with_regression.append(host)
+
+ db.set_baseline(new_baseline)
+
+ # Cluster-wide check – only genuine regressions count
+ if len(hosts_with_regression) >= self.cluster_thresh:
+ sup = db.is_suppressed('all', '')
+ event_id, is_new, consec = db.upsert_event(
+ 'cluster_network_issue', 'critical', 'prometheus',
+ CLUSTER_NAME, '',
+ f'{len(hosts_with_regression)} hosts reporting simultaneous interface failures: '
+ f'{", ".join(hosts_with_regression)}',
+ )
+ if not sup and is_new:
+ title = (
+ f'[{CLUSTER_NAME}][auto][production][issue][network][cluster-wide] '
+ f'Multiple hosts reporting interface failures'
+ )
+ desc = (
+ f'Cluster Network Alert\n{"=" * 40}\n\n'
+ f'Affected hosts: {", ".join(hosts_with_regression)}\n'
+ f'Detected: {_now_utc()}\n\n'
+ f'{len(hosts_with_regression)} Proxmox hosts simultaneously reported '
+ f'interface regressions (link-down on interfaces previously known UP).\n'
+ f'This likely indicates a switch or upstream network failure.\n\n'
+ f'Please check the core and management switches immediately.'
+ )
+ tid = self.tickets.create(title, desc, priority='1')
+ if tid:
+ db.set_ticket_id(event_id, tid)
+ else:
+ db.resolve_event('cluster_network_issue', CLUSTER_NAME, '')
+
+ def _ticket_interface(
+ self, event_id: int, is_new: bool, host: str, iface: str, consec: int
+ ) -> None:
+ title = (
+ f'[{host}][auto][production][issue][network][single-node] '
+ f'Interface {iface} link-down'
+ )
+ desc = (
+ f'Network Interface Alert\n{"=" * 40}\n\n'
+ f'Host: {host}\n'
+ f'Interface: {iface}\n'
+ f'Detected: {_now_utc()}\n'
+ f'Consecutive check failures: {consec}\n\n'
+ f'Interface {iface} on {host} is reporting link-down state via '
+ f'Prometheus node_exporter.\n\n'
+ f'Note: {host} may still be reachable via its other network interface.\n'
+ f'Please inspect the cable/SFP/switch port for {host}/{iface}.'
+ )
+ tid = self.tickets.create(title, desc, priority='2')
+ if tid and is_new:
+ db.set_ticket_id(event_id, tid)
+
+ # ------------------------------------------------------------------
+ # UniFi device monitoring
+ # ------------------------------------------------------------------
+ def _process_unifi(self, devices: Optional[List[dict]]) -> None:
+ if devices is None:
+ logger.warning('UniFi API unreachable this cycle')
+ return
+
+ for d in devices:
+ name = d['name']
+ if not d['connected']:
+ sup = db.is_suppressed('unifi_device', name)
+ event_id, is_new, consec = db.upsert_event(
+ 'unifi_device_offline', 'critical', 'unifi',
+ name, d.get('type', ''),
+ f'UniFi {name} ({d.get("ip","")}) offline ({_now_utc()})',
+ )
+ if not sup and consec >= self.fail_thresh:
+ self._ticket_unifi(event_id, is_new, d)
+ else:
+ db.resolve_event('unifi_device_offline', name, d.get('type', ''))
+
+ def _ticket_unifi(self, event_id: int, is_new: bool, device: dict) -> None:
+ name = device['name']
+ title = (
+ f'[{name}][auto][production][issue][network][single-node] '
+ f'UniFi device offline'
+ )
+ desc = (
+ f'UniFi Device Alert\n{"=" * 40}\n\n'
+ f'Device: {name}\n'
+ f'Type: {device.get("type","unknown")}\n'
+ f'Model: {device.get("model","")}\n'
+ f'Last Known IP: {device.get("ip","unknown")}\n'
+ f'Detected: {_now_utc()}\n\n'
+ f'The UniFi device {name} is offline per the UniFi controller.\n'
+ f'Please check power and cable connectivity.'
+ )
+ tid = self.tickets.create(title, desc, priority='2')
+ if tid and is_new:
+ db.set_ticket_id(event_id, tid)
+
+ # ------------------------------------------------------------------
+ # Ping-only hosts (no node_exporter)
+ # ------------------------------------------------------------------
+ def _process_ping_hosts(self) -> None:
+ for h in self.cfg.get('monitor', {}).get('ping_hosts', []):
+ name, ip = h['name'], h['ip']
+ reachable = ping(ip)
+
+ if not reachable:
+ sup = db.is_suppressed('host', name)
+ event_id, is_new, consec = db.upsert_event(
+ 'host_unreachable', 'critical', 'ping',
+ name, ip,
+ f'Host {name} ({ip}) unreachable via ping ({_now_utc()})',
+ )
+ if not sup and consec >= self.fail_thresh:
+ self._ticket_unreachable(event_id, is_new, name, ip, consec)
+ else:
+ db.resolve_event('host_unreachable', name, ip)
+
+ def _ticket_unreachable(
+ self, event_id: int, is_new: bool, name: str, ip: str, consec: int
+ ) -> None:
+ title = (
+ f'[{name}][auto][production][issue][network][single-node] '
+ f'Host unreachable'
+ )
+ desc = (
+ f'Host Reachability Alert\n{"=" * 40}\n\n'
+ f'Host: {name}\n'
+ f'IP: {ip}\n'
+ f'Detected: {_now_utc()}\n'
+ f'Consecutive check failures: {consec}\n\n'
+ f'Host {name} ({ip}) is not responding to ping from the Gandalf monitor.\n'
+ f'This host does not have a Prometheus node_exporter, so interface-level '
+ f'detail is unavailable.\n\n'
+ f'Please check the host power, management interface, and network connectivity.'
+ )
+ tid = self.tickets.create(title, desc, priority='2')
+ if tid and is_new:
+ db.set_ticket_id(event_id, tid)
+
+ # ------------------------------------------------------------------
+ # Snapshot collection (for dashboard)
+ # ------------------------------------------------------------------
+ def _collect_snapshot(self) -> dict:
+ iface_states = self.prom.get_interface_states()
+ unifi_devices = self.unifi.get_devices() or []
+
+ hosts = {}
+ for instance, ifaces in iface_states.items():
+ host = self._hostname(instance)
+ phys = {k: v for k, v in ifaces.items()}
+ up_count = sum(1 for v in phys.values() if v)
+ total = len(phys)
+ if total == 0 or up_count == total:
+ status = 'up'
+ elif up_count == 0:
+ status = 'down'
+ else:
+ status = 'degraded'
+
+ hosts[host] = {
+ 'ip': instance.split(':')[0],
+ 'interfaces': {k: ('up' if v else 'down') for k, v in phys.items()},
+ 'status': status,
+ 'source': 'prometheus',
+ }
+
+ for h in self.cfg.get('monitor', {}).get('ping_hosts', []):
+ name, ip = h['name'], h['ip']
+ reachable = ping(ip, count=1, timeout=2)
+ hosts[name] = {
+ 'ip': ip,
+ 'interfaces': {},
+ 'status': 'up' if reachable else 'down',
+ 'source': 'ping',
+ }
+
+ return {
+ 'hosts': hosts,
+ 'unifi': unifi_devices,
+ 'updated': datetime.utcnow().isoformat(),
+ }
+
+ # ------------------------------------------------------------------
+ # Main loop
+ # ------------------------------------------------------------------
+ def run(self) -> None:
+ logger.info(
+ f'Gandalf monitor started – poll_interval={self.poll_interval}s '
+ f'fail_thresh={self.fail_thresh}'
+ )
+ while True:
+ try:
+ logger.info('Starting network check cycle')
+
+ # 1. Collect and store snapshot for dashboard
+ snapshot = self._collect_snapshot()
+ db.set_state('network_snapshot', snapshot)
+ db.set_state('last_check', _now_utc())
+
+ # 2. Process alerts (separate Prometheus call for fresh data)
+ iface_states = self.prom.get_interface_states()
+ self._process_interfaces(iface_states)
+
+ unifi_devices = self.unifi.get_devices()
+ self._process_unifi(unifi_devices)
+
+ self._process_ping_hosts()
+
+ logger.info('Network check cycle complete')
+
+ except Exception as e:
+ logger.error(f'Monitor loop error: {e}', exc_info=True)
+
+ time.sleep(self.poll_interval)
+
+
if __name__ == '__main__':
    # Daemon entry point (run under systemd; see gandalf-monitor.service).
    NetworkMonitor().run()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..e506263
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+flask>=2.2.0
+gunicorn>=20.1.0
+pymysql>=1.1.0
+requests>=2.31.0
+urllib3>=2.0.0
diff --git a/schema.sql b/schema.sql
new file mode 100644
index 0000000..d663361
--- /dev/null
+++ b/schema.sql
@@ -0,0 +1,50 @@
+-- Gandalf Network Monitor – Database Schema
+-- Run on MariaDB LXC 149 (10.10.10.50)
+
+CREATE DATABASE IF NOT EXISTS gandalf
+ CHARACTER SET utf8mb4
+ COLLATE utf8mb4_unicode_ci;
+
+USE gandalf;
+
+-- ── Network events (open and resolved alerts) ─────────────────────────
+CREATE TABLE IF NOT EXISTS network_events (
+ id INT AUTO_INCREMENT PRIMARY KEY,
+ event_type VARCHAR(60) NOT NULL,
+ severity ENUM('critical','warning','info') NOT NULL DEFAULT 'warning',
+ source_type VARCHAR(20) NOT NULL, -- 'prometheus', 'unifi', 'ping'
+ target_name VARCHAR(255) NOT NULL, -- hostname or device name
+ target_detail VARCHAR(255) NOT NULL DEFAULT '', -- interface name, device type, IP
+ description TEXT,
+ first_seen TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+ last_seen TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
+ resolved_at TIMESTAMP NULL,
+ consecutive_failures INT NOT NULL DEFAULT 1,
+ ticket_id VARCHAR(20) NULL,
+
+ INDEX idx_active (resolved_at),
+ INDEX idx_target (target_name, target_detail),
+ INDEX idx_type (event_type)
+) ENGINE=InnoDB;
+
+-- ── Suppression rules ─────────────────────────────────────────────────
+CREATE TABLE IF NOT EXISTS suppression_rules (
+ id INT AUTO_INCREMENT PRIMARY KEY,
+ target_type VARCHAR(50) NOT NULL, -- 'host', 'interface', 'unifi_device', 'all'
+ target_name VARCHAR(255) NOT NULL DEFAULT '',
+ target_detail VARCHAR(255) NOT NULL DEFAULT '',
+ reason TEXT NOT NULL,
+ suppressed_by VARCHAR(255) NOT NULL,
+ created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+ expires_at TIMESTAMP NULL, -- NULL = manual (never auto-expires)
+ active BOOLEAN NOT NULL DEFAULT TRUE,
+
+ INDEX idx_active_exp (active, expires_at)
+) ENGINE=InnoDB;
+
+-- ── Monitor state (key/value store for snapshot + baseline) ───────────
+CREATE TABLE IF NOT EXISTS monitor_state (
+ key_name VARCHAR(100) PRIMARY KEY,
+ value MEDIUMTEXT NOT NULL,
+ updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP
+) ENGINE=InnoDB;
diff --git a/static/app.js b/static/app.js
index f1ce2ed..73095be 100644
--- a/static/app.js
+++ b/static/app.js
@@ -1,147 +1,272 @@
-// Initialization
-const UPDATE_INTERVALS = {
- deviceStatus: 30000,
- diagnostics: 60000
-};
+'use strict';
-// Core update functions
-function updateDeviceStatus() {
- console.log('Fetching device status...');
- fetch('/api/status')
- .then(response => response.json())
- .then(data => {
- console.log('Received status data:', data);
- Object.entries(data).forEach(([deviceName, status]) => {
- const deviceElement = document.querySelector(`.device-status[data-device-name="${deviceName}"]`);
- if (deviceElement) {
- const indicator = deviceElement.querySelector('.status-indicator');
- indicator.className = `status-indicator status-${status ? 'up' : 'down'}`;
- }
- });
- });
+// ── Toast notifications ───────────────────────────────────────────────
function showToast(msg, type = 'success') {
  // Lazily create one shared container for all toasts.
  const container =
    document.querySelector('.toast-container') ||
    (() => {
      const el = document.createElement('div');
      el.className = 'toast-container';
      document.body.appendChild(el);
      return el;
    })();
  const toast = document.createElement('div');
  toast.className = `toast toast-${type}`;
  toast.textContent = msg;
  container.appendChild(toast);
  // Auto-dismiss after 3.5 s.
  setTimeout(() => toast.remove(), 3500);
}
-function toggleInterfaces(header) {
- const list = header.nextElementSibling;
- const icon = header.querySelector('.expand-icon');
- list.classList.toggle('collapsed');
- icon.style.transform = list.classList.contains('collapsed') ? 'rotate(-90deg)' : 'rotate(0deg)';
+// ── Dashboard auto-refresh ────────────────────────────────────────────
async function refreshAll() {
  // Pull network + status in parallel; bail out of the cycle on any failure
  // so a transient API error never wipes the rendered dashboard.
  try {
    const [netResp, statusResp] = await Promise.all([
      fetch('/api/network'),
      fetch('/api/status'),
    ]);
    if (!netResp.ok || !statusResp.ok) return;

    const net = await netResp.json();
    const status = await statusResp.json();

    const hosts = net.hosts || {};
    updateHostGrid(hosts);
    updateUnifiTable(net.unifi || []);
    updateEventsTable(status.events || []);
    updateStatusBar(status.summary || {}, status.last_check || '');
    updateTopology(hosts);
  } catch (e) {
    console.warn('Refresh failed:', e);
  }
}
-function updateInterfaceStatus(deviceName, interfaces) {
- const interfaceList = document.querySelector(`.interface-group[data-device-name="${deviceName}"] .interface-list`);
- if (interfaceList && interfaces) {
- interfaceList.innerHTML = '';
- Object.entries(interfaces.ports || {}).forEach(([portName, port]) => {
- interfaceList.innerHTML += `
-