Complete rewrite: full-featured network monitoring dashboard

- Two-service architecture: Flask web app (gandalf.service) + background
  polling daemon (gandalf-monitor.service)
- Monitor polls Prometheus node_network_up for physical NIC states on all
  6 hypervisors (added storage-01 at 10.10.10.11:9100)
- UniFi API monitoring for switches, APs, and gateway device status
- Ping reachability for hosts without node_exporter (pbs only now)
- Smart baseline: interfaces first seen as down are never alerted on;
  only UP→DOWN regressions trigger tickets
- Cluster-wide P1 ticket when 3+ hosts have genuine simultaneous
  interface regressions (guards against false positives on startup)
- Tinker Tickets integration with 24-hour hash-based deduplication
- Alert suppression: manual toggle or timed windows (30m/1h/4h/8h)
- Authelia SSO via forward-auth headers, admin group required
- Network topology: Internet → UDM-Pro → Agg Switch (10G DAC) →
  PoE Switch (10G DAC) → Hosts
- MariaDB schema, suppression management UI, host/interface cards

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-01 23:03:18 -05:00
parent 4ed5ecacbb
commit 0c0150f698
13 changed files with 2787 additions and 512 deletions

323
app.py
View File

@@ -1,144 +1,207 @@
import logging
"""Gandalf Global Advanced Network Detection And Link Facilitator.
Flask web application serving the monitoring dashboard and suppression
management UI. Authentication via Authelia forward-auth headers.
All monitoring and alerting is handled by the separate monitor.py daemon.
"""
import json
import platform
import subprocess
import threading
import time
from datetime import datetime
from flask import Flask, render_template, jsonify
import requests
from urllib3.exceptions import InsecureRequestWarning
import logging
from functools import wraps
from flask import Flask, jsonify, redirect, render_template, request, url_for
import db
# Module-level setup: logging, the Flask application object and shared state.
# Root logging is configured at INFO with a timestamp/level/name/message format.
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s %(levelname)s %(name)s %(message)s',
)
logger = logging.getLogger('gandalf.web')
# NOTE(review): logging.basicConfig() does nothing once the root logger has
# handlers, so this second call presumably has no effect -- confirm whether
# it is a leftover.
logging.basicConfig(level=logging.DEBUG)
# Rebinds `logger`, replacing the 'gandalf.web' logger assigned above.
logger = logging.getLogger(__name__)
# Turns off urllib3's InsecureRequestWarning; UnifiAPI creates its session
# with certificate verification disabled.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
# The Flask application that the @app.route decorators below register on.
app = Flask(__name__)
# Mapping returned as JSON by the status() view; nothing in the visible part
# of this file writes to it.
device_status = {}
def load_config():
    """Read and parse ``config.json`` from the current working directory.

    The file is re-read on every call; nothing is cached here.

    Returns:
        The decoded top-level JSON value of the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8: without an encoding argument open() uses the
    # locale's preferred encoding, which can mis-decode non-ASCII values.
    with open('config.json', encoding='utf-8') as f:
        return json.load(f)
# Module-level cache for the parsed config.json; stays None until _config()
# loads the file on its first call.
_cfg = None
class UnifiAPI:
    """Small client for the UniFi controller's ``/proxy/network/v2`` device API.

    Every request goes through one ``requests.Session`` and carries the
    ``X-API-KEY`` header built from the application config.
    """

    # Seconds to wait on the controller before a request is abandoned.
    DEFAULT_TIMEOUT = 10

    def __init__(self, config, timeout=DEFAULT_TIMEOUT):
        """Create a client from the application config.

        Args:
            config: Mapping providing ``config['unifi']['controller']`` (base
                URL) and ``config['unifi']['api_key']``.
            timeout: Per-request timeout in seconds handed to ``requests``.

        Raises:
            KeyError: If either required config entry is missing.
        """
        self.base_url = config['unifi']['controller']
        self.timeout = timeout
        self.session = requests.Session()
        # NOTE(review): TLS certificate verification is switched off for this
        # session -- presumably because the controller uses a self-signed
        # certificate; confirm this is intended.
        self.session.verify = False
        self.headers = {
            'X-API-KEY': config['unifi']['api_key'],
            'Accept': 'application/json'
        }
        self.site_id = "default"

    @staticmethod
    def _summarize_device(device):
        """Reduce one controller device record to the fields the UI uses."""
        # 'uplink' may be present but null; treat that like a missing uplink
        # instead of failing on None.get(...).
        uplink = device.get('uplink') or {}
        return {
            'name': device.get('name', 'Unknown'),
            'ip': device.get('ip', '0.0.0.0'),
            'type': device.get('type', 'unknown'),
            'connection_type': 'fiber' if uplink.get('media') == 'sfp' else 'copper',
            'critical': device.get('type') in ('udm', 'usw'),
            'device_id': device.get('mac')
        }

    def get_devices(self):
        """Fetch the site's device list and summarise each network device.

        Returns:
            A list of dicts with the keys ``name``, ``ip``, ``type``,
            ``connection_type`` (``'fiber'`` when the uplink media is
            ``'sfp'``, otherwise ``'copper'``), ``critical`` (true for the
            ``udm`` and ``usw`` types) and ``device_id`` (the device's
            ``mac`` field).  Any failure is logged and yields ``[]``.
        """
        url = f"{self.base_url}/proxy/network/v2/api/site/{self.site_id}/device"
        try:
            # A timeout keeps an unresponsive controller from blocking the
            # caller forever.
            response = self.session.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()
            logger.debug("Response status: %s", response.status_code)

            devices_data = response.json()
            logger.debug("Parsed JSON: %s", devices_data)

            # Only the 'network_devices' section of the response is used.
            devices = [
                self._summarize_device(device)
                for device in devices_data.get('network_devices', [])
            ]
            logger.debug("Processed devices: %s", devices)
            return devices
        except Exception:
            # Deliberate best-effort boundary: callers always receive a list.
            logger.exception("Error fetching devices")
            return []

    def get_device_details(self, device_id):
        """Fetch the raw controller record for one device.

        Args:
            device_id: Identifier appended to the device URL.

        Returns:
            The decoded JSON response, or ``None`` when the request or the
            decoding fails (the error is logged).
        """
        url = f"{self.base_url}/proxy/network/v2/api/site/{self.site_id}/device/{device_id}"
        try:
            response = self.session.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()
            return response.json()
        except Exception as e:
            logger.error("Failed to get device details: %s", e)
            return None

    def get_device_diagnostics(self, device):
        """Build a state and per-port summary for one device.

        Args:
            device: Mapping with a ``device_id`` key, as produced by
                ``get_devices()``.

        Returns:
            ``{'state': ..., 'interfaces': {'ports': {...}}}`` where each port
            is keyed ``"Port <port_idx>"``, or
            ``{'state': 'ERROR', 'error': ...}`` when the details are
            unavailable.
        """
        details = self.get_device_details(device['device_id'])
        if not details:
            return {'state': 'ERROR', 'error': 'Failed to fetch device details'}

        # A null 'port_table' is handled like a missing one.
        ports = {
            f"Port {port.get('port_idx')}": {
                'state': 'up' if port.get('up') else 'down',
                'speed': {
                    'current': port.get('speed', 0),
                    'max': port.get('max_speed', 0)
                },
                'poe': port.get('poe_enable', False),
                'media': port.get('media', 'unknown')
            }
            for port in details.get('port_table') or []
        }
        return {
            'state': details.get('state', 'unknown'),
            'interfaces': {
                'ports': ports
            }
        }
def _config() -> dict:
    """Return the parsed ``config.json``, loading it on first use.

    The result is stored in the module-level ``_cfg`` and reused on every
    later call, so the file is not re-read afterwards.

    Raises:
        OSError: If the file cannot be opened on the first call.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    global _cfg
    if _cfg is None:
        # Decode explicitly as UTF-8 rather than with the locale default.
        with open('config.json', encoding='utf-8') as f:
            _cfg = json.load(f)
    return _cfg
def _parse_interfaces(self, interfaces):
    """Summarise port records into a ``{'ports': ..., 'radios': ...}`` dict.

    Args:
        interfaces: Iterable of port mappings.  Each must provide ``index``
            and ``up``; ``speed`` and ``max_speed`` default to 0.

    Returns:
        A dict whose ``'ports'`` entry maps ``"port_<index>"`` to
        ``{'state': 'up' | 'down', 'speed': {'current': ..., 'max': ...}}``.
        ``'radios'`` is always returned empty by this function.

    Raises:
        KeyError: If a port mapping lacks ``index`` or ``up``.
    """
    ports = {
        f"port_{port['index']}": {
            # Conditional expression instead of the fragile `x and a or b`.
            'state': 'up' if port['up'] else 'down',
            'speed': {
                'current': port.get('speed', 0),
                'max': port.get('max_speed', 0)
            }
        }
        for port in interfaces
    }
    return {
        'ports': ports,
        'radios': {}
    }
# ---------------------------------------------------------------------------
# Auth helpers
# ---------------------------------------------------------------------------
def _get_user() -> dict:
    """Build the current user's identity from the request's auth headers.

    Reads ``Remote-User``, ``Remote-Name``, ``Remote-Email`` and
    ``Remote-Groups``; a missing header becomes an empty string.  The groups
    header is split on commas, each entry is trimmed, and empty entries are
    dropped.

    Returns:
        Dict with the keys ``username``, ``name``, ``email`` and ``groups``
        (a list of strings).
    """
    headers = request.headers

    memberships = []
    for chunk in headers.get('Remote-Groups', '').split(','):
        label = chunk.strip()
        if label:
            memberships.append(label)

    return {
        'username': headers.get('Remote-User', ''),
        'name': headers.get('Remote-Name', ''),
        'email': headers.get('Remote-Email', ''),
        'groups': memberships,
    }
def require_auth(f):
    """Decorator that restricts a view to members of an allowed group.

    The identity comes from ``_get_user()``.  Allowed groups are read from
    ``auth.allowed_groups`` in the config and default to ``['admin']``.

    The wrapped view returns:
        * a 401 HTML response when no username is present,
        * a 403 HTML response when none of the user's groups is allowed,
        * otherwise whatever the decorated view returns.
    """
    # Function-scope import so this block carries its own dependency.
    from html import escape

    @wraps(f)
    def wrapper(*args, **kwargs):
        user = _get_user()
        if not user['username']:
            return (
                '<h1>401 Not authenticated</h1>'
                '<p>Please access Gandalf through '
                '<a href="https://auth.lotusguild.org">auth.lotusguild.org</a>.</p>',
                401,
            )
        allowed = _config().get('auth', {}).get('allowed_groups', ['admin'])
        if not any(g in allowed for g in user['groups']):
            # The username comes from a request header and the group names
            # from config; escape both before embedding them in HTML so they
            # cannot inject markup into the error page.
            username = escape(user['username'])
            group_list = escape(', '.join(allowed))
            return (
                f'<h1>403 Access denied</h1>'
                f'<p>Your account ({username}) is not in an allowed group '
                f'({group_list}).</p>',
                403,
            )
        return f(*args, **kwargs)
    return wrapper
# ---------------------------------------------------------------------------
# Page routes
# ---------------------------------------------------------------------------
@app.route('/')
def home():
    """Render ``index.html`` with the device list fetched from the UniFi API.

    The config is loaded and a new ``UnifiAPI`` client is built on each call.
    """
    client = UnifiAPI(load_config())
    return render_template('index.html', devices=client.get_devices())
# NOTE(review): no @app.route(...) line sits directly above this decorator in
# the visible source -- confirm how this view is registered.
@require_auth
def index():
    """Render the dashboard page from state stored in the database.

    Passes the current user, active events, status summary, the decoded
    ``network_snapshot`` state, the ``last_check`` state (``'Never'`` when
    unset) and the active suppressions to ``index.html``.
    """
    user = _get_user()
    events = db.get_active_events()
    summary = db.get_status_summary()
    snapshot_raw = db.get_state('network_snapshot')
    last_check = db.get_state('last_check', 'Never')

    # A corrupt snapshot must not take the whole page down; fall back to an
    # empty snapshot, in line with the fallback in api_network().
    snapshot = {}
    if snapshot_raw:
        try:
            snapshot = json.loads(snapshot_raw)
        except (TypeError, ValueError):
            logger.warning('Stored network_snapshot is not valid JSON; rendering without it')

    suppressions = db.get_active_suppressions()
    return render_template(
        'index.html',
        user=user,
        events=events,
        summary=summary,
        snapshot=snapshot,
        last_check=last_check,
        suppressions=suppressions,
    )
@app.route('/suppressions')
@require_auth
def suppressions_page():
    """Render the suppression management page.

    Passes the current user, the active suppressions, the 50 most recent
    history entries and the decoded ``network_snapshot`` state to
    ``suppressions.html``.
    """
    user = _get_user()
    active = db.get_active_suppressions()
    history = db.get_suppression_history(limit=50)
    snapshot_raw = db.get_state('network_snapshot')

    # A corrupt snapshot must not take the whole page down; fall back to an
    # empty snapshot instead of raising.
    snapshot = {}
    if snapshot_raw:
        try:
            snapshot = json.loads(snapshot_raw)
        except (TypeError, ValueError):
            logger.warning('Stored network_snapshot is not valid JSON; rendering without it')

    return render_template(
        'suppressions.html',
        user=user,
        active=active,
        history=history,
        snapshot=snapshot,
    )
# ---------------------------------------------------------------------------
# API routes
# ---------------------------------------------------------------------------
@app.route('/api/status')
def status():
    """Return the module-level ``device_status`` mapping as a JSON response."""
    current = device_status
    return jsonify(current)
# NOTE(review): no @app.route(...) line sits directly above this decorator in
# the visible source -- confirm how this view is registered.
@require_auth
def api_status():
    """Return the status summary, last check value and active events as JSON.

    ``last_check`` is ``'Never'`` when the state has not been stored yet.
    """
    summary = db.get_status_summary()
    last_check = db.get_state('last_check', 'Never')
    events = db.get_active_events()
    payload = {
        'summary': summary,
        'last_check': last_check,
        'events': events,
    }
    return jsonify(payload)
@app.route('/api/network')
@require_auth
def api_network():
    """Return the stored ``network_snapshot`` state as JSON.

    When no snapshot is stored, or the stored value cannot be decoded, an
    empty ``{'hosts': {}, 'unifi': [], 'updated': None}`` document is
    returned instead.
    """
    raw = db.get_state('network_snapshot')
    if raw:
        try:
            snapshot = json.loads(raw)
        except (TypeError, ValueError):
            # Keep the best-effort fallback, but leave a trace instead of
            # swallowing the problem silently.
            logger.warning('Stored network_snapshot is not valid JSON; returning empty snapshot')
        else:
            return jsonify(snapshot)
    return jsonify({'hosts': {}, 'unifi': [], 'updated': None})
@app.route('/api/events')
@require_auth
def api_events():
    """Return active events plus up to 30 events resolved in the last 24 hours."""
    active = db.get_active_events()
    resolved = db.get_recent_resolved(hours=24, limit=30)
    return jsonify({'active': active, 'resolved': resolved})
@app.route('/api/suppressions', methods=['GET'])
@require_auth
def api_get_suppressions():
    """Return the currently active suppressions as JSON."""
    active = db.get_active_suppressions()
    return jsonify(active)
@app.route('/api/suppressions', methods=['POST'])
@require_auth
def api_create_suppression():
    """Create an alert suppression from a JSON request body.

    Body fields:
        target_type: ``host``, ``interface``, ``unifi_device`` or ``all``
            (defaults to ``host``).
        target_name: Required unless ``target_type`` is ``all``.
        target_detail: Optional.
        reason: Required.
        expires_minutes: Optional positive number of minutes; a missing or
            falsy value is passed on as ``None``.

    Returns:
        ``{'success': True, 'id': <id>}`` on success, or ``{'error': ...}``
        with status 400 when validation fails.
    """
    def _text(value):
        # Non-string JSON values count as "not provided" rather than
        # crashing on .strip().
        return value.strip() if isinstance(value, str) else ''

    user = _get_user()
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        # Missing, unparseable or non-object bodies fall through to the field
        # validation below and are answered with a 400.
        data = {}

    target_type = data.get('target_type', 'host')
    target_name = _text(data.get('target_name'))
    target_detail = _text(data.get('target_detail'))
    reason = _text(data.get('reason'))
    expires_minutes = data.get('expires_minutes')  # None = manual/permanent

    if target_type not in ('host', 'interface', 'unifi_device', 'all'):
        return jsonify({'error': 'Invalid target_type'}), 400
    if target_type != 'all' and not target_name:
        return jsonify({'error': 'target_name required'}), 400
    if not reason:
        return jsonify({'error': 'reason required'}), 400

    # Reject malformed expiry values with a 400 instead of letting int()
    # raise (a 500) or storing a non-positive duration.
    expiry_error = 'expires_minutes must be a positive integer'
    expires = None
    if expires_minutes:
        if isinstance(expires_minutes, bool):
            return jsonify({'error': expiry_error}), 400
        try:
            expires = int(expires_minutes)
        except (TypeError, ValueError, OverflowError):
            return jsonify({'error': expiry_error}), 400
        if expires <= 0:
            return jsonify({'error': expiry_error}), 400

    sup_id = db.create_suppression(
        target_type=target_type,
        target_name=target_name,
        target_detail=target_detail,
        reason=reason,
        suppressed_by=user['username'],
        expires_minutes=expires,
    )
    logger.info(
        'Suppression #%s created by %s: %s/%s/%s %s',
        sup_id, user['username'], target_type, target_name, target_detail, reason,
    )
    return jsonify({'success': True, 'id': sup_id})
@app.route('/api/suppressions/<int:sup_id>', methods=['DELETE'])
@require_auth
def api_delete_suppression(sup_id: int):
    """Deactivate the suppression with the given id and log who removed it.

    Returns:
        ``{'success': True}`` as JSON.
    """
    actor = _get_user()['username']
    db.deactivate_suppression(sup_id)
    logger.info(f'Suppression #{sup_id} removed by {actor}')
    return jsonify({'success': True})
@app.route('/health')
def health():
    """Health probe; not wrapped in ``require_auth``.

    Returns a fixed JSON document identifying the service.
    """
    payload = {'status': 'ok', 'service': 'gandalf'}
    return jsonify(payload)
@app.route('/api/diagnostics')
def get_diagnostics():
    """Return per-device diagnostics from the UniFi API, keyed by device name.

    The config is loaded and a new ``UnifiAPI`` client is built on each call;
    devices that share a name overwrite one another in the result.
    """
    client = UnifiAPI(load_config())
    report = {
        entry['name']: client.get_device_diagnostics(entry)
        for entry in client.get_devices()
    }
    return jsonify(report)
# Script entry point: runs only when this module is executed directly.
if __name__ == '__main__':
    # Start update_status on a daemon thread before serving.
    # NOTE(review): update_status is not defined in the visible part of this
    # file -- confirm it still exists.
    status_thread = threading.Thread(target=update_status, daemon=True)
    status_thread.start()
    # NOTE(review): two app.run() calls in a row; the second presumably only
    # starts after the first returns -- confirm which one is intended.
    app.run(debug=True)
    # NOTE(review): debug=True together with host='0.0.0.0' -- confirm this
    # entry point is never used for a network-reachable deployment.
    app.run(debug=True, host='0.0.0.0', port=5000)