Complete rewrite: full-featured network monitoring dashboard
- Two-service architecture: Flask web app (gandalf.service) + background polling daemon (gandalf-monitor.service)
- Monitor polls Prometheus node_network_up for physical NIC states on all 6 hypervisors (added storage-01 at 10.10.10.11:9100)
- UniFi API monitoring for switches, APs, and gateway device status
- Ping reachability for hosts without node_exporter (pbs only now)
- Smart baseline: interfaces first seen as down are never alerted on; only UP→DOWN regressions trigger tickets
- Cluster-wide P1 ticket when 3+ hosts have genuine simultaneous interface regressions (guards against false positives on startup)
- Tinker Tickets integration with 24-hour hash-based deduplication
- Alert suppression: manual toggle or timed windows (30m/1h/4h/8h)
- Authelia SSO via forward-auth headers, admin group required
- Network topology: Internet → UDM-Pro → Agg Switch (10G DAC) → PoE Switch (10G DAC) → Hosts
- MariaDB schema, suppression management UI, host/interface cards

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
323
app.py
323
app.py
@@ -1,144 +1,207 @@
|
||||
import logging
|
||||
"""Gandalf – Global Advanced Network Detection And Link Facilitator.
|
||||
|
||||
Flask web application serving the monitoring dashboard and suppression
|
||||
management UI. Authentication via Authelia forward-auth headers.
|
||||
All monitoring and alerting is handled by the separate monitor.py daemon.
|
||||
"""
|
||||
import json
|
||||
import platform
|
||||
import subprocess
|
||||
import threading
|
||||
import time
|
||||
from datetime import datetime
|
||||
from flask import Flask, render_template, jsonify
|
||||
import requests
|
||||
from urllib3.exceptions import InsecureRequestWarning
|
||||
import logging
|
||||
from functools import wraps
|
||||
|
||||
from flask import Flask, jsonify, redirect, render_template, request, url_for
|
||||
|
||||
import db
|
||||
|
||||
# Root logging configuration and the module logger.
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(name)s %(message)s')
logger = logging.getLogger('gandalf.web')

# NOTE(review): logging.basicConfig() does nothing once the root logger has
# handlers, so this second call does not lower the level to DEBUG.
logging.basicConfig(level=logging.DEBUG)
# Rebinds the module logger to one named after this module.
logger = logging.getLogger(__name__)
# Silence urllib3's warning about unverified HTTPS requests.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

app = Flask(__name__)

# Module-level mapping served as JSON by the /api/status handler `status`.
device_status = {}
|
||||
|
||||
def load_config():
    """Read ``config.json`` from the current working directory.

    The file is opened and parsed on every call; nothing is cached here.

    Returns:
        The parsed JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the locale's default
    # encoding, which would misread non-ASCII values on some hosts.
    with open('config.json', encoding='utf-8') as f:
        return json.load(f)
|
||||
_cfg = None
|
||||
|
||||
class UnifiAPI:
    """Thin client for a UniFi controller's v2 device endpoints.

    Requests go through one ``requests.Session`` and carry the API key in the
    ``X-API-KEY`` header. Failures are logged and reported through sentinel
    return values (``[]`` / ``None``) instead of being raised to the caller.
    """

    # requests applies no timeout unless one is passed, so an unresponsive
    # controller would otherwise block the caller indefinitely.
    DEFAULT_TIMEOUT = 10

    def __init__(self, config, timeout=DEFAULT_TIMEOUT):
        """Prepare the session and headers.

        Args:
            config: Mapping providing ``config['unifi']['controller']`` (base
                URL) and ``config['unifi']['api_key']``.
            timeout: Seconds to wait on each HTTP request.

        Raises:
            KeyError: If either configuration key is missing.
        """
        self.base_url = config['unifi']['controller']
        self.timeout = timeout
        self.session = requests.Session()
        # NOTE(review): TLS certificate verification is disabled here,
        # presumably for a self-signed controller certificate — confirm.
        self.session.verify = False
        self.headers = {
            'X-API-KEY': config['unifi']['api_key'],
            'Accept': 'application/json',
        }
        self.site_id = "default"

    @staticmethod
    def _summarize_device(device):
        """Reduce one raw device record to the fields this module exposes."""
        # The controller may send "uplink": null; treat that like a missing key.
        uplink = device.get('uplink') or {}
        device_type = device.get('type', 'unknown')
        return {
            'name': device.get('name', 'Unknown'),
            'ip': device.get('ip', '0.0.0.0'),
            'type': device_type,
            'connection_type': 'fiber' if uplink.get('media') == 'sfp' else 'copper',
            'critical': device_type in ('udm', 'usw'),
            'device_id': device.get('mac'),
        }

    def get_devices(self):
        """Fetch the site's device list.

        Returns:
            A list of summary dicts (``name``, ``ip``, ``type``,
            ``connection_type``, ``critical``, ``device_id``) built from the
            response's ``network_devices`` entry, or ``[]`` on any failure.
        """
        try:
            url = f"{self.base_url}/proxy/network/v2/api/site/{self.site_id}/device"
            response = self.session.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()

            # Lazy %-formatting: the payload is only rendered when DEBUG is on.
            logger.debug("Response status: %s", response.status_code)
            logger.debug("Response headers: %s", response.headers)
            logger.debug("Raw response text: %s", response.text)

            devices_data = response.json()
            logger.debug("Parsed JSON: %s", devices_data)

            network_devices = devices_data.get('network_devices') or []
            devices = [self._summarize_device(device) for device in network_devices]

            logger.debug("Processed devices: %s", devices)
            return devices
        except Exception as e:
            # Deliberate best-effort boundary: callers expect a list, never an
            # exception. logger.exception records the message and traceback.
            logger.exception("Error fetching devices: %s", e)
            return []

    def get_device_details(self, device_id):
        """Fetch the raw detail document for one device.

        Args:
            device_id: Identifier appended to the device endpoint path
                (``get_devices`` supplies the device's ``mac``).

        Returns:
            The decoded JSON response, or ``None`` on any failure.
        """
        try:
            url = f"{self.base_url}/proxy/network/v2/api/site/{self.site_id}/device/{device_id}"
            response = self.session.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()
            return response.json()
        except Exception as e:
            logger.error("Failed to get device details: %s", e)
            return None

    def get_device_diagnostics(self, device):
        """Summarise state and per-port status for one device.

        Args:
            device: A summary dict as returned by ``get_devices``; only
                ``device['device_id']`` is read.

        Returns:
            ``{'state': ..., 'interfaces': {'ports': {...}}}`` with one entry
            per item of the details' ``port_table``, keyed ``"Port <port_idx>"``.
            When details cannot be fetched (or are empty), returns
            ``{'state': 'ERROR', 'error': 'Failed to fetch device details'}``.
        """
        details = self.get_device_details(device['device_id'])
        if not details:
            return {'state': 'ERROR', 'error': 'Failed to fetch device details'}

        ports = {}
        # "port_table": null is treated the same as an absent table.
        for port in details.get('port_table') or []:
            ports[f"Port {port.get('port_idx')}"] = {
                'state': 'up' if port.get('up') else 'down',
                'speed': {
                    'current': port.get('speed', 0),
                    'max': port.get('max_speed', 0),
                },
                'poe': port.get('poe_enable', False),
                'media': port.get('media', 'unknown'),
            }

        return {
            'state': details.get('state', 'unknown'),
            'interfaces': {'ports': ports},
        }
|
||||
def _config() -> dict:
    """Return the parsed ``config.json``, loading it on first use.

    The result is stored in the module-level ``_cfg`` and reused by later
    calls, so edits to the file are not picked up until ``_cfg`` is reset.

    Raises:
        OSError: If the file cannot be opened on the first call.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    global _cfg
    if _cfg is None:
        # Decode explicitly as UTF-8 rather than with the locale's default.
        with open('config.json', encoding='utf-8') as f:
            _cfg = json.load(f)
    return _cfg
|
||||
|
||||
def _parse_interfaces(self, interfaces):
    """Convert a sequence of port records into a ports/radios summary.

    Args:
        self: Unused.
        interfaces: Iterable of dicts, each with an ``index`` key and
            optionally ``up``, ``speed`` and ``max_speed``. ``None`` is
            treated as an empty sequence.

    Returns:
        ``{'ports': {'port_<index>': {'state': 'up'|'down',
        'speed': {'current': ..., 'max': ...}}}, 'radios': {}}``.
        ``radios`` is always empty.

    Raises:
        KeyError: If a port record has no ``index``.
    """
    ports = {}
    for port in interfaces or []:
        ports[f"port_{port['index']}"] = {
            # A record without an 'up' flag is reported as down.
            'state': 'up' if port.get('up') else 'down',
            'speed': {
                'current': port.get('speed', 0),
                'max': port.get('max_speed', 0),
            },
        }
    return {'ports': ports, 'radios': {}}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Auth helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _get_user() -> dict:
    """Build the current user's identity from the request headers.

    Reads ``Remote-User``, ``Remote-Name``, ``Remote-Email`` and
    ``Remote-Groups``. Missing headers yield empty strings; the groups header
    is split on commas, with surrounding whitespace and empty entries dropped.

    Returns:
        Dict with ``username``, ``name``, ``email`` (strings) and ``groups``
        (list of strings).
    """
    headers = request.headers

    groups = []
    for entry in headers.get('Remote-Groups', '').split(','):
        cleaned = entry.strip()
        if cleaned:
            groups.append(cleaned)

    identity = {}
    identity['username'] = headers.get('Remote-User', '')
    identity['name'] = headers.get('Remote-Name', '')
    identity['email'] = headers.get('Remote-Email', '')
    identity['groups'] = groups
    return identity
|
||||
|
||||
|
||||
def require_auth(f):
    """Decorator that restricts a view to authenticated, authorised users.

    The wrapped view runs only when ``_get_user()`` reports a non-empty
    username and at least one of the user's groups appears in the configured
    ``auth.allowed_groups`` (default ``['admin']``). Otherwise an
    ``(html_body, status)`` tuple is returned: 401 when no username is
    present, 403 when no group matches.

    NOTE(review): identity comes entirely from request headers, so this is
    only a real access check if those headers cannot be set by the client —
    confirm the app is reachable solely through the auth proxy.
    """
    # Local import keeps this block self-contained; only the 403 page needs it.
    import html

    @wraps(f)
    def wrapper(*args, **kwargs):
        user = _get_user()
        if not user['username']:
            return (
                '<h1>401 – Not authenticated</h1>'
                '<p>Please access Gandalf through '
                '<a href="https://auth.lotusguild.org">auth.lotusguild.org</a>.</p>',
                401,
            )

        allowed = _config().get('auth', {}).get('allowed_groups', ['admin'])
        if not any(g in allowed for g in user['groups']):
            # The username originates from a request header and the group
            # names from configuration; escape both before embedding in HTML.
            safe_username = html.escape(user['username'])
            safe_groups = html.escape(', '.join(allowed))
            return (
                f'<h1>403 – Access denied</h1>'
                f'<p>Your account ({safe_username}) is not in an allowed group '
                f'({safe_groups}).</p>',
                403,
            )

        return f(*args, **kwargs)

    return wrapper
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Page routes
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@app.route('/')
def home():
    """Render ``index.html`` with the device list fetched from the UniFi API.

    The configuration is re-read and a new ``UnifiAPI`` client is created on
    every request.
    """
    client = UnifiAPI(load_config())
    return render_template('index.html', devices=client.get_devices())
|
||||
# NOTE(review): within this span no @app.route decorator is attached to this
# function — confirm it is registered for '/'.
@require_auth
def index():
    """Render the dashboard page from state stored in the database.

    Passes the current user, active events, status summary, the decoded
    ``network_snapshot`` state, the ``last_check`` state (default ``'Never'``)
    and active suppressions to ``index.html``.

    A stored snapshot that cannot be decoded is logged and replaced by ``{}``,
    so the page still renders; ``api_network`` falls back to an empty payload
    in the same situation.
    """
    user = _get_user()
    events = db.get_active_events()
    summary = db.get_status_summary()
    snapshot_raw = db.get_state('network_snapshot')
    last_check = db.get_state('last_check', 'Never')

    snapshot = {}
    if snapshot_raw:
        try:
            snapshot = json.loads(snapshot_raw)
        except (TypeError, ValueError):
            logger.warning('Stored network_snapshot is not valid JSON; rendering without it')

    suppressions = db.get_active_suppressions()
    return render_template(
        'index.html',
        user=user,
        events=events,
        summary=summary,
        snapshot=snapshot,
        last_check=last_check,
        suppressions=suppressions,
    )
|
||||
|
||||
|
||||
@app.route('/suppressions')
@require_auth
def suppressions_page():
    """Render the suppression management page.

    Passes the current user, active suppressions, the 50 most recent history
    entries and the decoded ``network_snapshot`` state to
    ``suppressions.html``.

    A stored snapshot that cannot be decoded is logged and replaced by ``{}``
    so the page still renders.
    """
    user = _get_user()
    active = db.get_active_suppressions()
    history = db.get_suppression_history(limit=50)
    snapshot_raw = db.get_state('network_snapshot')

    snapshot = {}
    if snapshot_raw:
        try:
            snapshot = json.loads(snapshot_raw)
        except (TypeError, ValueError):
            logger.warning('Stored network_snapshot is not valid JSON; rendering without it')

    return render_template(
        'suppressions.html',
        user=user,
        active=active,
        history=history,
        snapshot=snapshot,
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# API routes
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@app.route('/api/status')
def status():
    """Return the module-level ``device_status`` mapping as a JSON response."""
    response = jsonify(device_status)
    return response
|
||||
# NOTE(review): within this span no @app.route decorator is attached to this
# function — confirm it is registered for '/api/status'.
@require_auth
def api_status():
    """Return the status summary, last-check marker and active events as JSON.

    ``last_check`` falls back to ``'Never'`` when the state is not set.
    """
    summary = db.get_status_summary()
    last_check = db.get_state('last_check', 'Never')
    events = db.get_active_events()
    payload = {
        'summary': summary,
        'last_check': last_check,
        'events': events,
    }
    return jsonify(payload)
|
||||
|
||||
|
||||
@app.route('/api/network')
@require_auth
def api_network():
    """Return the stored ``network_snapshot`` state as JSON.

    When no snapshot is stored, or the stored value cannot be decoded, an
    empty payload ``{'hosts': {}, 'unifi': [], 'updated': None}`` is returned
    instead. A decode failure is logged rather than silently discarded.
    """
    raw = db.get_state('network_snapshot')
    if raw:
        try:
            snapshot = json.loads(raw)
        except (TypeError, ValueError):
            # json.JSONDecodeError is a ValueError; TypeError covers a stored
            # value that is not str/bytes.
            logger.warning('Stored network_snapshot is not valid JSON; returning empty payload')
        else:
            return jsonify(snapshot)
    return jsonify({'hosts': {}, 'unifi': [], 'updated': None})
|
||||
|
||||
|
||||
@app.route('/api/events')
@require_auth
def api_events():
    """Return active events plus recently resolved ones as JSON.

    Resolved events are requested with ``hours=24`` and ``limit=30``.
    """
    active_events = db.get_active_events()
    resolved_events = db.get_recent_resolved(hours=24, limit=30)
    return jsonify({'active': active_events, 'resolved': resolved_events})
|
||||
|
||||
|
||||
@app.route('/api/suppressions', methods=['GET'])
@require_auth
def api_get_suppressions():
    """Return the currently active suppressions as a JSON response."""
    active = db.get_active_suppressions()
    return jsonify(active)
|
||||
|
||||
|
||||
@app.route('/api/suppressions', methods=['POST'])
@require_auth
def api_create_suppression():
    """Create a suppression from a JSON request body.

    Body fields:
        target_type: One of ``host``, ``interface``, ``unifi_device``, ``all``
            (default ``host``).
        target_name: Required unless ``target_type`` is ``all``.
        target_detail: Optional.
        reason: Required.
        expires_minutes: Optional positive integer; absent, ``null``, ``0`` or
            empty means no expiry is passed on.

    Returns:
        ``{'success': True, 'id': <id>}`` on success, or
        ``{'error': <message>}`` with status 400 when validation fails.
        Malformed input (a non-object body, non-string text fields, a
        non-numeric or non-positive ``expires_minutes``) is rejected with 400
        rather than surfacing as a server error.
    """
    user = _get_user()
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        # Missing, unparseable, or non-object bodies are validated as empty.
        data = {}

    def _text(key):
        # Only string values are accepted; anything else counts as missing.
        value = data.get(key)
        return value.strip() if isinstance(value, str) else ''

    target_type = data.get('target_type', 'host')
    target_name = _text('target_name')
    target_detail = _text('target_detail')
    reason = _text('reason')
    expires_minutes = data.get('expires_minutes')  # None = manual/permanent

    if target_type not in ('host', 'interface', 'unifi_device', 'all'):
        return jsonify({'error': 'Invalid target_type'}), 400
    if target_type != 'all' and not target_name:
        return jsonify({'error': 'target_name required'}), 400
    if not reason:
        return jsonify({'error': 'reason required'}), 400

    if expires_minutes:
        try:
            expires_minutes = int(expires_minutes)
        except (TypeError, ValueError, OverflowError):
            return jsonify({'error': 'expires_minutes must be a positive integer'}), 400
        if expires_minutes <= 0:
            return jsonify({'error': 'expires_minutes must be a positive integer'}), 400
    else:
        expires_minutes = None

    sup_id = db.create_suppression(
        target_type=target_type,
        target_name=target_name,
        target_detail=target_detail,
        reason=reason,
        suppressed_by=user['username'],
        expires_minutes=expires_minutes,
    )
    logger.info(
        f'Suppression #{sup_id} created by {user["username"]}: '
        f'{target_type}/{target_name}/{target_detail} – {reason}'
    )
    return jsonify({'success': True, 'id': sup_id})
|
||||
|
||||
|
||||
@app.route('/api/suppressions/<int:sup_id>', methods=['DELETE'])
@require_auth
def api_delete_suppression(sup_id: int):
    """Deactivate the suppression with the given id and log who removed it.

    Always responds with ``{'success': True}``.
    """
    actor = _get_user()
    db.deactivate_suppression(sup_id)
    logger.info(f'Suppression #{sup_id} removed by {actor["username"]}')
    result = {'success': True}
    return jsonify(result)
|
||||
|
||||
|
||||
@app.route('/health')
def health():
    """Health check endpoint (no auth); returns a fixed JSON status document."""
    payload = {'status': 'ok', 'service': 'gandalf'}
    return jsonify(payload)
|
||||
|
||||
@app.route('/api/diagnostics')
def get_diagnostics():
    """Return per-device diagnostics keyed by device name as JSON.

    Re-reads the configuration, lists devices through ``UnifiAPI`` and asks
    for each device's diagnostics in turn. When two devices share a name the
    later one's entry wins.
    """
    client = UnifiAPI(load_config())
    report = {
        entry['name']: client.get_device_diagnostics(entry)
        for entry in client.get_devices()
    }
    return jsonify(report)
|
||||
|
||||
if __name__ == '__main__':
    # Start the status updater as a daemon thread before serving.
    # NOTE(review): `update_status` is not defined anywhere in this span —
    # confirm it exists in the module.
    threading.Thread(target=update_status, daemon=True).start()

    # NOTE(review): debug=True enables the interactive debugger, and the
    # second call binds to every interface; confirm this entry point is never
    # used outside local development.
    app.run(debug=True)
    app.run(debug=True, host='0.0.0.0', port=5000)
|
||||
|
||||
Reference in New Issue
Block a user