Files
gandalf/app.py
T
jared 9d6583a08a
Lint / Python (flake8) (push) Successful in 1m13s
Lint / JS (eslint) (push) Successful in 9s
Security / Python Security (bandit) (push) Failing after 45s
Test / Python Tests (pytest) (push) Successful in 57s
Lint / Notify on failure (push) Has been skipped
Lint / Deploy (push) Successful in 5s
Add LDAP avatar photos, UX polish, and TDS component upgrades
- Add /api/avatar endpoint querying lldap for user jpegPhoto; disk cache
  with sentinel pattern avoids repeat LDAP hits for users without photos
- Add ldap3 dependency and ldap config block to config.json
- Wire lt-avatar img overlay in base.html with capture-phase error
  fallback (lt-avatar-img-err) to reveal initials when image is absent
- Fix lt-avatar CSS shim: position:relative + absolute inset on img
  (local base.css was missing these; added to style.css)
- Replace all empty-state paragraphs with proper lt-empty-state markup
  (icon + title + body) across index, suppressions, inspector, app.js
- Add lt-spinner--cyan next to refresh button; shows during refreshAll()
- Replace inspector panel-section-title with lt-divider throughout
- Add data-tooltip attributes to SFP DOM metrics, TX/RX/Carrier/Duplex/
  Auto-neg/Error labels in links.html and inspector panel
- Add tooltips to events table column headers (Sev, First Seen, Failures)
- Fix links.html host panel timestamp (was reading sample.updated which
  is always undefined; now uses data.updated)
- Fix UniFi status text casing (Online→ONLINE to match server render)
- Remove dead topo-status-* class manipulation from updateTopology()
- Always render alert-count-badge; toggle display:none when count is 0
- Fix double UniFi get_devices() call in monitor.py run loop
- Fix chip-critical animation (was using green pulse-glow; now red)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-30 21:09:56 -04:00

560 lines
19 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Gandalf Global Advanced Network Detection And Link Facilitator.
Flask web application serving the monitoring dashboard and suppression
management UI. Authentication via Authelia forward-auth headers.
All monitoring and alerting is handled by the separate monitor.py daemon.
"""
import hashlib
import ipaddress
import json
import logging
import os
import re
import threading
import time
import uuid
from datetime import datetime, timezone
from functools import wraps
from flask import Flask, jsonify, make_response, render_template, request, send_file
import db
import diagnose
from monitor import PulseClient
# Root logging configuration for this web process (the monitor daemon is a
# separate process and configures its own logging).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s %(name)s %(message)s',
)
logger = logging.getLogger('gandalf.web')

# WSGI application object; served by gunicorn in production.
app = Flask(__name__)
# lt-avatar color-modifier classes; the empty string means the default color.
_AVATAR_COLORS = ['lt-avatar--orange', 'lt-avatar--green', 'lt-avatar--purple', '']


@app.template_filter('avatar_color')
def avatar_color_filter(name: str) -> str:
    """Jinja filter: deterministically pick an avatar color class for *name*.

    MD5 is used only as a stable, non-cryptographic hash here.
    """
    digest = hashlib.md5(name.encode()).hexdigest()  # nosec B324
    return _AVATAR_COLORS[int(digest, 16) % len(_AVATAR_COLORS)]
# Lazily-loaded contents of config.json; populated once under _cfg_lock.
_cfg = None
_cfg_lock = threading.Lock()
@app.context_processor
def inject_config():
    """Inject safe config values into all templates."""
    ticket_cfg = _config().get('ticket_api', {})
    web_url = ticket_cfg.get('web_url', 'http://t.lotusguild.org/ticket/')
    # Only a whitelisted subset of config is exposed to templates.
    return {'config': {'ticket_api': {'web_url': web_url}}}
# In-memory diagnostic job store { job_id: { status, result, created_at } }.
# All access must hold _diag_lock; entries are pruned by the background
# purge thread started below.
_diag_jobs: dict = {}
_diag_lock = threading.Lock()
def _purge_old_jobs_loop():
    """Background thread: drop stale diagnostic jobs and mark stuck ones failed."""
    while True:
        time.sleep(120)
        # Any job older than 10 minutes is removed outright (pollers get 404).
        cutoff = time.time() - 600
        stuck_cutoff = time.time() - 300  # 5 min: job still 'running' → thread must have crashed
        with _diag_lock:
            stale = [jid for jid, j in _diag_jobs.items() if j.get('created_at', 0) < cutoff]
            for jid in stale:
                del _diag_jobs[jid]
            # Worker threads always record a terminal status, so a job still
            # 'running' this long means its thread died without reporting.
            for jid, j in list(_diag_jobs.items()):
                if j['status'] == 'running' and j.get('created_at', 0) < stuck_cutoff:
                    j['status'] = 'done'
                    j['result'] = {'status': 'error', 'error': 'Diagnostic timed out (thread crash)'}
                    logger.error(f'Diagnostic job {jid} appeared stuck; marked as errored')


# Daemon thread started at import time; exits with the process.
_purge_thread = threading.Thread(target=_purge_old_jobs_loop, daemon=True)
_purge_thread.start()
def _config() -> dict:
    """Return the cached config.json contents, loading them once (thread-safe)."""
    global _cfg
    if _cfg is not None:
        return _cfg
    with _cfg_lock:
        # Double-checked: another thread may have loaded while we waited.
        if _cfg is None:
            with open('config.json') as fh:
                _cfg = json.load(fh)
    return _cfg
def _daemon_ok(last_check: str) -> bool:
"""Return True if monitor last checked within 20 minutes."""
if not last_check or last_check == 'Never':
return False
try:
ts = datetime.strptime(last_check, '%Y-%m-%d %H:%M:%S UTC').replace(tzinfo=timezone.utc)
return (datetime.now(timezone.utc) - ts).total_seconds() < 1200
except Exception:
return False
# ---------------------------------------------------------------------------
# Auth helpers
# ---------------------------------------------------------------------------
def _get_user() -> dict:
    """Build the current user's identity dict from Authelia forward-auth headers."""
    raw_groups = request.headers.get('Remote-Groups', '')
    groups = [part.strip() for part in raw_groups.split(',') if part.strip()]
    return {
        'username': request.headers.get('Remote-User', ''),
        'name': request.headers.get('Remote-Name', ''),
        'email': request.headers.get('Remote-Email', ''),
        'groups': groups,
    }
def require_auth(f):
    """Decorator: reject requests lacking an Authelia identity or allowed group."""
    @wraps(f)
    def wrapper(*args, **kwargs):
        user = _get_user()
        if not user['username']:
            body = (
                '<h1>401 Not authenticated</h1>'
                '<p>Please access Gandalf through '
                '<a href="https://auth.lotusguild.org">auth.lotusguild.org</a>.</p>'
            )
            return body, 401
        allowed = _config().get('auth', {}).get('allowed_groups', ['admin'])
        if any(g in allowed for g in user['groups']):
            return f(*args, **kwargs)
        body = (
            f'<h1>403 Access denied</h1>'
            f'<p>Your account ({user["username"]}) is not in an allowed group '
            f'({", ".join(allowed)}).</p>'
        )
        return body, 403
    return wrapper
# ---------------------------------------------------------------------------
# Page routes
# ---------------------------------------------------------------------------
# Max events returned per request; shared default for the index page and API.
_PAGE_LIMIT = 200  # max events returned per request
@app.route('/')
@require_auth
def index():
    """Dashboard page: active events, summary, snapshot, and suppressions."""
    last_check = db.get_state('last_check', 'Never')
    snapshot_raw = db.get_state('network_snapshot')
    context = {
        'user': _get_user(),
        'events': db.get_active_events(limit=_PAGE_LIMIT),
        'total_active': db.count_active_events(),
        'summary': db.get_status_summary(),
        'snapshot': json.loads(snapshot_raw) if snapshot_raw else {},
        'last_check': last_check,
        'suppressions': db.get_active_suppressions(),
        'recent_resolved': db.get_recent_resolved(hours=24, limit=10),
        'daemon_ok': _daemon_ok(last_check),
    }
    return render_template('index.html', **context)
@app.route('/links')
@require_auth
def links_page():
    """Render the link-statistics page (data is fetched client-side)."""
    return render_template('links.html', user=_get_user())
@app.route('/inspector')
@require_auth
def inspector():
    """Render the inspector page (data is fetched client-side)."""
    return render_template('inspector.html', user=_get_user())
@app.route('/suppressions')
@require_auth
def suppressions_page():
    """Suppression management UI: active rules, history, and host snapshot."""
    raw = db.get_state('network_snapshot')
    return render_template(
        'suppressions.html',
        user=_get_user(),
        active=db.get_active_suppressions(),
        history=db.get_suppression_history(limit=50),
        snapshot=json.loads(raw) if raw else {},
    )
# ---------------------------------------------------------------------------
# API routes
# ---------------------------------------------------------------------------
@app.route('/api/status')
@require_auth
def api_status():
    """JSON status: summary counts, active events, and monitor freshness."""
    last_check = db.get_state('last_check', 'Never')
    payload = {
        'summary': db.get_status_summary(),
        'last_check': last_check,
        'events': db.get_active_events(limit=_PAGE_LIMIT),
        'total_active': db.count_active_events(),
        'daemon_ok': _daemon_ok(last_check),
    }
    return jsonify(payload)
@app.route('/api/network')
@require_auth
def api_network():
    """Return the cached network snapshot, or an empty shell if absent/invalid."""
    raw = db.get_state('network_snapshot')
    if raw:
        try:
            snapshot = json.loads(raw)
        except Exception:
            # Fall through to the empty default on corrupt cache data.
            logger.error('Failed to parse network_snapshot JSON')
        else:
            return jsonify(snapshot)
    return jsonify({'hosts': {}, 'unifi': [], 'updated': None})
@app.route('/api/links')
@require_auth
def api_links():
    """Return cached link statistics, or an empty shell if absent/invalid."""
    raw = db.get_state('link_stats')
    if raw:
        try:
            stats = json.loads(raw)
        except Exception:
            # Fall through to the empty default on corrupt cache data.
            logger.error('Failed to parse link_stats JSON')
        else:
            return jsonify(stats)
    return jsonify({'hosts': {}, 'updated': None})
@app.route('/api/events')
@require_auth
def api_events():
    """Paginated events API.

    Query params:
        limit:  max active events to return (clamped to [0, 1000]).
        offset: pagination offset into active events (clamped to >= 0).
        status: 'active' (default), 'resolved', or 'all'.
    """
    try:
        # Clamp limit to [0, 1000]: previously a negative limit (?limit=-1)
        # slipped past min(..., 1000) and reached the DB layer, where SQLite
        # treats a negative LIMIT as "no limit" — bypassing the cap entirely.
        limit = min(max(int(request.args.get('limit', _PAGE_LIMIT)), 0), 1000)
        offset = max(int(request.args.get('offset', 0)), 0)
    except ValueError:
        return jsonify({'error': 'limit and offset must be integers'}), 400
    status_filter = request.args.get('status', 'active')
    if status_filter not in ('active', 'resolved', 'all'):
        return jsonify({'error': 'status must be active, resolved, or all'}), 400
    result: dict = {}
    if status_filter in ('active', 'all'):
        result['active'] = db.get_active_events(limit=limit, offset=offset)
        result['total_active'] = db.count_active_events()
    if status_filter in ('resolved', 'all'):
        # Resolved window is fixed; limit/offset apply only to active events.
        result['resolved'] = db.get_recent_resolved(hours=24, limit=30)
    return jsonify(result)
@app.route('/api/suppressions', methods=['GET'])
@require_auth
def api_get_suppressions():
    """List currently active suppressions as JSON."""
    active = db.get_active_suppressions()
    return jsonify(active)
@app.route('/api/suppressions', methods=['POST'])
@require_auth
def api_create_suppression():
    """Create an alert suppression from a JSON body.

    Body fields: target_type, target_name, target_detail, reason,
    expires_minutes (falsy → manual/permanent suppression).
    Returns {'success': True, 'id': <new id>} or a 400 validation error.
    """
    user = _get_user()
    data = request.get_json(silent=True) or {}
    target_type = data.get('target_type', 'host')
    target_name = (data.get('target_name') or '').strip()
    target_detail = (data.get('target_detail') or '').strip()
    reason = (data.get('reason') or '').strip()
    expires_minutes = data.get('expires_minutes')  # None = manual/permanent
    if target_type not in ('host', 'interface', 'unifi_device', 'all'):
        return jsonify({'error': 'Invalid target_type'}), 400
    if target_type != 'all' and not target_name:
        return jsonify({'error': 'target_name required'}), 400
    if not reason:
        return jsonify({'error': 'reason required'}), 400
    if len(reason) > 500:
        return jsonify({'error': 'reason must be 500 characters or fewer'}), 400
    if len(target_name) > 255:
        return jsonify({'error': 'target_name must be 255 characters or fewer'}), 400
    if len(target_detail) > 255:
        return jsonify({'error': 'target_detail must be 255 characters or fewer'}), 400
    # Validate expires_minutes up front: previously a non-numeric value raised
    # an uncaught ValueError at the db.create_suppression call and surfaced as
    # an HTTP 500 instead of a client error.
    try:
        expires = int(expires_minutes) if expires_minutes else None
    except (TypeError, ValueError):
        return jsonify({'error': 'expires_minutes must be an integer'}), 400
    sup_id = db.create_suppression(
        target_type=target_type,
        target_name=target_name,
        target_detail=target_detail,
        reason=reason,
        suppressed_by=user['username'],
        expires_minutes=expires,
    )
    logger.info(
        f'Suppression #{sup_id} created by {user["username"]}: '
        f'{target_type}/{target_name}/{target_detail} {reason}'
    )
    return jsonify({'success': True, 'id': sup_id})
@app.route('/api/suppressions/<int:sup_id>', methods=['DELETE'])
@require_auth
def api_delete_suppression(sup_id: int):
    """Deactivate suppression *sup_id* and log who removed it."""
    username = _get_user()['username']
    db.deactivate_suppression(sup_id)
    logger.info(f'Suppression #{sup_id} removed by {username}')
    return jsonify({'success': True})
@app.route('/api/diagnose', methods=['POST'])
@require_auth
def api_diagnose_start():
    """Start a link diagnostic job. Returns {job_id}.

    Resolves the SSH target (host IP + server interface) for a switch port
    using the cached link_stats snapshot and its LLDP neighbor data, then
    runs the diagnostics in a background thread. Clients poll
    /api/diagnose/<job_id> for the result.
    """
    data = request.get_json(silent=True) or {}
    switch_name = (data.get('switch_name') or '').strip()
    try:
        port_idx = int(data.get('port_idx'))
    except (TypeError, ValueError):
        return jsonify({'error': 'port_idx must be an integer'}), 400
    if not switch_name:
        return jsonify({'error': 'switch_name and port_idx required'}), 400
    # Look up switch + port in cached link_stats
    raw = db.get_state('link_stats')
    if not raw:
        return jsonify({'error': 'No link_stats data available'}), 503
    try:
        link_data = json.loads(raw)
    except Exception:
        logger.error('Failed to parse link_stats JSON in /api/diagnose')
        return jsonify({'error': 'Internal data error'}), 500
    switches = link_data.get('unifi_switches', {})
    sw = switches.get(switch_name)
    if not sw:
        return jsonify({'error': f'Switch "{switch_name}" not found'}), 404
    # Find port by port_idx
    port_data = None
    for pname, pd in sw.get('ports', {}).items():
        if pd.get('port_idx') == port_idx:
            port_data = dict(pd)  # copy before annotating with the port's name
            port_data['name'] = pname
            break
    if not port_data:
        return jsonify({'error': f'Port {port_idx} not found on switch "{switch_name}"'}), 404
    # LLDP neighbor required to know which host+iface to SSH into
    lldp = port_data.get('lldp')
    if not lldp or not lldp.get('system_name'):
        return jsonify({'error': 'No LLDP neighbor data for this port'}), 400
    server_name = lldp['system_name']
    lldp_port_id = lldp.get('port_id', '')
    # Find matching host + interface in link_stats hosts
    hosts = link_data.get('hosts', {})
    server_ifaces = hosts.get(server_name)
    if not server_ifaces:
        return jsonify({'error': f'Host "{server_name}" not in link stats'}), 404
    # Match interface by LLDP port_id: exact match first, then substring in
    # either direction, then fall back to the host's first known interface.
    matched_iface = None
    if lldp_port_id and lldp_port_id in server_ifaces:
        matched_iface = lldp_port_id
    if not matched_iface and lldp_port_id:
        matched_iface = next(
            (k for k in server_ifaces if lldp_port_id in k or k in lldp_port_id),
            None
        )
    if not matched_iface:
        matched_iface = next(iter(server_ifaces), None)
    if not matched_iface:
        return jsonify({'error': 'Cannot determine server interface'}), 400
    # Resolve host IP from link_stats host data
    host_ip = (server_ifaces.get(matched_iface) or {}).get('host_ip')
    if not host_ip:
        # Fallback: use LLDP mgmt IPs
        mgmt_ips = lldp.get('mgmt_ips') or []
        host_ip = mgmt_ips[0] if mgmt_ips else None
    if not host_ip:
        return jsonify({'error': 'Cannot determine host IP for SSH'}), 400
    # Validate resolved values before passing to SSH command builder
    try:
        ipaddress.ip_address(host_ip)
    except ValueError:
        logger.error(f'Refusing diagnostic: invalid host_ip "{host_ip}" for {server_name}')
        return jsonify({'error': 'Resolved host IP is not a valid IP address'}), 400
    if not re.fullmatch(r'[a-zA-Z0-9._-]+', matched_iface):
        logger.error(f'Refusing diagnostic: invalid iface "{matched_iface}" for {server_name}')
        return jsonify({'error': 'Resolved interface name contains invalid characters'}), 400
    job_id = str(uuid.uuid4())
    with _diag_lock:
        _diag_jobs[job_id] = {'status': 'running', 'result': None, 'created_at': time.time()}

    def _run():
        # Worker thread: always records a terminal status so pollers and the
        # purge loop can tell success, failure, and crashed threads apart.
        try:
            cfg = _config()
            pulse = PulseClient(cfg)
            runner = diagnose.DiagnosticsRunner(pulse)
            result = runner.run(host_ip, server_name, matched_iface, port_data)
        except Exception as e:
            logger.error(f'Diagnostic job {job_id} failed: {e}', exc_info=True)
            result = {'status': 'error', 'error': str(e)}
        with _diag_lock:
            # Job may have been purged while we ran; only update if present.
            if job_id in _diag_jobs:
                _diag_jobs[job_id]['status'] = 'done'
                _diag_jobs[job_id]['result'] = result

    t = threading.Thread(target=_run, daemon=True)
    t.start()
    return jsonify({'job_id': job_id})
@app.route('/api/diagnose/<job_id>', methods=['GET'])
@require_auth
def api_diagnose_poll(job_id: str):
    """Poll a diagnostic job. Returns {status, result}."""
    with _diag_lock:
        job = _diag_jobs.get(job_id)
    if job is None:
        return jsonify({'error': 'Job not found'}), 404
    return jsonify({'status': job['status'], 'result': job.get('result')})
@app.route('/api/avatar')
@require_auth
def api_avatar():
    """Serve the current user's LDAP avatar photo (JPEG), cached to disk.

    Cache layout: one .jpg file per user, plus a .none sentinel file that
    marks users known to have no avatar (so repeat requests skip LDAP).
    Returns 404 whenever no usable avatar is available.
    """
    username = request.headers.get('Remote-User', '').strip()
    if not username:
        return '', 404
    ldap_cfg = _config().get('ldap', {})
    if not ldap_cfg.get('host') or not ldap_cfg.get('bind_dn'):
        # LDAP not configured → avatars silently disabled.
        return '', 404
    # Build a safe cache filename from the username (alphanumeric + - _ .)
    safe_name = re.sub(r'[^a-zA-Z0-9._-]', '_', username)
    cache_dir = ldap_cfg.get('cache_dir', '/tmp/gandalf_avatars')
    os.makedirs(cache_dir, exist_ok=True)
    cache_file = os.path.join(cache_dir, f'user_{safe_name}.jpg')
    sentinel = os.path.join(cache_dir, f'user_{safe_name}.none')
    cache_ttl = int(ldap_cfg.get('cache_ttl', 3600))
    now = time.time()
    # Serve cached image if fresh (conditional=True enables 304 responses)
    if os.path.exists(cache_file) and now - os.path.getmtime(cache_file) < cache_ttl:
        return send_file(cache_file, mimetype='image/jpeg',
                         max_age=cache_ttl, conditional=True)
    # Skip LDAP if we already know this user has no avatar
    if os.path.exists(sentinel) and now - os.path.getmtime(sentinel) < cache_ttl:
        return '', 404
    # Query lldap
    avatar_data = None
    try:
        # Imported lazily so the app still runs when ldap3 isn't installed.
        import ldap3
        server = ldap3.Server(ldap_cfg['host'], port=int(ldap_cfg.get('port', 3890)))
        conn = ldap3.Connection(server,
                                user=ldap_cfg['bind_dn'],
                                password=ldap_cfg.get('bind_pw', ''),
                                auto_bind=True, receive_timeout=5)
        # Escape the uid to prevent LDAP filter injection.
        safe_uid = ldap3.utils.conv.escape_filter_chars(username)
        conn.search(ldap_cfg.get('user_base', 'ou=people,dc=example,dc=com'),
                    f'(uid={safe_uid})', attributes=['avatar'])
        if conn.entries and conn.entries[0]['avatar'].value:
            avatar_data = conn.entries[0]['avatar'].value
        conn.unbind()
    except ImportError:
        logger.error('ldap3 not installed — run: pip install ldap3')
        return '', 404
    except Exception as e:
        logger.error(f'LDAP avatar lookup failed for {username}: {e}')
        return '', 404
    if not avatar_data or len(avatar_data) < 100:
        # Absent or implausibly small payload → record the miss via sentinel.
        open(sentinel, 'w').close()
        return '', 404
    # Validate JPEG magic bytes (FF D8 FF)
    if isinstance(avatar_data, str):
        avatar_data = avatar_data.encode('latin-1')
    if avatar_data[:3] != b'\xFF\xD8\xFF':
        logger.warning(f'Non-JPEG avatar data for {username}')
        open(sentinel, 'w').close()
        return '', 404
    with open(cache_file, 'wb') as f:
        f.write(avatar_data)
    # User now has an avatar; clear any stale "no avatar" sentinel.
    if os.path.exists(sentinel):
        os.unlink(sentinel)
    resp = make_response(avatar_data)
    resp.headers['Content-Type'] = 'image/jpeg'
    resp.headers['Cache-Control'] = f'private, max-age={cache_ttl}'
    return resp
@app.route('/health')
def health():
    """Health check endpoint (no auth). Checks DB and monitor freshness."""
    checks = {}
    overall = 'ok'
    # DB connectivity check
    try:
        db.get_state('last_check')
        checks['db'] = 'ok'
    except Exception as e:
        checks['db'] = f'error: {e}'
        overall = 'degraded'
    # Monitor freshness: degrade if last_check is older than 20 minutes
    try:
        last_check = db.get_state('last_check', '')
        if not last_check:
            checks['monitor'] = 'no data yet'
        else:
            ts = datetime.strptime(last_check, '%Y-%m-%d %H:%M:%S UTC').replace(tzinfo=timezone.utc)
            age_s = (datetime.now(timezone.utc) - ts).total_seconds()
            if age_s > 1200:
                checks['monitor'] = f'stale ({int(age_s)}s since last check)'
                overall = 'degraded'
            else:
                checks['monitor'] = f'ok ({int(age_s)}s ago)'
    except Exception as e:
        checks['monitor'] = f'error: {e}'
        overall = 'degraded'
    code = 200 if overall == 'ok' else 503
    return jsonify({'status': overall, 'service': 'gandalf', 'checks': checks}), code
if __name__ == '__main__':
    # Development entry point only (see nosec note); bandit B201/B104 suppressed.
    app.run(debug=True, host='0.0.0.0', port=5000)  # nosec B201 B104 — dev runner only; production uses gunicorn