2026-03-01 23:03:18 -05:00
|
|
|
|
"""Gandalf – Global Advanced Network Detection And Link Facilitator.
|
|
|
|
|
|
|
|
|
|
|
|
Flask web application serving the monitoring dashboard and suppression
|
|
|
|
|
|
management UI. Authentication via Authelia forward-auth headers.
|
|
|
|
|
|
All monitoring and alerting is handled by the separate monitor.py daemon.
|
|
|
|
|
|
"""
|
2026-03-12 17:30:50 -04:00
|
|
|
|
import ipaddress
|
2025-01-04 01:42:16 -05:00
|
|
|
|
import json
|
2026-03-01 23:03:18 -05:00
|
|
|
|
import logging
|
2026-03-12 17:30:50 -04:00
|
|
|
|
import re
|
feat: deep link diagnostics via Pulse SSH
Adds comprehensive per-port link troubleshooting triggered from the
Inspector panel when a port has an LLDP-identified server counterpart.
- diagnose.py: DiagnosticsRunner with 15-section SSH command (carrier,
operstate, sysfs counters, ethtool, ethtool -i/-a/-g/-S/-m, ip link,
ip addr, ip route, dmesg, lldpctl); parsers for all sections; health
analyzer with 14 check codes (NO_CARRIER, HALF_DUPLEX, SPEED_MISMATCH,
SFP_RX_CRITICAL, CARRIER_FLAPPING, CRC_ERRORS_HIGH, LLDP_MISMATCH, etc.)
- monitor.py: PulseClient now tracks last_execution_id so callers can
link back to the raw Pulse execution URL
- app.py: POST /api/diagnose + GET /api/diagnose/<job_id> with daemon
thread background execution and 10-minute in-memory job store
- inspector.html: "Run Link Diagnostics" button (shown only when LLDP
host is resolvable); full results panel: health banner, physical layer,
SFP/DOM with power bars, NIC error counters, collapsible ethtool -S,
flow control/ring buffers, driver info, LLDP 2-col validation,
collapsible dmesg, switch port summary, "View in Pulse" link
- style.css: all .diag-* CSS classes with terminal aesthetic
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-03 16:03:54 -05:00
|
|
|
|
import threading
|
|
|
|
|
|
import time
|
|
|
|
|
|
import uuid
|
2026-03-01 23:03:18 -05:00
|
|
|
|
from functools import wraps
|
|
|
|
|
|
|
|
|
|
|
|
from flask import Flask, jsonify, redirect, render_template, request, url_for
|
|
|
|
|
|
|
|
|
|
|
|
import db
|
feat: deep link diagnostics via Pulse SSH
Adds comprehensive per-port link troubleshooting triggered from the
Inspector panel when a port has an LLDP-identified server counterpart.
- diagnose.py: DiagnosticsRunner with 15-section SSH command (carrier,
operstate, sysfs counters, ethtool, ethtool -i/-a/-g/-S/-m, ip link,
ip addr, ip route, dmesg, lldpctl); parsers for all sections; health
analyzer with 14 check codes (NO_CARRIER, HALF_DUPLEX, SPEED_MISMATCH,
SFP_RX_CRITICAL, CARRIER_FLAPPING, CRC_ERRORS_HIGH, LLDP_MISMATCH, etc.)
- monitor.py: PulseClient now tracks last_execution_id so callers can
link back to the raw Pulse execution URL
- app.py: POST /api/diagnose + GET /api/diagnose/<job_id> with daemon
thread background execution and 10-minute in-memory job store
- inspector.html: "Run Link Diagnostics" button (shown only when LLDP
host is resolvable); full results panel: health banner, physical layer,
SFP/DOM with power bars, NIC error counters, collapsible ethtool -S,
flow control/ring buffers, driver info, LLDP 2-col validation,
collapsible dmesg, switch port summary, "View in Pulse" link
- style.css: all .diag-* CSS classes with terminal aesthetic
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-03 16:03:54 -05:00
|
|
|
|
import diagnose
|
|
|
|
|
|
from monitor import PulseClient
|
2026-03-01 23:03:18 -05:00
|
|
|
|
|
|
|
|
|
|
# Process-wide logging: INFO and above, timestamped, tagged with logger name.
logging.basicConfig(
    format='%(asctime)s %(levelname)s %(name)s %(message)s',
    level=logging.INFO,
)

# Logger used by everything in this web module.
logger = logging.getLogger('gandalf.web')
|
|
|
|
|
|
|
2025-01-04 01:42:16 -05:00
|
|
|
|
# Flask application object; every @app.route in this module registers on it.
app = Flask(__name__)
|
2026-03-01 23:03:18 -05:00
|
|
|
|
|
|
|
|
|
|
# Cached parsed config.json; stays None until _config() loads it on first use.
_cfg = None
|
|
|
|
|
|
|
feat: deep link diagnostics via Pulse SSH
Adds comprehensive per-port link troubleshooting triggered from the
Inspector panel when a port has an LLDP-identified server counterpart.
- diagnose.py: DiagnosticsRunner with 15-section SSH command (carrier,
operstate, sysfs counters, ethtool, ethtool -i/-a/-g/-S/-m, ip link,
ip addr, ip route, dmesg, lldpctl); parsers for all sections; health
analyzer with 14 check codes (NO_CARRIER, HALF_DUPLEX, SPEED_MISMATCH,
SFP_RX_CRITICAL, CARRIER_FLAPPING, CRC_ERRORS_HIGH, LLDP_MISMATCH, etc.)
- monitor.py: PulseClient now tracks last_execution_id so callers can
link back to the raw Pulse execution URL
- app.py: POST /api/diagnose + GET /api/diagnose/<job_id> with daemon
thread background execution and 10-minute in-memory job store
- inspector.html: "Run Link Diagnostics" button (shown only when LLDP
host is resolvable); full results panel: health banner, physical layer,
SFP/DOM with power bars, NIC error counters, collapsible ethtool -S,
flow control/ring buffers, driver info, LLDP 2-col validation,
collapsible dmesg, switch port summary, "View in Pulse" link
- style.css: all .diag-* CSS classes with terminal aesthetic
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-03 16:03:54 -05:00
|
|
|
|
# In-memory diagnostic job store, keyed by job id:
#   { job_id: { 'status': ..., 'result': ..., 'created_at': ... } }
# Contents live only in this process and are lost on restart.
_diag_jobs: dict = {}

# Held around accesses to _diag_jobs made by request handlers, diagnostic
# worker threads and the purge thread.
_diag_lock = threading.Lock()
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-03-12 17:30:50 -04:00
|
|
|
|
def _purge_old_jobs_loop():
    """Housekeeping loop for the in-memory diagnostic job store; never returns.

    Every 120 seconds, while holding ``_diag_lock``:

    * jobs whose ``created_at`` is more than 600 seconds old are deleted;
    * remaining jobs still marked ``'running'`` after 300 seconds are flipped
      to ``'done'`` with an error result, on the assumption that their worker
      thread died without reporting back.
    """
    while True:
        time.sleep(120)
        cutoff = time.time() - 600
        # 5 min: a job still 'running' after this long is treated as crashed.
        stuck_cutoff = time.time() - 300

        with _diag_lock:
            # Collect ids first so the dict is not mutated while iterating it.
            expired_ids = [
                jid
                for jid, job in _diag_jobs.items()
                if job.get('created_at', 0) < cutoff
            ]
            for jid in expired_ids:
                del _diag_jobs[jid]

            for jid, job in _diag_jobs.items():
                if job['status'] != 'running':
                    continue
                if not (job.get('created_at', 0) < stuck_cutoff):
                    continue
                job['status'] = 'done'
                job['result'] = {'status': 'error', 'error': 'Diagnostic timed out (thread crash)'}
                logger.error(f'Diagnostic job {jid} appeared stuck; marked as errored')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Launch the job-store housekeeping loop at import time. It is a daemon
# thread, so it does not keep the process alive at shutdown.
_purge_thread = threading.Thread(daemon=True, target=_purge_old_jobs_loop)
_purge_thread.start()
|
feat: deep link diagnostics via Pulse SSH
Adds comprehensive per-port link troubleshooting triggered from the
Inspector panel when a port has an LLDP-identified server counterpart.
- diagnose.py: DiagnosticsRunner with 15-section SSH command (carrier,
operstate, sysfs counters, ethtool, ethtool -i/-a/-g/-S/-m, ip link,
ip addr, ip route, dmesg, lldpctl); parsers for all sections; health
analyzer with 14 check codes (NO_CARRIER, HALF_DUPLEX, SPEED_MISMATCH,
SFP_RX_CRITICAL, CARRIER_FLAPPING, CRC_ERRORS_HIGH, LLDP_MISMATCH, etc.)
- monitor.py: PulseClient now tracks last_execution_id so callers can
link back to the raw Pulse execution URL
- app.py: POST /api/diagnose + GET /api/diagnose/<job_id> with daemon
thread background execution and 10-minute in-memory job store
- inspector.html: "Run Link Diagnostics" button (shown only when LLDP
host is resolvable); full results panel: health banner, physical layer,
SFP/DOM with power bars, NIC error counters, collapsible ethtool -S,
flow control/ring buffers, driver info, LLDP 2-col validation,
collapsible dmesg, switch port summary, "View in Pulse" link
- style.css: all .diag-* CSS classes with terminal aesthetic
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-03 16:03:54 -05:00
|
|
|
|
|
2026-03-01 23:03:18 -05:00
|
|
|
|
|
|
|
|
|
|
def _config() -> dict:
    """Return the parsed contents of ``config.json``, loading it on first use.

    The file is opened relative to the current working directory and the
    parsed result is cached in the module-level ``_cfg``; later calls return
    the cached object without touching the filesystem.

    Returns:
        The parsed configuration (the same object on every call).

    Raises:
        OSError: if ``config.json`` cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    global _cfg
    if _cfg is None:
        # Decode explicitly as UTF-8 instead of relying on the locale's
        # default encoding, which can mis-decode non-ASCII config values.
        with open('config.json', encoding='utf-8') as f:
            _cfg = json.load(f)
    return _cfg
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# Auth helpers
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
def _get_user() -> dict:
    """Describe the current requester using the forward-auth request headers.

    Returns:
        A dict with ``username``, ``name`` and ``email`` (each an empty string
        when its header is absent) and ``groups``, the comma-separated
        ``Remote-Groups`` header split into a list with surrounding whitespace
        and empty entries removed.
    """
    headers = request.headers
    raw_groups = headers.get('Remote-Groups', '')
    groups = [part.strip() for part in raw_groups.split(',') if part.strip()]
    return {
        'username': headers.get('Remote-User', ''),
        'name': headers.get('Remote-Name', ''),
        'email': headers.get('Remote-Email', ''),
        'groups': groups,
    }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def require_auth(f):
    """Decorator that gates a view behind the forward-auth identity headers.

    The wrapped view runs only when the request carries a non-empty username
    and at least one of the user's groups is listed in the ``auth`` ->
    ``allowed_groups`` config entry (defaulting to ``['admin']``).

    Args:
        f: the view function to protect.

    Returns:
        A wrapper that returns an ``(html, 401)`` tuple when no username is
        present, an ``(html, 403)`` tuple when the user is in no allowed
        group, and otherwise the result of calling ``f``.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    @wraps(f)
    def wrapper(*args, **kwargs):
        user = _get_user()
        if not user['username']:
            return (
                '<h1>401 – Not authenticated</h1>'
                '<p>Please access Gandalf through '
                '<a href="https://auth.lotusguild.org">auth.lotusguild.org</a>.</p>',
                401,
            )

        allowed = _config().get('auth', {}).get('allowed_groups', ['admin'])
        if not any(g in allowed for g in user['groups']):
            # The username comes from a request header and the group names
            # from config; escape both before embedding them in HTML so
            # neither can inject markup into the error page.
            safe_username = escape(user['username'])
            safe_groups = escape(', '.join(str(g) for g in allowed))
            return (
                '<h1>403 – Access denied</h1>'
                f'<p>Your account ({safe_username}) is not in an allowed group '
                f'({safe_groups}).</p>',
                403,
            )
        return f(*args, **kwargs)

    return wrapper
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# Page routes
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
2025-02-08 00:03:01 -05:00
|
|
|
|
|
|
|
|
|
|
@app.route('/')
@require_auth
def index():
    """Render the main dashboard page.

    Reads active events, the status summary, the stored network snapshot, the
    last-check marker and the active suppressions from ``db`` and passes them
    to ``index.html``.

    A stored snapshot that cannot be parsed as JSON is logged and replaced by
    an empty dict, so a corrupt row degrades the page instead of producing a
    500 (the same fallback approach ``/api/network`` uses).
    """
    user = _get_user()
    events = db.get_active_events()
    summary = db.get_status_summary()
    snapshot_raw = db.get_state('network_snapshot')
    last_check = db.get_state('last_check', 'Never')

    snapshot = {}
    if snapshot_raw:
        try:
            snapshot = json.loads(snapshot_raw)
        except (TypeError, ValueError):
            logger.error('Failed to parse network_snapshot JSON', exc_info=True)

    suppressions = db.get_active_suppressions()
    return render_template(
        'index.html',
        user=user,
        events=events,
        summary=summary,
        snapshot=snapshot,
        last_check=last_check,
        suppressions=suppressions,
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
feat: terminal aesthetic rewrite + link debug page
- Full dark terminal aesthetic (Pulse/TinkerTickets style):
- #0a0a0a background, #00ff41 green, #ffb000 amber, #00ffff cyan
- CRT scanline overlay, phosphor glow, ASCII corner pseudoelements
- Bracket-notation badges [CRITICAL], monospace font throughout
- style.css, base.html, index.html, suppressions.html all rewritten
- New Link Debug page (/links, /api/links):
- Per-host, per-interface cards with speed/duplex/port type/auto-neg
- Traffic bars (TX cyan, RX green) with rate labels
- Error/drop counters, carrier change history
- SFP/DOM optical panel: vendor, temp, voltage, bias, TX/RX power dBm bars
- RX-TX delta shown; color-coded warn/crit thresholds
- Auto-refresh every 60s, anchor-jump to #hostname
- LinkStatsCollector in monitor.py:
- SSHes to each host (one connection, all ifaces batched)
- Parses ethtool + ethtool -m (SFP DOM) output
- Merges with Prometheus traffic/error/carrier metrics
- Stores as link_stats in monitor_state table
- config.json: added ssh section for ethtool collection
- app.js: terminal chip style consistency (uppercase, ● bullet)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-02 12:43:11 -05:00
|
|
|
|
@app.route('/links')
@require_auth
def links_page():
    """Render ``links.html`` for the authenticated user."""
    return render_template('links.html', user=_get_user())
|
|
|
|
|
|
|
|
|
|
|
|
|
feat: inspector page, link debug enhancements, security hardening
- Add /inspector page: visual model-accurate switch chassis diagrams
(USF5P, USL8A, US24PRO, USPPDUP, USMINI), clickable port blocks
with color coding (green=up, amber=PoE, cyan=uplink, grey=down),
detail panel with stats/PoE/LLDP, LLDP-based path debug side-by-side
- Link Debug: port number badges (#N), LLDP neighbor line, PoE class/max,
collapsible host/switch panels with sessionStorage persistence
- monitor.py: collect LLDP neighbor map + PoE class/max/mode per switch
port; PulseClient uses requests.Session() for HTTP keep-alive; add
shlex.quote() around interface names (defense-in-depth)
- Security: suppress buttons use data-* attrs + delegated click handler
instead of inline onclick with Jinja2 variable interpolation; remove
| safe filter from user-controlled fields in suppressions.html;
setDuration() takes explicit el param instead of implicit event global
- db.py: thread-local connection reuse with ping(reconnect=True) to
avoid a new TCP handshake per query
- .gitignore: add config.json (contains credentials), __pycache__
- README: full rewrite covering architecture, all 4 pages, alert logic,
config reference, deployment, troubleshooting, security notes
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-03 15:39:48 -05:00
|
|
|
|
@app.route('/inspector')
@require_auth
def inspector():
    """Render ``inspector.html`` for the authenticated user."""
    return render_template('inspector.html', user=_get_user())
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-03-01 23:03:18 -05:00
|
|
|
|
@app.route('/suppressions')
@require_auth
def suppressions_page():
    """Render the suppression management page.

    Passes the active suppressions, the 50 most recent history entries and
    the stored network snapshot to ``suppressions.html``.

    A stored snapshot that cannot be parsed as JSON is logged and replaced by
    an empty dict, so a corrupt row degrades the page instead of producing a
    500 (the same fallback approach ``/api/network`` uses).
    """
    user = _get_user()
    active = db.get_active_suppressions()
    history = db.get_suppression_history(limit=50)
    snapshot_raw = db.get_state('network_snapshot')

    snapshot = {}
    if snapshot_raw:
        try:
            snapshot = json.loads(snapshot_raw)
        except (TypeError, ValueError):
            logger.error('Failed to parse network_snapshot JSON', exc_info=True)

    return render_template(
        'suppressions.html',
        user=user,
        active=active,
        history=history,
        snapshot=snapshot,
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# API routes
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
2025-02-07 21:03:31 -05:00
|
|
|
|
|
|
|
|
|
|
@app.route('/api/status')
@require_auth
def api_status():
    """Return the status summary, last-check marker and active events as JSON."""
    summary = db.get_status_summary()
    last_check = db.get_state('last_check', 'Never')
    events = db.get_active_events()
    payload = {
        'summary': summary,
        'last_check': last_check,
        'events': events,
    }
    return jsonify(payload)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.route('/api/network')
@require_auth
def api_network():
    """Return the stored ``network_snapshot`` state as JSON.

    When no snapshot is stored, or the stored value cannot be parsed or
    serialised, an empty skeleton (``hosts``, ``unifi``, ``updated``) is
    returned instead so clients always receive the same top-level shape.
    """
    raw = db.get_state('network_snapshot')
    if raw:
        try:
            return jsonify(json.loads(raw))
        except Exception:
            # Request boundary: fall back to the empty skeleton, but record
            # the traceback so a corrupt snapshot can be diagnosed.
            logger.error('Failed to parse network_snapshot JSON', exc_info=True)
    return jsonify({'hosts': {}, 'unifi': [], 'updated': None})
|
|
|
|
|
|
|
|
|
|
|
|
|
feat: terminal aesthetic rewrite + link debug page
- Full dark terminal aesthetic (Pulse/TinkerTickets style):
- #0a0a0a background, #00ff41 green, #ffb000 amber, #00ffff cyan
- CRT scanline overlay, phosphor glow, ASCII corner pseudoelements
- Bracket-notation badges [CRITICAL], monospace font throughout
- style.css, base.html, index.html, suppressions.html all rewritten
- New Link Debug page (/links, /api/links):
- Per-host, per-interface cards with speed/duplex/port type/auto-neg
- Traffic bars (TX cyan, RX green) with rate labels
- Error/drop counters, carrier change history
- SFP/DOM optical panel: vendor, temp, voltage, bias, TX/RX power dBm bars
- RX-TX delta shown; color-coded warn/crit thresholds
- Auto-refresh every 60s, anchor-jump to #hostname
- LinkStatsCollector in monitor.py:
- SSHes to each host (one connection, all ifaces batched)
- Parses ethtool + ethtool -m (SFP DOM) output
- Merges with Prometheus traffic/error/carrier metrics
- Stores as link_stats in monitor_state table
- config.json: added ssh section for ethtool collection
- app.js: terminal chip style consistency (uppercase, ● bullet)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-02 12:43:11 -05:00
|
|
|
|
@app.route('/api/links')
@require_auth
def api_links():
    """Return the stored ``link_stats`` state as JSON.

    When nothing is stored, or the stored value cannot be parsed or
    serialised, an empty skeleton (``hosts``, ``updated``) is returned
    instead so clients always receive the same top-level shape.
    """
    raw = db.get_state('link_stats')
    if raw:
        try:
            return jsonify(json.loads(raw))
        except Exception:
            # Request boundary: fall back to the empty skeleton, but record
            # the traceback so corrupt link stats can be diagnosed.
            logger.error('Failed to parse link_stats JSON', exc_info=True)
    return jsonify({'hosts': {}, 'updated': None})
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-03-01 23:03:18 -05:00
|
|
|
|
@app.route('/api/events')
@require_auth
def api_events():
    """Return active events plus recently resolved ones as JSON.

    The resolved list is requested from ``db`` with ``hours=24`` and
    ``limit=30``.
    """
    active = db.get_active_events()
    resolved = db.get_recent_resolved(hours=24, limit=30)
    return jsonify({'active': active, 'resolved': resolved})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.route('/api/suppressions', methods=['GET'])
@require_auth
def api_get_suppressions():
    """Return the currently active suppressions as JSON."""
    active = db.get_active_suppressions()
    return jsonify(active)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.route('/api/suppressions', methods=['POST'])
@require_auth
def api_create_suppression():
    """Create an alert suppression from a JSON request body.

    Body fields:
        target_type: one of ``host``, ``interface``, ``unifi_device``, ``all``
            (defaults to ``host``).
        target_name: required unless ``target_type`` is ``all``.
        target_detail: optional free text.
        reason: required.
        expires_minutes: optional positive integer; a missing or falsy value
            is passed to ``db`` as ``None`` (manual/permanent).

    Returns:
        ``{'success': True, 'id': <new id>}`` on success, or
        ``{'error': ...}`` with HTTP 400 when validation fails. Malformed
        input (non-object body, non-string text fields, non-numeric or
        negative ``expires_minutes``) is rejected with 400 rather than
        raising and surfacing as a 500.
    """
    user = _get_user()
    data = request.get_json(silent=True) or {}
    if not isinstance(data, dict):
        return jsonify({'error': 'JSON object required'}), 400

    def _text(key: str) -> str:
        """Return the stripped string at *key*; non-strings count as missing."""
        value = data.get(key)
        return value.strip() if isinstance(value, str) else ''

    target_type = data.get('target_type', 'host')
    target_name = _text('target_name')
    target_detail = _text('target_detail')
    reason = _text('reason')

    if target_type not in ('host', 'interface', 'unifi_device', 'all'):
        return jsonify({'error': 'Invalid target_type'}), 400
    if target_type != 'all' and not target_name:
        return jsonify({'error': 'target_name required'}), 400
    if not reason:
        return jsonify({'error': 'reason required'}), 400

    expires_raw = data.get('expires_minutes')  # None = manual/permanent
    expires_minutes = None
    if expires_raw:
        try:
            expires_minutes = int(expires_raw)
        except (TypeError, ValueError, OverflowError):
            # OverflowError covers float infinity, which json.loads accepts.
            return jsonify({'error': 'expires_minutes must be an integer'}), 400
        if expires_minutes <= 0:
            return jsonify({'error': 'expires_minutes must be positive'}), 400

    sup_id = db.create_suppression(
        target_type=target_type,
        target_name=target_name,
        target_detail=target_detail,
        reason=reason,
        suppressed_by=user['username'],
        expires_minutes=expires_minutes,
    )
    logger.info(
        f'Suppression #{sup_id} created by {user["username"]}: '
        f'{target_type}/{target_name}/{target_detail} – {reason}'
    )
    return jsonify({'success': True, 'id': sup_id})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.route('/api/suppressions/<int:sup_id>', methods=['DELETE'])
@require_auth
def api_delete_suppression(sup_id: int):
    """Deactivate suppression ``sup_id`` and log which user removed it.

    Always answers ``{'success': True}``; the return value of
    ``db.deactivate_suppression`` is not inspected here.
    """
    actor = _get_user()['username']
    db.deactivate_suppression(sup_id)
    logger.info(f'Suppression #{sup_id} removed by {actor}')
    return jsonify({'success': True})
|
|
|
|
|
|
|
|
|
|
|
|
|
feat: deep link diagnostics via Pulse SSH
Adds comprehensive per-port link troubleshooting triggered from the
Inspector panel when a port has an LLDP-identified server counterpart.
- diagnose.py: DiagnosticsRunner with 15-section SSH command (carrier,
operstate, sysfs counters, ethtool, ethtool -i/-a/-g/-S/-m, ip link,
ip addr, ip route, dmesg, lldpctl); parsers for all sections; health
analyzer with 14 check codes (NO_CARRIER, HALF_DUPLEX, SPEED_MISMATCH,
SFP_RX_CRITICAL, CARRIER_FLAPPING, CRC_ERRORS_HIGH, LLDP_MISMATCH, etc.)
- monitor.py: PulseClient now tracks last_execution_id so callers can
link back to the raw Pulse execution URL
- app.py: POST /api/diagnose + GET /api/diagnose/<job_id> with daemon
thread background execution and 10-minute in-memory job store
- inspector.html: "Run Link Diagnostics" button (shown only when LLDP
host is resolvable); full results panel: health banner, physical layer,
SFP/DOM with power bars, NIC error counters, collapsible ethtool -S,
flow control/ring buffers, driver info, LLDP 2-col validation,
collapsible dmesg, switch port summary, "View in Pulse" link
- style.css: all .diag-* CSS classes with terminal aesthetic
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-03 16:03:54 -05:00
|
|
|
|
def _diag_find_port(sw, port_idx):
    """Return a copy of the port on *sw* whose ``port_idx`` matches, else None.

    The copy carries the port's key in ``sw['ports']`` under ``'name'``.
    """
    for pname, pd in sw.get('ports', {}).items():
        if pd.get('port_idx') == port_idx:
            port_data = dict(pd)
            port_data['name'] = pname
            return port_data
    return None


def _diag_match_iface(server_ifaces, lldp_port_id):
    """Pick the interface key matching an LLDP port id: exact, then substring.

    Returns None when *lldp_port_id* is empty or nothing matches.
    """
    if not lldp_port_id:
        return None
    if lldp_port_id in server_ifaces:
        return lldp_port_id
    return next(
        (k for k in server_ifaces if lldp_port_id in k or k in lldp_port_id),
        None,
    )


@app.route('/api/diagnose', methods=['POST'])
@require_auth
def api_diagnose_start():
    """Start a link diagnostic job. Returns {job_id}.

    Expects a JSON object with ``switch_name`` and an integer ``port_idx``.
    The switch port is looked up in the stored ``link_stats`` state; its LLDP
    neighbour identifies the server, interface and IP address that are handed
    to ``diagnose.DiagnosticsRunner``. The run happens on a daemon thread and
    its outcome is stored in ``_diag_jobs`` under the returned job id.

    Error responses (JSON ``{'error': ...}``):
        400 - malformed request, no LLDP neighbour, or an interface/IP that
              cannot be determined or fails validation.
        404 - switch, port or host not present in ``link_stats``.
        500 - stored ``link_stats`` is not valid JSON.
        503 - no ``link_stats`` stored.
    """
    data = request.get_json(silent=True) or {}
    if not isinstance(data, dict):
        return jsonify({'error': 'JSON object required'}), 400

    raw_switch = data.get('switch_name')
    # Non-string names are treated as missing instead of raising on .strip().
    switch_name = raw_switch.strip() if isinstance(raw_switch, str) else ''
    try:
        port_idx = int(data.get('port_idx'))
    except (TypeError, ValueError, OverflowError):
        # OverflowError covers float infinity, which json.loads accepts.
        return jsonify({'error': 'port_idx must be an integer'}), 400

    if not switch_name:
        return jsonify({'error': 'switch_name and port_idx required'}), 400

    # Look up switch + port in cached link_stats
    raw = db.get_state('link_stats')
    if not raw:
        return jsonify({'error': 'No link_stats data available'}), 503
    try:
        link_data = json.loads(raw)
    except Exception:
        logger.error('Failed to parse link_stats JSON in /api/diagnose')
        return jsonify({'error': 'Internal data error'}), 500

    switches = link_data.get('unifi_switches', {})
    sw = switches.get(switch_name)
    if not sw:
        return jsonify({'error': f'Switch "{switch_name}" not found'}), 404

    port_data = _diag_find_port(sw, port_idx)
    if not port_data:
        return jsonify({'error': f'Port {port_idx} not found on switch "{switch_name}"'}), 404

    # LLDP neighbor required to know which host+iface to SSH into
    lldp = port_data.get('lldp')
    if not lldp or not lldp.get('system_name'):
        return jsonify({'error': 'No LLDP neighbor data for this port'}), 400

    server_name = lldp['system_name']
    lldp_port_id = lldp.get('port_id', '')

    # Find matching host + interface in link_stats hosts
    hosts = link_data.get('hosts', {})
    server_ifaces = hosts.get(server_name)
    if not server_ifaces:
        return jsonify({'error': f'Host "{server_name}" not in link stats'}), 404

    # Match interface by LLDP port_id (exact then fuzzy); as a last resort
    # fall back to the host's first listed interface.
    matched_iface = _diag_match_iface(server_ifaces, lldp_port_id)
    if not matched_iface:
        matched_iface = next(iter(server_ifaces), None)
        if matched_iface:
            # The fallback is a guess and may be the wrong NIC; leave a trace.
            logger.warning(
                f'No interface on {server_name} matches LLDP port_id '
                f'"{lldp_port_id}"; falling back to "{matched_iface}"'
            )
    if not matched_iface:
        return jsonify({'error': 'Cannot determine server interface'}), 400

    # Resolve host IP from link_stats host data
    host_ip = (server_ifaces.get(matched_iface) or {}).get('host_ip')
    if not host_ip:
        # Fallback: use LLDP mgmt IPs
        mgmt_ips = lldp.get('mgmt_ips') or []
        host_ip = mgmt_ips[0] if mgmt_ips else None
    if not host_ip:
        return jsonify({'error': 'Cannot determine host IP for SSH'}), 400

    # Validate resolved values before passing to SSH command builder.
    # ipaddress.ip_address() also accepts integers, so require a string first.
    try:
        if not isinstance(host_ip, str):
            raise ValueError('host_ip is not a string')
        ipaddress.ip_address(host_ip)
    except ValueError:
        logger.error(f'Refusing diagnostic: invalid host_ip "{host_ip}" for {server_name}')
        return jsonify({'error': 'Resolved host IP is not a valid IP address'}), 400
    if not re.fullmatch(r'[a-zA-Z0-9._-]+', matched_iface):
        logger.error(f'Refusing diagnostic: invalid iface "{matched_iface}" for {server_name}')
        return jsonify({'error': 'Resolved interface name contains invalid characters'}), 400

    job_id = str(uuid.uuid4())
    with _diag_lock:
        _diag_jobs[job_id] = {'status': 'running', 'result': None, 'created_at': time.time()}

    def _run():
        """Worker body: run the diagnostic and record the outcome for polling."""
        try:
            cfg = _config()
            pulse = PulseClient(cfg)
            runner = diagnose.DiagnosticsRunner(pulse)
            result = runner.run(host_ip, server_name, matched_iface, port_data)
        except Exception as e:
            logger.error(f'Diagnostic job {job_id} failed: {e}', exc_info=True)
            result = {'status': 'error', 'error': str(e)}
        with _diag_lock:
            # The purge thread may already have dropped the job.
            if job_id in _diag_jobs:
                _diag_jobs[job_id]['status'] = 'done'
                _diag_jobs[job_id]['result'] = result

    t = threading.Thread(target=_run, daemon=True)
    t.start()

    return jsonify({'job_id': job_id})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.route('/api/diagnose/<job_id>', methods=['GET'])
@require_auth
def api_diagnose_poll(job_id: str):
    """Poll a diagnostic job. Returns {status, result}.

    Responds with 404 and ``{'error': 'Job not found'}`` when ``job_id`` is
    not in the job store.
    """
    with _diag_lock:
        job = _diag_jobs.get(job_id)
        # Copy both fields while still holding the lock. The worker writes
        # 'status' and 'result' as two separate assignments under this lock,
        # so reading them after release could pair 'done' with a stale None.
        snapshot = (
            {'status': job['status'], 'result': job.get('result')} if job else None
        )
    if snapshot is None:
        return jsonify({'error': 'Job not found'}), 404
    return jsonify(snapshot)
|
|
|
|
|
|
|
|
|
|
|
|
|
2026-03-01 23:03:18 -05:00
|
|
|
|
@app.route('/health')
def health():
    """Health check endpoint (no auth); reports a fixed ok payload."""
    payload = {'status': 'ok', 'service': 'gandalf'}
    return jsonify(payload)
|
|
|
|
|
|
|
2025-01-04 01:42:16 -05:00
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    import os

    # Never hard-code debug=True while binding to all interfaces: the
    # Werkzeug debugger allows arbitrary code execution for anyone who can
    # reach the port. Debug mode is opt-in via the FLASK_DEBUG variable.
    debug = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')
    app.run(debug=debug, host='0.0.0.0', port=5000)
|