feat: deep link diagnostics via Pulse SSH

Adds comprehensive per-port link troubleshooting triggered from the
Inspector panel when a port has an LLDP-identified server counterpart.

- diagnose.py: DiagnosticsRunner with 15-section SSH command (carrier,
  operstate, sysfs counters, ethtool, ethtool -i/-a/-g/-S/-m, ip link,
  ip addr, ip route, dmesg, lldpctl); parsers for all sections; health
  analyzer with 14 check codes (NO_CARRIER, HALF_DUPLEX, SPEED_MISMATCH,
  SFP_RX_CRITICAL, CARRIER_FLAPPING, CRC_ERRORS_HIGH, LLDP_MISMATCH, etc.)
- monitor.py: PulseClient now tracks last_execution_id so callers can
  link back to the raw Pulse execution URL
- app.py: POST /api/diagnose + GET /api/diagnose/<job_id> with daemon
  thread background execution and 10-minute in-memory job store
- inspector.html: "Run Link Diagnostics" button (shown only when LLDP
  host is resolvable); full results panel: health banner, physical layer,
  SFP/DOM with power bars, NIC error counters, collapsible ethtool -S,
  flow control/ring buffers, driver info, LLDP 2-col validation,
  collapsible dmesg, switch port summary, "View in Pulse" link
- style.css: all .diag-* CSS classes with terminal aesthetic

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-03 16:03:54 -05:00
parent 0278dad502
commit b1dd5f9cad
5 changed files with 1272 additions and 0 deletions

127
app.py
View File

@@ -6,11 +6,16 @@ All monitoring and alerting is handled by the separate monitor.py daemon.
"""
import json
import logging
import threading
import time
import uuid
from functools import wraps
from flask import Flask, jsonify, redirect, render_template, request, url_for
import db
import diagnose
from monitor import PulseClient
logging.basicConfig(
level=logging.INFO,
@@ -22,6 +27,19 @@ app = Flask(__name__)
# Lazily-populated config cache (filled in by _config(), which declares it global).
_cfg = None
# In-memory diagnostic job store { job_id: { status, result, created_at } }
_diag_jobs: dict = {}
# Guards all access to _diag_jobs, shared between request handlers and worker threads.
_diag_lock = threading.Lock()
def _purge_old_jobs():
    """Drop diagnostic jobs older than 10 minutes.

    Invoked before each new job is created so the in-memory store cannot
    grow without bound.
    """
    expiry = time.time() - 600
    with _diag_lock:
        expired = [job_id for job_id, rec in _diag_jobs.items()
                   if rec.get('created_at', 0) < expiry]
        for job_id in expired:
            del _diag_jobs[job_id]
def _config() -> dict:
global _cfg
@@ -223,6 +241,115 @@ def api_delete_suppression(sup_id: int):
return jsonify({'success': True})
def _lookup_switch_port(link_data: dict, switch_name: str, port_idx):
    """Find the cached port record for (switch_name, port_idx) in link_stats.

    Returns (port_data, None) on success, where port_data is a copy of the
    cached record with the port's name added under 'name'; otherwise
    (None, error_response) where error_response is a Flask (body, status) pair.
    """
    sw = link_data.get('unifi_switches', {}).get(switch_name)
    if not sw:
        return None, (jsonify({'error': f'Switch "{switch_name}" not found'}), 404)
    for pname, pd in sw.get('ports', {}).items():
        if pd.get('port_idx') == port_idx:
            port_data = dict(pd)
            port_data['name'] = pname
            return port_data, None
    return None, (jsonify({'error': f'Port {port_idx} not found on switch "{switch_name}"'}), 404)


def _match_server_interface(server_ifaces: dict, lldp_port_id: str):
    """Pick the server interface name that corresponds to the LLDP port_id.

    Tries an exact key match, then a substring match in either direction
    (LLDP port IDs and OS interface names often differ only by a prefix or
    suffix), then falls back to the first known interface. Returns None only
    when server_ifaces is empty.
    """
    if lldp_port_id and lldp_port_id in server_ifaces:
        return lldp_port_id
    if lldp_port_id:
        fuzzy = next(
            (k for k in server_ifaces if lldp_port_id in k or k in lldp_port_id),
            None
        )
        if fuzzy:
            return fuzzy
    return next(iter(server_ifaces), None)


@app.route('/api/diagnose', methods=['POST'])
@require_auth
def api_diagnose_start():
    """Start a link diagnostic job. Returns {job_id}.

    Request JSON: {switch_name, port_idx}. The port must have an LLDP
    neighbor whose system_name appears in the cached link_stats hosts, so we
    can resolve which server + interface to SSH into. The diagnostic itself
    runs on a daemon thread; poll /api/diagnose/<job_id> for the result.

    Error responses: 400 (bad input / unresolvable target), 404 (unknown
    switch, port, or host), 500 (corrupt cache), 503 (no cached link_stats).
    """
    data = request.get_json(silent=True) or {}
    switch_name = (data.get('switch_name') or '').strip()
    port_idx = data.get('port_idx')
    if not switch_name or port_idx is None:
        return jsonify({'error': 'switch_name and port_idx required'}), 400

    # Look up switch + port in cached link_stats
    raw = db.get_state('link_stats')
    if not raw:
        return jsonify({'error': 'No link_stats data available'}), 503
    try:
        link_data = json.loads(raw)
    except Exception:
        logger.error('Failed to parse link_stats JSON in /api/diagnose')
        return jsonify({'error': 'Internal data error'}), 500

    port_data, err = _lookup_switch_port(link_data, switch_name, port_idx)
    if err:
        return err

    # LLDP neighbor required to know which host+iface to SSH into
    lldp = port_data.get('lldp')
    if not lldp or not lldp.get('system_name'):
        return jsonify({'error': 'No LLDP neighbor data for this port'}), 400
    server_name = lldp['system_name']
    lldp_port_id = lldp.get('port_id', '')

    # Find matching host + interface in link_stats hosts
    server_ifaces = link_data.get('hosts', {}).get(server_name)
    if not server_ifaces:
        return jsonify({'error': f'Host "{server_name}" not in link stats'}), 404
    matched_iface = _match_server_interface(server_ifaces, lldp_port_id)
    if not matched_iface:
        return jsonify({'error': 'Cannot determine server interface'}), 400

    # Resolve host IP from link_stats; fall back to LLDP-advertised mgmt IPs.
    host_ip = (server_ifaces.get(matched_iface) or {}).get('host_ip')
    if not host_ip:
        mgmt_ips = lldp.get('mgmt_ips') or []
        host_ip = mgmt_ips[0] if mgmt_ips else None
    if not host_ip:
        return jsonify({'error': 'Cannot determine host IP for SSH'}), 400

    _purge_old_jobs()
    job_id = str(uuid.uuid4())
    with _diag_lock:
        _diag_jobs[job_id] = {'status': 'running', 'result': None, 'created_at': time.time()}

    def _run():
        # Background worker: run the SSH diagnostic and record the outcome.
        # Any failure is captured as an error-shaped result rather than
        # leaving the job stuck in 'running'.
        try:
            cfg = _config()
            pulse = PulseClient(cfg)
            runner = diagnose.DiagnosticsRunner(pulse)
            result = runner.run(host_ip, server_name, matched_iface, port_data)
        except Exception as e:
            logger.error(f'Diagnostic job {job_id} failed: {e}', exc_info=True)
            result = {'status': 'error', 'error': str(e)}
        with _diag_lock:
            # Job may have been purged while running; only store if present.
            if job_id in _diag_jobs:
                _diag_jobs[job_id]['status'] = 'done'
                _diag_jobs[job_id]['result'] = result

    threading.Thread(target=_run, daemon=True).start()
    return jsonify({'job_id': job_id})
@app.route('/api/diagnose/<job_id>', methods=['GET'])
@require_auth
def api_diagnose_poll(job_id: str):
    """Poll a diagnostic job. Returns {status, result}.

    404 if the job id is unknown (never created, or already purged).
    """
    with _diag_lock:
        entry = _diag_jobs.get(job_id)
    if entry is None:
        return jsonify({'error': 'Job not found'}), 404
    return jsonify({'status': entry['status'], 'result': entry.get('result')})
@app.route('/health')
def health():
"""Health check endpoint (no auth)."""