feat: deep link diagnostics via Pulse SSH
Adds comprehensive per-port link troubleshooting triggered from the Inspector panel when a port has an LLDP-identified server counterpart.

- diagnose.py: DiagnosticsRunner with 15-section SSH command (carrier, operstate, sysfs counters, ethtool, ethtool -i/-a/-g/-S/-m, ip link, ip addr, ip route, dmesg, lldpctl); parsers for all sections; health analyzer with 14 check codes (NO_CARRIER, HALF_DUPLEX, SPEED_MISMATCH, SFP_RX_CRITICAL, CARRIER_FLAPPING, CRC_ERRORS_HIGH, LLDP_MISMATCH, etc.)
- monitor.py: PulseClient now tracks last_execution_id so callers can link back to the raw Pulse execution URL
- app.py: POST /api/diagnose + GET /api/diagnose/<job_id> with daemon thread background execution and 10-minute in-memory job store
- inspector.html: "Run Link Diagnostics" button (shown only when LLDP host is resolvable); full results panel: health banner, physical layer, SFP/DOM with power bars, NIC error counters, collapsible ethtool -S, flow control/ring buffers, driver info, LLDP 2-col validation, collapsible dmesg, switch port summary, "View in Pulse" link
- style.css: all .diag-* CSS classes with terminal aesthetic

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
127
app.py
127
app.py
@@ -6,11 +6,16 @@ All monitoring and alerting is handled by the separate monitor.py daemon.
|
||||
"""
|
||||
import json
|
||||
import logging
|
||||
import threading
|
||||
import time
|
||||
import uuid
|
||||
from functools import wraps
|
||||
|
||||
from flask import Flask, jsonify, redirect, render_template, request, url_for
|
||||
|
||||
import db
|
||||
import diagnose
|
||||
from monitor import PulseClient
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
@@ -22,6 +27,19 @@ app = Flask(__name__)
|
||||
|
||||
_cfg = None
|
||||
|
||||
# In-memory diagnostic job store, guarded by _diag_lock:
#   { job_id: { status, result, created_at } }
# Entries older than 10 minutes are dropped by _purge_old_jobs().
|
||||
_diag_jobs: dict = {}
|
||||
_diag_lock = threading.Lock()
|
||||
|
||||
|
||||
def _purge_old_jobs():
    """Evict diagnostic jobs created more than 10 minutes ago.

    Invoked right before a new job is registered, so the in-memory store
    only grows by jobs created within the last 600 seconds. Jobs without a
    ``created_at`` timestamp are treated as created at time 0 and are
    therefore always evicted.
    """
    expiry = time.time() - 600
    with _diag_lock:
        # Collect the keys first: a dict must not be resized while it is
        # being iterated.
        expired_ids = []
        for job_id, job in _diag_jobs.items():
            if job.get('created_at', 0) < expiry:
                expired_ids.append(job_id)
        for job_id in expired_ids:
            del _diag_jobs[job_id]
|
||||
|
||||
|
||||
def _config() -> dict:
|
||||
global _cfg
|
||||
@@ -223,6 +241,115 @@ def api_delete_suppression(sup_id: int):
|
||||
return jsonify({'success': True})
|
||||
|
||||
|
||||
@app.route('/api/diagnose', methods=['POST'])
@require_auth
def api_diagnose_start():
    """Start a link diagnostic job. Returns {job_id}.

    Expects a JSON object body with ``switch_name`` and ``port_idx``. The
    switch port is looked up in the cached ``link_stats`` state; its LLDP
    neighbor identifies the server host and interface to diagnose. The
    diagnostics themselves run in a background daemon thread, and the
    outcome is stored in the in-memory job store under the returned
    ``job_id`` for later polling.

    Error responses:
        400: missing/invalid ``switch_name`` or ``port_idx``, no LLDP
             neighbor on the port, or no resolvable interface / host IP.
        404: switch, port, or LLDP-named host not present in link_stats.
        500: cached link_stats could not be parsed.
        503: no link_stats cached yet.
    """
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        # Missing, malformed, or non-object JSON (e.g. a list) is treated as
        # an empty request so it yields the 400 below instead of crashing
        # on .get().
        data = {}

    switch_name = data.get('switch_name')
    # A non-string name cannot be stripped or used as a lookup key; treat it
    # as missing.
    switch_name = switch_name.strip() if isinstance(switch_name, str) else ''
    port_idx = data.get('port_idx')

    if not switch_name or port_idx is None:
        return jsonify({'error': 'switch_name and port_idx required'}), 400

    # Look up switch + port in cached link_stats
    raw = db.get_state('link_stats')
    if not raw:
        return jsonify({'error': 'No link_stats data available'}), 503
    try:
        link_data = json.loads(raw)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError; TypeError covers a cached
        # value that is not str/bytes.
        logger.error('Failed to parse link_stats JSON in /api/diagnose')
        return jsonify({'error': 'Internal data error'}), 500

    switches = link_data.get('unifi_switches', {})
    sw = switches.get(switch_name)
    if not sw:
        return jsonify({'error': f'Switch "{switch_name}" not found'}), 404

    # Find port by port_idx
    port_data = None
    for pname, pd in sw.get('ports', {}).items():
        if pd.get('port_idx') == port_idx:
            # Copy so that adding 'name' does not mutate the parsed data.
            port_data = dict(pd)
            port_data['name'] = pname
            break
    if not port_data:
        return jsonify({'error': f'Port {port_idx} not found on switch "{switch_name}"'}), 404

    # LLDP neighbor required to know which host+iface to SSH into
    lldp = port_data.get('lldp')
    if not lldp or not lldp.get('system_name'):
        return jsonify({'error': 'No LLDP neighbor data for this port'}), 400

    server_name = lldp['system_name']
    lldp_port_id = lldp.get('port_id', '')

    # Find matching host + interface in link_stats hosts
    hosts = link_data.get('hosts', {})
    server_ifaces = hosts.get(server_name)
    if not server_ifaces:
        return jsonify({'error': f'Host "{server_name}" not in link stats'}), 404

    # Match interface by LLDP port_id: exact name first, then substring
    # match in either direction, and as a last resort the first interface
    # listed for the host.
    matched_iface = None
    if lldp_port_id and lldp_port_id in server_ifaces:
        matched_iface = lldp_port_id
    if not matched_iface and lldp_port_id:
        matched_iface = next(
            (k for k in server_ifaces if lldp_port_id in k or k in lldp_port_id),
            None,
        )
    if not matched_iface:
        matched_iface = next(iter(server_ifaces), None)
    if not matched_iface:
        return jsonify({'error': 'Cannot determine server interface'}), 400

    # Resolve host IP from link_stats host data
    host_ip = (server_ifaces.get(matched_iface) or {}).get('host_ip')
    if not host_ip:
        # Fallback: use LLDP mgmt IPs
        mgmt_ips = lldp.get('mgmt_ips') or []
        host_ip = mgmt_ips[0] if mgmt_ips else None
    if not host_ip:
        return jsonify({'error': 'Cannot determine host IP for SSH'}), 400

    _purge_old_jobs()
    job_id = str(uuid.uuid4())
    with _diag_lock:
        _diag_jobs[job_id] = {'status': 'running', 'result': None, 'created_at': time.time()}

    def _run():
        """Run the diagnostics and publish the outcome to the job store."""
        try:
            cfg = _config()
            pulse = PulseClient(cfg)
            runner = diagnose.DiagnosticsRunner(pulse)
            result = runner.run(host_ip, server_name, matched_iface, port_data)
        except Exception as e:
            # Top-level boundary of the worker thread: log with traceback and
            # surface the failure to the poller instead of dying silently.
            logger.exception('Diagnostic job %s failed: %s', job_id, e)
            result = {'status': 'error', 'error': str(e)}
        with _diag_lock:
            job = _diag_jobs.get(job_id)
            # The job may have been purged while the diagnostics were running.
            if job is not None:
                # Swap in a fresh dict rather than mutating field by field,
                # so a reader holding the previous dict never observes
                # status 'done' paired with a not-yet-written result.
                _diag_jobs[job_id] = {**job, 'status': 'done', 'result': result}

    t = threading.Thread(target=_run, daemon=True)
    t.start()

    return jsonify({'job_id': job_id})
|
||||
|
||||
|
||||
@app.route('/api/diagnose/<job_id>', methods=['GET'])
@require_auth
def api_diagnose_poll(job_id: str):
    """Poll a diagnostic job. Returns {status, result}.

    Responds with 404 and ``{'error': 'Job not found'}`` when ``job_id`` is
    not in the in-memory job store (never created, or already purged).
    """
    with _diag_lock:
        job = _diag_jobs.get(job_id)
        # Copy the fields while still holding the lock. The worker thread
        # updates 'status' and 'result' under this same lock, so reading
        # them after release could pair status 'done' with a stale
        # result of None.
        snapshot = (
            {'status': job['status'], 'result': job.get('result')}
            if job
            else None
        )
    if snapshot is None:
        return jsonify({'error': 'Job not found'}), 404
    return jsonify(snapshot)
|
||||
|
||||
|
||||
@app.route('/health')
|
||||
def health():
|
||||
"""Health check endpoint (no auth)."""
|
||||
|
||||
Reference in New Issue
Block a user