feat: deep link diagnostics via Pulse SSH

Adds comprehensive per-port link troubleshooting triggered from the
Inspector panel when a port has an LLDP-identified server counterpart.

- diagnose.py: DiagnosticsRunner with 15-section SSH command (carrier,
  operstate, sysfs counters, ethtool, ethtool -i/-a/-g/-S/-m, ip link,
  ip addr, ip route, dmesg, lldpctl); parsers for all sections; health
  analyzer with 14 check codes (NO_CARRIER, HALF_DUPLEX, SPEED_MISMATCH,
  SFP_RX_CRITICAL, CARRIER_FLAPPING, CRC_ERRORS_HIGH, LLDP_MISMATCH, etc.)
- monitor.py: PulseClient now tracks last_execution_id so callers can
  link back to the raw Pulse execution URL
- app.py: POST /api/diagnose + GET /api/diagnose/<job_id> with daemon
  thread background execution and 10-minute in-memory job store
- inspector.html: "Run Link Diagnostics" button (shown only when LLDP
  host is resolvable); full results panel: health banner, physical layer,
  SFP/DOM with power bars, NIC error counters, collapsible ethtool -S,
  flow control/ring buffers, driver info, LLDP 2-col validation,
  collapsible dmesg, switch port summary, "View in Pulse" link
- style.css: all .diag-* CSS classes with terminal aesthetic

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-03 16:03:54 -05:00
parent 0278dad502
commit b1dd5f9cad
5 changed files with 1272 additions and 0 deletions

View File

@@ -264,6 +264,16 @@ function renderPanel(swName, idx) {
}
}
// Diagnose button (only when LLDP has an identified neighbor we can map)
const hasDiagTarget = !!(d.lldp && d.lldp.system_name &&
_apiData.hosts && _apiData.hosts[d.lldp.system_name]);
const diagHtml = hasDiagTarget ? `
<div class="diag-bar">
<button class="btn-diag" onclick="runDiagnostic('${escHtml(swName)}', ${idx})">Run Link Diagnostics</button>
<span class="diag-status" id="diag-status"></span>
</div>
<div class="diag-results" id="diag-results"></div>` : '';
const inner = document.getElementById('inspector-panel-inner');
inner.innerHTML = `
<div class="panel-header">
@@ -286,6 +296,7 @@ function renderPanel(swName, idx) {
${errHtml}
${lldpHtml}
${pathHtml}
${diagHtml}
`;
document.getElementById('inspector-panel').classList.add('open');
@@ -387,5 +398,313 @@ async function loadInspector() {
loadInspector();
setInterval(loadInspector, 60000);
// ── Link Diagnostics ─────────────────────────────────────────────────
// Handle of the interval that polls for job results, or null when idle.
let _diagPollTimer = null;
// Incremented on every "Run Link Diagnostics" click so that a reply to an
// older, superseded request can be recognised and ignored.
let _diagRequestSeq = 0;

/**
 * Submit a diagnostics job for one switch port and start polling for its result.
 *
 * Looks up the #diag-status / #diag-results elements, cancels any poll that is
 * still running, POSTs {switch_name, port_idx} to /api/diagnose and, once a
 * job id comes back, hands over to pollDiagnostic().
 *
 * @param {string} swName  - switch name, sent as `switch_name`
 * @param {number} portIdx - port index, sent as `port_idx`
 * @returns {void} Does nothing when the diagnostics elements are not in the DOM.
 */
function runDiagnostic(swName, portIdx) {
  const statusEl = document.getElementById('diag-status');
  const resultsEl = document.getElementById('diag-results');
  if (!statusEl || !resultsEl) return;

  // Clear any previous poll
  if (_diagPollTimer) { clearInterval(_diagPollTimer); _diagPollTimer = null; }

  // A second click while the first POST is still in flight must win: without
  // this guard both replies would start a poller and the older one would leak.
  const requestSeq = ++_diagRequestSeq;
  const isStale = () => requestSeq !== _diagRequestSeq;

  statusEl.textContent = 'Submitting to Pulse...';
  resultsEl.innerHTML = '';

  fetch('/api/diagnose', {
    method: 'POST',
    headers: {'Content-Type': 'application/json'},
    body: JSON.stringify({switch_name: swName, port_idx: portIdx}),
  })
    .then(r => r.json())
    .then(resp => {
      if (isStale()) return;
      if (resp.error) {
        statusEl.textContent = 'Error: ' + resp.error;
        return;
      }
      if (resp.job_id == null) {
        // Nothing to poll for; avoid requesting /api/diagnose/undefined.
        statusEl.textContent = 'Error: no job id returned';
        return;
      }
      statusEl.textContent = 'Collecting diagnostics via Pulse...';
      pollDiagnostic(resp.job_id, statusEl, resultsEl);
    })
    .catch(e => {
      if (isStale()) return;
      statusEl.textContent = 'Request failed: ' + e;
    });
}
/**
 * Poll GET /api/diagnose/<jobId> until the job is done, the server reports an
 * error, or the attempt budget runs out.
 *
 * Only one poller is kept alive at a time: starting a new one cancels whatever
 * handle is currently stored in `_diagPollTimer`. Each poller stops itself via
 * its own interval handle, so a late callback can never cancel a newer poller.
 *
 * @param {string} jobId      - job id returned by POST /api/diagnose
 * @param {{textContent: string}} statusEl - element that receives progress/error text
 * @param {object} resultsEl  - container passed through to renderDiagnosticResults()
 * @returns {void}
 */
function pollDiagnostic(jobId, statusEl, resultsEl) {
  const POLL_INTERVAL_MS = 2000;
  const MAX_ATTEMPTS = 120; // 120 polls x 2 s = 4 minutes before giving up

  // Never run two pollers side by side.
  if (_diagPollTimer) { clearInterval(_diagPollTimer); _diagPollTimer = null; }

  let attempts = 0;
  let finished = false;

  // Stop THIS poller; only reset the shared handle if it still points at us.
  const stop = () => {
    finished = true;
    clearInterval(timer);
    if (_diagPollTimer === timer) _diagPollTimer = null;
  };

  const timer = setInterval(() => {
    attempts++;
    if (attempts > MAX_ATTEMPTS) {
      stop();
      statusEl.textContent = 'Timed out waiting for results.';
      return;
    }
    fetch(`/api/diagnose/${jobId}`)
      .then(r => r.json())
      .then(resp => {
        // A reply to an earlier tick may arrive after we already stopped.
        if (finished) return;
        if (resp.status === 'done') {
          stop();
          statusEl.textContent = '';
          try {
            renderDiagnosticResults(resp.result, resultsEl);
          } catch (err) {
            // Surface rendering problems instead of leaving an empty panel.
            statusEl.textContent = 'Failed to render results: ' + err;
          }
        } else if (resp.error) {
          // NOTE(review): assumes the GET endpoint reports failures with the
          // same `error` field the POST endpoint uses — confirm against app.py.
          stop();
          statusEl.textContent = 'Error: ' + resp.error;
        }
      })
      // Network or JSON errors are treated as transient: the next tick retries.
      .catch(() => {});
  }, POLL_INTERVAL_MS);

  _diagPollTimer = timer;
}
/**
 * Render a finished diagnostics result into `container` as HTML.
 *
 * Builds one HTML fragment per section (health banner and issue list, physical
 * layer, SFP/DOM, NIC error counters, ethtool -S, flow control / ring buffers,
 * driver info, LLDP validation, dmesg, switch port summary, Pulse link) and
 * assigns the concatenation to `container.innerHTML`. Sections whose data is
 * missing or empty are omitted, except the physical layer, LLDP validation and
 * switch port summary, which are always shown.
 *
 * Every value taken from `d` is HTML-escaped before insertion, and numeric
 * formatting is only applied to real finite numbers so that an unexpected
 * null/string in the payload cannot abort the whole render.
 *
 * @param {object|null} d      - result payload; falsy or `status === 'error'`
 *                               renders a single error line using `d.error`
 * @param {{innerHTML: string}} container - element that receives the markup
 * @returns {void}
 */
function renderDiagnosticResults(d, container) {
  if (!d || d.status === 'error') {
    container.innerHTML = `<div class="diag-error">Diagnostic error: ${escHtml((d && d.error) || 'unknown')}</div>`;
    return;
  }

  // Formatting helpers: never emit a raw payload value, never call a Number
  // method on something that is not a finite number.
  const safe = (v) => (v == null ? '' : escHtml(String(v)));
  const isNum = (v) => typeof v === 'number' && Number.isFinite(v);
  const fixed = (v, digits) => (isNum(v) ? v.toFixed(digits) : safe(v));
  const count = (v) => (isNum(v) ? v.toLocaleString() : safe(v));

  const health = d.health || {};
  const issues = health.issues || [];
  const warns = health.warnings || [];
  const infoArr = health.info || [];
  const secs = d.sections || {};
  const eth = secs.ethtool || {};
  const drv = secs.ethtool_driver || {};
  const pause = secs.ethtool_pause || {};
  const ring = secs.ethtool_ring || {};
  const dom = secs.ethtool_dom || {};
  const sysfs = secs.sysfs_stats || {};
  const dmesg = secs.dmesg || [];
  const lldpctl = secs.lldpctl || {};
  const nicStats = secs.ethtool_stats || {};
  const swPort = d.switch_port || {};

  // ── Health banner ──
  let bannerHtml = '';
  if (issues.length === 0 && warns.length === 0) {
    bannerHtml = '<div class="diag-health-banner"><span class="diag-health-ok">ALL OK</span></div>';
  } else {
    const parts = [];
    if (issues.length) parts.push(`<span class="diag-health-critical">${issues.length} CRITICAL</span>`);
    if (warns.length) parts.push(`<span class="diag-health-warning">${warns.length} WARNING</span>`);
    bannerHtml = `<div class="diag-health-banner">${parts.join(' ')}</div>`;
  }

  // Tag each entry with its severity while flattening, instead of searching
  // the source arrays again for every row.
  const tagged = [
    ...issues.map(item => ({ item, cls: 'diag-val-bad', label: 'CRIT' })),
    ...warns.map(item => ({ item, cls: 'diag-val-warn', label: 'WARN' })),
    ...infoArr.map(item => ({ item, cls: 'diag-val-good', label: 'INFO' })),
  ];
  const issueRows = tagged.map(({ item, cls, label }) =>
    `<div class="diag-issue-row"><span class="${cls}">[${label}]</span> <span class="diag-code">${escHtml(item.code)}</span> — ${escHtml(item.message)}</div>`
  ).join('');

  // ── Physical layer ──
  const carrierVal = secs.carrier === '1' ? '<span class="diag-val-good">YES</span>' :
    secs.carrier === '0' ? '<span class="diag-val-bad">NO</span>' : '';
  const operstateVal = (secs.operstate || '?').toUpperCase();
  const opstateCls = secs.operstate === 'up' ? 'diag-val-good' : secs.operstate === 'down' ? 'diag-val-bad' : 'diag-val-warn';
  const speedVal = eth.speed_mbps ? `<span class="diag-val-good">${fmtSpeed(eth.speed_mbps)}bps</span>` : '<span class="diag-val-warn"></span>';
  const duplexVal = eth.duplex === 'full' ? '<span class="diag-val-good">Full</span>' :
    eth.duplex === 'half' ? '<span class="diag-val-bad">Half</span>' : '';
  const linkDetVal = eth.link_detected === true ? '<span class="diag-val-good">Yes</span>' :
    eth.link_detected === false ? '<span class="diag-val-bad">No</span>' : '';
  const autonegVal = eth.auto_neg === true ? '<span class="diag-val-good">On</span>' :
    eth.auto_neg === false ? '<span class="diag-val-warn">Off</span>' : '';
  const carrierChangesRow = secs.carrier_changes != null
    ? `<tr><td>Carrier Changes</td><td><span class="${secs.carrier_changes > 20 ? 'diag-val-warn' : 'diag-val-good'}">${safe(secs.carrier_changes)}</span></td></tr>`
    : '';
  const physHtml = `
    <div class="diag-section">
      <div class="diag-section-header">Physical Layer</div>
      <table class="diag-table">
        <tr><td>Carrier</td><td>${carrierVal}</td></tr>
        <tr><td>Oper State</td><td><span class="${opstateCls}">${escHtml(operstateVal)}</span></td></tr>
        <tr><td>Speed</td><td>${speedVal}</td></tr>
        <tr><td>Duplex</td><td>${duplexVal}</td></tr>
        <tr><td>Link Detected</td><td>${linkDetVal}</td></tr>
        <tr><td>Auto-neg</td><td>${autonegVal}</td></tr>
        ${carrierChangesRow}
      </table>
    </div>`;

  // ── SFP / DOM ──
  let domHtml = '';
  if (dom && Object.keys(dom).length > 0) {
    // The bar needs a real number to position itself; skip it otherwise.
    const rxBar = isNum(dom.rx_power_dbm) ? renderPowerBar(dom.rx_power_dbm, -18, -25) : '';
    const txBar = isNum(dom.tx_power_dbm) ? renderPowerBar(dom.tx_power_dbm, -10, -13) : '';
    domHtml = `
      <div class="diag-section">
        <div class="diag-section-header">SFP / DOM</div>
        <table class="diag-table">
          ${dom.vendor ? `<tr><td>Vendor</td><td>${escHtml(dom.vendor)}${dom.part_no ? ' / ' + escHtml(dom.part_no) : ''}</td></tr>` : ''}
          ${dom.sfp_type ? `<tr><td>Type</td><td>${escHtml(dom.sfp_type)}</td></tr>` : ''}
          ${dom.connector ? `<tr><td>Connector</td><td>${escHtml(dom.connector)}</td></tr>` : ''}
          ${dom.wavelength_nm != null ? `<tr><td>Wavelength</td><td>${safe(dom.wavelength_nm)} nm</td></tr>` : ''}
          ${dom.temp_c != null ? `<tr><td>Temperature</td><td>${fixed(dom.temp_c, 1)} °C</td></tr>` : ''}
          ${dom.voltage_v != null ? `<tr><td>Voltage</td><td>${fixed(dom.voltage_v, 4)} V</td></tr>` : ''}
          ${dom.bias_ma != null ? `<tr><td>Bias Current</td><td>${fixed(dom.bias_ma, 3)} mA</td></tr>` : ''}
          ${dom.tx_power_dbm != null ? `<tr><td>TX Power</td><td>${fixed(dom.tx_power_dbm, 2)} dBm ${txBar}</td></tr>` : ''}
          ${dom.rx_power_dbm != null ? `<tr><td>RX Power</td><td>${fixed(dom.rx_power_dbm, 2)} dBm ${rxBar}</td></tr>` : ''}
        </table>
      </div>`;
  }

  // ── NIC Error Counters ──
  const errCounters = ['rx_crc_errors', 'rx_frame_errors', 'collisions', 'tx_carrier_errors', 'rx_missed_errors', 'rx_fifo_errors'];
  const nonZeroCounters = errCounters.filter(k => sysfs[k] > 0);
  let errCounterHtml = '';
  // Also shown when only carrier changes were seen; the table then reads "All zero".
  if (nonZeroCounters.length > 0 || secs.carrier_changes > 0) {
    const rows = nonZeroCounters.map(k => {
      const v = sysfs[k];
      const cls = v > 100 ? 'diag-val-bad' : 'diag-val-warn';
      return `<tr><td>${escHtml(k)}</td><td class="${cls}">${count(v)}</td></tr>`;
    }).join('');
    errCounterHtml = `
      <div class="diag-section">
        <div class="diag-section-header">NIC Error Counters</div>
        <table class="diag-table">
          ${rows || '<tr><td colspan="2" class="diag-val-good">All zero</td></tr>'}
        </table>
      </div>`;
  }

  // ── ethtool -S (collapsible) ──
  let nicStatHtml = '';
  if (Object.keys(nicStats).length > 0) {
    // Counter names that indicate a problem when their value is non-zero.
    const errKeyPattern = /err|drop|miss|crc|frame|fifo|abort|carrier|collision|fault|discard|overflow|reset/i;
    const rows = Object.entries(nicStats).map(([k, v]) => {
      const cls = errKeyPattern.test(k) && v > 0 ? ' class="diag-stat-nonzero-warn"' : '';
      return `<tr${cls}><td>${escHtml(k)}</td><td>${count(v)}</td></tr>`;
    }).join('');
    nicStatHtml = `
      <div class="diag-section diag-collapsible">
        <div class="diag-section-header diag-toggle" onclick="this.parentElement.classList.toggle('diag-open')">
          ethtool -S (NIC stats) <span class="diag-toggle-hint">[expand]</span>
        </div>
        <div class="diag-section-body">
          <table class="diag-stat-table">${rows}</table>
        </div>
      </div>`;
  }

  // ── Flow Control + Ring Buffers ──
  let flowRingHtml = '';
  const hasPause = Object.keys(pause).length > 0;
  const hasRing = Object.keys(ring).length > 0;
  if (hasPause || hasRing) {
    flowRingHtml = `
      <div class="diag-section">
        <div class="diag-section-header">Flow Control &amp; Ring Buffers</div>
        <table class="diag-table">
          ${hasPause ? `
          <tr><td>RX Pause</td><td>${pause.rx_pause ? '<span class="diag-val-good">On</span>' : 'Off'}</td></tr>
          <tr><td>TX Pause</td><td>${pause.tx_pause ? '<span class="diag-val-good">On</span>' : 'Off'}</td></tr>` : ''}
          ${hasRing ? `
          <tr><td>RX Ring</td><td>${safe(ring.rx_current)} / ${safe(ring.rx_max)} max</td></tr>
          <tr><td>TX Ring</td><td>${safe(ring.tx_current)} / ${safe(ring.tx_max)} max</td></tr>` : ''}
        </table>
      </div>`;
  }

  // ── Driver Info ──
  let drvHtml = '';
  if (Object.keys(drv).length > 0) {
    drvHtml = `
      <div class="diag-section">
        <div class="diag-section-header">Driver Info</div>
        <table class="diag-table">
          ${drv.driver ? `<tr><td>Driver</td><td>${escHtml(drv.driver)}</td></tr>` : ''}
          ${drv.version ? `<tr><td>Version</td><td>${escHtml(drv.version)}</td></tr>` : ''}
          ${drv.firmware_version ? `<tr><td>Firmware</td><td>${escHtml(drv.firmware_version)}</td></tr>` : ''}
          ${drv.bus_info ? `<tr><td>Bus</td><td>${escHtml(drv.bus_info)}</td></tr>` : ''}
        </table>
      </div>`;
  }

  // ── LLDP Validation ── (switch's view next to the server's lldpctl view)
  const swLldp = swPort.lldp || {};
  const lldpValHtml = `
    <div class="diag-section">
      <div class="diag-section-header">LLDP Validation</div>
      <div class="path-debug-cols">
        <div class="path-col">
          <div class="path-col-header">Switch sees</div>
          <div class="path-row"><span>System</span><span>${escHtml(swLldp.system_name || '')}</span></div>
          <div class="path-row"><span>Port</span><span>${escHtml(swLldp.port_id || '')}</span></div>
          <div class="path-row"><span>Chassis</span><span>${escHtml(swLldp.chassis_id || '')}</span></div>
        </div>
        <div class="path-col">
          <div class="path-col-header">Server lldpctl</div>
          ${lldpctl.available
            ? `<div class="path-row"><span>Neighbor</span><span>${escHtml(lldpctl.neighbor_system || '')}</span></div>
          <div class="path-row"><span>Port</span><span>${escHtml(lldpctl.neighbor_port || '')}</span></div>`
            : '<div class="path-row"><span class="diag-val-warn">lldpd not running</span></div>'}
        </div>
      </div>
    </div>`;

  // ── dmesg ──
  let dmesgHtml = '';
  if (dmesg.length > 0) {
    const dlines = dmesg.map(e => {
      const cls = e.severity === 'error' ? ' diag-dmesg-err' : e.severity === 'warn' ? ' diag-dmesg-warn' : '';
      const ts = e.timestamp ? `[${e.timestamp}] ` : '';
      return `<div class="diag-dmesg-line${cls}">${escHtml(ts + e.msg)}</div>`;
    }).join('');
    dmesgHtml = `
      <div class="diag-section diag-collapsible">
        <div class="diag-section-header diag-toggle" onclick="this.parentElement.classList.toggle('diag-open')">
          Kernel Events (dmesg) <span class="diag-toggle-hint">[expand]</span>
        </div>
        <div class="diag-section-body">
          <div class="diag-dmesg-wrap">${dlines}</div>
        </div>
      </div>`;
  }

  // ── Switch Port Summary ──
  const swSummaryHtml = `
    <div class="diag-section">
      <div class="diag-section-header">Switch Port Summary</div>
      <table class="diag-table">
        <tr><td>Status</td><td>${swPort.up ? '<span class="diag-val-good">UP</span>' : '<span class="diag-val-bad">DOWN</span>'}</td></tr>
        <tr><td>Speed</td><td>${swPort.speed_mbps ? fmtSpeed(swPort.speed_mbps) + 'bps' : ''}</td></tr>
        <tr><td>Duplex</td><td>${swPort.full_duplex ? 'Full' : (swPort.up ? '<span class="diag-val-bad">Half</span>' : '')}</td></tr>
        <tr><td>TX Err</td><td>${fmtErrors(swPort.tx_errs_rate)}</td></tr>
        <tr><td>RX Err</td><td>${fmtErrors(swPort.rx_errs_rate)}</td></tr>
        ${swPort.poe_power != null ? `<tr><td>PoE</td><td><span class="val-amber">${fixed(swPort.poe_power, 1)}W</span></td></tr>` : ''}
      </table>
    </div>`;

  // ── Pulse link ──
  const pulseLink = d.pulse_url
    ? `<div class="diag-pulse-link"><a href="${escHtml(d.pulse_url)}" target="_blank" rel="noopener">View raw output in Pulse ↗</a></div>`
    : '';

  container.innerHTML = `
    <div class="diag-results-inner">
      ${bannerHtml}
      <div class="diag-issue-list">${issueRows}</div>
      ${physHtml}
      ${domHtml}
      ${errCounterHtml}
      ${nicStatHtml}
      ${flowRingHtml}
      ${drvHtml}
      ${lldpValHtml}
      ${dmesgHtml}
      ${swSummaryHtml}
      ${pulseLink}
    </div>`;
}
// SFP power bar: the scale spans -35 dBm (empty bar) up to 0 dBm (full bar).
/**
 * Build the markup for an optical power gauge.
 *
 * The fill width is the reading's position on the -35..0 dBm scale, clamped to
 * 0-100 %. Two marker spans are placed at the warning and critical thresholds
 * (their positions are not clamped). The fill colour class is "bad" below the
 * critical threshold, "warn" below the warning threshold, otherwise "good".
 *
 * @param {number} dbm           - measured power in dBm
 * @param {number} warnThreshold - level below which the reading is a warning
 * @param {number} critThreshold - level below which the reading is critical
 * @returns {string} HTML for the gauge wrapper, fill and both markers
 */
function renderPowerBar(dbm, warnThreshold, critThreshold) {
  const floorDbm = -35;
  const ceilDbm = 0;
  const toPercent = (level) => ((level - floorDbm) / (ceilDbm - floorDbm)) * 100;

  const fill = Math.max(0, Math.min(100, toPercent(dbm)));
  const warnMark = toPercent(warnThreshold);
  const critMark = toPercent(critThreshold);

  let levelClass = 'diag-val-good';
  if (dbm < critThreshold) {
    levelClass = 'diag-val-bad';
  } else if (dbm < warnThreshold) {
    levelClass = 'diag-val-warn';
  }

  return [
    '<span class="diag-power-bar-wrap">',
    `<span class="diag-power-bar ${levelClass}" style="width:${fill.toFixed(1)}%"></span>`,
    `<span class="diag-power-zone-warn" style="left:${warnMark.toFixed(1)}%"></span>`,
    `<span class="diag-power-zone-crit" style="left:${critMark.toFixed(1)}%"></span>`,
    '</span>',
  ].join('\n');
}
</script>
{% endblock %}