feat: deep link diagnostics via Pulse SSH

Adds comprehensive per-port link troubleshooting triggered from the
Inspector panel when a port has an LLDP-identified server counterpart.

- diagnose.py: DiagnosticsRunner with 15-section SSH command (carrier,
  operstate, sysfs counters, carrier_changes, ethtool,
  ethtool -i/-a/-g/-S/-m, ip link, ip addr, ip route, dmesg, lldpctl);
  parsers for all sections; health
  analyzer with 14 check codes (NO_CARRIER, HALF_DUPLEX, SPEED_MISMATCH,
  SFP_RX_CRITICAL, CARRIER_FLAPPING, CRC_ERRORS_HIGH, LLDP_MISMATCH, etc.)
- monitor.py: PulseClient now tracks last_execution_id so callers can
  link back to the raw Pulse execution URL
- app.py: POST /api/diagnose + GET /api/diagnose/<job_id> with daemon
  thread background execution and 10-minute in-memory job store
- inspector.html: "Run Link Diagnostics" button (shown only when LLDP
  host is resolvable); full results panel: health banner, physical layer,
  SFP/DOM with power bars, NIC error counters, collapsible ethtool -S,
  flow control/ring buffers, driver info, LLDP 2-col validation,
  collapsible dmesg, switch port summary, "View in Pulse" link
- style.css: all .diag-* CSS classes with terminal aesthetic

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-03 16:03:54 -05:00
parent 0278dad502
commit b1dd5f9cad
5 changed files with 1272 additions and 0 deletions

127
app.py
View File

@@ -6,11 +6,16 @@ All monitoring and alerting is handled by the separate monitor.py daemon.
"""
import json
import logging
import threading
import time
import uuid
from functools import wraps
from flask import Flask, jsonify, redirect, render_template, request, url_for
import db
import diagnose
from monitor import PulseClient
logging.basicConfig(
level=logging.INFO,
@@ -22,6 +27,19 @@ app = Flask(__name__)
_cfg = None
# In-memory diagnostic job store { job_id: { status, result, created_at } }
# - status:     'running' when the job is created, 'done' once the worker
#               thread has stored a result (including error results)
# - result:     None until the worker finishes
# - created_at: time.time() at creation; _purge_old_jobs() drops entries
#               older than its 10-minute cutoff
_diag_jobs: dict = {}
# Held while _diag_jobs is read or modified; both request handlers and the
# background worker threads touch the store.
_diag_lock = threading.Lock()
def _purge_old_jobs():
    """Drop diagnostic jobs created more than 10 minutes ago.

    Invoked right before a new job is registered so the in-memory store
    cannot grow without bound.
    """
    expiry = time.time() - 600
    with _diag_lock:
        # Collect first, delete second: a dict must not change size while
        # it is being iterated.
        expired_ids = []
        for job_id, job in _diag_jobs.items():
            if job.get('created_at', 0) < expiry:
                expired_ids.append(job_id)
        for job_id in expired_ids:
            del _diag_jobs[job_id]
def _config() -> dict:
global _cfg
@@ -223,6 +241,115 @@ def api_delete_suppression(sup_id: int):
return jsonify({'success': True})
@app.route('/api/diagnose', methods=['POST'])
@require_auth
def api_diagnose_start():
    """Start a link diagnostic job. Returns {job_id}.

    Expects a JSON body ``{"switch_name": str, "port_idx": <value>}``.  The
    switch port is looked up in the cached ``link_stats`` state, its LLDP
    neighbour is used to pick the server and interface to diagnose, and the
    diagnostic itself runs in a daemon thread.  Clients poll
    ``GET /api/diagnose/<job_id>`` for the outcome.

    Error responses: 400 for a bad request or an unresolvable target,
    404 for an unknown switch/port/host, 503 when no link_stats are cached,
    500 when the cached link_stats cannot be decoded.
    """
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        # A JSON array/scalar body would otherwise crash on .get() below.
        data = {}

    switch_name = data.get('switch_name')
    switch_name = switch_name.strip() if isinstance(switch_name, str) else ''
    port_idx = data.get('port_idx')
    if not switch_name or port_idx is None:
        return jsonify({'error': 'switch_name and port_idx required'}), 400

    # Look up switch + port in cached link_stats
    raw = db.get_state('link_stats')
    if not raw:
        return jsonify({'error': 'No link_stats data available'}), 503
    try:
        link_data = json.loads(raw)
    except (TypeError, ValueError):
        logger.error('Failed to parse link_stats JSON in /api/diagnose')
        return jsonify({'error': 'Internal data error'}), 500

    switches = link_data.get('unifi_switches', {})
    sw = switches.get(switch_name)
    if not sw:
        return jsonify({'error': f'Switch "{switch_name}" not found'}), 404

    # Find port by port_idx
    port_data = None
    for pname, pd in sw.get('ports', {}).items():
        if pd.get('port_idx') == port_idx:
            port_data = dict(pd)  # copy so the cached structure is untouched
            port_data['name'] = pname
            break
    if not port_data:
        return jsonify({'error': f'Port {port_idx} not found on switch "{switch_name}"'}), 404

    # LLDP neighbor required to know which host+iface to SSH into
    lldp = port_data.get('lldp')
    if not lldp or not lldp.get('system_name'):
        return jsonify({'error': 'No LLDP neighbor data for this port'}), 400
    server_name = lldp['system_name']
    lldp_port_id = lldp.get('port_id', '')

    # Find matching host + interface in link_stats hosts
    hosts = link_data.get('hosts', {})
    server_ifaces = hosts.get(server_name)
    if not server_ifaces:
        return jsonify({'error': f'Host "{server_name}" not in link stats'}), 404

    # Match interface by LLDP port_id: exact, then substring either way,
    # then fall back to the host's first known interface.
    matched_iface = None
    if lldp_port_id and lldp_port_id in server_ifaces:
        matched_iface = lldp_port_id
    if not matched_iface and lldp_port_id:
        matched_iface = next(
            (k for k in server_ifaces if lldp_port_id in k or k in lldp_port_id),
            None,
        )
    if not matched_iface:
        matched_iface = next(iter(server_ifaces), None)
    if not matched_iface:
        return jsonify({'error': 'Cannot determine server interface'}), 400

    # Resolve host IP from link_stats host data
    host_ip = (server_ifaces.get(matched_iface) or {}).get('host_ip')
    if not host_ip:
        # Fallback: use LLDP mgmt IPs
        mgmt_ips = lldp.get('mgmt_ips') or []
        host_ip = mgmt_ips[0] if mgmt_ips else None
    if not host_ip:
        return jsonify({'error': 'Cannot determine host IP for SSH'}), 400

    _purge_old_jobs()
    job_id = str(uuid.uuid4())
    with _diag_lock:
        _diag_jobs[job_id] = {'status': 'running', 'result': None, 'created_at': time.time()}

    def _run():
        """Worker body: run the diagnostic and publish its result."""
        try:
            cfg = _config()
            pulse = PulseClient(cfg)
            runner = diagnose.DiagnosticsRunner(pulse)
            result = runner.run(host_ip, server_name, matched_iface, port_data)
        except Exception as e:  # thread boundary: report instead of dying silently
            logger.exception('Diagnostic job %s failed: %s', job_id, e)
            result = {'status': 'error', 'error': str(e)}
        with _diag_lock:
            job = _diag_jobs.get(job_id)  # may have been purged meanwhile
            if job is not None:
                # Store the result BEFORE flipping the status so a reader
                # that sees 'done' is guaranteed to also see the result.
                job['result'] = result
                job['status'] = 'done'

    t = threading.Thread(target=_run, daemon=True)
    t.start()
    return jsonify({'job_id': job_id})
@app.route('/api/diagnose/<job_id>', methods=['GET'])
@require_auth
def api_diagnose_poll(job_id: str):
    """Poll a diagnostic job. Returns {status, result}.

    Responds 404 with ``{"error": "Job not found"}`` when the id is unknown
    (never created, or already purged from the in-memory store).
    """
    with _diag_lock:
        job = _diag_jobs.get(job_id)
        # Copy both fields while the lock is held: the worker thread updates
        # them in two steps, and reading after release could pair a 'done'
        # status with a result that has not been stored yet.
        snapshot = (job['status'], job.get('result')) if job else None
    if snapshot is None:
        return jsonify({'error': 'Job not found'}), 404
    status, result = snapshot
    return jsonify({'status': status, 'result': result})
@app.route('/health')
def health():
"""Health check endpoint (no auth)."""

546
diagnose.py Normal file
View File

@@ -0,0 +1,546 @@
"""Gandalf Link Diagnostics module.
Runs a comprehensive SSH-based diagnostic against a server NIC and
analyses the result against switch port data to surface root causes.
Executed in a background thread; results stored in _diag_jobs (app.py).
"""
import re
import shlex
import time
import logging
from typing import Dict, List, Optional, Tuple
logger = logging.getLogger('gandalf.diagnose')

# Counter files read from /sys/class/net/<iface>/statistics/.  The same names
# are the keys of the parsed ``sysfs_stats`` section.
_SYSFS_STATS = [
    'rx_bytes', 'tx_bytes', 'rx_errors', 'tx_errors',
    'rx_dropped', 'tx_dropped', 'rx_crc_errors',
    'rx_frame_errors', 'rx_fifo_errors', 'tx_carrier_errors',
    'collisions', 'rx_missed_errors',
]


class DiagnosticsRunner:
    """Build and run a link diagnostic against a server NIC via PulseClient.

    The runner composes one SSH command that prints a series of
    ``=== name ===`` delimited sections, submits it through the Pulse client,
    parses every section into plain dicts/lists and derives a health summary
    by comparing the server-side view with the switch port data.
    """

    # Template for the link back to the raw Pulse execution.
    PULSE_EXECUTION_URL = 'http://pulse.lotusguild.org/executions/{execution_id}'

    # ethtool prints "-inf dBm" when the measured optical power is 0 mW.
    # Infinity cannot be serialised as JSON, so it is clamped to this floor.
    # -40 dBm equals 0.1 uW; NOTE(review): chosen as the smallest non-zero
    # SFF-8472 power reading -- confirm it suits the modules in use.
    DOM_POWER_FLOOR_DBM = -40.0

    _SECTION_RE = re.compile(r'^=== (.+?) ===$')
    _DBM_RE = re.compile(r'/\s*(-inf|-?\d+(?:\.\d+)?)\s*dBm', re.IGNORECASE)
    _TEMP_RE = re.compile(r'(-?\d+(?:\.\d+)?)\s+degrees')
    _BIAS_RE = re.compile(r'(\d+(?:\.\d+)?)\s+mA')
    _VOLT_RE = re.compile(r'(\d+(?:\.\d+)?)\s+V')
    _DMESG_TS_RE = re.compile(r'^\[\s*([\d.]+)\]\s*(.*)')

    _DMESG_ERROR_WORDS = ('error', 'fail', 'reset', 'panic', 'oops', 'hung', 'timeout')
    _DMESG_WARN_WORDS = ('warn', 'drop', 'lost', 'miss')

    def __init__(self, pulse_client):
        """Store the Pulse client used to execute commands.

        ``pulse_client`` must offer ``run_command(cmd)`` returning the command
        output or ``None``, and may expose ``last_execution_id``.
        """
        self.pulse = pulse_client

    # ------------------------------------------------------------------
    # SSH command builder
    # ------------------------------------------------------------------
    @staticmethod
    def build_ssh_command(host_ip: str, iface: str) -> str:
        """Return a single-line SSH command that collects all diagnostic data.

        Two layers of quoting are applied because the command passes through
        two shells: ``iface`` is quoted for the *remote* shell, and the whole
        remote script is then quoted again for the *local* shell that starts
        ssh.  Wrapping the script in literal single quotes instead would let
        an interface name containing a quote terminate the argument and run
        arbitrary commands locally.
        """
        q = shlex.quote(iface)
        ip_q = shlex.quote(host_ip)

        sysfs_loop = '; '.join(
            f'echo "{s}:$(cat /sys/class/net/{q}/statistics/{s} 2>/dev/null || echo 0)"'
            for s in _SYSFS_STATS
        )

        remote_cmd = (
            f'echo "=== carrier ===";'
            f' cat /sys/class/net/{q}/carrier 2>/dev/null || echo "?";'
            f' echo "=== operstate ===";'
            f' cat /sys/class/net/{q}/operstate 2>/dev/null || echo "?";'
            f' echo "=== sysfs_stats ===";'
            f' {sysfs_loop};'
            f' echo "=== carrier_changes ===";'
            f' cat /sys/class/net/{q}/carrier_changes 2>/dev/null || echo "0";'
            f' echo "=== ethtool ===";'
            f' ethtool {q} 2>/dev/null;'
            f' echo "=== ethtool_driver ===";'
            f' ethtool -i {q} 2>/dev/null;'
            f' echo "=== ethtool_pause ===";'
            f' ethtool -a {q} 2>/dev/null;'
            f' echo "=== ethtool_ring ===";'
            f' ethtool -g {q} 2>/dev/null;'
            f' echo "=== ethtool_stats ===";'
            f' ethtool -S {q} 2>/dev/null;'
            f' echo "=== ethtool_dom ===";'
            f' ethtool -m {q} 2>/dev/null;'
            f' echo "=== ip_link ===";'
            f' ip -s link show {q} 2>/dev/null;'
            f' echo "=== ip_addr ===";'
            f' ip addr show {q} 2>/dev/null;'
            f' echo "=== ip_route ===";'
            f' ip route show dev {q} 2>/dev/null;'
            f' echo "=== dmesg ===";'
            # -F: match the name literally, not as a regex; --: never an option
            f' dmesg 2>/dev/null | grep -F -- {q} | tail -50;'
            f' echo "=== lldpctl ===";'
            f' lldpctl 2>/dev/null || echo "lldpd not running";'
            f' echo "=== end ==="'
        )

        return (
            'ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5 '
            f'-o LogLevel=ERROR root@{ip_q} {shlex.quote(remote_cmd)}'
        )

    # ------------------------------------------------------------------
    # Main entry point
    # ------------------------------------------------------------------
    def run(self, host_ip: str, host_name: str, iface: str,
            switch_port_data: dict) -> dict:
        """Execute diagnostics and return structured result dict.

        On failure the dict has ``status == 'error'`` plus ``error``,
        ``host``, ``iface`` and ``pulse_execution_id``.  On success it has
        ``status == 'done'`` plus ``sections``, ``health``, ``pulse_url`` and
        the ``switch_port`` data echoed back.
        """
        cmd = self.build_ssh_command(host_ip, iface)
        logger.info('Running link diagnostic: %s/%s via Pulse', host_name, iface)

        # Reset execution_id before call so a stale id is never reported
        self.pulse.last_execution_id = None
        output = self.pulse.run_command(cmd)
        execution_id = getattr(self.pulse, 'last_execution_id', None)

        if output is None:
            return {
                'status': 'error',
                'error': 'Pulse command failed or timed out',
                'host': host_name,
                'iface': iface,
                'pulse_execution_id': execution_id,
            }

        sections = self.parse_output(output)
        health = self.analyze(sections, switch_port_data)

        pulse_url = None
        if execution_id:
            pulse_url = self.PULSE_EXECUTION_URL.format(execution_id=execution_id)

        return {
            'status': 'done',
            'host': host_name,
            'iface': iface,
            'sections': sections,
            'health': health,
            'pulse_execution_id': execution_id,
            'pulse_url': pulse_url,
            'switch_port': switch_port_data,
        }

    # ------------------------------------------------------------------
    # Output parser (splits on === SECTION_NAME === sentinels)
    # ------------------------------------------------------------------
    @staticmethod
    def parse_output(raw: str) -> dict:
        """Split raw command output into sections and parse each of them.

        Text before the first sentinel and after ``=== end ===`` is ignored.
        Missing sections fall back to the same defaults the remote command
        would print on failure ('?', '0' or empty).
        """
        sections: Dict[str, str] = {}
        current: Optional[str] = None
        buf: List[str] = []

        for line in raw.splitlines():
            m = DiagnosticsRunner._SECTION_RE.match(line.strip())
            if m:
                if current:
                    sections[current] = '\n'.join(buf).strip()
                name = m.group(1)
                # 'end' closes the last section without opening a new one
                current = None if name == 'end' else name
                buf = []
            elif current:
                buf.append(line)
        if current:
            # Output was truncated before the end sentinel; keep what we have
            sections[current] = '\n'.join(buf).strip()

        cls = DiagnosticsRunner
        parsed: dict = {}

        # Simple string sections
        parsed['carrier'] = sections.get('carrier', '?').strip()
        parsed['operstate'] = sections.get('operstate', '?').strip()

        # carrier_changes
        try:
            parsed['carrier_changes'] = int(sections.get('carrier_changes', '0').strip())
        except ValueError:
            parsed['carrier_changes'] = None

        # Structured sections
        parsed['sysfs_stats'] = cls.parse_sysfs_stats(sections.get('sysfs_stats', ''))
        parsed['ethtool'] = cls.parse_ethtool(sections.get('ethtool', ''))
        parsed['ethtool_driver'] = cls.parse_ethtool_driver(sections.get('ethtool_driver', ''))
        parsed['ethtool_pause'] = cls.parse_ethtool_pause(sections.get('ethtool_pause', ''))
        parsed['ethtool_ring'] = cls.parse_ethtool_ring(sections.get('ethtool_ring', ''))
        parsed['ethtool_stats'] = cls.parse_nic_stats(sections.get('ethtool_stats', ''))
        parsed['ethtool_dom'] = cls.parse_ethtool_dom(sections.get('ethtool_dom', ''))
        parsed['ip_link'] = cls.parse_ip_link(sections.get('ip_link', ''))
        parsed['ip_addr'] = sections.get('ip_addr', '').strip()
        parsed['ip_route'] = sections.get('ip_route', '').strip()
        parsed['dmesg'] = cls.parse_dmesg(sections.get('dmesg', ''))
        parsed['lldpctl'] = cls.parse_lldpctl(sections.get('lldpctl', ''))

        return parsed

    # ------------------------------------------------------------------
    # Individual parsers
    # ------------------------------------------------------------------
    @staticmethod
    def _key_values(text: str):
        """Yield stripped ``(key, value)`` pairs from every 'key: value' line."""
        for line in text.splitlines():
            if ':' not in line:
                continue
            key, _, val = line.partition(':')
            yield key.strip(), val.strip()

    @staticmethod
    def parse_sysfs_stats(text: str) -> dict:
        """Parse 'name:value' counter lines into {name: int}.

        Only names listed in ``_SYSFS_STATS`` are kept; a non-numeric value
        is recorded as 0.
        """
        result: dict = {}
        for key, val in DiagnosticsRunner._key_values(text):
            if key in _SYSFS_STATS:
                try:
                    result[key] = int(val)
                except ValueError:
                    result[key] = 0
        return result

    @staticmethod
    def parse_ethtool(text: str) -> dict:
        """Parse ethtool <iface> output.

        Possible keys: speed_mbps (int, or None when reported as unknown),
        duplex, port_type, auto_neg, link_detected, supported_modes.
        """
        data: dict = {}
        for key, val in DiagnosticsRunner._key_values(text):
            if key == 'Speed':
                m = re.match(r'(\d+)\s*Mb/s', val)
                if m:
                    data['speed_mbps'] = int(m.group(1))
                elif 'unknown' in val.lower():
                    data['speed_mbps'] = None
            elif key == 'Duplex':
                data['duplex'] = val.lower()
            elif key == 'Port':
                data['port_type'] = val
            elif key == 'Auto-negotiation':
                data['auto_neg'] = (val.lower() == 'on')
            elif key == 'Link detected':
                data['link_detected'] = (val.lower() == 'yes')
            elif 'Supported link modes' in key:
                # Only the first mode line carries a colon; continuation
                # lines are skipped by the key/value splitter.
                data.setdefault('supported_modes', []).append(val)
        return data

    @staticmethod
    def parse_ethtool_driver(text: str) -> dict:
        """Parse ethtool -i output into driver/version/firmware/bus fields."""
        wanted = {
            'driver': 'driver',
            'version': 'version',
            'firmware-version': 'firmware_version',
            'bus-info': 'bus_info',
        }
        data: dict = {}
        for key, val in DiagnosticsRunner._key_values(text):
            if key in wanted:
                data[wanted[key]] = val
        return data

    @staticmethod
    def parse_ethtool_pause(text: str) -> dict:
        """Parse ethtool -a output; both flags default to False."""
        data = {'rx_pause': False, 'tx_pause': False}
        for key, val in DiagnosticsRunner._key_values(text):
            if key == 'RX':
                data['rx_pause'] = (val.lower() == 'on')
            elif key == 'TX':
                data['tx_pause'] = (val.lower() == 'on')
        return data

    @staticmethod
    def parse_ethtool_ring(text: str) -> dict:
        """Parse ethtool -g output into rx/tx maximum and current ring sizes.

        Within each group only the first integer RX line and the first
        integer TX line are used, so variants such as 'RX Mini' or
        'RX Jumbo' never override the main ring size.
        """
        data: dict = {}
        in_current = False
        for line in text.splitlines():
            if 'Current hardware settings' in line:
                in_current = True
                continue
            if 'Pre-set maximums' in line:
                in_current = False
                continue
            if ':' not in line:
                continue
            key, _, val = line.partition(':')
            key = key.strip()
            try:
                v = int(val.strip())
            except ValueError:
                continue  # e.g. "n/a"
            suffix = 'current' if in_current else 'max'
            if 'RX' in key and f'rx_{suffix}' not in data:
                data[f'rx_{suffix}'] = v
            elif 'TX' in key and f'tx_{suffix}' not in data:
                data[f'tx_{suffix}'] = v
        return data

    @staticmethod
    def parse_nic_stats(text: str) -> dict:
        """Parse ethtool -S output into {key: int} dict."""
        data: dict = {}
        for key, val in DiagnosticsRunner._key_values(text):
            try:
                data[key] = int(val)
            except ValueError:
                pass  # header line or non-numeric statistic: not a counter
        return data

    @staticmethod
    def _parse_dbm(val: str) -> Optional[float]:
        """Return the dBm figure from an '<x> mW / <y> dBm' value, if any."""
        m = DiagnosticsRunner._DBM_RE.search(val)
        if not m:
            return None
        token = m.group(1)
        if token.lower() == '-inf':
            # No light at all: report the floor instead of dropping the
            # reading, so the health analysis can still flag it.
            return DiagnosticsRunner.DOM_POWER_FLOOR_DBM
        return float(token)

    @staticmethod
    def parse_ethtool_dom(text: str) -> dict:
        """Parse ethtool -m (SFP DOM) output.

        Returns {} when there is no output or the output says the module
        data is unavailable.  Power readings are in dBm.
        """
        if not text:
            return {}
        lower = text.lower()
        if any(s in lower for s in ('cannot get', 'not supported', 'no sfp')):
            return {}

        cls = DiagnosticsRunner
        data: dict = {}
        for key, val in cls._key_values(text):
            key_l = key.lower()
            if key == 'Vendor name':
                data['vendor'] = val
            elif key == 'Vendor PN':
                data['part_no'] = val
            elif key == 'Identifier':
                m = re.search(r'\((.+?)\)', val)
                if m:
                    data['sfp_type'] = m.group(1)
            elif key == 'Connector':
                m = re.search(r'\((.+?)\)', val)
                if m:
                    data['connector'] = m.group(1)
            elif key == 'Laser wavelength':
                m = re.match(r'(\d+)', val)
                if m:
                    data['wavelength_nm'] = int(m.group(1))
            elif key == 'Laser bias current':
                m = cls._BIAS_RE.match(val)
                if m:
                    data['bias_ma'] = float(m.group(1))
            elif key == 'Laser output power':
                dbm = cls._parse_dbm(val)
                if dbm is not None:
                    data['tx_power_dbm'] = dbm
            elif 'receiver' in key_l and ('power' in key_l or 'optical' in key_l):
                dbm = cls._parse_dbm(val)
                if dbm is not None:
                    data['rx_power_dbm'] = dbm
            elif key == 'Module temperature':
                m = cls._TEMP_RE.match(val)  # sign allowed: sub-zero modules
                if m:
                    data['temp_c'] = float(m.group(1))
            elif key == 'Module voltage':
                m = cls._VOLT_RE.match(val)
                if m:
                    data['voltage_v'] = float(m.group(1))
        return data

    @staticmethod
    def _ip_counters(prefix: str, values_line: str) -> dict:
        """Parse the numeric row that follows an 'RX:'/'TX:' header line."""
        vals = values_line.split()
        if len(vals) < 5:
            return {}
        try:
            numbers = [int(v) for v in vals[:4]]
        except ValueError:
            return {}
        names = ('bytes', 'packets', 'errors', 'dropped')
        return {f'ip_{prefix}_{name}': num for name, num in zip(names, numbers)}

    @staticmethod
    def parse_ip_link(text: str) -> dict:
        """Parse ip -s link show output for basic link state and counters."""
        data: dict = {}
        lines = text.splitlines()
        for i, line in enumerate(lines):
            # MTU and state: "2: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 ..."
            m = re.search(r'mtu\s+(\d+)', line)
            if m:
                data['mtu'] = int(m.group(1))
            m = re.search(r'state\s+(\S+)', line)
            if m:
                data['state'] = m.group(1).lower()
            # The counter values sit on the line after the "RX:"/"TX:" label
            stripped = line.strip()
            if i + 1 < len(lines):
                if stripped.startswith('RX:'):
                    data.update(DiagnosticsRunner._ip_counters('rx', lines[i + 1]))
                if stripped.startswith('TX:'):
                    data.update(DiagnosticsRunner._ip_counters('tx', lines[i + 1]))
        return data

    @staticmethod
    def parse_dmesg(text: str) -> List[dict]:
        """Parse dmesg lines into [{timestamp, msg, severity}].

        Severity is a keyword heuristic over the message text: 'error',
        'warn' or 'info'.
        """
        cls = DiagnosticsRunner
        events = []
        for line in text.splitlines():
            if not line.strip():
                continue
            # Extract timestamp from [ 123.456789]
            m = cls._DMESG_TS_RE.match(line)
            if m:
                ts, msg = m.group(1), m.group(2)
            else:
                ts, msg = '', line
            lower = msg.lower()
            if any(w in lower for w in cls._DMESG_ERROR_WORDS):
                severity = 'error'
            elif any(w in lower for w in cls._DMESG_WARN_WORDS):
                severity = 'warn'
            else:
                severity = 'info'
            events.append({'timestamp': ts, 'msg': msg, 'severity': severity})
        return events

    @staticmethod
    def parse_lldpctl(text: str) -> dict:
        """Extract neighbor info from lldpctl output.

        The command is not restricted to one interface, so the output may
        list several neighbours.  All three fields are taken from their first
        occurrence so they describe the same neighbour rather than mixing
        the first port with the last system name.
        """
        if not text or 'lldpd not running' in text or 'not found' in text.lower():
            return {'available': False}
        data: dict = {'available': True}
        for key, val in DiagnosticsRunner._key_values(text):
            if 'SysName' in key:
                data.setdefault('neighbor_system', val)
            elif 'PortID' in key:
                data.setdefault('neighbor_port', val)
            elif 'ChassisID' in key:
                data.setdefault('neighbor_chassis_id', val)
        return data

    # ------------------------------------------------------------------
    # Health analysis
    # ------------------------------------------------------------------
    @staticmethod
    def analyze(sections: dict, switch_port_data: dict) -> dict:
        """Return {issues: [...], warnings: [...], info: [...]} health analysis.

        Every entry is ``{'code': str, 'message': str}``.  ``sections`` is the
        dict produced by ``parse_output``; ``switch_port_data`` is the switch
        side view of the same link and may be empty.
        """
        switch_port_data = switch_port_data or {}
        issues: List[dict] = []
        warnings: List[dict] = []
        info: List[dict] = []

        def add(collection, code, message):
            collection.append({'code': code, 'message': message})

        carrier = sections.get('carrier', '?')
        eth = sections.get('ethtool', {})
        sysfs = sections.get('sysfs_stats', {})
        dom = sections.get('ethtool_dom', {})
        dmesg = sections.get('dmesg', [])
        lldp = sections.get('lldpctl', {})
        cc = sections.get('carrier_changes')

        # Physical carrier
        if carrier == '0':
            add(issues, 'NO_CARRIER',
                'No physical carrier — cable/SFP disconnected or switch port disabled')
        elif eth.get('link_detected') is False:
            add(issues, 'LINK_NOT_DETECTED',
                'NIC does not detect link signal despite carrier sysfs reading non-zero')

        # Duplex
        if eth.get('duplex') == 'half':
            add(issues, 'HALF_DUPLEX',
                'Half-duplex detected — likely duplex mismatch; force full-duplex on both ends')

        # Speed mismatch (switch vs server NIC); unknown speeds count as 0
        sw_speed = switch_port_data.get('speed_mbps', 0) or 0
        srv_speed = eth.get('speed_mbps', 0) or 0
        if sw_speed > 0 and srv_speed > 0 and sw_speed != srv_speed:
            add(warnings, 'SPEED_MISMATCH',
                f'Speed mismatch: switch reports {sw_speed} Mbps, NIC reports {srv_speed} Mbps')

        # SFP DOM power levels
        rx_dbm = dom.get('rx_power_dbm')
        tx_dbm = dom.get('tx_power_dbm')
        if rx_dbm is not None:
            if rx_dbm < -25:
                add(issues, 'SFP_RX_CRITICAL',
                    f'RX power critically low ({rx_dbm:.2f} dBm) — fiber not connected or SFP failed')
            elif rx_dbm < -18:
                add(warnings, 'SFP_RX_LOW',
                    f'RX power low ({rx_dbm:.2f} dBm) — check fiber cleanliness and SFP seating')
        if tx_dbm is not None and tx_dbm < -10:
            add(warnings, 'SFP_TX_LOW',
                f'TX power low ({tx_dbm:.2f} dBm) — SFP may be failing or requires cleaning')

        # Carrier changes (flapping)
        if cc is not None:
            if cc > 100:
                add(issues, 'CARRIER_FLAPPING',
                    f'Link has flapped {cc} times — severe physical instability')
            elif cc > 20:
                add(warnings, 'CARRIER_FLAPS',
                    f'Link has flapped {cc} times — intermittent physical issue')

        # CRC errors
        crc = sysfs.get('rx_crc_errors', 0) or 0
        if crc > 100:
            add(issues, 'CRC_ERRORS_HIGH',
                f'High CRC error count ({crc}) — dirty fiber/connector or cable damage')
        elif crc > 10:
            add(warnings, 'CRC_ERRORS_LOW',
                f'CRC errors present ({crc}) — cable or SFP quality issue')

        # Kernel events
        err_events = [e for e in dmesg if e.get('severity') == 'error']
        if err_events:
            add(warnings, 'KERNEL_EVENTS',
                f'{len(err_events)} recent kernel error event(s) for this interface in dmesg')

        # LLDP validation
        # NOTE(review): the switch-side system_name appears to be the name of
        # the server (it is what the caller resolves the SSH target from),
        # while the server's lldpctl SysName is presumably the switch.  If so
        # these two values describe opposite ends of the link and can never
        # match -- confirm what this check is meant to compare.
        if lldp.get('available'):
            sw_lldp = switch_port_data.get('lldp') or {}
            sw_system = (sw_lldp.get('system_name') or '').lower()
            srv_neighbor = (lldp.get('neighbor_system') or '').lower()
            if sw_system and srv_neighbor and sw_system not in srv_neighbor and srv_neighbor not in sw_system:
                add(warnings, 'LLDP_MISMATCH',
                    f'LLDP mismatch: switch sees "{sw_lldp.get("system_name")}" but '
                    f'server lldpctl sees "{lldp.get("neighbor_system")}" — cross-cabled port?')
        else:
            add(info, 'LLDP_MISSING',
                'lldpd not running on server — install lldpd for full path validation')

        return {'issues': issues, 'warnings': warnings, 'info': info}

View File

@@ -239,6 +239,7 @@ class PulseClient:
self.api_key = p.get('api_key', '')
self.worker_id = p.get('worker_id', '')
self.timeout = p.get('timeout', 45)
self.last_execution_id: Optional[str] = None
self.session = requests.Session()
self.session.headers.update({
'X-Gandalf-API-Key': self.api_key,
@@ -247,6 +248,7 @@ class PulseClient:
def run_command(self, command: str) -> Optional[str]:
"""Submit *command* to Pulse, poll until done, return stdout or None."""
self.last_execution_id = None
if not self.url or not self.api_key or not self.worker_id:
return None
try:
@@ -257,6 +259,7 @@ class PulseClient:
)
resp.raise_for_status()
execution_id = resp.json()['execution_id']
self.last_execution_id = execution_id
except Exception as e:
logger.debug(f'Pulse command submit failed: {e}')
return None

View File

@@ -1133,6 +1133,283 @@ a:hover { text-decoration: underline; text-shadow: var(--glow-amber); }
}
.path-dom-row span:first-child { color:var(--text-muted); }
/* ── Link Diagnostics ─────────────────────────────────────────────── */
/* Row holding the diagnostics button and its status text; the top border
   separates it from the panel content above. */
.diag-bar {
display: flex;
align-items: center;
gap: 10px;
margin-top: 14px;
padding-top: 10px;
border-top: 1px solid var(--border);
}
/* Outlined cyan button; the diag-pulse animation loops continuously. */
.btn-diag {
font-family: var(--font);
font-size: .65em;
color: var(--cyan);
background: transparent;
border: 1px solid var(--cyan);
padding: 4px 10px;
cursor: pointer;
letter-spacing: .04em;
transition: background .15s, box-shadow .15s;
animation: diag-pulse 2.5s ease-in-out infinite;
}
.btn-diag:hover {
background: var(--cyan-dim);
box-shadow: var(--glow-cyan);
}
/* Glow fades in at the midpoint of the cycle and out again. */
@keyframes diag-pulse {
0%, 100% { box-shadow: none; }
50% { box-shadow: 0 0 6px rgba(0,255,255,.4); }
}
/* Progress / status text next to the button. */
.diag-status {
font-size: .6em;
color: var(--text-muted);
font-style: italic;
}
/* Error message shown in place of results. */
.diag-error {
color: var(--red);
font-size: .65em;
margin-top: 8px;
}
/* Container the rendered results are injected into. */
.diag-results {
margin-top: 4px;
}
/* Stacks the result sections vertically with a small gap. */
.diag-results-inner {
display: flex;
flex-direction: column;
gap: 6px;
}
/* Health banner */
/* Row of badges summarising the result; one of the three badge classes
   below is used per badge. */
.diag-health-banner {
display: flex;
gap: 8px;
padding: 6px 0 4px;
margin-bottom: 2px;
}
/* The three badge variants share size and weight and differ only in
   their colour triplet (red / amber / green). */
.diag-health-critical {
background: var(--red-dim);
color: var(--red);
border: 1px solid var(--red);
padding: 2px 8px;
font-size: .62em;
font-weight: bold;
letter-spacing: .05em;
}
.diag-health-warning {
background: var(--amber-dim);
color: var(--amber);
border: 1px solid var(--amber);
padding: 2px 8px;
font-size: .62em;
font-weight: bold;
letter-spacing: .05em;
}
.diag-health-ok {
background: var(--green-dim);
color: var(--green);
border: 1px solid var(--green);
padding: 2px 8px;
font-size: .62em;
font-weight: bold;
letter-spacing: .05em;
}
/* Issue list */
/* One row per finding, stacked vertically. */
.diag-issue-list {
display: flex;
flex-direction: column;
gap: 3px;
}
.diag-issue-row {
font-size: .62em;
padding: 3px 6px;
background: var(--bg2);
border-left: 2px solid var(--border);
line-height: 1.4;
}
/* Check code (e.g. NO_CARRIER) inside an issue row. */
.diag-code {
font-weight: bold;
color: var(--amber);
}
/* Sections */
/* Bordered card wrapping one group of results. */
.diag-section {
background: var(--bg2);
border: 1px solid rgba(0,255,65,.12);
}
.diag-section-header {
font-size: .62em;
font-weight: bold;
color: var(--amber);
padding: 4px 8px;
letter-spacing: .04em;
border-bottom: 1px solid rgba(0,255,65,.12);
background: rgba(255,176,0,.04);
}
/* Collapsible sections */
/* The body is hidden until .diag-open is also present on the section. */
.diag-collapsible .diag-section-body {
display: none;
}
.diag-collapsible.diag-open .diag-section-body {
display: block;
}
/* Clickable header of a collapsible section. */
.diag-toggle {
cursor: pointer;
user-select: none;
}
.diag-toggle-hint {
font-weight: normal;
color: var(--text-muted);
font-size: .9em;
}
/* NOTE(review): content is an empty string here, so this rule renders
   nothing — possibly an indicator glyph was lost; confirm. */
.diag-collapsible.diag-open .diag-toggle-hint::after {
content: '';
}
/* Data tables */
/* Two-column label/value table: muted label on the left, bold value on
   the right. */
.diag-table {
width: 100%;
border-collapse: collapse;
font-size: .62em;
}
.diag-table td {
padding: 3px 8px;
vertical-align: top;
}
.diag-table td:first-child {
color: var(--text-muted);
width: 40%;
white-space: nowrap;
}
.diag-table td:last-child {
color: var(--text-dim);
font-weight: bold;
word-break: break-all;
}
/* Faint striping on every second row. */
.diag-table tr:nth-child(even) {
background: rgba(0,255,65,.025);
}
/* Value colour classes */
/* Text colours for good / warning / bad values. */
.diag-val-good { color: var(--green); }
.diag-val-warn { color: var(--amber); }
.diag-val-bad { color: var(--red); }
/* SFP power bar */
/* Fixed-size track; overflow stays visible so the zone markers, which
   extend 2px above and below, are not clipped. */
.diag-power-bar-wrap {
position: relative;
display: inline-block;
width: 60px;
height: 7px;
background: var(--bg3);
border: 1px solid var(--border);
vertical-align: middle;
margin-left: 6px;
overflow: visible;
}
/* Fill anchored to the left edge of the track; no width is set here. */
.diag-power-bar {
display: inline-block;
position: absolute;
left: 0;
top: 0;
height: 100%;
}
/* On the bar the value classes set the fill colour instead of text colour. */
.diag-power-bar.diag-val-good { background: var(--green); }
.diag-power-bar.diag-val-warn { background: var(--amber); }
.diag-power-bar.diag-val-bad { background: var(--red); }
/* 1px vertical markers for the warning and critical zones; no horizontal
   position is set here. */
.diag-power-zone-warn,
.diag-power-zone-crit {
position: absolute;
top: -2px;
width: 1px;
height: calc(100% + 4px);
pointer-events: none;
}
.diag-power-zone-warn { background: var(--amber); opacity: .7; }
.diag-power-zone-crit { background: var(--red); opacity: .7; }
/* ethtool -S stat table */
/* Denser than .diag-table; values are right-aligned. */
.diag-stat-table {
width: 100%;
border-collapse: collapse;
font-size: .58em;
}
.diag-stat-table td {
padding: 2px 8px;
}
.diag-stat-table td:first-child { color: var(--text-muted); }
.diag-stat-table td:last-child { color: var(--text-dim); text-align: right; }
/* Amber highlight for a row (background plus cell text). */
.diag-stat-nonzero-warn {
background: var(--amber-dim);
}
.diag-stat-nonzero-warn td { color: var(--amber); }
/* dmesg */
/* Scrollable box capped at 200px height. */
.diag-dmesg-wrap {
max-height: 200px;
overflow-y: auto;
padding: 6px 8px;
}
/* One log line; whitespace is preserved and long lines wrap. */
.diag-dmesg-line {
font-family: var(--font);
font-size: .58em;
white-space: pre-wrap;
word-break: break-all;
padding: 1px 0;
color: var(--text-dim);
}
.diag-dmesg-warn { color: var(--amber); }
.diag-dmesg-err { color: var(--red); }
/* Pulse link */
/* Right-aligned link to the Pulse execution. */
.diag-pulse-link {
font-size: .62em;
padding: 4px 0;
text-align: right;
}
.diag-pulse-link a {
color: var(--cyan);
text-decoration: none;
}
.diag-pulse-link a:hover {
text-shadow: var(--glow-cyan);
}
/* ── Responsive ───────────────────────────────────────────────────── */
@media (max-width: 768px) {
.host-grid { grid-template-columns:1fr; }

View File

@@ -264,6 +264,16 @@ function renderPanel(swName, idx) {
}
}
// Diagnose button (only when LLDP has an identified neighbor we can map)
const hasDiagTarget = !!(d.lldp && d.lldp.system_name &&
_apiData.hosts && _apiData.hosts[d.lldp.system_name]);
const diagHtml = hasDiagTarget ? `
<div class="diag-bar">
<button class="btn-diag" onclick="runDiagnostic('${escHtml(swName)}', ${idx})">Run Link Diagnostics</button>
<span class="diag-status" id="diag-status"></span>
</div>
<div class="diag-results" id="diag-results"></div>` : '';
const inner = document.getElementById('inspector-panel-inner');
inner.innerHTML = `
<div class="panel-header">
@@ -286,6 +296,7 @@ function renderPanel(swName, idx) {
${errHtml}
${lldpHtml}
${pathHtml}
${diagHtml}
`;
document.getElementById('inspector-panel').classList.add('open');
@@ -387,5 +398,313 @@ async function loadInspector() {
loadInspector();
setInterval(loadInspector, 60000);
// ── Link Diagnostics ─────────────────────────────────────────────────
// Handle of the active result poller, or null when none is running.
let _diagPollTimer = null;
// Id of the most recent diagnostic request. A response that belongs to an
// older request is ignored, so a slow earlier POST cannot start a second
// poller after the user has clicked again.
let _diagRequestSeq = 0;

/**
 * Start a link diagnostic for one switch port and begin polling for results.
 *
 * @param {string} swName  switch name, sent as `switch_name`
 * @param {number} portIdx port index, sent as `port_idx`
 */
function runDiagnostic(swName, portIdx) {
    const statusEl  = document.getElementById('diag-status');
    const resultsEl = document.getElementById('diag-results');
    if (!statusEl || !resultsEl) return;

    // Clear any previous poll
    if (_diagPollTimer) { clearInterval(_diagPollTimer); _diagPollTimer = null; }
    const seq = ++_diagRequestSeq;
    const isStale = () => seq !== _diagRequestSeq;

    statusEl.textContent = 'Submitting to Pulse...';
    resultsEl.innerHTML = '';

    fetch('/api/diagnose', {
        method: 'POST',
        headers: {'Content-Type': 'application/json'},
        body: JSON.stringify({switch_name: swName, port_idx: portIdx}),
    })
    .then(r => r.json())
    .then(resp => {
        if (isStale()) return;
        if (resp.error) {
            statusEl.textContent = 'Error: ' + resp.error;
            return;
        }
        if (!resp.job_id) {
            // Without an id there is nothing to poll.
            statusEl.textContent = 'Error: no job id returned';
            return;
        }
        statusEl.textContent = 'Collecting diagnostics via Pulse...';
        pollDiagnostic(resp.job_id, statusEl, resultsEl);
    })
    .catch(e => {
        if (isStale()) return;
        statusEl.textContent = 'Request failed: ' + e;
    });
}
/**
 * Poll a diagnostic job until it is done, fails, or the attempt limit is hit.
 *
 * @param {string}  jobId     id returned by POST /api/diagnose
 * @param {Element} statusEl  element receiving progress / error text
 * @param {Element} resultsEl container passed to renderDiagnosticResults
 */
function pollDiagnostic(jobId, statusEl, resultsEl) {
    const POLL_INTERVAL_MS = 2000;
    const MAX_ATTEMPTS = 120;   // 120 polls x 2 s = 4 min ceiling
    let attempts = 0;
    let finished = false;       // guards against overlapping responses

    const timer = setInterval(() => {
        attempts++;
        if (attempts > MAX_ATTEMPTS) {
            stop();
            statusEl.textContent = 'Timed out waiting for results.';
            return;
        }
        fetch(`/api/diagnose/${encodeURIComponent(jobId)}`)
            .then(r => r.json())
            .then(resp => {
                if (finished) return;
                if (resp.error) {
                    // e.g. job unknown or already purged: polling on is pointless
                    stop();
                    statusEl.textContent = 'Error: ' + resp.error;
                    return;
                }
                if (resp.status === 'done') {
                    stop();
                    statusEl.textContent = '';
                    renderDiagnosticResults(resp.result, resultsEl);
                }
            })
            .catch(() => {});   // transient failure: try again on the next tick
    }, POLL_INTERVAL_MS);
    _diagPollTimer = timer;

    // Stop THIS poller. The shared handle is only reset when it still refers
    // to this poller, so a newer one is never cancelled by an older callback.
    function stop() {
        finished = true;
        clearInterval(timer);
        if (_diagPollTimer === timer) _diagPollTimer = null;
    }
}
/**
 * Render a link-diagnostics result object into `container`.
 *
 * Fields read from `d`:
 *   - status / error       : when `d` is missing or status === 'error', only an error box is shown
 *   - health               : { issues, warnings, info } arrays of { code, message }
 *   - sections             : carrier, operstate, carrier_changes, ethtool, ethtool_driver,
 *                            ethtool_pause, ethtool_ring, ethtool_dom, sysfs_stats,
 *                            ethtool_stats, dmesg, lldpctl
 *   - switch_port          : switch-side view of the port (up, speed_mbps, full_duplex,
 *                            tx_errs_rate, rx_errs_rate, poe_power, lldp)
 *   - pulse_url            : optional link to the raw output
 *
 * Relies on the page-level helpers escHtml, fmtSpeed, fmtErrors and renderPowerBar.
 *
 * @param {Object} d          diagnostics payload (may be null/undefined)
 * @param {Element} container element whose innerHTML is replaced with the rendered panel
 */
function renderDiagnosticResults(d, container) {
  if (!d || d.status === 'error') {
    container.innerHTML = `<div class="diag-error">Diagnostic error: ${escHtml((d && d.error) || 'unknown')}</div>`;
    return;
  }

  // Everything below ends up in innerHTML. Values that are expected to be numbers are
  // formatted as numbers; anything else is stringified and escaped instead of being
  // injected raw or throwing a TypeError that would abort the whole panel.
  const safeText = v => escHtml(String(v));
  const safeFixed = (v, digits) => (typeof v === 'number' ? v.toFixed(digits) : safeText(v));
  const safeCount = v => (typeof v === 'number' ? v.toLocaleString() : safeText(v));

  const health = d.health || {};
  const issues = health.issues || [];
  const warns = health.warnings || [];
  const infoArr = health.info || [];
  const secs = d.sections || {};
  const eth = secs.ethtool || {};
  const drv = secs.ethtool_driver || {};
  const pause = secs.ethtool_pause || {};
  const ring = secs.ethtool_ring || {};
  const dom = secs.ethtool_dom || {};
  const sysfs = secs.sysfs_stats || {};
  const dmesg = secs.dmesg || [];
  const lldpctl = secs.lldpctl || {};
  const nicStats = secs.ethtool_stats || {};
  const swPort = d.switch_port || {};

  // ── Health banner ──
  let bannerHtml = '';
  if (issues.length === 0 && warns.length === 0) {
    bannerHtml = '<div class="diag-health-banner"><span class="diag-health-ok">ALL OK</span></div>';
  } else {
    const parts = [];
    if (issues.length) parts.push(`<span class="diag-health-critical">${issues.length} CRITICAL</span>`);
    if (warns.length) parts.push(`<span class="diag-health-warning">${warns.length} WARNING</span>`);
    bannerHtml = `<div class="diag-health-banner">${parts.join(' ')}</div>`;
  }

  // Tag every finding with the severity of the list it came from, so no per-row
  // list scans are needed to recover the severity afterwards.
  const findings = [
    ...issues.map(item => ({ item, cls: 'diag-val-bad', label: 'CRIT' })),
    ...warns.map(item => ({ item, cls: 'diag-val-warn', label: 'WARN' })),
    ...infoArr.map(item => ({ item, cls: 'diag-val-good', label: 'INFO' })),
  ];
  const issueRows = findings.map(({ item, cls, label }) =>
    `<div class="diag-issue-row"><span class="${cls}">[${label}]</span> <span class="diag-code">${escHtml(item.code)}</span> — ${escHtml(item.message)}</div>`
  ).join('');

  // ── Physical layer ──
  // Unknown / missing values render as an empty cell.
  const carrierVal = secs.carrier === '1' ? '<span class="diag-val-good">YES</span>' :
    secs.carrier === '0' ? '<span class="diag-val-bad">NO</span>' : '';
  const operstateVal = (secs.operstate || '?').toUpperCase();
  const opstateCls = secs.operstate === 'up' ? 'diag-val-good' : secs.operstate === 'down' ? 'diag-val-bad' : 'diag-val-warn';
  const speedVal = eth.speed_mbps ? `<span class="diag-val-good">${fmtSpeed(eth.speed_mbps)}bps</span>` : '<span class="diag-val-warn"></span>';
  const duplexVal = eth.duplex === 'full' ? '<span class="diag-val-good">Full</span>' :
    eth.duplex === 'half' ? '<span class="diag-val-bad">Half</span>' : '';
  const linkDetVal = eth.link_detected === true ? '<span class="diag-val-good">Yes</span>' :
    eth.link_detected === false ? '<span class="diag-val-bad">No</span>' : '';
  const autonegVal = eth.auto_neg === true ? '<span class="diag-val-good">On</span>' :
    eth.auto_neg === false ? '<span class="diag-val-warn">Off</span>' : '';
  // More than 20 carrier transitions is highlighted as a warning.
  const carrierChangesRow = secs.carrier_changes != null
    ? `<tr><td>Carrier Changes</td><td><span class="${secs.carrier_changes > 20 ? 'diag-val-warn' : 'diag-val-good'}">${safeText(secs.carrier_changes)}</span></td></tr>`
    : '';
  const physHtml = `
    <div class="diag-section">
      <div class="diag-section-header">Physical Layer</div>
      <table class="diag-table">
        <tr><td>Carrier</td><td>${carrierVal}</td></tr>
        <tr><td>Oper State</td><td><span class="${opstateCls}">${escHtml(operstateVal)}</span></td></tr>
        <tr><td>Speed</td><td>${speedVal}</td></tr>
        <tr><td>Duplex</td><td>${duplexVal}</td></tr>
        <tr><td>Link Detected</td><td>${linkDetVal}</td></tr>
        <tr><td>Auto-neg</td><td>${autonegVal}</td></tr>
        ${carrierChangesRow}
      </table>
    </div>`;

  // ── SFP / DOM ──
  let domHtml = '';
  if (dom && Object.keys(dom).length > 0) {
    // Bars are drawn with warn/crit thresholds of -18/-25 dBm (RX) and -10/-13 dBm (TX).
    const rxBar = dom.rx_power_dbm != null ? renderPowerBar(dom.rx_power_dbm, -18, -25) : '';
    const txBar = dom.tx_power_dbm != null ? renderPowerBar(dom.tx_power_dbm, -10, -13) : '';
    domHtml = `
    <div class="diag-section">
      <div class="diag-section-header">SFP / DOM</div>
      <table class="diag-table">
        ${dom.vendor ? `<tr><td>Vendor</td><td>${escHtml(dom.vendor)}${dom.part_no ? ' / ' + escHtml(dom.part_no) : ''}</td></tr>` : ''}
        ${dom.sfp_type ? `<tr><td>Type</td><td>${escHtml(dom.sfp_type)}</td></tr>` : ''}
        ${dom.connector ? `<tr><td>Connector</td><td>${escHtml(dom.connector)}</td></tr>` : ''}
        ${dom.wavelength_nm != null ? `<tr><td>Wavelength</td><td>${safeText(dom.wavelength_nm)} nm</td></tr>` : ''}
        ${dom.temp_c != null ? `<tr><td>Temperature</td><td>${safeFixed(dom.temp_c, 1)} °C</td></tr>` : ''}
        ${dom.voltage_v != null ? `<tr><td>Voltage</td><td>${safeFixed(dom.voltage_v, 4)} V</td></tr>` : ''}
        ${dom.bias_ma != null ? `<tr><td>Bias Current</td><td>${safeFixed(dom.bias_ma, 3)} mA</td></tr>` : ''}
        ${dom.tx_power_dbm != null ? `<tr><td>TX Power</td><td>${safeFixed(dom.tx_power_dbm, 2)} dBm ${txBar}</td></tr>` : ''}
        ${dom.rx_power_dbm != null ? `<tr><td>RX Power</td><td>${safeFixed(dom.rx_power_dbm, 2)} dBm ${rxBar}</td></tr>` : ''}
      </table>
    </div>`;
  }

  // ── NIC Error Counters ──
  // Section appears when any listed counter is non-zero or the carrier has changed at
  // least once; in the latter case with all counters at zero it shows "All zero".
  const errCounters = ['rx_crc_errors', 'rx_frame_errors', 'collisions', 'tx_carrier_errors', 'rx_missed_errors', 'rx_fifo_errors'];
  const nonZeroCounters = errCounters.filter(k => sysfs[k] > 0);
  let errCounterHtml = '';
  if (nonZeroCounters.length > 0 || secs.carrier_changes > 0) {
    const rows = nonZeroCounters.map(k => {
      const v = sysfs[k];
      const cls = v > 100 ? 'diag-val-bad' : 'diag-val-warn';
      return `<tr><td>${escHtml(k)}</td><td class="${cls}">${safeCount(v)}</td></tr>`;
    }).join('');
    errCounterHtml = `
    <div class="diag-section">
      <div class="diag-section-header">NIC Error Counters</div>
      <table class="diag-table">
        ${rows || '<tr><td colspan="2" class="diag-val-good">All zero</td></tr>'}
      </table>
    </div>`;
  }

  // ── ethtool -S (collapsible) ──
  let nicStatHtml = '';
  if (Object.keys(nicStats).length > 0) {
    // Stat names matching this pattern are highlighted when their value is non-zero.
    const errKeyPattern = /err|drop|miss|crc|frame|fifo|abort|carrier|collision|fault|discard|overflow|reset/i;
    const rows = Object.entries(nicStats).map(([k, v]) => {
      const cls = errKeyPattern.test(k) && v > 0 ? ' class="diag-stat-nonzero-warn"' : '';
      return `<tr${cls}><td>${escHtml(k)}</td><td>${safeCount(v)}</td></tr>`;
    }).join('');
    nicStatHtml = `
    <div class="diag-section diag-collapsible">
      <div class="diag-section-header diag-toggle" onclick="this.parentElement.classList.toggle('diag-open')">
        ethtool -S (NIC stats) <span class="diag-toggle-hint">[expand]</span>
      </div>
      <div class="diag-section-body">
        <table class="diag-stat-table">${rows}</table>
      </div>
    </div>`;
  }

  // ── Flow Control + Ring Buffers ──
  let flowRingHtml = '';
  const hasPause = Object.keys(pause).length > 0;
  const hasRing = Object.keys(ring).length > 0;
  if (hasPause || hasRing) {
    const ringCell = v => (v != null ? safeText(v) : '');
    flowRingHtml = `
    <div class="diag-section">
      <div class="diag-section-header">Flow Control &amp; Ring Buffers</div>
      <table class="diag-table">
        ${hasPause ? `
        <tr><td>RX Pause</td><td>${pause.rx_pause ? '<span class="diag-val-good">On</span>' : 'Off'}</td></tr>
        <tr><td>TX Pause</td><td>${pause.tx_pause ? '<span class="diag-val-good">On</span>' : 'Off'}</td></tr>` : ''}
        ${hasRing ? `
        <tr><td>RX Ring</td><td>${ringCell(ring.rx_current)} / ${ringCell(ring.rx_max)} max</td></tr>
        <tr><td>TX Ring</td><td>${ringCell(ring.tx_current)} / ${ringCell(ring.tx_max)} max</td></tr>` : ''}
      </table>
    </div>`;
  }

  // ── Driver Info ──
  let drvHtml = '';
  if (Object.keys(drv).length > 0) {
    drvHtml = `
    <div class="diag-section">
      <div class="diag-section-header">Driver Info</div>
      <table class="diag-table">
        ${drv.driver ? `<tr><td>Driver</td><td>${escHtml(drv.driver)}</td></tr>` : ''}
        ${drv.version ? `<tr><td>Version</td><td>${escHtml(drv.version)}</td></tr>` : ''}
        ${drv.firmware_version ? `<tr><td>Firmware</td><td>${escHtml(drv.firmware_version)}</td></tr>` : ''}
        ${drv.bus_info ? `<tr><td>Bus</td><td>${escHtml(drv.bus_info)}</td></tr>` : ''}
      </table>
    </div>`;
  }

  // ── LLDP Validation ──
  // Always rendered: the switch's LLDP view next to what lldpctl reported on the server.
  const swLldp = swPort.lldp || {};
  const lldpValHtml = `
    <div class="diag-section">
      <div class="diag-section-header">LLDP Validation</div>
      <div class="path-debug-cols">
        <div class="path-col">
          <div class="path-col-header">Switch sees</div>
          <div class="path-row"><span>System</span><span>${escHtml(swLldp.system_name || '')}</span></div>
          <div class="path-row"><span>Port</span><span>${escHtml(swLldp.port_id || '')}</span></div>
          <div class="path-row"><span>Chassis</span><span>${escHtml(swLldp.chassis_id || '')}</span></div>
        </div>
        <div class="path-col">
          <div class="path-col-header">Server lldpctl</div>
          ${lldpctl.available
            ? `<div class="path-row"><span>Neighbor</span><span>${escHtml(lldpctl.neighbor_system || '')}</span></div>
          <div class="path-row"><span>Port</span><span>${escHtml(lldpctl.neighbor_port || '')}</span></div>`
            : '<div class="path-row"><span class="diag-val-warn">lldpd not running</span></div>'}
        </div>
      </div>
    </div>`;

  // ── dmesg ──
  let dmesgHtml = '';
  if (dmesg.length > 0) {
    const dlines = dmesg.map(e => {
      const cls = e.severity === 'error' ? ' diag-dmesg-err' : e.severity === 'warn' ? ' diag-dmesg-warn' : '';
      const ts = e.timestamp ? `[${e.timestamp}] ` : '';
      return `<div class="diag-dmesg-line${cls}">${escHtml(ts + e.msg)}</div>`;
    }).join('');
    dmesgHtml = `
    <div class="diag-section diag-collapsible">
      <div class="diag-section-header diag-toggle" onclick="this.parentElement.classList.toggle('diag-open')">
        Kernel Events (dmesg) <span class="diag-toggle-hint">[expand]</span>
      </div>
      <div class="diag-section-body">
        <div class="diag-dmesg-wrap">${dlines}</div>
      </div>
    </div>`;
  }

  // ── Switch Port Summary ──
  const poeRow = swPort.poe_power != null
    ? `<tr><td>PoE</td><td><span class="val-amber">${safeFixed(swPort.poe_power, 1)}W</span></td></tr>`
    : '';
  const swSummaryHtml = `
    <div class="diag-section">
      <div class="diag-section-header">Switch Port Summary</div>
      <table class="diag-table">
        <tr><td>Status</td><td>${swPort.up ? '<span class="diag-val-good">UP</span>' : '<span class="diag-val-bad">DOWN</span>'}</td></tr>
        <tr><td>Speed</td><td>${swPort.speed_mbps ? fmtSpeed(swPort.speed_mbps) + 'bps' : ''}</td></tr>
        <tr><td>Duplex</td><td>${swPort.full_duplex ? 'Full' : (swPort.up ? '<span class="diag-val-bad">Half</span>' : '')}</td></tr>
        <tr><td>TX Err</td><td>${fmtErrors(swPort.tx_errs_rate)}</td></tr>
        <tr><td>RX Err</td><td>${fmtErrors(swPort.rx_errs_rate)}</td></tr>
        ${poeRow}
      </table>
    </div>`;

  // ── Pulse link ──
  const pulseLink = d.pulse_url
    ? `<div class="diag-pulse-link"><a href="${escHtml(d.pulse_url)}" target="_blank" rel="noopener">View raw output in Pulse ↗</a></div>`
    : '';

  container.innerHTML = `
    <div class="diag-results-inner">
      ${bannerHtml}
      <div class="diag-issue-list">${issueRows}</div>
      ${physHtml}
      ${domHtml}
      ${errCounterHtml}
      ${nicStatHtml}
      ${flowRingHtml}
      ${drvHtml}
      ${lldpValHtml}
      ${dmesgHtml}
      ${swSummaryHtml}
      ${pulseLink}
    </div>`;
}
// SFP optical-power gauge. The bar spans -35 dBm (empty) to 0 dBm (full);
// readings outside that window are clamped to the ends. The two thresholds
// (in dBm) place the warn/crit zone markers and pick the fill colour class:
// below critThreshold -> bad, below warnThreshold -> warn, otherwise good.
function renderPowerBar(dbm, warnThreshold, critThreshold) {
  const FLOOR_DBM = -35;
  const CEIL_DBM = 0;

  // Linear position of a dBm value on the gauge, as a percentage (unclamped).
  const toPercent = value => ((value - FLOOR_DBM) / (CEIL_DBM - FLOOR_DBM)) * 100;

  const fillPct = Math.max(0, Math.min(100, toPercent(dbm)));
  const warnMarkerPct = toPercent(warnThreshold);
  const critMarkerPct = toPercent(critThreshold);

  let fillCls = 'diag-val-good';
  if (dbm < critThreshold) {
    fillCls = 'diag-val-bad';
  } else if (dbm < warnThreshold) {
    fillCls = 'diag-val-warn';
  }

  return [
    '<span class="diag-power-bar-wrap">',
    `<span class="diag-power-bar ${fillCls}" style="width:${fillPct.toFixed(1)}%"></span>`,
    `<span class="diag-power-zone-warn" style="left:${warnMarkerPct.toFixed(1)}%"></span>`,
    `<span class="diag-power-zone-crit" style="left:${critMarkerPct.toFixed(1)}%"></span>`,
    '</span>',
  ].join('\n');
}
</script>
{% endblock %}