# Imports added to app.py for the link-diagnostics endpoints.
import threading
import time
import uuid

import diagnose
from monitor import PulseClient

# In-memory diagnostic job store { job_id: { status, result, created_at } }
_diag_jobs: dict = {}
_diag_lock = threading.Lock()

# Jobs older than this many seconds are dropped from the in-memory store.
_DIAG_JOB_TTL_SECONDS = 600


def _purge_old_jobs():
    """Remove jobs older than 10 minutes (called before each new job creation)."""
    cutoff = time.time() - _DIAG_JOB_TTL_SECONDS
    with _diag_lock:
        # Collect ids first: a dict must not be resized while it is iterated.
        stale = [
            jid for jid, job in _diag_jobs.items()
            if job.get('created_at', 0) < cutoff
        ]
        for jid in stale:
            del _diag_jobs[jid]


@app.route('/api/diagnose', methods=['POST'])
@require_auth
def api_diagnose_start():
    """Start a link diagnostic job. Returns {job_id}.

    Expects a JSON body with ``switch_name`` and ``port_idx``.  The switch
    port is looked up in the cached ``link_stats`` state, its LLDP neighbour
    is mapped to a host + interface, and the diagnostic itself runs in a
    daemon thread.  Poll ``/api/diagnose/<job_id>`` for the result.

    Error responses: 400 (missing parameters, no LLDP neighbour, no usable
    interface or IP), 404 (unknown switch, port or host), 500 (cached data
    unparsable), 503 (no cached link_stats yet).
    """
    data = request.get_json(silent=True) or {}
    switch_name = (data.get('switch_name') or '').strip()
    port_idx = data.get('port_idx')

    if not switch_name or port_idx is None:
        return jsonify({'error': 'switch_name and port_idx required'}), 400

    # Look up switch + port in cached link_stats
    raw = db.get_state('link_stats')
    if not raw:
        return jsonify({'error': 'No link_stats data available'}), 503
    try:
        link_data = json.loads(raw)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError subclass.
        logger.error('Failed to parse link_stats JSON in /api/diagnose')
        return jsonify({'error': 'Internal data error'}), 500
    if not isinstance(link_data, dict):
        logger.error('link_stats JSON is not an object in /api/diagnose')
        return jsonify({'error': 'Internal data error'}), 500

    switches = link_data.get('unifi_switches', {})
    sw = switches.get(switch_name)
    if not sw:
        return jsonify({'error': f'Switch "{switch_name}" not found'}), 404

    # Find port by port_idx.  Compare as strings so a client sending "5"
    # still matches a cached integer 5 (and vice versa).
    wanted_idx = str(port_idx)
    port_data = None
    for pname, pd in sw.get('ports', {}).items():
        cached_idx = pd.get('port_idx')
        if cached_idx is not None and str(cached_idx) == wanted_idx:
            port_data = dict(pd)
            port_data['name'] = pname
            break
    if not port_data:
        return jsonify({'error': f'Port {port_idx} not found on switch "{switch_name}"'}), 404

    # LLDP neighbor required to know which host+iface to SSH into
    lldp = port_data.get('lldp')
    if not lldp or not lldp.get('system_name'):
        return jsonify({'error': 'No LLDP neighbor data for this port'}), 400

    server_name = lldp['system_name']
    lldp_port_id = lldp.get('port_id', '')

    # Find matching host + interface in link_stats hosts
    hosts = link_data.get('hosts', {})
    server_ifaces = hosts.get(server_name)
    if not server_ifaces:
        return jsonify({'error': f'Host "{server_name}" not in link stats'}), 404

    # Match interface by LLDP port_id: exact, then substring either way,
    # then (last resort) the first interface known for the host.
    matched_iface = None
    if lldp_port_id and lldp_port_id in server_ifaces:
        matched_iface = lldp_port_id
    if not matched_iface and lldp_port_id:
        matched_iface = next(
            (k for k in server_ifaces if lldp_port_id in k or k in lldp_port_id),
            None
        )
    if not matched_iface:
        matched_iface = next(iter(server_ifaces), None)
    if not matched_iface:
        return jsonify({'error': 'Cannot determine server interface'}), 400

    # Resolve host IP from link_stats host data
    host_ip = (server_ifaces.get(matched_iface) or {}).get('host_ip')
    if not host_ip:
        # Fallback: use LLDP mgmt IPs.
        # NOTE(review): this address is advertised by the neighbour over LLDP
        # and is then used as a root SSH target — confirm that is acceptable.
        mgmt_ips = lldp.get('mgmt_ips') or []
        host_ip = mgmt_ips[0] if mgmt_ips else None
    if not host_ip:
        return jsonify({'error': 'Cannot determine host IP for SSH'}), 400

    _purge_old_jobs()
    job_id = str(uuid.uuid4())
    with _diag_lock:
        _diag_jobs[job_id] = {'status': 'running', 'result': None, 'created_at': time.time()}

    def _run():
        """Worker body: run the diagnostic and publish the result."""
        try:
            cfg = _config()
            pulse = PulseClient(cfg)
            runner = diagnose.DiagnosticsRunner(pulse)
            result = runner.run(host_ip, server_name, matched_iface, port_data)
        except Exception as e:
            # Thread boundary: report any failure through the job record.
            logger.error('Diagnostic job %s failed: %s', job_id, e, exc_info=True)
            result = {'status': 'error', 'error': str(e)}
        with _diag_lock:
            job = _diag_jobs.get(job_id)
            if job is not None:
                # Publish status and result together so a poller can never
                # observe 'done' with a missing result.
                job.update({'status': 'done', 'result': result})

    t = threading.Thread(target=_run, daemon=True)
    t.start()

    return jsonify({'job_id': job_id})


@app.route('/api/diagnose/<job_id>', methods=['GET'])
@require_auth
def api_diagnose_poll(job_id: str):
    """Poll a diagnostic job. Returns {status, result}.

    Responds 404 when the job id is unknown (never created, or already
    purged from the in-memory store).
    """
    with _diag_lock:
        job = _diag_jobs.get(job_id)
        # Copy while holding the lock: the worker thread mutates this dict.
        snapshot = dict(job) if job else None
    if snapshot is None:
        return jsonify({'error': 'Job not found'}), 404
    return jsonify({'status': snapshot['status'], 'result': snapshot.get('result')})
"""Gandalf – Link Diagnostics module.

Runs a comprehensive SSH-based diagnostic against a server NIC and
analyses the result against switch port data to surface root causes.
Executed in a background thread; results stored in _diag_jobs (app.py).
"""
import re
import shlex
import time
import logging
from typing import Dict, List, Optional, Tuple

logger = logging.getLogger('gandalf.diagnose')

# sysfs counters collected per interface
_SYSFS_STATS = [
    'rx_bytes', 'tx_bytes', 'rx_errors', 'tx_errors',
    'rx_dropped', 'tx_dropped', 'rx_crc_errors',
    'rx_frame_errors', 'rx_fifo_errors', 'tx_carrier_errors',
    'collisions', 'rx_missed_errors',
]

# "=== name ===" sentinel lines emitted between sections of the remote script
_SECTION_RE = re.compile(r'^=== (.+?) ===$')

# Substrings used to grade dmesg lines
_DMESG_ERROR_WORDS = ('error', 'fail', 'reset', 'panic', 'oops', 'hung', 'timeout')
_DMESG_WARN_WORDS = ('warn', 'drop', 'lost', 'miss')


def _float_from(pattern: str, text: str, anchored: bool = False) -> Optional[float]:
    """Return group 1 of *pattern* in *text* as a float, or None.

    None is returned both when the pattern does not match and when the
    captured text is not a valid float (e.g. a lone "-" or ".").
    """
    m = re.match(pattern, text) if anchored else re.search(pattern, text)
    if not m:
        return None
    try:
        return float(m.group(1))
    except ValueError:
        return None


class DiagnosticsRunner:
    """Build and run a link diagnostic against a server NIC via PulseClient."""

    # Template for the link to an execution in the Pulse web UI.
    PULSE_EXECUTION_URL = 'http://pulse.lotusguild.org/executions/{execution_id}'

    def __init__(self, pulse_client):
        self.pulse = pulse_client

    # ------------------------------------------------------------------
    # SSH command builder
    # ------------------------------------------------------------------
    @staticmethod
    def build_ssh_command(host_ip: str, iface: str) -> str:
        """Return a single-line SSH command that collects all diagnostic data.

        The remote script prints a ``=== name ===`` sentinel before each
        section so that :meth:`parse_output` can split the combined output.

        Both *host_ip* and *iface* are shell-quoted, and the complete remote
        script is quoted once more as a single SSH argument, so an interface
        name containing quotes or shell metacharacters cannot terminate the
        argument early.
        """
        q = shlex.quote(iface)
        ip_q = shlex.quote(host_ip)

        sysfs_loop = '; '.join(
            f'echo "{s}:$(cat /sys/class/net/{q}/statistics/{s} 2>/dev/null || echo 0)"'
            for s in _SYSFS_STATS
        )

        remote_cmd = (
            f'echo "=== carrier ===";'
            f' cat /sys/class/net/{q}/carrier 2>/dev/null || echo "?";'
            f' echo "=== operstate ===";'
            f' cat /sys/class/net/{q}/operstate 2>/dev/null || echo "?";'
            f' echo "=== sysfs_stats ===";'
            f' {sysfs_loop};'
            f' echo "=== carrier_changes ===";'
            f' cat /sys/class/net/{q}/carrier_changes 2>/dev/null || echo "0";'
            f' echo "=== ethtool ===";'
            f' ethtool {q} 2>/dev/null;'
            f' echo "=== ethtool_driver ===";'
            f' ethtool -i {q} 2>/dev/null;'
            f' echo "=== ethtool_pause ===";'
            f' ethtool -a {q} 2>/dev/null;'
            f' echo "=== ethtool_ring ===";'
            f' ethtool -g {q} 2>/dev/null;'
            f' echo "=== ethtool_stats ===";'
            f' ethtool -S {q} 2>/dev/null;'
            f' echo "=== ethtool_dom ===";'
            f' ethtool -m {q} 2>/dev/null;'
            f' echo "=== ip_link ===";'
            f' ip -s link show {q} 2>/dev/null;'
            f' echo "=== ip_addr ===";'
            f' ip addr show {q} 2>/dev/null;'
            f' echo "=== ip_route ===";'
            f' ip route show dev {q} 2>/dev/null;'
            f' echo "=== dmesg ===";'
            # -F: match the name literally (a "." in e.g. eth0.100 is not a
            # wildcard); "--": a name starting with "-" is not an option.
            f' dmesg 2>/dev/null | grep -F -- {q} | tail -50;'
            f' echo "=== lldpctl ===";'
            f' lldpctl 2>/dev/null || echo "lldpd not running";'
            f' echo "=== end ==="'
        )

        # NOTE(review): StrictHostKeyChecking=no disables host-key
        # verification for a root login — confirm this is intended.
        return (
            f'ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5 '
            f'-o LogLevel=ERROR root@{ip_q} {shlex.quote(remote_cmd)}'
        )

    # ------------------------------------------------------------------
    # Main entry point
    # ------------------------------------------------------------------
    def run(self, host_ip: str, host_name: str, iface: str,
            switch_port_data: dict) -> dict:
        """Execute diagnostics and return structured result dict.

        On success the dict has ``status == 'done'`` plus the parsed
        ``sections``, the ``health`` analysis and the Pulse execution link.
        When the Pulse command returns ``None`` the dict has
        ``status == 'error'`` and an ``error`` message instead.
        """
        cmd = self.build_ssh_command(host_ip, iface)
        logger.info('Running link diagnostic: %s/%s via Pulse', host_name, iface)

        # Reset execution_id before call
        self.pulse.last_execution_id = None
        output = self.pulse.run_command(cmd)
        execution_id = getattr(self.pulse, 'last_execution_id', None)

        if output is None:
            return {
                'status': 'error',
                'error': 'Pulse command failed or timed out',
                'host': host_name,
                'iface': iface,
                'pulse_execution_id': execution_id,
            }

        sections = self.parse_output(output)
        health = self.analyze(sections, switch_port_data)

        pulse_url = None
        if execution_id:
            pulse_url = self.PULSE_EXECUTION_URL.format(execution_id=execution_id)

        return {
            'status': 'done',
            'host': host_name,
            'iface': iface,
            'sections': sections,
            'health': health,
            'pulse_execution_id': execution_id,
            'pulse_url': pulse_url,
            'switch_port': switch_port_data,
        }

    # ------------------------------------------------------------------
    # Output parser (splits on === SECTION_NAME === sentinels)
    # ------------------------------------------------------------------
    @staticmethod
    def parse_output(raw: str) -> dict:
        """Split *raw* on section sentinels and parse every section.

        Returns a dict with one entry per section name used by
        :meth:`build_ssh_command`; sections missing from *raw* yield the
        corresponding parser's empty/default value.
        """
        sections: Dict[str, str] = {}
        current: Optional[str] = None
        buf: List[str] = []

        for line in raw.splitlines():
            m = _SECTION_RE.match(line.strip())
            if m:
                # A new sentinel closes whichever section was open.
                if current:
                    sections[current] = '\n'.join(buf).strip()
                name = m.group(1)
                current = None if name == 'end' else name
                buf = []
            elif current:
                buf.append(line)

        # Output that stops without the "end" sentinel still keeps its tail.
        if current:
            sections[current] = '\n'.join(buf).strip()

        parsed: dict = {}

        # Simple string sections
        parsed['carrier'] = sections.get('carrier', '?').strip()
        parsed['operstate'] = sections.get('operstate', '?').strip()

        # carrier_changes
        cc_raw = sections.get('carrier_changes', '0').strip()
        try:
            parsed['carrier_changes'] = int(cc_raw)
        except ValueError:
            parsed['carrier_changes'] = None

        # Structured sections
        parsed['sysfs_stats'] = DiagnosticsRunner.parse_sysfs_stats(sections.get('sysfs_stats', ''))
        parsed['ethtool'] = DiagnosticsRunner.parse_ethtool(sections.get('ethtool', ''))
        parsed['ethtool_driver'] = DiagnosticsRunner.parse_ethtool_driver(sections.get('ethtool_driver', ''))
        parsed['ethtool_pause'] = DiagnosticsRunner.parse_ethtool_pause(sections.get('ethtool_pause', ''))
        parsed['ethtool_ring'] = DiagnosticsRunner.parse_ethtool_ring(sections.get('ethtool_ring', ''))
        parsed['ethtool_stats'] = DiagnosticsRunner.parse_nic_stats(sections.get('ethtool_stats', ''))
        parsed['ethtool_dom'] = DiagnosticsRunner.parse_ethtool_dom(sections.get('ethtool_dom', ''))
        parsed['ip_link'] = DiagnosticsRunner.parse_ip_link(sections.get('ip_link', ''))
        parsed['ip_addr'] = sections.get('ip_addr', '').strip()
        parsed['ip_route'] = sections.get('ip_route', '').strip()
        parsed['dmesg'] = DiagnosticsRunner.parse_dmesg(sections.get('dmesg', ''))
        parsed['lldpctl'] = DiagnosticsRunner.parse_lldpctl(sections.get('lldpctl', ''))

        return parsed

    # ------------------------------------------------------------------
    # Individual parsers
    # ------------------------------------------------------------------
    @staticmethod
    def parse_sysfs_stats(text: str) -> dict:
        """Parse ``name:value`` lines into {counter: int}.

        Only counters listed in ``_SYSFS_STATS`` are kept; a value that is
        not an integer is recorded as 0.
        """
        result: dict = {}
        for line in text.splitlines():
            if ':' not in line:
                continue
            key, _, val = line.partition(':')
            key = key.strip()
            val = val.strip()
            if key in _SYSFS_STATS:
                try:
                    result[key] = int(val)
                except ValueError:
                    result[key] = 0
        return result

    @staticmethod
    def parse_ethtool(text: str) -> dict:
        """Parse ethtool output.

        Extracts ``speed_mbps`` (None when reported as unknown), ``duplex``,
        ``port_type``, ``auto_neg``, ``link_detected`` and
        ``supported_modes`` — one list entry per output line of the
        "Supported link modes" field, continuation lines included.
        """
        data: dict = {}
        in_supported = False
        for line in text.splitlines():
            if ':' not in line:
                # Continuation of a multi-line "Supported link modes" field.
                if in_supported and line.strip():
                    data.setdefault('supported_modes', []).append(line.strip())
                continue
            key, _, val = line.partition(':')
            key = key.strip()
            val = val.strip()
            in_supported = 'Supported link modes' in key
            if key == 'Speed':
                m = re.match(r'(\d+)\s*Mb/s', val)
                if m:
                    data['speed_mbps'] = int(m.group(1))
                elif 'unknown' in val.lower():
                    data['speed_mbps'] = None
            elif key == 'Duplex':
                data['duplex'] = val.lower()
            elif key == 'Port':
                data['port_type'] = val
            elif key == 'Auto-negotiation':
                data['auto_neg'] = (val.lower() == 'on')
            elif key == 'Link detected':
                data['link_detected'] = (val.lower() == 'yes')
            elif in_supported:
                data.setdefault('supported_modes', []).append(val)
        return data

    @staticmethod
    def parse_ethtool_driver(text: str) -> dict:
        """Parse ``ethtool -i`` output (driver, version, firmware, bus info)."""
        # ethtool field name -> key in the returned dict
        wanted = {
            'driver': 'driver',
            'version': 'version',
            'firmware-version': 'firmware_version',
            'bus-info': 'bus_info',
        }
        data: dict = {}
        for line in text.splitlines():
            if ':' not in line:
                continue
            key, _, val = line.partition(':')
            target = wanted.get(key.strip())
            if target is not None:
                data[target] = val.strip()
        return data

    @staticmethod
    def parse_ethtool_pause(text: str) -> dict:
        """Parse ``ethtool -a`` output into {rx_pause, tx_pause} booleans."""
        data = {'rx_pause': False, 'tx_pause': False}
        for line in text.splitlines():
            if ':' not in line:
                continue
            key, _, val = line.partition(':')
            key = key.strip()
            val = val.strip().lower()
            if key == 'RX':
                data['rx_pause'] = (val == 'on')
            elif key == 'TX':
                data['tx_pause'] = (val == 'on')
        return data

    @staticmethod
    def parse_ethtool_ring(text: str) -> dict:
        """Parse ``ethtool -g`` output into rx/tx ``_max`` and ``_current``.

        Only the first integer RX and TX value of each group is kept, so
        later "RX Mini"/"RX Jumbo" rows do not overwrite the main ring size.
        """
        data: dict = {}
        in_current = False
        for line in text.splitlines():
            if 'Current hardware settings' in line:
                in_current = True
                continue
            if 'Pre-set maximums' in line:
                in_current = False
                continue
            if ':' not in line:
                continue
            key, _, val = line.partition(':')
            key = key.strip()
            try:
                v = int(val.strip())
            except ValueError:
                # "n/a" and header lines carry no number.
                continue
            suffix = 'current' if in_current else 'max'
            if 'RX' in key and f'rx_{suffix}' not in data:
                data[f'rx_{suffix}'] = v
            elif 'TX' in key and f'tx_{suffix}' not in data:
                data[f'tx_{suffix}'] = v
        return data

    @staticmethod
    def parse_nic_stats(text: str) -> dict:
        """Parse ethtool -S output into {key: int} dict."""
        data: dict = {}
        for line in text.splitlines():
            if ':' not in line:
                continue
            key, _, val = line.partition(':')
            try:
                data[key.strip()] = int(val.strip())
            except ValueError:
                # Non-numeric rows (e.g. the "NIC statistics:" header).
                continue
        return data

    @staticmethod
    def parse_ethtool_dom(text: str) -> dict:
        """Parse ethtool -m (SFP DOM) output.

        Returns {} when the output is empty or says the module data is
        unavailable.  Numeric fields are only present when they parse.
        """
        if not text:
            return {}
        lower = text.lower()
        if any(s in lower for s in ('cannot get', 'not supported', 'no sfp', 'operation not supported')):
            return {}

        data: dict = {}

        def put(name: str, value) -> None:
            """Store *value* under *name* unless parsing produced None."""
            if value is not None:
                data[name] = value

        for line in text.splitlines():
            if ':' not in line:
                continue
            key, _, val = line.partition(':')
            key = key.strip()
            val = val.strip()
            key_lower = key.lower()

            if key == 'Vendor name':
                data['vendor'] = val
            elif key == 'Vendor PN':
                data['part_no'] = val
            elif key == 'Identifier':
                m = re.search(r'\((.+?)\)', val)
                if m:
                    data['sfp_type'] = m.group(1)
            elif key == 'Connector':
                m = re.search(r'\((.+?)\)', val)
                if m:
                    data['connector'] = m.group(1)
            elif key == 'Laser wavelength':
                m = re.match(r'(\d+)', val)
                if m:
                    data['wavelength_nm'] = int(m.group(1))
            elif key == 'Laser bias current':
                put('bias_ma', _float_from(r'([\d.]+)\s+mA', val, anchored=True))
            elif key == 'Laser output power':
                # Value looks like "0.55 mW / -2.60 dBm": take the dBm part.
                put('tx_power_dbm', _float_from(r'/\s*([-\d.]+)\s*dBm', val))
            elif 'receiver' in key_lower and ('power' in key_lower or 'optical' in key_lower):
                put('rx_power_dbm', _float_from(r'/\s*([-\d.]+)\s*dBm', val))
            elif key == 'Module temperature':
                # Allow a leading "-" so sub-zero module temperatures parse.
                put('temp_c', _float_from(r'(-?[\d.]+)\s+degrees', val, anchored=True))
            elif key == 'Module voltage':
                put('voltage_v', _float_from(r'([\d.]+)\s+V', val, anchored=True))

        return data

    @staticmethod
    def parse_ip_link(text: str) -> dict:
        """Parse ip -s link show output for basic link state and counters.

        The line after an ``RX:``/``TX:`` header supplies bytes, packets,
        errors and dropped.  A counter row is taken as a whole or not at all.
        """
        data: dict = {}
        lines = text.splitlines()
        fields = ('bytes', 'packets', 'errors', 'dropped')
        for i, line in enumerate(lines):
            # MTU and state: "2: eth0: mtu 1500 ..."
            m = re.search(r'mtu\s+(\d+)', line)
            if m:
                data['mtu'] = int(m.group(1))
            m = re.search(r'state\s+(\S+)', line)
            if m:
                data['state'] = m.group(1).lower()

            stripped = line.strip()
            for label, prefix in (('RX:', 'ip_rx'), ('TX:', 'ip_tx')):
                if not stripped.startswith(label) or i + 1 >= len(lines):
                    continue
                vals = lines[i + 1].split()
                if len(vals) < 5:
                    continue
                try:
                    # Convert everything first so a bad token cannot leave a
                    # half-populated set of counters behind.
                    numbers = [int(v) for v in vals[:4]]
                except ValueError:
                    continue
                for field, number in zip(fields, numbers):
                    data[f'{prefix}_{field}'] = number
        return data

    @staticmethod
    def parse_dmesg(text: str) -> List[dict]:
        """Parse dmesg lines into [{timestamp, msg, severity}]."""
        events = []
        for line in text.splitlines():
            if not line.strip():
                continue
            # Extract timestamp from [ 123.456789]
            m = re.match(r'^\[\s*([\d.]+)\]\s*(.*)', line)
            if m:
                ts, msg = m.group(1), m.group(2)
            else:
                ts, msg = '', line

            lower = msg.lower()
            if any(w in lower for w in _DMESG_ERROR_WORDS):
                severity = 'error'
            elif any(w in lower for w in _DMESG_WARN_WORDS):
                severity = 'warn'
            else:
                severity = 'info'

            events.append({'timestamp': ts, 'msg': msg, 'severity': severity})
        return events

    @staticmethod
    def parse_lldpctl(text: str) -> dict:
        """Extract neighbor info from lldpctl output.

        When several neighbours are listed, all three fields come from the
        first occurrence so they describe the same neighbour.
        """
        if not text or 'lldpd not running' in text or 'not found' in text.lower():
            return {'available': False}

        data: dict = {'available': True}
        for line in text.splitlines():
            if ':' not in line:
                continue
            key, _, val = line.partition(':')
            key = key.strip()
            val = val.strip()
            if 'SysName' in key and 'neighbor_system' not in data:
                data['neighbor_system'] = val
            elif 'PortID' in key and 'neighbor_port' not in data:
                data['neighbor_port'] = val
            elif 'ChassisID' in key and 'neighbor_chassis_id' not in data:
                data['neighbor_chassis_id'] = val
        return data

    # ------------------------------------------------------------------
    # Health analysis
    # ------------------------------------------------------------------
    @staticmethod
    def analyze(sections: dict, switch_port_data: dict) -> dict:
        """Return {issues: [...], warnings: [...], info: [...]} health analysis.

        *sections* is the dict produced by :meth:`parse_output`;
        *switch_port_data* is the switch-side view of the same link.  Every
        finding is a ``{'code', 'message'}`` dict.
        """
        issues: List[dict] = []
        warnings: List[dict] = []
        info: List[dict] = []

        def add(collection, code, message):
            collection.append({'code': code, 'message': message})

        # Tolerate missing/None inputs so a partial parse still yields a report.
        switch_port_data = switch_port_data or {}
        carrier = sections.get('carrier', '?')
        eth = sections.get('ethtool') or {}
        sysfs = sections.get('sysfs_stats') or {}
        dom = sections.get('ethtool_dom') or {}
        dmesg = sections.get('dmesg') or []
        lldp = sections.get('lldpctl') or {}
        cc = sections.get('carrier_changes')

        # Physical carrier
        if carrier == '0':
            add(issues, 'NO_CARRIER',
                'No physical carrier — cable/SFP disconnected or switch port disabled')
        elif eth.get('link_detected') is False:
            add(issues, 'LINK_NOT_DETECTED',
                'NIC does not detect link signal despite carrier sysfs reading non-zero')

        # Duplex
        if eth.get('duplex') == 'half':
            add(issues, 'HALF_DUPLEX',
                'Half-duplex detected — likely duplex mismatch; force full-duplex on both ends')

        # Speed mismatch (switch vs server NIC)
        sw_speed = switch_port_data.get('speed_mbps', 0) or 0
        srv_speed = eth.get('speed_mbps', 0) or 0
        if sw_speed > 0 and srv_speed > 0 and sw_speed != srv_speed:
            add(warnings, 'SPEED_MISMATCH',
                f'Speed mismatch: switch reports {sw_speed} Mbps, NIC reports {srv_speed} Mbps')

        # SFP DOM power levels
        rx_dbm = dom.get('rx_power_dbm')
        tx_dbm = dom.get('tx_power_dbm')
        if rx_dbm is not None:
            if rx_dbm < -25:
                add(issues, 'SFP_RX_CRITICAL',
                    f'RX power critically low ({rx_dbm:.2f} dBm) — fiber not connected or SFP failed')
            elif rx_dbm < -18:
                add(warnings, 'SFP_RX_LOW',
                    f'RX power low ({rx_dbm:.2f} dBm) — check fiber cleanliness and SFP seating')
        if tx_dbm is not None and tx_dbm < -10:
            add(warnings, 'SFP_TX_LOW',
                f'TX power low ({tx_dbm:.2f} dBm) — SFP may be failing or requires cleaning')

        # Carrier changes (flapping)
        if cc is not None:
            if cc > 100:
                add(issues, 'CARRIER_FLAPPING',
                    f'Link has flapped {cc} times — severe physical instability')
            elif cc > 20:
                add(warnings, 'CARRIER_FLAPS',
                    f'Link has flapped {cc} times — intermittent physical issue')

        # CRC errors
        crc = sysfs.get('rx_crc_errors', 0) or 0
        if crc > 100:
            add(issues, 'CRC_ERRORS_HIGH',
                f'High CRC error count ({crc}) — dirty fiber/connector or cable damage')
        elif crc > 10:
            add(warnings, 'CRC_ERRORS_LOW',
                f'CRC errors present ({crc}) — cable or SFP quality issue')

        # Kernel events
        err_events = [e for e in dmesg if e.get('severity') == 'error']
        if err_events:
            add(warnings, 'KERNEL_EVENTS',
                f'{len(err_events)} recent kernel error event(s) for this interface in dmesg')

        # LLDP validation
        if lldp.get('available'):
            sw_lldp = switch_port_data.get('lldp') or {}
            sw_system = (sw_lldp.get('system_name') or '').lower()
            srv_neighbor = (lldp.get('neighbor_system') or '').lower()
            # Substring match either way tolerates short name vs FQDN.
            if sw_system and srv_neighbor and sw_system not in srv_neighbor and srv_neighbor not in sw_system:
                add(warnings, 'LLDP_MISMATCH',
                    f'LLDP mismatch: switch sees "{sw_lldp.get("system_name")}" but '
                    f'server lldpctl sees "{lldp.get("neighbor_system")}" — cross-cabled port?')
        else:
            add(info, 'LLDP_MISSING',
                'lldpd not running on server — install lldpd for full path validation')

        return {'issues': issues, 'warnings': warnings, 'info': info}
align-items: center; + gap: 10px; + margin-top: 14px; + padding-top: 10px; + border-top: 1px solid var(--border); +} + +.btn-diag { + font-family: var(--font); + font-size: .65em; + color: var(--cyan); + background: transparent; + border: 1px solid var(--cyan); + padding: 4px 10px; + cursor: pointer; + letter-spacing: .04em; + transition: background .15s, box-shadow .15s; + animation: diag-pulse 2.5s ease-in-out infinite; +} +.btn-diag:hover { + background: var(--cyan-dim); + box-shadow: var(--glow-cyan); +} + +@keyframes diag-pulse { + 0%, 100% { box-shadow: none; } + 50% { box-shadow: 0 0 6px rgba(0,255,255,.4); } +} + +.diag-status { + font-size: .6em; + color: var(--text-muted); + font-style: italic; +} + +.diag-error { + color: var(--red); + font-size: .65em; + margin-top: 8px; +} + +.diag-results { + margin-top: 4px; +} + +.diag-results-inner { + display: flex; + flex-direction: column; + gap: 6px; +} + +/* Health banner */ +.diag-health-banner { + display: flex; + gap: 8px; + padding: 6px 0 4px; + margin-bottom: 2px; +} + +.diag-health-critical { + background: var(--red-dim); + color: var(--red); + border: 1px solid var(--red); + padding: 2px 8px; + font-size: .62em; + font-weight: bold; + letter-spacing: .05em; +} + +.diag-health-warning { + background: var(--amber-dim); + color: var(--amber); + border: 1px solid var(--amber); + padding: 2px 8px; + font-size: .62em; + font-weight: bold; + letter-spacing: .05em; +} + +.diag-health-ok { + background: var(--green-dim); + color: var(--green); + border: 1px solid var(--green); + padding: 2px 8px; + font-size: .62em; + font-weight: bold; + letter-spacing: .05em; +} + +/* Issue list */ +.diag-issue-list { + display: flex; + flex-direction: column; + gap: 3px; +} + +.diag-issue-row { + font-size: .62em; + padding: 3px 6px; + background: var(--bg2); + border-left: 2px solid var(--border); + line-height: 1.4; +} + +.diag-code { + font-weight: bold; + color: var(--amber); +} + +/* Sections */ +.diag-section { + 
background: var(--bg2); + border: 1px solid rgba(0,255,65,.12); +} + +.diag-section-header { + font-size: .62em; + font-weight: bold; + color: var(--amber); + padding: 4px 8px; + letter-spacing: .04em; + border-bottom: 1px solid rgba(0,255,65,.12); + background: rgba(255,176,0,.04); +} + +/* Collapsible sections */ +.diag-collapsible .diag-section-body { + display: none; +} + +.diag-collapsible.diag-open .diag-section-body { + display: block; +} + +.diag-toggle { + cursor: pointer; + user-select: none; +} + +.diag-toggle-hint { + font-weight: normal; + color: var(--text-muted); + font-size: .9em; +} + +.diag-collapsible.diag-open .diag-toggle-hint::after { + content: ''; +} + +/* Data tables */ +.diag-table { + width: 100%; + border-collapse: collapse; + font-size: .62em; +} + +.diag-table td { + padding: 3px 8px; + vertical-align: top; +} + +.diag-table td:first-child { + color: var(--text-muted); + width: 40%; + white-space: nowrap; +} + +.diag-table td:last-child { + color: var(--text-dim); + font-weight: bold; + word-break: break-all; +} + +.diag-table tr:nth-child(even) { + background: rgba(0,255,65,.025); +} + +/* Value colour classes */ +.diag-val-good { color: var(--green); } +.diag-val-warn { color: var(--amber); } +.diag-val-bad { color: var(--red); } + +/* SFP power bar */ +.diag-power-bar-wrap { + position: relative; + display: inline-block; + width: 60px; + height: 7px; + background: var(--bg3); + border: 1px solid var(--border); + vertical-align: middle; + margin-left: 6px; + overflow: visible; +} + +.diag-power-bar { + display: inline-block; + position: absolute; + left: 0; + top: 0; + height: 100%; +} + +.diag-power-bar.diag-val-good { background: var(--green); } +.diag-power-bar.diag-val-warn { background: var(--amber); } +.diag-power-bar.diag-val-bad { background: var(--red); } + +.diag-power-zone-warn, +.diag-power-zone-crit { + position: absolute; + top: -2px; + width: 1px; + height: calc(100% + 4px); + pointer-events: none; +} + 
+.diag-power-zone-warn { background: var(--amber); opacity: .7; } +.diag-power-zone-crit { background: var(--red); opacity: .7; } + +/* ethtool -S stat table */ +.diag-stat-table { + width: 100%; + border-collapse: collapse; + font-size: .58em; +} + +.diag-stat-table td { + padding: 2px 8px; +} + +.diag-stat-table td:first-child { color: var(--text-muted); } +.diag-stat-table td:last-child { color: var(--text-dim); text-align: right; } + +.diag-stat-nonzero-warn { + background: var(--amber-dim); +} + +.diag-stat-nonzero-warn td { color: var(--amber); } + +/* dmesg */ +.diag-dmesg-wrap { + max-height: 200px; + overflow-y: auto; + padding: 6px 8px; +} + +.diag-dmesg-line { + font-family: var(--font); + font-size: .58em; + white-space: pre-wrap; + word-break: break-all; + padding: 1px 0; + color: var(--text-dim); +} + +.diag-dmesg-warn { color: var(--amber); } +.diag-dmesg-err { color: var(--red); } + +/* Pulse link */ +.diag-pulse-link { + font-size: .62em; + padding: 4px 0; + text-align: right; +} + +.diag-pulse-link a { + color: var(--cyan); + text-decoration: none; +} + +.diag-pulse-link a:hover { + text-shadow: var(--glow-cyan); +} + /* ── Responsive ───────────────────────────────────────────────────── */ @media (max-width: 768px) { .host-grid { grid-template-columns:1fr; } diff --git a/templates/inspector.html b/templates/inspector.html index 441e4df..af3221a 100644 --- a/templates/inspector.html +++ b/templates/inspector.html @@ -264,6 +264,16 @@ function renderPanel(swName, idx) { } } + // Diagnose button (only when LLDP has an identified neighbor we can map) + const hasDiagTarget = !!(d.lldp && d.lldp.system_name && + _apiData.hosts && _apiData.hosts[d.lldp.system_name]); + const diagHtml = hasDiagTarget ? ` +
+ + +
+
` : ''; + const inner = document.getElementById('inspector-panel-inner'); inner.innerHTML = `
@@ -286,6 +296,7 @@ function renderPanel(swName, idx) { ${errHtml} ${lldpHtml} ${pathHtml} + ${diagHtml} `; document.getElementById('inspector-panel').classList.add('open'); @@ -387,5 +398,313 @@ async function loadInspector() { loadInspector(); setInterval(loadInspector, 60000); + +// ── Link Diagnostics ───────────────────────────────────────────────── +let _diagPollTimer = null; + +function runDiagnostic(swName, portIdx) { + const statusEl = document.getElementById('diag-status'); + const resultsEl = document.getElementById('diag-results'); + if (!statusEl || !resultsEl) return; + + // Clear any previous poll + if (_diagPollTimer) { clearInterval(_diagPollTimer); _diagPollTimer = null; } + + statusEl.textContent = 'Submitting to Pulse...'; + resultsEl.innerHTML = ''; + + fetch('/api/diagnose', { + method: 'POST', + headers: {'Content-Type': 'application/json'}, + body: JSON.stringify({switch_name: swName, port_idx: portIdx}), + }) + .then(r => r.json()) + .then(resp => { + if (resp.error) { + statusEl.textContent = 'Error: ' + resp.error; + return; + } + statusEl.textContent = 'Collecting diagnostics via Pulse...'; + pollDiagnostic(resp.job_id, statusEl, resultsEl); + }) + .catch(e => { + statusEl.textContent = 'Request failed: ' + e; + }); +} + +function pollDiagnostic(jobId, statusEl, resultsEl) { + let attempts = 0; + _diagPollTimer = setInterval(() => { + attempts++; + if (attempts > 120) { // 2min timeout + clearInterval(_diagPollTimer); + statusEl.textContent = 'Timed out waiting for results.'; + return; + } + fetch(`/api/diagnose/${jobId}`) + .then(r => r.json()) + .then(resp => { + if (resp.status === 'done') { + clearInterval(_diagPollTimer); + _diagPollTimer = null; + statusEl.textContent = ''; + renderDiagnosticResults(resp.result, resultsEl); + } + }) + .catch(() => {}); + }, 2000); +} + +function renderDiagnosticResults(d, container) { + if (!d || d.status === 'error') { + container.innerHTML = `
Diagnostic error: ${escHtml((d && d.error) || 'unknown')}
`; + return; + } + + const health = d.health || {}; + const issues = health.issues || []; + const warns = health.warnings || []; + const infoArr = health.info || []; + const secs = d.sections || {}; + const eth = secs.ethtool || {}; + const drv = secs.ethtool_driver || {}; + const pause = secs.ethtool_pause || {}; + const ring = secs.ethtool_ring || {}; + const dom = secs.ethtool_dom || {}; + const sysfs = secs.sysfs_stats || {}; + const dmesg = secs.dmesg || []; + const lldpctl = secs.lldpctl || {}; + const nicStats = secs.ethtool_stats || {}; + const swPort = d.switch_port || {}; + + // ── Health banner ── + let bannerHtml = ''; + if (issues.length === 0 && warns.length === 0) { + bannerHtml = '
ALL OK
'; + } else { + const parts = []; + if (issues.length) parts.push(`${issues.length} CRITICAL`); + if (warns.length) parts.push(`${warns.length} WARNING`); + bannerHtml = `
${parts.join(' ')}
`; + } + + const issueRows = [...issues, ...warns, ...infoArr].map(item => { + const cls = issues.includes(item) ? 'diag-val-bad' : warns.includes(item) ? 'diag-val-warn' : 'diag-val-good'; + const label = issues.includes(item) ? 'CRIT' : warns.includes(item) ? 'WARN' : 'INFO'; + return `
[${label}] ${escHtml(item.code)} — ${escHtml(item.message)}
`; + }).join(''); + + // ── Physical layer ── + const carrierVal = secs.carrier === '1' ? 'YES' : + secs.carrier === '0' ? 'NO' : '–'; + const operstateVal = (secs.operstate || '?').toUpperCase(); + const opstateCls = secs.operstate === 'up' ? 'diag-val-good' : secs.operstate === 'down' ? 'diag-val-bad' : 'diag-val-warn'; + const speedVal = eth.speed_mbps ? `${fmtSpeed(eth.speed_mbps)}bps` : ''; + const duplexVal = eth.duplex === 'full' ? 'Full' : + eth.duplex === 'half' ? 'Half' : '–'; + const linkDetVal = eth.link_detected === true ? 'Yes' : + eth.link_detected === false ? 'No' : '–'; + const autonegVal = eth.auto_neg === true ? 'On' : + eth.auto_neg === false ? 'Off' : '–'; + + const physHtml = ` +
+
Physical Layer
+ + + + + + + + ${secs.carrier_changes != null ? `` : ''} +
Carrier${carrierVal}
Oper State${escHtml(operstateVal)}
Speed${speedVal}
Duplex${duplexVal}
Link Detected${linkDetVal}
Auto-neg${autonegVal}
Carrier Changes${secs.carrier_changes}
+
`; + + // ── SFP / DOM ── + let domHtml = ''; + if (dom && Object.keys(dom).length > 0) { + const rxBar = dom.rx_power_dbm != null ? renderPowerBar(dom.rx_power_dbm, -18, -25) : ''; + const txBar = dom.tx_power_dbm != null ? renderPowerBar(dom.tx_power_dbm, -10, -13) : ''; + domHtml = ` +
+
SFP / DOM
+ + ${dom.vendor ? `` : ''} + ${dom.sfp_type ? `` : ''} + ${dom.connector ? `` : ''} + ${dom.wavelength_nm != null ? `` : ''} + ${dom.temp_c != null ? `` : ''} + ${dom.voltage_v != null ? `` : ''} + ${dom.bias_ma != null ? `` : ''} + ${dom.tx_power_dbm != null ? `` : ''} + ${dom.rx_power_dbm != null ? `` : ''} +
Vendor${escHtml(dom.vendor)}${dom.part_no ? ' / ' + escHtml(dom.part_no) : ''}
Type${escHtml(dom.sfp_type)}
Connector${escHtml(dom.connector)}
Wavelength${dom.wavelength_nm} nm
Temperature${dom.temp_c.toFixed(1)} °C
Voltage${dom.voltage_v.toFixed(4)} V
Bias Current${dom.bias_ma.toFixed(3)} mA
TX Power${dom.tx_power_dbm.toFixed(2)} dBm ${txBar}
RX Power${dom.rx_power_dbm.toFixed(2)} dBm ${rxBar}
+
`; + } + + // ── NIC Error Counters ── + const errCounters = ['rx_crc_errors','rx_frame_errors','collisions','tx_carrier_errors','rx_missed_errors','rx_fifo_errors']; + const nonZeroCounters = errCounters.filter(k => sysfs[k] > 0); + let errCounterHtml = ''; + if (nonZeroCounters.length > 0 || secs.carrier_changes > 0) { + const rows = nonZeroCounters.map(k => { + const v = sysfs[k]; + const cls = v > 100 ? 'diag-val-bad' : 'diag-val-warn'; + return `${escHtml(k)}${v.toLocaleString()}`; + }).join(''); + errCounterHtml = ` +
+
NIC Error Counters
+ + ${rows || ''} +
All zero
+
`; + } + + // ── ethtool -S (collapsible) ── + let nicStatHtml = ''; + if (Object.keys(nicStats).length > 0) { + const _ERR_KEYS = /err|drop|miss|crc|frame|fifo|abort|carrier|collision|fault|discard|overflow|reset/i; + const rows = Object.entries(nicStats).map(([k, v]) => { + const cls = _ERR_KEYS.test(k) && v > 0 ? ' class="diag-stat-nonzero-warn"' : ''; + return `${escHtml(k)}${v.toLocaleString()}`; + }).join(''); + nicStatHtml = ` +
+
+ ethtool -S (NIC stats) [expand] +
+
+ ${rows}
+
+
`; + } + + // ── Flow Control + Ring Buffers ── + let flowRingHtml = ''; + const hasPause = Object.keys(pause).length > 0; + const hasRing = Object.keys(ring).length > 0; + if (hasPause || hasRing) { + flowRingHtml = ` +
+
Flow Control & Ring Buffers
+ + ${hasPause ? ` + + ` : ''} + ${hasRing ? ` + + ` : ''} +
RX Pause${pause.rx_pause ? 'On' : 'Off'}
TX Pause${pause.tx_pause ? 'On' : 'Off'}
RX Ring${ring.rx_current != null ? ring.rx_current : '–'} / ${ring.rx_max != null ? ring.rx_max : '–'} max
TX Ring${ring.tx_current != null ? ring.tx_current : '–'} / ${ring.tx_max != null ? ring.tx_max : '–'} max
+
`; + } + + // ── Driver Info ── + let drvHtml = ''; + if (Object.keys(drv).length > 0) { + drvHtml = ` +
+
Driver Info
+ + ${drv.driver ? `` : ''} + ${drv.version ? `` : ''} + ${drv.firmware_version ? `` : ''} + ${drv.bus_info ? `` : ''} +
Driver${escHtml(drv.driver)}
Version${escHtml(drv.version)}
Firmware${escHtml(drv.firmware_version)}
Bus${escHtml(drv.bus_info)}
+
`; + } + + // ── LLDP Validation ── + let lldpValHtml = ''; + const swLldp = swPort.lldp || {}; + lldpValHtml = ` +
+
LLDP Validation
+
+
+
Switch sees
+
System${escHtml(swLldp.system_name || '–')}
+
Port${escHtml(swLldp.port_id || '–')}
+
Chassis${escHtml(swLldp.chassis_id || '–')}
+
+
+
Server lldpctl
+ ${lldpctl.available + ? `
Neighbor${escHtml(lldpctl.neighbor_system || '–')}
+
Port${escHtml(lldpctl.neighbor_port || '–')}
` + : '
lldpd not running
'} +
+
+
`; + + // ── dmesg ── + let dmesgHtml = ''; + if (dmesg.length > 0) { + const dlines = dmesg.map(e => { + const cls = e.severity === 'error' ? ' diag-dmesg-err' : e.severity === 'warn' ? ' diag-dmesg-warn' : ''; + const ts = e.timestamp ? `[${e.timestamp}] ` : ''; + return `
${escHtml(ts + e.msg)}
`; + }).join(''); + dmesgHtml = ` +
+
+ Kernel Events (dmesg) [expand] +
+
+
${dlines}
+
+
`; + } + + // ── Switch Port Summary ── + const swSummaryHtml = ` +
+
Switch Port Summary
+ + + + + + + ${swPort.poe_power != null ? `` : ''} +
Status${swPort.up ? 'UP' : 'DOWN'}
Speed${swPort.speed_mbps ? fmtSpeed(swPort.speed_mbps) + 'bps' : '–'}
Duplex${swPort.full_duplex ? 'Full' : (swPort.up ? 'Half' : '–')}
TX Err${fmtErrors(swPort.tx_errs_rate)}
RX Err${fmtErrors(swPort.rx_errs_rate)}
PoE${swPort.poe_power.toFixed(1)}W
+
`; + + // ── Pulse link ── + const pulseLink = d.pulse_url + ? `` + : ''; + + container.innerHTML = ` +
+ ${bannerHtml} +
${issueRows}
+ ${physHtml} + ${domHtml} + ${errCounterHtml} + ${nicStatHtml} + ${flowRingHtml} + ${drvHtml} + ${lldpValHtml} + ${dmesgHtml} + ${swSummaryHtml} + ${pulseLink} +
`; +} + +// SFP power bar: range is 0 dBm (best) to -35 dBm (worst); dbm is placed on that scale as a percentage clamped to 0..100, the warn/crit thresholds are converted on the same scale without clamping, and the colour class is diag-val-bad below critThreshold, diag-val-warn below warnThreshold, else diag-val-good +function renderPowerBar(dbm, warnThreshold, critThreshold) { + const minDbm = -35, maxDbm = 0; + const pct = Math.max(0, Math.min(100, ((dbm - minDbm) / (maxDbm - minDbm)) * 100)); + const warnPct = ((warnThreshold - minDbm) / (maxDbm - minDbm)) * 100; + const critPct = ((critThreshold - minDbm) / (maxDbm - minDbm)) * 100; + const barCls = dbm < critThreshold ? 'diag-val-bad' : dbm < warnThreshold ? 'diag-val-warn' : 'diag-val-good'; + return ` + + + + `; +} {% endblock %}