feat: inspector page, link debug enhancements, security hardening

- Add /inspector page: visual model-accurate switch chassis diagrams
  (USF5P, USL8A, US24PRO, USPPDUP, USMINI), clickable port blocks
  with color coding (green=up, amber=PoE, cyan=uplink, grey=down),
  detail panel with stats/PoE/LLDP, LLDP-based path debug side-by-side

- Link Debug: port number badges (#N), LLDP neighbor line, PoE class/max,
  collapsible host/switch panels with sessionStorage persistence

- monitor.py: collect LLDP neighbor map + PoE class/max/mode per switch
  port; PulseClient uses requests.Session() for HTTP keep-alive; add
  shlex.quote() around interface names (defense-in-depth)

- Security: suppress buttons use data-* attrs + delegated click handler
  instead of inline onclick with Jinja2 variable interpolation; remove
  | safe filter from user-controlled fields in suppressions.html;
  setDuration() takes explicit el param instead of implicit event global

- db.py: thread-local connection reuse with ping(reconnect=True) to
  avoid a new TCP handshake per query

- .gitignore: add config.json (contains credentials), __pycache__

- README: full rewrite covering architecture, all 4 pages, alert logic,
  config reference, deployment, troubleshooting, security notes

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-03 15:39:48 -05:00
parent fa7512a2c2
commit 0278dad502
12 changed files with 1548 additions and 176 deletions

View File

@@ -10,6 +10,7 @@ Run as a separate systemd service alongside the Flask web app.
import json
import logging
import re
import shlex
import subprocess
import time
from datetime import datetime
@@ -120,6 +121,70 @@ class UnifiClient:
logger.error(f'UniFi API error: {e}')
return None
def get_switch_ports(self) -> Optional[Dict[str, dict]]:
    """Fetch per-port statistics for every UniFi switch on the site.

    Queries the v1 ``stat/device`` endpoint, which includes the full
    ``port_table`` plus the ``lldp_table`` neighbor data.

    Returns:
        ``{switch_name: {'ip': str, 'model': str, 'ports': {port_name: {...}}}}``,
        or ``None`` when the controller API call fails.
    """
    try:
        endpoint = f'{self.base_url}/proxy/network/api/s/{self.site_id}/stat/device'
        response = self.session.get(endpoint, headers=self.headers, timeout=15)
        response.raise_for_status()

        switches: Dict[str, dict] = {}
        for device in response.json().get('data', []):
            # Only UniFi switches ('usw'); skip APs, gateways, etc.
            if device.get('type', '').lower() != 'usw':
                continue
            ip_addr = device.get('ip', '')

            # LLDP neighbors keyed by the local port index, so each port
            # entry below can attach its directly connected peer.
            neighbors: Dict[int, dict] = {
                int(row['lldp_port_idx']): {
                    'chassis_id': row.get('chassis_id', ''),
                    'system_name': row.get('system_name', ''),
                    'port_id': row.get('port_id', ''),
                    'port_desc': row.get('port_desc', ''),
                    'mgmt_ips': row.get('management_ips', []),
                }
                for row in device.get('lldp_table', [])
                if row.get('lldp_port_idx') is not None
            }

            port_map: Dict[str, dict] = {}
            for p in device.get('port_table', []):
                idx = p.get('port_idx', 0)
                label = p.get('name') or f'Port {idx}'
                poe_watts = p.get('poe_power')
                poe_cap = p.get('poe_max_power')
                port_map[label] = {
                    'port_idx': idx,
                    'switch_ip': ip_addr,
                    'up': p.get('up', False),
                    'speed_mbps': p.get('speed', 0),
                    'full_duplex': p.get('full_duplex', False),
                    'autoneg': p.get('autoneg', False),
                    'is_uplink': p.get('is_uplink', False),
                    'media': p.get('media', ''),
                    # PoE figures arrive as strings from the API; normalize
                    # to float, keeping None when the field is absent.
                    'poe_power': float(poe_watts) if poe_watts is not None else None,
                    'poe_class': p.get('poe_class'),
                    'poe_max_power': float(poe_cap) if poe_cap is not None else None,
                    'poe_mode': p.get('poe_mode', ''),
                    'lldp': neighbors.get(idx),
                    'tx_bytes': p.get('tx_bytes', 0),
                    'rx_bytes': p.get('rx_bytes', 0),
                    'tx_errors': p.get('tx_errors', 0),
                    'rx_errors': p.get('rx_errors', 0),
                    'tx_dropped': p.get('tx_dropped', 0),
                    'rx_dropped': p.get('rx_dropped', 0),
                }

            # Omit switches that reported no ports at all.
            if port_map:
                switches[device.get('name') or device.get('mac', 'unknown')] = {
                    'ip': ip_addr,
                    'model': device.get('model', ''),
                    'ports': port_map,
                }
        return switches
    except Exception as e:
        logger.error(f'UniFi switch port stats error: {e}')
        return None
# --------------------------------------------------------------------------
# Ticket client
@@ -162,29 +227,90 @@ class TicketClient:
return None
# --------------------------------------------------------------------------
# Pulse HTTP client (delegates SSH commands to Pulse worker)
# --------------------------------------------------------------------------
class PulseClient:
    """Submit a command to a Pulse worker via the internal M2M API and poll for result."""

    def __init__(self, cfg: dict):
        pulse_cfg = cfg.get('pulse', {})
        self.url = pulse_cfg.get('url', '').rstrip('/')
        self.api_key = pulse_cfg.get('api_key', '')
        self.worker_id = pulse_cfg.get('worker_id', '')
        # Overall wall-clock budget (seconds) for a single run_command() call.
        self.timeout = pulse_cfg.get('timeout', 45)
        # One persistent session -> HTTP keep-alive across submit + polls.
        self.session = requests.Session()
        self.session.headers.update({
            'X-Gandalf-API-Key': self.api_key,
            'Content-Type': 'application/json',
        })

    def run_command(self, command: str) -> Optional[str]:
        """Submit *command* to Pulse, poll until done, return stdout or None."""
        # Without full Pulse config there is nothing to talk to.
        if not (self.url and self.api_key and self.worker_id):
            return None

        # Phase 1: submit the command, obtaining an execution id to poll.
        try:
            submit = self.session.post(
                f'{self.url}/api/internal/command',
                json={'worker_id': self.worker_id, 'command': command},
                timeout=10,
            )
            submit.raise_for_status()
            execution_id = submit.json()['execution_id']
        except Exception as e:
            logger.debug(f'Pulse command submit failed: {e}')
            return None

        # Phase 2: poll once a second until completion or the deadline.
        deadline = time.time() + self.timeout
        while time.time() < deadline:
            time.sleep(1)
            try:
                poll = self.session.get(
                    f'{self.url}/api/internal/executions/{execution_id}',
                    timeout=10,
                )
                poll.raise_for_status()
                payload = poll.json()
            except Exception as e:
                # Transient poll errors are non-fatal; keep trying.
                logger.debug(f'Pulse poll failed: {e}')
                continue

            state = payload.get('status')
            if state == 'completed':
                for entry in payload.get('logs', []):
                    if entry.get('action') == 'command_result':
                        return entry.get('stdout', '')
                # Completed but no command_result log entry: empty output.
                return ''
            if state == 'failed':
                return None

        logger.warning(f'Pulse command timed out after {self.timeout}s')
        return None
# --------------------------------------------------------------------------
# Link stats collector (ethtool + Prometheus traffic metrics)
# --------------------------------------------------------------------------
class LinkStatsCollector:
"""Collects detailed per-interface statistics via SSH (ethtool) and Prometheus."""
"""Collects detailed per-interface statistics via SSH (ethtool) and Prometheus,
plus per-port stats from UniFi switches."""
def __init__(self, cfg: dict, prom: 'PrometheusClient'):
self.prom = prom
ssh = cfg.get('ssh', {})
self.ssh_user = ssh.get('user', 'root')
self.ssh_pass = ssh.get('password', '')
self.ssh_connect_timeout = ssh.get('connect_timeout', 5)
self.ssh_timeout = ssh.get('timeout', 20)
def __init__(self, cfg: dict, prom: 'PrometheusClient',
unifi: Optional['UnifiClient'] = None):
self.prom = prom
self.pulse = PulseClient(cfg)
self.unifi = unifi
# State for UniFi rate calculation (previous snapshot + timestamp)
self._prev_unifi: Dict[str, dict] = {}
self._prev_unifi_time: float = 0.0
# ------------------------------------------------------------------
# SSH collection
# SSH collection (via Pulse worker)
# ------------------------------------------------------------------
def _ssh_batch(self, ip: str, ifaces: List[str]) -> Dict[str, dict]:
"""
Open one SSH session to *ip* and collect ethtool + SFP DOM data for
all *ifaces*. Returns {iface: {speed_mbps, duplex, ..., sfp: {...}}}.
Delegate one SSH session to the Pulse worker to collect ethtool + SFP DOM
data for all *ifaces*. Returns {iface: {speed_mbps, duplex, ..., sfp: {...}}}.
"""
if not ifaces or not self.ssh_pass:
if not ifaces or not self.pulse.url:
return {}
# Validate interface names (kernel names only contain [a-zA-Z0-9_.-])
@@ -195,37 +321,23 @@ class LinkStatsCollector:
# Build a single shell command: for each iface output ethtool + -m with sentinels
parts = []
for iface in safe_ifaces:
q = shlex.quote(iface)
parts.append(
f'echo "___IFACE:{iface}___";'
f' ethtool "{iface}" 2>/dev/null;'
f' ethtool {q} 2>/dev/null;'
f' echo "___DOM:{iface}___";'
f' ethtool -m "{iface}" 2>/dev/null;'
f' ethtool -m {q} 2>/dev/null;'
f' echo "___END___"'
)
shell_cmd = ' '.join(parts)
try:
result = subprocess.run(
[
'sshpass', '-p', self.ssh_pass,
'ssh',
'-o', 'StrictHostKeyChecking=no',
'-o', f'ConnectTimeout={self.ssh_connect_timeout}',
'-o', 'LogLevel=ERROR',
'-o', 'BatchMode=no',
f'{self.ssh_user}@{ip}',
shell_cmd,
],
capture_output=True,
text=True,
timeout=self.ssh_timeout,
)
output = result.stdout
except FileNotFoundError:
logger.debug('sshpass not found skipping ethtool collection')
return {}
except Exception as e:
logger.debug(f'SSH ethtool {ip}: {e}')
ssh_cmd = (
f'ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5 '
f'-o LogLevel=ERROR root@{ip} "{shell_cmd}"'
)
output = self.pulse.run_command(ssh_cmd)
if output is None:
logger.debug(f'Pulse ethtool collection returned None for {ip}')
return {}
return self._parse_ssh_output(output)
@@ -415,9 +527,9 @@ class LinkStatsCollector:
host_ip = instance.split(':')[0]
ifaces = list(iface_metrics.keys())
# SSH ethtool collection (one connection per host, all ifaces)
# SSH ethtool collection via Pulse worker (one connection per host, all ifaces)
ethtool_data: Dict[str, dict] = {}
if self.ssh_pass and ifaces:
if self.pulse.url and ifaces:
try:
ethtool_data = self._ssh_batch(host_ip, ifaces)
except Exception as e:
@@ -438,11 +550,52 @@ class LinkStatsCollector:
result_hosts[host] = merged
# Collect UniFi switch port stats
unifi_switches: dict = {}
if self.unifi:
try:
raw = self.unifi.get_switch_ports()
if raw is not None:
now = time.time()
unifi_switches = self._compute_unifi_rates(raw, now)
self._prev_unifi = raw
self._prev_unifi_time = now
except Exception as e:
logger.warning(f'UniFi switch port collection failed: {e}')
return {
'hosts': result_hosts,
'updated': datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S UTC'),
'hosts': result_hosts,
'unifi_switches': unifi_switches,
'updated': datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S UTC'),
}
def _compute_unifi_rates(self, raw: Dict[str, dict], now: float) -> Dict[str, dict]:
"""Compute per-port byte/error rates from delta against previous snapshot."""
dt = now - self._prev_unifi_time if self._prev_unifi_time > 0 else 0
def rate(new_val: int, old_val: int) -> Optional[float]:
if dt <= 0:
return None
return max(0.0, (new_val - old_val) / dt)
result: Dict[str, dict] = {}
for sw_name, sw_data in raw.items():
prev_ports = self._prev_unifi.get(sw_name, {}).get('ports', {})
merged_ports: Dict[str, dict] = {}
for pname, d in sw_data['ports'].items():
entry = dict(d)
prev = prev_ports.get(pname, {})
entry['tx_bytes_rate'] = rate(d['tx_bytes'], prev.get('tx_bytes', 0))
entry['rx_bytes_rate'] = rate(d['rx_bytes'], prev.get('rx_bytes', 0))
entry['tx_errs_rate'] = rate(d['tx_errors'], prev.get('tx_errors', 0))
entry['rx_errs_rate'] = rate(d['rx_errors'], prev.get('rx_errors', 0))
entry['tx_drops_rate'] = rate(d['tx_dropped'], prev.get('tx_dropped', 0))
entry['rx_drops_rate'] = rate(d['rx_dropped'], prev.get('rx_dropped', 0))
merged_ports[pname] = entry
result[sw_name] = {'ip': sw_data['ip'], 'model': sw_data['model'],
'ports': merged_ports}
return result
# --------------------------------------------------------------------------
# Helpers
@@ -479,7 +632,7 @@ class NetworkMonitor:
self.prom = PrometheusClient(prom_url)
self.unifi = UnifiClient(self.cfg['unifi'])
self.tickets = TicketClient(self.cfg.get('ticket_api', {}))
self.link_stats = LinkStatsCollector(self.cfg, self.prom)
self.link_stats = LinkStatsCollector(self.cfg, self.prom, self.unifi)
mon = self.cfg.get('monitor', {})
self.poll_interval = mon.get('poll_interval', 120)