feat: inspector page, link debug enhancements, security hardening
- Add /inspector page: visual model-accurate switch chassis diagrams (USF5P, USL8A, US24PRO, USPPDUP, USMINI), clickable port blocks with color coding (green=up, amber=PoE, cyan=uplink, grey=down), detail panel with stats/PoE/LLDP, LLDP-based path debug side-by-side - Link Debug: port number badges (#N), LLDP neighbor line, PoE class/max, collapsible host/switch panels with sessionStorage persistence - monitor.py: collect LLDP neighbor map + PoE class/max/mode per switch port; PulseClient uses requests.Session() for HTTP keep-alive; add shlex.quote() around interface names (defense-in-depth) - Security: suppress buttons use data-* attrs + delegated click handler instead of inline onclick with Jinja2 variable interpolation; remove | safe filter from user-controlled fields in suppressions.html; setDuration() takes explicit el param instead of implicit event global - db.py: thread-local connection reuse with ping(reconnect=True) to avoid a new TCP handshake per query - .gitignore: add config.json (contains credentials), __pycache__ - README: full rewrite covering architecture, all 4 pages, alert logic, config reference, deployment, troubleshooting, security notes Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
235
monitor.py
235
monitor.py
@@ -10,6 +10,7 @@ Run as a separate systemd service alongside the Flask web app.
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import shlex
|
||||
import subprocess
|
||||
import time
|
||||
from datetime import datetime
|
||||
@@ -120,6 +121,70 @@ class UnifiClient:
|
||||
logger.error(f'UniFi API error: {e}')
|
||||
return None
|
||||
|
||||
def get_switch_ports(self) -> Optional[Dict[str, dict]]:
|
||||
"""Return per-port stats for all UniFi switches, keyed by switch name.
|
||||
|
||||
Uses the v1 stat API which includes full port_table data.
|
||||
Returns {switch_name: {'ip': str, 'model': str, 'ports': {port_name: {...}}}}.
|
||||
"""
|
||||
try:
|
||||
url = f'{self.base_url}/proxy/network/api/s/{self.site_id}/stat/device'
|
||||
resp = self.session.get(url, headers=self.headers, timeout=15)
|
||||
resp.raise_for_status()
|
||||
devices = resp.json().get('data', [])
|
||||
result: Dict[str, dict] = {}
|
||||
for dev in devices:
|
||||
if dev.get('type', '').lower() != 'usw':
|
||||
continue
|
||||
sw_name = dev.get('name') or dev.get('mac', 'unknown')
|
||||
sw_ip = dev.get('ip', '')
|
||||
sw_model = dev.get('model', '')
|
||||
ports: Dict[str, dict] = {}
|
||||
# Build LLDP neighbor map (keyed by port_idx)
|
||||
lldp_map: Dict[int, dict] = {}
|
||||
for entry in dev.get('lldp_table', []):
|
||||
pidx = entry.get('lldp_port_idx')
|
||||
if pidx is not None:
|
||||
lldp_map[int(pidx)] = {
|
||||
'chassis_id': entry.get('chassis_id', ''),
|
||||
'system_name': entry.get('system_name', ''),
|
||||
'port_id': entry.get('port_id', ''),
|
||||
'port_desc': entry.get('port_desc', ''),
|
||||
'mgmt_ips': entry.get('management_ips', []),
|
||||
}
|
||||
for port in dev.get('port_table', []):
|
||||
idx = port.get('port_idx', 0)
|
||||
pname = port.get('name') or f'Port {idx}'
|
||||
raw_poe = port.get('poe_power')
|
||||
raw_poe_max = port.get('poe_max_power')
|
||||
ports[pname] = {
|
||||
'port_idx': idx,
|
||||
'switch_ip': sw_ip,
|
||||
'up': port.get('up', False),
|
||||
'speed_mbps': port.get('speed', 0),
|
||||
'full_duplex': port.get('full_duplex', False),
|
||||
'autoneg': port.get('autoneg', False),
|
||||
'is_uplink': port.get('is_uplink', False),
|
||||
'media': port.get('media', ''),
|
||||
'poe_power': float(raw_poe) if raw_poe is not None else None,
|
||||
'poe_class': port.get('poe_class'),
|
||||
'poe_max_power': float(raw_poe_max) if raw_poe_max is not None else None,
|
||||
'poe_mode': port.get('poe_mode', ''),
|
||||
'lldp': lldp_map.get(idx),
|
||||
'tx_bytes': port.get('tx_bytes', 0),
|
||||
'rx_bytes': port.get('rx_bytes', 0),
|
||||
'tx_errors': port.get('tx_errors', 0),
|
||||
'rx_errors': port.get('rx_errors', 0),
|
||||
'tx_dropped': port.get('tx_dropped', 0),
|
||||
'rx_dropped': port.get('rx_dropped', 0),
|
||||
}
|
||||
if ports:
|
||||
result[sw_name] = {'ip': sw_ip, 'model': sw_model, 'ports': ports}
|
||||
return result
|
||||
except Exception as e:
|
||||
logger.error(f'UniFi switch port stats error: {e}')
|
||||
return None
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# Ticket client
|
||||
@@ -162,29 +227,90 @@ class TicketClient:
|
||||
return None
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# Pulse HTTP client (delegates SSH commands to Pulse worker)
|
||||
# --------------------------------------------------------------------------
|
||||
class PulseClient:
    """Submit a command to a Pulse worker via the internal M2M API and poll for result."""

    def __init__(self, cfg: dict):
        # All settings live under the 'pulse' key of the app config.
        pulse_cfg = cfg.get('pulse', {})
        self.url = pulse_cfg.get('url', '').rstrip('/')
        self.api_key = pulse_cfg.get('api_key', '')
        self.worker_id = pulse_cfg.get('worker_id', '')
        self.timeout = pulse_cfg.get('timeout', 45)
        # Persistent session gives us HTTP keep-alive across submit + polls.
        self.session = requests.Session()
        self.session.headers.update({
            'X-Gandalf-API-Key': self.api_key,
            'Content-Type': 'application/json',
        })

    def run_command(self, command: str) -> Optional[str]:
        """Submit *command* to Pulse, poll until done, return stdout or None."""
        # Unconfigured client: bail out quietly so callers can treat Pulse
        # as an optional backend.
        if not (self.url and self.api_key and self.worker_id):
            return None

        # Phase 1: submit the command and obtain an execution id to poll.
        try:
            resp = self.session.post(
                f'{self.url}/api/internal/command',
                json={'worker_id': self.worker_id, 'command': command},
                timeout=10,
            )
            resp.raise_for_status()
            execution_id = resp.json()['execution_id']
        except Exception as e:
            logger.debug(f'Pulse command submit failed: {e}')
            return None

        # Phase 2: poll once per second until completion, failure or deadline.
        deadline = time.time() + self.timeout
        while time.time() < deadline:
            time.sleep(1)
            try:
                poll = self.session.get(
                    f'{self.url}/api/internal/executions/{execution_id}',
                    timeout=10,
                )
                poll.raise_for_status()
                payload = poll.json()
            except Exception as e:
                # Transient poll errors are tolerated; retry until deadline.
                logger.debug(f'Pulse poll failed: {e}')
                continue

            status = payload.get('status')
            if status == 'completed':
                # stdout is carried in the 'command_result' log entry.
                for entry in payload.get('logs', []):
                    if entry.get('action') == 'command_result':
                        return entry.get('stdout', '')
                return ''
            if status == 'failed':
                return None

        logger.warning(f'Pulse command timed out after {self.timeout}s')
        return None
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# Link stats collector (ethtool + Prometheus traffic metrics)
|
||||
# --------------------------------------------------------------------------
|
||||
class LinkStatsCollector:
|
||||
"""Collects detailed per-interface statistics via SSH (ethtool) and Prometheus."""
|
||||
"""Collects detailed per-interface statistics via SSH (ethtool) and Prometheus,
|
||||
plus per-port stats from UniFi switches."""
|
||||
|
||||
def __init__(self, cfg: dict, prom: 'PrometheusClient'):
|
||||
self.prom = prom
|
||||
ssh = cfg.get('ssh', {})
|
||||
self.ssh_user = ssh.get('user', 'root')
|
||||
self.ssh_pass = ssh.get('password', '')
|
||||
self.ssh_connect_timeout = ssh.get('connect_timeout', 5)
|
||||
self.ssh_timeout = ssh.get('timeout', 20)
|
||||
def __init__(self, cfg: dict, prom: 'PrometheusClient',
|
||||
unifi: Optional['UnifiClient'] = None):
|
||||
self.prom = prom
|
||||
self.pulse = PulseClient(cfg)
|
||||
self.unifi = unifi
|
||||
# State for UniFi rate calculation (previous snapshot + timestamp)
|
||||
self._prev_unifi: Dict[str, dict] = {}
|
||||
self._prev_unifi_time: float = 0.0
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# SSH collection
|
||||
# SSH collection (via Pulse worker)
|
||||
# ------------------------------------------------------------------
|
||||
def _ssh_batch(self, ip: str, ifaces: List[str]) -> Dict[str, dict]:
|
||||
"""
|
||||
Open one SSH session to *ip* and collect ethtool + SFP DOM data for
|
||||
all *ifaces*. Returns {iface: {speed_mbps, duplex, ..., sfp: {...}}}.
|
||||
Delegate one SSH session to the Pulse worker to collect ethtool + SFP DOM
|
||||
data for all *ifaces*. Returns {iface: {speed_mbps, duplex, ..., sfp: {...}}}.
|
||||
"""
|
||||
if not ifaces or not self.ssh_pass:
|
||||
if not ifaces or not self.pulse.url:
|
||||
return {}
|
||||
|
||||
# Validate interface names (kernel names only contain [a-zA-Z0-9_.-])
|
||||
@@ -195,37 +321,23 @@ class LinkStatsCollector:
|
||||
# Build a single shell command: for each iface output ethtool + -m with sentinels
|
||||
parts = []
|
||||
for iface in safe_ifaces:
|
||||
q = shlex.quote(iface)
|
||||
parts.append(
|
||||
f'echo "___IFACE:{iface}___";'
|
||||
f' ethtool "{iface}" 2>/dev/null;'
|
||||
f' ethtool {q} 2>/dev/null;'
|
||||
f' echo "___DOM:{iface}___";'
|
||||
f' ethtool -m "{iface}" 2>/dev/null;'
|
||||
f' ethtool -m {q} 2>/dev/null;'
|
||||
f' echo "___END___"'
|
||||
)
|
||||
shell_cmd = ' '.join(parts)
|
||||
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[
|
||||
'sshpass', '-p', self.ssh_pass,
|
||||
'ssh',
|
||||
'-o', 'StrictHostKeyChecking=no',
|
||||
'-o', f'ConnectTimeout={self.ssh_connect_timeout}',
|
||||
'-o', 'LogLevel=ERROR',
|
||||
'-o', 'BatchMode=no',
|
||||
f'{self.ssh_user}@{ip}',
|
||||
shell_cmd,
|
||||
],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=self.ssh_timeout,
|
||||
)
|
||||
output = result.stdout
|
||||
except FileNotFoundError:
|
||||
logger.debug('sshpass not found – skipping ethtool collection')
|
||||
return {}
|
||||
except Exception as e:
|
||||
logger.debug(f'SSH ethtool {ip}: {e}')
|
||||
ssh_cmd = (
|
||||
f'ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5 '
|
||||
f'-o LogLevel=ERROR root@{ip} "{shell_cmd}"'
|
||||
)
|
||||
output = self.pulse.run_command(ssh_cmd)
|
||||
if output is None:
|
||||
logger.debug(f'Pulse ethtool collection returned None for {ip}')
|
||||
return {}
|
||||
|
||||
return self._parse_ssh_output(output)
|
||||
@@ -415,9 +527,9 @@ class LinkStatsCollector:
|
||||
host_ip = instance.split(':')[0]
|
||||
ifaces = list(iface_metrics.keys())
|
||||
|
||||
# SSH ethtool collection (one connection per host, all ifaces)
|
||||
# SSH ethtool collection via Pulse worker (one connection per host, all ifaces)
|
||||
ethtool_data: Dict[str, dict] = {}
|
||||
if self.ssh_pass and ifaces:
|
||||
if self.pulse.url and ifaces:
|
||||
try:
|
||||
ethtool_data = self._ssh_batch(host_ip, ifaces)
|
||||
except Exception as e:
|
||||
@@ -438,11 +550,52 @@ class LinkStatsCollector:
|
||||
|
||||
result_hosts[host] = merged
|
||||
|
||||
# Collect UniFi switch port stats
|
||||
unifi_switches: dict = {}
|
||||
if self.unifi:
|
||||
try:
|
||||
raw = self.unifi.get_switch_ports()
|
||||
if raw is not None:
|
||||
now = time.time()
|
||||
unifi_switches = self._compute_unifi_rates(raw, now)
|
||||
self._prev_unifi = raw
|
||||
self._prev_unifi_time = now
|
||||
except Exception as e:
|
||||
logger.warning(f'UniFi switch port collection failed: {e}')
|
||||
|
||||
return {
|
||||
'hosts': result_hosts,
|
||||
'updated': datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S UTC'),
|
||||
'hosts': result_hosts,
|
||||
'unifi_switches': unifi_switches,
|
||||
'updated': datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S UTC'),
|
||||
}
|
||||
|
||||
def _compute_unifi_rates(self, raw: Dict[str, dict], now: float) -> Dict[str, dict]:
|
||||
"""Compute per-port byte/error rates from delta against previous snapshot."""
|
||||
dt = now - self._prev_unifi_time if self._prev_unifi_time > 0 else 0
|
||||
|
||||
def rate(new_val: int, old_val: int) -> Optional[float]:
|
||||
if dt <= 0:
|
||||
return None
|
||||
return max(0.0, (new_val - old_val) / dt)
|
||||
|
||||
result: Dict[str, dict] = {}
|
||||
for sw_name, sw_data in raw.items():
|
||||
prev_ports = self._prev_unifi.get(sw_name, {}).get('ports', {})
|
||||
merged_ports: Dict[str, dict] = {}
|
||||
for pname, d in sw_data['ports'].items():
|
||||
entry = dict(d)
|
||||
prev = prev_ports.get(pname, {})
|
||||
entry['tx_bytes_rate'] = rate(d['tx_bytes'], prev.get('tx_bytes', 0))
|
||||
entry['rx_bytes_rate'] = rate(d['rx_bytes'], prev.get('rx_bytes', 0))
|
||||
entry['tx_errs_rate'] = rate(d['tx_errors'], prev.get('tx_errors', 0))
|
||||
entry['rx_errs_rate'] = rate(d['rx_errors'], prev.get('rx_errors', 0))
|
||||
entry['tx_drops_rate'] = rate(d['tx_dropped'], prev.get('tx_dropped', 0))
|
||||
entry['rx_drops_rate'] = rate(d['rx_dropped'], prev.get('rx_dropped', 0))
|
||||
merged_ports[pname] = entry
|
||||
result[sw_name] = {'ip': sw_data['ip'], 'model': sw_data['model'],
|
||||
'ports': merged_ports}
|
||||
return result
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# Helpers
|
||||
@@ -479,7 +632,7 @@ class NetworkMonitor:
|
||||
self.prom = PrometheusClient(prom_url)
|
||||
self.unifi = UnifiClient(self.cfg['unifi'])
|
||||
self.tickets = TicketClient(self.cfg.get('ticket_api', {}))
|
||||
self.link_stats = LinkStatsCollector(self.cfg, self.prom)
|
||||
self.link_stats = LinkStatsCollector(self.cfg, self.prom, self.unifi)
|
||||
|
||||
mon = self.cfg.get('monitor', {})
|
||||
self.poll_interval = mon.get('poll_interval', 120)
|
||||
|
||||
Reference in New Issue
Block a user