feat: terminal aesthetic rewrite + link debug page

- Full dark terminal aesthetic (Pulse/TinkerTickets style):
  - #0a0a0a background, #00ff41 green, #ffb000 amber, #00ffff cyan
  - CRT scanline overlay, phosphor glow, ASCII corner pseudo-elements
  - Bracket-notation badges [CRITICAL], monospace font throughout
  - style.css, base.html, index.html, suppressions.html all rewritten

- New Link Debug page (/links, /api/links):
  - Per-host, per-interface cards with speed/duplex/port type/auto-neg
  - Traffic bars (TX cyan, RX green) with rate labels
  - Error/drop counters, carrier change history
  - SFP/DOM optical panel: vendor, temp, voltage, bias, TX/RX power dBm bars
  - RX-TX delta shown; color-coded warn/crit thresholds
  - Auto-refresh every 60s, anchor-jump to #hostname

- LinkStatsCollector in monitor.py:
  - SSHes to each host (one connection, all ifaces batched)
  - Parses ethtool + ethtool -m (SFP DOM) output
  - Merges with Prometheus traffic/error/carrier metrics
  - Stores as link_stats in monitor_state table

- config.json: added ssh section for ethtool collection
- app.js: terminal chip style consistency (uppercase, ● bullet)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-02 12:43:11 -05:00
parent 4356af1d84
commit fa7512a2c2
9 changed files with 1443 additions and 748 deletions

View File

@@ -162,6 +162,288 @@ class TicketClient:
return None
# --------------------------------------------------------------------------
# Link stats collector (ethtool + Prometheus traffic metrics)
# --------------------------------------------------------------------------
class LinkStatsCollector:
    """Collects detailed per-interface statistics via SSH (ethtool) and Prometheus.

    Two data sources are merged per host/interface:

    * ``ethtool`` and ``ethtool -m`` (SFP DOM) output, gathered over a single
      SSH session per host, and
    * Prometheus node_exporter traffic / error / carrier metrics.

    The result of :meth:`collect` is shaped for
    ``db.set_state('link_stats', ...)``.
    """

    def __init__(self, cfg: dict, prom: 'PrometheusClient'):
        """*cfg* is the application config dict; SSH settings come from ``cfg['ssh']``."""
        self.prom = prom
        ssh = cfg.get('ssh', {})
        self.ssh_user = ssh.get('user', 'root')
        # An empty password disables SSH/ethtool collection entirely.
        self.ssh_pass = ssh.get('password', '')
        self.ssh_connect_timeout = ssh.get('connect_timeout', 5)
        # Overall per-host time budget (seconds) for the batched SSH command.
        self.ssh_timeout = ssh.get('timeout', 20)

    # ------------------------------------------------------------------
    # SSH collection
    # ------------------------------------------------------------------
    def _ssh_batch(self, ip: str, ifaces: List[str]) -> Dict[str, dict]:
        """
        Open one SSH session to *ip* and collect ethtool + SFP DOM data for
        all *ifaces*. Returns {iface: {speed_mbps, duplex, ..., sfp: {...}}}.

        Returns {} when SSH is unconfigured, sshpass is missing, or the
        connection fails -- ethtool data is strictly best-effort.
        """
        if not ifaces or not self.ssh_pass:
            return {}
        # Validate interface names before interpolating them into a shell
        # command.  fullmatch (not match with ``$``) so a trailing newline
        # cannot sneak extra shell text past the check.
        safe_ifaces = [i for i in ifaces if re.fullmatch(r'[a-zA-Z0-9_.@-]+', i)]
        if not safe_ifaces:
            return {}
        # Build a single shell command: for each iface emit sentinel lines
        # around the ethtool and ethtool -m output so the parser can split
        # the combined stream back into per-interface sections.
        parts = []
        for iface in safe_ifaces:
            parts.append(
                f'echo "___IFACE:{iface}___";'
                f' ethtool "{iface}" 2>/dev/null;'
                f' echo "___DOM:{iface}___";'
                f' ethtool -m "{iface}" 2>/dev/null;'
                f' echo "___END___"'
            )
        # BUG FIX: the snippets must be joined with ';'.  Joining with a bare
        # space made each subsequent snippet extra arguments to the previous
        # `echo "___END___"`, so every interface after the first lost its
        # ethtool section.
        shell_cmd = '; '.join(parts)
        try:
            result = subprocess.run(
                [
                    # NOTE(review): sshpass exposes the password in the local
                    # process list; key-based auth would be preferable.
                    'sshpass', '-p', self.ssh_pass,
                    'ssh',
                    '-o', 'StrictHostKeyChecking=no',
                    '-o', f'ConnectTimeout={self.ssh_connect_timeout}',
                    '-o', 'LogLevel=ERROR',
                    '-o', 'BatchMode=no',
                    f'{self.ssh_user}@{ip}',
                    shell_cmd,
                ],
                capture_output=True,
                text=True,
                timeout=self.ssh_timeout,
            )
            output = result.stdout
        except FileNotFoundError:
            logger.debug('sshpass not found; skipping ethtool collection')
            return {}
        except Exception as e:
            # Best-effort: a single unreachable host must not abort the poll.
            logger.debug(f'SSH ethtool {ip}: {e}')
            return {}
        return self._parse_ssh_output(output)

    @staticmethod
    def _parse_ssh_output(output: str) -> Dict[str, dict]:
        """Split the sentinel-delimited batch output back into per-iface dicts."""
        result: Dict[str, dict] = {}
        current_iface: Optional[str] = None
        current_section: Optional[str] = None
        buf: List[str] = []

        def flush(iface, section, lines):
            # Parse the buffered section (if any) into result[iface].
            if not iface or not lines:
                return
            text = '\n'.join(lines)
            if section == 'ethtool':
                result.setdefault(iface, {}).update(
                    LinkStatsCollector._parse_ethtool(text)
                )
            elif section == 'dom':
                sfp = LinkStatsCollector._parse_ethtool_m(text)
                if sfp:
                    result.setdefault(iface, {})['sfp'] = sfp

        for line in output.splitlines():
            if line.startswith('___IFACE:') and line.endswith('___'):
                flush(current_iface, current_section, buf)
                current_iface = line[9:-3]  # strip "___IFACE:" ... "___"
                current_section = 'ethtool'
                buf = []
            elif line.startswith('___DOM:') and line.endswith('___'):
                flush(current_iface, current_section, buf)
                current_iface = line[7:-3]  # strip "___DOM:" ... "___"
                current_section = 'dom'
                buf = []
            elif line == '___END___':
                flush(current_iface, current_section, buf)
                current_iface = None
                current_section = None
                buf = []
            else:
                buf.append(line)
        # Flush a trailing section in case the final ___END___ was lost.
        flush(current_iface, current_section, buf)
        return result

    @staticmethod
    def _parse_ethtool(output: str) -> dict:
        """Parse plain ``ethtool <iface>`` output into a small settings dict."""
        data: dict = {}
        for line in output.splitlines():
            if ':' not in line:
                continue
            key, _, val = line.partition(':')
            key = key.strip()
            val = val.strip()
            if key == 'Speed':
                # e.g. "10000Mb/s"; "Unknown!" simply fails the match.
                m = re.match(r'(\d+)Mb/s', val)
                if m:
                    data['speed_mbps'] = int(m.group(1))
            elif key == 'Duplex':
                data['duplex'] = val.lower()
            elif key == 'Port':
                data['port_type'] = val
            elif key == 'Auto-negotiation':
                data['auto_neg'] = (val.lower() == 'on')
            elif key == 'Link detected':
                data['link_detected'] = (val.lower() == 'yes')
        return data

    @staticmethod
    def _parse_ethtool_m(output: str) -> dict:
        """Parse ethtool -m (SFP DOM / digital optical monitoring) output."""
        if not output:
            return {}
        # Skip if module diagnostics unsupported
        lower = output.lower()
        if 'cannot get' in lower or 'not supported' in lower or 'no sfp' in lower:
            return {}
        data: dict = {}
        for line in output.splitlines():
            if ':' not in line:
                continue
            key, _, val = line.partition(':')
            key = key.strip()
            val = val.strip()
            if key == 'Vendor name':
                data['vendor'] = val
            elif key == 'Vendor PN':
                data['part_no'] = val
            elif key == 'Identifier':
                # e.g. "0x03 (SFP)" -- keep the parenthesised name.
                m = re.search(r'\((.+?)\)', val)
                if m:
                    data['sfp_type'] = m.group(1)
            elif key == 'Connector':
                m = re.search(r'\((.+?)\)', val)
                if m:
                    data['connector'] = m.group(1)
            elif key == 'Laser wavelength':
                m = re.match(r'(\d+)', val)
                if m:
                    data['wavelength_nm'] = int(m.group(1))
            elif key == 'Laser bias current':
                # e.g. "4.340 mA"
                m = re.match(r'([\d.]+)\s+mA', val)
                if m:
                    data['bias_ma'] = float(m.group(1))
            elif key == 'Laser output power':
                # e.g. "0.1234 mW / -9.09 dBm" -- keep the dBm figure.
                m = re.search(r'/\s*([-\d.]+)\s*dBm', val)
                if m:
                    try:
                        data['tx_power_dbm'] = float(m.group(1))
                    except ValueError:
                        pass
            elif 'receiver' in key.lower() and ('power' in key.lower() or 'optical' in key.lower()):
                # Key wording varies by module ("Receiver signal average
                # optical power", "Rcvr signal avg optical power", ...).
                m = re.search(r'/\s*([-\d.]+)\s*dBm', val)
                if m:
                    try:
                        data['rx_power_dbm'] = float(m.group(1))
                    except ValueError:
                        pass
            elif key == 'Module temperature':
                # e.g. "36.00 degrees C / 96.80 degrees F"
                m = re.match(r'([\d.]+)\s+degrees', val)
                if m:
                    data['temp_c'] = float(m.group(1))
            elif key == 'Module voltage':
                # e.g. "3.3180 V"
                m = re.match(r'([\d.]+)\s+V', val)
                if m:
                    data['voltage_v'] = float(m.group(1))
        return data

    # ------------------------------------------------------------------
    # Prometheus traffic / error metrics
    # ------------------------------------------------------------------
    def _collect_prom_metrics(self) -> Dict[str, Dict[str, dict]]:
        """Return {instance: {device: {tx_bytes_rate, rx_bytes_rate, ...}}}."""
        metrics: Dict[str, Dict[str, dict]] = {}
        queries = [
            ('tx_bytes_rate', 'rate(node_network_transmit_bytes_total[5m])'),
            ('rx_bytes_rate', 'rate(node_network_receive_bytes_total[5m])'),
            ('tx_errs_rate', 'rate(node_network_transmit_errs_total[5m])'),
            ('rx_errs_rate', 'rate(node_network_receive_errs_total[5m])'),
            ('tx_drops_rate', 'rate(node_network_transmit_drop_total[5m])'),
            ('rx_drops_rate', 'rate(node_network_receive_drop_total[5m])'),
            ('carrier_changes', 'node_network_carrier_changes_total'),
        ]
        for field, promql in queries:
            for r in self.prom.query(promql):
                instance = r['metric'].get('instance', '')
                device = r['metric'].get('device', '')
                if not is_physical_interface(device):
                    continue
                raw = r['value'][1]
                try:
                    val: Optional[float] = float(raw)
                    if val != val:  # NaN never equals itself
                        val = None
                except (ValueError, TypeError):
                    val = None
                metrics.setdefault(instance, {}).setdefault(device, {})[field] = val
        return metrics

    # ------------------------------------------------------------------
    # Main collection entry point
    # ------------------------------------------------------------------
    def collect(self, instance_map: Dict[str, str]) -> dict:
        """
        Collect full link stats for all Prometheus-monitored hosts.

        *instance_map*: ``{'10.10.10.2:9100': 'large1', ...}`` mapping a
        Prometheus instance label to a friendly hostname.

        Returns a dict suitable for ``db.set_state('link_stats', ...)``.
        """
        # Local import keeps the module's import block untouched.
        from datetime import timezone

        prom_metrics = self._collect_prom_metrics()
        result_hosts: Dict[str, Dict[str, dict]] = {}
        for instance, iface_metrics in prom_metrics.items():
            # Fall back to the bare IP when the instance is unmapped.
            host = instance_map.get(instance, instance.split(':')[0])
            host_ip = instance.split(':')[0]
            ifaces = list(iface_metrics.keys())
            # SSH ethtool collection (one connection per host, all ifaces)
            ethtool_data: Dict[str, dict] = {}
            if self.ssh_pass and ifaces:
                try:
                    ethtool_data = self._ssh_batch(host_ip, ifaces)
                except Exception as e:
                    logger.warning(f'ethtool collection failed for {host} ({host_ip}): {e}')
            # Merge Prometheus metrics + ethtool data per interface
            merged: Dict[str, dict] = {}
            for iface in ifaces:
                d: dict = {'host_ip': host_ip}
                d.update(iface_metrics.get(iface, {}))
                eth = ethtool_data.get(iface, {})
                for k, v in eth.items():
                    if k != 'sfp':
                        d[k] = v
                if 'sfp' in eth:
                    d['sfp'] = eth['sfp']
                merged[iface] = d
            result_hosts[host] = merged
        return {
            'hosts': result_hosts,
            # tz-aware replacement for the deprecated datetime.utcnow();
            # identical output string since the format carries no tz field.
            'updated': datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC'),
        }
# --------------------------------------------------------------------------
# Helpers
# --------------------------------------------------------------------------
@@ -197,6 +479,7 @@ class NetworkMonitor:
self.prom = PrometheusClient(prom_url)
self.unifi = UnifiClient(self.cfg['unifi'])
self.tickets = TicketClient(self.cfg.get('ticket_api', {}))
self.link_stats = LinkStatsCollector(self.cfg, self.prom)
mon = self.cfg.get('monitor', {})
self.poll_interval = mon.get('poll_interval', 120)
@@ -457,7 +740,14 @@ class NetworkMonitor:
db.set_state('network_snapshot', snapshot)
db.set_state('last_check', _now_utc())
# 2. Process alerts (separate Prometheus call for fresh data)
# 2. Collect link stats (ethtool + traffic metrics)
try:
link_data = self.link_stats.collect(self._instance_map)
db.set_state('link_stats', link_data)
except Exception as e:
logger.error(f'Link stats collection failed: {e}', exc_info=True)
# 3. Process alerts (separate Prometheus call for fresh data)
iface_states = self.prom.get_interface_states()
self._process_interfaces(iface_states)