feat: terminal aesthetic rewrite + link debug page
- Full dark terminal aesthetic (Pulse/TinkerTickets style):
  - #0a0a0a background, #00ff41 green, #ffb000 amber, #00ffff cyan
  - CRT scanline overlay, phosphor glow, ASCII corner pseudo-elements
  - Bracket-notation badges [CRITICAL], monospace font throughout
  - style.css, base.html, index.html, suppressions.html all rewritten
- New Link Debug page (/links, /api/links):
  - Per-host, per-interface cards with speed/duplex/port type/auto-neg
  - Traffic bars (TX cyan, RX green) with rate labels
  - Error/drop counters, carrier change history
  - SFP/DOM optical panel: vendor, temp, voltage, bias, TX/RX power dBm bars
  - RX-TX delta shown; color-coded warn/crit thresholds
  - Auto-refresh every 60s, anchor-jump to #hostname
- LinkStatsCollector in monitor.py:
  - SSHes to each host (one connection, all ifaces batched)
  - Parses ethtool + ethtool -m (SFP DOM) output
  - Merges with Prometheus traffic/error/carrier metrics
  - Stores as link_stats in monitor_state table
- config.json: added ssh section for ethtool collection
- app.js: terminal chip style consistency (uppercase, ● bullet)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
292
monitor.py
292
monitor.py
@@ -162,6 +162,288 @@ class TicketClient:
|
||||
return None
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# Link stats collector (ethtool + Prometheus traffic metrics)
|
||||
# --------------------------------------------------------------------------
|
||||
class LinkStatsCollector:
    """Collect per-interface link statistics via SSH (ethtool) and Prometheus.

    Prometheus supplies traffic, error, drop and carrier-change figures.
    When an SSH password is configured (``cfg['ssh']['password']``), one SSH
    session per host additionally gathers ``ethtool`` link settings and
    ``ethtool -m`` SFP DOM readings for every interface Prometheus reported.
    """

    # Interface names allowed into the remote shell command.  Only letters,
    # digits and ``_ . @ -`` are accepted, which is what makes it safe to
    # interpolate a name into the command string.  The first character may
    # not be '-' so a name can never be read by ethtool as an option.
    _IFACE_RE = re.compile(r'[a-zA-Z0-9_.@][a-zA-Z0-9_.@-]*')

    def __init__(self, cfg: dict, prom: 'PrometheusClient'):
        """Store the Prometheus client and read SSH settings from *cfg*.

        *cfg* is the full application config; only its optional ``ssh``
        section is read (``user``, ``password``, ``connect_timeout``,
        ``timeout``).  An empty password disables SSH collection.
        """
        self.prom = prom
        ssh = cfg.get('ssh', {})
        self.ssh_user = ssh.get('user', 'root')
        self.ssh_pass = ssh.get('password', '')
        self.ssh_connect_timeout = ssh.get('connect_timeout', 5)
        self.ssh_timeout = ssh.get('timeout', 20)

    # ------------------------------------------------------------------
    # SSH collection
    # ------------------------------------------------------------------
    @staticmethod
    def _build_shell_cmd(ifaces: List[str]) -> str:
        """Return one shell command that dumps ethtool data for *ifaces*.

        For each interface the command prints a ``___IFACE:<name>___``
        sentinel, the ``ethtool`` output, a ``___DOM:<name>___`` sentinel,
        the ``ethtool -m`` output and finally ``___END___``.

        Callers must pass only validated interface names; they are
        interpolated into the command unescaped.
        """
        # Fragments are joined with '; ' so each fragment's trailing echo is
        # terminated before the next one starts.  Joining with a bare space
        # would hand the next fragment's first words to that echo as extra
        # arguments and corrupt the sentinels of every interface after the
        # first.
        return '; '.join(
            f'echo "___IFACE:{iface}___";'
            f' ethtool "{iface}" 2>/dev/null;'
            f' echo "___DOM:{iface}___";'
            f' ethtool -m "{iface}" 2>/dev/null;'
            f' echo "___END___"'
            for iface in ifaces
        )

    def _ssh_batch(self, ip: str, ifaces: List[str]) -> Dict[str, dict]:
        """
        Open one SSH session to *ip* and collect ethtool + SFP DOM data for
        all *ifaces*.  Returns {iface: {speed_mbps, duplex, ..., sfp: {...}}}.

        Returns an empty dict when no password is configured, when no
        interface name passes validation, when ``sshpass`` is missing, or
        when the SSH call fails or times out (failures are logged at debug
        level; collection is best-effort).
        """
        if not ifaces or not self.ssh_pass:
            return {}

        # fullmatch (rather than match with '$') also rejects names that
        # carry a trailing newline.
        safe_ifaces = [i for i in ifaces if self._IFACE_RE.fullmatch(i)]
        if not safe_ifaces:
            return {}

        # Local import: keeps this method independent of the file's import block.
        import os

        try:
            completed = subprocess.run(
                [
                    # '-e' makes sshpass read the password from $SSHPASS, so
                    # it does not appear in the process list the way a
                    # '-p <password>' argument does.
                    'sshpass', '-e',
                    'ssh',
                    # NOTE(review): host keys are not verified, which leaves
                    # the session open to man-in-the-middle attacks; consider
                    # a managed known_hosts file instead.
                    '-o', 'StrictHostKeyChecking=no',
                    '-o', f'ConnectTimeout={self.ssh_connect_timeout}',
                    '-o', 'LogLevel=ERROR',
                    # Password prompting must stay enabled for sshpass to
                    # be able to answer it.
                    '-o', 'BatchMode=no',
                    f'{self.ssh_user}@{ip}',
                    self._build_shell_cmd(safe_ifaces),
                ],
                capture_output=True,
                text=True,
                timeout=self.ssh_timeout,
                env=dict(os.environ, SSHPASS=self.ssh_pass),
            )
        except FileNotFoundError:
            logger.debug('sshpass not found – skipping ethtool collection')
            return {}
        except Exception as e:
            # Deliberately broad: ethtool data is optional, so any failure
            # (timeout, OS error, undecodable output) just skips this host.
            logger.debug(f'SSH ethtool {ip}: {e}')
            return {}

        # The exit status is ignored on purpose; whatever was printed before
        # a failure is still parsed.
        return self._parse_ssh_output(completed.stdout)

    @staticmethod
    def _parse_ssh_output(output: str) -> Dict[str, dict]:
        """Split sentinel-delimited SSH output into per-interface dicts.

        *output* is the stdout produced by the command from
        ``_build_shell_cmd``.  Lines between ``___IFACE:<name>___`` and the
        next sentinel are parsed as ``ethtool`` output; lines between
        ``___DOM:<name>___`` and the next sentinel as ``ethtool -m`` output,
        stored under the ``'sfp'`` key when anything was recognised.
        Interfaces whose sections are all empty do not appear in the result.
        """
        iface_open, dom_open, sentinel_close = '___IFACE:', '___DOM:', '___'

        result: Dict[str, dict] = {}
        current_iface: Optional[str] = None
        current_section: Optional[str] = None
        buf: List[str] = []

        def flush(iface, section, lines):
            # Parse the buffered lines of the section that just ended.
            if not iface or not lines:
                return
            text = '\n'.join(lines)
            if section == 'ethtool':
                result.setdefault(iface, {}).update(
                    LinkStatsCollector._parse_ethtool(text)
                )
            elif section == 'dom':
                sfp = LinkStatsCollector._parse_ethtool_m(text)
                if sfp:
                    result.setdefault(iface, {})['sfp'] = sfp

        for line in output.splitlines():
            if line.startswith(iface_open) and line.endswith(sentinel_close):
                flush(current_iface, current_section, buf)
                current_iface = line[len(iface_open):-len(sentinel_close)]
                current_section = 'ethtool'
                buf = []
            elif line.startswith(dom_open) and line.endswith(sentinel_close):
                flush(current_iface, current_section, buf)
                current_iface = line[len(dom_open):-len(sentinel_close)]
                current_section = 'dom'
                buf = []
            elif line == '___END___':
                flush(current_iface, current_section, buf)
                current_iface = None
                current_section = None
                buf = []
            else:
                buf.append(line)

        # Handle output that was cut off before its final ___END___.
        flush(current_iface, current_section, buf)
        return result

    @staticmethod
    def _parse_ethtool(output: str) -> dict:
        """Extract link settings from plain ``ethtool <iface>`` output.

        Recognised ``Key: value`` lines map to ``speed_mbps`` (int),
        ``duplex`` (lower-cased text), ``port_type`` (text), ``auto_neg``
        (bool) and ``link_detected`` (bool).  Keys that are absent or whose
        value cannot be interpreted are simply left out of the result.
        """
        data: dict = {}
        for line in output.splitlines():
            if ':' not in line:
                continue
            key, _, val = line.partition(':')
            key = key.strip()
            val = val.strip()
            if key == 'Speed':
                # Values that are not "<digits>Mb/s" are skipped.
                m = re.match(r'(\d+)Mb/s', val)
                if m:
                    data['speed_mbps'] = int(m.group(1))
            elif key == 'Duplex':
                data['duplex'] = val.lower()
            elif key == 'Port':
                data['port_type'] = val
            elif key == 'Auto-negotiation':
                data['auto_neg'] = (val.lower() == 'on')
            elif key == 'Link detected':
                data['link_detected'] = (val.lower() == 'yes')
        return data

    @staticmethod
    def _leading_float(val: str, pattern: str) -> Optional[float]:
        """Return group 1 of *pattern* matched at the start of *val* as a float.

        Returns ``None`` when the pattern does not match or the captured
        text is not a valid number.
        """
        m = re.match(pattern, val)
        if not m:
            return None
        try:
            return float(m.group(1))
        except ValueError:
            return None

    @staticmethod
    def _parse_dbm(val: str) -> Optional[float]:
        """Return the dBm figure from a ``"<mW> mW / <dBm> dBm"`` value.

        Returns ``None`` when no numeric dBm part is present (this includes
        non-numeric readings such as ``-inf``).
        """
        m = re.search(r'/\s*([-\d.]+)\s*dBm', val)
        if not m:
            return None
        try:
            return float(m.group(1))
        except ValueError:
            return None

    @staticmethod
    def _parse_ethtool_m(output: str) -> dict:
        """Parse ethtool -m (SFP DOM / digital optical monitoring) output.

        Returns a dict with any of ``vendor``, ``part_no``, ``sfp_type``,
        ``connector``, ``wavelength_nm``, ``bias_ma``, ``tx_power_dbm``,
        ``rx_power_dbm``, ``temp_c`` and ``voltage_v``.  Fields that are
        missing or unparseable are omitted; an empty dict means no module
        data was found.
        """
        if not output:
            return {}
        # Skip if module diagnostics unsupported
        lower = output.lower()
        if 'cannot get' in lower or 'not supported' in lower or 'no sfp' in lower:
            return {}

        to_float = LinkStatsCollector._leading_float
        to_dbm = LinkStatsCollector._parse_dbm

        data: dict = {}
        for line in output.splitlines():
            if ':' not in line:
                continue
            key, _, val = line.partition(':')
            key = key.strip()
            val = val.strip()
            key_lower = key.lower()

            if key == 'Vendor name':
                data['vendor'] = val
            elif key == 'Vendor PN':
                data['part_no'] = val
            elif key == 'Identifier':
                # The human-readable type is the parenthesised part.
                m = re.search(r'\((.+?)\)', val)
                if m:
                    data['sfp_type'] = m.group(1)
            elif key == 'Connector':
                m = re.search(r'\((.+?)\)', val)
                if m:
                    data['connector'] = m.group(1)
            elif key == 'Laser wavelength':
                m = re.match(r'(\d+)', val)
                if m:
                    data['wavelength_nm'] = int(m.group(1))
            elif key == 'Laser bias current':
                # e.g. "4.340 mA"
                num = to_float(val, r'([\d.]+)\s+mA')
                if num is not None:
                    data['bias_ma'] = num
            elif key == 'Laser output power':
                # e.g. "0.1234 mW / -9.09 dBm"
                num = to_dbm(val)
                if num is not None:
                    data['tx_power_dbm'] = num
            elif 'receiver' in key_lower and ('power' in key_lower or 'optical' in key_lower):
                num = to_dbm(val)
                if num is not None:
                    data['rx_power_dbm'] = num
            elif key == 'Module temperature':
                # e.g. "36.00 degrees C / 96.80 degrees F"; the Celsius
                # figure may be negative.
                num = to_float(val, r'(-?[\d.]+)\s+degrees')
                if num is not None:
                    data['temp_c'] = num
            elif key == 'Module voltage':
                # e.g. "3.3180 V"
                num = to_float(val, r'([\d.]+)\s+V')
                if num is not None:
                    data['voltage_v'] = num

        return data

    # ------------------------------------------------------------------
    # Prometheus traffic / error metrics
    # ------------------------------------------------------------------
    def _collect_prom_metrics(self) -> Dict[str, Dict[str, dict]]:
        """Return {instance: {device: {tx_bytes_rate, rx_bytes_rate, ...}}}.

        Runs one Prometheus query per field and keeps only devices accepted
        by ``is_physical_interface``.  A sample whose value is NaN or cannot
        be converted to a float is stored as ``None``.
        """
        metrics: Dict[str, Dict[str, dict]] = {}

        queries = [
            ('tx_bytes_rate', 'rate(node_network_transmit_bytes_total[5m])'),
            ('rx_bytes_rate', 'rate(node_network_receive_bytes_total[5m])'),
            ('tx_errs_rate', 'rate(node_network_transmit_errs_total[5m])'),
            ('rx_errs_rate', 'rate(node_network_receive_errs_total[5m])'),
            ('tx_drops_rate', 'rate(node_network_transmit_drop_total[5m])'),
            ('rx_drops_rate', 'rate(node_network_receive_drop_total[5m])'),
            ('carrier_changes', 'node_network_carrier_changes_total'),
        ]

        for field, promql in queries:
            for r in self.prom.query(promql):
                instance = r['metric'].get('instance', '')
                device = r['metric'].get('device', '')
                if not is_physical_interface(device):
                    continue
                raw = r['value'][1]
                try:
                    val: Optional[float] = float(raw)
                    if val != val:  # NaN is the only float unequal to itself
                        val = None
                except (ValueError, TypeError):
                    val = None
                metrics.setdefault(instance, {}).setdefault(device, {})[field] = val

        return metrics

    # ------------------------------------------------------------------
    # Main collection entry point
    # ------------------------------------------------------------------
    def collect(self, instance_map: Dict[str, str]) -> dict:
        """
        Collect full link stats for all Prometheus-monitored hosts.

        *instance_map*: ``{'10.10.10.2:9100': 'large1', ...}``

        Instances missing from *instance_map* are keyed by the host part of
        their instance label.  Each interface entry holds ``host_ip``, the
        Prometheus fields and, when SSH collection succeeded, the ethtool
        fields plus an optional ``sfp`` dict.

        Returns a dict suitable for ``db.set_state('link_stats', ...)``.
        """
        # Local import: keeps this method independent of the file's import block.
        from datetime import timezone

        prom_metrics = self._collect_prom_metrics()
        result_hosts: Dict[str, Dict[str, dict]] = {}

        for instance, iface_metrics in prom_metrics.items():
            host_ip = instance.split(':')[0]
            host = instance_map.get(instance, host_ip)
            ifaces = list(iface_metrics)

            # SSH ethtool collection (one connection per host, all ifaces)
            ethtool_data: Dict[str, dict] = {}
            if self.ssh_pass and ifaces:
                try:
                    ethtool_data = self._ssh_batch(host_ip, ifaces)
                except Exception as e:
                    logger.warning(f'ethtool collection failed for {host} ({host_ip}): {e}')

            # Merge Prometheus metrics + ethtool data per interface; ethtool
            # values win if a key ever appears in both sources.
            merged: Dict[str, dict] = {}
            for iface in ifaces:
                d: dict = {'host_ip': host_ip}
                d.update(iface_metrics.get(iface, {}))
                d.update(ethtool_data.get(iface, {}))
                merged[iface] = d

            result_hosts[host] = merged

        return {
            'hosts': result_hosts,
            'updated': datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC'),
        }
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# --------------------------------------------------------------------------
|
||||
@@ -197,6 +479,7 @@ class NetworkMonitor:
|
||||
self.prom = PrometheusClient(prom_url)
|
||||
self.unifi = UnifiClient(self.cfg['unifi'])
|
||||
self.tickets = TicketClient(self.cfg.get('ticket_api', {}))
|
||||
self.link_stats = LinkStatsCollector(self.cfg, self.prom)
|
||||
|
||||
mon = self.cfg.get('monitor', {})
|
||||
self.poll_interval = mon.get('poll_interval', 120)
|
||||
@@ -457,7 +740,14 @@ class NetworkMonitor:
|
||||
db.set_state('network_snapshot', snapshot)
|
||||
db.set_state('last_check', _now_utc())
|
||||
|
||||
# 2. Process alerts (separate Prometheus call for fresh data)
|
||||
# 2. Collect link stats (ethtool + traffic metrics)
|
||||
try:
|
||||
link_data = self.link_stats.collect(self._instance_map)
|
||||
db.set_state('link_stats', link_data)
|
||||
except Exception as e:
|
||||
logger.error(f'Link stats collection failed: {e}', exc_info=True)
|
||||
|
||||
# 3. Process alerts (separate Prometheus call for fresh data)
|
||||
iface_states = self.prom.get_interface_states()
|
||||
self._process_interfaces(iface_states)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user