Files
gandalf/monitor.py
Jared Vititoe fa7512a2c2 feat: terminal aesthetic rewrite + link debug page
- Full dark terminal aesthetic (Pulse/TinkerTickets style):
  - #0a0a0a background, #00ff41 green, #ffb000 amber, #00ffff cyan
  - CRT scanline overlay, phosphor glow, ASCII corner pseudo-elements
  - Bracket-notation badges [CRITICAL], monospace font throughout
  - style.css, base.html, index.html, suppressions.html all rewritten

- New Link Debug page (/links, /api/links):
  - Per-host, per-interface cards with speed/duplex/port type/auto-neg
  - Traffic bars (TX cyan, RX green) with rate labels
  - Error/drop counters, carrier change history
  - SFP/DOM optical panel: vendor, temp, voltage, bias, TX/RX power dBm bars
  - RX-TX delta shown; color-coded warn/crit thresholds
  - Auto-refresh every 60s, anchor-jump to #hostname

- LinkStatsCollector in monitor.py:
  - SSHes to each host (one connection, all ifaces batched)
  - Parses ethtool + ethtool -m (SFP DOM) output
  - Merges with Prometheus traffic/error/carrier metrics
  - Stores as link_stats in monitor_state table

- config.json: added ssh section for ethtool collection
- app.js: terminal chip style consistency (uppercase, ● bullet)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-02 12:43:11 -05:00

770 lines
30 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""Gandalf network monitor daemon.
Polls Prometheus (node_exporter) and the UniFi controller for network
interface and device state. Creates tickets in Tinker Tickets when issues
are detected, with deduplication and suppression support.
Run as a separate systemd service alongside the Flask web app.
"""
import json
import logging
import re
import subprocess
import time
from datetime import datetime
from typing import Dict, List, Optional
import requests
from urllib3.exceptions import InsecureRequestWarning
import db
# The UniFi session is used with verify=False (see UnifiClient), which would
# otherwise emit an InsecureRequestWarning on every request — silence it.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# Root logging config for the daemon; systemd/journald captures stdout.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s %(name)s %(message)s',
)
logger = logging.getLogger('gandalf.monitor')
# --------------------------------------------------------------------------
# Interface filtering
# --------------------------------------------------------------------------
# Name prefixes of virtual/bridge/container interfaces we never monitor.
_SKIP_PREFIXES = (
    'lo', 'veth', 'tap', 'fwbr', 'fwln', 'fwpr',
    'docker', 'dummy', 'br-', 'virbr', 'vmbr',
)
# VLAN sub-interfaces carry a trailing ".<digits>" suffix (e.g. "eno1.42").
_VLAN_SUFFIX = re.compile(r'\.\d+$')


def is_physical_interface(name: str) -> bool:
    """Return True for physical/bond interfaces worth monitoring."""
    if name.startswith(_SKIP_PREFIXES):
        return False
    # Anything left that is not a VLAN sub-interface is considered physical.
    return _VLAN_SUFFIX.search(name) is None
# --------------------------------------------------------------------------
# Prometheus client
# --------------------------------------------------------------------------
class PrometheusClient:
    """Minimal wrapper around the Prometheus instant-query HTTP API."""

    def __init__(self, url: str):
        self.url = url.rstrip('/')

    def query(self, promql: str) -> list:
        """Run an instant PromQL query and return its result vector.

        Any failure (network error, bad status, unexpected payload) is
        logged and reported as an empty list so callers never raise.
        """
        try:
            response = requests.get(
                f'{self.url}/api/v1/query',
                params={'query': promql},
                timeout=15,
            )
            response.raise_for_status()
            payload = response.json()
            if payload.get('status') == 'success':
                return payload['data']['result']
        except Exception as e:
            logger.error(f'Prometheus query failed ({promql!r}): {e}')
        return []

    def get_interface_states(self) -> Dict[str, Dict[str, bool]]:
        """Return {instance: {device: is_up}} for physical interfaces."""
        states: Dict[str, Dict[str, bool]] = {}
        for sample in self.query('node_network_up'):
            labels = sample['metric']
            device = labels.get('device', '')
            if not is_physical_interface(device):
                continue
            per_host = states.setdefault(labels.get('instance', ''), {})
            # Prometheus sample values arrive as strings; "1" means link up.
            per_host[device] = (sample['value'][1] == '1')
        return states
# --------------------------------------------------------------------------
# UniFi client
# --------------------------------------------------------------------------
class UnifiClient:
    """Queries the UniFi Network controller's v2 API (API-key auth)."""

    def __init__(self, cfg: dict):
        self.base_url = cfg['controller']
        self.site_id = cfg.get('site_id', 'default')
        # Controllers commonly run self-signed certs; skip TLS verification.
        self.session = requests.Session()
        self.session.verify = False
        self.headers = {
            'X-API-KEY': cfg['api_key'],
            'Accept': 'application/json',
        }

    def get_devices(self) -> Optional[List[dict]]:
        """Return list of UniFi devices, or None if the controller is unreachable."""
        try:
            endpoint = f'{self.base_url}/proxy/network/v2/api/site/{self.site_id}/device'
            response = self.session.get(endpoint, headers=self.headers, timeout=15)
            response.raise_for_status()
            payload = response.json()
            summarized: List[dict] = []
            for raw in payload.get('network_devices', []):
                # state == 1 is "connected" per the controller's device model.
                state = raw.get('state', 1)
                summarized.append({
                    'name': raw.get('name') or raw.get('mac', 'unknown'),
                    'mac': raw.get('mac', ''),
                    'ip': raw.get('ip', ''),
                    'type': raw.get('type', 'unknown'),
                    'model': raw.get('model', ''),
                    'state': state,
                    'connected': state == 1,
                })
            return summarized
        except Exception as e:
            logger.error(f'UniFi API error: {e}')
            return None
# --------------------------------------------------------------------------
# Ticket client
# --------------------------------------------------------------------------
class TicketClient:
    """Creates tickets in Tinker Tickets via its REST API.

    The remote API performs its own deduplication: a create request that
    matches an open ticket returns ``existing_ticket_id`` instead of
    ``ticket_id``, and both are treated as success here.
    """

    def __init__(self, cfg: dict):
        # Either field may be empty; create() then becomes a logged no-op.
        self.url = cfg.get('url', '')
        self.api_key = cfg.get('api_key', '')

    def create(self, title: str, description: str, priority: str = '2') -> Optional[str]:
        """Create a ticket and return its id.

        Returns the new ticket id, the id of an existing duplicate ticket,
        or None when the API is unconfigured, unreachable, or responds
        unexpectedly. Never raises.
        """
        if not self.api_key or not self.url:
            # Fixed message punctuation (was "not configured skipping ...").
            logger.warning('Ticket API not configured, skipping ticket creation')
            return None
        try:
            resp = requests.post(
                self.url,
                json={
                    'title': title,
                    'description': description,
                    'status': 'Open',
                    'priority': priority,
                    'category': 'Network',
                    'type': 'Issue',
                },
                headers={'Authorization': f'Bearer {self.api_key}'},
                timeout=15,
            )
            resp.raise_for_status()
            data = resp.json()
            if data.get('success'):
                tid = data['ticket_id']
                logger.info(f'Created ticket #{tid}: {title}')
                return tid
            if data.get('existing_ticket_id'):
                # Server-side dedup hit: reuse the open ticket's id.
                logger.info(f'Duplicate suppressed by API, existing #{data["existing_ticket_id"]}')
                return data['existing_ticket_id']
            logger.warning(f'Unexpected ticket API response: {data}')
        except Exception as e:
            logger.error(f'Ticket creation failed: {e}')
        return None
# --------------------------------------------------------------------------
# Link stats collector (ethtool + Prometheus traffic metrics)
# --------------------------------------------------------------------------
class LinkStatsCollector:
    """Collects detailed per-interface statistics via SSH (ethtool) and Prometheus.

    One SSH connection is made per host and all interfaces are queried in a
    single batched shell command whose sections are delimited by sentinel
    lines (``___IFACE:…___`` / ``___DOM:…___`` / ``___END___``). The parsed
    ethtool data is then merged with Prometheus traffic/error/carrier
    metrics in collect().
    """

    def __init__(self, cfg: dict, prom: 'PrometheusClient'):
        self.prom = prom
        # Optional "ssh" section of config.json. An empty password disables
        # the ethtool collection entirely (Prometheus metrics still flow).
        ssh = cfg.get('ssh', {})
        self.ssh_user = ssh.get('user', 'root')
        self.ssh_pass = ssh.get('password', '')
        self.ssh_connect_timeout = ssh.get('connect_timeout', 5)
        self.ssh_timeout = ssh.get('timeout', 20)

    # ------------------------------------------------------------------
    # SSH collection
    # ------------------------------------------------------------------
    def _ssh_batch(self, ip: str, ifaces: List[str]) -> Dict[str, dict]:
        """
        Open one SSH session to *ip* and collect ethtool + SFP DOM data for
        all *ifaces*. Returns {iface: {speed_mbps, duplex, ..., sfp: {...}}}.
        """
        if not ifaces or not self.ssh_pass:
            return {}
        # Validate interface names (kernel names only contain [a-zA-Z0-9_.-])
        # before interpolating them into a shell command.
        safe_ifaces = [i for i in ifaces if re.match(r'^[a-zA-Z0-9_.@-]+$', i)]
        if not safe_ifaces:
            return {}
        # Build a single shell command: for each iface output ethtool + -m with sentinels
        parts = []
        for iface in safe_ifaces:
            parts.append(
                f'echo "___IFACE:{iface}___";'
                f' ethtool "{iface}" 2>/dev/null;'
                f' echo "___DOM:{iface}___";'
                f' ethtool -m "{iface}" 2>/dev/null;'
                f' echo "___END___"'
            )
        shell_cmd = ' '.join(parts)
        try:
            # sshpass supplies the password non-interactively; BatchMode must
            # stay "no" or ssh would refuse password authentication.
            result = subprocess.run(
                [
                    'sshpass', '-p', self.ssh_pass,
                    'ssh',
                    '-o', 'StrictHostKeyChecking=no',
                    '-o', f'ConnectTimeout={self.ssh_connect_timeout}',
                    '-o', 'LogLevel=ERROR',
                    '-o', 'BatchMode=no',
                    f'{self.ssh_user}@{ip}',
                    shell_cmd,
                ],
                capture_output=True,
                text=True,
                timeout=self.ssh_timeout,
            )
            output = result.stdout
        except FileNotFoundError:
            # sshpass binary missing on the monitor host — degrade gracefully.
            logger.debug('sshpass not found skipping ethtool collection')
            return {}
        except Exception as e:
            # Debug level: hosts being temporarily unreachable is routine.
            logger.debug(f'SSH ethtool {ip}: {e}')
            return {}
        return self._parse_ssh_output(output)

    @staticmethod
    def _parse_ssh_output(output: str) -> Dict[str, dict]:
        """Split sentinel-delimited batch output into {iface: parsed-data}."""
        result: Dict[str, dict] = {}
        current_iface: Optional[str] = None
        current_section: Optional[str] = None
        buf: List[str] = []

        def flush(iface, section, lines):
            # Parse whatever was buffered for the previous section into *result*.
            if not iface or not lines:
                return
            text = '\n'.join(lines)
            if section == 'ethtool':
                result.setdefault(iface, {}).update(
                    LinkStatsCollector._parse_ethtool(text)
                )
            elif section == 'dom':
                sfp = LinkStatsCollector._parse_ethtool_m(text)
                if sfp:
                    result.setdefault(iface, {})['sfp'] = sfp

        for line in output.splitlines():
            if line.startswith('___IFACE:') and line.endswith('___'):
                flush(current_iface, current_section, buf)
                current_iface = line[9:-3]   # strip '___IFACE:' / trailing '___'
                current_section = 'ethtool'
                buf = []
            elif line.startswith('___DOM:') and line.endswith('___'):
                flush(current_iface, current_section, buf)
                current_iface = line[7:-3]   # strip '___DOM:' / trailing '___'
                current_section = 'dom'
                buf = []
            elif line == '___END___':
                flush(current_iface, current_section, buf)
                current_iface = None
                current_section = None
                buf = []
            else:
                buf.append(line)
        # Flush a trailing section in case the final ___END___ was lost.
        flush(current_iface, current_section, buf)
        return result

    @staticmethod
    def _parse_ethtool(output: str) -> dict:
        """Parse plain `ethtool <iface>` output into a small flat dict.

        Extracted keys: speed_mbps, duplex, port_type, auto_neg,
        link_detected. Unknown/absent fields are simply omitted.
        """
        data: dict = {}
        for line in output.splitlines():
            if ':' not in line:
                continue
            key, _, val = line.partition(':')
            key = key.strip()
            val = val.strip()
            if key == 'Speed':
                # e.g. "1000Mb/s"; "Unknown!" deliberately won't match.
                m = re.match(r'(\d+)Mb/s', val)
                if m:
                    data['speed_mbps'] = int(m.group(1))
            elif key == 'Duplex':
                data['duplex'] = val.lower()
            elif key == 'Port':
                data['port_type'] = val
            elif key == 'Auto-negotiation':
                data['auto_neg'] = (val.lower() == 'on')
            elif key == 'Link detected':
                data['link_detected'] = (val.lower() == 'yes')
        return data

    @staticmethod
    def _parse_ethtool_m(output: str) -> dict:
        """Parse ethtool -m (SFP DOM / digital optical monitoring) output."""
        if not output:
            return {}
        # Skip if module diagnostics unsupported
        lower = output.lower()
        if 'cannot get' in lower or 'not supported' in lower or 'no sfp' in lower:
            return {}
        data: dict = {}
        for line in output.splitlines():
            if ':' not in line:
                continue
            key, _, val = line.partition(':')
            key = key.strip()
            val = val.strip()
            if key == 'Vendor name':
                data['vendor'] = val
            elif key == 'Vendor PN':
                data['part_no'] = val
            elif key == 'Identifier':
                # e.g. "0x03 (SFP)" — keep only the parenthesized name.
                m = re.search(r'\((.+?)\)', val)
                if m:
                    data['sfp_type'] = m.group(1)
            elif key == 'Connector':
                m = re.search(r'\((.+?)\)', val)
                if m:
                    data['connector'] = m.group(1)
            elif key == 'Laser wavelength':
                m = re.match(r'(\d+)', val)
                if m:
                    data['wavelength_nm'] = int(m.group(1))
            elif key == 'Laser bias current':
                # e.g. "4.340 mA"
                m = re.match(r'([\d.]+)\s+mA', val)
                if m:
                    data['bias_ma'] = float(m.group(1))
            elif key == 'Laser output power':
                # e.g. "0.1234 mW / -9.09 dBm"
                m = re.search(r'/\s*([-\d.]+)\s*dBm', val)
                if m:
                    try:
                        data['tx_power_dbm'] = float(m.group(1))
                    except ValueError:
                        pass
            elif 'receiver' in key.lower() and ('power' in key.lower() or 'optical' in key.lower()):
                # Key wording varies by module ("Receiver signal average
                # optical power", etc.), so match loosely.
                m = re.search(r'/\s*([-\d.]+)\s*dBm', val)
                if m:
                    try:
                        data['rx_power_dbm'] = float(m.group(1))
                    except ValueError:
                        pass
            elif key == 'Module temperature':
                # e.g. "36.00 degrees C / 96.80 degrees F"
                m = re.match(r'([\d.]+)\s+degrees', val)
                if m:
                    data['temp_c'] = float(m.group(1))
            elif key == 'Module voltage':
                # e.g. "3.3180 V"
                m = re.match(r'([\d.]+)\s+V', val)
                if m:
                    data['voltage_v'] = float(m.group(1))
        return data

    # ------------------------------------------------------------------
    # Prometheus traffic / error metrics
    # ------------------------------------------------------------------
    def _collect_prom_metrics(self) -> Dict[str, Dict[str, dict]]:
        """Return {instance: {device: {tx_bytes_rate, rx_bytes_rate, ...}}}."""
        metrics: Dict[str, Dict[str, dict]] = {}
        # Each entry is (output field name, instant PromQL expression).
        queries = [
            ('tx_bytes_rate', 'rate(node_network_transmit_bytes_total[5m])'),
            ('rx_bytes_rate', 'rate(node_network_receive_bytes_total[5m])'),
            ('tx_errs_rate', 'rate(node_network_transmit_errs_total[5m])'),
            ('rx_errs_rate', 'rate(node_network_receive_errs_total[5m])'),
            ('tx_drops_rate', 'rate(node_network_transmit_drop_total[5m])'),
            ('rx_drops_rate', 'rate(node_network_receive_drop_total[5m])'),
            ('carrier_changes', 'node_network_carrier_changes_total'),
        ]
        for field, promql in queries:
            for r in self.prom.query(promql):
                instance = r['metric'].get('instance', '')
                device = r['metric'].get('device', '')
                if not is_physical_interface(device):
                    continue
                raw = r['value'][1]
                try:
                    val: Optional[float] = float(raw)
                    if val != val:  # NaN
                        val = None
                except (ValueError, TypeError):
                    val = None
                metrics.setdefault(instance, {}).setdefault(device, {})[field] = val
        return metrics

    # ------------------------------------------------------------------
    # Main collection entry point
    # ------------------------------------------------------------------
    def collect(self, instance_map: Dict[str, str]) -> dict:
        """
        Collect full link stats for all Prometheus-monitored hosts.
        *instance_map*: ``{'10.10.10.2:9100': 'large1', ...}``
        Returns a dict suitable for ``db.set_state('link_stats', ...)``.
        """
        prom_metrics = self._collect_prom_metrics()
        result_hosts: Dict[str, Dict[str, dict]] = {}
        for instance, iface_metrics in prom_metrics.items():
            # Friendly host name falls back to the bare IP of the instance.
            host = instance_map.get(instance, instance.split(':')[0])
            host_ip = instance.split(':')[0]
            ifaces = list(iface_metrics.keys())
            # SSH ethtool collection (one connection per host, all ifaces)
            ethtool_data: Dict[str, dict] = {}
            if self.ssh_pass and ifaces:
                try:
                    ethtool_data = self._ssh_batch(host_ip, ifaces)
                except Exception as e:
                    logger.warning(f'ethtool collection failed for {host} ({host_ip}): {e}')
            # Merge Prometheus metrics + ethtool data per interface
            merged: Dict[str, dict] = {}
            for iface in ifaces:
                d: dict = {'host_ip': host_ip}
                d.update(iface_metrics.get(iface, {}))
                eth = ethtool_data.get(iface, {})
                # Flat ethtool fields first, then the nested SFP panel data.
                for k, v in eth.items():
                    if k != 'sfp':
                        d[k] = v
                if 'sfp' in eth:
                    d['sfp'] = eth['sfp']
                merged[iface] = d
            result_hosts[host] = merged
        return {
            'hosts': result_hosts,
            'updated': datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S UTC'),
        }
# --------------------------------------------------------------------------
# Helpers
# --------------------------------------------------------------------------
def ping(ip: str, count: int = 3, timeout: int = 2) -> bool:
    """Return True if *ip* answers ICMP ping; best-effort, never raises."""
    cmd = ['ping', '-c', str(count), '-W', str(timeout), ip]
    try:
        # The 30 s cap guards against a wedged ping process.
        proc = subprocess.run(
            cmd,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            timeout=30,
        )
    except Exception:
        return False
    return proc.returncode == 0
def _now_utc() -> str:
return datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S UTC')
# --------------------------------------------------------------------------
# Monitor
# --------------------------------------------------------------------------
# Synthetic "host" name used for cluster-wide events and tickets.
CLUSTER_NAME = 'proxmox-cluster'
class NetworkMonitor:
    """Main polling daemon: snapshots state, raises events, files tickets.

    Each cycle collects a dashboard snapshot and link stats, then runs the
    alerting passes (Prometheus interfaces, UniFi devices, ping-only hosts).
    Event dedup/suppression lives in the ``db`` module; ticket dedup is also
    enforced server-side by the ticket API.
    """

    def __init__(self):
        # Config is read once at startup; restart the service to pick up changes.
        with open('config.json') as f:
            self.cfg = json.load(f)
        prom_url = self.cfg['prometheus']['url']
        self.prom = PrometheusClient(prom_url)
        self.unifi = UnifiClient(self.cfg['unifi'])
        self.tickets = TicketClient(self.cfg.get('ticket_api', {}))
        self.link_stats = LinkStatsCollector(self.cfg, self.prom)
        mon = self.cfg.get('monitor', {})
        # Seconds between check cycles.
        self.poll_interval = mon.get('poll_interval', 120)
        # Consecutive failing checks required before a ticket is created.
        self.fail_thresh = mon.get('failure_threshold', 2)
        # Hosts with simultaneous regressions needed for a cluster-wide alert.
        self.cluster_thresh = mon.get('cluster_threshold', 3)
        # Build Prometheus instance → hostname lookup
        self._instance_map: Dict[str, str] = {
            h['prometheus_instance']: h['name']
            for h in self.cfg.get('hosts', [])
            if 'prometheus_instance' in h
        }

    def _hostname(self, instance: str) -> str:
        # Fall back to the bare IP when the instance isn't in config.json.
        return self._instance_map.get(instance, instance.split(':')[0])

    # ------------------------------------------------------------------
    # Interface monitoring (Prometheus)
    # ------------------------------------------------------------------
    def _process_interfaces(self, states: Dict[str, Dict[str, bool]]) -> None:
        """Compare interface states against the persisted baseline and alert.

        Baseline values per interface: 'up' (seen up at least once),
        'initial_down' (down since first ever observation — likely an unused
        port, never alerted on), or absent (never seen). Only an UP→DOWN
        transition ("regression") can trigger an event/ticket.
        """
        baseline = db.get_baseline()
        # Deep-ish copy so this cycle's updates don't mutate the old baseline.
        new_baseline = {k: dict(v) for k, v in baseline.items()}
        # Only count hosts with genuine regressions (UP→DOWN) toward cluster threshold
        hosts_with_regression: List[str] = []
        for instance, ifaces in states.items():
            host = self._hostname(instance)
            new_baseline.setdefault(host, {})
            host_has_regression = False
            for iface, is_up in ifaces.items():
                prev = baseline.get(host, {}).get(iface)  # 'up', 'initial_down', or None
                if is_up:
                    new_baseline[host][iface] = 'up'
                    # Clears any open event for this interface (no-op otherwise).
                    db.resolve_event('interface_down', host, iface)
                else:
                    if prev is None:
                        # First observation is down could be unused port, don't alert
                        new_baseline[host][iface] = 'initial_down'
                    elif prev == 'initial_down':
                        # Persistently down since first observation no alert
                        pass
                    else:  # prev == 'up'
                        # Regression: was UP, now DOWN
                        host_has_regression = True
                        sup = (
                            db.is_suppressed('interface', host, iface) or
                            db.is_suppressed('host', host)
                        )
                        # upsert_event returns (event_id, is_new, consecutive_count).
                        event_id, is_new, consec = db.upsert_event(
                            'interface_down', 'critical', 'prometheus',
                            host, iface,
                            f'Interface {iface} on {host} went link-down ({_now_utc()})',
                        )
                        # Ticket only after fail_thresh consecutive failures.
                        if not sup and consec >= self.fail_thresh:
                            self._ticket_interface(event_id, is_new, host, iface, consec)
            if host_has_regression:
                hosts_with_regression.append(host)
        db.set_baseline(new_baseline)
        # Cluster-wide check only genuine regressions count
        if len(hosts_with_regression) >= self.cluster_thresh:
            sup = db.is_suppressed('all', '')
            event_id, is_new, consec = db.upsert_event(
                'cluster_network_issue', 'critical', 'prometheus',
                CLUSTER_NAME, '',
                f'{len(hosts_with_regression)} hosts reporting simultaneous interface failures: '
                f'{", ".join(hosts_with_regression)}',
            )
            # Cluster tickets are filed immediately (is_new), not thresholded.
            if not sup and is_new:
                title = (
                    f'[{CLUSTER_NAME}][auto][production][issue][network][cluster-wide] '
                    f'Multiple hosts reporting interface failures'
                )
                desc = (
                    f'Cluster Network Alert\n{"=" * 40}\n\n'
                    f'Affected hosts: {", ".join(hosts_with_regression)}\n'
                    f'Detected: {_now_utc()}\n\n'
                    f'{len(hosts_with_regression)} Proxmox hosts simultaneously reported '
                    f'interface regressions (link-down on interfaces previously known UP).\n'
                    f'This likely indicates a switch or upstream network failure.\n\n'
                    f'Please check the core and management switches immediately.'
                )
                tid = self.tickets.create(title, desc, priority='1')
                if tid:
                    db.set_ticket_id(event_id, tid)
        else:
            # Below threshold: auto-resolve any open cluster-wide event.
            db.resolve_event('cluster_network_issue', CLUSTER_NAME, '')

    def _ticket_interface(
        self, event_id: int, is_new: bool, host: str, iface: str, consec: int
    ) -> None:
        """File a single-interface link-down ticket and record its id."""
        title = (
            f'[{host}][auto][production][issue][network][single-node] '
            f'Interface {iface} link-down'
        )
        desc = (
            f'Network Interface Alert\n{"=" * 40}\n\n'
            f'Host: {host}\n'
            f'Interface: {iface}\n'
            f'Detected: {_now_utc()}\n'
            f'Consecutive check failures: {consec}\n\n'
            f'Interface {iface} on {host} is reporting link-down state via '
            f'Prometheus node_exporter.\n\n'
            f'Note: {host} may still be reachable via its other network interface.\n'
            f'Please inspect the cable/SFP/switch port for {host}/{iface}.'
        )
        tid = self.tickets.create(title, desc, priority='2')
        # Only bind the ticket id on a freshly-created event.
        if tid and is_new:
            db.set_ticket_id(event_id, tid)

    # ------------------------------------------------------------------
    # UniFi device monitoring
    # ------------------------------------------------------------------
    def _process_unifi(self, devices: Optional[List[dict]]) -> None:
        """Raise/resolve offline events for UniFi devices.

        *devices* is None when the controller itself was unreachable; in
        that case nothing is resolved or alerted this cycle.
        """
        if devices is None:
            logger.warning('UniFi API unreachable this cycle')
            return
        for d in devices:
            name = d['name']
            if not d['connected']:
                sup = db.is_suppressed('unifi_device', name)
                event_id, is_new, consec = db.upsert_event(
                    'unifi_device_offline', 'critical', 'unifi',
                    name, d.get('type', ''),
                    f'UniFi {name} ({d.get("ip","")}) offline ({_now_utc()})',
                )
                if not sup and consec >= self.fail_thresh:
                    self._ticket_unifi(event_id, is_new, d)
            else:
                db.resolve_event('unifi_device_offline', name, d.get('type', ''))

    def _ticket_unifi(self, event_id: int, is_new: bool, device: dict) -> None:
        """File an offline-device ticket for a UniFi device."""
        name = device['name']
        title = (
            f'[{name}][auto][production][issue][network][single-node] '
            f'UniFi device offline'
        )
        desc = (
            f'UniFi Device Alert\n{"=" * 40}\n\n'
            f'Device: {name}\n'
            f'Type: {device.get("type","unknown")}\n'
            f'Model: {device.get("model","")}\n'
            f'Last Known IP: {device.get("ip","unknown")}\n'
            f'Detected: {_now_utc()}\n\n'
            f'The UniFi device {name} is offline per the UniFi controller.\n'
            f'Please check power and cable connectivity.'
        )
        tid = self.tickets.create(title, desc, priority='2')
        if tid and is_new:
            db.set_ticket_id(event_id, tid)

    # ------------------------------------------------------------------
    # Ping-only hosts (no node_exporter)
    # ------------------------------------------------------------------
    def _process_ping_hosts(self) -> None:
        """Ping hosts listed under monitor.ping_hosts and raise/resolve events."""
        for h in self.cfg.get('monitor', {}).get('ping_hosts', []):
            name, ip = h['name'], h['ip']
            reachable = ping(ip)
            if not reachable:
                sup = db.is_suppressed('host', name)
                event_id, is_new, consec = db.upsert_event(
                    'host_unreachable', 'critical', 'ping',
                    name, ip,
                    f'Host {name} ({ip}) unreachable via ping ({_now_utc()})',
                )
                if not sup and consec >= self.fail_thresh:
                    self._ticket_unreachable(event_id, is_new, name, ip, consec)
            else:
                db.resolve_event('host_unreachable', name, ip)

    def _ticket_unreachable(
        self, event_id: int, is_new: bool, name: str, ip: str, consec: int
    ) -> None:
        """File a host-unreachable ticket for a ping-only host."""
        title = (
            f'[{name}][auto][production][issue][network][single-node] '
            f'Host unreachable'
        )
        desc = (
            f'Host Reachability Alert\n{"=" * 40}\n\n'
            f'Host: {name}\n'
            f'IP: {ip}\n'
            f'Detected: {_now_utc()}\n'
            f'Consecutive check failures: {consec}\n\n'
            f'Host {name} ({ip}) is not responding to ping from the Gandalf monitor.\n'
            f'This host does not have a Prometheus node_exporter, so interface-level '
            f'detail is unavailable.\n\n'
            f'Please check the host power, management interface, and network connectivity.'
        )
        tid = self.tickets.create(title, desc, priority='2')
        if tid and is_new:
            db.set_ticket_id(event_id, tid)

    # ------------------------------------------------------------------
    # Snapshot collection (for dashboard)
    # ------------------------------------------------------------------
    def _collect_snapshot(self) -> dict:
        """Build the dashboard snapshot: per-host status plus UniFi devices."""
        iface_states = self.prom.get_interface_states()
        unifi_devices = self.unifi.get_devices() or []
        hosts = {}
        for instance, ifaces in iface_states.items():
            host = self._hostname(instance)
            phys = {k: v for k, v in ifaces.items()}
            up_count = sum(1 for v in phys.values() if v)
            total = len(phys)
            # Host status: all up → 'up', none up → 'down', else 'degraded'.
            if total == 0 or up_count == total:
                status = 'up'
            elif up_count == 0:
                status = 'down'
            else:
                status = 'degraded'
            hosts[host] = {
                'ip': instance.split(':')[0],
                'interfaces': {k: ('up' if v else 'down') for k, v in phys.items()},
                'status': status,
                'source': 'prometheus',
            }
        # Ping-only hosts: single fast probe, no interface detail available.
        for h in self.cfg.get('monitor', {}).get('ping_hosts', []):
            name, ip = h['name'], h['ip']
            reachable = ping(ip, count=1, timeout=2)
            hosts[name] = {
                'ip': ip,
                'interfaces': {},
                'status': 'up' if reachable else 'down',
                'source': 'ping',
            }
        return {
            'hosts': hosts,
            'unifi': unifi_devices,
            'updated': datetime.utcnow().isoformat(),
        }

    # ------------------------------------------------------------------
    # Main loop
    # ------------------------------------------------------------------
    def run(self) -> None:
        """Poll forever; any per-cycle exception is logged, never fatal."""
        logger.info(
            f'Gandalf monitor started poll_interval={self.poll_interval}s '
            f'fail_thresh={self.fail_thresh}'
        )
        while True:
            try:
                logger.info('Starting network check cycle')
                # 1. Collect and store snapshot for dashboard
                snapshot = self._collect_snapshot()
                db.set_state('network_snapshot', snapshot)
                db.set_state('last_check', _now_utc())
                # 2. Collect link stats (ethtool + traffic metrics)
                try:
                    link_data = self.link_stats.collect(self._instance_map)
                    db.set_state('link_stats', link_data)
                except Exception as e:
                    logger.error(f'Link stats collection failed: {e}', exc_info=True)
                # 3. Process alerts (separate Prometheus call for fresh data)
                iface_states = self.prom.get_interface_states()
                self._process_interfaces(iface_states)
                unifi_devices = self.unifi.get_devices()
                self._process_unifi(unifi_devices)
                self._process_ping_hosts()
                logger.info('Network check cycle complete')
            except Exception as e:
                logger.error(f'Monitor loop error: {e}', exc_info=True)
            time.sleep(self.poll_interval)
if __name__ == '__main__':
    # Entry point when run as a systemd service; run() loops forever.
    NetworkMonitor().run()