arch+security: route all server contact through Pulse, harden SSH
Lint / Python (flake8) (push) Failing after 43s
Lint / JS (eslint) (push) Successful in 8s
Security / Python Security (bandit) (push) Successful in 1m4s
Test / Python Tests (pytest) (push) Failing after 1m5s
Lint / Notify on failure (push) Successful in 2s
Lint / Deploy (push) Has been skipped

Architecture:
- Remove direct subprocess ping from Gandalf; add PulseClient.ping()
  which runs the ping via the Pulse worker instead
- Remove standalone ping() function and subprocess import from monitor.py
- Add self.pulse alias to NetworkMonitor for convenience
- Both _process_ping_hosts() and snapshot builder now use self.pulse.ping()

Security:
- Change StrictHostKeyChecking=no → accept-new in both SSH command
  builders (monitor.py _ssh_batch, diagnose.py build_ssh_command).
  The Pulse worker's known_hosts is now authoritative; host keys are
  recorded on first connection and verified on all subsequent ones.
  MITM attacks after initial key exchange are now detectable.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-10 23:58:16 -04:00
parent ca41486c45
commit 38297e616f
2 changed files with 13 additions and 18 deletions
+1 -1
View File
@@ -75,7 +75,7 @@ class DiagnosticsRunner:
)
return (
f'ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5 '
f'ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=5 '
f'-o BatchMode=yes -o LogLevel=ERROR '
f'-o ServerAliveInterval=10 -o ServerAliveCountMax=2 '
f'root@{ip_q} \'{remote_cmd}\''
+12 -17
View File
@@ -11,7 +11,6 @@ import json
import logging
import re
import shlex
import subprocess
import time
from datetime import datetime
from typing import Dict, List, Optional
@@ -315,6 +314,14 @@ class PulseClient:
return self.run_command(command, _retry=False)
return None
def ping(self, ip: str, count: int = 3, timeout: int = 2) -> bool:
"""Ping *ip* via the Pulse worker. Returns True if host responds."""
ip_q = shlex.quote(ip)
output = self.run_command(
f'ping -c {count} -W {timeout} {ip_q} >/dev/null 2>&1 && echo REACHABLE || echo UNREACHABLE'
)
return output is not None and output.strip() == 'REACHABLE'
# --------------------------------------------------------------------------
# Link stats collector (ethtool + Prometheus traffic metrics)
@@ -363,7 +370,7 @@ class LinkStatsCollector:
shell_cmd = ' '.join(parts)
ssh_cmd = (
f'ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5 '
f'ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=5 '
f'-o BatchMode=yes -o LogLevel=ERROR '
f'-o ServerAliveInterval=10 -o ServerAliveCountMax=2 '
f'root@{ip} "{shell_cmd}"'
@@ -638,19 +645,6 @@ class LinkStatsCollector:
# --------------------------------------------------------------------------
# Helpers
# --------------------------------------------------------------------------
def ping(ip: str, count: int = 3, timeout: int = 2) -> bool:
try:
r = subprocess.run(
['ping', '-c', str(count), '-W', str(timeout), ip],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
timeout=30,
)
return r.returncode == 0
except Exception:
return False
def _now_utc() -> str:
return datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S UTC')
@@ -671,6 +665,7 @@ class NetworkMonitor:
self.unifi = UnifiClient(self.cfg['unifi'])
self.tickets = TicketClient(self.cfg.get('ticket_api', {}))
self.link_stats = LinkStatsCollector(self.cfg, self.prom, self.unifi)
self.pulse = self.link_stats.pulse # convenience alias
mon = self.cfg.get('monitor', {})
self.poll_interval = mon.get('poll_interval', 120)
@@ -838,7 +833,7 @@ class NetworkMonitor:
def _process_ping_hosts(self, suppressions: list) -> None:
for h in self.cfg.get('monitor', {}).get('ping_hosts', []):
name, ip = h['name'], h['ip']
reachable = ping(ip)
reachable = self.pulse.ping(ip)
if not reachable:
sup = db.check_suppressed(suppressions, 'host', name)
@@ -908,7 +903,7 @@ class NetworkMonitor:
for h in self.cfg.get('monitor', {}).get('ping_hosts', []):
name, ip = h['name'], h['ip']
reachable = ping(ip, count=1, timeout=2)
reachable = self.pulse.ping(ip, count=1, timeout=2)
hosts[name] = {
'ip': ip,
'interfaces': {},