arch+security: route all server contact through Pulse, harden SSH
Lint / Python (flake8) (push) Failing after 43s
Lint / JS (eslint) (push) Successful in 8s
Security / Python Security (bandit) (push) Successful in 1m4s
Test / Python Tests (pytest) (push) Failing after 1m5s
Lint / Notify on failure (push) Successful in 2s
Lint / Deploy (push) Has been skipped

Architecture:
- Remove direct subprocess ping from Gandalf; add PulseClient.ping()
  which runs the ping via the Pulse worker instead
- Remove standalone ping() function and subprocess import from monitor.py
- Add self.pulse alias to NetworkMonitor for convenience
- Both _process_ping_hosts() and the snapshot builder now use self.pulse.ping()

Security:
- Change StrictHostKeyChecking=no → accept-new in both SSH command
  builders (monitor.py _ssh_batch, diagnose.py build_ssh_command).
  The Pulse worker's known_hosts is now authoritative; host keys are
  recorded on first connection and verified on all subsequent ones.
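  A changed host key now makes ssh refuse the connection, so MITM attacks
  after the initial key exchange are detected. The very first connection
  to a host is still trust-on-first-use.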

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-10 23:58:16 -04:00
parent ca41486c45
commit 38297e616f
2 changed files with 13 additions and 18 deletions
+1 -1
View File
@@ -75,7 +75,7 @@ class DiagnosticsRunner:
) )
return ( return (
f'ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5 ' f'ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=5 '
f'-o BatchMode=yes -o LogLevel=ERROR ' f'-o BatchMode=yes -o LogLevel=ERROR '
f'-o ServerAliveInterval=10 -o ServerAliveCountMax=2 ' f'-o ServerAliveInterval=10 -o ServerAliveCountMax=2 '
f'root@{ip_q} \'{remote_cmd}\'' f'root@{ip_q} \'{remote_cmd}\''
+12 -17
View File
@@ -11,7 +11,6 @@ import json
import logging import logging
import re import re
import shlex import shlex
import subprocess
import time import time
from datetime import datetime from datetime import datetime
from typing import Dict, List, Optional from typing import Dict, List, Optional
@@ -315,6 +314,14 @@ class PulseClient:
return self.run_command(command, _retry=False) return self.run_command(command, _retry=False)
return None return None
def ping(self, ip: str, count: int = 3, timeout: int = 2) -> bool:
    """Ping *ip* via the Pulse worker and report whether it answered.

    The probe is sent through ``self.run_command`` as a single shell
    command that prints ``REACHABLE`` or ``UNREACHABLE`` depending on the
    exit status of ``ping``.

    Args:
        ip: Address or hostname to probe; shell-quoted before use.
        count: Number of echo requests (``ping -c``).
        timeout: Reply wait time passed to ``ping -W``.

    Returns:
        True only when the command output, stripped of surrounding
        whitespace, is exactly ``REACHABLE``. False when the host does not
        answer or when ``run_command`` yields ``None``.

    Raises:
        TypeError, ValueError: If *count* or *timeout* cannot be converted
            to an integer.
    """
    # count/timeout end up inside a shell string, so force them to plain
    # integers; anything else (e.g. '1; reboot') is rejected before a
    # command is ever built.
    count_arg = int(count)
    timeout_arg = int(timeout)
    ip_q = shlex.quote(ip)
    output = self.run_command(
        f'ping -c {count_arg} -W {timeout_arg} {ip_q} >/dev/null 2>&1 '
        f'&& echo REACHABLE || echo UNREACHABLE'
    )
    return output is not None and output.strip() == 'REACHABLE'
# -------------------------------------------------------------------------- # --------------------------------------------------------------------------
# Link stats collector (ethtool + Prometheus traffic metrics) # Link stats collector (ethtool + Prometheus traffic metrics)
@@ -363,7 +370,7 @@ class LinkStatsCollector:
shell_cmd = ' '.join(parts) shell_cmd = ' '.join(parts)
ssh_cmd = ( ssh_cmd = (
f'ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5 ' f'ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=5 '
f'-o BatchMode=yes -o LogLevel=ERROR ' f'-o BatchMode=yes -o LogLevel=ERROR '
f'-o ServerAliveInterval=10 -o ServerAliveCountMax=2 ' f'-o ServerAliveInterval=10 -o ServerAliveCountMax=2 '
f'root@{ip} "{shell_cmd}"' f'root@{ip} "{shell_cmd}"'
@@ -638,19 +645,6 @@ class LinkStatsCollector:
# -------------------------------------------------------------------------- # --------------------------------------------------------------------------
# Helpers # Helpers
# -------------------------------------------------------------------------- # --------------------------------------------------------------------------
def ping(ip: str, count: int = 3, timeout: int = 2) -> bool:
    """Ping *ip* from the local host and report whether it answered.

    Runs the system ``ping`` binary directly (no shell) with its output
    discarded.

    Args:
        ip: Address or hostname to probe.
        count: Number of echo requests (``ping -c``).
        timeout: Reply wait time passed to ``ping -W``.

    Returns:
        True when ``ping`` exits with status 0. False on a non-zero exit or
        when the command could not be run at all (missing binary, the
        30-second overall timeout, bad arguments).
    """
    try:
        result = subprocess.run(
            ['ping', '-c', str(count), '-W', str(timeout), ip],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            timeout=30,
        )
    except Exception:
        # Best-effort probe: any failure to run ping counts as
        # "unreachable", but leave a trace so a broken probe can be told
        # apart from a host that is genuinely down.
        logging.getLogger(__name__).debug(
            'ping %s could not be executed', ip, exc_info=True
        )
        return False
    return result.returncode == 0
def _now_utc() -> str: def _now_utc() -> str:
return datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S UTC') return datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S UTC')
@@ -671,6 +665,7 @@ class NetworkMonitor:
self.unifi = UnifiClient(self.cfg['unifi']) self.unifi = UnifiClient(self.cfg['unifi'])
self.tickets = TicketClient(self.cfg.get('ticket_api', {})) self.tickets = TicketClient(self.cfg.get('ticket_api', {}))
self.link_stats = LinkStatsCollector(self.cfg, self.prom, self.unifi) self.link_stats = LinkStatsCollector(self.cfg, self.prom, self.unifi)
self.pulse = self.link_stats.pulse # convenience alias
mon = self.cfg.get('monitor', {}) mon = self.cfg.get('monitor', {})
self.poll_interval = mon.get('poll_interval', 120) self.poll_interval = mon.get('poll_interval', 120)
@@ -838,7 +833,7 @@ class NetworkMonitor:
def _process_ping_hosts(self, suppressions: list) -> None: def _process_ping_hosts(self, suppressions: list) -> None:
for h in self.cfg.get('monitor', {}).get('ping_hosts', []): for h in self.cfg.get('monitor', {}).get('ping_hosts', []):
name, ip = h['name'], h['ip'] name, ip = h['name'], h['ip']
reachable = ping(ip) reachable = self.pulse.ping(ip)
if not reachable: if not reachable:
sup = db.check_suppressed(suppressions, 'host', name) sup = db.check_suppressed(suppressions, 'host', name)
@@ -908,7 +903,7 @@ class NetworkMonitor:
for h in self.cfg.get('monitor', {}).get('ping_hosts', []): for h in self.cfg.get('monitor', {}).get('ping_hosts', []):
name, ip = h['name'], h['ip'] name, ip = h['name'], h['ip']
reachable = ping(ip, count=1, timeout=2) reachable = self.pulse.ping(ip, count=1, timeout=2)
hosts[name] = { hosts[name] = {
'ip': ip, 'ip': ip,
'interfaces': {}, 'interfaces': {},