diff --git a/diagnose.py b/diagnose.py index e2140c3..77552d0 100644 --- a/diagnose.py +++ b/diagnose.py @@ -75,7 +75,7 @@ class DiagnosticsRunner: ) return ( - f'ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5 ' + f'ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=5 ' f'-o BatchMode=yes -o LogLevel=ERROR ' f'-o ServerAliveInterval=10 -o ServerAliveCountMax=2 ' f'root@{ip_q} \'{remote_cmd}\'' diff --git a/monitor.py b/monitor.py index b91895a..9d287ec 100644 --- a/monitor.py +++ b/monitor.py @@ -11,7 +11,6 @@ import json import logging import re import shlex -import subprocess import time from datetime import datetime from typing import Dict, List, Optional @@ -315,6 +314,14 @@ class PulseClient: return self.run_command(command, _retry=False) return None + def ping(self, ip: str, count: int = 3, timeout: int = 2) -> bool: + """Ping *ip* via the Pulse worker. Returns True if host responds.""" + ip_q = shlex.quote(ip) + output = self.run_command( + f'ping -c {count} -W {timeout} {ip_q} >/dev/null 2>&1 && echo REACHABLE || echo UNREACHABLE' + ) + return output is not None and output.strip() == 'REACHABLE' + # -------------------------------------------------------------------------- # Link stats collector (ethtool + Prometheus traffic metrics) @@ -363,7 +370,7 @@ class LinkStatsCollector: shell_cmd = ' '.join(parts) ssh_cmd = ( - f'ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5 ' + f'ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=5 ' f'-o BatchMode=yes -o LogLevel=ERROR ' f'-o ServerAliveInterval=10 -o ServerAliveCountMax=2 ' f'root@{ip} "{shell_cmd}"' @@ -638,19 +645,6 @@ class LinkStatsCollector: # -------------------------------------------------------------------------- # Helpers # -------------------------------------------------------------------------- -def ping(ip: str, count: int = 3, timeout: int = 2) -> bool: - try: - r = subprocess.run( - ['ping', '-c', str(count), '-W', str(timeout), ip], - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - timeout=30, - ) - return r.returncode == 0 - except Exception: - return False - - def _now_utc() -> str: return datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S UTC') @@ -671,6 +665,7 @@ class NetworkMonitor: self.unifi = UnifiClient(self.cfg['unifi']) self.tickets = TicketClient(self.cfg.get('ticket_api', {})) self.link_stats = LinkStatsCollector(self.cfg, self.prom, self.unifi) + self.pulse = self.link_stats.pulse # convenience alias mon = self.cfg.get('monitor', {}) self.poll_interval = mon.get('poll_interval', 120) @@ -838,7 +833,7 @@ class NetworkMonitor: def _process_ping_hosts(self, suppressions: list) -> None: for h in self.cfg.get('monitor', {}).get('ping_hosts', []): name, ip = h['name'], h['ip'] - reachable = ping(ip) + reachable = self.pulse.ping(ip) if not reachable: sup = db.check_suppressed(suppressions, 'host', name) @@ -908,7 +903,7 @@ class NetworkMonitor: for h in self.cfg.get('monitor', {}).get('ping_hosts', []): name, ip = h['name'], h['ip'] - reachable = ping(ip, count=1, timeout=2) + reachable = self.pulse.ping(ip, count=1, timeout=2) hosts[name] = { 'ip': ip, 'interfaces': {},