Security and reliability fixes: input validation, logging, job cleanup
- C5: Validate host_ip (IPv4 check) and iface (allowlist regex) before SSH command builder
- H6: Upgrade Pulse failure logging from debug to error so operators see outages
- M6: Replace per-request O(n) purge with background daemon thread (runs every 2 min)
- M7: Background thread marks jobs stuck in 'running' > 5 min as errored

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in: app.py (29 lines changed)
@@ -4,8 +4,10 @@ Flask web application serving the monitoring dashboard and suppression
|
||||
management UI. Authentication via Authelia forward-auth headers.
|
||||
All monitoring and alerting is handled by the separate monitor.py daemon.
|
||||
"""
|
||||
import ipaddress
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import threading
|
||||
import time
|
||||
import uuid
|
||||
@@ -32,13 +34,25 @@ _diag_jobs: dict = {}
|
||||
_diag_lock = threading.Lock()
|
||||
|
||||
|
||||
def _purge_old_jobs():
|
||||
"""Remove jobs older than 10 minutes (called before each new job creation)."""
|
||||
def _purge_old_jobs_loop():
    """Background thread: remove jobs older than 10 minutes and mark stuck running jobs as errored.

    Runs forever as a daemon thread, sweeping every 2 minutes:
      * jobs created more than 10 minutes ago are deleted outright;
      * jobs still in status 'running' after 5 minutes are assumed to have
        lost their worker thread and are marked done with an error result,
        so pollers are not left waiting forever.

    All access to the shared ``_diag_jobs`` dict happens under ``_diag_lock``.
    """
    while True:
        time.sleep(120)
        try:
            now = time.time()
            cutoff = now - 600        # purge: created more than 10 min ago
            stuck_cutoff = now - 300  # 5 min: job still 'running' -> worker thread must have crashed
            with _diag_lock:
                # Collect stale ids first; never delete while iterating the dict.
                stale = [jid for jid, j in _diag_jobs.items() if j.get('created_at', 0) < cutoff]
                for jid in stale:
                    del _diag_jobs[jid]
                for jid, j in _diag_jobs.items():
                    if j['status'] == 'running' and j.get('created_at', 0) < stuck_cutoff:
                        j['status'] = 'done'
                        j['result'] = {'status': 'error', 'error': 'Diagnostic timed out (thread crash)'}
                        # Lazy %-formatting so the message is only built when emitted.
                        logger.error('Diagnostic job %s appeared stuck; marked as errored', jid)
        except Exception:
            # Boundary guard: an unexpected error must not kill the only
            # cleanup thread for the process lifetime — log and keep sweeping.
            logger.exception('Job purge sweep failed')
|
||||
|
||||
|
||||
# Start the cleanup loop at import time. daemon=True means this thread can
# never keep the interpreter alive on shutdown — safe for a purge loop whose
# work is purely in-memory and can be abandoned at any point.
_purge_thread = threading.Thread(target=_purge_old_jobs_loop, daemon=True)
_purge_thread.start()
|
||||
|
||||
|
||||
def _config() -> dict:
|
||||
@@ -314,7 +328,16 @@ def api_diagnose_start():
|
||||
if not host_ip:
|
||||
return jsonify({'error': 'Cannot determine host IP for SSH'}), 400
|
||||
|
||||
_purge_old_jobs()
|
||||
# Validate resolved values before passing to SSH command builder
|
||||
try:
|
||||
ipaddress.ip_address(host_ip)
|
||||
except ValueError:
|
||||
logger.error(f'Refusing diagnostic: invalid host_ip "{host_ip}" for {server_name}')
|
||||
return jsonify({'error': 'Resolved host IP is not a valid IP address'}), 400
|
||||
if not re.fullmatch(r'[a-zA-Z0-9._-]+', matched_iface):
|
||||
logger.error(f'Refusing diagnostic: invalid iface "{matched_iface}" for {server_name}')
|
||||
return jsonify({'error': 'Resolved interface name contains invalid characters'}), 400
|
||||
|
||||
job_id = str(uuid.uuid4())
|
||||
with _diag_lock:
|
||||
_diag_jobs[job_id] = {'status': 'running', 'result': None, 'created_at': time.time()}
|
||||
|
||||
@@ -261,7 +261,7 @@ class PulseClient:
|
||||
execution_id = resp.json()['execution_id']
|
||||
self.last_execution_id = execution_id
|
||||
except Exception as e:
|
||||
logger.debug(f'Pulse command submit failed: {e}')
|
||||
logger.error(f'Pulse command submit failed: {e}')
|
||||
return None
|
||||
|
||||
deadline = time.time() + self.timeout
|
||||
@@ -284,7 +284,7 @@ class PulseClient:
|
||||
if status == 'failed':
|
||||
return None
|
||||
except Exception as e:
|
||||
logger.debug(f'Pulse poll failed: {e}')
|
||||
logger.error(f'Pulse poll failed: {e}')
|
||||
logger.warning(f'Pulse command timed out after {self.timeout}s')
|
||||
return None
|
||||
|
||||
@@ -340,7 +340,7 @@ class LinkStatsCollector:
|
||||
)
|
||||
output = self.pulse.run_command(ssh_cmd)
|
||||
if output is None:
|
||||
logger.debug(f'Pulse ethtool collection returned None for {ip}')
|
||||
logger.error(f'Pulse ethtool collection returned None for {ip}')
|
||||
return {}
|
||||
|
||||
return self._parse_ssh_output(output)
|
||||
|
||||
Reference in New Issue
Block a user