Security and reliability fixes: input validation, logging, job cleanup

- C5: Validate host_ip (IPv4 check) and iface (allowlist regex) before SSH command builder
- H6: Upgrade Pulse failure logging from debug to error so operators see outages
- M6: Replace per-request O(n) purge with background daemon thread (runs every 2 min)
- M7: Background thread marks jobs stuck in 'running' > 5 min as errored

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-12 17:30:50 -04:00
parent b1dd5f9cad
commit 0335845101
2 changed files with 34 additions and 11 deletions

29
app.py
View File

@@ -4,8 +4,10 @@ Flask web application serving the monitoring dashboard and suppression
management UI. Authentication via Authelia forward-auth headers.
All monitoring and alerting is handled by the separate monitor.py daemon.
"""
import ipaddress
import json
import logging
import re
import threading
import time
import uuid
@@ -32,13 +34,25 @@ _diag_jobs: dict = {}
_diag_lock = threading.Lock()
def _purge_old_jobs():
"""Remove jobs older than 10 minutes (called before each new job creation)."""
def _purge_old_jobs_loop():
    """Background daemon thread: every 2 minutes, purge diagnostic jobs older
    than 10 minutes and flag jobs still 'running' after 5 minutes as errored
    (a worker that old must have crashed without recording a result).

    Runs forever; started once at import time as a daemon thread so it never
    blocks interpreter shutdown.
    """
    while True:
        time.sleep(120)
        # Take a single timestamp so both cutoffs are consistent with each
        # other (two time.time() calls could drift by the loop-body runtime).
        now = time.time()
        cutoff = now - 600        # jobs older than 10 min are removed outright
        stuck_cutoff = now - 300  # 5 min: job still 'running' -> thread must have crashed
        with _diag_lock:
            # Collect stale ids first; deleting while iterating a dict raises.
            stale = [jid for jid, j in _diag_jobs.items() if j.get('created_at', 0) < cutoff]
            for jid in stale:
                del _diag_jobs[jid]
            # Surviving jobs stuck in 'running' get a synthetic error result so
            # pollers see a terminal state instead of waiting forever.
            for jid, j in _diag_jobs.items():
                if j.get('status') == 'running' and j.get('created_at', 0) < stuck_cutoff:
                    j['status'] = 'done'
                    j['result'] = {'status': 'error', 'error': 'Diagnostic timed out (thread crash)'}
                    # Lazy %-args: message is only formatted if the record is emitted.
                    logger.error('Diagnostic job %s appeared stuck; marked as errored', jid)


_purge_thread = threading.Thread(target=_purge_old_jobs_loop, daemon=True)
_purge_thread.start()
def _config() -> dict:
@@ -314,7 +328,16 @@ def api_diagnose_start():
if not host_ip:
return jsonify({'error': 'Cannot determine host IP for SSH'}), 400
_purge_old_jobs()
# Validate resolved values before passing to SSH command builder
try:
ipaddress.ip_address(host_ip)
except ValueError:
logger.error(f'Refusing diagnostic: invalid host_ip "{host_ip}" for {server_name}')
return jsonify({'error': 'Resolved host IP is not a valid IP address'}), 400
if not re.fullmatch(r'[a-zA-Z0-9._-]+', matched_iface):
logger.error(f'Refusing diagnostic: invalid iface "{matched_iface}" for {server_name}')
return jsonify({'error': 'Resolved interface name contains invalid characters'}), 400
job_id = str(uuid.uuid4())
with _diag_lock:
_diag_jobs[job_id] = {'status': 'running', 'result': None, 'created_at': time.time()}

View File

@@ -261,7 +261,7 @@ class PulseClient:
execution_id = resp.json()['execution_id']
self.last_execution_id = execution_id
except Exception as e:
logger.debug(f'Pulse command submit failed: {e}')
logger.error(f'Pulse command submit failed: {e}')
return None
deadline = time.time() + self.timeout
@@ -284,7 +284,7 @@ class PulseClient:
if status == 'failed':
return None
except Exception as e:
logger.debug(f'Pulse poll failed: {e}')
logger.error(f'Pulse poll failed: {e}')
logger.warning(f'Pulse command timed out after {self.timeout}s')
return None
@@ -340,7 +340,7 @@ class LinkStatsCollector:
)
output = self.pulse.run_command(ssh_cmd)
if output is None:
logger.debug(f'Pulse ethtool collection returned None for {ip}')
logger.error(f'Pulse ethtool collection returned None for {ip}')
return {}
return self._parse_ssh_output(output)