Add HTTP health check endpoint on port 9102

A lightweight /health endpoint returns JSON with status, hostname, and
last-check timestamp. It runs as a daemon thread and is enabled via the
--health-server flag or HEALTH_SERVER_ENABLED=true in the .env config.

Fixes: #21

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-10 13:15:15 -05:00
parent b02e416117
commit 07782da7b6

View File

@@ -130,7 +130,10 @@ class SystemHealthMonitor:
'NEW_DRIVE_HOURS_THRESHOLD': 720, # Hours to consider a drive "new" (~30 days)
'SMART_ERROR_RECENT_HOURS': 168, # Hours window for recent SMART errors (~1 week)
# Storage limits
'HISTORY_MAX_BYTES': 52428800 # 50MB max storage for history files
'HISTORY_MAX_BYTES': 52428800, # 50MB max storage for history files
# Health check endpoint
'HEALTH_SERVER_ENABLED': False, # Enable HTTP health check endpoint
'HEALTH_SERVER_PORT': 9102 # Port for health check endpoint
}
@classmethod
@@ -212,6 +215,15 @@ class SystemHealthMonitor:
cls.CONFIG['HISTORY_MAX_BYTES'] = int(value)
except ValueError:
logger.warning(f"Invalid HISTORY_MAX_BYTES value: {value}")
# Health server settings
elif key == 'HEALTH_SERVER_ENABLED':
cls.CONFIG['HEALTH_SERVER_ENABLED'] = value.lower() in ('true', '1', 'yes')
logger.info(f"✓ Loaded HEALTH_SERVER_ENABLED: {cls.CONFIG['HEALTH_SERVER_ENABLED']}")
elif key == 'HEALTH_SERVER_PORT':
try:
cls.CONFIG['HEALTH_SERVER_PORT'] = int(value)
except ValueError:
logger.warning(f"Invalid HEALTH_SERVER_PORT value: {value}")
except Exception as e:
logger.error(f"Failed to load .env file: {e}")
@@ -669,6 +681,10 @@ class SystemHealthMonitor:
# Drive details cache (per-run, cleared on next execution)
self._drive_details_cache = {}
# Health check tracking
self._last_check_timestamp = None
self._last_check_status = 'unknown'
# Check tool availability at startup
self._available_tools = self._check_tool_availability()
@@ -751,6 +767,45 @@ class SystemHealthMonitor:
except Exception as e:
logger.error(f"Error enforcing storage limit: {e}")
# =============================================================================
# HEALTH CHECK ENDPOINT
# =============================================================================
def _start_health_server(self):
    """Start a lightweight HTTP health check endpoint in a daemon thread.

    Serves ``GET /health`` with a JSON document containing the last recorded
    check status, the local hostname, the last check timestamp and the
    current time. Every other path is answered with 404.

    The port is read from ``CONFIG['HEALTH_SERVER_PORT']`` (default 9102) and
    the server listens on all interfaces. If the listening socket cannot be
    created (port in use, insufficient privileges, port number out of range),
    a warning is logged and the method returns without raising.
    """
    from http.server import HTTPServer, BaseHTTPRequestHandler
    import threading

    monitor = self

    class HealthHandler(BaseHTTPRequestHandler):
        # HTTPServer handles one connection at a time; a socket timeout keeps
        # a stalled client from blocking the endpoint indefinitely.
        timeout = 5

        def do_GET(self):
            # Match on the path component only, so probes that append a
            # query string (e.g. cache busters) still reach the endpoint.
            route = self.path.split('?', 1)[0].split('#', 1)[0]
            if route != '/health':
                self.send_response(404)
                self.send_header('Content-Length', '0')
                self.end_headers()
                return

            body = json.dumps({
                'status': monitor._last_check_status,
                'hostname': socket.gethostname(),
                'last_check': monitor._last_check_timestamp,
                # NOTE(review): despite the key name, this is the current
                # wall-clock time, not an uptime duration - confirm the
                # intended meaning before renaming or changing it.
                'uptime': datetime.datetime.now().isoformat()
            }).encode()

            self.send_response(200)
            self.send_header('Content-Type', 'application/json')
            self.send_header('Content-Length', str(len(body)))
            self.end_headers()
            self.wfile.write(body)

        def log_message(self, format, *args):
            # Route the default stderr access log through the module logger.
            logger.debug(f"Health server: {format % args}")

    port = self.CONFIG.get('HEALTH_SERVER_PORT', 9102)
    try:
        server = HTTPServer(('', port), HealthHandler)
    except (OSError, OverflowError) as e:
        # OverflowError is what bind() raises for a port outside 0-65535;
        # the configured value is only checked for being an integer.
        logger.warning(f"Could not start health server on port {port}: {e}")
        return

    thread = threading.Thread(target=server.serve_forever, daemon=True)
    thread.start()
    logger.info(f"Health check endpoint started on port {port}")
# =============================================================================
# MAIN EXECUTION METHODS
# =============================================================================
@@ -760,6 +815,10 @@ class SystemHealthMonitor:
# Perform health checks and gather the report
health_report = self.perform_health_checks()
# Track last check for health endpoint
self._last_check_timestamp = datetime.datetime.now().isoformat()
self._last_check_status = health_report.get('drives_health', {}).get('overall_status', 'unknown')
# Create tickets for any detected critical issues
self._create_tickets_for_issues(health_report)
@@ -3625,6 +3684,11 @@ def main():
action="store_true",
help="Enable verbose (DEBUG) logging output."
)
parser.add_argument(
"--health-server",
action="store_true",
help="Start HTTP health check endpoint (default port 9102)."
)
args = parser.parse_args()
monitor = SystemHealthMonitor(
@@ -3633,6 +3697,10 @@ def main():
verbose=args.verbose
)
# Start health server if requested via CLI or .env
if args.health_server or monitor.CONFIG.get('HEALTH_SERVER_ENABLED', False):
monitor._start_health_server()
if args.metrics:
# Just output metrics to stdout
health_report = monitor.perform_health_checks()