Add HTTP health check endpoint on port 9102
A lightweight /health endpoint returns JSON with status, hostname, and last check timestamp. It runs as a daemon thread and is activated via the --health-server flag or HEALTH_SERVER_ENABLED=true in the .env config.

Fixes: #21

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -130,7 +130,10 @@ class SystemHealthMonitor:
|
||||
'NEW_DRIVE_HOURS_THRESHOLD': 720, # Hours to consider a drive "new" (~30 days)
|
||||
'SMART_ERROR_RECENT_HOURS': 168, # Hours window for recent SMART errors (~1 week)
|
||||
# Storage limits
|
||||
'HISTORY_MAX_BYTES': 52428800 # 50MB max storage for history files
|
||||
'HISTORY_MAX_BYTES': 52428800, # 50MB max storage for history files
|
||||
# Health check endpoint
|
||||
'HEALTH_SERVER_ENABLED': False, # Enable HTTP health check endpoint
|
||||
'HEALTH_SERVER_PORT': 9102 # Port for health check endpoint
|
||||
}
|
||||
|
||||
@classmethod
|
||||
@@ -212,6 +215,15 @@ class SystemHealthMonitor:
|
||||
cls.CONFIG['HISTORY_MAX_BYTES'] = int(value)
|
||||
except ValueError:
|
||||
logger.warning(f"Invalid HISTORY_MAX_BYTES value: {value}")
|
||||
# Health server settings
|
||||
elif key == 'HEALTH_SERVER_ENABLED':
|
||||
cls.CONFIG['HEALTH_SERVER_ENABLED'] = value.lower() in ('true', '1', 'yes')
|
||||
logger.info(f"✓ Loaded HEALTH_SERVER_ENABLED: {cls.CONFIG['HEALTH_SERVER_ENABLED']}")
|
||||
elif key == 'HEALTH_SERVER_PORT':
|
||||
try:
|
||||
cls.CONFIG['HEALTH_SERVER_PORT'] = int(value)
|
||||
except ValueError:
|
||||
logger.warning(f"Invalid HEALTH_SERVER_PORT value: {value}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to load .env file: {e}")
|
||||
@@ -669,6 +681,10 @@ class SystemHealthMonitor:
|
||||
# Drive details cache (per-run, cleared on next execution)
|
||||
self._drive_details_cache = {}
|
||||
|
||||
# Health check tracking
|
||||
self._last_check_timestamp = None
|
||||
self._last_check_status = 'unknown'
|
||||
|
||||
# Check tool availability at startup
|
||||
self._available_tools = self._check_tool_availability()
|
||||
|
||||
@@ -751,6 +767,45 @@ class SystemHealthMonitor:
|
||||
except Exception as e:
|
||||
logger.error(f"Error enforcing storage limit: {e}")
|
||||
|
||||
# =============================================================================
|
||||
# HEALTH CHECK ENDPOINT
|
||||
# =============================================================================
|
||||
def _start_health_server(self):
    """Start a lightweight HTTP health check endpoint in a daemon thread.

    Serves ``GET /health`` on ``CONFIG['HEALTH_SERVER_PORT']`` (default
    9102), bound to all interfaces. The JSON body contains:

    * ``status`` / ``last_check`` -- the monitor's ``_last_check_status``
      and ``_last_check_timestamp`` attributes, read at request time.
    * ``hostname`` -- ``socket.gethostname()``.
    * ``uptime`` -- seconds elapsed since this endpoint was started.
    * ``timestamp`` -- ISO-8601 local time at which the response was built.

    Any other path receives an empty 404 response.

    Failure to create the listening socket (port already in use, missing
    privileges, or a port number outside 0-65535) is logged as a warning
    and the method returns without starting a thread; it never raises for
    those conditions.

    Relies on the module-level ``socket``, ``datetime``, ``json`` and
    ``logger`` names.
    """
    from http.server import HTTPServer, BaseHTTPRequestHandler
    import threading
    import time

    monitor = self
    # Monotonic clock: the reported uptime must not jump when the wall
    # clock is adjusted (NTP step, manual change).
    started_at = time.monotonic()

    class HealthHandler(BaseHTTPRequestHandler):
        # Per-connection socket timeout in seconds. HTTPServer handles one
        # request at a time, so without this a client that connects and
        # never sends a request would block every later probe.
        timeout = 5

        def do_GET(self):
            # Drop any query string so probes such as "/health?probe=1"
            # are still routed to the health response.
            route = self.path.split('?', 1)[0]
            if route != '/health':
                self.send_response(404)
                self.end_headers()
                return

            response = {
                'status': monitor._last_check_status,
                'hostname': socket.gethostname(),
                'last_check': monitor._last_check_timestamp,
                # Real uptime of the endpoint, in seconds.
                'uptime': round(time.monotonic() - started_at, 3),
                # Current time, so clients can judge how stale last_check is.
                'timestamp': datetime.datetime.now().isoformat()
            }
            body = json.dumps(response).encode()

            self.send_response(200)
            self.send_header('Content-Type', 'application/json')
            # Explicit length lets clients detect a truncated body.
            self.send_header('Content-Length', str(len(body)))
            self.end_headers()
            self.wfile.write(body)

        def log_message(self, format, *args):
            # Route the handler's access/error lines to the application
            # logger at DEBUG instead of writing them to stderr.
            logger.debug(f"Health server: {format % args}")

    port = self.CONFIG.get('HEALTH_SERVER_PORT', 9102)
    try:
        server = HTTPServer(('', port), HealthHandler)
    except (OSError, OverflowError) as e:
        # OverflowError is what bind() raises for a port outside 0-65535,
        # which the integer-only config parsing does not rule out.
        logger.warning(f"Could not start health server on port {port}: {e}")
        return

    # Daemon thread: the endpoint must never keep the process alive.
    thread = threading.Thread(target=server.serve_forever, daemon=True)
    thread.start()
    logger.info(f"Health check endpoint started on port {port}")
|
||||
|
||||
# =============================================================================
|
||||
# MAIN EXECUTION METHODS
|
||||
# =============================================================================
|
||||
@@ -760,6 +815,10 @@ class SystemHealthMonitor:
|
||||
# Perform health checks and gather the report
|
||||
health_report = self.perform_health_checks()
|
||||
|
||||
# Track last check for health endpoint
|
||||
self._last_check_timestamp = datetime.datetime.now().isoformat()
|
||||
self._last_check_status = health_report.get('drives_health', {}).get('overall_status', 'unknown')
|
||||
|
||||
# Create tickets for any detected critical issues
|
||||
self._create_tickets_for_issues(health_report)
|
||||
|
||||
@@ -3625,6 +3684,11 @@ def main():
|
||||
action="store_true",
|
||||
help="Enable verbose (DEBUG) logging output."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--health-server",
|
||||
action="store_true",
|
||||
help="Start HTTP health check endpoint (default port 9102)."
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
monitor = SystemHealthMonitor(
|
||||
@@ -3633,6 +3697,10 @@ def main():
|
||||
verbose=args.verbose
|
||||
)
|
||||
|
||||
# Start health server if requested via CLI or .env
|
||||
if args.health_server or monitor.CONFIG.get('HEALTH_SERVER_ENABLED', False):
|
||||
monitor._start_health_server()
|
||||
|
||||
if args.metrics:
|
||||
# Just output metrics to stdout
|
||||
health_report = monitor.perform_health_checks()
|
||||
|
||||
Reference in New Issue
Block a user