Add HTTP health check endpoint on port 9102
A lightweight /health endpoint returns JSON with the status, hostname, and last-check timestamp. It runs as a daemon thread and is activated via the --health-server flag or by setting HEALTH_SERVER_ENABLED=true in the .env config. Fixes: #21. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -130,7 +130,10 @@ class SystemHealthMonitor:
|
|||||||
'NEW_DRIVE_HOURS_THRESHOLD': 720, # Hours to consider a drive "new" (~30 days)
|
'NEW_DRIVE_HOURS_THRESHOLD': 720, # Hours to consider a drive "new" (~30 days)
|
||||||
'SMART_ERROR_RECENT_HOURS': 168, # Hours window for recent SMART errors (~1 week)
|
'SMART_ERROR_RECENT_HOURS': 168, # Hours window for recent SMART errors (~1 week)
|
||||||
# Storage limits
|
# Storage limits
|
||||||
'HISTORY_MAX_BYTES': 52428800 # 50MB max storage for history files
|
'HISTORY_MAX_BYTES': 52428800, # 50MB max storage for history files
|
||||||
|
# Health check endpoint
|
||||||
|
'HEALTH_SERVER_ENABLED': False, # Enable HTTP health check endpoint
|
||||||
|
'HEALTH_SERVER_PORT': 9102 # Port for health check endpoint
|
||||||
}
|
}
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@@ -212,6 +215,15 @@ class SystemHealthMonitor:
|
|||||||
cls.CONFIG['HISTORY_MAX_BYTES'] = int(value)
|
cls.CONFIG['HISTORY_MAX_BYTES'] = int(value)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
logger.warning(f"Invalid HISTORY_MAX_BYTES value: {value}")
|
logger.warning(f"Invalid HISTORY_MAX_BYTES value: {value}")
|
||||||
|
# Health server settings
|
||||||
|
elif key == 'HEALTH_SERVER_ENABLED':
|
||||||
|
cls.CONFIG['HEALTH_SERVER_ENABLED'] = value.lower() in ('true', '1', 'yes')
|
||||||
|
logger.info(f"✓ Loaded HEALTH_SERVER_ENABLED: {cls.CONFIG['HEALTH_SERVER_ENABLED']}")
|
||||||
|
elif key == 'HEALTH_SERVER_PORT':
|
||||||
|
try:
|
||||||
|
cls.CONFIG['HEALTH_SERVER_PORT'] = int(value)
|
||||||
|
except ValueError:
|
||||||
|
logger.warning(f"Invalid HEALTH_SERVER_PORT value: {value}")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to load .env file: {e}")
|
logger.error(f"Failed to load .env file: {e}")
|
||||||
@@ -669,6 +681,10 @@ class SystemHealthMonitor:
|
|||||||
# Drive details cache (per-run, cleared on next execution)
|
# Drive details cache (per-run, cleared on next execution)
|
||||||
self._drive_details_cache = {}
|
self._drive_details_cache = {}
|
||||||
|
|
||||||
|
# Health check tracking
|
||||||
|
self._last_check_timestamp = None
|
||||||
|
self._last_check_status = 'unknown'
|
||||||
|
|
||||||
# Check tool availability at startup
|
# Check tool availability at startup
|
||||||
self._available_tools = self._check_tool_availability()
|
self._available_tools = self._check_tool_availability()
|
||||||
|
|
||||||
@@ -751,6 +767,45 @@ class SystemHealthMonitor:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error enforcing storage limit: {e}")
|
logger.error(f"Error enforcing storage limit: {e}")
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# HEALTH CHECK ENDPOINT
|
||||||
|
# =============================================================================
|
||||||
|
def _start_health_server(self):
    """Start a lightweight HTTP health check endpoint as a daemon thread.

    Serves ``GET /health`` with a JSON object containing ``status``,
    ``hostname``, ``last_check`` and ``uptime``. Any other path gets an
    empty 404 response. The listening port is read from
    ``self.CONFIG['HEALTH_SERVER_PORT']`` (default 9102) and the server
    binds to all interfaces.

    Failure to create the listening socket (port in use, insufficient
    privileges, port number out of range) is logged as a warning and
    otherwise ignored, so the caller continues without the endpoint.
    """
    from http.server import HTTPServer, BaseHTTPRequestHandler
    from urllib.parse import urlsplit
    import threading

    # The handler class has its own ``self``; keep a separate reference
    # to the monitor so the handler can read the last check results.
    monitor = self

    class HealthHandler(BaseHTTPRequestHandler):
        def do_GET(self):
            # Compare only the path component so that probes which append
            # a query string (e.g. /health?probe=1) are still answered.
            if urlsplit(self.path).path != '/health':
                self.send_response(404)
                self.send_header('Content-Length', '0')
                self.end_headers()
                return

            response = {
                'status': monitor._last_check_status,
                'hostname': socket.gethostname(),
                'last_check': monitor._last_check_timestamp,
                # NOTE(review): despite its name this is the current
                # time, not an uptime duration. The key is kept as-is so
                # existing consumers of the endpoint are not broken.
                'uptime': datetime.datetime.now().isoformat()
            }
            body = json.dumps(response).encode()
            self.send_response(200)
            self.send_header('Content-Type', 'application/json')
            # Explicit length lets clients frame the response without
            # waiting for the connection to close.
            self.send_header('Content-Length', str(len(body)))
            self.end_headers()
            self.wfile.write(body)

        def log_message(self, format, *args):
            # Route the default stderr access log through the module
            # logger at DEBUG level instead.
            logger.debug(f"Health server: {format % args}")

    port = self.CONFIG.get('HEALTH_SERVER_PORT', 9102)
    try:
        server = HTTPServer(('', port), HealthHandler)
        thread = threading.Thread(target=server.serve_forever, daemon=True)
        thread.start()
        logger.info(f"Health check endpoint started on port {port}")
    except (OSError, OverflowError) as e:
        # OverflowError is what bind() raises for a port outside 0-65535,
        # which can happen with a bad HEALTH_SERVER_PORT value in .env.
        logger.warning(f"Could not start health server on port {port}: {e}")
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# MAIN EXECUTION METHODS
|
# MAIN EXECUTION METHODS
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
@@ -760,6 +815,10 @@ class SystemHealthMonitor:
|
|||||||
# Perform health checks and gather the report
|
# Perform health checks and gather the report
|
||||||
health_report = self.perform_health_checks()
|
health_report = self.perform_health_checks()
|
||||||
|
|
||||||
|
# Track last check for health endpoint
|
||||||
|
self._last_check_timestamp = datetime.datetime.now().isoformat()
|
||||||
|
self._last_check_status = health_report.get('drives_health', {}).get('overall_status', 'unknown')
|
||||||
|
|
||||||
# Create tickets for any detected critical issues
|
# Create tickets for any detected critical issues
|
||||||
self._create_tickets_for_issues(health_report)
|
self._create_tickets_for_issues(health_report)
|
||||||
|
|
||||||
@@ -3625,6 +3684,11 @@ def main():
|
|||||||
action="store_true",
|
action="store_true",
|
||||||
help="Enable verbose (DEBUG) logging output."
|
help="Enable verbose (DEBUG) logging output."
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--health-server",
|
||||||
|
action="store_true",
|
||||||
|
help="Start HTTP health check endpoint (default port 9102)."
|
||||||
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
monitor = SystemHealthMonitor(
|
monitor = SystemHealthMonitor(
|
||||||
@@ -3633,6 +3697,10 @@ def main():
|
|||||||
verbose=args.verbose
|
verbose=args.verbose
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Start health server if requested via CLI or .env
|
||||||
|
if args.health_server or monitor.CONFIG.get('HEALTH_SERVER_ENABLED', False):
|
||||||
|
monitor._start_health_server()
|
||||||
|
|
||||||
if args.metrics:
|
if args.metrics:
|
||||||
# Just output metrics to stdout
|
# Just output metrics to stdout
|
||||||
health_report = monitor.perform_health_checks()
|
health_report = monitor.perform_health_checks()
|
||||||
|
|||||||
Reference in New Issue
Block a user