New features: stale banner, tab title alerts, health checks, DB housekeeping
static/app.js:
- Browser tab title updates to show alert count: '(3 CRIT) GANDALF' or '(2 WARN) GANDALF'
- Stale monitoring banner: injected into .main if last_check is more than 15 min old; warns the operator that the monitor daemon may be down

static/style.css:
- .stale-banner: amber warning strip with a thick left border

app.py:
- /health now checks DB connectivity and monitor freshness (last_check age); returns 503 + degraded status if the DB is unreachable or the monitor is stale (>20 min)

db.py:
- cleanup_expired_suppressions(): marks time-limited suppressions inactive when expires_at <= NOW() (previously they were only filtered out in SELECTs, never marked inactive)
- purge_old_resolved_events(days=90): deletes old resolved events to prevent unbounded table growth

monitor.py:
- Calls cleanup_expired_suppressions() and purge_old_resolved_events() each cycle

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
34
app.py
34
app.py
@@ -391,8 +391,38 @@ def api_diagnose_poll(job_id: str):
|
|||||||
|
|
||||||
@app.route('/health')
def health():
    """Health check endpoint (no auth). Checks DB and monitor freshness.

    Responds with a JSON object ``{'status', 'service', 'checks'}``.

    Returns:
        HTTP 200 with status ``'ok'`` when every check passes, or HTTP 503
        with status ``'degraded'`` when the DB read fails, the stored
        ``last_check`` timestamp cannot be parsed, or it is more than
        20 minutes old.
    """
    from datetime import datetime, timezone

    stale_after_s = 1200  # 20 minutes
    checks = {}
    overall = 'ok'

    # DB connectivity. One read of 'last_check' both proves the DB answers
    # and supplies the value used by the freshness check below.
    last_check = ''
    db_ok = False
    try:
        last_check = db.get_state('last_check', '')
        db_ok = True
        checks['db'] = 'ok'
    except Exception as e:
        # This endpoint is unauthenticated: keep the raw driver message in
        # the server log and expose only the exception class to the caller.
        app.logger.exception('Health check: DB read failed')
        checks['db'] = f'error: {type(e).__name__}'
        overall = 'degraded'

    # Monitor freshness: fail if last_check is older than 20 minutes
    if not db_ok:
        # Freshness cannot be determined without the DB; the failure is
        # already reported (and counted) under checks['db'].
        checks['monitor'] = 'unknown (db unavailable)'
    elif not last_check:
        checks['monitor'] = 'no data yet'
    else:
        try:
            ts = datetime.strptime(
                last_check, '%Y-%m-%d %H:%M:%S UTC'
            ).replace(tzinfo=timezone.utc)
        except (TypeError, ValueError) as e:
            checks['monitor'] = f'error: {e}'
            overall = 'degraded'
        else:
            age_s = (datetime.now(timezone.utc) - ts).total_seconds()
            if age_s > stale_after_s:
                checks['monitor'] = f'stale ({int(age_s)}s since last check)'
                overall = 'degraded'
            else:
                checks['monitor'] = f'ok ({int(age_s)}s ago)'

    status_code = 200 if overall == 'ok' else 503
    return jsonify({'status': overall, 'service': 'gandalf', 'checks': checks}), status_code
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|||||||
31
db.py
31
db.py
@@ -269,6 +269,37 @@ def deactivate_suppression(sup_id: int) -> None:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def cleanup_expired_suppressions() -> int:
    """Deactivate time-limited suppression rules whose expiry has passed.

    Sets ``active=FALSE`` on every ``suppression_rules`` row that is still
    active, has a non-NULL ``expires_at``, and whose ``expires_at`` is at or
    before the database's ``NOW()``. An INFO line is logged only when at
    least one row was affected.

    Returns:
        The row count reported by the cursor for the UPDATE.
    """
    with get_conn() as conn, conn.cursor() as cur:
        cur.execute(
            """UPDATE suppression_rules
                   SET active=FALSE
                   WHERE active=TRUE AND expires_at IS NOT NULL AND expires_at <= NOW()"""
        )
        deactivated = cur.rowcount
        if deactivated:
            logger.info(f'Deactivated {deactivated} expired suppression(s)')
        return deactivated
|
||||||
|
|
||||||
|
|
||||||
|
def purge_old_resolved_events(days: int = 90) -> int:
    """Delete resolved events older than `days` days. Returns count deleted.

    Only ``network_events`` rows with a non-NULL ``resolved_at`` earlier than
    ``NOW() - days`` are removed. An INFO line is logged only when at least
    one row was deleted.

    Args:
        days: Retention window in days. Must not be negative.

    Returns:
        The row count reported by the cursor for the DELETE.

    Raises:
        ValueError: If ``days`` is negative.
    """
    # A negative interval moves the cutoff into the future, which would delete
    # every resolved event regardless of age. Refuse before touching the DB.
    if days < 0:
        raise ValueError(f'days must be >= 0, got {days!r}')

    with get_conn() as conn:
        with conn.cursor() as cur:
            cur.execute(
                """DELETE FROM network_events
                       WHERE resolved_at IS NOT NULL
                         AND resolved_at < DATE_SUB(NOW(), INTERVAL %s DAY)""",
                (days,),
            )
            n = cur.rowcount
            if n:
                logger.info(f'Purged {n} old resolved event(s) (>{days}d)')
            return n
|
||||||
|
|
||||||
|
|
||||||
def check_suppressed(suppressions: list, target_type: str, target_name: str, target_detail: str = '') -> bool:
|
def check_suppressed(suppressions: list, target_type: str, target_name: str, target_detail: str = '') -> bool:
|
||||||
"""Check suppression against a pre-loaded list (avoids per-call DB queries)."""
|
"""Check suppression against a pre-loaded list (avoids per-call DB queries)."""
|
||||||
for s in suppressions:
|
for s in suppressions:
|
||||||
|
|||||||
@@ -916,6 +916,10 @@ class NetworkMonitor:
|
|||||||
|
|
||||||
self._process_ping_hosts(suppressions)
|
self._process_ping_hosts(suppressions)
|
||||||
|
|
||||||
|
# Housekeeping: deactivate expired suppressions and purge old resolved events
|
||||||
|
db.cleanup_expired_suppressions()
|
||||||
|
db.purge_old_resolved_events(days=90)
|
||||||
|
|
||||||
logger.info('Network check cycle complete')
|
logger.info('Network check cycle complete')
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
@@ -49,6 +49,37 @@ function updateStatusBar(summary, lastCheck) {
|
|||||||
|
|
||||||
const lc = document.getElementById('last-check');
|
const lc = document.getElementById('last-check');
|
||||||
if (lc && lastCheck) lc.textContent = lastCheck;
|
if (lc && lastCheck) lc.textContent = lastCheck;
|
||||||
|
|
||||||
|
// Update browser tab title with alert count
|
||||||
|
const critCount = summary.critical || 0;
|
||||||
|
const warnCount = summary.warning || 0;
|
||||||
|
if (critCount) {
|
||||||
|
document.title = `(${critCount} CRIT) GANDALF`;
|
||||||
|
} else if (warnCount) {
|
||||||
|
document.title = `(${warnCount} WARN) GANDALF`;
|
||||||
|
} else {
|
||||||
|
document.title = 'GANDALF';
|
||||||
|
}
|
||||||
|
|
||||||
|
// Stale data banner: warn if last_check is older than 15 minutes
|
||||||
|
let staleBanner = document.getElementById('stale-banner');
|
||||||
|
if (lastCheck) {
|
||||||
|
// last_check format: "2026-03-14 14:14:21 UTC"
|
||||||
|
const checkAge = (Date.now() - new Date(lastCheck.replace(' UTC', 'Z').replace(' ', 'T'))) / 1000;
|
||||||
|
if (checkAge > 900) { // 15 minutes
|
||||||
|
if (!staleBanner) {
|
||||||
|
staleBanner = document.createElement('div');
|
||||||
|
staleBanner.id = 'stale-banner';
|
||||||
|
staleBanner.className = 'stale-banner';
|
||||||
|
document.querySelector('.main').prepend(staleBanner);
|
||||||
|
}
|
||||||
|
const mins = Math.floor(checkAge / 60);
|
||||||
|
staleBanner.textContent = `⚠ Monitoring data is stale — last check was ${mins} minute${mins !== 1 ? 's' : ''} ago. The monitor daemon may be down.`;
|
||||||
|
staleBanner.style.display = '';
|
||||||
|
} else if (staleBanner) {
|
||||||
|
staleBanner.style.display = 'none';
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
function updateHostGrid(hosts) {
|
function updateHostGrid(hosts) {
|
||||||
|
|||||||
@@ -1423,3 +1423,16 @@ a:hover { text-decoration: underline; text-shadow: var(--glow-amber); }
|
|||||||
.inspector-panel.open { width:100%; }
|
.inspector-panel.open { width:100%; }
|
||||||
.inspector-panel-inner { width:100%; }
|
.inspector-panel-inner { width:100%; }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* ── Stale monitoring banner ──────────────────────────────────────── */
/* Warning strip for the element that app.js creates with class
   "stale-banner" when the last monitor check is too old. All colours
   come from the --warning custom property. */
.stale-banner {
  background: rgba(255, 160, 0, 0.12);   /* amber at 12% opacity */
  border: 1px solid var(--warning);
  /* Must stay after the `border` shorthand so the 4px left edge overrides it. */
  border-left: 4px solid var(--warning);
  color: var(--warning);
  padding: 10px 16px;
  margin: 12px 16px 0;                   /* no bottom margin */
  font-size: 0.88em;
  font-family: var(--font-mono);
  border-radius: 2px;
}
|
||||||
|
|||||||
Reference in New Issue
Block a user