New features: stale banner, tab title alerts, health checks, DB housekeeping

static/app.js:
- Browser tab title updates to show alert count: '(3 CRIT) GANDALF' or '(2 WARN) GANDALF'
- Stale monitoring banner: injected into .main if last_check > 15 min old,
  warns operator that the monitor daemon may be down

static/style.css:
- .stale-banner: amber warning strip with a 1px border and a 4px left accent border

app.py:
- /health now checks DB connectivity and monitor freshness (last_check age)
  Returns 503 + degraded status if DB unreachable or monitor stale >20min

db.py:
- cleanup_expired_suppressions(): marks time-limited suppressions inactive when
  expires_at <= NOW() (was only filtered in SELECTs, never marked inactive)
- purge_old_resolved_events(days=90): deletes old resolved events to prevent
  unbounded table growth

monitor.py:
- Calls cleanup_expired_suppressions() and purge_old_resolved_events() each cycle

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-14 21:35:32 -04:00
parent 14eaa6a8c9
commit 17d3b7d227
5 changed files with 111 additions and 2 deletions

34
app.py
View File

@@ -391,8 +391,38 @@ def api_diagnose_poll(job_id: str):
@app.route('/health')
def health():
    """Health check endpoint (no auth). Checks DB and monitor freshness.

    Returns a ``(response, status_code)`` pair. The JSON body has the keys
    ``status`` ('ok' or 'degraded'), ``service`` and ``checks`` (per-check
    result strings for ``db`` and ``monitor``).

    The status is HTTP 200 when every check passes and HTTP 503 when the
    database query fails, when the stored ``last_check`` timestamp cannot be
    parsed, or when it is more than 20 minutes old. A missing ``last_check``
    value is reported as 'no data yet' and does not degrade the status.
    """
    from datetime import datetime, timezone

    stale_after_s = 20 * 60  # monitor counts as stale after 20 minutes
    checks = {}
    overall = 'ok'

    # A single query serves both checks: it proves the DB is reachable and
    # yields the timestamp used for the freshness check.
    try:
        last_check = db.get_state('last_check', '')
    except Exception:
        # This endpoint is unauthenticated, so keep driver error text (which
        # can include host/user details) in the server log, not the response.
        app.logger.exception('Health check: database query failed')
        checks['db'] = 'error: database query failed'
        checks['monitor'] = 'error: database unavailable'
        overall = 'degraded'
    else:
        checks['db'] = 'ok'
        if not last_check:
            checks['monitor'] = 'no data yet'
        else:
            try:
                ts = datetime.strptime(
                    last_check, '%Y-%m-%d %H:%M:%S UTC'
                ).replace(tzinfo=timezone.utc)
            except (TypeError, ValueError):
                app.logger.exception('Health check: cannot parse last_check value')
                checks['monitor'] = 'error: unparseable last_check timestamp'
                overall = 'degraded'
            else:
                age_s = (datetime.now(timezone.utc) - ts).total_seconds()
                if age_s > stale_after_s:
                    checks['monitor'] = f'stale ({int(age_s)}s since last check)'
                    overall = 'degraded'
                else:
                    checks['monitor'] = f'ok ({int(age_s)}s ago)'

    status_code = 200 if overall == 'ok' else 503
    return jsonify({'status': overall, 'service': 'gandalf', 'checks': checks}), status_code
if __name__ == '__main__':

31
db.py
View File

@@ -269,6 +269,37 @@ def deactivate_suppression(sup_id: int) -> None:
)
def cleanup_expired_suppressions() -> int:
    """Flag time-limited suppression rules as inactive once they have expired.

    Issues one UPDATE against ``suppression_rules`` that sets ``active=FALSE``
    on every active row whose ``expires_at`` is set and is not later than the
    database's ``NOW()``.

    Returns:
        The row count reported by the cursor for that UPDATE.
    """
    with get_conn() as conn, conn.cursor() as cur:
        cur.execute(
            """UPDATE suppression_rules
               SET active=FALSE
               WHERE active=TRUE AND expires_at IS NOT NULL AND expires_at <= NOW()"""
        )
        deactivated = cur.rowcount

    # Stay quiet on cycles where nothing expired.
    if deactivated:
        logger.info(f'Deactivated {deactivated} expired suppression(s)')
    return deactivated
def purge_old_resolved_events(days: int = 90) -> int:
    """Delete resolved events older than `days` days. Returns count deleted.

    Only rows of ``network_events`` with a non-NULL ``resolved_at`` earlier
    than ``NOW() - days`` (evaluated by the database) are removed.

    Args:
        days: Retention window in days. Must be >= 0; ``0`` removes every
            event resolved before the current database time.

    Returns:
        The row count reported by the cursor for the DELETE.

    Raises:
        ValueError: If ``days`` is negative. A negative interval would move
            the cutoff into the future and silently delete every resolved
            event, so it is rejected before the database is touched.
    """
    if days < 0:
        raise ValueError(f'days must be >= 0, got {days!r}')

    with get_conn() as conn:
        with conn.cursor() as cur:
            cur.execute(
                """DELETE FROM network_events
                   WHERE resolved_at IS NOT NULL
                     AND resolved_at < DATE_SUB(NOW(), INTERVAL %s DAY)""",
                (days,),
            )
            n = cur.rowcount
    if n:
        logger.info('Purged %s old resolved event(s) (>%sd)', n, days)
    return n
def check_suppressed(suppressions: list, target_type: str, target_name: str, target_detail: str = '') -> bool:
"""Check suppression against a pre-loaded list (avoids per-call DB queries)."""
for s in suppressions:

View File

@@ -916,6 +916,10 @@ class NetworkMonitor:
self._process_ping_hosts(suppressions)
# Housekeeping: deactivate expired suppressions and purge old resolved events
db.cleanup_expired_suppressions()
db.purge_old_resolved_events(days=90)
logger.info('Network check cycle complete')
except Exception as e:

View File

@@ -49,6 +49,37 @@ function updateStatusBar(summary, lastCheck) {
const lc = document.getElementById('last-check');
if (lc && lastCheck) lc.textContent = lastCheck;
// Update browser tab title with alert count
const critCount = summary.critical || 0;
const warnCount = summary.warning || 0;
if (critCount) {
document.title = `(${critCount} CRIT) GANDALF`;
} else if (warnCount) {
document.title = `(${warnCount} WARN) GANDALF`;
} else {
document.title = 'GANDALF';
}
// Stale data banner: warn if last_check is older than 15 minutes
let staleBanner = document.getElementById('stale-banner');
if (lastCheck) {
// last_check format: "2026-03-14 14:14:21 UTC"
const checkAge = (Date.now() - new Date(lastCheck.replace(' UTC', 'Z').replace(' ', 'T'))) / 1000;
if (checkAge > 900) { // 15 minutes
if (!staleBanner) {
staleBanner = document.createElement('div');
staleBanner.id = 'stale-banner';
staleBanner.className = 'stale-banner';
document.querySelector('.main').prepend(staleBanner);
}
const mins = Math.floor(checkAge / 60);
staleBanner.textContent = `⚠ Monitoring data is stale — last check was ${mins} minute${mins !== 1 ? 's' : ''} ago. The monitor daemon may be down.`;
staleBanner.style.display = '';
} else if (staleBanner) {
staleBanner.style.display = 'none';
}
}
}
function updateHostGrid(hosts) {

View File

@@ -1423,3 +1423,16 @@ a:hover { text-decoration: underline; text-shadow: var(--glow-amber); }
.inspector-panel.open { width:100%; }
.inspector-panel-inner { width:100%; }
}
/* ── Stale monitoring banner ──────────────────────────────────────── */
/* Styles the element that app.js creates with id and class "stale-banner"
   when the last monitoring check is too old. */
.stale-banner {
background: rgba(255, 160, 0, 0.12);
border: 1px solid var(--warning);
/* Must stay after `border`: overrides the 1px shorthand with a 4px left accent. */
border-left: 4px solid var(--warning);
color: var(--warning);
padding: 10px 16px;
/* 12px top, 16px left/right, no bottom margin. */
margin: 12px 16px 0;
font-size: 0.88em;
font-family: var(--font-mono);
border-radius: 2px;
}