Compare commits
6 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 9c5a88fbce | |||
| 0975dd007a | |||
| a34898b8e8 | |||
| 31747c4bd3 | |||
| faa0707f79 | |||
| 9c52e4ad1a |
@@ -0,0 +1,21 @@
|
||||
{
|
||||
"env": {
|
||||
"browser": true,
|
||||
"es2021": true
|
||||
},
|
||||
"globals": {
|
||||
"lt": "readonly",
|
||||
"GANDALF_CONFIG": "readonly",
|
||||
"CSS": "readonly"
|
||||
},
|
||||
"rules": {
|
||||
"no-undef": "error",
|
||||
"no-unused-vars": ["warn", { "argsIgnorePattern": "^_", "varsIgnorePattern": "^_" }],
|
||||
"no-console": "off",
|
||||
"eqeqeq": ["error", "always", { "null": "ignore" }]
|
||||
},
|
||||
"parserOptions": {
|
||||
"ecmaVersion": 2021,
|
||||
"sourceType": "script"
|
||||
}
|
||||
}
|
||||
@@ -64,7 +64,7 @@ _diag_rate: dict = {}
|
||||
|
||||
|
||||
def _purge_old_jobs_loop():
|
||||
"""Background thread: remove stale diag jobs and run daily event purge."""
|
||||
"""Background thread: remove stale diagnostic jobs and mark stuck ones done."""
|
||||
while True:
|
||||
time.sleep(120)
|
||||
cutoff = time.time() - 600
|
||||
|
||||
@@ -3,7 +3,7 @@ import json
|
||||
import logging
|
||||
import threading
|
||||
from contextlib import contextmanager
|
||||
from datetime import datetime, timedelta
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from typing import Optional
|
||||
|
||||
import pymysql
|
||||
@@ -114,12 +114,12 @@ def upsert_event(
|
||||
target_detail: str,
|
||||
description: str,
|
||||
) -> tuple:
|
||||
"""Insert or update a network event. Returns (id, is_new, consecutive_failures)."""
|
||||
"""Insert or update a network event. Returns (id, is_new, consecutive_failures, ticket_id)."""
|
||||
detail = target_detail or ''
|
||||
with get_conn() as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(
|
||||
"""SELECT id, consecutive_failures FROM network_events
|
||||
"""SELECT id, consecutive_failures, ticket_id FROM network_events
|
||||
WHERE event_type=%s AND target_name=%s AND target_detail=%s
|
||||
AND resolved_at IS NULL LIMIT 1""",
|
||||
(event_type, target_name, detail),
|
||||
@@ -134,7 +134,7 @@ def upsert_event(
|
||||
WHERE id=%s""",
|
||||
(new_count, description, existing['id']),
|
||||
)
|
||||
return existing['id'], False, new_count
|
||||
return existing['id'], False, new_count, existing.get('ticket_id')
|
||||
else:
|
||||
cur.execute(
|
||||
"""INSERT INTO network_events
|
||||
@@ -142,7 +142,7 @@ def upsert_event(
|
||||
VALUES (%s, %s, %s, %s, %s, %s)""",
|
||||
(event_type, severity, source_type, target_name, detail, description),
|
||||
)
|
||||
return cur.lastrowid, True, 1
|
||||
return cur.lastrowid, True, 1, None
|
||||
|
||||
|
||||
def resolve_event(event_type: str, target_name: str, target_detail: str = '') -> None:
|
||||
@@ -281,7 +281,7 @@ def create_suppression(
|
||||
) -> int:
|
||||
expires_at = None
|
||||
if expires_minutes:
|
||||
expires_at = datetime.utcnow() + timedelta(minutes=int(expires_minutes))
|
||||
expires_at = datetime.now(timezone.utc) + timedelta(minutes=int(expires_minutes))
|
||||
with get_conn() as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(
|
||||
|
||||
+25
-18
@@ -12,7 +12,7 @@ import logging
|
||||
import re
|
||||
import shlex
|
||||
import time
|
||||
from datetime import datetime
|
||||
from datetime import datetime, timezone
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
import requests
|
||||
@@ -618,7 +618,7 @@ class LinkStatsCollector:
|
||||
return {
|
||||
'hosts': result_hosts,
|
||||
'unifi_switches': unifi_switches,
|
||||
'updated': datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S UTC'),
|
||||
'updated': datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC'),
|
||||
}
|
||||
|
||||
def _compute_unifi_rates(self, raw: Dict[str, dict], now: float) -> Dict[str, dict]:
|
||||
@@ -653,7 +653,7 @@ class LinkStatsCollector:
|
||||
# Helpers
|
||||
# --------------------------------------------------------------------------
|
||||
def _now_utc() -> str:
|
||||
return datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S UTC')
|
||||
return datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------
|
||||
@@ -728,12 +728,12 @@ class NetworkMonitor:
|
||||
db.check_suppressed(suppressions, 'interface', host, iface) or
|
||||
db.check_suppressed(suppressions, 'host', host)
|
||||
)
|
||||
event_id, is_new, consec = db.upsert_event(
|
||||
event_id, is_new, consec, ticket_id = db.upsert_event(
|
||||
'interface_down', 'critical', 'prometheus',
|
||||
host, iface,
|
||||
f'Interface {iface} on {host} went link-down ({_now_utc()})',
|
||||
)
|
||||
if not sup and consec >= self.fail_thresh:
|
||||
if not sup and consec >= self.fail_thresh and not ticket_id:
|
||||
self._ticket_interface(event_id, host, iface, consec)
|
||||
|
||||
if host_has_regression:
|
||||
@@ -744,13 +744,13 @@ class NetworkMonitor:
|
||||
# Cluster-wide check – only genuine regressions count
|
||||
if len(hosts_with_regression) >= self.cluster_thresh:
|
||||
sup = db.check_suppressed(suppressions, 'all', '')
|
||||
event_id, is_new, consec = db.upsert_event(
|
||||
event_id, is_new, consec, ticket_id = db.upsert_event(
|
||||
'cluster_network_issue', 'critical', 'prometheus',
|
||||
self.cluster_name, '',
|
||||
f'{len(hosts_with_regression)} hosts reporting simultaneous interface failures: '
|
||||
f'{", ".join(hosts_with_regression)}',
|
||||
)
|
||||
if not sup and is_new:
|
||||
if not sup and (is_new or not ticket_id):
|
||||
title = (
|
||||
f'[{self.cluster_name}][auto][production][issue][network][cluster-wide] '
|
||||
f'Multiple hosts reporting interface failures'
|
||||
@@ -804,12 +804,12 @@ class NetworkMonitor:
|
||||
name = d['name']
|
||||
if not d['connected']:
|
||||
sup = db.check_suppressed(suppressions, 'unifi_device', name)
|
||||
event_id, is_new, consec = db.upsert_event(
|
||||
event_id, is_new, consec, ticket_id = db.upsert_event(
|
||||
'unifi_device_offline', 'critical', 'unifi',
|
||||
name, d.get('type', ''),
|
||||
f'UniFi {name} ({d.get("ip","")}) offline ({_now_utc()})',
|
||||
)
|
||||
if not sup and consec >= self.fail_thresh:
|
||||
if not sup and consec >= self.fail_thresh and not ticket_id:
|
||||
self._ticket_unifi(event_id, d)
|
||||
else:
|
||||
db.resolve_event('unifi_device_offline', name, d.get('type', ''))
|
||||
@@ -837,19 +837,19 @@ class NetworkMonitor:
|
||||
# ------------------------------------------------------------------
|
||||
# Ping-only hosts (no node_exporter)
|
||||
# ------------------------------------------------------------------
|
||||
def _process_ping_hosts(self, suppressions: list) -> None:
|
||||
def _process_ping_hosts(self, suppressions: list, ping_states: Dict[str, bool]) -> None:
|
||||
for h in self.cfg.get('monitor', {}).get('ping_hosts', []):
|
||||
name, ip = h['name'], h['ip']
|
||||
reachable = self.pulse.ping(ip)
|
||||
reachable = ping_states.get(name, False)
|
||||
|
||||
if not reachable:
|
||||
sup = db.check_suppressed(suppressions, 'host', name)
|
||||
event_id, is_new, consec = db.upsert_event(
|
||||
event_id, is_new, consec, ticket_id = db.upsert_event(
|
||||
'host_unreachable', 'critical', 'ping',
|
||||
name, ip,
|
||||
f'Host {name} ({ip}) unreachable via ping ({_now_utc()})',
|
||||
)
|
||||
if not sup and consec >= self.fail_thresh:
|
||||
if not sup and consec >= self.fail_thresh and not ticket_id:
|
||||
self._ticket_unreachable(event_id, name, ip, consec)
|
||||
else:
|
||||
db.resolve_event('host_unreachable', name, ip)
|
||||
@@ -882,6 +882,7 @@ class NetworkMonitor:
|
||||
def _collect_snapshot(
|
||||
self, iface_states: Dict[str, Dict[str, bool]],
|
||||
unifi_devices: Optional[List[dict]] = None,
|
||||
ping_states: Optional[Dict[str, bool]] = None,
|
||||
) -> dict:
|
||||
# Accept pre-fetched devices; fall back to empty list if unavailable
|
||||
display_unifi = unifi_devices if unifi_devices is not None else []
|
||||
@@ -910,7 +911,7 @@ class NetworkMonitor:
|
||||
|
||||
for h in self.cfg.get('monitor', {}).get('ping_hosts', []):
|
||||
name, ip = h['name'], h['ip']
|
||||
reachable = self.pulse.ping(ip, count=1, timeout=2)
|
||||
reachable = (ping_states or {}).get(name, False)
|
||||
hosts[name] = {
|
||||
'ip': ip,
|
||||
'interfaces': {},
|
||||
@@ -921,7 +922,7 @@ class NetworkMonitor:
|
||||
return {
|
||||
'hosts': hosts,
|
||||
'unifi': display_unifi,
|
||||
'updated': datetime.utcnow().isoformat() + 'Z',
|
||||
'updated': datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z'),
|
||||
}
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
@@ -942,8 +943,14 @@ class NetworkMonitor:
|
||||
# 2. Fetch UniFi devices once — used by both snapshot and alert processing
|
||||
unifi_devices = self.unifi.get_devices()
|
||||
|
||||
# 3. Collect and store snapshot for dashboard
|
||||
snapshot = self._collect_snapshot(iface_states, unifi_devices)
|
||||
# 3a. Ping-only hosts once — shared by snapshot and alert processing
|
||||
ping_states: Dict[str, bool] = {
|
||||
h['name']: self.pulse.ping(h['ip'])
|
||||
for h in self.cfg.get('monitor', {}).get('ping_hosts', [])
|
||||
}
|
||||
|
||||
# 3b. Collect and store snapshot for dashboard
|
||||
snapshot = self._collect_snapshot(iface_states, unifi_devices, ping_states)
|
||||
db.set_state('network_snapshot', snapshot)
|
||||
db.set_state('last_check', _now_utc())
|
||||
|
||||
@@ -959,7 +966,7 @@ class NetworkMonitor:
|
||||
self._process_interfaces(iface_states, suppressions)
|
||||
self._process_unifi(unifi_devices, suppressions)
|
||||
|
||||
self._process_ping_hosts(suppressions)
|
||||
self._process_ping_hosts(suppressions, ping_states)
|
||||
|
||||
# Housekeeping: deactivate expired suppressions and purge old resolved events
|
||||
db.cleanup_expired_suppressions()
|
||||
|
||||
@@ -473,7 +473,7 @@ async function loadInspector() {
|
||||
}
|
||||
|
||||
loadInspector();
|
||||
const _inspInterval = (window.gandalfSettings && window.gandalfSettings.refreshInterval) || 60;
|
||||
const _inspInterval = window.gandalfSettings?.refreshInterval ?? 60;
|
||||
if (_inspInterval > 0) lt.autoRefresh.start(loadInspector, Math.max(_inspInterval, 15) * 1000);
|
||||
|
||||
window.onGandalfSettingsChanged = function(s) {
|
||||
|
||||
Reference in New Issue
Block a user