Compare commits

..

4 Commits

Author SHA1 Message Date
jared 9c5a88fbce Guard ticket creation against duplicates using event's existing ticket_id
Lint / Python (flake8) (push) Successful in 41s
Lint / JS (eslint) (push) Successful in 7s
Security / Python Security (bandit) (push) Successful in 40s
Test / Python Tests (pytest) (push) Successful in 1m18s
Lint / Notify on failure (push) Has been skipped
Lint / Deploy (push) Successful in 4s
upsert_event now returns ticket_id (4th element) so callers can skip
ticket creation when one already exists. This prevents calling the ticket
API every poll cycle for ongoing issues while still retrying if the
previous creation attempt failed (ticket_id stays NULL until success).

Cluster events use (is_new or not ticket_id) so they too get retried
on failure rather than relying solely on is_new.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-14 11:09:50 -04:00
jared 0975dd007a Fix misleading docstring on _purge_old_jobs_loop
Lint / Python (flake8) (push) Successful in 42s
Lint / JS (eslint) (push) Successful in 7s
Security / Python Security (bandit) (push) Successful in 41s
Test / Python Tests (pytest) (push) Successful in 52s
Lint / Notify on failure (push) Has been skipped
Lint / Deploy (push) Successful in 3s
The docstring claimed the function would "run daily event purge" — that
housekeeping is done by monitor.py's main loop, not here.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-14 11:06:28 -04:00
jared a34898b8e8 Fix ping-only hosts polled twice per cycle with inconsistent parameters
Lint / Python (flake8) (push) Successful in 57s
Lint / JS (eslint) (push) Successful in 28s
Security / Python Security (bandit) (push) Successful in 1m14s
Lint / Notify on failure (push) Has been skipped
Lint / Deploy (push) Successful in 7s
Test / Python Tests (pytest) (push) Failing after 13m52s
_collect_snapshot called pulse.ping(count=1) independently from
_process_ping_hosts which called pulse.ping(count=3). This doubled
network load and could show a host as 'up' in the dashboard while
simultaneously firing an 'unreachable' alert, or vice versa.

Now ping_states is computed once in run() using the alert-quality
parameters (count=3) and shared by both snapshot and alert processing.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-13 23:13:43 -04:00
jared 31747c4bd3 Replace deprecated datetime.utcnow() with datetime.now(timezone.utc)
Lint / Python (flake8) (push) Successful in 1m9s
Lint / JS (eslint) (push) Successful in 11s
Security / Python Security (bandit) (push) Successful in 44s
Test / Python Tests (pytest) (push) Successful in 58s
Lint / Notify on failure (push) Has been skipped
Lint / Deploy (push) Successful in 3s
datetime.utcnow() is deprecated as of Python 3.12 and scheduled for removal in a future release.
Replace all four call sites with timezone-aware equivalents so the
codebase is ready for Python 3.12+.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-13 15:34:41 -04:00
3 changed files with 32 additions and 25 deletions
+1 -1
View File
@@ -64,7 +64,7 @@ _diag_rate: dict = {}
def _purge_old_jobs_loop():
"""Background thread: remove stale diag jobs and run daily event purge."""
"""Background thread: remove stale diagnostic jobs and mark stuck ones done."""
while True:
time.sleep(120)
cutoff = time.time() - 600
+6 -6
View File
@@ -3,7 +3,7 @@ import json
import logging
import threading
from contextlib import contextmanager
from datetime import datetime, timedelta
from datetime import datetime, timedelta, timezone
from typing import Optional
import pymysql
@@ -114,12 +114,12 @@ def upsert_event(
target_detail: str,
description: str,
) -> tuple:
"""Insert or update a network event. Returns (id, is_new, consecutive_failures)."""
"""Insert or update a network event. Returns (id, is_new, consecutive_failures, ticket_id)."""
detail = target_detail or ''
with get_conn() as conn:
with conn.cursor() as cur:
cur.execute(
"""SELECT id, consecutive_failures FROM network_events
"""SELECT id, consecutive_failures, ticket_id FROM network_events
WHERE event_type=%s AND target_name=%s AND target_detail=%s
AND resolved_at IS NULL LIMIT 1""",
(event_type, target_name, detail),
@@ -134,7 +134,7 @@ def upsert_event(
WHERE id=%s""",
(new_count, description, existing['id']),
)
return existing['id'], False, new_count
return existing['id'], False, new_count, existing.get('ticket_id')
else:
cur.execute(
"""INSERT INTO network_events
@@ -142,7 +142,7 @@ def upsert_event(
VALUES (%s, %s, %s, %s, %s, %s)""",
(event_type, severity, source_type, target_name, detail, description),
)
return cur.lastrowid, True, 1
return cur.lastrowid, True, 1, None
def resolve_event(event_type: str, target_name: str, target_detail: str = '') -> None:
@@ -281,7 +281,7 @@ def create_suppression(
) -> int:
expires_at = None
if expires_minutes:
expires_at = datetime.utcnow() + timedelta(minutes=int(expires_minutes))
expires_at = datetime.now(timezone.utc) + timedelta(minutes=int(expires_minutes))
with get_conn() as conn:
with conn.cursor() as cur:
cur.execute(
+25 -18
View File
@@ -12,7 +12,7 @@ import logging
import re
import shlex
import time
from datetime import datetime
from datetime import datetime, timezone
from typing import Dict, List, Optional
import requests
@@ -618,7 +618,7 @@ class LinkStatsCollector:
return {
'hosts': result_hosts,
'unifi_switches': unifi_switches,
'updated': datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S UTC'),
'updated': datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC'),
}
def _compute_unifi_rates(self, raw: Dict[str, dict], now: float) -> Dict[str, dict]:
@@ -653,7 +653,7 @@ class LinkStatsCollector:
# Helpers
# --------------------------------------------------------------------------
def _now_utc() -> str:
return datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S UTC')
return datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')
# --------------------------------------------------------------------------
@@ -728,12 +728,12 @@ class NetworkMonitor:
db.check_suppressed(suppressions, 'interface', host, iface) or
db.check_suppressed(suppressions, 'host', host)
)
event_id, is_new, consec = db.upsert_event(
event_id, is_new, consec, ticket_id = db.upsert_event(
'interface_down', 'critical', 'prometheus',
host, iface,
f'Interface {iface} on {host} went link-down ({_now_utc()})',
)
if not sup and consec >= self.fail_thresh:
if not sup and consec >= self.fail_thresh and not ticket_id:
self._ticket_interface(event_id, host, iface, consec)
if host_has_regression:
@@ -744,13 +744,13 @@ class NetworkMonitor:
# Cluster-wide check — only genuine regressions count
if len(hosts_with_regression) >= self.cluster_thresh:
sup = db.check_suppressed(suppressions, 'all', '')
event_id, is_new, consec = db.upsert_event(
event_id, is_new, consec, ticket_id = db.upsert_event(
'cluster_network_issue', 'critical', 'prometheus',
self.cluster_name, '',
f'{len(hosts_with_regression)} hosts reporting simultaneous interface failures: '
f'{", ".join(hosts_with_regression)}',
)
if not sup and is_new:
if not sup and (is_new or not ticket_id):
title = (
f'[{self.cluster_name}][auto][production][issue][network][cluster-wide] '
f'Multiple hosts reporting interface failures'
@@ -804,12 +804,12 @@ class NetworkMonitor:
name = d['name']
if not d['connected']:
sup = db.check_suppressed(suppressions, 'unifi_device', name)
event_id, is_new, consec = db.upsert_event(
event_id, is_new, consec, ticket_id = db.upsert_event(
'unifi_device_offline', 'critical', 'unifi',
name, d.get('type', ''),
f'UniFi {name} ({d.get("ip","")}) offline ({_now_utc()})',
)
if not sup and consec >= self.fail_thresh:
if not sup and consec >= self.fail_thresh and not ticket_id:
self._ticket_unifi(event_id, d)
else:
db.resolve_event('unifi_device_offline', name, d.get('type', ''))
@@ -837,19 +837,19 @@ class NetworkMonitor:
# ------------------------------------------------------------------
# Ping-only hosts (no node_exporter)
# ------------------------------------------------------------------
def _process_ping_hosts(self, suppressions: list) -> None:
def _process_ping_hosts(self, suppressions: list, ping_states: Dict[str, bool]) -> None:
for h in self.cfg.get('monitor', {}).get('ping_hosts', []):
name, ip = h['name'], h['ip']
reachable = self.pulse.ping(ip)
reachable = ping_states.get(name, False)
if not reachable:
sup = db.check_suppressed(suppressions, 'host', name)
event_id, is_new, consec = db.upsert_event(
event_id, is_new, consec, ticket_id = db.upsert_event(
'host_unreachable', 'critical', 'ping',
name, ip,
f'Host {name} ({ip}) unreachable via ping ({_now_utc()})',
)
if not sup and consec >= self.fail_thresh:
if not sup and consec >= self.fail_thresh and not ticket_id:
self._ticket_unreachable(event_id, name, ip, consec)
else:
db.resolve_event('host_unreachable', name, ip)
@@ -882,6 +882,7 @@ class NetworkMonitor:
def _collect_snapshot(
self, iface_states: Dict[str, Dict[str, bool]],
unifi_devices: Optional[List[dict]] = None,
ping_states: Optional[Dict[str, bool]] = None,
) -> dict:
# Accept pre-fetched devices; fall back to empty list if unavailable
display_unifi = unifi_devices if unifi_devices is not None else []
@@ -910,7 +911,7 @@ class NetworkMonitor:
for h in self.cfg.get('monitor', {}).get('ping_hosts', []):
name, ip = h['name'], h['ip']
reachable = self.pulse.ping(ip, count=1, timeout=2)
reachable = (ping_states or {}).get(name, False)
hosts[name] = {
'ip': ip,
'interfaces': {},
@@ -921,7 +922,7 @@ class NetworkMonitor:
return {
'hosts': hosts,
'unifi': display_unifi,
'updated': datetime.utcnow().isoformat() + 'Z',
'updated': datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z'),
}
# ------------------------------------------------------------------
@@ -942,8 +943,14 @@ class NetworkMonitor:
# 2. Fetch UniFi devices once — used by both snapshot and alert processing
unifi_devices = self.unifi.get_devices()
# 3. Collect and store snapshot for dashboard
snapshot = self._collect_snapshot(iface_states, unifi_devices)
# 3a. Ping-only hosts once — shared by snapshot and alert processing
ping_states: Dict[str, bool] = {
h['name']: self.pulse.ping(h['ip'])
for h in self.cfg.get('monitor', {}).get('ping_hosts', [])
}
# 3b. Collect and store snapshot for dashboard
snapshot = self._collect_snapshot(iface_states, unifi_devices, ping_states)
db.set_state('network_snapshot', snapshot)
db.set_state('last_check', _now_utc())
@@ -959,7 +966,7 @@ class NetworkMonitor:
self._process_interfaces(iface_states, suppressions)
self._process_unifi(unifi_devices, suppressions)
self._process_ping_hosts(suppressions)
self._process_ping_hosts(suppressions, ping_states)
# Housekeeping: deactivate expired suppressions and purge old resolved events
db.cleanup_expired_suppressions()