Guard ticket creation against duplicates using event's existing ticket_id

upsert_event now returns ticket_id as the 4th tuple element so callers can skip ticket creation when a ticket already exists for the event. This avoids calling the ticket API on every poll cycle for ongoing issues, while still retrying when the previous creation attempt failed (ticket_id stays NULL until creation succeeds). Cluster events use (is_new or not ticket_id), so they are also retried on failure instead of relying solely on is_new.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Fix misleading docstring on _purge_old_jobs_loop
2026-05-14 11:09:50 -04:00 · 2026-05-14 11:06:28 -04:00 · 2026-05-13 23:13:43 -04:00 · 2026-05-13 15:34:41 -04:00
3 changed files with 32 additions and 25 deletions
@@ -64,7 +64,7 @@ _diag_rate: dict = {}


 def _purge_old_jobs_loop():
-    """Background thread: remove stale diag jobs and run daily event purge."""
+    """Background thread: remove stale diagnostic jobs and mark stuck ones done."""
    while True:
        time.sleep(120)
        cutoff = time.time() - 600
@@ -3,7 +3,7 @@ import json
 import logging
 import threading
 from contextlib import contextmanager
-from datetime import datetime, timedelta
+from datetime import datetime, timedelta, timezone
 from typing import Optional

 import pymysql
@@ -114,12 +114,12 @@ def upsert_event(
    target_detail: str,
    description: str,
 ) -> tuple:
-    """Insert or update a network event. Returns (id, is_new, consecutive_failures)."""
+    """Insert or update a network event. Returns (id, is_new, consecutive_failures, ticket_id)."""
    detail = target_detail or ''
    with get_conn() as conn:
        with conn.cursor() as cur:
            cur.execute(
-                """SELECT id, consecutive_failures FROM network_events
+                """SELECT id, consecutive_failures, ticket_id FROM network_events
                   WHERE event_type=%s AND target_name=%s AND target_detail=%s
                   AND resolved_at IS NULL LIMIT 1""",
                (event_type, target_name, detail),
@@ -134,7 +134,7 @@ def upsert_event(
                       WHERE id=%s""",
                    (new_count, description, existing['id']),
                )
-                return existing['id'], False, new_count
+                return existing['id'], False, new_count, existing.get('ticket_id')
            else:
                cur.execute(
                    """INSERT INTO network_events
@@ -142,7 +142,7 @@ def upsert_event(
                       VALUES (%s, %s, %s, %s, %s, %s)""",
                    (event_type, severity, source_type, target_name, detail, description),
                )
-                return cur.lastrowid, True, 1
+                return cur.lastrowid, True, 1, None


 def resolve_event(event_type: str, target_name: str, target_detail: str = '') -> None:
@@ -281,7 +281,7 @@ def create_suppression(
 ) -> int:
    expires_at = None
    if expires_minutes:
-        expires_at = datetime.utcnow() + timedelta(minutes=int(expires_minutes))
+        expires_at = datetime.now(timezone.utc) + timedelta(minutes=int(expires_minutes))
    with get_conn() as conn:
        with conn.cursor() as cur:
            cur.execute(
@@ -12,7 +12,7 @@ import logging
 import re
 import shlex
 import time
-from datetime import datetime
+from datetime import datetime, timezone
 from typing import Dict, List, Optional

 import requests
@@ -618,7 +618,7 @@ class LinkStatsCollector:
        return {
            'hosts':          result_hosts,
            'unifi_switches': unifi_switches,
-            'updated':        datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S UTC'),
+            'updated':        datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC'),
        }

    def _compute_unifi_rates(self, raw: Dict[str, dict], now: float) -> Dict[str, dict]:
@@ -653,7 +653,7 @@ class LinkStatsCollector:
 # Helpers
 # --------------------------------------------------------------------------
 def _now_utc() -> str:
-    return datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S UTC')
+    return datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')


 # --------------------------------------------------------------------------
@@ -728,12 +728,12 @@ class NetworkMonitor:
                            db.check_suppressed(suppressions, 'interface', host, iface) or
                            db.check_suppressed(suppressions, 'host', host)
                        )
-                        event_id, is_new, consec = db.upsert_event(
+                        event_id, is_new, consec, ticket_id = db.upsert_event(
                            'interface_down', 'critical', 'prometheus',
                            host, iface,
                            f'Interface {iface} on {host} went link-down ({_now_utc()})',
                        )
-                        if not sup and consec >= self.fail_thresh:
+                        if not sup and consec >= self.fail_thresh and not ticket_id:
                            self._ticket_interface(event_id, host, iface, consec)

            if host_has_regression:
@@ -744,13 +744,13 @@ class NetworkMonitor:
        # Cluster-wide check – only genuine regressions count
        if len(hosts_with_regression) >= self.cluster_thresh:
            sup = db.check_suppressed(suppressions, 'all', '')
-            event_id, is_new, consec = db.upsert_event(
+            event_id, is_new, consec, ticket_id = db.upsert_event(
                'cluster_network_issue', 'critical', 'prometheus',
                self.cluster_name, '',
                f'{len(hosts_with_regression)} hosts reporting simultaneous interface failures: '
                f'{", ".join(hosts_with_regression)}',
            )
-            if not sup and is_new:
+            if not sup and (is_new or not ticket_id):
                title = (
                    f'[{self.cluster_name}][auto][production][issue][network][cluster-wide] '
                    f'Multiple hosts reporting interface failures'
@@ -804,12 +804,12 @@ class NetworkMonitor:
            name = d['name']
            if not d['connected']:
                sup = db.check_suppressed(suppressions, 'unifi_device', name)
-                event_id, is_new, consec = db.upsert_event(
+                event_id, is_new, consec, ticket_id = db.upsert_event(
                    'unifi_device_offline', 'critical', 'unifi',
                    name, d.get('type', ''),
                    f'UniFi {name} ({d.get("ip","")}) offline ({_now_utc()})',
                )
-                if not sup and consec >= self.fail_thresh:
+                if not sup and consec >= self.fail_thresh and not ticket_id:
                    self._ticket_unifi(event_id, d)
            else:
                db.resolve_event('unifi_device_offline', name, d.get('type', ''))
@@ -837,19 +837,19 @@ class NetworkMonitor:
    # ------------------------------------------------------------------
    # Ping-only hosts (no node_exporter)
    # ------------------------------------------------------------------
-    def _process_ping_hosts(self, suppressions: list) -> None:
+    def _process_ping_hosts(self, suppressions: list, ping_states: Dict[str, bool]) -> None:
        for h in self.cfg.get('monitor', {}).get('ping_hosts', []):
            name, ip = h['name'], h['ip']
-            reachable = self.pulse.ping(ip)
+            reachable = ping_states.get(name, False)

            if not reachable:
                sup = db.check_suppressed(suppressions, 'host', name)
-                event_id, is_new, consec = db.upsert_event(
+                event_id, is_new, consec, ticket_id = db.upsert_event(
                    'host_unreachable', 'critical', 'ping',
                    name, ip,
                    f'Host {name} ({ip}) unreachable via ping ({_now_utc()})',
                )
-                if not sup and consec >= self.fail_thresh:
+                if not sup and consec >= self.fail_thresh and not ticket_id:
                    self._ticket_unreachable(event_id, name, ip, consec)
            else:
                db.resolve_event('host_unreachable', name, ip)
@@ -882,6 +882,7 @@ class NetworkMonitor:
    def _collect_snapshot(
        self, iface_states: Dict[str, Dict[str, bool]],
        unifi_devices: Optional[List[dict]] = None,
+        ping_states: Optional[Dict[str, bool]] = None,
    ) -> dict:
        # Accept pre-fetched devices; fall back to empty list if unavailable
        display_unifi = unifi_devices if unifi_devices is not None else []
@@ -910,7 +911,7 @@ class NetworkMonitor:

        for h in self.cfg.get('monitor', {}).get('ping_hosts', []):
            name, ip = h['name'], h['ip']
-            reachable = self.pulse.ping(ip, count=1, timeout=2)
+            reachable = (ping_states or {}).get(name, False)
            hosts[name] = {
                'ip': ip,
                'interfaces': {},
@@ -921,7 +922,7 @@ class NetworkMonitor:
        return {
            'hosts': hosts,
            'unifi': display_unifi,
-            'updated': datetime.utcnow().isoformat() + 'Z',
+            'updated': datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z'),
        }

    # ------------------------------------------------------------------
@@ -942,8 +943,14 @@ class NetworkMonitor:
                # 2. Fetch UniFi devices once — used by both snapshot and alert processing
                unifi_devices = self.unifi.get_devices()

-                # 3. Collect and store snapshot for dashboard
-                snapshot = self._collect_snapshot(iface_states, unifi_devices)
+                # 3a. Ping-only hosts once — shared by snapshot and alert processing
+                ping_states: Dict[str, bool] = {
+                    h['name']: self.pulse.ping(h['ip'])
+                    for h in self.cfg.get('monitor', {}).get('ping_hosts', [])
+                }
+
+                # 3b. Collect and store snapshot for dashboard
+                snapshot = self._collect_snapshot(iface_states, unifi_devices, ping_states)
                db.set_state('network_snapshot', snapshot)
                db.set_state('last_check', _now_utc())

@@ -959,7 +966,7 @@ class NetworkMonitor:
                self._process_interfaces(iface_states, suppressions)
                self._process_unifi(unifi_devices, suppressions)

-                self._process_ping_hosts(suppressions)
+                self._process_ping_hosts(suppressions, ping_states)

                # Housekeeping: deactivate expired suppressions and purge old resolved events
                db.cleanup_expired_suppressions()
Author	SHA1	Message	Date
jared	9c5a88fbce	Guard ticket creation against duplicates using event's existing ticket_id Lint / Python (flake8) (push) Successful in 41s Details Lint / JS (eslint) (push) Successful in 7s Details Security / Python Security (bandit) (push) Successful in 40s Details Test / Python Tests (pytest) (push) Successful in 1m18s Details Lint / Notify on failure (push) Has been skipped Details Lint / Deploy (push) Successful in 4s Details upsert_event now returns ticket_id (4th element) so callers can skip ticket creation when one already exists. This prevents calling the ticket API every poll cycle for ongoing issues while still retrying if the previous creation attempt failed (ticket_id stays NULL until success). Cluster events use (is_new or not ticket_id) so they too get retried on failure rather than relying solely on is_new. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-14 11:09:50 -04:00
jared	0975dd007a	Fix misleading docstring on _purge_old_jobs_loop Lint / Python (flake8) (push) Successful in 42s Details Lint / JS (eslint) (push) Successful in 7s Details Security / Python Security (bandit) (push) Successful in 41s Details Test / Python Tests (pytest) (push) Successful in 52s Details Lint / Notify on failure (push) Has been skipped Details Lint / Deploy (push) Successful in 3s Details The comment claimed the function "runs daily event purge" — that housekeeping is done by monitor.py's main loop, not here. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-14 11:06:28 -04:00
jared	a34898b8e8	Fix ping-only hosts polled twice per cycle with inconsistent parameters Lint / Python (flake8) (push) Successful in 57s Details Lint / JS (eslint) (push) Successful in 28s Details Security / Python Security (bandit) (push) Successful in 1m14s Details Lint / Notify on failure (push) Has been skipped Details Lint / Deploy (push) Successful in 7s Details Test / Python Tests (pytest) (push) Failing after 13m52s Details _collect_snapshot called pulse.ping(count=1) independently from _process_ping_hosts which called pulse.ping(count=3). This doubled network load and could show a host as 'up' in the dashboard while simultaneously firing an 'unreachable' alert, or vice versa. Now ping_states is computed once in run() using the alert-quality parameters (count=3) and shared by both snapshot and alert processing. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-13 23:13:43 -04:00
jared	31747c4bd3	Replace deprecated datetime.utcnow() with datetime.now(timezone.utc) Lint / Python (flake8) (push) Successful in 1m9s Details Lint / JS (eslint) (push) Successful in 11s Details Security / Python Security (bandit) (push) Successful in 44s Details Test / Python Tests (pytest) (push) Successful in 58s Details Lint / Notify on failure (push) Has been skipped Details Lint / Deploy (push) Successful in 3s Details datetime.utcnow() is deprecated as of Python 3.12 and scheduled for removal in a future release (it still exists in 3.13 but emits a DeprecationWarning). Replace all four call sites with timezone-aware equivalents so the codebase runs warning-free on Python 3.12+. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>	2026-05-13 15:34:41 -04:00