From 9c5a88fbcea79c01b4b8141ed8c46c9dd908c2bc Mon Sep 17 00:00:00 2001 From: Jared Vititoe Date: Thu, 14 May 2026 11:09:50 -0400 Subject: [PATCH] Guard ticket creation against duplicates using event's existing ticket_id upsert_event now returns ticket_id (4th element) so callers can skip ticket creation when one already exists. This prevents calling the ticket API every poll cycle for ongoing issues while still retrying if the previous creation attempt failed (ticket_id stays NULL until success). Cluster events use (is_new or not ticket_id) so they too get retried on failure rather than relying solely on is_new. Co-Authored-By: Claude Sonnet 4.6 --- db.py | 8 ++++---- monitor.py | 16 ++++++++-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/db.py b/db.py index 7760154..08001ce 100644 --- a/db.py +++ b/db.py @@ -114,12 +114,12 @@ def upsert_event( target_detail: str, description: str, ) -> tuple: - """Insert or update a network event. Returns (id, is_new, consecutive_failures).""" + """Insert or update a network event. Returns (id, is_new, consecutive_failures, ticket_id).""" detail = target_detail or '' with get_conn() as conn: with conn.cursor() as cur: cur.execute( - """SELECT id, consecutive_failures FROM network_events + """SELECT id, consecutive_failures, ticket_id FROM network_events WHERE event_type=%s AND target_name=%s AND target_detail=%s AND resolved_at IS NULL LIMIT 1""", (event_type, target_name, detail), @@ -134,7 +134,7 @@ def upsert_event( WHERE id=%s""", (new_count, description, existing['id']), ) - return existing['id'], False, new_count + return existing['id'], False, new_count, existing.get('ticket_id') else: cur.execute( """INSERT INTO network_events @@ -142,7 +142,7 @@ def upsert_event( VALUES (%s, %s, %s, %s, %s, %s)""", (event_type, severity, source_type, target_name, detail, description), ) - return cur.lastrowid, True, 1 + return cur.lastrowid, True, 1, None def resolve_event(event_type: str, target_name: str, target_detail: str = '') -> None: diff --git a/monitor.py b/monitor.py index 811c5aa..7080789 100644 --- a/monitor.py +++ b/monitor.py @@ -728,12 +728,12 @@ class NetworkMonitor: db.check_suppressed(suppressions, 'interface', host, iface) or db.check_suppressed(suppressions, 'host', host) ) - event_id, is_new, consec = db.upsert_event( + event_id, is_new, consec, ticket_id = db.upsert_event( 'interface_down', 'critical', 'prometheus', host, iface, f'Interface {iface} on {host} went link-down ({_now_utc()})', ) - if not sup and consec >= self.fail_thresh: + if not sup and consec >= self.fail_thresh and not ticket_id: self._ticket_interface(event_id, host, iface, consec) if host_has_regression: @@ -744,13 +744,13 @@ class NetworkMonitor: # Cluster-wide check – only genuine regressions count if len(hosts_with_regression) >= self.cluster_thresh: sup = db.check_suppressed(suppressions, 'all', '') - event_id, is_new, consec = db.upsert_event( + event_id, is_new, consec, ticket_id = db.upsert_event( 'cluster_network_issue', 'critical', 'prometheus', self.cluster_name, '', f'{len(hosts_with_regression)} hosts reporting simultaneous interface failures: ' f'{", ".join(hosts_with_regression)}', ) - if not sup and is_new: + if not sup and (is_new or not ticket_id): title = ( f'[{self.cluster_name}][auto][production][issue][network][cluster-wide] ' f'Multiple hosts reporting interface failures' @@ -804,12 +804,12 @@ class NetworkMonitor: name = d['name'] if not d['connected']: sup = db.check_suppressed(suppressions, 'unifi_device', name) - event_id, is_new, consec = db.upsert_event( + event_id, is_new, consec, ticket_id = db.upsert_event( 'unifi_device_offline', 'critical', 'unifi', name, d.get('type', ''), f'UniFi {name} ({d.get("ip","")}) offline ({_now_utc()})', ) - if not sup and consec >= self.fail_thresh: + if not sup and consec >= self.fail_thresh and not ticket_id: self._ticket_unifi(event_id, d) else: db.resolve_event('unifi_device_offline', name, d.get('type', '')) @@ -844,12 +844,12 @@ class NetworkMonitor: if not reachable: sup = db.check_suppressed(suppressions, 'host', name) - event_id, is_new, consec = db.upsert_event( + event_id, is_new, consec, ticket_id = db.upsert_event( 'host_unreachable', 'critical', 'ping', name, ip, f'Host {name} ({ip}) unreachable via ping ({_now_utc()})', ) - if not sup and consec >= self.fail_thresh: + if not sup and consec >= self.fail_thresh and not ticket_id: self._ticket_unreachable(event_id, name, ip, consec) else: db.resolve_event('host_unreachable', name, ip)