Guard ticket creation against duplicates using event's existing ticket_id

upsert_event now returns ticket_id as the fourth element of its result, so callers can skip ticket creation when a ticket already exists for the event. This avoids calling the ticket API on every poll cycle for an ongoing issue, while still retrying when the previous creation attempt failed (ticket_id stays NULL until creation succeeds). Cluster events use (is_new or not ticket_id), so they are also retried after a failed attempt rather than relying solely on is_new.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-14 11:09:50 -04:00
parent 0975dd007a
commit 9c5a88fbce
2 changed files with 12 additions and 12 deletions
@@ -728,12 +728,12 @@ class NetworkMonitor:
                            db.check_suppressed(suppressions, 'interface', host, iface) or
                            db.check_suppressed(suppressions, 'host', host)
                        )
-                        event_id, is_new, consec = db.upsert_event(
+                        event_id, is_new, consec, ticket_id = db.upsert_event(
                            'interface_down', 'critical', 'prometheus',
                            host, iface,
                            f'Interface {iface} on {host} went link-down ({_now_utc()})',
                        )
-                        if not sup and consec >= self.fail_thresh:
+                        if not sup and consec >= self.fail_thresh and not ticket_id:
                            self._ticket_interface(event_id, host, iface, consec)

            if host_has_regression:
@@ -744,13 +744,13 @@ class NetworkMonitor:
        # Cluster-wide check – only genuine regressions count
        if len(hosts_with_regression) >= self.cluster_thresh:
            sup = db.check_suppressed(suppressions, 'all', '')
-            event_id, is_new, consec = db.upsert_event(
+            event_id, is_new, consec, ticket_id = db.upsert_event(
                'cluster_network_issue', 'critical', 'prometheus',
                self.cluster_name, '',
                f'{len(hosts_with_regression)} hosts reporting simultaneous interface failures: '
                f'{", ".join(hosts_with_regression)}',
            )
-            if not sup and is_new:
+            if not sup and (is_new or not ticket_id):
                title = (
                    f'[{self.cluster_name}][auto][production][issue][network][cluster-wide] '
                    f'Multiple hosts reporting interface failures'
@@ -804,12 +804,12 @@ class NetworkMonitor:
            name = d['name']
            if not d['connected']:
                sup = db.check_suppressed(suppressions, 'unifi_device', name)
-                event_id, is_new, consec = db.upsert_event(
+                event_id, is_new, consec, ticket_id = db.upsert_event(
                    'unifi_device_offline', 'critical', 'unifi',
                    name, d.get('type', ''),
                    f'UniFi {name} ({d.get("ip","")}) offline ({_now_utc()})',
                )
-                if not sup and consec >= self.fail_thresh:
+                if not sup and consec >= self.fail_thresh and not ticket_id:
                    self._ticket_unifi(event_id, d)
            else:
                db.resolve_event('unifi_device_offline', name, d.get('type', ''))
@@ -844,12 +844,12 @@ class NetworkMonitor:

            if not reachable:
                sup = db.check_suppressed(suppressions, 'host', name)
-                event_id, is_new, consec = db.upsert_event(
+                event_id, is_new, consec, ticket_id = db.upsert_event(
                    'host_unreachable', 'critical', 'ping',
                    name, ip,
                    f'Host {name} ({ip}) unreachable via ping ({_now_utc()})',
                )
-                if not sup and consec >= self.fail_thresh:
+                if not sup and consec >= self.fail_thresh and not ticket_id:
                    self._ticket_unreachable(event_id, name, ip, consec)
            else:
                db.resolve_event('host_unreachable', name, ip)