Guard ticket creation against duplicates using event's existing ticket_id
Lint / Python (flake8) (push) Successful in 41s
Lint / JS (eslint) (push) Successful in 7s
Security / Python Security (bandit) (push) Successful in 40s
Test / Python Tests (pytest) (push) Successful in 1m18s
Lint / Notify on failure (push) Has been skipped
Lint / Deploy (push) Successful in 4s

upsert_event now returns ticket_id as the 4th element of its result tuple so
callers can skip ticket creation when a ticket already exists. This avoids
calling the ticket API on every poll cycle for ongoing issues, while still
retrying if the previous creation attempt failed (ticket_id stays NULL until
creation succeeds).

Cluster events use (is_new or not ticket_id) so they too get retried
on failure rather than relying solely on is_new.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-14 11:09:50 -04:00
parent 0975dd007a
commit 9c5a88fbce
2 changed files with 12 additions and 12 deletions
+4 -4
View File
@@ -114,12 +114,12 @@ def upsert_event(
target_detail: str,
description: str,
) -> tuple:
"""Insert or update a network event. Returns (id, is_new, consecutive_failures)."""
"""Insert or update a network event. Returns (id, is_new, consecutive_failures, ticket_id)."""
detail = target_detail or ''
with get_conn() as conn:
with conn.cursor() as cur:
cur.execute(
"""SELECT id, consecutive_failures FROM network_events
"""SELECT id, consecutive_failures, ticket_id FROM network_events
WHERE event_type=%s AND target_name=%s AND target_detail=%s
AND resolved_at IS NULL LIMIT 1""",
(event_type, target_name, detail),
@@ -134,7 +134,7 @@ def upsert_event(
WHERE id=%s""",
(new_count, description, existing['id']),
)
return existing['id'], False, new_count
return existing['id'], False, new_count, existing.get('ticket_id')
else:
cur.execute(
"""INSERT INTO network_events
@@ -142,7 +142,7 @@ def upsert_event(
VALUES (%s, %s, %s, %s, %s, %s)""",
(event_type, severity, source_type, target_name, detail, description),
)
return cur.lastrowid, True, 1
return cur.lastrowid, True, 1, None
def resolve_event(event_type: str, target_name: str, target_detail: str = '') -> None:
+8 -8
View File
@@ -728,12 +728,12 @@ class NetworkMonitor:
db.check_suppressed(suppressions, 'interface', host, iface) or
db.check_suppressed(suppressions, 'host', host)
)
event_id, is_new, consec = db.upsert_event(
event_id, is_new, consec, ticket_id = db.upsert_event(
'interface_down', 'critical', 'prometheus',
host, iface,
f'Interface {iface} on {host} went link-down ({_now_utc()})',
)
if not sup and consec >= self.fail_thresh:
if not sup and consec >= self.fail_thresh and not ticket_id:
self._ticket_interface(event_id, host, iface, consec)
if host_has_regression:
@@ -744,13 +744,13 @@ class NetworkMonitor:
# Cluster-wide check: only genuine regressions count
if len(hosts_with_regression) >= self.cluster_thresh:
sup = db.check_suppressed(suppressions, 'all', '')
event_id, is_new, consec = db.upsert_event(
event_id, is_new, consec, ticket_id = db.upsert_event(
'cluster_network_issue', 'critical', 'prometheus',
self.cluster_name, '',
f'{len(hosts_with_regression)} hosts reporting simultaneous interface failures: '
f'{", ".join(hosts_with_regression)}',
)
if not sup and is_new:
if not sup and (is_new or not ticket_id):
title = (
f'[{self.cluster_name}][auto][production][issue][network][cluster-wide] '
f'Multiple hosts reporting interface failures'
@@ -804,12 +804,12 @@ class NetworkMonitor:
name = d['name']
if not d['connected']:
sup = db.check_suppressed(suppressions, 'unifi_device', name)
event_id, is_new, consec = db.upsert_event(
event_id, is_new, consec, ticket_id = db.upsert_event(
'unifi_device_offline', 'critical', 'unifi',
name, d.get('type', ''),
f'UniFi {name} ({d.get("ip","")}) offline ({_now_utc()})',
)
if not sup and consec >= self.fail_thresh:
if not sup and consec >= self.fail_thresh and not ticket_id:
self._ticket_unifi(event_id, d)
else:
db.resolve_event('unifi_device_offline', name, d.get('type', ''))
@@ -844,12 +844,12 @@ class NetworkMonitor:
if not reachable:
sup = db.check_suppressed(suppressions, 'host', name)
event_id, is_new, consec = db.upsert_event(
event_id, is_new, consec, ticket_id = db.upsert_event(
'host_unreachable', 'critical', 'ping',
name, ip,
f'Host {name} ({ip}) unreachable via ping ({_now_utc()})',
)
if not sup and consec >= self.fail_thresh:
if not sup and consec >= self.fail_thresh and not ticket_id:
self._ticket_unreachable(event_id, name, ip, consec)
else:
db.resolve_event('host_unreachable', name, ip)