Guard ticket creation against duplicates using event's existing ticket_id
Lint / Python (flake8) (push) Successful in 41s
Lint / JS (eslint) (push) Successful in 7s
Security / Python Security (bandit) (push) Successful in 40s
Test / Python Tests (pytest) (push) Successful in 1m18s
Lint / Notify on failure (push) Has been skipped
Lint / Deploy (push) Successful in 4s
Lint / Python (flake8) (push) Successful in 41s
Lint / JS (eslint) (push) Successful in 7s
Security / Python Security (bandit) (push) Successful in 40s
Test / Python Tests (pytest) (push) Successful in 1m18s
Lint / Notify on failure (push) Has been skipped
Lint / Deploy (push) Successful in 4s
`upsert_event` now returns `ticket_id` as the 4th element of its result tuple, so callers can skip ticket creation when a ticket already exists. This avoids calling the ticket API on every poll cycle for ongoing issues, while still retrying if the previous creation attempt failed (`ticket_id` stays NULL until creation succeeds). Cluster events use `(is_new or not ticket_id)` so that they, too, are retried on failure rather than relying solely on `is_new`.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+8
-8
@@ -728,12 +728,12 @@ class NetworkMonitor:
|
||||
db.check_suppressed(suppressions, 'interface', host, iface) or
|
||||
db.check_suppressed(suppressions, 'host', host)
|
||||
)
|
||||
event_id, is_new, consec = db.upsert_event(
|
||||
event_id, is_new, consec, ticket_id = db.upsert_event(
|
||||
'interface_down', 'critical', 'prometheus',
|
||||
host, iface,
|
||||
f'Interface {iface} on {host} went link-down ({_now_utc()})',
|
||||
)
|
||||
if not sup and consec >= self.fail_thresh:
|
||||
if not sup and consec >= self.fail_thresh and not ticket_id:
|
||||
self._ticket_interface(event_id, host, iface, consec)
|
||||
|
||||
if host_has_regression:
|
||||
@@ -744,13 +744,13 @@ class NetworkMonitor:
|
||||
# Cluster-wide check – only genuine regressions count
|
||||
if len(hosts_with_regression) >= self.cluster_thresh:
|
||||
sup = db.check_suppressed(suppressions, 'all', '')
|
||||
event_id, is_new, consec = db.upsert_event(
|
||||
event_id, is_new, consec, ticket_id = db.upsert_event(
|
||||
'cluster_network_issue', 'critical', 'prometheus',
|
||||
self.cluster_name, '',
|
||||
f'{len(hosts_with_regression)} hosts reporting simultaneous interface failures: '
|
||||
f'{", ".join(hosts_with_regression)}',
|
||||
)
|
||||
if not sup and is_new:
|
||||
if not sup and (is_new or not ticket_id):
|
||||
title = (
|
||||
f'[{self.cluster_name}][auto][production][issue][network][cluster-wide] '
|
||||
f'Multiple hosts reporting interface failures'
|
||||
@@ -804,12 +804,12 @@ class NetworkMonitor:
|
||||
name = d['name']
|
||||
if not d['connected']:
|
||||
sup = db.check_suppressed(suppressions, 'unifi_device', name)
|
||||
event_id, is_new, consec = db.upsert_event(
|
||||
event_id, is_new, consec, ticket_id = db.upsert_event(
|
||||
'unifi_device_offline', 'critical', 'unifi',
|
||||
name, d.get('type', ''),
|
||||
f'UniFi {name} ({d.get("ip","")}) offline ({_now_utc()})',
|
||||
)
|
||||
if not sup and consec >= self.fail_thresh:
|
||||
if not sup and consec >= self.fail_thresh and not ticket_id:
|
||||
self._ticket_unifi(event_id, d)
|
||||
else:
|
||||
db.resolve_event('unifi_device_offline', name, d.get('type', ''))
|
||||
@@ -844,12 +844,12 @@ class NetworkMonitor:
|
||||
|
||||
if not reachable:
|
||||
sup = db.check_suppressed(suppressions, 'host', name)
|
||||
event_id, is_new, consec = db.upsert_event(
|
||||
event_id, is_new, consec, ticket_id = db.upsert_event(
|
||||
'host_unreachable', 'critical', 'ping',
|
||||
name, ip,
|
||||
f'Host {name} ({ip}) unreachable via ping ({_now_utc()})',
|
||||
)
|
||||
if not sup and consec >= self.fail_thresh:
|
||||
if not sup and consec >= self.fail_thresh and not ticket_id:
|
||||
self._ticket_unreachable(event_id, name, ip, consec)
|
||||
else:
|
||||
db.resolve_event('host_unreachable', name, ip)
|
||||
|
||||
Reference in New Issue
Block a user