Guard ticket creation against duplicates using event's existing ticket_id
Lint / Python (flake8) (push) Successful in 41s
Lint / JS (eslint) (push) Successful in 7s
Security / Python Security (bandit) (push) Successful in 40s
Test / Python Tests (pytest) (push) Successful in 1m18s
Lint / Notify on failure (push) Has been skipped
Lint / Deploy (push) Successful in 4s
Lint / Python (flake8) (push) Successful in 41s
Lint / JS (eslint) (push) Successful in 7s
Security / Python Security (bandit) (push) Successful in 40s
Test / Python Tests (pytest) (push) Successful in 1m18s
Lint / Notify on failure (push) Has been skipped
Lint / Deploy (push) Successful in 4s
upsert_event now returns ticket_id as the fourth element of its result tuple, so callers can skip ticket creation when a ticket already exists. This avoids calling the ticket API on every poll cycle for ongoing issues, while still retrying when the previous creation attempt failed (ticket_id stays NULL until creation succeeds). Cluster events now check (is_new or not ticket_id), so they too are retried after a failure rather than relying solely on is_new.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -114,12 +114,12 @@ def upsert_event(
|
|||||||
target_detail: str,
|
target_detail: str,
|
||||||
description: str,
|
description: str,
|
||||||
) -> tuple:
|
) -> tuple:
|
||||||
"""Insert or update a network event. Returns (id, is_new, consecutive_failures)."""
|
"""Insert or update a network event. Returns (id, is_new, consecutive_failures, ticket_id)."""
|
||||||
detail = target_detail or ''
|
detail = target_detail or ''
|
||||||
with get_conn() as conn:
|
with get_conn() as conn:
|
||||||
with conn.cursor() as cur:
|
with conn.cursor() as cur:
|
||||||
cur.execute(
|
cur.execute(
|
||||||
"""SELECT id, consecutive_failures FROM network_events
|
"""SELECT id, consecutive_failures, ticket_id FROM network_events
|
||||||
WHERE event_type=%s AND target_name=%s AND target_detail=%s
|
WHERE event_type=%s AND target_name=%s AND target_detail=%s
|
||||||
AND resolved_at IS NULL LIMIT 1""",
|
AND resolved_at IS NULL LIMIT 1""",
|
||||||
(event_type, target_name, detail),
|
(event_type, target_name, detail),
|
||||||
@@ -134,7 +134,7 @@ def upsert_event(
|
|||||||
WHERE id=%s""",
|
WHERE id=%s""",
|
||||||
(new_count, description, existing['id']),
|
(new_count, description, existing['id']),
|
||||||
)
|
)
|
||||||
return existing['id'], False, new_count
|
return existing['id'], False, new_count, existing.get('ticket_id')
|
||||||
else:
|
else:
|
||||||
cur.execute(
|
cur.execute(
|
||||||
"""INSERT INTO network_events
|
"""INSERT INTO network_events
|
||||||
@@ -142,7 +142,7 @@ def upsert_event(
|
|||||||
VALUES (%s, %s, %s, %s, %s, %s)""",
|
VALUES (%s, %s, %s, %s, %s, %s)""",
|
||||||
(event_type, severity, source_type, target_name, detail, description),
|
(event_type, severity, source_type, target_name, detail, description),
|
||||||
)
|
)
|
||||||
return cur.lastrowid, True, 1
|
return cur.lastrowid, True, 1, None
|
||||||
|
|
||||||
|
|
||||||
def resolve_event(event_type: str, target_name: str, target_detail: str = '') -> None:
|
def resolve_event(event_type: str, target_name: str, target_detail: str = '') -> None:
|
||||||
|
|||||||
+8
-8
@@ -728,12 +728,12 @@ class NetworkMonitor:
|
|||||||
db.check_suppressed(suppressions, 'interface', host, iface) or
|
db.check_suppressed(suppressions, 'interface', host, iface) or
|
||||||
db.check_suppressed(suppressions, 'host', host)
|
db.check_suppressed(suppressions, 'host', host)
|
||||||
)
|
)
|
||||||
event_id, is_new, consec = db.upsert_event(
|
event_id, is_new, consec, ticket_id = db.upsert_event(
|
||||||
'interface_down', 'critical', 'prometheus',
|
'interface_down', 'critical', 'prometheus',
|
||||||
host, iface,
|
host, iface,
|
||||||
f'Interface {iface} on {host} went link-down ({_now_utc()})',
|
f'Interface {iface} on {host} went link-down ({_now_utc()})',
|
||||||
)
|
)
|
||||||
if not sup and consec >= self.fail_thresh:
|
if not sup and consec >= self.fail_thresh and not ticket_id:
|
||||||
self._ticket_interface(event_id, host, iface, consec)
|
self._ticket_interface(event_id, host, iface, consec)
|
||||||
|
|
||||||
if host_has_regression:
|
if host_has_regression:
|
||||||
@@ -744,13 +744,13 @@ class NetworkMonitor:
|
|||||||
# Cluster-wide check – only genuine regressions count
|
# Cluster-wide check – only genuine regressions count
|
||||||
if len(hosts_with_regression) >= self.cluster_thresh:
|
if len(hosts_with_regression) >= self.cluster_thresh:
|
||||||
sup = db.check_suppressed(suppressions, 'all', '')
|
sup = db.check_suppressed(suppressions, 'all', '')
|
||||||
event_id, is_new, consec = db.upsert_event(
|
event_id, is_new, consec, ticket_id = db.upsert_event(
|
||||||
'cluster_network_issue', 'critical', 'prometheus',
|
'cluster_network_issue', 'critical', 'prometheus',
|
||||||
self.cluster_name, '',
|
self.cluster_name, '',
|
||||||
f'{len(hosts_with_regression)} hosts reporting simultaneous interface failures: '
|
f'{len(hosts_with_regression)} hosts reporting simultaneous interface failures: '
|
||||||
f'{", ".join(hosts_with_regression)}',
|
f'{", ".join(hosts_with_regression)}',
|
||||||
)
|
)
|
||||||
if not sup and is_new:
|
if not sup and (is_new or not ticket_id):
|
||||||
title = (
|
title = (
|
||||||
f'[{self.cluster_name}][auto][production][issue][network][cluster-wide] '
|
f'[{self.cluster_name}][auto][production][issue][network][cluster-wide] '
|
||||||
f'Multiple hosts reporting interface failures'
|
f'Multiple hosts reporting interface failures'
|
||||||
@@ -804,12 +804,12 @@ class NetworkMonitor:
|
|||||||
name = d['name']
|
name = d['name']
|
||||||
if not d['connected']:
|
if not d['connected']:
|
||||||
sup = db.check_suppressed(suppressions, 'unifi_device', name)
|
sup = db.check_suppressed(suppressions, 'unifi_device', name)
|
||||||
event_id, is_new, consec = db.upsert_event(
|
event_id, is_new, consec, ticket_id = db.upsert_event(
|
||||||
'unifi_device_offline', 'critical', 'unifi',
|
'unifi_device_offline', 'critical', 'unifi',
|
||||||
name, d.get('type', ''),
|
name, d.get('type', ''),
|
||||||
f'UniFi {name} ({d.get("ip","")}) offline ({_now_utc()})',
|
f'UniFi {name} ({d.get("ip","")}) offline ({_now_utc()})',
|
||||||
)
|
)
|
||||||
if not sup and consec >= self.fail_thresh:
|
if not sup and consec >= self.fail_thresh and not ticket_id:
|
||||||
self._ticket_unifi(event_id, d)
|
self._ticket_unifi(event_id, d)
|
||||||
else:
|
else:
|
||||||
db.resolve_event('unifi_device_offline', name, d.get('type', ''))
|
db.resolve_event('unifi_device_offline', name, d.get('type', ''))
|
||||||
@@ -844,12 +844,12 @@ class NetworkMonitor:
|
|||||||
|
|
||||||
if not reachable:
|
if not reachable:
|
||||||
sup = db.check_suppressed(suppressions, 'host', name)
|
sup = db.check_suppressed(suppressions, 'host', name)
|
||||||
event_id, is_new, consec = db.upsert_event(
|
event_id, is_new, consec, ticket_id = db.upsert_event(
|
||||||
'host_unreachable', 'critical', 'ping',
|
'host_unreachable', 'critical', 'ping',
|
||||||
name, ip,
|
name, ip,
|
||||||
f'Host {name} ({ip}) unreachable via ping ({_now_utc()})',
|
f'Host {name} ({ip}) unreachable via ping ({_now_utc()})',
|
||||||
)
|
)
|
||||||
if not sup and consec >= self.fail_thresh:
|
if not sup and consec >= self.fail_thresh and not ticket_id:
|
||||||
self._ticket_unreachable(event_id, name, ip, consec)
|
self._ticket_unreachable(event_id, name, ip, consec)
|
||||||
else:
|
else:
|
||||||
db.resolve_event('host_unreachable', name, ip)
|
db.resolve_event('host_unreachable', name, ip)
|
||||||
|
|||||||
Reference in New Issue
Block a user