diff --git a/db.py b/db.py index 7760154..08001ce 100644 --- a/db.py +++ b/db.py @@ -114,12 +114,12 @@ def upsert_event( target_detail: str, description: str, ) -> tuple: - """Insert or update a network event. Returns (id, is_new, consecutive_failures).""" + """Insert or update a network event. Returns (id, is_new, consecutive_failures, ticket_id).""" detail = target_detail or '' with get_conn() as conn: with conn.cursor() as cur: cur.execute( - """SELECT id, consecutive_failures FROM network_events + """SELECT id, consecutive_failures, ticket_id FROM network_events WHERE event_type=%s AND target_name=%s AND target_detail=%s AND resolved_at IS NULL LIMIT 1""", (event_type, target_name, detail), @@ -134,7 +134,7 @@ def upsert_event( WHERE id=%s""", (new_count, description, existing['id']), ) - return existing['id'], False, new_count + return existing['id'], False, new_count, existing.get('ticket_id') else: cur.execute( """INSERT INTO network_events @@ -142,7 +142,7 @@ def upsert_event( VALUES (%s, %s, %s, %s, %s, %s)""", (event_type, severity, source_type, target_name, detail, description), ) - return cur.lastrowid, True, 1 + return cur.lastrowid, True, 1, None def resolve_event(event_type: str, target_name: str, target_detail: str = '') -> None: diff --git a/monitor.py b/monitor.py index 811c5aa..7080789 100644 --- a/monitor.py +++ b/monitor.py @@ -728,12 +728,12 @@ class NetworkMonitor: db.check_suppressed(suppressions, 'interface', host, iface) or db.check_suppressed(suppressions, 'host', host) ) - event_id, is_new, consec = db.upsert_event( + event_id, is_new, consec, ticket_id = db.upsert_event( 'interface_down', 'critical', 'prometheus', host, iface, f'Interface {iface} on {host} went link-down ({_now_utc()})', ) - if not sup and consec >= self.fail_thresh: + if not sup and consec >= self.fail_thresh and not ticket_id: self._ticket_interface(event_id, host, iface, consec) if host_has_regression: @@ -744,13 +744,13 @@ class NetworkMonitor: # Cluster-wide check – only genuine regressions count if len(hosts_with_regression) >= self.cluster_thresh: sup = db.check_suppressed(suppressions, 'all', '') - event_id, is_new, consec = db.upsert_event( + event_id, is_new, consec, ticket_id = db.upsert_event( 'cluster_network_issue', 'critical', 'prometheus', self.cluster_name, '', f'{len(hosts_with_regression)} hosts reporting simultaneous interface failures: ' f'{", ".join(hosts_with_regression)}', ) - if not sup and is_new: + if not sup and (is_new or not ticket_id): title = ( f'[{self.cluster_name}][auto][production][issue][network][cluster-wide] ' f'Multiple hosts reporting interface failures' @@ -804,12 +804,12 @@ class NetworkMonitor: name = d['name'] if not d['connected']: sup = db.check_suppressed(suppressions, 'unifi_device', name) - event_id, is_new, consec = db.upsert_event( + event_id, is_new, consec, ticket_id = db.upsert_event( 'unifi_device_offline', 'critical', 'unifi', name, d.get('type', ''), f'UniFi {name} ({d.get("ip","")}) offline ({_now_utc()})', ) - if not sup and consec >= self.fail_thresh: + if not sup and consec >= self.fail_thresh and not ticket_id: self._ticket_unifi(event_id, d) else: db.resolve_event('unifi_device_offline', name, d.get('type', '')) @@ -844,12 +844,12 @@ class NetworkMonitor: if not reachable: sup = db.check_suppressed(suppressions, 'host', name) - event_id, is_new, consec = db.upsert_event( + event_id, is_new, consec, ticket_id = db.upsert_event( 'host_unreachable', 'critical', 'ping', name, ip, f'Host {name} ({ip}) unreachable via ping ({_now_utc()})', ) - if not sup and consec >= self.fail_thresh: + if not sup and consec >= self.fail_thresh and not ticket_id: self._ticket_unreachable(event_id, name, ip, consec) else: db.resolve_event('host_unreachable', name, ip)