Compare commits
1 Commit
| Author | SHA1 | Date | |
|---|---|---|---|
| 9c5a88fbce |
@@ -114,12 +114,12 @@ def upsert_event(
|
|||||||
target_detail: str,
|
target_detail: str,
|
||||||
description: str,
|
description: str,
|
||||||
) -> tuple:
|
) -> tuple:
|
||||||
"""Insert or update a network event. Returns (id, is_new, consecutive_failures)."""
|
"""Insert or update a network event. Returns (id, is_new, consecutive_failures, ticket_id)."""
|
||||||
detail = target_detail or ''
|
detail = target_detail or ''
|
||||||
with get_conn() as conn:
|
with get_conn() as conn:
|
||||||
with conn.cursor() as cur:
|
with conn.cursor() as cur:
|
||||||
cur.execute(
|
cur.execute(
|
||||||
"""SELECT id, consecutive_failures FROM network_events
|
"""SELECT id, consecutive_failures, ticket_id FROM network_events
|
||||||
WHERE event_type=%s AND target_name=%s AND target_detail=%s
|
WHERE event_type=%s AND target_name=%s AND target_detail=%s
|
||||||
AND resolved_at IS NULL LIMIT 1""",
|
AND resolved_at IS NULL LIMIT 1""",
|
||||||
(event_type, target_name, detail),
|
(event_type, target_name, detail),
|
||||||
@@ -134,7 +134,7 @@ def upsert_event(
|
|||||||
WHERE id=%s""",
|
WHERE id=%s""",
|
||||||
(new_count, description, existing['id']),
|
(new_count, description, existing['id']),
|
||||||
)
|
)
|
||||||
return existing['id'], False, new_count
|
return existing['id'], False, new_count, existing.get('ticket_id')
|
||||||
else:
|
else:
|
||||||
cur.execute(
|
cur.execute(
|
||||||
"""INSERT INTO network_events
|
"""INSERT INTO network_events
|
||||||
@@ -142,7 +142,7 @@ def upsert_event(
|
|||||||
VALUES (%s, %s, %s, %s, %s, %s)""",
|
VALUES (%s, %s, %s, %s, %s, %s)""",
|
||||||
(event_type, severity, source_type, target_name, detail, description),
|
(event_type, severity, source_type, target_name, detail, description),
|
||||||
)
|
)
|
||||||
return cur.lastrowid, True, 1
|
return cur.lastrowid, True, 1, None
|
||||||
|
|
||||||
|
|
||||||
def resolve_event(event_type: str, target_name: str, target_detail: str = '') -> None:
|
def resolve_event(event_type: str, target_name: str, target_detail: str = '') -> None:
|
||||||
|
|||||||
+8
-8
@@ -728,12 +728,12 @@ class NetworkMonitor:
|
|||||||
db.check_suppressed(suppressions, 'interface', host, iface) or
|
db.check_suppressed(suppressions, 'interface', host, iface) or
|
||||||
db.check_suppressed(suppressions, 'host', host)
|
db.check_suppressed(suppressions, 'host', host)
|
||||||
)
|
)
|
||||||
event_id, is_new, consec = db.upsert_event(
|
event_id, is_new, consec, ticket_id = db.upsert_event(
|
||||||
'interface_down', 'critical', 'prometheus',
|
'interface_down', 'critical', 'prometheus',
|
||||||
host, iface,
|
host, iface,
|
||||||
f'Interface {iface} on {host} went link-down ({_now_utc()})',
|
f'Interface {iface} on {host} went link-down ({_now_utc()})',
|
||||||
)
|
)
|
||||||
if not sup and consec >= self.fail_thresh:
|
if not sup and consec >= self.fail_thresh and not ticket_id:
|
||||||
self._ticket_interface(event_id, host, iface, consec)
|
self._ticket_interface(event_id, host, iface, consec)
|
||||||
|
|
||||||
if host_has_regression:
|
if host_has_regression:
|
||||||
@@ -744,13 +744,13 @@ class NetworkMonitor:
|
|||||||
# Cluster-wide check – only genuine regressions count
|
# Cluster-wide check – only genuine regressions count
|
||||||
if len(hosts_with_regression) >= self.cluster_thresh:
|
if len(hosts_with_regression) >= self.cluster_thresh:
|
||||||
sup = db.check_suppressed(suppressions, 'all', '')
|
sup = db.check_suppressed(suppressions, 'all', '')
|
||||||
event_id, is_new, consec = db.upsert_event(
|
event_id, is_new, consec, ticket_id = db.upsert_event(
|
||||||
'cluster_network_issue', 'critical', 'prometheus',
|
'cluster_network_issue', 'critical', 'prometheus',
|
||||||
self.cluster_name, '',
|
self.cluster_name, '',
|
||||||
f'{len(hosts_with_regression)} hosts reporting simultaneous interface failures: '
|
f'{len(hosts_with_regression)} hosts reporting simultaneous interface failures: '
|
||||||
f'{", ".join(hosts_with_regression)}',
|
f'{", ".join(hosts_with_regression)}',
|
||||||
)
|
)
|
||||||
if not sup and is_new:
|
if not sup and (is_new or not ticket_id):
|
||||||
title = (
|
title = (
|
||||||
f'[{self.cluster_name}][auto][production][issue][network][cluster-wide] '
|
f'[{self.cluster_name}][auto][production][issue][network][cluster-wide] '
|
||||||
f'Multiple hosts reporting interface failures'
|
f'Multiple hosts reporting interface failures'
|
||||||
@@ -804,12 +804,12 @@ class NetworkMonitor:
|
|||||||
name = d['name']
|
name = d['name']
|
||||||
if not d['connected']:
|
if not d['connected']:
|
||||||
sup = db.check_suppressed(suppressions, 'unifi_device', name)
|
sup = db.check_suppressed(suppressions, 'unifi_device', name)
|
||||||
event_id, is_new, consec = db.upsert_event(
|
event_id, is_new, consec, ticket_id = db.upsert_event(
|
||||||
'unifi_device_offline', 'critical', 'unifi',
|
'unifi_device_offline', 'critical', 'unifi',
|
||||||
name, d.get('type', ''),
|
name, d.get('type', ''),
|
||||||
f'UniFi {name} ({d.get("ip","")}) offline ({_now_utc()})',
|
f'UniFi {name} ({d.get("ip","")}) offline ({_now_utc()})',
|
||||||
)
|
)
|
||||||
if not sup and consec >= self.fail_thresh:
|
if not sup and consec >= self.fail_thresh and not ticket_id:
|
||||||
self._ticket_unifi(event_id, d)
|
self._ticket_unifi(event_id, d)
|
||||||
else:
|
else:
|
||||||
db.resolve_event('unifi_device_offline', name, d.get('type', ''))
|
db.resolve_event('unifi_device_offline', name, d.get('type', ''))
|
||||||
@@ -844,12 +844,12 @@ class NetworkMonitor:
|
|||||||
|
|
||||||
if not reachable:
|
if not reachable:
|
||||||
sup = db.check_suppressed(suppressions, 'host', name)
|
sup = db.check_suppressed(suppressions, 'host', name)
|
||||||
event_id, is_new, consec = db.upsert_event(
|
event_id, is_new, consec, ticket_id = db.upsert_event(
|
||||||
'host_unreachable', 'critical', 'ping',
|
'host_unreachable', 'critical', 'ping',
|
||||||
name, ip,
|
name, ip,
|
||||||
f'Host {name} ({ip}) unreachable via ping ({_now_utc()})',
|
f'Host {name} ({ip}) unreachable via ping ({_now_utc()})',
|
||||||
)
|
)
|
||||||
if not sup and consec >= self.fail_thresh:
|
if not sup and consec >= self.fail_thresh and not ticket_id:
|
||||||
self._ticket_unreachable(event_id, name, ip, consec)
|
self._ticket_unreachable(event_id, name, ip, consec)
|
||||||
else:
|
else:
|
||||||
db.resolve_event('host_unreachable', name, ip)
|
db.resolve_event('host_unreachable', name, ip)
|
||||||
|
|||||||
Reference in New Issue
Block a user