Use drive serial numbers instead of device paths for ticket dedup

Device paths like /dev/sdg are assigned by the kernel at boot and can
change after hot-swaps or reboots, causing duplicate tickets for the
same physical drive under a new letter.

Changes:
- _detect_issues(): SMART and temperature issue strings now use the
  drive serial number (e.g. "Drive Z4ZC4B6R has SMART issues: ...")
  falling back to the device path only if smartctl cannot return a
  serial
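- _create_tickets_for_issues(): capacity lookup extracts the drive id
  from the issue string and resolves serial → device via the details
  cache, instead of matching a /dev/ path with a regex on the issue
  string; the serial (or the device-path fallback when no serial is
  available) is included in the API payload as a dedicated "serial"
  field, which is None for non-drive issues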
- _generate_detailed_description(): drive lookup uses serial match
  instead of /dev/ regex

The tinker_tickets API uses the serial field in the dedup hash so
the same physical drive always maps to the same ticket regardless
of which /dev/sdX letter it occupies.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-06 18:54:26 -04:00
parent d1750ea6cf
commit 03320c0ece
+40 -16
View File
@@ -1349,10 +1349,19 @@ class SystemHealthMonitor:
- Possible drive failure! - Possible drive failure!
""").strip() + "\n" """).strip() + "\n"
if "Drive" in issue and "/dev/" in issue: if "Drive" in issue and ("has SMART issues" in issue or "temperature is high" in issue):
try: try:
device = re.search(r'/dev/[a-zA-Z0-9]+', issue).group(0) if '/dev/' in issue else None serial_match = re.search(r'Drive (\S+) (?:has SMART issues|temperature is high)', issue)
drive_info = next((d for d in health_report['drives_health']['drives'] if d['device'] == device), None) drive_id = serial_match.group(1) if serial_match else None
# Find drive_info by matching serial (or device path as fallback)
device = None
drive_info = None
for d in health_report['drives_health']['drives']:
dd = self._get_drive_details(d['device'])
if (dd.get('serial') or d['device']) == drive_id:
device = d['device']
drive_info = d
break
if drive_info: if drive_info:
drive_details = self._get_drive_details(device) drive_details = self._get_drive_details(device)
@@ -1899,17 +1908,27 @@ class SystemHealthMonitor:
# Remove [ceph] marker since _categorize_issue adds it as issue_tag # Remove [ceph] marker since _categorize_issue adds it as issue_tag
clean_issue = clean_issue.replace('[ceph] ', '').replace('[ceph]', '') clean_issue = clean_issue.replace('[ceph] ', '').replace('[ceph]', '')
# Extract drive capacity if this is a drive-related issue # Extract drive capacity if this is a drive-related issue.
# Issue strings now use serial numbers; find the matching drive by serial.
drive_size = "" drive_size = ""
if "Drive" in issue and "/dev/" in issue: issue_serial = None
device_match = re.search(r'/dev/[a-zA-Z0-9]+', issue) if "Drive" in issue and ("has SMART issues" in issue or "temperature is high" in issue):
if device_match: serial_match = re.search(r'Drive (\S+) (?:has SMART issues|temperature is high)', issue)
device = device_match.group(0) if serial_match:
drive_details = self._get_drive_details(device) issue_serial = serial_match.group(1)
# Find the device path for this serial via the details cache
matched_device = None
for d in health_report.get('drives_health', {}).get('drives', []):
dd = self._get_drive_details(d['device'])
if (dd.get('serial') or d['device']) == issue_serial:
matched_device = d['device']
break
if matched_device:
drive_details = self._get_drive_details(matched_device)
if drive_details['capacity']: if drive_details['capacity']:
drive_size = f"[{drive_details['capacity']}] " drive_size = f"[{drive_details['capacity']}] "
else: else:
logger.warning(f"Could not extract device from issue: {issue}") logger.warning(f"Could not find device for drive id '{issue_serial}' in issue: {issue}")
# Build ticket title with proper categorization # Build ticket title with proper categorization
# Add space after issue_tag if drive_size is empty (for non-drive issues) # Add space after issue_tag if drive_size is empty (for non-drive issues)
@@ -1933,16 +1952,17 @@ class SystemHealthMonitor:
description = self._generate_detailed_description(issue, health_report, priority) description = self._generate_detailed_description(issue, health_report, priority)
# NOTE: The ticket API (create_ticket_api.php) deduplicates using a SHA-256 hash of: # NOTE: The ticket API (create_ticket_api.php) deduplicates using a SHA-256 hash of:
# issue_category + environment_tags + hostname (excluded for [cluster-wide]) + device # issue_category + environment_tags + hostname (excluded for [cluster-wide]) + serial
# Description content and timestamps are NOT included in the dedup hash. # Serial is preferred over device path — it remains stable across reboots and
# The 24-hour dedup window prevents duplicate tickets from multiple nodes or runs. # device-letter reassignments. Falls back to /dev/sdX for non-SMART-capable devices.
ticket_payload = { ticket_payload = {
"title": ticket_title, "title": ticket_title,
"description": description, "description": description,
"priority": priority, "priority": priority,
"status": "Open", "status": "Open",
"category": category, "category": category,
"type": ticket_type "type": ticket_type,
"serial": issue_serial, # drive serial for stable dedup; None for non-drive issues
} }
if self.dry_run: if self.dry_run:
@@ -2021,11 +2041,15 @@ class SystemHealthMonitor:
filtered_issues.append(issue) filtered_issues.append(issue)
if filtered_issues: if filtered_issues:
issues.append(f"Drive {drive['device']} has SMART issues: {', '.join(filtered_issues)}") drive_details = self._get_drive_details(drive['device'])
drive_id = drive_details.get('serial') or drive['device']
issues.append(f"Drive {drive_id} has SMART issues: {', '.join(filtered_issues)}")
# Check temperature regardless of SMART status # Check temperature regardless of SMART status
if drive.get('temperature') and drive['temperature'] > self.CONFIG['THRESHOLDS']['TEMPERATURE_WARNING']: if drive.get('temperature') and drive['temperature'] > self.CONFIG['THRESHOLDS']['TEMPERATURE_WARNING']:
issues.append(f"Drive {drive['device']} temperature is high: {drive['temperature']}°C") drive_details = self._get_drive_details(drive['device'])
drive_id = drive_details.get('serial') or drive['device']
issues.append(f"Drive {drive_id} temperature is high: {drive['temperature']}°C")
# Check for ECC memory errors # Check for ECC memory errors
memory_health = health_report.get('memory_health', {}) memory_health = health_report.get('memory_health', {})