Use drive serial numbers instead of device paths for ticket dedup
Device paths like /dev/sdg are assigned by the kernel at boot and can change after hot-swaps or reboots, causing duplicate tickets for the same physical drive under a new letter.

Changes:
- _detect_issues(): issue strings now use the serial number (e.g. "Drive Z4ZC4B6R has SMART issues: ..."), falling back to the device path only if smartctl cannot return a serial.
- _create_tickets_for_issues(): the capacity lookup resolves serial → device via the details cache instead of a regex on the issue string; the serial is included in the API payload as a dedicated field.
- _generate_detailed_description(): the drive lookup uses a serial match instead of a /dev/ regex.

The tinker_tickets API uses the serial field in the dedup hash, so the same physical drive always maps to the same ticket regardless of which /dev/sdX letter it occupies.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+43
-19
@@ -1349,10 +1349,19 @@ class SystemHealthMonitor:
|
||||
- Possible drive failure!
|
||||
""").strip() + "\n"
|
||||
|
||||
if "Drive" in issue and "/dev/" in issue:
|
||||
if "Drive" in issue and ("has SMART issues" in issue or "temperature is high" in issue):
|
||||
try:
|
||||
device = re.search(r'/dev/[a-zA-Z0-9]+', issue).group(0) if '/dev/' in issue else None
|
||||
drive_info = next((d for d in health_report['drives_health']['drives'] if d['device'] == device), None)
|
||||
serial_match = re.search(r'Drive (\S+) (?:has SMART issues|temperature is high)', issue)
|
||||
drive_id = serial_match.group(1) if serial_match else None
|
||||
# Find drive_info by matching serial (or device path as fallback)
|
||||
device = None
|
||||
drive_info = None
|
||||
for d in health_report['drives_health']['drives']:
|
||||
dd = self._get_drive_details(d['device'])
|
||||
if (dd.get('serial') or d['device']) == drive_id:
|
||||
device = d['device']
|
||||
drive_info = d
|
||||
break
|
||||
|
||||
if drive_info:
|
||||
drive_details = self._get_drive_details(device)
|
||||
@@ -1899,17 +1908,27 @@ class SystemHealthMonitor:
|
||||
# Remove [ceph] marker since _categorize_issue adds it as issue_tag
|
||||
clean_issue = clean_issue.replace('[ceph] ', '').replace('[ceph]', '')
|
||||
|
||||
# Extract drive capacity if this is a drive-related issue
|
||||
# Extract drive capacity if this is a drive-related issue.
|
||||
# Issue strings now use serial numbers; find the matching drive by serial.
|
||||
drive_size = ""
|
||||
if "Drive" in issue and "/dev/" in issue:
|
||||
device_match = re.search(r'/dev/[a-zA-Z0-9]+', issue)
|
||||
if device_match:
|
||||
device = device_match.group(0)
|
||||
drive_details = self._get_drive_details(device)
|
||||
if drive_details['capacity']:
|
||||
drive_size = f"[{drive_details['capacity']}] "
|
||||
else:
|
||||
logger.warning(f"Could not extract device from issue: {issue}")
|
||||
issue_serial = None
|
||||
if "Drive" in issue and ("has SMART issues" in issue or "temperature is high" in issue):
|
||||
serial_match = re.search(r'Drive (\S+) (?:has SMART issues|temperature is high)', issue)
|
||||
if serial_match:
|
||||
issue_serial = serial_match.group(1)
|
||||
# Find the device path for this serial via the details cache
|
||||
matched_device = None
|
||||
for d in health_report.get('drives_health', {}).get('drives', []):
|
||||
dd = self._get_drive_details(d['device'])
|
||||
if (dd.get('serial') or d['device']) == issue_serial:
|
||||
matched_device = d['device']
|
||||
break
|
||||
if matched_device:
|
||||
drive_details = self._get_drive_details(matched_device)
|
||||
if drive_details['capacity']:
|
||||
drive_size = f"[{drive_details['capacity']}] "
|
||||
else:
|
||||
logger.warning(f"Could not find device for drive id '{issue_serial}' in issue: {issue}")
|
||||
|
||||
# Build ticket title with proper categorization
|
||||
# Add space after issue_tag if drive_size is empty (for non-drive issues)
|
||||
@@ -1933,16 +1952,17 @@ class SystemHealthMonitor:
|
||||
description = self._generate_detailed_description(issue, health_report, priority)
|
||||
|
||||
# NOTE: The ticket API (create_ticket_api.php) deduplicates using a SHA-256 hash of:
|
||||
# issue_category + environment_tags + hostname (excluded for [cluster-wide]) + device
|
||||
# Description content and timestamps are NOT included in the dedup hash.
|
||||
# The 24-hour dedup window prevents duplicate tickets from multiple nodes or runs.
|
||||
# issue_category + environment_tags + hostname (excluded for [cluster-wide]) + serial
|
||||
# Serial is preferred over device path — it remains stable across reboots and
|
||||
# device-letter reassignments. Falls back to /dev/sdX for non-SMART-capable devices.
|
||||
ticket_payload = {
|
||||
"title": ticket_title,
|
||||
"description": description,
|
||||
"priority": priority,
|
||||
"status": "Open",
|
||||
"category": category,
|
||||
"type": ticket_type
|
||||
"type": ticket_type,
|
||||
"serial": issue_serial, # drive serial for stable dedup; None for non-drive issues
|
||||
}
|
||||
|
||||
if self.dry_run:
|
||||
@@ -2021,11 +2041,15 @@ class SystemHealthMonitor:
|
||||
filtered_issues.append(issue)
|
||||
|
||||
if filtered_issues:
|
||||
issues.append(f"Drive {drive['device']} has SMART issues: {', '.join(filtered_issues)}")
|
||||
drive_details = self._get_drive_details(drive['device'])
|
||||
drive_id = drive_details.get('serial') or drive['device']
|
||||
issues.append(f"Drive {drive_id} has SMART issues: {', '.join(filtered_issues)}")
|
||||
|
||||
# Check temperature regardless of SMART status
|
||||
if drive.get('temperature') and drive['temperature'] > self.CONFIG['THRESHOLDS']['TEMPERATURE_WARNING']:
|
||||
issues.append(f"Drive {drive['device']} temperature is high: {drive['temperature']}°C")
|
||||
drive_details = self._get_drive_details(drive['device'])
|
||||
drive_id = drive_details.get('serial') or drive['device']
|
||||
issues.append(f"Drive {drive_id} temperature is high: {drive['temperature']}°C")
|
||||
|
||||
# Check for ECC memory errors
|
||||
memory_health = health_report.get('memory_health', {})
|
||||
|
||||
Reference in New Issue
Block a user