Use drive serial numbers instead of device paths for ticket dedup
Device paths like /dev/sdg are assigned by the kernel at boot and can change after hot-swaps or reboots, causing duplicate tickets for the same physical drive under a new letter.

Changes:
- _detect_issues(): issue strings now use the drive serial number (e.g. "Drive Z4ZC4B6R has SMART issues: ..."), falling back to the device path only if smartctl cannot return a serial
- _create_tickets_for_issues(): capacity lookup resolves serial → device via the details cache instead of a regex on the issue string; the serial is included in the API payload as a dedicated field
- _generate_detailed_description(): drive lookup uses a serial match instead of the /dev/ regex

The tinker_tickets API uses the serial field in the dedup hash, so the same physical drive always maps to the same ticket regardless of which /dev/sdX letter it occupies.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+45
-21
@@ -1349,11 +1349,20 @@ class SystemHealthMonitor:
|
|||||||
- Possible drive failure!
|
- Possible drive failure!
|
||||||
""").strip() + "\n"
|
""").strip() + "\n"
|
||||||
|
|
||||||
if "Drive" in issue and "/dev/" in issue:
|
if "Drive" in issue and ("has SMART issues" in issue or "temperature is high" in issue):
|
||||||
try:
|
try:
|
||||||
device = re.search(r'/dev/[a-zA-Z0-9]+', issue).group(0) if '/dev/' in issue else None
|
serial_match = re.search(r'Drive (\S+) (?:has SMART issues|temperature is high)', issue)
|
||||||
drive_info = next((d for d in health_report['drives_health']['drives'] if d['device'] == device), None)
|
drive_id = serial_match.group(1) if serial_match else None
|
||||||
|
# Find drive_info by matching serial (or device path as fallback)
|
||||||
|
device = None
|
||||||
|
drive_info = None
|
||||||
|
for d in health_report['drives_health']['drives']:
|
||||||
|
dd = self._get_drive_details(d['device'])
|
||||||
|
if (dd.get('serial') or d['device']) == drive_id:
|
||||||
|
device = d['device']
|
||||||
|
drive_info = d
|
||||||
|
break
|
||||||
|
|
||||||
if drive_info:
|
if drive_info:
|
||||||
drive_details = self._get_drive_details(device)
|
drive_details = self._get_drive_details(device)
|
||||||
|
|
||||||
@@ -1899,17 +1908,27 @@ class SystemHealthMonitor:
|
|||||||
# Remove [ceph] marker since _categorize_issue adds it as issue_tag
|
# Remove [ceph] marker since _categorize_issue adds it as issue_tag
|
||||||
clean_issue = clean_issue.replace('[ceph] ', '').replace('[ceph]', '')
|
clean_issue = clean_issue.replace('[ceph] ', '').replace('[ceph]', '')
|
||||||
|
|
||||||
# Extract drive capacity if this is a drive-related issue
|
# Extract drive capacity if this is a drive-related issue.
|
||||||
|
# Issue strings now use serial numbers; find the matching drive by serial.
|
||||||
drive_size = ""
|
drive_size = ""
|
||||||
if "Drive" in issue and "/dev/" in issue:
|
issue_serial = None
|
||||||
device_match = re.search(r'/dev/[a-zA-Z0-9]+', issue)
|
if "Drive" in issue and ("has SMART issues" in issue or "temperature is high" in issue):
|
||||||
if device_match:
|
serial_match = re.search(r'Drive (\S+) (?:has SMART issues|temperature is high)', issue)
|
||||||
device = device_match.group(0)
|
if serial_match:
|
||||||
drive_details = self._get_drive_details(device)
|
issue_serial = serial_match.group(1)
|
||||||
if drive_details['capacity']:
|
# Find the device path for this serial via the details cache
|
||||||
drive_size = f"[{drive_details['capacity']}] "
|
matched_device = None
|
||||||
else:
|
for d in health_report.get('drives_health', {}).get('drives', []):
|
||||||
logger.warning(f"Could not extract device from issue: {issue}")
|
dd = self._get_drive_details(d['device'])
|
||||||
|
if (dd.get('serial') or d['device']) == issue_serial:
|
||||||
|
matched_device = d['device']
|
||||||
|
break
|
||||||
|
if matched_device:
|
||||||
|
drive_details = self._get_drive_details(matched_device)
|
||||||
|
if drive_details['capacity']:
|
||||||
|
drive_size = f"[{drive_details['capacity']}] "
|
||||||
|
else:
|
||||||
|
logger.warning(f"Could not find device for drive id '{issue_serial}' in issue: {issue}")
|
||||||
|
|
||||||
# Build ticket title with proper categorization
|
# Build ticket title with proper categorization
|
||||||
# Add space after issue_tag if drive_size is empty (for non-drive issues)
|
# Add space after issue_tag if drive_size is empty (for non-drive issues)
|
||||||
@@ -1933,16 +1952,17 @@ class SystemHealthMonitor:
|
|||||||
description = self._generate_detailed_description(issue, health_report, priority)
|
description = self._generate_detailed_description(issue, health_report, priority)
|
||||||
|
|
||||||
# NOTE: The ticket API (create_ticket_api.php) deduplicates using a SHA-256 hash of:
|
# NOTE: The ticket API (create_ticket_api.php) deduplicates using a SHA-256 hash of:
|
||||||
# issue_category + environment_tags + hostname (excluded for [cluster-wide]) + device
|
# issue_category + environment_tags + hostname (excluded for [cluster-wide]) + serial
|
||||||
# Description content and timestamps are NOT included in the dedup hash.
|
# Serial is preferred over device path — it remains stable across reboots and
|
||||||
# The 24-hour dedup window prevents duplicate tickets from multiple nodes or runs.
|
# device-letter reassignments. Falls back to /dev/sdX for non-SMART-capable devices.
|
||||||
ticket_payload = {
|
ticket_payload = {
|
||||||
"title": ticket_title,
|
"title": ticket_title,
|
||||||
"description": description,
|
"description": description,
|
||||||
"priority": priority,
|
"priority": priority,
|
||||||
"status": "Open",
|
"status": "Open",
|
||||||
"category": category,
|
"category": category,
|
||||||
"type": ticket_type
|
"type": ticket_type,
|
||||||
|
"serial": issue_serial, # drive serial for stable dedup; None for non-drive issues
|
||||||
}
|
}
|
||||||
|
|
||||||
if self.dry_run:
|
if self.dry_run:
|
||||||
@@ -2021,11 +2041,15 @@ class SystemHealthMonitor:
|
|||||||
filtered_issues.append(issue)
|
filtered_issues.append(issue)
|
||||||
|
|
||||||
if filtered_issues:
|
if filtered_issues:
|
||||||
issues.append(f"Drive {drive['device']} has SMART issues: {', '.join(filtered_issues)}")
|
drive_details = self._get_drive_details(drive['device'])
|
||||||
|
drive_id = drive_details.get('serial') or drive['device']
|
||||||
|
issues.append(f"Drive {drive_id} has SMART issues: {', '.join(filtered_issues)}")
|
||||||
|
|
||||||
# Check temperature regardless of SMART status
|
# Check temperature regardless of SMART status
|
||||||
if drive.get('temperature') and drive['temperature'] > self.CONFIG['THRESHOLDS']['TEMPERATURE_WARNING']:
|
if drive.get('temperature') and drive['temperature'] > self.CONFIG['THRESHOLDS']['TEMPERATURE_WARNING']:
|
||||||
issues.append(f"Drive {drive['device']} temperature is high: {drive['temperature']}°C")
|
drive_details = self._get_drive_details(drive['device'])
|
||||||
|
drive_id = drive_details.get('serial') or drive['device']
|
||||||
|
issues.append(f"Drive {drive_id} temperature is high: {drive['temperature']}°C")
|
||||||
|
|
||||||
# Check for ECC memory errors
|
# Check for ECC memory errors
|
||||||
memory_health = health_report.get('memory_health', {})
|
memory_health = health_report.get('memory_health', {})
|
||||||
|
|||||||
Reference in New Issue
Block a user