From 03320c0eced9cfececc21577aea47f5e39908757 Mon Sep 17 00:00:00 2001 From: Jared Vititoe Date: Mon, 6 Apr 2026 18:54:26 -0400 Subject: [PATCH] Use drive serial numbers instead of device paths for ticket dedup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Device paths like /dev/sdg are assigned by the kernel at boot and can change after hot-swaps or reboots, causing duplicate tickets for the same physical drive under a new letter. Changes: - _detect_issues(): issue strings now use serial number (e.g. "Drive Z4ZC4B6R has SMART issues: ...") falling back to device path only if smartctl cannot return a serial - _create_tickets_for_issues(): capacity lookup resolves serial → device via the details cache instead of regex on the issue string; serial is included in the API payload as a dedicated field - _generate_detailed_description(): drive lookup uses serial match instead of /dev/ regex The tinker_tickets API uses the serial field in the dedup hash so the same physical drive always maps to the same ticket regardless of which /dev/sdX letter it occupies. Co-Authored-By: Claude Sonnet 4.6 --- hwmonDaemon.py | 66 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 45 insertions(+), 21 deletions(-) diff --git a/hwmonDaemon.py b/hwmonDaemon.py index d703ac8..2cce8a4 100644 --- a/hwmonDaemon.py +++ b/hwmonDaemon.py @@ -1349,11 +1349,20 @@ class SystemHealthMonitor: - Possible drive failure! """).strip() + "\n" - if "Drive" in issue and "/dev/" in issue: + if "Drive" in issue and ("has SMART issues" in issue or "temperature is high" in issue): try: - device = re.search(r'/dev/[a-zA-Z0-9]+', issue).group(0) if '/dev/' in issue else None - drive_info = next((d for d in health_report['drives_health']['drives'] if d['device'] == device), None) - + serial_match = re.search(r'Drive (\S+) (?:has SMART issues|temperature is high)', issue) + drive_id = serial_match.group(1) if serial_match else None + # Find drive_info by matching serial (or device path as fallback) + device = None + drive_info = None + for d in health_report['drives_health']['drives']: + dd = self._get_drive_details(d['device']) + if (dd.get('serial') or d['device']) == drive_id: + device = d['device'] + drive_info = d + break + if drive_info: drive_details = self._get_drive_details(device) @@ -1899,17 +1908,27 @@ class SystemHealthMonitor: # Remove [ceph] marker since _categorize_issue adds it as issue_tag clean_issue = clean_issue.replace('[ceph] ', '').replace('[ceph]', '') - # Extract drive capacity if this is a drive-related issue + # Extract drive capacity if this is a drive-related issue. + # Issue strings now use serial numbers; find the matching drive by serial. drive_size = "" - if "Drive" in issue and "/dev/" in issue: - device_match = re.search(r'/dev/[a-zA-Z0-9]+', issue) - if device_match: - device = device_match.group(0) - drive_details = self._get_drive_details(device) - if drive_details['capacity']: - drive_size = f"[{drive_details['capacity']}] " - else: - logger.warning(f"Could not extract device from issue: {issue}") + issue_serial = None + if "Drive" in issue and ("has SMART issues" in issue or "temperature is high" in issue): + serial_match = re.search(r'Drive (\S+) (?:has SMART issues|temperature is high)', issue) + if serial_match: + issue_serial = serial_match.group(1) + # Find the device path for this serial via the details cache + matched_device = None + for d in health_report.get('drives_health', {}).get('drives', []): + dd = self._get_drive_details(d['device']) + if (dd.get('serial') or d['device']) == issue_serial: + matched_device = d['device'] + break + if matched_device: + drive_details = self._get_drive_details(matched_device) + if drive_details['capacity']: + drive_size = f"[{drive_details['capacity']}] " + else: + logger.warning(f"Could not find device for drive id '{issue_serial}' in issue: {issue}") # Build ticket title with proper categorization # Add space after issue_tag if drive_size is empty (for non-drive issues) @@ -1933,16 +1952,17 @@ class SystemHealthMonitor: description = self._generate_detailed_description(issue, health_report, priority) # NOTE: The ticket API (create_ticket_api.php) deduplicates using a SHA-256 hash of: - # issue_category + environment_tags + hostname (excluded for [cluster-wide]) + device - # Description content and timestamps are NOT included in the dedup hash. - # The 24-hour dedup window prevents duplicate tickets from multiple nodes or runs. + # issue_category + environment_tags + hostname (excluded for [cluster-wide]) + serial + # Serial is preferred over device path — it remains stable across reboots and + # device-letter reassignments. Falls back to /dev/sdX for non-SMART-capable devices. ticket_payload = { "title": ticket_title, "description": description, "priority": priority, "status": "Open", "category": category, - "type": ticket_type + "type": ticket_type, + "serial": issue_serial, # drive serial for stable dedup; None for non-drive issues } if self.dry_run: @@ -2021,11 +2041,15 @@ class SystemHealthMonitor: filtered_issues.append(issue) if filtered_issues: - issues.append(f"Drive {drive['device']} has SMART issues: {', '.join(filtered_issues)}") - + drive_details = self._get_drive_details(drive['device']) + drive_id = drive_details.get('serial') or drive['device'] + issues.append(f"Drive {drive_id} has SMART issues: {', '.join(filtered_issues)}") + # Check temperature regardless of SMART status if drive.get('temperature') and drive['temperature'] > self.CONFIG['THRESHOLDS']['TEMPERATURE_WARNING']: - issues.append(f"Drive {drive['device']} temperature is high: {drive['temperature']}°C") + drive_details = self._get_drive_details(drive['device']) + drive_id = drive_details.get('serial') or drive['device'] + issues.append(f"Drive {drive_id} temperature is high: {drive['temperature']}°C") # Check for ECC memory errors memory_health = health_report.get('memory_health', {})