From 90dd8f3390fe472ad503307b2083590ab3a949c4 Mon Sep 17 00:00:00 2001 From: Jared Vititoe Date: Fri, 17 Apr 2026 10:09:54 -0400 Subject: [PATCH] fix: calibrate SMART thresholds per manufacturer to eliminate false positives Investigated all 7 pending drive tickets in the ticketing DB. Identified 3 confirmed false positives and 1 parsing bug. Implemented manufacturer-specific SMART profiles and a systemic substring-match fix. Changes: - Seagate: disable Seek_Error_Rate (packed counter), add High_Fly_Writes profile threshold (100/500 vs the old 1/5), disable Command_Timeout (packed 3-part 48-bit format on Exos series) - Western Digital: disable Command_Timeout (same packed format) - Toshiba: new profile covering MG04-MG10 enterprise and MQ01-MQ04 consumer series; disable Raw/Seek counters, keep Command_Timeout with raised thresholds (1000/5000) since MG-series uses a real simple count; add model-prefix detection so MG08ACP16TE etc. match without "TOSHIBA" in the model string - OOS: add OOS14000G alias (fleet has both 12TB and 14TB variants); replace billion-scale Command_Timeout threshold with monitor:False - Samsung: disable Program_Fail_Cnt_Total (attr 181, vendor-encoded), Erase_Fail_Count_Chip (attrs 172/176, chip-level internal counter), Program_Fail_Count_Chip (attr 171); disable generic Erase_Fail_Count and Program_Fail_Count to prevent bleed-through from _Chip lines Bug fixes: - Fix substring match: 'Erase_Fail_Count' was matching 'Erase_Fail_Count_Chip' lines in both the first-pass and main attribute loops. Changed to token-boundary check (attr + ' ') in both places. - Add 32-bit overflow guard: raw SMART values > 0xFFFFFFFF are skipped at threshold comparison. Catches 0xFFFFFFFFFFFF sentinel values from unrecognized drives (was generating Critical Program_Fail_Cnt_Total tickets with value 281474976710655). 
BASE_SMART_THRESHOLDS: - High_Fly_Writes: 1/5 -> 100/500 - Program_Fail_Cnt_Total: 1/5 -> 50/200 - Erase_Fail_Count_Total: 1/5 -> 50/200 Global filtered_issues: removed Seek_Error_Rate and Command_Timeout (now handled per-profile); Raw_Read_Error_Rate kept as catch-all. Verified with --dry-run on all 4 servers: compute-storage-01, large1, compute-storage-gpu-01, pbs. Only legitimate issues surface. Co-Authored-By: Claude Sonnet 4.6 --- hwmonDaemon.py | 216 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 174 insertions(+), 42 deletions(-) diff --git a/hwmonDaemon.py b/hwmonDaemon.py index bc3a68f..9269a69 100644 --- a/hwmonDaemon.py +++ b/hwmonDaemon.py @@ -341,20 +341,49 @@ class SystemHealthMonitor: 'attributes': { 'Raw_Read_Error_Rate': { 'monitor': False, - 'description': 'WD drives use this as operation counter, not error count' + 'description': 'WD/HGST drives encode this as a multi-byte ECC operational counter, not a simple error rate' }, 'Seek_Error_Rate': { 'monitor': False, - 'description': 'WD drives use this as operation counter, not error count' + 'description': 'WD/HGST drives encode this as a multi-byte operational counter, not a simple error rate' + }, + 'Command_Timeout': { + 'monitor': False, + 'description': 'WD drives encode Command_Timeout as a packed 3-part 48-bit counter ' + '(bits 0-15: last-cycle count, 16-31: total, 32-47: max); ' + 'raw value is not comparable to a simple count threshold' } } }, 'Seagate': { + # Covers both consumer (Barracuda, IronWolf) and enterprise (Exos) lines. + # Consumer drives report Command_Timeout as "0 0 0" (parses to 0, harmless). + # Exos drives report it as a packed 3-part 48-bit counter identical to WD format. 
'aliases': ['Seagate', 'ST'], 'attributes': { 'Raw_Read_Error_Rate': { 'monitor': False, - 'description': 'Seagate drives use this as operation counter' + 'description': 'Seagate drives encode this as a multi-byte ECC counter ' + '(high 16-bits: ECC correction count, low 32-bits: total reads); ' + 'raw value is not a simple uncorrected read error count' + }, + 'Seek_Error_Rate': { + 'monitor': False, + 'description': 'Seagate drives encode this as a multi-byte operational counter; ' + 'raw value is not a simple seek error count' + }, + 'High_Fly_Writes': { + 'monitor': True, + 'behavior': 'countup', + 'warning_threshold': 100, + 'critical_threshold': 500, + 'description': 'Seagate fly-height write counter; values up to ~100 accumulated ' + 'over a drive lifetime are normal and not correlated with failure' + }, + 'Command_Timeout': { + 'monitor': False, + 'description': 'Seagate encodes Command_Timeout as a packed 3-part 48-bit counter; ' + 'raw value is not comparable to a simple count threshold' } } }, @@ -414,22 +443,56 @@ class SystemHealthMonitor: } } }, - 'OOS': { - 'aliases': ['OOS12000G', 'OOS'], + 'Toshiba': { + # Covers enterprise NAS/DC drives (MG04, MG06, MG08, MG09 series) and + # consumer/mobile drives (MQ03, MQ04 series). + # Enterprise MG-series: Raw_Read_Error_Rate and Seek_Error_Rate are real + # counters encoded as simple values (often 0 on healthy drives). + # Command_Timeout on enterprise MG-series is a simple count (not packed). + # Model prefixes: MG=enterprise NAS/DC, MQ=consumer/mobile, DT=desktop, + # HDWD=retail rebranded. Include both "TOSHIBA" (Model Family string) + # and bare model prefixes (Device Model when not in smartctl DB). 
+ 'aliases': ['Toshiba', 'TOSHIBA', 'MG04', 'MG06', 'MG07', 'MG08', 'MG09', 'MG10', + 'MQ01', 'MQ03', 'MQ04', 'HDWD'], 'attributes': { - # These drives seem to report very high error rates normally 'Raw_Read_Error_Rate': { - 'monitor': False, # Skip monitoring - seems to be a counter - 'description': 'OOS drives report high values normally' + 'monitor': False, + 'description': 'Toshiba drives typically report 0 here on healthy drives; ' + 'disabling to avoid noise on models that use it as a counter' }, 'Seek_Error_Rate': { - 'monitor': False, # Skip monitoring - seems to be a counter - 'description': 'OOS drives report high values normally' + 'monitor': False, + 'description': 'Toshiba drives typically report 0 here on healthy drives; ' + 'disabling to avoid noise on models that use it as a counter' }, 'Command_Timeout': { - 'warning_threshold': 100000000000, # 100 billion - 'critical_threshold': 200000000000, # 200 billion - 'description': 'OOS drives report very high timeout counters' + 'monitor': True, + 'behavior': 'countup', + 'warning_threshold': 1000, + 'critical_threshold': 5000, + 'description': 'Toshiba MG-series enterprise drives report a simple cumulative ' + 'command timeout count; raised threshold to account for normal ' + 'enterprise workload transients' + } + } + }, + 'OOS': { + # OOS drives (OOS12000G, OOS14000G) are Seagate-based OEM drives. + # They use the same packed counter encoding as Seagate for several attributes. 
+ 'aliases': ['OOS12000G', 'OOS14000G', 'OOS'], + 'attributes': { + 'Raw_Read_Error_Rate': { + 'monitor': False, + 'description': 'OOS drives (Seagate-based OEM) encode this as a multi-byte counter' + }, + 'Seek_Error_Rate': { + 'monitor': False, + 'description': 'OOS drives (Seagate-based OEM) encode this as a multi-byte counter' + }, + 'Command_Timeout': { + 'monitor': False, + 'description': 'OOS drives use Seagate-style packed 3-part 48-bit counter format; ' + 'raw value is not comparable to a simple count threshold' } } }, @@ -450,16 +513,51 @@ class SystemHealthMonitor: 'description': 'Total wear leveling operations performed', 'monitor': True }, - # Standard monitoring for all other attributes - 'Program_Fail_Count': { + # Attr 171/172: chip-level counters; Samsung firmware normalizes these + # internally — rely on the normalized VALUE, not raw count. + 'Program_Fail_Count_Chip': { + 'monitor': False, + 'description': 'Samsung attr 171: chip-level program fail counter; ' + 'Samsung firmware normalizes this internally' + }, + 'Erase_Fail_Count_Chip': { + 'monitor': False, + 'description': 'Samsung attr 172/176: chip-level erase fail counter; ' + 'Samsung firmware normalizes this internally. Raw values ' + 'in the hundreds are normal on older drives with high P/E cycles' + }, + # Attr 181: vendor-specific packed value on Samsung consumer SSDs. + # The raw value is NOT a simple program failure count; it reflects + # internal NAND controller state and the encoding varies by model. + # The normalized VALUE (100 = healthy) is the reliable health indicator. + 'Program_Fail_Cnt_Total': { + 'monitor': False, + 'description': 'Samsung attr 181: vendor-specific raw encoding — not a simple ' + 'failure count. Rely on the normalized VALUE field instead' + }, + # Attr 174: unexpected power loss counter — monitor but with lenient threshold + # as small counts are normal (power outages, hard shutdowns). 
+ 'Unexpect_Power_Loss_Ct': { 'monitor': True, - 'warning_threshold': 10, - 'critical_threshold': 20 + 'behavior': 'countup', + 'warning_threshold': 500, + 'critical_threshold': 2000, + 'description': 'Samsung attr 174: unexpected power loss / unsafe shutdown count' + }, + # Samsung SSDs only expose erase/program failures through the + # vendor-specific _Chip and _Total attributes above (already disabled). + # The generic 'Erase_Fail_Count' / 'Program_Fail_Count' names on Samsung + # are captured by substring from those same lines; disable here to prevent + # false alerts from that bleed-through. + 'Program_Fail_Count': { + 'monitor': False, + 'description': 'Samsung program failure reporting is via Program_Fail_Count_Chip/Total; ' + 'this generic name bleeds through from those attribute lines' }, 'Erase_Fail_Count': { - 'monitor': True, - 'warning_threshold': 10, - 'critical_threshold': 20 + 'monitor': False, + 'description': 'Samsung erase failure reporting is via Erase_Fail_Count_Chip (attrs 172/176); ' + 'this generic name bleeds through from those attribute lines' } } }, @@ -2052,14 +2150,15 @@ class SystemHealthMonitor: ]): continue - # Skip manufacturer-specific operation counters (not actual errors) - # These are monitored attributes that manufacturers use as counters + # Belt-and-suspenders filter for attributes that are known operation + # counters on many drives and are now disabled per manufacturer profile. + # Raw_Read_Error_Rate kept here as a catch-all for drives not yet + # covered by any profile. Seek_Error_Rate and Command_Timeout are + # fully handled at the profile level. 
if any(counter_name in issue for counter_name in [ - "Seek_Error_Rate", # Seagate/WD use as operation counter - "Command_Timeout", # OOS/Seagate use as operation counter - "Raw_Read_Error_Rate" # Seagate/WD use as operation counter + "Raw_Read_Error_Rate", # catch-all for unprofiled drives ]): - logger.debug(f"Filtering manufacturer operation counter from issues: {issue}") + logger.debug(f"Filtering global operation counter from issues: {issue}") continue filtered_issues.append(issue) @@ -2412,7 +2511,12 @@ class SystemHealthMonitor: return 'Micron' # Toshiba patterns - elif 'TOSHIBA' in model_upper: + # Enterprise NAS/DC: MG04, MG06, MG07, MG08, MG09, MG10 series + # Consumer/mobile: MQ01, MQ03, MQ04, MK series, DT01 desktop + elif any(pattern in model_upper for pattern in [ + 'TOSHIBA', 'MG04', 'MG06', 'MG07', 'MG08', 'MG09', 'MG10', + 'MQ01', 'MQ03', 'MQ04', 'HDWD', 'DT01' + ]): return 'Toshiba' # Ridata/Ritek patterns (for your existing special handling) @@ -2499,13 +2603,21 @@ class SystemHealthMonitor: 'Erase_Fail_Count': {'warning': 10, 'critical': 20}, 'Load_Cycle_Count': {'warning': 900000, 'critical': 1000000}, 'SSD_Life_Left': {'warning': 30, 'critical': 10}, - 'Program_Fail_Cnt_Total': {'warning': 1, 'critical': 5}, - 'Erase_Fail_Count_Total': {'warning': 1, 'critical': 5}, - # ADJUSTED: More lenient thresholds for error rates on unknown drives - 'Raw_Read_Error_Rate': {'warning': 10000000, 'critical': 100000000}, # Raised significantly - 'Seek_Error_Rate': {'warning': 10000000, 'critical': 100000000}, # Raised significantly - 'Command_Timeout': {'warning': 100, 'critical': 1000}, # Raised significantly - 'High_Fly_Writes': {'warning': 1, 'critical': 5}, + # Program/Erase fail totals: default thresholds for drives not matched by a + # manufacturer profile. Seagate/WD/OOS use counters here; Samsung disables + # these per-profile. Generic/unknown drives: any real failures are concerning. 
+ 'Program_Fail_Cnt_Total': {'warning': 50, 'critical': 200}, + 'Erase_Fail_Count_Total': {'warning': 50, 'critical': 200}, + # Raw_Read_Error_Rate / Seek_Error_Rate: Seagate and WD disable these per-profile. + # For truly unknown drives the thresholds below act as a last-resort catch. + 'Raw_Read_Error_Rate': {'warning': 10000000, 'critical': 100000000}, + 'Seek_Error_Rate': {'warning': 10000000, 'critical': 100000000}, + # Command_Timeout: Seagate/WD/OOS disable per-profile (packed format). + # Toshiba uses a raised profile threshold. Generic fallback below. + 'Command_Timeout': {'warning': 100, 'critical': 1000}, + # High_Fly_Writes: Seagate sets its own profile threshold (100/500). + # Default for unknown drives is raised significantly from the old 1/5. + 'High_Fly_Writes': {'warning': 100, 'critical': 500}, 'Airflow_Temperature_Cel': {'warning': 65, 'critical': 75}, 'G_Sense_Error_Rate': {'warning': 100, 'critical': 1000}, 'Power-Off_Retract_Count': {'warning': 100000, 'critical': 500000}, @@ -2625,18 +2737,21 @@ class SystemHealthMonitor: power_on_hours = self._parse_smart_value(parts[9]) smart_attributes_raw['Power_On_Hours'] = power_on_hours - # Handle SMART attributes with preference for _Total versions + # Handle SMART attributes with preference for _Total versions. + # Use token-boundary matching (attr + ' ') to avoid 'Erase_Fail_Count' + # matching lines that actually contain 'Erase_Fail_Count_Chip' etc. 
for attr in ['Erase_Fail_Count', 'Program_Fail_Count']: - # Check for _Total version first (more accurate) - if f'{attr}_Total' in line: + total_attr = f'{attr}_Total' + # Check for _Total version first (more accurate), also token-safe + if (total_attr + ' ') in line or line.endswith(total_attr): parts = line.split() if len(parts) >= 10: raw_value = self._parse_smart_value(parts[9]) - smart_attributes_raw[f'{attr}_Total'] = raw_value # Store as _Total - logger.debug(f"Found {attr}_Total: {raw_value}") + smart_attributes_raw[total_attr] = raw_value + logger.debug(f"Found {total_attr}: {raw_value}") break - # Only use non-_Total version if _Total not found AND not Ridata - elif attr in line and f'{attr}_Total' not in smart_attributes_raw: + # Only use non-_Total version if exact token match and _Total not yet found + elif ((attr + ' ') in line or line.endswith(attr)) and total_attr not in smart_attributes_raw: # Check if this is a Ridata drive and should skip regular counters if manufacturer_profile and manufacturer_profile.get('aliases', [{}])[0] == 'Ridata': logger.debug(f"Skipping {attr} for Ridata drive - using _Total version only") @@ -2709,7 +2824,11 @@ class SystemHealthMonitor: for line in output.split('\n'): for attr in ALL_SMART_ATTRIBUTES: - if attr in line and attr not in ['Wear_Leveling_Count']: # Wear_Leveling handled separately above + # Use token-boundary check: attribute name must be followed by whitespace + # (or be at line end) to avoid 'Erase_Fail_Count' matching + # 'Erase_Fail_Count_Chip' lines, etc. 
+ attr_present = (attr + ' ') in line or (attr + '\t') in line + if attr_present and attr not in ['Wear_Leveling_Count']: # Wear_Leveling handled separately above # Check if we should monitor this attribute if not self._should_monitor_attribute(attr, manufacturer_profile): logger.debug(f"Skipping {attr} - disabled for this manufacturer") @@ -2720,6 +2839,19 @@ class SystemHealthMonitor: raw_value = self._parse_smart_value(parts[9]) smart_health['attributes'][attr] = raw_value + # Guard: values exceeding 32-bit unsigned max are almost certainly + # packed multi-byte vendor fields (e.g. Seagate/WD Command_Timeout + # 3-counter format, or Samsung attr 181 vendor encoding). + # These should be suppressed by manufacturer profile settings, but + # this cap prevents false alarms from any drive not yet profiled. + if raw_value > 0xFFFFFFFF: + logger.debug( + f"Skipping threshold check for {attr} on {device}: " + f"raw value {raw_value} (0x{raw_value:x}) exceeds 32-bit max — " + f"likely a packed multi-byte vendor field" + ) + continue + # Get manufacturer-specific or default thresholds attr_thresholds = self._get_attribute_thresholds(attr, manufacturer_profile) if not attr_thresholds: