From 90dd8f3390fe472ad503307b2083590ab3a949c4 Mon Sep 17 00:00:00 2001
From: Jared Vititoe <jjvititoe1@gmail.com>
Date: Fri, 17 Apr 2026 10:09:54 -0400
Subject: [PATCH] fix: calibrate SMART thresholds per manufacturer to eliminate
 false positives

Investigated all 7 pending drive tickets in the ticketing DB. Identified
3 confirmed false positives and 1 parsing bug. Implemented manufacturer-
specific SMART profiles and a systemic substring-match fix.

Changes:
- Seagate: disable Seek_Error_Rate (packed counter), add High_Fly_Writes
  profile threshold (100/500 vs the old 1/5), disable Command_Timeout
  (packed 3-part 48-bit format on Exos series)
- Western Digital: disable Command_Timeout (same packed format)
- Toshiba: new profile covering MG04-MG10 enterprise and MQ01-MQ04
  consumer series; disable Raw/Seek counters, keep Command_Timeout with
  raised thresholds (1000/5000) since MG-series uses a real simple count;
  add model-prefix detection so MG08ACP16TE etc. match without "TOSHIBA"
  in the model string
- OOS: add OOS14000G alias (fleet has both 12TB and 14TB variants);
  replace billion-scale Command_Timeout threshold with monitor:False
- Samsung: disable Program_Fail_Cnt_Total (attr 181, vendor-encoded),
  Erase_Fail_Count_Chip (attrs 172/176, chip-level internal counter),
  Program_Fail_Count_Chip (attr 171); disable generic Erase_Fail_Count
  and Program_Fail_Count to prevent bleed-through from _Chip lines;
  add Unexpect_Power_Loss_Ct (attr 174) with lenient 500/2000 thresholds

Bug fixes:
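- Fix substring match: 'Erase_Fail_Count' was matching
  'Erase_Fail_Count_Chip' lines in both the first-pass and main attribute
  loops. Changed to a trailing token-boundary check (attr + ' ') in both
  places; the first pass also accepts the name at end of line, and the
  main loop also accepts attr + '\t'.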
- Add 32-bit overflow guard: raw SMART values > 0xFFFFFFFF are skipped
  at threshold comparison. Catches 0xFFFFFFFFFFFF sentinel values from
  unrecognized drives (was generating Critical Program_Fail_Cnt_Total
  tickets with value 281474976710655).

BASE_SMART_THRESHOLDS:
- High_Fly_Writes: 1/5 -> 100/500
- Program_Fail_Cnt_Total: 1/5 -> 50/200
- Erase_Fail_Count_Total: 1/5 -> 50/200

Global filtered_issues: removed Seek_Error_Rate and Command_Timeout
(now handled per-profile); Raw_Read_Error_Rate kept as catch-all.

Verified with --dry-run on all 4 servers: compute-storage-01, large1,
compute-storage-gpu-01, pbs. Only legitimate issues surface.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 hwmonDaemon.py | 216 +++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 174 insertions(+), 42 deletions(-)

diff --git a/hwmonDaemon.py b/hwmonDaemon.py
index bc3a68f..9269a69 100644
--- a/hwmonDaemon.py
+++ b/hwmonDaemon.py
@@ -341,20 +341,49 @@ class SystemHealthMonitor:
             'attributes': {
                 'Raw_Read_Error_Rate': {
                     'monitor': False,
-                    'description': 'WD drives use this as operation counter, not error count'
+                    'description': 'WD/HGST drives encode this as a multi-byte ECC operational counter, not a simple error rate'
                 },
                 'Seek_Error_Rate': {
                     'monitor': False,
-                    'description': 'WD drives use this as operation counter, not error count'
+                    'description': 'WD/HGST drives encode this as a multi-byte operational counter, not a simple error rate'
+                },
+                'Command_Timeout': {
+                    'monitor': False,
+                    'description': 'WD drives encode Command_Timeout as a packed 3-part 48-bit counter '
+                                   '(bits 0-15: last-cycle count, 16-31: total, 32-47: max); '
+                                   'raw value is not comparable to a simple count threshold'
                 }
             }
         },
         'Seagate': {
+            # Covers both consumer (Barracuda, IronWolf) and enterprise (Exos) lines.
+            # Consumer drives report Command_Timeout as "0 0 0" (parses to 0, harmless).
+            # Exos drives report it as a packed 3-part 48-bit counter identical to WD format.
             'aliases': ['Seagate', 'ST'],
             'attributes': {
                 'Raw_Read_Error_Rate': {
                     'monitor': False,
-                    'description': 'Seagate drives use this as operation counter'
+                    'description': 'Seagate drives encode this as a multi-byte ECC counter '
+                                   '(high 16-bits: ECC correction count, low 32-bits: total reads); '
+                                   'raw value is not a simple uncorrected read error count'
+                },
+                'Seek_Error_Rate': {
+                    'monitor': False,
+                    'description': 'Seagate drives encode this as a multi-byte operational counter; '
+                                   'raw value is not a simple seek error count'
+                },
+                'High_Fly_Writes': {
+                    'monitor': True,
+                    'behavior': 'countup',
+                    'warning_threshold': 100,
+                    'critical_threshold': 500,
+                    'description': 'Seagate fly-height write counter; values up to ~100 accumulated '
+                                   'over a drive lifetime are normal and not correlated with failure'
+                },
+                'Command_Timeout': {
+                    'monitor': False,
+                    'description': 'Seagate encodes Command_Timeout as a packed 3-part 48-bit counter; '
+                                   'raw value is not comparable to a simple count threshold'
                 }
             }
         },
@@ -414,22 +443,56 @@ class SystemHealthMonitor:
                 }
             }
         },
-        'OOS': {
-            'aliases': ['OOS12000G', 'OOS'],
+        'Toshiba': {
+            # Covers enterprise NAS/DC drives (MG04, MG06-MG10 series) and
+            # consumer/mobile drives (MQ01, MQ03, MQ04 series).
+            # Enterprise MG-series: Raw_Read_Error_Rate and Seek_Error_Rate are real
+            # counters encoded as simple values (often 0 on healthy drives).
+            # Command_Timeout on enterprise MG-series is a simple count (not packed).
+            # Model prefixes: MG=enterprise NAS/DC, MQ=consumer/mobile, DT=desktop,
+            # HDWD=retail rebranded. Include both "TOSHIBA" (Model Family string)
+            # and bare model prefixes (Device Model when not in smartctl DB).
+            'aliases': ['Toshiba', 'TOSHIBA', 'MG04', 'MG06', 'MG07', 'MG08', 'MG09', 'MG10',
+                        'MQ01', 'MQ03', 'MQ04', 'HDWD'],
             'attributes': {
-                # These drives seem to report very high error rates normally
                 'Raw_Read_Error_Rate': {
-                    'monitor': False,  # Skip monitoring - seems to be a counter
-                    'description': 'OOS drives report high values normally'
+                    'monitor': False,
+                    'description': 'Toshiba drives typically report 0 here on healthy drives; '
+                                   'disabling to avoid noise on models that use it as a counter'
                 },
                 'Seek_Error_Rate': {
-                    'monitor': False,  # Skip monitoring - seems to be a counter
-                    'description': 'OOS drives report high values normally'
+                    'monitor': False,
+                    'description': 'Toshiba drives typically report 0 here on healthy drives; '
+                                   'disabling to avoid noise on models that use it as a counter'
                 },
                 'Command_Timeout': {
-                    'warning_threshold': 100000000000,  # 100 billion
-                    'critical_threshold': 200000000000,  # 200 billion
-                    'description': 'OOS drives report very high timeout counters'
+                    'monitor': True,
+                    'behavior': 'countup',
+                    'warning_threshold': 1000,
+                    'critical_threshold': 5000,
+                    'description': 'Toshiba MG-series enterprise drives report a simple cumulative '
+                                   'command timeout count; raised threshold to account for normal '
+                                   'enterprise workload transients'
+                }
+            }
+        },
+        'OOS': {
+            # OOS drives (OOS12000G, OOS14000G) are Seagate-based OEM drives.
+            # They use the same packed counter encoding as Seagate for several attributes.
+            'aliases': ['OOS12000G', 'OOS14000G', 'OOS'],
+            'attributes': {
+                'Raw_Read_Error_Rate': {
+                    'monitor': False,
+                    'description': 'OOS drives (Seagate-based OEM) encode this as a multi-byte counter'
+                },
+                'Seek_Error_Rate': {
+                    'monitor': False,
+                    'description': 'OOS drives (Seagate-based OEM) encode this as a multi-byte counter'
+                },
+                'Command_Timeout': {
+                    'monitor': False,
+                    'description': 'OOS drives use Seagate-style packed 3-part 48-bit counter format; '
+                                   'raw value is not comparable to a simple count threshold'
                 }
             }
         },
@@ -450,16 +513,51 @@ class SystemHealthMonitor:
                     'description': 'Total wear leveling operations performed',
                     'monitor': True
                 },
-                # Standard monitoring for all other attributes
-                'Program_Fail_Count': {
+                # Attr 171/172/176: chip-level counters; Samsung firmware normalizes these
+                # internally — rely on the normalized VALUE, not raw count.
+                'Program_Fail_Count_Chip': {
+                    'monitor': False,
+                    'description': 'Samsung attr 171: chip-level program fail counter; '
+                                   'Samsung firmware normalizes this internally'
+                },
+                'Erase_Fail_Count_Chip': {
+                    'monitor': False,
+                    'description': 'Samsung attr 172/176: chip-level erase fail counter; '
+                                   'Samsung firmware normalizes this internally. Raw values '
+                                   'in the hundreds are normal on older drives with high P/E cycles'
+                },
+                # Attr 181: vendor-specific packed value on Samsung consumer SSDs.
+                # The raw value is NOT a simple program failure count; it reflects
+                # internal NAND controller state and the encoding varies by model.
+                # The normalized VALUE (100 = healthy) is the reliable health indicator.
+                'Program_Fail_Cnt_Total': {
+                    'monitor': False,
+                    'description': 'Samsung attr 181: vendor-specific raw encoding — not a simple '
+                                   'failure count. Rely on the normalized VALUE field instead'
+                },
+                # Attr 174: unexpected power loss counter — monitor but with lenient threshold
+                # as small counts are normal (power outages, hard shutdowns).
+                'Unexpect_Power_Loss_Ct': {
                     'monitor': True,
-                    'warning_threshold': 10,
-                    'critical_threshold': 20
+                    'behavior': 'countup',
+                    'warning_threshold': 500,
+                    'critical_threshold': 2000,
+                    'description': 'Samsung attr 174: unexpected power loss / unsafe shutdown count'
+                },
+                # Samsung SSDs only expose erase/program failures through the
+                # vendor-specific _Chip and _Total attributes above (already disabled).
+                # The generic 'Erase_Fail_Count' / 'Program_Fail_Count' names on Samsung
+                # are captured by substring from those same lines; disable here to prevent
+                # false alerts from that bleed-through.
+                'Program_Fail_Count': {
+                    'monitor': False,
+                    'description': 'Samsung program failure reporting is via Program_Fail_Count_Chip/Total; '
+                                   'this generic name bleeds through from those attribute lines'
                 },
                 'Erase_Fail_Count': {
-                    'monitor': True,
-                    'warning_threshold': 10,
-                    'critical_threshold': 20
+                    'monitor': False,
+                    'description': 'Samsung erase failure reporting is via Erase_Fail_Count_Chip (attrs 172/176); '
+                                   'this generic name bleeds through from those attribute lines'
                 }
             }
         },
@@ -2052,14 +2150,15 @@ class SystemHealthMonitor:
                     ]):
                         continue
 
-                    # Skip manufacturer-specific operation counters (not actual errors)
-                    # These are monitored attributes that manufacturers use as counters
+                    # Belt-and-suspenders filter for attributes that are known operation
+                    # counters on many drives and are now disabled per manufacturer profile.
+                    # Raw_Read_Error_Rate kept here as a catch-all for drives not yet
+                    # covered by any profile. Seek_Error_Rate and Command_Timeout are
+                    # fully handled at the profile level.
                     if any(counter_name in issue for counter_name in [
-                        "Seek_Error_Rate",      # Seagate/WD use as operation counter
-                        "Command_Timeout",       # OOS/Seagate use as operation counter
-                        "Raw_Read_Error_Rate"   # Seagate/WD use as operation counter
+                        "Raw_Read_Error_Rate",  # catch-all for unprofiled drives
                     ]):
-                        logger.debug(f"Filtering manufacturer operation counter from issues: {issue}")
+                        logger.debug(f"Filtering global operation counter from issues: {issue}")
                         continue
 
                     filtered_issues.append(issue)
@@ -2412,7 +2511,12 @@ class SystemHealthMonitor:
             return 'Micron'
 
         # Toshiba patterns
-        elif 'TOSHIBA' in model_upper:
+        # Enterprise NAS/DC: MG04, MG06, MG07, MG08, MG09, MG10 series
+        # Consumer/mobile: MQ01, MQ03, MQ04; retail: HDWD; desktop: DT01
+        elif any(pattern in model_upper for pattern in [
+            'TOSHIBA', 'MG04', 'MG06', 'MG07', 'MG08', 'MG09', 'MG10',
+            'MQ01', 'MQ03', 'MQ04', 'HDWD', 'DT01'
+        ]):
             return 'Toshiba'
 
         # Ridata/Ritek patterns (for your existing special handling)
@@ -2499,13 +2603,21 @@ class SystemHealthMonitor:
             'Erase_Fail_Count': {'warning': 10, 'critical': 20},
             'Load_Cycle_Count': {'warning': 900000, 'critical': 1000000},
             'SSD_Life_Left': {'warning': 30, 'critical': 10},
-            'Program_Fail_Cnt_Total': {'warning': 1, 'critical': 5},
-            'Erase_Fail_Count_Total': {'warning': 1, 'critical': 5},
-            # ADJUSTED: More lenient thresholds for error rates on unknown drives
-            'Raw_Read_Error_Rate': {'warning': 10000000, 'critical': 100000000},  # Raised significantly
-            'Seek_Error_Rate': {'warning': 10000000, 'critical': 100000000},     # Raised significantly
-            'Command_Timeout': {'warning': 100, 'critical': 1000},               # Raised significantly
-            'High_Fly_Writes': {'warning': 1, 'critical': 5},
+            # Program/Erase fail totals: default thresholds for drives not matched by a
+            # manufacturer profile. Seagate/WD/OOS use counters here; Samsung disables
+            # these per-profile. Generic/unknown drives: any real failures are concerning.
+            'Program_Fail_Cnt_Total': {'warning': 50, 'critical': 200},
+            'Erase_Fail_Count_Total': {'warning': 50, 'critical': 200},
+            # Raw_Read_Error_Rate / Seek_Error_Rate: Seagate and WD disable these per-profile.
+            # For truly unknown drives the thresholds below act as a last-resort catch.
+            'Raw_Read_Error_Rate': {'warning': 10000000, 'critical': 100000000},
+            'Seek_Error_Rate': {'warning': 10000000, 'critical': 100000000},
+            # Command_Timeout: Seagate/WD/OOS disable per-profile (packed format).
+            # Toshiba uses a raised profile threshold. Generic fallback below.
+            'Command_Timeout': {'warning': 100, 'critical': 1000},
+            # High_Fly_Writes: Seagate sets its own profile threshold (100/500).
+            # Default for unknown drives is raised significantly from the old 1/5.
+            'High_Fly_Writes': {'warning': 100, 'critical': 500},
             'Airflow_Temperature_Cel': {'warning': 65, 'critical': 75},
             'G_Sense_Error_Rate': {'warning': 100, 'critical': 1000},
             'Power-Off_Retract_Count': {'warning': 100000, 'critical': 500000},
@@ -2625,18 +2737,21 @@ class SystemHealthMonitor:
                         power_on_hours = self._parse_smart_value(parts[9])
                         smart_attributes_raw['Power_On_Hours'] = power_on_hours
 
-                # Handle SMART attributes with preference for _Total versions
+                # Handle SMART attributes with preference for _Total versions.
+                # Use token-boundary matching (attr + ' ') to avoid 'Erase_Fail_Count'
+                # matching lines that actually contain 'Erase_Fail_Count_Chip' etc.
                 for attr in ['Erase_Fail_Count', 'Program_Fail_Count']:
-                    # Check for _Total version first (more accurate)
-                    if f'{attr}_Total' in line:
+                    total_attr = f'{attr}_Total'
+                    # Check for _Total version first (more accurate), also token-safe
+                    if (total_attr + ' ') in line or line.endswith(total_attr):
                         parts = line.split()
                         if len(parts) >= 10:
                             raw_value = self._parse_smart_value(parts[9])
-                            smart_attributes_raw[f'{attr}_Total'] = raw_value  # Store as _Total
-                            logger.debug(f"Found {attr}_Total: {raw_value}")
+                            smart_attributes_raw[total_attr] = raw_value
+                            logger.debug(f"Found {total_attr}: {raw_value}")
                             break
-                    # Only use non-_Total version if _Total not found AND not Ridata
-                    elif attr in line and f'{attr}_Total' not in smart_attributes_raw:
+                    # Only use non-_Total version if exact token match and _Total not yet found
+                    elif ((attr + ' ') in line or line.endswith(attr)) and total_attr not in smart_attributes_raw:
                         # Check if this is a Ridata drive and should skip regular counters
                         if manufacturer_profile and manufacturer_profile.get('aliases', [{}])[0] == 'Ridata':
                             logger.debug(f"Skipping {attr} for Ridata drive - using _Total version only")
@@ -2709,7 +2824,11 @@ class SystemHealthMonitor:
 
             for line in output.split('\n'):
                 for attr in ALL_SMART_ATTRIBUTES:
-                    if attr in line and attr not in ['Wear_Leveling_Count']:  # Wear_Leveling handled separately above
+                    # Use token-boundary check: attribute name must be followed by a
+                    # space or tab to avoid 'Erase_Fail_Count' matching
+                    # 'Erase_Fail_Count_Chip' lines, etc.
+                    attr_present = (attr + ' ') in line or (attr + '\t') in line
+                    if attr_present and attr not in ['Wear_Leveling_Count']:  # Wear_Leveling handled separately above
                         # Check if we should monitor this attribute
                         if not self._should_monitor_attribute(attr, manufacturer_profile):
                             logger.debug(f"Skipping {attr} - disabled for this manufacturer")
@@ -2720,6 +2839,19 @@ class SystemHealthMonitor:
                             raw_value = self._parse_smart_value(parts[9])
                             smart_health['attributes'][attr] = raw_value
 
+                            # Guard: values exceeding 32-bit unsigned max are almost certainly
+                            # packed multi-byte vendor fields (e.g. Seagate/WD Command_Timeout
+                            # 3-counter format, or Samsung attr 181 vendor encoding).
+                            # These should be suppressed by manufacturer profile settings, but
+                            # this cap prevents false alarms from any drive not yet profiled.
+                            if raw_value > 0xFFFFFFFF:
+                                logger.debug(
+                                    f"Skipping threshold check for {attr} on {device}: "
+                                    f"raw value {raw_value} (0x{raw_value:x}) exceeds 32-bit max — "
+                                    f"likely a packed multi-byte vendor field"
+                                )
+                                continue
+
                             # Get manufacturer-specific or default thresholds
                             attr_thresholds = self._get_attribute_thresholds(attr, manufacturer_profile)
                             if not attr_thresholds: