fix: calibrate SMART thresholds per manufacturer to eliminate false positives
Investigated all 7 pending drive tickets in the ticketing DB. Identified 3 confirmed false positives and 1 parsing bug. Implemented manufacturer-specific SMART profiles and a systemic substring-match fix.

Changes:
- Seagate: disable Seek_Error_Rate (packed counter), add High_Fly_Writes profile threshold (100/500 vs the old 1/5), disable Command_Timeout (packed 3-part 48-bit format on Exos series)
- Western Digital: disable Command_Timeout (same packed format)
- Toshiba: new profile covering MG04-MG10 enterprise and MQ01-MQ04 consumer series; disable the Raw_Read_Error_Rate and Seek_Error_Rate counters, keep Command_Timeout with raised thresholds (1000/5000) since MG-series uses a real simple count; add model-prefix detection so MG08ACP16TE etc. match without "TOSHIBA" in the model string
- OOS: add OOS14000G alias (fleet has both 12TB and 14TB variants); replace billion-scale Command_Timeout threshold with 'monitor': False
- Samsung: disable Program_Fail_Cnt_Total (attr 181, vendor-encoded), Erase_Fail_Count_Chip (attrs 172/176, chip-level internal counter), Program_Fail_Count_Chip (attr 171); disable generic Erase_Fail_Count and Program_Fail_Count to prevent bleed-through from _Chip lines

Bug fixes:
- Fix substring match: 'Erase_Fail_Count' was matching 'Erase_Fail_Count_Chip' lines in both the first-pass and main attribute loops. Changed to token-boundary check (attr + ' ') in both places.
- Add 32-bit overflow guard: raw SMART values > 0xFFFFFFFF are skipped at threshold comparison. Catches 0xFFFFFFFFFFFF sentinel values from unrecognized drives (was generating Critical Program_Fail_Cnt_Total tickets with value 281474976710655).

BASE_SMART_THRESHOLDS:
- High_Fly_Writes: 1/5 -> 100/500
- Program_Fail_Cnt_Total: 1/5 -> 50/200
- Erase_Fail_Count_Total: 1/5 -> 50/200

Global filtered_issues: removed Seek_Error_Rate and Command_Timeout (now handled per-profile); Raw_Read_Error_Rate kept as catch-all.

Verified with --dry-run on all 4 servers: compute-storage-01, large1, compute-storage-gpu-01, pbs. Only legitimate issues surface.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+174
-42
@@ -341,20 +341,49 @@ class SystemHealthMonitor:
|
||||
'attributes': {
|
||||
'Raw_Read_Error_Rate': {
|
||||
'monitor': False,
|
||||
'description': 'WD drives use this as operation counter, not error count'
|
||||
'description': 'WD/HGST drives encode this as a multi-byte ECC operational counter, not a simple error rate'
|
||||
},
|
||||
'Seek_Error_Rate': {
|
||||
'monitor': False,
|
||||
'description': 'WD drives use this as operation counter, not error count'
|
||||
'description': 'WD/HGST drives encode this as a multi-byte operational counter, not a simple error rate'
|
||||
},
|
||||
'Command_Timeout': {
|
||||
'monitor': False,
|
||||
'description': 'WD drives encode Command_Timeout as a packed 3-part 48-bit counter '
|
||||
'(bits 0-15: last-cycle count, 16-31: total, 32-47: max); '
|
||||
'raw value is not comparable to a simple count threshold'
|
||||
}
|
||||
}
|
||||
},
|
||||
'Seagate': {
|
||||
# Covers both consumer (Barracuda, IronWolf) and enterprise (Exos) lines.
|
||||
# Consumer drives report Command_Timeout as "0 0 0" (parses to 0, harmless).
|
||||
# Exos drives report it as a packed 3-part 48-bit counter identical to WD format.
|
||||
'aliases': ['Seagate', 'ST'],
|
||||
'attributes': {
|
||||
'Raw_Read_Error_Rate': {
|
||||
'monitor': False,
|
||||
'description': 'Seagate drives use this as operation counter'
|
||||
'description': 'Seagate drives encode this as a multi-byte ECC counter '
|
||||
'(high 16-bits: ECC correction count, low 32-bits: total reads); '
|
||||
'raw value is not a simple uncorrected read error count'
|
||||
},
|
||||
'Seek_Error_Rate': {
|
||||
'monitor': False,
|
||||
'description': 'Seagate drives encode this as a multi-byte operational counter; '
|
||||
'raw value is not a simple seek error count'
|
||||
},
|
||||
'High_Fly_Writes': {
|
||||
'monitor': True,
|
||||
'behavior': 'countup',
|
||||
'warning_threshold': 100,
|
||||
'critical_threshold': 500,
|
||||
'description': 'Seagate fly-height write counter; values up to ~100 accumulated '
|
||||
'over a drive lifetime are normal and not correlated with failure'
|
||||
},
|
||||
'Command_Timeout': {
|
||||
'monitor': False,
|
||||
'description': 'Seagate encodes Command_Timeout as a packed 3-part 48-bit counter; '
|
||||
'raw value is not comparable to a simple count threshold'
|
||||
}
|
||||
}
|
||||
},
|
||||
@@ -414,22 +443,56 @@ class SystemHealthMonitor:
|
||||
}
|
||||
}
|
||||
},
|
||||
'OOS': {
|
||||
'aliases': ['OOS12000G', 'OOS'],
|
||||
'Toshiba': {
|
||||
# Covers enterprise NAS/DC drives (MG04, MG06, MG07, MG08, MG09, MG10 series) and
|
||||
# consumer/mobile drives (MQ01, MQ03, MQ04 series).
|
||||
# Enterprise MG-series: Raw_Read_Error_Rate and Seek_Error_Rate are real
|
||||
# counters encoded as simple values (often 0 on healthy drives).
|
||||
# Command_Timeout on enterprise MG-series is a simple count (not packed).
|
||||
# Model prefixes: MG=enterprise NAS/DC, MQ=consumer/mobile, DT=desktop,
|
||||
# HDWD=retail rebranded. Include both "TOSHIBA" (Model Family string)
|
||||
# and bare model prefixes (Device Model when not in smartctl DB).
|
||||
'aliases': ['Toshiba', 'TOSHIBA', 'MG04', 'MG06', 'MG07', 'MG08', 'MG09', 'MG10',
|
||||
'MQ01', 'MQ03', 'MQ04', 'HDWD'],
|
||||
'attributes': {
|
||||
# These drives seem to report very high error rates normally
|
||||
'Raw_Read_Error_Rate': {
|
||||
'monitor': False, # Skip monitoring - seems to be a counter
|
||||
'description': 'OOS drives report high values normally'
|
||||
'monitor': False,
|
||||
'description': 'Toshiba drives typically report 0 here on healthy drives; '
|
||||
'disabling to avoid noise on models that use it as a counter'
|
||||
},
|
||||
'Seek_Error_Rate': {
|
||||
'monitor': False, # Skip monitoring - seems to be a counter
|
||||
'description': 'OOS drives report high values normally'
|
||||
'monitor': False,
|
||||
'description': 'Toshiba drives typically report 0 here on healthy drives; '
|
||||
'disabling to avoid noise on models that use it as a counter'
|
||||
},
|
||||
'Command_Timeout': {
|
||||
'warning_threshold': 100000000000, # 100 billion
|
||||
'critical_threshold': 200000000000, # 200 billion
|
||||
'description': 'OOS drives report very high timeout counters'
|
||||
'monitor': True,
|
||||
'behavior': 'countup',
|
||||
'warning_threshold': 1000,
|
||||
'critical_threshold': 5000,
|
||||
'description': 'Toshiba MG-series enterprise drives report a simple cumulative '
|
||||
'command timeout count; raised threshold to account for normal '
|
||||
'enterprise workload transients'
|
||||
}
|
||||
}
|
||||
},
|
||||
'OOS': {
|
||||
# OOS drives (OOS12000G, OOS14000G) are Seagate-based OEM drives.
|
||||
# They use the same packed counter encoding as Seagate for several attributes.
|
||||
'aliases': ['OOS12000G', 'OOS14000G', 'OOS'],
|
||||
'attributes': {
|
||||
'Raw_Read_Error_Rate': {
|
||||
'monitor': False,
|
||||
'description': 'OOS drives (Seagate-based OEM) encode this as a multi-byte counter'
|
||||
},
|
||||
'Seek_Error_Rate': {
|
||||
'monitor': False,
|
||||
'description': 'OOS drives (Seagate-based OEM) encode this as a multi-byte counter'
|
||||
},
|
||||
'Command_Timeout': {
|
||||
'monitor': False,
|
||||
'description': 'OOS drives use Seagate-style packed 3-part 48-bit counter format; '
|
||||
'raw value is not comparable to a simple count threshold'
|
||||
}
|
||||
}
|
||||
},
|
||||
@@ -450,16 +513,51 @@ class SystemHealthMonitor:
|
||||
'description': 'Total wear leveling operations performed',
|
||||
'monitor': True
|
||||
},
|
||||
# Standard monitoring for all other attributes
|
||||
'Program_Fail_Count': {
|
||||
# Attrs 171/172/176: chip-level counters; Samsung firmware normalizes these
|
||||
# internally — rely on the normalized VALUE, not raw count.
|
||||
'Program_Fail_Count_Chip': {
|
||||
'monitor': False,
|
||||
'description': 'Samsung attr 171: chip-level program fail counter; '
|
||||
'Samsung firmware normalizes this internally'
|
||||
},
|
||||
'Erase_Fail_Count_Chip': {
|
||||
'monitor': False,
|
||||
'description': 'Samsung attr 172/176: chip-level erase fail counter; '
|
||||
'Samsung firmware normalizes this internally. Raw values '
|
||||
'in the hundreds are normal on older drives with high P/E cycles'
|
||||
},
|
||||
# Attr 181: vendor-specific packed value on Samsung consumer SSDs.
|
||||
# The raw value is NOT a simple program failure count; it reflects
|
||||
# internal NAND controller state and the encoding varies by model.
|
||||
# The normalized VALUE (100 = healthy) is the reliable health indicator.
|
||||
'Program_Fail_Cnt_Total': {
|
||||
'monitor': False,
|
||||
'description': 'Samsung attr 181: vendor-specific raw encoding — not a simple '
|
||||
'failure count. Rely on the normalized VALUE field instead'
|
||||
},
|
||||
# Attr 174: unexpected power loss counter — monitor but with lenient threshold
|
||||
# as small counts are normal (power outages, hard shutdowns).
|
||||
'Unexpect_Power_Loss_Ct': {
|
||||
'monitor': True,
|
||||
'warning_threshold': 10,
|
||||
'critical_threshold': 20
|
||||
'behavior': 'countup',
|
||||
'warning_threshold': 500,
|
||||
'critical_threshold': 2000,
|
||||
'description': 'Samsung attr 174: unexpected power loss / unsafe shutdown count'
|
||||
},
|
||||
# Samsung SSDs only expose erase/program failures through the
|
||||
# vendor-specific _Chip and _Total attributes above (already disabled).
|
||||
# The generic 'Erase_Fail_Count' / 'Program_Fail_Count' names on Samsung
|
||||
# are captured by substring from those same lines; disable here to prevent
|
||||
# false alerts from that bleed-through.
|
||||
'Program_Fail_Count': {
|
||||
'monitor': False,
|
||||
'description': 'Samsung program failure reporting is via Program_Fail_Count_Chip/Total; '
|
||||
'this generic name bleeds through from those attribute lines'
|
||||
},
|
||||
'Erase_Fail_Count': {
|
||||
'monitor': True,
|
||||
'warning_threshold': 10,
|
||||
'critical_threshold': 20
|
||||
'monitor': False,
|
||||
'description': 'Samsung erase failure reporting is via Erase_Fail_Count_Chip (attrs 172/176); '
|
||||
'this generic name bleeds through from those attribute lines'
|
||||
}
|
||||
}
|
||||
},
|
||||
@@ -2052,14 +2150,15 @@ class SystemHealthMonitor:
|
||||
]):
|
||||
continue
|
||||
|
||||
# Skip manufacturer-specific operation counters (not actual errors)
|
||||
# These are monitored attributes that manufacturers use as counters
|
||||
# Belt-and-suspenders filter for attributes that are known operation
|
||||
# counters on many drives and are now disabled per manufacturer profile.
|
||||
# Raw_Read_Error_Rate kept here as a catch-all for drives not yet
|
||||
# covered by any profile. Seek_Error_Rate and Command_Timeout are
|
||||
# fully handled at the profile level.
|
||||
if any(counter_name in issue for counter_name in [
|
||||
"Seek_Error_Rate", # Seagate/WD use as operation counter
|
||||
"Command_Timeout", # OOS/Seagate use as operation counter
|
||||
"Raw_Read_Error_Rate" # Seagate/WD use as operation counter
|
||||
"Raw_Read_Error_Rate", # catch-all for unprofiled drives
|
||||
]):
|
||||
logger.debug(f"Filtering manufacturer operation counter from issues: {issue}")
|
||||
logger.debug(f"Filtering global operation counter from issues: {issue}")
|
||||
continue
|
||||
|
||||
filtered_issues.append(issue)
|
||||
@@ -2412,7 +2511,12 @@ class SystemHealthMonitor:
|
||||
return 'Micron'
|
||||
|
||||
# Toshiba patterns
|
||||
elif 'TOSHIBA' in model_upper:
|
||||
# Enterprise NAS/DC: MG04, MG06, MG07, MG08, MG09, MG10 series
|
||||
# Consumer/mobile: MQ01, MQ03, MQ04; retail: HDWD; desktop: DT01 (MK series has no prefix entry and matches only via 'TOSHIBA')
|
||||
elif any(pattern in model_upper for pattern in [
|
||||
'TOSHIBA', 'MG04', 'MG06', 'MG07', 'MG08', 'MG09', 'MG10',
|
||||
'MQ01', 'MQ03', 'MQ04', 'HDWD', 'DT01'
|
||||
]):
|
||||
return 'Toshiba'
|
||||
|
||||
# Ridata/Ritek patterns (for your existing special handling)
|
||||
@@ -2499,13 +2603,21 @@ class SystemHealthMonitor:
|
||||
'Erase_Fail_Count': {'warning': 10, 'critical': 20},
|
||||
'Load_Cycle_Count': {'warning': 900000, 'critical': 1000000},
|
||||
'SSD_Life_Left': {'warning': 30, 'critical': 10},
|
||||
'Program_Fail_Cnt_Total': {'warning': 1, 'critical': 5},
|
||||
'Erase_Fail_Count_Total': {'warning': 1, 'critical': 5},
|
||||
# ADJUSTED: More lenient thresholds for error rates on unknown drives
|
||||
'Raw_Read_Error_Rate': {'warning': 10000000, 'critical': 100000000}, # Raised significantly
|
||||
'Seek_Error_Rate': {'warning': 10000000, 'critical': 100000000}, # Raised significantly
|
||||
'Command_Timeout': {'warning': 100, 'critical': 1000}, # Raised significantly
|
||||
'High_Fly_Writes': {'warning': 1, 'critical': 5},
|
||||
# Program/Erase fail totals: default thresholds for drives not matched by a
|
||||
# manufacturer profile. Seagate/WD/OOS use counters here; Samsung disables
|
||||
# these per-profile. Generic/unknown drives: any real failures are concerning.
|
||||
'Program_Fail_Cnt_Total': {'warning': 50, 'critical': 200},
|
||||
'Erase_Fail_Count_Total': {'warning': 50, 'critical': 200},
|
||||
# Raw_Read_Error_Rate / Seek_Error_Rate: Seagate and WD disable these per-profile.
|
||||
# For truly unknown drives the thresholds below act as a last-resort catch.
|
||||
'Raw_Read_Error_Rate': {'warning': 10000000, 'critical': 100000000},
|
||||
'Seek_Error_Rate': {'warning': 10000000, 'critical': 100000000},
|
||||
# Command_Timeout: Seagate/WD/OOS disable per-profile (packed format).
|
||||
# Toshiba uses a raised profile threshold. Generic fallback below.
|
||||
'Command_Timeout': {'warning': 100, 'critical': 1000},
|
||||
# High_Fly_Writes: Seagate sets its own profile threshold (100/500).
|
||||
# Default for unknown drives is raised significantly from the old 1/5.
|
||||
'High_Fly_Writes': {'warning': 100, 'critical': 500},
|
||||
'Airflow_Temperature_Cel': {'warning': 65, 'critical': 75},
|
||||
'G_Sense_Error_Rate': {'warning': 100, 'critical': 1000},
|
||||
'Power-Off_Retract_Count': {'warning': 100000, 'critical': 500000},
|
||||
@@ -2625,18 +2737,21 @@ class SystemHealthMonitor:
|
||||
power_on_hours = self._parse_smart_value(parts[9])
|
||||
smart_attributes_raw['Power_On_Hours'] = power_on_hours
|
||||
|
||||
# Handle SMART attributes with preference for _Total versions
|
||||
# Handle SMART attributes with preference for _Total versions.
|
||||
# Use token-boundary matching (attr + ' ') to avoid 'Erase_Fail_Count'
|
||||
# matching lines that actually contain 'Erase_Fail_Count_Chip' etc.
|
||||
for attr in ['Erase_Fail_Count', 'Program_Fail_Count']:
|
||||
# Check for _Total version first (more accurate)
|
||||
if f'{attr}_Total' in line:
|
||||
total_attr = f'{attr}_Total'
|
||||
# Check for _Total version first (more accurate), also token-safe
|
||||
if (total_attr + ' ') in line or line.endswith(total_attr):
|
||||
parts = line.split()
|
||||
if len(parts) >= 10:
|
||||
raw_value = self._parse_smart_value(parts[9])
|
||||
smart_attributes_raw[f'{attr}_Total'] = raw_value # Store as _Total
|
||||
logger.debug(f"Found {attr}_Total: {raw_value}")
|
||||
smart_attributes_raw[total_attr] = raw_value
|
||||
logger.debug(f"Found {total_attr}: {raw_value}")
|
||||
break
|
||||
# Only use non-_Total version if _Total not found AND not Ridata
|
||||
elif attr in line and f'{attr}_Total' not in smart_attributes_raw:
|
||||
# Only use non-_Total version if exact token match and _Total not yet found
|
||||
elif ((attr + ' ') in line or line.endswith(attr)) and total_attr not in smart_attributes_raw:
|
||||
# Check if this is a Ridata drive and should skip regular counters
|
||||
if manufacturer_profile and manufacturer_profile.get('aliases', [{}])[0] == 'Ridata':
|
||||
logger.debug(f"Skipping {attr} for Ridata drive - using _Total version only")
|
||||
@@ -2709,7 +2824,11 @@ class SystemHealthMonitor:
|
||||
|
||||
for line in output.split('\n'):
|
||||
for attr in ALL_SMART_ATTRIBUTES:
|
||||
if attr in line and attr not in ['Wear_Leveling_Count']: # Wear_Leveling handled separately above
|
||||
# Use token-boundary check: attribute name must be followed by whitespace
|
||||
# (a space or tab; a bare name at line end is not matched) to avoid 'Erase_Fail_Count' matching
|
||||
# 'Erase_Fail_Count_Chip' lines, etc.
|
||||
attr_present = (attr + ' ') in line or (attr + '\t') in line
|
||||
if attr_present and attr not in ['Wear_Leveling_Count']: # Wear_Leveling handled separately above
|
||||
# Check if we should monitor this attribute
|
||||
if not self._should_monitor_attribute(attr, manufacturer_profile):
|
||||
logger.debug(f"Skipping {attr} - disabled for this manufacturer")
|
||||
@@ -2720,6 +2839,19 @@ class SystemHealthMonitor:
|
||||
raw_value = self._parse_smart_value(parts[9])
|
||||
smart_health['attributes'][attr] = raw_value
|
||||
|
||||
# Guard: values exceeding 32-bit unsigned max are almost certainly
|
||||
# packed multi-byte vendor fields (e.g. Seagate/WD Command_Timeout
|
||||
# 3-counter format, or Samsung attr 181 vendor encoding).
|
||||
# These should be suppressed by manufacturer profile settings, but
|
||||
# this cap prevents false alarms from any drive not yet profiled.
|
||||
if raw_value > 0xFFFFFFFF:
|
||||
logger.debug(
|
||||
f"Skipping threshold check for {attr} on {device}: "
|
||||
f"raw value {raw_value} (0x{raw_value:x}) exceeds 32-bit max — "
|
||||
f"likely a packed multi-byte vendor field"
|
||||
)
|
||||
continue
|
||||
|
||||
# Get manufacturer-specific or default thresholds
|
||||
attr_thresholds = self._get_attribute_thresholds(attr, manufacturer_profile)
|
||||
if not attr_thresholds:
|
||||
|
||||
Reference in New Issue
Block a user