From 147947b8cacadbef152a3081d177362cacd0d491 Mon Sep 17 00:00:00 2001
From: Jared Vititoe
Date: Wed, 28 May 2025 14:59:47 -0400
Subject: [PATCH] Testing manufacturer specific smart tests

---
 hwmonDaemon.py | 243 ++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 191 insertions(+), 52 deletions(-)

diff --git a/hwmonDaemon.py b/hwmonDaemon.py
index 83abe61..debe18b 100644
--- a/hwmonDaemon.py
+++ b/hwmonDaemon.py
@@ -107,6 +107,81 @@ class SystemHealthMonitor:
             'WD141KRYZ': ['02.01A02']
         }
     }
+    MANUFACTURER_SMART_PROFILES = {
+        'Ridata': {
+            'aliases': ['Ridata', 'Ritek', 'RIDATA', 'RITEK'],
+            'wear_leveling_behavior': 'countup',  # Observed to count up from 0 on these drives
+            'wear_leveling_baseline': 0,
+            'wear_leveling_thresholds': {
+                'warning': 500000,   # Much higher threshold for countup behavior
+                'critical': 1000000  # Very high threshold
+            },
+            'attributes': {
+                'Wear_Leveling_Count': {
+                    'behavior': 'countup',
+                    'baseline': 0,
+                    'warning_threshold': 500000,
+                    'critical_threshold': 1000000,
+                    'description': 'Total wear leveling operations performed (countup from 0)',
+                    'ignore_on_new_drive': True  # Don't alert on new drives
+                }
+            }
+        },
+        'Samsung': {
+            'aliases': ['Samsung', 'SAMSUNG'],
+            'wear_leveling_behavior': 'countup',
+            'wear_leveling_baseline': 0,
+            'wear_leveling_thresholds': {
+                'warning': 2000,
+                'critical': 3000
+            },
+            'attributes': {
+                'Wear_Leveling_Count': {
+                    'behavior': 'countup',
+                    'baseline': 0,
+                    'warning_threshold': 2000,
+                    'critical_threshold': 3000,
+                    'description': 'Total wear leveling operations performed'
+                }
+            }
+        },
+        'Intel': {
+            'aliases': ['Intel', 'INTEL'],
+            'wear_leveling_behavior': 'percentage',
+            'wear_leveling_baseline': 100,
+            'wear_leveling_thresholds': {
+                'warning': 30,
+                'critical': 10
+            },
+            'attributes': {
+                'Media_Wearout_Indicator': {
+                    'behavior': 'countdown',
+                    'baseline': 100,
+                    'warning_threshold': 30,
+                    'critical_threshold': 10,
+                    'description': 'Percentage of rated life remaining'
+                }
+            }
+        },
+        'Micron': {
+            'aliases': ['Micron', 'MICRON', 'Crucial', 'CRUCIAL'],
+            'wear_leveling_behavior': 'percentage',
+            'wear_leveling_baseline': 100,
+            'wear_leveling_thresholds': {
+                'warning': 30,
+                'critical': 10
+            }
+        },
+        'Generic': {  # Fallback for unknown manufacturers
+            'aliases': ['Unknown', 'Generic'],
+            'wear_leveling_behavior': 'unknown',
+            'wear_leveling_baseline': None,
+            'wear_leveling_thresholds': {
+                'warning': None,  # Don't trigger on unknown
+                'critical': None
+            }
+        }
+    }
     SEVERITY_INDICATORS = {
         'CRITICAL': '🔴',
         'WARNING': '🟡',
@@ -274,13 +349,18 @@ class SystemHealthMonitor:
         SSD block erase distribution metric.

         Impact:
         - Indicates wear pattern uniformity
-        - Higher values show more balanced wear
+        - Interpretation varies by manufacturer
         - Critical for SSD longevity

         Recommended Actions:
         1. Monitor trend over time
-        2. Compare with similar drives
+        2. Compare with manufacturer baseline
         3. Check workload distribution
+
+        Note: Different manufacturers use different counting methods:
+        - Some count up from 0 (Samsung, Ridata, etc.)
+        - Others count down from a baseline (Intel, Micron, etc.)
+        - Always check manufacturer specifications
         """
     }
@@ -983,49 +1063,56 @@ class SystemHealthMonitor:
                 logger.debug(f"Could not parse SMART value: {raw_value}")
                 return 0

+    def _get_manufacturer_profile(self, model: str, manufacturer: str = None) -> Dict[str, Any]:
+        """
+        Get manufacturer-specific SMART profile based on drive model/manufacturer.
+ """ + # Check each manufacturer profile + for mfg, profile in self.MANUFACTURER_SMART_PROFILES.items(): + for alias in profile['aliases']: + if alias.lower() in model.lower() or (manufacturer and alias.lower() in manufacturer.lower()): + logger.debug(f"Matched manufacturer profile: {mfg} for model: {model}") + return profile + + # Return generic profile if no match + logger.debug(f"No specific profile found for {model}, using Generic profile") + return self.MANUFACTURER_SMART_PROFILES['Generic'] + + def _is_new_drive(self, power_on_hours: int) -> bool: + """ + Determine if a drive is considered "new" based on power-on hours. + """ + return power_on_hours < 168 # Less than 1 week of runtime + def _check_smart_health(self, device: str) -> Dict[str, Any]: """ - Enhanced SMART health check with detailed failure thresholds. + Enhanced SMART health check with manufacturer-specific thresholds. """ smart_health = { 'status': 'HEALTHY', 'severity': 'NORMAL', 'issues': [], 'temp': None, - 'attributes': {} - } - - # Define critical SMART attributes and their thresholds - SMART_THRESHOLDS = { - 'Reallocated_Sector_Ct': {'warning': 5, 'critical': 10}, - 'Current_Pending_Sector': {'warning': 1, 'critical': 5}, - 'Offline_Uncorrectable': {'warning': 1, 'critical': 2}, - 'Reported_Uncorrect': {'warning': 1, 'critical': 10}, - 'Spin_Retry_Count': {'warning': 1, 'critical': 5}, - # 'Command_Timeout': {'warning': 5, 'critical': 10}, # Removed - 'Power_Cycle_Count': {'warning': 5000, 'critical': 10000}, - 'Power_On_Hours': {'warning': 61320, 'critical': 70080}, # ~7-8 years - 'Media_Wearout_Indicator': {'warning': 30, 'critical': 10}, - 'Temperature_Celsius': {'warning': 65, 'critical': 75}, - 'Host_Writes_32MiB': {'warning': 50000000, 'critical': 100000000}, - 'Wear_Leveling_Count': {'warning': 2000, 'critical': 3000}, - 'Available_Spare': {'warning': 30, 'critical': 10}, - 'Program_Fail_Count': {'warning': 10, 'critical': 20}, - 'Erase_Fail_Count': {'warning': 10, 'critical': 20}, - # 'Raw_Read_Error_Rate': {'warning': 50, 'critical': 100}, # Removed - # 'Seek_Error_Rate': {'warning': 50, 'critical': 100}, # Removed - 'Load_Cycle_Count': {'warning': 900000, 'critical': 1000000}, - 'SSD_Life_Left': {'warning': 30, 'critical': 10} + 'attributes': {}, + 'manufacturer_profile': None } try: + # Get drive details first to determine manufacturer + drive_details = self._get_drive_details(device) + manufacturer_profile = self._get_manufacturer_profile( + drive_details.get('model', ''), + drive_details.get('manufacturer', '') + ) + smart_health['manufacturer_profile'] = manufacturer_profile + # Get firmware information firmware_info = self._check_disk_firmware(device) if firmware_info['is_problematic']: smart_health['severity'] = 'WARNING' smart_health['issues'].extend(firmware_info['known_issues']) - # Get detailed SMART data including performance metrics + # Get detailed SMART data result = subprocess.run( ['smartctl', '-A', '-H', '-l', 'error', '-l', 'background', device], stdout=subprocess.PIPE, @@ -1041,25 +1128,81 @@ class SystemHealthMonitor: smart_health['severity'] = 'CRITICAL' smart_health['issues'].append("SMART overall health check failed") - # Parse SMART attributes with thresholds + # Parse SMART attributes with manufacturer-specific handling + power_on_hours = 0 + for line in output.split('\n'): - if 'Reported_Uncorrect' in line: + # Extract Power_On_Hours first to determine if drive is new + if 'Power_On_Hours' in line: parts = line.split() - raw_value = self._parse_smart_value(parts[9]) - 
logger.debug(f"Found Reported_Uncorrect value: {raw_value}") - smart_health['attributes']['Reported_Uncorrect'] = raw_value - - if raw_value >= SMART_THRESHOLDS['Reported_Uncorrect']['critical']: - smart_health['status'] = 'UNHEALTHY' - smart_health['severity'] = 'CRITICAL' - smart_health['issues'].append(f"Critical uncorrectable errors: {raw_value}") - elif raw_value >= SMART_THRESHOLDS['Reported_Uncorrect']['warning']: - if smart_health['severity'] != 'CRITICAL': - smart_health['severity'] = 'WARNING' - smart_health['issues'].append(f"Warning: uncorrectable errors detected: {raw_value}") + if len(parts) >= 10: + power_on_hours = self._parse_smart_value(parts[9]) + smart_health['attributes']['Power_On_Hours'] = power_on_hours - for attr, thresholds in SMART_THRESHOLDS.items(): - if attr in line: + # Check if this is a new drive + is_new_drive = self._is_new_drive(power_on_hours) + logger.debug(f"Drive {device} power-on hours: {power_on_hours}, is_new_drive: {is_new_drive}") + + # Define base SMART thresholds (for non-manufacturer specific attributes) + BASE_SMART_THRESHOLDS = { + 'Reallocated_Sector_Ct': {'warning': 5, 'critical': 10}, + 'Current_Pending_Sector': {'warning': 1, 'critical': 5}, + 'Offline_Uncorrectable': {'warning': 1, 'critical': 2}, + 'Reported_Uncorrect': {'warning': 1, 'critical': 10}, + 'Spin_Retry_Count': {'warning': 1, 'critical': 5}, + 'Power_Cycle_Count': {'warning': 5000, 'critical': 10000}, + 'Power_On_Hours': {'warning': 61320, 'critical': 70080}, # ~7-8 years + 'Temperature_Celsius': {'warning': 65, 'critical': 75}, + 'Available_Spare': {'warning': 30, 'critical': 10}, + 'Program_Fail_Count': {'warning': 10, 'critical': 20}, + 'Erase_Fail_Count': {'warning': 10, 'critical': 20}, + 'Load_Cycle_Count': {'warning': 900000, 'critical': 1000000}, + 'SSD_Life_Left': {'warning': 30, 'critical': 10} + } + + # Parse all SMART attributes + for line in output.split('\n'): + # Handle manufacturer-specific Wear_Leveling_Count + if 'Wear_Leveling_Count' in line: + parts = line.split() + if len(parts) >= 10: + raw_value = self._parse_smart_value(parts[9]) + smart_health['attributes']['Wear_Leveling_Count'] = raw_value + + # Get manufacturer-specific thresholds + wear_attr = manufacturer_profile.get('attributes', {}).get('Wear_Leveling_Count', {}) + + # Skip evaluation if this is a new drive and manufacturer profile says to ignore + if is_new_drive and wear_attr.get('ignore_on_new_drive', False): + logger.debug(f"Skipping Wear_Leveling_Count evaluation for new drive: {raw_value}") + continue + + warning_threshold = wear_attr.get('warning_threshold') + critical_threshold = wear_attr.get('critical_threshold') + + if warning_threshold and critical_threshold: + behavior = wear_attr.get('behavior', 'countup') + + if behavior == 'countup': + if raw_value >= critical_threshold: + smart_health['severity'] = 'CRITICAL' + smart_health['issues'].append(f"Critical wear leveling count: {raw_value}") + elif raw_value >= warning_threshold: + if smart_health['severity'] != 'CRITICAL': + smart_health['severity'] = 'WARNING' + smart_health['issues'].append(f"High wear leveling count: {raw_value}") + elif behavior == 'countdown': + if raw_value <= critical_threshold: + smart_health['severity'] = 'CRITICAL' + smart_health['issues'].append(f"Critical wear leveling remaining: {raw_value}") + elif raw_value <= warning_threshold: + if smart_health['severity'] != 'CRITICAL': + smart_health['severity'] = 'WARNING' + smart_health['issues'].append(f"Low wear leveling remaining: {raw_value}") + + # 
Handle all other standard SMART attributes + for attr, thresholds in BASE_SMART_THRESHOLDS.items(): + if attr in line and attr != 'Wear_Leveling_Count': # Skip wear leveling as it's handled above parts = line.split() if len(parts) >= 10: raw_value = self._parse_smart_value(parts[9]) @@ -1071,7 +1214,8 @@ class SystemHealthMonitor: smart_health['severity'] = 'CRITICAL' smart_health['issues'].append(f"Critical temperature: {raw_value}°C") elif raw_value >= thresholds['warning']: - smart_health['severity'] = 'WARNING' + if smart_health['severity'] != 'CRITICAL': + smart_health['severity'] = 'WARNING' smart_health['issues'].append(f"High temperature: {raw_value}°C") else: if raw_value >= thresholds['critical']: @@ -1098,10 +1242,12 @@ class SystemHealthMonitor: smart_health['issues'].extend(recent_errors) logger.debug(f"=== SMART Health Check for {device} ===") + logger.debug(f"Manufacturer profile: {manufacturer_profile.get('aliases', ['Unknown'])[0]}") logger.debug("Raw SMART attributes:") for attr, value in smart_health['attributes'].items(): logger.debug(f"{attr}: {value}") logger.debug(f"Temperature: {smart_health['temp']}°C") + logger.debug(f"Is new drive: {is_new_drive}") logger.debug(f"Detected Issues: {smart_health['issues']}") logger.debug("=== End SMART Check ===\n") @@ -1408,7 +1554,6 @@ class SystemHealthMonitor: } try: - logger.debug("Executing 'pct list' command") result = subprocess.run( ['pct', 'list'], stdout=subprocess.PIPE, @@ -1418,7 +1563,6 @@ class SystemHealthMonitor: logger.debug(f"pct list output:\n{result.stdout}") for line in result.stdout.split('\n')[1:]: - logger.debug(f"Raw LXC line: {line}") if not line.strip(): continue @@ -1428,7 +1572,6 @@ class SystemHealthMonitor: continue vmid, status = parts[0], parts[1] - logger.debug(f"Processing container VMID: {vmid}, Status: {status}") if status.lower() == 'running': logger.debug(f"Checking container {vmid} disk usage") @@ -1438,7 +1581,6 @@ class SystemHealthMonitor: stderr=subprocess.PIPE, text=True ) - logger.debug(f"Raw disk info output:\n{disk_info.stdout}") container_info = { 'vmid': vmid, @@ -1447,11 +1589,8 @@ class SystemHealthMonitor: for fs_line in disk_info.stdout.split('\n')[1:]: if not fs_line.strip() or 'MP' in fs_line: - logger.debug(f"Skipping line: {fs_line}") continue - logger.debug(f"Processing filesystem line: {fs_line}") - #parts = fs_line.split() columns = line.split() logger.debug(f"Split parts: {parts}") if len(columns) >= 6:
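
The countup vs. countdown evaluation this patch wires into _check_smart_health can be summarized on its own. The sketch below is a minimal, illustrative outline of that logic, not the daemon's actual code path: the evaluate_wear_leveling helper name and standalone layout are hypothetical (the daemon performs this inline against MANUFACTURER_SMART_PROFILES), and the example profile values mirror the Samsung entry above.

# Minimal sketch of the manufacturer-aware wear-leveling evaluation added in this
# patch. The helper name and standalone layout are illustrative only; the daemon
# performs this inline in _check_smart_health using MANUFACTURER_SMART_PROFILES.
from typing import Any, Dict, Optional


def evaluate_wear_leveling(raw_value: int, wear_attr: Dict[str, Any],
                           is_new_drive: bool) -> Optional[str]:
    """Return 'CRITICAL', 'WARNING', or None for a Wear_Leveling_Count reading."""
    # Some profiles (e.g. Ridata) opt out of alerting on brand-new drives.
    if is_new_drive and wear_attr.get('ignore_on_new_drive', False):
        return None

    warning = wear_attr.get('warning_threshold')
    critical = wear_attr.get('critical_threshold')
    if warning is None or critical is None:
        return None  # Generic/unknown profile: never alert on wear leveling

    if wear_attr.get('behavior', 'countup') == 'countup':
        # Countup vendors (Samsung, Ridata): higher raw values are worse.
        if raw_value >= critical:
            return 'CRITICAL'
        if raw_value >= warning:
            return 'WARNING'
    else:
        # Countdown vendors (Intel-style percentage remaining): lower is worse.
        if raw_value <= critical:
            return 'CRITICAL'
        if raw_value <= warning:
            return 'WARNING'
    return None


# Example: a Samsung-style countup profile flags a reading of 2500 as a warning.
samsung_wear_attr = {'behavior': 'countup', 'warning_threshold': 2000,
                     'critical_threshold': 3000}
print(evaluate_wear_leveling(2500, samsung_wear_attr, is_new_drive=False))  # WARNING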