From 147947b8cacadbef152a3081d177362cacd0d491 Mon Sep 17 00:00:00 2001
From: Jared Vititoe
Date: Wed, 28 May 2025 14:59:47 -0400
Subject: [PATCH] Testing manufacturer specific smart tests

---
 hwmonDaemon.py | 243 ++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 191 insertions(+), 52 deletions(-)

diff --git a/hwmonDaemon.py b/hwmonDaemon.py
index 83abe61..debe18b 100644
--- a/hwmonDaemon.py
+++ b/hwmonDaemon.py
@@ -107,6 +107,81 @@ class SystemHealthMonitor:
             'WD141KRYZ': ['02.01A02']
         }
     }
+    MANUFACTURER_SMART_PROFILES = {
+        'Ridata': {
+            'aliases': ['Ridata', 'Ritek', 'RIDATA', 'RITEK'],
+            'wear_leveling_behavior': 'countup',  # Observed to count up from 0 on these drives
+            'wear_leveling_baseline': 0,
+            'wear_leveling_thresholds': {
+                'warning': 500000,   # Much higher threshold for countup behavior
+                'critical': 1000000  # Very high threshold
+            },
+            'attributes': {
+                'Wear_Leveling_Count': {
+                    'behavior': 'countup',
+                    'baseline': 0,
+                    'warning_threshold': 500000,
+                    'critical_threshold': 1000000,
+                    'description': 'Total wear leveling operations performed (countup from 0)',
+                    'ignore_on_new_drive': True  # Don't alert on new drives
+                }
+            }
+        },
+        'Samsung': {
+            'aliases': ['Samsung', 'SAMSUNG'],
+            'wear_leveling_behavior': 'countup',
+            'wear_leveling_baseline': 0,
+            'wear_leveling_thresholds': {
+                'warning': 2000,
+                'critical': 3000
+            },
+            'attributes': {
+                'Wear_Leveling_Count': {
+                    'behavior': 'countup',
+                    'baseline': 0,
+                    'warning_threshold': 2000,
+                    'critical_threshold': 3000,
+                    'description': 'Total wear leveling operations performed'
+                }
+            }
+        },
+        'Intel': {
+            'aliases': ['Intel', 'INTEL'],
+            'wear_leveling_behavior': 'percentage',
+            'wear_leveling_baseline': 100,
+            'wear_leveling_thresholds': {
+                'warning': 30,
+                'critical': 10
+            },
+            'attributes': {
+                'Media_Wearout_Indicator': {
+                    'behavior': 'countdown',
+                    'baseline': 100,
+                    'warning_threshold': 30,
+                    'critical_threshold': 10,
+                    'description': 'Percentage of rated life remaining'
+                }
+            }
+        },
+        'Micron': {
+            'aliases': ['Micron', 'MICRON', 'Crucial', 'CRUCIAL'],
+            'wear_leveling_behavior': 'percentage',
+            'wear_leveling_baseline': 100,
+            'wear_leveling_thresholds': {
+                'warning': 30,
+                'critical': 10
+            }
+        },
+        'Generic': {  # Fallback for unknown manufacturers
+            'aliases': ['Unknown', 'Generic'],
+            'wear_leveling_behavior': 'unknown',
+            'wear_leveling_baseline': None,
+            'wear_leveling_thresholds': {
+                'warning': None,  # Don't trigger on unknown
+                'critical': None
+            }
+        }
+    }
     SEVERITY_INDICATORS = {
         'CRITICAL': '🔴',
         'WARNING': '🟡',
@@ -274,13 +349,18 @@ class SystemHealthMonitor:
         SSD block erase distribution metric.

         Impact:
         - Indicates wear pattern uniformity
-        - Higher values show more balanced wear
+        - Interpretation varies by manufacturer
         - Critical for SSD longevity

         Recommended Actions:
         1. Monitor trend over time
-        2. Compare with similar drives
+        2. Compare with manufacturer baseline
         3. Check workload distribution
+
+        Note: Different manufacturers use different counting methods:
+        - Some count up from 0 (Samsung, Ridata, etc.)
+        - Others count down from a baseline (Intel, Micron, etc.)
+        - Always check manufacturer specifications
         """
     }
@@ -983,49 +1063,56 @@ class SystemHealthMonitor:
                 logger.debug(f"Could not parse SMART value: {raw_value}")
                 return 0

+    def _get_manufacturer_profile(self, model: str, manufacturer: str = None) -> Dict[str, Any]:
+        """
+        Get manufacturer-specific SMART profile based on drive model/manufacturer.
+ """ + # Check each manufacturer profile + for mfg, profile in self.MANUFACTURER_SMART_PROFILES.items(): + for alias in profile['aliases']: + if alias.lower() in model.lower() or (manufacturer and alias.lower() in manufacturer.lower()): + logger.debug(f"Matched manufacturer profile: {mfg} for model: {model}") + return profile + + # Return generic profile if no match + logger.debug(f"No specific profile found for {model}, using Generic profile") + return self.MANUFACTURER_SMART_PROFILES['Generic'] + + def _is_new_drive(self, power_on_hours: int) -> bool: + """ + Determine if a drive is considered "new" based on power-on hours. + """ + return power_on_hours < 168 # Less than 1 week of runtime + def _check_smart_health(self, device: str) -> Dict[str, Any]: """ - Enhanced SMART health check with detailed failure thresholds. + Enhanced SMART health check with manufacturer-specific thresholds. """ smart_health = { 'status': 'HEALTHY', 'severity': 'NORMAL', 'issues': [], 'temp': None, - 'attributes': {} - } - - # Define critical SMART attributes and their thresholds - SMART_THRESHOLDS = { - 'Reallocated_Sector_Ct': {'warning': 5, 'critical': 10}, - 'Current_Pending_Sector': {'warning': 1, 'critical': 5}, - 'Offline_Uncorrectable': {'warning': 1, 'critical': 2}, - 'Reported_Uncorrect': {'warning': 1, 'critical': 10}, - 'Spin_Retry_Count': {'warning': 1, 'critical': 5}, - # 'Command_Timeout': {'warning': 5, 'critical': 10}, # Removed - 'Power_Cycle_Count': {'warning': 5000, 'critical': 10000}, - 'Power_On_Hours': {'warning': 61320, 'critical': 70080}, # ~7-8 years - 'Media_Wearout_Indicator': {'warning': 30, 'critical': 10}, - 'Temperature_Celsius': {'warning': 65, 'critical': 75}, - 'Host_Writes_32MiB': {'warning': 50000000, 'critical': 100000000}, - 'Wear_Leveling_Count': {'warning': 2000, 'critical': 3000}, - 'Available_Spare': {'warning': 30, 'critical': 10}, - 'Program_Fail_Count': {'warning': 10, 'critical': 20}, - 'Erase_Fail_Count': {'warning': 10, 'critical': 20}, - # 'Raw_Read_Error_Rate': {'warning': 50, 'critical': 100}, # Removed - # 'Seek_Error_Rate': {'warning': 50, 'critical': 100}, # Removed - 'Load_Cycle_Count': {'warning': 900000, 'critical': 1000000}, - 'SSD_Life_Left': {'warning': 30, 'critical': 10} + 'attributes': {}, + 'manufacturer_profile': None } try: + # Get drive details first to determine manufacturer + drive_details = self._get_drive_details(device) + manufacturer_profile = self._get_manufacturer_profile( + drive_details.get('model', ''), + drive_details.get('manufacturer', '') + ) + smart_health['manufacturer_profile'] = manufacturer_profile + # Get firmware information firmware_info = self._check_disk_firmware(device) if firmware_info['is_problematic']: smart_health['severity'] = 'WARNING' smart_health['issues'].extend(firmware_info['known_issues']) - # Get detailed SMART data including performance metrics + # Get detailed SMART data result = subprocess.run( ['smartctl', '-A', '-H', '-l', 'error', '-l', 'background', device], stdout=subprocess.PIPE, @@ -1041,25 +1128,81 @@ class SystemHealthMonitor: smart_health['severity'] = 'CRITICAL' smart_health['issues'].append("SMART overall health check failed") - # Parse SMART attributes with thresholds + # Parse SMART attributes with manufacturer-specific handling + power_on_hours = 0 + for line in output.split('\n'): - if 'Reported_Uncorrect' in line: + # Extract Power_On_Hours first to determine if drive is new + if 'Power_On_Hours' in line: parts = line.split() - raw_value = self._parse_smart_value(parts[9]) - 
logger.debug(f"Found Reported_Uncorrect value: {raw_value}") - smart_health['attributes']['Reported_Uncorrect'] = raw_value - - if raw_value >= SMART_THRESHOLDS['Reported_Uncorrect']['critical']: - smart_health['status'] = 'UNHEALTHY' - smart_health['severity'] = 'CRITICAL' - smart_health['issues'].append(f"Critical uncorrectable errors: {raw_value}") - elif raw_value >= SMART_THRESHOLDS['Reported_Uncorrect']['warning']: - if smart_health['severity'] != 'CRITICAL': - smart_health['severity'] = 'WARNING' - smart_health['issues'].append(f"Warning: uncorrectable errors detected: {raw_value}") + if len(parts) >= 10: + power_on_hours = self._parse_smart_value(parts[9]) + smart_health['attributes']['Power_On_Hours'] = power_on_hours - for attr, thresholds in SMART_THRESHOLDS.items(): - if attr in line: + # Check if this is a new drive + is_new_drive = self._is_new_drive(power_on_hours) + logger.debug(f"Drive {device} power-on hours: {power_on_hours}, is_new_drive: {is_new_drive}") + + # Define base SMART thresholds (for non-manufacturer specific attributes) + BASE_SMART_THRESHOLDS = { + 'Reallocated_Sector_Ct': {'warning': 5, 'critical': 10}, + 'Current_Pending_Sector': {'warning': 1, 'critical': 5}, + 'Offline_Uncorrectable': {'warning': 1, 'critical': 2}, + 'Reported_Uncorrect': {'warning': 1, 'critical': 10}, + 'Spin_Retry_Count': {'warning': 1, 'critical': 5}, + 'Power_Cycle_Count': {'warning': 5000, 'critical': 10000}, + 'Power_On_Hours': {'warning': 61320, 'critical': 70080}, # ~7-8 years + 'Temperature_Celsius': {'warning': 65, 'critical': 75}, + 'Available_Spare': {'warning': 30, 'critical': 10}, + 'Program_Fail_Count': {'warning': 10, 'critical': 20}, + 'Erase_Fail_Count': {'warning': 10, 'critical': 20}, + 'Load_Cycle_Count': {'warning': 900000, 'critical': 1000000}, + 'SSD_Life_Left': {'warning': 30, 'critical': 10} + } + + # Parse all SMART attributes + for line in output.split('\n'): + # Handle manufacturer-specific Wear_Leveling_Count + if 'Wear_Leveling_Count' in line: + parts = line.split() + if len(parts) >= 10: + raw_value = self._parse_smart_value(parts[9]) + smart_health['attributes']['Wear_Leveling_Count'] = raw_value + + # Get manufacturer-specific thresholds + wear_attr = manufacturer_profile.get('attributes', {}).get('Wear_Leveling_Count', {}) + + # Skip evaluation if this is a new drive and manufacturer profile says to ignore + if is_new_drive and wear_attr.get('ignore_on_new_drive', False): + logger.debug(f"Skipping Wear_Leveling_Count evaluation for new drive: {raw_value}") + continue + + warning_threshold = wear_attr.get('warning_threshold') + critical_threshold = wear_attr.get('critical_threshold') + + if warning_threshold and critical_threshold: + behavior = wear_attr.get('behavior', 'countup') + + if behavior == 'countup': + if raw_value >= critical_threshold: + smart_health['severity'] = 'CRITICAL' + smart_health['issues'].append(f"Critical wear leveling count: {raw_value}") + elif raw_value >= warning_threshold: + if smart_health['severity'] != 'CRITICAL': + smart_health['severity'] = 'WARNING' + smart_health['issues'].append(f"High wear leveling count: {raw_value}") + elif behavior == 'countdown': + if raw_value <= critical_threshold: + smart_health['severity'] = 'CRITICAL' + smart_health['issues'].append(f"Critical wear leveling remaining: {raw_value}") + elif raw_value <= warning_threshold: + if smart_health['severity'] != 'CRITICAL': + smart_health['severity'] = 'WARNING' + smart_health['issues'].append(f"Low wear leveling remaining: {raw_value}") + + # 
Handle all other standard SMART attributes + for attr, thresholds in BASE_SMART_THRESHOLDS.items(): + if attr in line and attr != 'Wear_Leveling_Count': # Skip wear leveling as it's handled above parts = line.split() if len(parts) >= 10: raw_value = self._parse_smart_value(parts[9]) @@ -1071,7 +1214,8 @@ class SystemHealthMonitor: smart_health['severity'] = 'CRITICAL' smart_health['issues'].append(f"Critical temperature: {raw_value}°C") elif raw_value >= thresholds['warning']: - smart_health['severity'] = 'WARNING' + if smart_health['severity'] != 'CRITICAL': + smart_health['severity'] = 'WARNING' smart_health['issues'].append(f"High temperature: {raw_value}°C") else: if raw_value >= thresholds['critical']: @@ -1098,10 +1242,12 @@ class SystemHealthMonitor: smart_health['issues'].extend(recent_errors) logger.debug(f"=== SMART Health Check for {device} ===") + logger.debug(f"Manufacturer profile: {manufacturer_profile.get('aliases', ['Unknown'])[0]}") logger.debug("Raw SMART attributes:") for attr, value in smart_health['attributes'].items(): logger.debug(f"{attr}: {value}") logger.debug(f"Temperature: {smart_health['temp']}°C") + logger.debug(f"Is new drive: {is_new_drive}") logger.debug(f"Detected Issues: {smart_health['issues']}") logger.debug("=== End SMART Check ===\n") @@ -1408,7 +1554,6 @@ class SystemHealthMonitor: } try: - logger.debug("Executing 'pct list' command") result = subprocess.run( ['pct', 'list'], stdout=subprocess.PIPE, @@ -1418,7 +1563,6 @@ class SystemHealthMonitor: logger.debug(f"pct list output:\n{result.stdout}") for line in result.stdout.split('\n')[1:]: - logger.debug(f"Raw LXC line: {line}") if not line.strip(): continue @@ -1428,7 +1572,6 @@ class SystemHealthMonitor: continue vmid, status = parts[0], parts[1] - logger.debug(f"Processing container VMID: {vmid}, Status: {status}") if status.lower() == 'running': logger.debug(f"Checking container {vmid} disk usage") @@ -1438,7 +1581,6 @@ class SystemHealthMonitor: stderr=subprocess.PIPE, text=True ) - logger.debug(f"Raw disk info output:\n{disk_info.stdout}") container_info = { 'vmid': vmid, @@ -1447,11 +1589,8 @@ class SystemHealthMonitor: for fs_line in disk_info.stdout.split('\n')[1:]: if not fs_line.strip() or 'MP' in fs_line: - logger.debug(f"Skipping line: {fs_line}") continue - logger.debug(f"Processing filesystem line: {fs_line}") - #parts = fs_line.split() columns = line.split() logger.debug(f"Split parts: {parts}") if len(columns) >= 6:
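
The countup vs. countdown evaluation this patch wires into _check_smart_health can be summarized on its own. The sketch below is a minimal, illustrative outline of that logic, not the daemon's actual code path: the evaluate_wear_leveling helper name and standalone layout are hypothetical (the daemon performs this inline against MANUFACTURER_SMART_PROFILES), and the example profile values mirror the Samsung entry above.

# Minimal sketch of the manufacturer-aware wear-leveling evaluation added in this
# patch. The helper name and standalone layout are illustrative only; the daemon
# performs this inline in _check_smart_health using MANUFACTURER_SMART_PROFILES.
from typing import Any, Dict, Optional


def evaluate_wear_leveling(raw_value: int, wear_attr: Dict[str, Any],
                           is_new_drive: bool) -> Optional[str]:
    """Return 'CRITICAL', 'WARNING', or None for a Wear_Leveling_Count reading."""
    # Some profiles (e.g. Ridata) opt out of alerting on brand-new drives.
    if is_new_drive and wear_attr.get('ignore_on_new_drive', False):
        return None

    warning = wear_attr.get('warning_threshold')
    critical = wear_attr.get('critical_threshold')
    if warning is None or critical is None:
        return None  # Generic/unknown profile: never alert on wear leveling

    if wear_attr.get('behavior', 'countup') == 'countup':
        # Countup vendors (Samsung, Ridata): higher raw values are worse.
        if raw_value >= critical:
            return 'CRITICAL'
        if raw_value >= warning:
            return 'WARNING'
    else:
        # Countdown vendors (Intel-style percentage remaining): lower is worse.
        if raw_value <= critical:
            return 'CRITICAL'
        if raw_value <= warning:
            return 'WARNING'
    return None


# Example: a Samsung-style countup profile flags a reading of 2500 as a warning.
samsung_wear_attr = {'behavior': 'countup', 'warning_threshold': 2000,
                     'critical_threshold': 3000}
print(evaluate_wear_leveling(2500, samsung_wear_attr, is_new_drive=False))  # WARNING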