From 8660b3b6e458e3e87b0ac9c1d0a17e778910665e Mon Sep 17 00:00:00 2001 From: Jared Vititoe Date: Mon, 3 Mar 2025 16:38:02 -0500 Subject: [PATCH] IO test and more SMART parameters --- hwmonDaemon.py | 109 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 106 insertions(+), 3 deletions(-) diff --git a/hwmonDaemon.py b/hwmonDaemon.py index be75ae2..768ffb1 100644 --- a/hwmonDaemon.py +++ b/hwmonDaemon.py @@ -61,6 +61,21 @@ class SystemHealthMonitor: 'DEFAULT_CATEGORY': 'Hardware', 'DEFAULT_ISSUE_TYPE': 'Problem' } + PROBLEMATIC_FIRMWARE = { + 'Samsung': { + 'EVO860': ['RVT01B6Q', 'RVT02B6Q'], # Known issues with sudden performance drops + 'EVO870': ['SVT01B6Q'], + 'PM883': ['HXT7404Q'] # Known issues with TRIM + }, + 'Seagate': { + 'ST8000NM': ['CC64'], # Known issues with NCQ + 'ST12000NM': ['SN02'] + }, + 'WDC': { + 'WD121KRYZ': ['01.01A01'], # RAID rebuild issues + 'WD141KRYZ': ['02.01A02'] + } + } def __init__(self, ticket_api_url: str = 'http://10.10.10.45/create_ticket_api.php', @@ -292,7 +307,57 @@ class SystemHealthMonitor: if any(device_path.startswith(mount) for mount in excluded_mounts): return False return bool(re.match(r'/dev/(sd[a-z]|nvme\d+n\d+|mmcblk\d+)$', base_device)) + + def _check_disk_firmware(self, device: str) -> Dict[str, Any]: + """ + Check disk firmware version against known problematic versions. + """ + firmware_info = { + 'version': None, + 'model': None, + 'manufacturer': None, + 'is_problematic': False, + 'known_issues': [] + } + try: + result = subprocess.run( + ['smartctl', '-i', device], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True + ) + + for line in result.stdout.split('\n'): + if 'Firmware Version:' in line: + firmware_info['version'] = line.split(':')[1].strip() + elif 'Model Family:' in line: + firmware_info['model'] = line.split(':')[1].strip() + elif 'Device Model:' in line: + if not firmware_info['model']: + firmware_info['model'] = line.split(':')[1].strip() + + # Determine manufacturer + for manufacturer in self.PROBLEMATIC_FIRMWARE.keys(): + if manufacturer in firmware_info['model']: + firmware_info['manufacturer'] = manufacturer + break + + # Check against known problematic versions + if firmware_info['manufacturer'] and firmware_info['model']: + for model, versions in self.PROBLEMATIC_FIRMWARE[firmware_info['manufacturer']].items(): + if model in firmware_info['model'] and firmware_info['version'] in versions: + firmware_info['is_problematic'] = True + firmware_info['known_issues'].append( + f"Known problematic firmware version {firmware_info['version']} " + f"for {firmware_info['model']}" + ) + + except Exception as e: + firmware_info['known_issues'].append(f"Error checking firmware: {str(e)}") + + return firmware_info + def _check_smart_health(self, device: str) -> Dict[str, Any]: """ Enhanced SMART health check with detailed failure thresholds. @@ -316,13 +381,28 @@ class SystemHealthMonitor: 'Power_Cycle_Count': {'warning': 5000, 'critical': 10000}, 'Power_On_Hours': {'warning': 35040, 'critical': 43800}, # ~4-5 years 'Media_Wearout_Indicator': {'warning': 30, 'critical': 10}, # Percentage remaining - 'Temperature_Celsius': {'warning': 65, 'critical': 75} + 'Temperature_Celsius': {'warning': 65, 'critical': 75}, + 'Host_Writes_32MiB': {'warning': 50000000, 'critical': 100000000}, + 'Wear_Leveling_Count': {'warning': 50, 'critical': 20}, + 'Available_Spare': {'warning': 30, 'critical': 10}, + 'Program_Fail_Count': {'warning': 10, 'critical': 20}, + 'Erase_Fail_Count': {'warning': 10, 'critical': 20}, + 'Raw_Read_Error_Rate': {'warning': 50, 'critical': 100}, + 'Seek_Error_Rate': {'warning': 50, 'critical': 100}, + 'Load_Cycle_Count': {'warning': 300000, 'critical': 600000}, + 'SSD_Life_Left': {'warning': 30, 'critical': 10} } try: - # Get detailed SMART data + # Get firmware information + firmware_info = self._check_disk_firmware(device) + if firmware_info['is_problematic']: + smart_health['severity'] = 'WARNING' + smart_health['issues'].extend(firmware_info['known_issues']) + + # Get detailed SMART data including performance metrics result = subprocess.run( - ['smartctl', '-A', '-H', '-l', 'error', device], + ['smartctl', '-A', '-H', '-l', 'error', '-l', 'background', device], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True @@ -377,6 +457,29 @@ class SystemHealthMonitor: smart_health['severity'] = 'WARNING' smart_health['issues'].extend(recent_errors) + smart_health['performance_metrics'] = { + 'read_speed': None, + 'write_speed': None, + 'access_time': None + } + + # Quick performance test + try: + perf_result = subprocess.run( + ['hdparm', '-Tt', device], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True + ) + + for line in perf_result.stdout.split('\n'): + if 'buffered disk reads' in line: + smart_health['performance_metrics']['read_speed'] = float(line.split()[0]) + elif 'cached reads' in line: + smart_health['performance_metrics']['cached_speed'] = float(line.split()[0]) + except: + pass # Skip performance metrics if hdparm fails + except Exception as e: smart_health['status'] = 'ERROR' smart_health['severity'] = 'UNKNOWN'