From 95a5a8227a7b5fe650109f4e7bbcf88b2a1973fb Mon Sep 17 00:00:00 2001 From: Jared Vititoe Date: Thu, 29 May 2025 12:44:55 -0400 Subject: [PATCH] NoneType fix? --- hwmonDaemon.py | 185 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 136 insertions(+), 49 deletions(-) diff --git a/hwmonDaemon.py b/hwmonDaemon.py index 9898f19..6b4d4de 100644 --- a/hwmonDaemon.py +++ b/hwmonDaemon.py @@ -448,31 +448,53 @@ class SystemHealthMonitor: 'serial': None, 'capacity': None, 'firmware': None, - 'type': None # SSD or HDD + 'type': None, # SSD or HDD + 'smart_capable': False } try: - result = subprocess.run( + # First check if device supports SMART + capability_result = subprocess.run( ['smartctl', '-i', device], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True ) - for line in result.stdout.split('\n'): - if 'Device Model' in line: + # Check if smartctl failed completely + if capability_result.returncode not in [0, 4]: # 0 = success, 4 = some SMART errors but readable + logger.debug(f"smartctl failed for {device}: return code {capability_result.returncode}") + return drive_details + + output = capability_result.stdout + + # Check if SMART is supported + if "SMART support is: Enabled" in output or "SMART support is: Available" in output: + drive_details['smart_capable'] = True + elif "SMART support is: Unavailable" in output or "does not support SMART" in output: + logger.debug(f"Device {device} does not support SMART") + return drive_details + + for line in output.split('\n'): + if 'Device Model' in line or 'Model Number' in line: drive_details['model'] = line.split(':')[1].strip() elif 'Serial Number' in line: drive_details['serial'] = line.split(':')[1].strip() elif 'User Capacity' in line: - drive_details['capacity'] = line.split('[')[1].split(']')[0] + # Extract capacity from brackets + capacity_match = re.search(r'\[(.*?)\]', line) + if capacity_match: + drive_details['capacity'] = capacity_match.group(1) elif 'Firmware Version' in line: drive_details['firmware'] = line.split(':')[1].strip() elif 'Rotation Rate' in line: - drive_details['type'] = 'SSD' if 'Solid State Device' in line else 'HDD' - + if 'Solid State Device' in line: + drive_details['type'] = 'SSD' + else: + drive_details['type'] = 'HDD' + except Exception as e: - logger.debug(f"Error getting drive details: {e}") + logger.debug(f"Error getting drive details for {device}: {e}") return drive_details @@ -801,8 +823,28 @@ class SystemHealthMonitor: # Check for drive-related issues for drive in health_report.get('drives_health', {}).get('drives', []): - if drive.get('smart_issues'): - issues.append(f"Drive {drive['device']} has SMART issues: {', '.join(drive['smart_issues'])}") + # Skip drives with ERROR or NOT_SUPPORTED status - these are likely virtual/unsupported devices + if drive.get('smart_status') in ['ERROR', 'NOT_SUPPORTED']: + logger.debug(f"Skipping issue detection for drive {drive['device']} with status {drive.get('smart_status')}") + continue + + # Only report issues for drives with valid SMART status + if drive.get('smart_issues') and drive.get('smart_status') in ['HEALTHY', 'UNHEALTHY', 'UNKNOWN']: + # Filter out generic error messages that don't indicate real hardware issues + filtered_issues = [] + for issue in drive['smart_issues']: + if not any(skip_phrase in issue for skip_phrase in [ + "Error checking SMART:", + "Unable to read device information", + "SMART not supported", + "timed out" + ]): + filtered_issues.append(issue) + + if filtered_issues: + issues.append(f"Drive {drive['device']} has SMART issues: {', '.join(filtered_issues)}") + + # Check temperature regardless of SMART status if drive.get('temperature') and drive['temperature'] > self.CONFIG['THRESHOLDS']['TEMPERATURE_WARNING']: issues.append(f"Drive {drive['device']} temperature is high: {drive['temperature']}°C") @@ -1086,10 +1128,10 @@ class SystemHealthMonitor: def _check_smart_health(self, device: str) -> Dict[str, Any]: """ - Enhanced SMART health check with manufacturer-specific thresholds. + Enhanced SMART health check with better error handling. """ smart_health = { - 'status': 'HEALTHY', + 'status': 'UNKNOWN', 'severity': 'NORMAL', 'issues': [], 'temp': None, @@ -1098,8 +1140,19 @@ class SystemHealthMonitor: } try: - # Get drive details first to determine manufacturer + # First verify the device is SMART-capable drive_details = self._get_drive_details(device) + if not drive_details.get('smart_capable', False): + smart_health['status'] = 'NOT_SUPPORTED' + smart_health['issues'].append("SMART not supported on this device") + return smart_health + + # If we have no model info, the device might not be responding properly + if not drive_details.get('model'): + smart_health['status'] = 'ERROR' + smart_health['issues'].append("Unable to read device information") + return smart_health + manufacturer_profile = self._get_manufacturer_profile( drive_details.get('model', ''), drive_details.get('manufacturer', '') @@ -1112,12 +1165,13 @@ class SystemHealthMonitor: smart_health['severity'] = 'WARNING' smart_health['issues'].extend(firmware_info['known_issues']) - # Get detailed SMART data + # Get detailed SMART data with timeout result = subprocess.run( ['smartctl', '-A', '-H', '-l', 'error', '-l', 'background', device], stdout=subprocess.PIPE, stderr=subprocess.PIPE, - text=True + text=True, + timeout=30 # 30 second timeout ) output = result.stdout @@ -1127,6 +1181,10 @@ class SystemHealthMonitor: smart_health['status'] = 'UNHEALTHY' smart_health['severity'] = 'CRITICAL' smart_health['issues'].append("SMART overall health check failed") + elif 'PASSED' in output: + smart_health['status'] = 'HEALTHY' + else: + smart_health['status'] = 'UNKNOWN' # Parse SMART attributes with manufacturer-specific handling power_on_hours = 0 @@ -1240,9 +1298,9 @@ class SystemHealthMonitor: if recent_errors: smart_health['severity'] = 'WARNING' smart_health['issues'].extend(recent_errors) - + logger.debug(f"=== SMART Health Check for {device} ===") - logger.debug(f"Manufacturer profile: {manufacturer_profile.get('aliases', ['Unknown'])[0]}") + logger.debug(f"Manufacturer profile: {manufacturer_profile.get('aliases', ['Unknown'])[0] if manufacturer_profile else 'None'}") logger.debug("Raw SMART attributes:") for attr, value in smart_health['attributes'].items(): logger.debug(f"{attr}: {value}") @@ -1253,38 +1311,58 @@ class SystemHealthMonitor: # Special handling for NVMe drives if 'nvme' in device: - nvme_result = subprocess.run( - ['nvme', 'smart-log', device], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True - ) - logger.debug(f"NVMe smart-log raw output for {device}:") - logger.debug(nvme_result.stdout) - - # Add this line to initialize the temperature attribute - smart_health['attributes']['Temperature_Celsius'] = None - - for line in nvme_result.stdout.split('\n'): - if 'temperature' in line.lower(): - temp_str = line.split(':')[1].strip() - logger.debug(f"Raw temperature string: {temp_str}") - - # Extract first temperature value - temp_value = int(''.join(c for c in temp_str if c.isdigit())[0:2]) - logger.debug(f"Parsed temperature value: {temp_value}") - - # Set both temperature fields - smart_health['temp'] = temp_value - smart_health['attributes']['Temperature_Celsius'] = temp_value - - logger.debug(f"Final temperature recorded: {smart_health['temp']}") - break + try: + nvme_result = subprocess.run( + ['nvme', 'smart-log', device], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + timeout=10 + ) + logger.debug(f"NVMe smart-log raw output for {device}:") + logger.debug(nvme_result.stdout) + + # Initialize the temperature attribute + if smart_health['temp'] is None: + smart_health['attributes']['Temperature_Celsius'] = None + + for line in nvme_result.stdout.split('\n'): + # Fix the NoneType error by checking if line exists and has content + if line and line.strip() and 'temperature' in line.lower(): + try: + temp_str = line.split(':')[1].strip() if ':' in line else line.strip() + logger.debug(f"Raw temperature string: {temp_str}") + + # Extract first temperature value more safely + digits = ''.join(c for c in temp_str if c.isdigit()) + if len(digits) >= 2: + temp_value = int(digits[:2]) + logger.debug(f"Parsed temperature value: {temp_value}") + + # Set both temperature fields + smart_health['temp'] = temp_value + smart_health['attributes']['Temperature_Celsius'] = temp_value + + logger.debug(f"Final temperature recorded: {smart_health['temp']}") + break + except (ValueError, IndexError, AttributeError) as e: + logger.debug(f"Error parsing NVMe temperature from line '{line}': {e}") + continue + except subprocess.TimeoutExpired: + logger.debug(f"NVMe smart-log for {device} timed out") + except Exception as e: + logger.debug(f"Error getting NVMe smart data for {device}: {e}") + except subprocess.TimeoutExpired: + smart_health['status'] = 'ERROR' + smart_health['issues'].append("SMART check timed out") except Exception as e: smart_health['status'] = 'ERROR' smart_health['severity'] = 'UNKNOWN' smart_health['issues'].append(f"Error checking SMART: {str(e)}") + logger.debug(f"Exception in _check_smart_health for {device}: {e}") + import traceback + logger.debug(traceback.format_exc()) return smart_health @@ -1292,11 +1370,15 @@ class SystemHealthMonitor: drives_health = {'overall_status': 'NORMAL', 'drives': []} try: - # Get physical disks only - physical_disks = [disk for disk in self._get_all_disks() - if disk.startswith(('/dev/sd', '/dev/nvme'))] + # Get only valid physical disks + physical_disks = self._get_all_disks() logger.debug(f"Checking physical disks: {physical_disks}") + if not physical_disks: + logger.warning("No valid physical disks found for monitoring") + drives_health['overall_status'] = 'WARNING' + return drives_health + # Get ALL partition information including device mapper partitions = psutil.disk_partitions(all=True) @@ -1355,10 +1437,15 @@ class SystemHealthMonitor: 'smart_attributes': smart_health['attributes'] }) + # Only report issues for drives that should be monitored if smart_health['status'] == 'UNHEALTHY': overall_status = 'CRITICAL' - elif smart_health['issues'] and overall_status != 'CRITICAL': - overall_status = 'WARNING' + elif smart_health['status'] == 'ERROR': + # Don't escalate overall status for ERROR drives (might be virtual) + logger.debug(f"Drive {disk} returned ERROR status, skipping from issue detection") + elif smart_health['issues'] and smart_health['status'] not in ['ERROR', 'NOT_SUPPORTED']: + if overall_status != 'CRITICAL': + overall_status = 'WARNING' drives_health['drives'].append(drive_report)