def _check_smart_health(self, device: str) -> Dict[str, Any]:
    """
    Check comprehensive SMART health metrics for a drive.

    Runs ``smartctl -A -H`` on *device*, reads the overall self-assessment
    verdict and walks the ATA attribute table. Attribute rows are expected
    in smartctl's default layout::

        ID# ATTRIBUTE_NAME FLAG VALUE WORST THRESH TYPE UPDATED WHEN_FAILED RAW_VALUE

    :param device: Path to the device, e.g. ``/dev/sda``
    :return: Dictionary with the keys ``status`` (``'HEALTHY'``,
        ``'UNHEALTHY'`` or ``'ERROR'``), ``temp`` (raw value of
        ``Temperature_Celsius`` or ``None``), ``issues`` (list of
        human-readable strings) and ``attributes`` (monitored attribute
        name -> ``{'raw': int, 'normalized': int}``). Never raises; any
        failure is reported through ``status == 'ERROR'``.
    """
    smart_health = {
        'status': 'HEALTHY',
        # NOTE(review): 'temp' and 'issues' fall outside the visible patch
        # context; they are reconstructed from how the body uses them.
        'temp': None,
        'issues': [],
        'attributes': {}
    }

    # 'field' selects the column that is compared against 'threshold':
    #   'raw'        - error counters/temperature, bad when raw > threshold
    #   'normalized' - SSD wear indicators whose normalized value counts
    #                  down as the drive wears, bad when value <= threshold
    # 'critical' decides whether a violation also marks the drive UNHEALTHY
    # or is only recorded as an issue.
    critical_attributes = {
        'Reallocated_Sector_Ct': {'threshold': 0, 'critical': True, 'field': 'raw'},
        'Current_Pending_Sector': {'threshold': 0, 'critical': True, 'field': 'raw'},
        'Offline_Uncorrectable': {'threshold': 0, 'critical': True, 'field': 'raw'},
        'Reported_Uncorrect': {'threshold': 0, 'critical': True, 'field': 'raw'},
        'Command_Timeout': {'threshold': 5, 'critical': False, 'field': 'raw'},
        'Temperature_Celsius': {'threshold': 65, 'critical': False, 'field': 'raw'},
        'Wear_Leveling_Count': {'threshold': 10, 'critical': True, 'field': 'normalized'},
        'Media_Wearout_Indicator': {'threshold': 20, 'critical': True, 'field': 'normalized'},
    }

    try:
        # No check=True: smartctl's exit status is a bitmask that is
        # non-zero for failing drives, and that output still must be parsed.
        # NOTE(review): the capture arguments are not visible in the patch
        # context; the body needs stdout as str, hence PIPE + text mode
        # (universal_newlines is the spelling that also works on 3.6).
        result = subprocess.run(
            ['smartctl', '-A', '-H', device],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True
        )
        output = result.stdout or ''

        for line in output.splitlines():
            # Overall verdict. Only this line is inspected: a plain
            # "'FAILED' in output" also matches the WHEN_FAILED column
            # header of the attribute table.
            if 'self-assessment test result' in line:
                verdict = line.partition(':')[2]
                if 'FAILED' in verdict:
                    smart_health['status'] = 'UNHEALTHY'
                    smart_health['issues'].append("SMART overall health check failed")
                continue

            # Attribute rows start with the numeric ID and have at least
            # ten columns; this skips banners, the header and blank lines.
            parts = line.split()
            if len(parts) < 10 or not parts[0].isdigit():
                continue

            attr_name = parts[1]

            # WHEN_FAILED column: '-', 'In_the_past' or 'FAILING_NOW'.
            # Checked for every attribute, not only the monitored ones.
            if parts[8] == 'FAILING_NOW':
                smart_health['status'] = 'UNHEALTHY'
                smart_health['issues'].append(
                    f"Current failure detected: {line.strip()}"
                )

            limits = critical_attributes.get(attr_name)
            if limits is None:
                continue

            try:
                value = int(parts[9])       # Raw value (first token only)
                normalized = int(parts[3])  # Normalized value
            except ValueError:
                # One unparsable row must not abort the whole drive check.
                continue

            smart_health['attributes'][attr_name] = {
                'raw': value,
                'normalized': normalized
            }

            if attr_name == 'Temperature_Celsius':
                smart_health['temp'] = value
                if value > limits['threshold']:
                    smart_health['issues'].append(
                        f"Drive temperature critical: {value}°C"
                    )
                continue

            if limits['field'] == 'normalized':
                if normalized <= limits['threshold']:
                    if limits['critical']:
                        smart_health['status'] = 'UNHEALTHY'
                    smart_health['issues'].append(
                        f"{attr_name} below threshold: {normalized}"
                    )
                continue

            if value > limits['threshold']:
                if limits['critical']:
                    smart_health['status'] = 'UNHEALTHY'
                smart_health['issues'].append(
                    f"{attr_name} above threshold: {value}"
                )

            # Very low normalized values are reported even when the raw
            # counter is still within its limit.
            if normalized <= 10:
                smart_health['issues'].append(
                    f"{attr_name} normalized value critical: {normalized}"
                )

    except Exception as e:
        # Boundary of the health check: a missing smartctl binary or any
        # other failure is reported in the result instead of propagating.
        smart_health['status'] = 'ERROR'
        smart_health['issues'].append(f"Error checking SMART: {str(e)}")

    return smart_health