From 45179d89a5eb0d4b433c1f3cb983a8bbe6d266bd Mon Sep 17 00:00:00 2001 From: Jared Vititoe Date: Fri, 13 Dec 2024 18:41:13 -0500 Subject: [PATCH] Only sets has_ecc to True if it finds actual ECC controller rows --- hwmonDaemon.py | 60 +++++++++++++++++++++++--------------------------- 1 file changed, 27 insertions(+), 33 deletions(-) diff --git a/hwmonDaemon.py b/hwmonDaemon.py index 1670ca3..5ec050e 100644 --- a/hwmonDaemon.py +++ b/hwmonDaemon.py @@ -431,40 +431,34 @@ class SystemHealthMonitor: bytes_value /= 1024.0 return f"{bytes_value:.1f}Y{suffix}" - def _check_memory_usage(self) -> Dict[str, Any]: - """ - Check for ECC memory errors if ECC memory is present. - - :return: Dictionary with memory health metrics and ECC status. - """ - memory_health = { - 'has_ecc': False, - 'ecc_errors': [], - 'status': 'OK' - } - - try: - # Check if ECC memory is present by looking at edac_mc - result = subprocess.run( - ['ls', '/sys/devices/system/edac/mc'], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True - ) - - if result.returncode == 0: - memory_health['has_ecc'] = True - - # Check for ECC errors in mcX/csrowY/ue_count and ce_count files - for mc_dir in glob.glob('/sys/devices/system/edac/mc/mc[0-9]*'): - mc_name = os.path.basename(mc_dir) +def _check_memory_usage(self) -> Dict[str, Any]: + """ + Check for ECC memory errors if ECC memory is present. + """ + memory_health = { + 'has_ecc': False, + 'ecc_errors': [], + 'status': 'OK', + 'total_memory': self._convert_bytes(psutil.virtual_memory().total), + 'used_memory': self._convert_bytes(psutil.virtual_memory().used), + 'memory_percent': psutil.virtual_memory().percent + } + + try: + # More robust ECC detection - check both directory existence and content + edac_path = '/sys/devices/system/edac/mc' + if os.path.exists(edac_path) and os.listdir(edac_path): + # Verify actual ECC controller presence + for mc_dir in glob.glob('/sys/devices/system/edac/mc/mc[0-9]*'): + if os.path.exists(f"{mc_dir}/csrow0"): + memory_health['has_ecc'] = True # Check uncorrectable errors ue_count = self._read_ecc_count(f"{mc_dir}/csrow0/ue_count") if ue_count > 0: memory_health['status'] = 'CRITICAL' memory_health['ecc_errors'].append( - f"Uncorrectable ECC errors detected in {mc_name}: {ue_count}" + f"Uncorrectable ECC errors detected in {os.path.basename(mc_dir)}: {ue_count}" ) # Check correctable errors @@ -473,14 +467,14 @@ class SystemHealthMonitor: if memory_health['status'] != 'CRITICAL': memory_health['status'] = 'WARNING' memory_health['ecc_errors'].append( - f"Correctable ECC errors detected in {mc_name}: {ce_count}" + f"Correctable ECC errors detected in {os.path.basename(mc_dir)}: {ce_count}" ) - except Exception as e: - memory_health['status'] = 'ERROR' - memory_health['ecc_errors'].append(f"Error checking ECC status: {str(e)}") + except Exception as e: + memory_health['status'] = 'ERROR' + memory_health['ecc_errors'].append(f"Error checking ECC status: {str(e)}") - return memory_health + return memory_health def _read_ecc_count(self, filepath: str) -> int: """