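Only set has_ecc to True when an actual ECC controller row is found (a csrow0 entry under an edac mc* directory), rather than whenever the edac/mc directory can be listed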

This commit is contained in:
2024-12-13 18:41:13 -05:00
parent 75898a0c78
commit 45179d89a5

View File

@ -431,40 +431,34 @@ class SystemHealthMonitor:
bytes_value /= 1024.0 bytes_value /= 1024.0
return f"{bytes_value:.1f}Y{suffix}" return f"{bytes_value:.1f}Y{suffix}"
def _check_memory_usage(self) -> Dict[str, Any]: def _check_memory_usage(self) -> Dict[str, Any]:
""" """
Check for ECC memory errors if ECC memory is present. Check for ECC memory errors if ECC memory is present.
"""
:return: Dictionary with memory health metrics and ECC status. memory_health = {
""" 'has_ecc': False,
memory_health = { 'ecc_errors': [],
'has_ecc': False, 'status': 'OK',
'ecc_errors': [], 'total_memory': self._convert_bytes(psutil.virtual_memory().total),
'status': 'OK' 'used_memory': self._convert_bytes(psutil.virtual_memory().used),
} 'memory_percent': psutil.virtual_memory().percent
}
try:
# Check if ECC memory is present by looking at edac_mc try:
result = subprocess.run( # More robust ECC detection - check both directory existence and content
['ls', '/sys/devices/system/edac/mc'], edac_path = '/sys/devices/system/edac/mc'
stdout=subprocess.PIPE, if os.path.exists(edac_path) and os.listdir(edac_path):
stderr=subprocess.PIPE, # Verify actual ECC controller presence
text=True for mc_dir in glob.glob('/sys/devices/system/edac/mc/mc[0-9]*'):
) if os.path.exists(f"{mc_dir}/csrow0"):
memory_health['has_ecc'] = True
if result.returncode == 0:
memory_health['has_ecc'] = True
# Check for ECC errors in mcX/csrowY/ue_count and ce_count files
for mc_dir in glob.glob('/sys/devices/system/edac/mc/mc[0-9]*'):
mc_name = os.path.basename(mc_dir)
# Check uncorrectable errors # Check uncorrectable errors
ue_count = self._read_ecc_count(f"{mc_dir}/csrow0/ue_count") ue_count = self._read_ecc_count(f"{mc_dir}/csrow0/ue_count")
if ue_count > 0: if ue_count > 0:
memory_health['status'] = 'CRITICAL' memory_health['status'] = 'CRITICAL'
memory_health['ecc_errors'].append( memory_health['ecc_errors'].append(
f"Uncorrectable ECC errors detected in {mc_name}: {ue_count}" f"Uncorrectable ECC errors detected in {os.path.basename(mc_dir)}: {ue_count}"
) )
# Check correctable errors # Check correctable errors
@ -473,14 +467,14 @@ class SystemHealthMonitor:
if memory_health['status'] != 'CRITICAL': if memory_health['status'] != 'CRITICAL':
memory_health['status'] = 'WARNING' memory_health['status'] = 'WARNING'
memory_health['ecc_errors'].append( memory_health['ecc_errors'].append(
f"Correctable ECC errors detected in {mc_name}: {ce_count}" f"Correctable ECC errors detected in {os.path.basename(mc_dir)}: {ce_count}"
) )
except Exception as e: except Exception as e:
memory_health['status'] = 'ERROR' memory_health['status'] = 'ERROR'
memory_health['ecc_errors'].append(f"Error checking ECC status: {str(e)}") memory_health['ecc_errors'].append(f"Error checking ECC status: {str(e)}")
return memory_health return memory_health
def _read_ecc_count(self, filepath: str) -> int: def _read_ecc_count(self, filepath: str) -> int:
""" """