Enabled dmidecode as an ECC check

This commit is contained in:
2024-12-13 18:47:26 -05:00
parent f93d370116
commit d985b971a4

View File

@ -445,15 +445,29 @@ class SystemHealthMonitor:
}
try:
# More robust ECC detection - check both directory existence and content
edac_path = '/sys/devices/system/edac/mc'
if os.path.exists(edac_path) and os.listdir(edac_path):
# Verify actual ECC controller presence
# First check using dmidecode
result = subprocess.run(
['dmidecode', '--type', 'memory'],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True
)
if 'Error Correction Type: Multi-bit ECC' in result.stdout:
memory_health['has_ecc'] = True
# If dmidecode didn't find ECC, try the edac method as backup
if not memory_health['has_ecc']:
edac_path = '/sys/devices/system/edac/mc'
if os.path.exists(edac_path) and os.listdir(edac_path):
for mc_dir in glob.glob('/sys/devices/system/edac/mc/mc[0-9]*'):
if os.path.exists(f"{mc_dir}/csrow0"):
memory_health['has_ecc'] = True
break
# If ECC is present, check for errors
if memory_health['has_ecc']:
for mc_dir in glob.glob('/sys/devices/system/edac/mc/mc[0-9]*'):
if os.path.exists(f"{mc_dir}/csrow0"):
memory_health['has_ecc'] = True
# Check uncorrectable errors
ue_count = self._read_ecc_count(f"{mc_dir}/csrow0/ue_count")
if ue_count > 0:
memory_health['status'] = 'CRITICAL'
@ -461,7 +475,6 @@ class SystemHealthMonitor:
f"Uncorrectable ECC errors detected in {os.path.basename(mc_dir)}: {ue_count}"
)
# Check correctable errors
ce_count = self._read_ecc_count(f"{mc_dir}/csrow0/ce_count")
if ce_count > 0:
if memory_health['status'] != 'CRITICAL':