Enabled dmidecode as an ECC check
This commit is contained in:
@ -445,15 +445,29 @@ class SystemHealthMonitor:
|
|||||||
}
|
}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# More robust ECC detection - check both directory existence and content
|
# First check using dmidecode
|
||||||
|
result = subprocess.run(
|
||||||
|
['dmidecode', '--type', 'memory'],
|
||||||
|
stdout=subprocess.PIPE,
|
||||||
|
stderr=subprocess.PIPE,
|
||||||
|
text=True
|
||||||
|
)
|
||||||
|
if 'Error Correction Type: Multi-bit ECC' in result.stdout:
|
||||||
|
memory_health['has_ecc'] = True
|
||||||
|
|
||||||
|
# If dmidecode didn't find ECC, try the edac method as backup
|
||||||
|
if not memory_health['has_ecc']:
|
||||||
edac_path = '/sys/devices/system/edac/mc'
|
edac_path = '/sys/devices/system/edac/mc'
|
||||||
if os.path.exists(edac_path) and os.listdir(edac_path):
|
if os.path.exists(edac_path) and os.listdir(edac_path):
|
||||||
# Verify actual ECC controller presence
|
|
||||||
for mc_dir in glob.glob('/sys/devices/system/edac/mc/mc[0-9]*'):
|
for mc_dir in glob.glob('/sys/devices/system/edac/mc/mc[0-9]*'):
|
||||||
if os.path.exists(f"{mc_dir}/csrow0"):
|
if os.path.exists(f"{mc_dir}/csrow0"):
|
||||||
memory_health['has_ecc'] = True
|
memory_health['has_ecc'] = True
|
||||||
|
break
|
||||||
|
|
||||||
# Check uncorrectable errors
|
# If ECC is present, check for errors
|
||||||
|
if memory_health['has_ecc']:
|
||||||
|
for mc_dir in glob.glob('/sys/devices/system/edac/mc/mc[0-9]*'):
|
||||||
|
if os.path.exists(f"{mc_dir}/csrow0"):
|
||||||
ue_count = self._read_ecc_count(f"{mc_dir}/csrow0/ue_count")
|
ue_count = self._read_ecc_count(f"{mc_dir}/csrow0/ue_count")
|
||||||
if ue_count > 0:
|
if ue_count > 0:
|
||||||
memory_health['status'] = 'CRITICAL'
|
memory_health['status'] = 'CRITICAL'
|
||||||
@ -461,7 +475,6 @@ class SystemHealthMonitor:
|
|||||||
f"Uncorrectable ECC errors detected in {os.path.basename(mc_dir)}: {ue_count}"
|
f"Uncorrectable ECC errors detected in {os.path.basename(mc_dir)}: {ue_count}"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Check correctable errors
|
|
||||||
ce_count = self._read_ecc_count(f"{mc_dir}/csrow0/ce_count")
|
ce_count = self._read_ecc_count(f"{mc_dir}/csrow0/ce_count")
|
||||||
if ce_count > 0:
|
if ce_count > 0:
|
||||||
if memory_health['status'] != 'CRITICAL':
|
if memory_health['status'] != 'CRITICAL':
|
||||||
|
|||||||
Reference in New Issue
Block a user