Only set has_ecc to True when an actual ECC controller row (an mcX/csrow0 directory under /sys/devices/system/edac/mc) is found
This commit is contained in:
@ -431,40 +431,34 @@ class SystemHealthMonitor:
|
|||||||
bytes_value /= 1024.0
|
bytes_value /= 1024.0
|
||||||
return f"{bytes_value:.1f}Y{suffix}"
|
return f"{bytes_value:.1f}Y{suffix}"
|
||||||
|
|
||||||
def _check_memory_usage(self) -> Dict[str, Any]:
|
def _check_memory_usage(self) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Check for ECC memory errors if ECC memory is present.
|
Check for ECC memory errors if ECC memory is present.
|
||||||
|
|
||||||
:return: Dictionary with memory health metrics and ECC status.
|
|
||||||
"""
|
"""
|
||||||
memory_health = {
|
memory_health = {
|
||||||
'has_ecc': False,
|
'has_ecc': False,
|
||||||
'ecc_errors': [],
|
'ecc_errors': [],
|
||||||
'status': 'OK'
|
'status': 'OK',
|
||||||
|
'total_memory': self._convert_bytes(psutil.virtual_memory().total),
|
||||||
|
'used_memory': self._convert_bytes(psutil.virtual_memory().used),
|
||||||
|
'memory_percent': psutil.virtual_memory().percent
|
||||||
}
|
}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Check if ECC memory is present by looking at edac_mc
|
# More robust ECC detection - check both directory existence and content
|
||||||
result = subprocess.run(
|
edac_path = '/sys/devices/system/edac/mc'
|
||||||
['ls', '/sys/devices/system/edac/mc'],
|
if os.path.exists(edac_path) and os.listdir(edac_path):
|
||||||
stdout=subprocess.PIPE,
|
# Verify actual ECC controller presence
|
||||||
stderr=subprocess.PIPE,
|
|
||||||
text=True
|
|
||||||
)
|
|
||||||
|
|
||||||
if result.returncode == 0:
|
|
||||||
memory_health['has_ecc'] = True
|
|
||||||
|
|
||||||
# Check for ECC errors in mcX/csrowY/ue_count and ce_count files
|
|
||||||
for mc_dir in glob.glob('/sys/devices/system/edac/mc/mc[0-9]*'):
|
for mc_dir in glob.glob('/sys/devices/system/edac/mc/mc[0-9]*'):
|
||||||
mc_name = os.path.basename(mc_dir)
|
if os.path.exists(f"{mc_dir}/csrow0"):
|
||||||
|
memory_health['has_ecc'] = True
|
||||||
|
|
||||||
# Check uncorrectable errors
|
# Check uncorrectable errors
|
||||||
ue_count = self._read_ecc_count(f"{mc_dir}/csrow0/ue_count")
|
ue_count = self._read_ecc_count(f"{mc_dir}/csrow0/ue_count")
|
||||||
if ue_count > 0:
|
if ue_count > 0:
|
||||||
memory_health['status'] = 'CRITICAL'
|
memory_health['status'] = 'CRITICAL'
|
||||||
memory_health['ecc_errors'].append(
|
memory_health['ecc_errors'].append(
|
||||||
f"Uncorrectable ECC errors detected in {mc_name}: {ue_count}"
|
f"Uncorrectable ECC errors detected in {os.path.basename(mc_dir)}: {ue_count}"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Check correctable errors
|
# Check correctable errors
|
||||||
@ -473,7 +467,7 @@ class SystemHealthMonitor:
|
|||||||
if memory_health['status'] != 'CRITICAL':
|
if memory_health['status'] != 'CRITICAL':
|
||||||
memory_health['status'] = 'WARNING'
|
memory_health['status'] = 'WARNING'
|
||||||
memory_health['ecc_errors'].append(
|
memory_health['ecc_errors'].append(
|
||||||
f"Correctable ECC errors detected in {mc_name}: {ce_count}"
|
f"Correctable ECC errors detected in {os.path.basename(mc_dir)}: {ce_count}"
|
||||||
)
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
Reference in New Issue
Block a user