Fix memory check method not being defined within the class
This commit is contained in:
@@ -431,50 +431,50 @@ class SystemHealthMonitor:
|
||||
bytes_value /= 1024.0
|
||||
return f"{bytes_value:.1f}Y{suffix}"
|
||||
|
||||
def _check_memory_usage(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Check for ECC memory errors if ECC memory is present.
|
||||
"""
|
||||
memory_health = {
|
||||
'has_ecc': False,
|
||||
'ecc_errors': [],
|
||||
'status': 'OK',
|
||||
'total_memory': self._convert_bytes(psutil.virtual_memory().total),
|
||||
'used_memory': self._convert_bytes(psutil.virtual_memory().used),
|
||||
'memory_percent': psutil.virtual_memory().percent
|
||||
}
|
||||
|
||||
try:
|
||||
# More robust ECC detection - check both directory existence and content
|
||||
edac_path = '/sys/devices/system/edac/mc'
|
||||
if os.path.exists(edac_path) and os.listdir(edac_path):
|
||||
# Verify actual ECC controller presence
|
||||
for mc_dir in glob.glob('/sys/devices/system/edac/mc/mc[0-9]*'):
|
||||
if os.path.exists(f"{mc_dir}/csrow0"):
|
||||
memory_health['has_ecc'] = True
|
||||
|
||||
# Check uncorrectable errors
|
||||
ue_count = self._read_ecc_count(f"{mc_dir}/csrow0/ue_count")
|
||||
if ue_count > 0:
|
||||
memory_health['status'] = 'CRITICAL'
|
||||
memory_health['ecc_errors'].append(
|
||||
f"Uncorrectable ECC errors detected in {os.path.basename(mc_dir)}: {ue_count}"
|
||||
)
|
||||
|
||||
# Check correctable errors
|
||||
ce_count = self._read_ecc_count(f"{mc_dir}/csrow0/ce_count")
|
||||
if ce_count > 0:
|
||||
if memory_health['status'] != 'CRITICAL':
|
||||
memory_health['status'] = 'WARNING'
|
||||
memory_health['ecc_errors'].append(
|
||||
f"Correctable ECC errors detected in {os.path.basename(mc_dir)}: {ce_count}"
|
||||
)
|
||||
def _check_memory_usage(self) -> Dict[str, Any]:
    """
    Report memory usage and check for ECC memory errors if ECC memory is present.

    Returns:
        A dict with the keys:
          - 'has_ecc': True once any EDAC memory controller directory
            contains a ``csrow0`` entry.
          - 'ecc_errors': list of human-readable messages, one per non-zero
            error counter (or one describing a failure of the check itself).
          - 'status': 'OK', 'WARNING' (correctable errors only),
            'CRITICAL' (any uncorrectable error) or 'ERROR' (the ECC
            check raised an exception).
          - 'total_memory' / 'used_memory': psutil figures formatted by
            ``self._convert_bytes``.
          - 'memory_percent': psutil's usage percentage.
    """
    # Take a single snapshot so total/used/percent describe the same instant.
    vm = psutil.virtual_memory()
    memory_health = {
        'has_ecc': False,
        'ecc_errors': [],
        'status': 'OK',
        'total_memory': self._convert_bytes(vm.total),
        'used_memory': self._convert_bytes(vm.used),
        'memory_percent': vm.percent,
    }

    try:
        # More robust ECC detection - check both directory existence and content
        edac_path = '/sys/devices/system/edac/mc'
        if os.path.exists(edac_path) and os.listdir(edac_path):
            # Verify actual ECC controller presence
            for mc_dir in glob.glob('/sys/devices/system/edac/mc/mc[0-9]*'):
                if not os.path.exists(f"{mc_dir}/csrow0"):
                    continue
                memory_health['has_ecc'] = True
                mc_name = os.path.basename(mc_dir)

                # NOTE(review): only csrow0 is inspected, as in the original;
                # confirm whether other csrow*/dimm* entries should be read too.

                # Check uncorrectable errors
                ue_count = self._read_ecc_count(f"{mc_dir}/csrow0/ue_count")
                if ue_count > 0:
                    memory_health['status'] = 'CRITICAL'
                    memory_health['ecc_errors'].append(
                        f"Uncorrectable ECC errors detected in {mc_name}: {ue_count}"
                    )

                # Check correctable errors
                ce_count = self._read_ecc_count(f"{mc_dir}/csrow0/ce_count")
                if ce_count > 0:
                    # Never downgrade a CRITICAL status set by any controller.
                    if memory_health['status'] != 'CRITICAL':
                        memory_health['status'] = 'WARNING'
                    memory_health['ecc_errors'].append(
                        f"Correctable ECC errors detected in {mc_name}: {ce_count}"
                    )
    except Exception as e:
        # Best-effort health probe: report the failure in the result
        # instead of propagating it to the caller.
        memory_health['status'] = 'ERROR'
        memory_health['ecc_errors'].append(f"Error checking ECC status: {str(e)}")

    return memory_health
|
||||
|
||||
def _read_ecc_count(self, filepath: str) -> int:
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user