Added ECC memory error monitoring and tidied hard-coded settings into class constants

2024-12-13 18:36:01 -05:00
parent 9a4a2dadc9
commit a9b0fb77f2
2 changed files with 178 additions and 48 deletions


@@ -1,5 +1,5 @@
#!/usr/bin/env python3
import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re
import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re, glob
from typing import Dict, Any, List
# Create a logger
@@ -20,6 +20,49 @@ console_handler.setFormatter(formatter)
logger.addHandler(console_handler)
class SystemHealthMonitor:
PRIORITIES = {
'CRITICAL': '1',
'HIGH': '2',
'MEDIUM': '3',
'LOW': '4'
}
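    # Numeric strings, '1' = most urgent; assumed to be the priority codes
    # that create_ticket_api.php expects (not verified against the API).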
ISSUE_PRIORITIES = {
'SMART_FAILURE': PRIORITIES['HIGH'],
'DISK_CRITICAL': PRIORITIES['HIGH'],
'DISK_WARNING': PRIORITIES['MEDIUM'],
'UNCORRECTABLE_ECC': PRIORITIES['HIGH'],
'CORRECTABLE_ECC': PRIORITIES['MEDIUM'],
'CPU_HIGH': PRIORITIES['MEDIUM'],
'NETWORK_FAILURE': PRIORITIES['HIGH']
}
CONFIG = {
'TICKET_API_URL': 'http://10.10.10.45/create_ticket_api.php',
'STATE_FILE': '/tmp/last_health_check.json',
        'THRESHOLDS': {
            'DISK_CRITICAL': 90,        # percent of filesystem used
            'DISK_WARNING': 80,         # percent of filesystem used
            'CPU_WARNING': 80,          # percent CPU utilization
            'TEMPERATURE_WARNING': 65   # drive temperature in °C
        },
'NETWORKS': {
'MANAGEMENT': '10.10.10.1',
'CEPH': '10.10.90.1',
'PING_TIMEOUT': 1, # seconds
'PING_COUNT': 1
}
}
TICKET_TEMPLATES = {
'ACTION_TYPE': '[auto]',
'ENVIRONMENT': '[production]',
'TICKET_TYPE': '[maintenance]',
'HARDWARE_TYPE': '[hardware]',
'NETWORK_TYPE': '[network]',
'SCOPE_SINGLE': '[single-node]',
'SCOPE_CLUSTER': '[cluster-wide]',
'DEFAULT_CATEGORY': 'Hardware',
'DEFAULT_ISSUE_TYPE': 'Problem'
}
def __init__(self,
                 ticket_api_url: str = CONFIG['TICKET_API_URL'],
                 state_file: str = CONFIG['STATE_FILE'],
@@ -34,7 +77,7 @@ class SystemHealthMonitor:
self.ticket_api_url = ticket_api_url
self.state_file = state_file
self.dry_run = dry_run
def run(self):
"""
Perform a one-shot health check of the system.
@@ -127,33 +170,35 @@ class SystemHealthMonitor:
return
hostname = socket.gethostname()
action_type = "[auto]"
environment = "[production]"
ticket_type = "[maintenance]"
action_type = self.TICKET_TEMPLATES['ACTION_TYPE']
environment = self.TICKET_TEMPLATES['ENVIRONMENT']
ticket_type = self.TICKET_TEMPLATES['TICKET_TYPE']
for issue in issues:
# Set default values
priority = ""
category = "Hardware"
issue_type = "Problem"
scope = "[single-node]"
            priority = self.PRIORITIES['MEDIUM']  # default
            category = self.TICKET_TEMPLATES['DEFAULT_CATEGORY']
            issue_type = self.TICKET_TEMPLATES['DEFAULT_ISSUE_TYPE']
            scope = self.TICKET_TEMPLATES['SCOPE_SINGLE']
            hardware_type = self.TICKET_TEMPLATES['HARDWARE_TYPE']  # default, so it is always bound
if "Disk" in issue:
hardware_type = "[hardware]"
hardware_type = self.TICKET_TEMPLATES['HARDWARE_TYPE']
if "CRITICAL" in issue or "SMART failure" in issue:
priority = "2"
priority = self.ISSUE_PRIORITIES['DISK_CRITICAL']
elif "WARNING" in issue:
priority = "3"
elif "Memory" in issue:
priority = self.ISSUE_PRIORITIES['DISK_WARNING']
elif "Network" in issue:
hardware_type = self.TICKET_TEMPLATES['NETWORK_TYPE']
priority = self.ISSUE_PRIORITIES['NETWORK_FAILURE']
scope = self.TICKET_TEMPLATES['SCOPE_CLUSTER']
elif "Uncorrectable ECC" in issue:
hardware_type = "[hardware]"
priority = "2"
elif "Correctable ECC" in issue:
hardware_type = "[hardware]"
priority = "3"
elif "CPU" in issue:
hardware_type = "[hardware]"
priority = "3"
elif "Network" in issue:
hardware_type = "[network]"
priority = "2"
scope = "[cluster-wide]"
# Create standardized ticket title
ticket_title = f"[{hostname}]{action_type}{hardware_type} {issue} {scope}{environment}{ticket_type}"
@@ -212,15 +257,15 @@ class SystemHealthMonitor:
if partition.get('smart_status') == 'UNHEALTHY':
issues.append(f"Disk {partition['mountpoint']} has an unhealthy SMART status")
# Check for memory-related issues
# Check for ECC memory errors
memory_health = health_report.get('memory_health', {})
if memory_health and memory_health.get('memory_percent', 0) > 80:
issues.append("Memory usage is above 80%")
if memory_health.get('has_ecc') and memory_health.get('ecc_errors'):
issues.extend(memory_health['ecc_errors'])
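        # The ECC strings contain "Uncorrectable ECC"/"Correctable ECC", which is
        # exactly what the ticket-creation logic keys on to assign a priority.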
# Check for CPU-related issues
cpu_health = health_report.get('cpu_health', {})
if cpu_health and cpu_health.get('cpu_usage_percent', 0) > 80:
issues.append("CPU usage is above 80%")
        if cpu_health and cpu_health.get('cpu_usage_percent', 0) > self.CONFIG['THRESHOLDS']['CPU_WARNING']:
            issues.append(f"CPU usage is above {self.CONFIG['THRESHOLDS']['CPU_WARNING']}%")
# Check for network-related issues
network_health = health_report.get('network_health', {})
@@ -291,7 +336,7 @@ class SystemHealthMonitor:
if len(parts) >= 10:
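                        # parts[9] is presumably the RAW_VALUE column of a
                        # `smartctl -A` attribute row (e.g. 194 Temperature_Celsius)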
temp = int(parts[9])
smart_health['temp'] = temp
if temp > 65:
if temp > self.CONFIG['THRESHOLDS']['TEMPERATURE_WARNING']:
smart_health['issues'].append(f"High drive temperature: {temp}°C")
except Exception as e:
@@ -318,9 +363,9 @@ class SystemHealthMonitor:
# Check disk usage
usage = psutil.disk_usage(partition.mountpoint)
disk_usage_status = 'NORMAL'
if usage.percent > 90:
if usage.percent > self.CONFIG['THRESHOLDS']['DISK_CRITICAL']:
disk_usage_status = 'CRITICAL_HIGH_USAGE'
elif usage.percent > 80:
elif usage.percent > self.CONFIG['THRESHOLDS']['DISK_WARNING']:
disk_usage_status = 'WARNING_HIGH_USAGE'
drive_report.update({
@@ -372,19 +417,68 @@ class SystemHealthMonitor:
def _check_memory_usage(self) -> Dict[str, Any]:
"""
Check memory usage and return health metrics.
Check for ECC memory errors if ECC memory is present.
:return: Dictionary with memory health metrics.
:return: Dictionary with memory health metrics and ECC status.
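        Illustrative shape of the return value (exact size strings depend on
        _convert_bytes):
            {'total_memory': '64.0 GB', 'used_memory': '12.3 GB',
             'memory_percent': 19.2, 'has_ecc': True,
             'ecc_errors': [], 'status': 'OK'}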
"""
memory_info = psutil.virtual_memory()
memory_health = {
'total_memory': self._convert_bytes(memory_info.total),
'used_memory': self._convert_bytes(memory_info.used),
'memory_percent': memory_info.percent,
'status': 'OK' if memory_info.percent < 90 else 'WARNING'
'has_ecc': False,
'ecc_errors': [],
'status': 'OK'
}
        try:
            # ECC presence is reported through the kernel EDAC subsystem: each
            # memory controller appears as /sys/devices/system/edac/mc/mc0, mc1, ...
            mc_dirs = glob.glob('/sys/devices/system/edac/mc/mc[0-9]*')
            if mc_dirs:
                memory_health['has_ecc'] = True
                # Check ue_count/ce_count under every csrow of each controller
                for mc_dir in mc_dirs:
mc_name = os.path.basename(mc_dir)
                    # Sum uncorrectable errors across every csrow of this controller
                    ue_count = sum(self._read_ecc_count(f"{csrow}/ue_count")
                                   for csrow in glob.glob(f"{mc_dir}/csrow[0-9]*"))
if ue_count > 0:
memory_health['status'] = 'CRITICAL'
memory_health['ecc_errors'].append(
f"Uncorrectable ECC errors detected in {mc_name}: {ue_count}"
)
                    # Sum correctable errors across every csrow of this controller
                    ce_count = sum(self._read_ecc_count(f"{csrow}/ce_count")
                                   for csrow in glob.glob(f"{mc_dir}/csrow[0-9]*"))
if ce_count > 0:
if memory_health['status'] != 'CRITICAL':
memory_health['status'] = 'WARNING'
memory_health['ecc_errors'].append(
f"Correctable ECC errors detected in {mc_name}: {ce_count}"
)
except Exception as e:
memory_health['status'] = 'ERROR'
memory_health['ecc_errors'].append(f"Error checking ECC status: {str(e)}")
return memory_health
def _read_ecc_count(self, filepath: str) -> int:
"""
Read ECC error count from a file.
:param filepath: Path to the ECC count file
:return: Number of ECC errors
"""
try:
with open(filepath, 'r') as f:
return int(f.read().strip())
        except (OSError, ValueError):
return 0
def _check_cpu_usage(self) -> Dict[str, Any]:
"""
Check CPU usage and return health metrics.
@@ -394,7 +488,7 @@ class SystemHealthMonitor:
cpu_usage_percent = psutil.cpu_percent(interval=1)
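        # interval=1 blocks for one second and compares CPU times across that
        # window, which is more reliable than a non-blocking instantaneous sample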
cpu_health = {
'cpu_usage_percent': cpu_usage_percent,
'status': 'OK' if cpu_usage_percent < 90 else 'WARNING'
'status': 'OK' if cpu_usage_percent < self.CONFIG['THRESHOLDS']['CPU_WARNING'] else 'WARNING'
}
return cpu_health
@@ -405,30 +499,65 @@ class SystemHealthMonitor:
:return: Dictionary containing network health metrics and any issues found.
"""
network_health = {
'management_network': {'issues': []},
'ceph_network': {'issues': []}
'management_network': {
'issues': [],
'status': 'OK',
'latency': None
},
'ceph_network': {
'issues': [],
'status': 'OK',
'latency': None
}
}
try:
# Check management network connectivity
proc = subprocess.run(["ping", "-c", "1", "10.10.10.1"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if proc.returncode != 0:
mgmt_result = subprocess.run(
[
"ping",
"-c", str(self.CONFIG['NETWORKS']['PING_COUNT']),
"-W", str(self.CONFIG['NETWORKS']['PING_TIMEOUT']),
self.CONFIG['NETWORKS']['MANAGEMENT']
],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True
)
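            # NB: ping's -W is the reply timeout in whole seconds with Linux
            # iputils; other ping implementations interpret -W differently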
            if mgmt_result.returncode != 0:
                network_health['management_network']['status'] = 'CRITICAL'
                network_health['management_network']['issues'].append(
                    "Management network is unreachable"
                )
            else:
                # Populate the declared 'latency' field from the ping output
                match = re.search(r'time=([\d.]+) ms', mgmt_result.stdout)
                if match:
                    network_health['management_network']['latency'] = float(match.group(1))
# Check Ceph network connectivity
proc = subprocess.run(["ping", "-c", "1", "10.10.90.1"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if proc.returncode != 0:
ceph_result = subprocess.run(
[
"ping",
"-c", str(self.CONFIG['NETWORKS']['PING_COUNT']),
"-W", str(self.CONFIG['NETWORKS']['PING_TIMEOUT']),
self.CONFIG['NETWORKS']['CEPH']
],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True
)
            if ceph_result.returncode != 0:
                network_health['ceph_network']['status'] = 'CRITICAL'
                network_health['ceph_network']['issues'].append(
                    "Ceph network is unreachable"
                )
            else:
                match = re.search(r'time=([\d.]+) ms', ceph_result.stdout)
                if match:
                    network_health['ceph_network']['latency'] = float(match.group(1))
return network_health
except Exception as e:
print(f"Network health check failed: {e}")
return {'error': str(e)}
logger.error(f"Network health check failed: {e}")
return {
'status': 'ERROR',
'error': str(e)
}
def main():
try:
@@ -447,9 +576,9 @@ def main():
# Instantiate the SystemHealthMonitor class
monitor = SystemHealthMonitor(
ticket_api_url=ticket_api_url,
state_file=state_file,
dry_run=args.dry_run # Pass the dry-run flag
ticket_api_url=SystemHealthMonitor.CONFIG['TICKET_API_URL'],
state_file=SystemHealthMonitor.CONFIG['STATE_FILE'],
dry_run=args.dry_run
)
# Run the health checks
@@ -476,4 +605,4 @@ if __name__ == "__main__":
# Set dry-run mode if specified
dry_run_mode = args.dry_run
    main()