Added ECC support and sorted code

2024-12-13 18:36:01 -05:00
parent 9a4a2dadc9
commit a9b0fb77f2
2 changed files with 178 additions and 48 deletions

(File 1 of 2 — the systemd timer unit; filename not captured in this view)

@@ -3,6 +3,7 @@ Description=Run System Health Monitoring Daemon Daily
 [Timer]
 OnCalendar=daily
+RandomizedDelaySec=1h
 Persistent=true
 [Install]
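For context on the one-line change above: RandomizedDelaySec=1h makes systemd delay each activation by a random interval of up to one hour, so a fleet of nodes running this timer will not all hit the ticket API at the same instant; the existing Persistent=true means a run missed while the machine was off fires at the next boot.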

(File 2 of 2 — the Python health-monitor script; filename not captured in this view)

@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re
+import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re, glob
 from typing import Dict, Any, List
 # Create a logger
@@ -20,6 +20,49 @@ console_handler.setFormatter(formatter)
 logger.addHandler(console_handler)
 class SystemHealthMonitor:
+    PRIORITIES = {
+        'CRITICAL': '1',
+        'HIGH': '2',
+        'MEDIUM': '3',
+        'LOW': '4'
+    }
+    ISSUE_PRIORITIES = {
+        'SMART_FAILURE': PRIORITIES['HIGH'],
+        'DISK_CRITICAL': PRIORITIES['HIGH'],
+        'DISK_WARNING': PRIORITIES['MEDIUM'],
+        'UNCORRECTABLE_ECC': PRIORITIES['HIGH'],
+        'CORRECTABLE_ECC': PRIORITIES['MEDIUM'],
+        'CPU_HIGH': PRIORITIES['MEDIUM'],
+        'NETWORK_FAILURE': PRIORITIES['HIGH']
+    }
+    CONFIG = {
+        'TICKET_API_URL': 'http://10.10.10.45/create_ticket_api.php',
+        'STATE_FILE': '/tmp/last_health_check.json',
+        'THRESHOLDS': {
+            'DISK_CRITICAL': 90,
+            'DISK_WARNING': 80,
+            'CPU_WARNING': 80,
+            'TEMPERATURE_WARNING': 65
+        },
+        'NETWORKS': {
+            'MANAGEMENT': '10.10.10.1',
+            'CEPH': '10.10.90.1',
+            'PING_TIMEOUT': 1,  # seconds
+            'PING_COUNT': 1
+        }
+    }
+    TICKET_TEMPLATES = {
+        'ACTION_TYPE': '[auto]',
+        'ENVIRONMENT': '[production]',
+        'TICKET_TYPE': '[maintenance]',
+        'HARDWARE_TYPE': '[hardware]',
+        'NETWORK_TYPE': '[network]',
+        'SCOPE_SINGLE': '[single-node]',
+        'SCOPE_CLUSTER': '[cluster-wide]',
+        'DEFAULT_CATEGORY': 'Hardware',
+        'DEFAULT_ISSUE_TYPE': 'Problem'
+    }
     def __init__(self,
                  ticket_api_url: str = 'http://10.10.10.45/create_ticket_api.php',
                  state_file: str = '/tmp/last_health_check.json',
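A quick standalone sketch (editor's illustration, not part of the commit) of how the new class-level tables resolve: the class body executes top to bottom, so each ISSUE_PRIORITIES entry is already a concrete priority string by the time the class exists.

    # Minimal sketch of the lookup pattern above; values copied from the diff.
    PRIORITIES = {'CRITICAL': '1', 'HIGH': '2', 'MEDIUM': '3', 'LOW': '4'}
    ISSUE_PRIORITIES = {
        'UNCORRECTABLE_ECC': PRIORITIES['HIGH'],    # resolves to '2'
        'CORRECTABLE_ECC': PRIORITIES['MEDIUM'],    # resolves to '3'
    }
    print(ISSUE_PRIORITIES['UNCORRECTABLE_ECC'])  # -> 2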
@@ -127,33 +170,35 @@ class SystemHealthMonitor:
             return
         hostname = socket.gethostname()
-        action_type = "[auto]"
-        environment = "[production]"
-        ticket_type = "[maintenance]"
+        action_type = self.TICKET_TEMPLATES['ACTION_TYPE']
+        environment = self.TICKET_TEMPLATES['ENVIRONMENT']
+        ticket_type = self.TICKET_TEMPLATES['TICKET_TYPE']
         for issue in issues:
-            # Set default values
-            priority = ""
-            category = "Hardware"
-            issue_type = "Problem"
-            scope = "[single-node]"
+            priority = self.PRIORITIES['MEDIUM']  # default
+            category = self.TICKET_TEMPLATES['DEFAULT_CATEGORY']
+            issue_type = self.TICKET_TEMPLATES['DEFAULT_ISSUE_TYPE']
+            scope = self.TICKET_TEMPLATES['SCOPE_SINGLE']
             if "Disk" in issue:
-                hardware_type = "[hardware]"
+                hardware_type = self.TICKET_TEMPLATES['HARDWARE_TYPE']
                 if "CRITICAL" in issue or "SMART failure" in issue:
-                    priority = "2"
+                    priority = self.ISSUE_PRIORITIES['DISK_CRITICAL']
                 elif "WARNING" in issue:
-                    priority = "3"
-            elif "Memory" in issue:
+                    priority = self.ISSUE_PRIORITIES['DISK_WARNING']
+            elif "Network" in issue:
+                hardware_type = self.TICKET_TEMPLATES['NETWORK_TYPE']
+                priority = self.ISSUE_PRIORITIES['NETWORK_FAILURE']
+                scope = self.TICKET_TEMPLATES['SCOPE_CLUSTER']
+            elif "Uncorrectable ECC" in issue:
+                hardware_type = "[hardware]"
+                priority = "2"
+            elif "Correctable ECC" in issue:
                 hardware_type = "[hardware]"
                 priority = "3"
             elif "CPU" in issue:
                 hardware_type = "[hardware]"
                 priority = "3"
-            elif "Network" in issue:
-                hardware_type = "[network]"
-                priority = "2"
-                scope = "[cluster-wide]"
             # Create standardized ticket title
             ticket_title = f"[{hostname}]{action_type}{hardware_type} {issue} {scope}{environment}{ticket_type}"
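To make the title format concrete, here is what the f-string above produces for a hypothetical host named node01 (hostname assumed for illustration) reporting a correctable-ECC issue:

    hostname = "node01"  # assumed example value
    action_type, hardware_type = "[auto]", "[hardware]"
    issue = "Correctable ECC errors detected in mc0: 3"
    scope, environment, ticket_type = "[single-node]", "[production]", "[maintenance]"
    print(f"[{hostname}]{action_type}{hardware_type} {issue} {scope}{environment}{ticket_type}")
    # [node01][auto][hardware] Correctable ECC errors detected in mc0: 3 [single-node][production][maintenance]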
@@ -212,15 +257,15 @@ class SystemHealthMonitor:
         if partition.get('smart_status') == 'UNHEALTHY':
             issues.append(f"Disk {partition['mountpoint']} has an unhealthy SMART status")
-        # Check for memory-related issues
+        # Check for ECC memory errors
         memory_health = health_report.get('memory_health', {})
-        if memory_health and memory_health.get('memory_percent', 0) > 80:
-            issues.append("Memory usage is above 80%")
+        if memory_health.get('has_ecc') and memory_health.get('ecc_errors'):
+            issues.extend(memory_health['ecc_errors'])
         # Check for CPU-related issues
         cpu_health = health_report.get('cpu_health', {})
-        if cpu_health and cpu_health.get('cpu_usage_percent', 0) > 80:
-            issues.append("CPU usage is above 80%")
+        if cpu_health and cpu_health.get('cpu_usage_percent', 0) > self.CONFIG['THRESHOLDS']['CPU_WARNING']:
+            issues.append("CPU usage is above threshold")
         # Check for network-related issues
         network_health = health_report.get('network_health', {})
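The new memory branch above expects memory_health in the shape that _check_memory_usage (further down in this diff) returns. A sketch of that flow, with assumed example values:

    # Assumed example report; each ECC message becomes its own ticketable issue.
    memory_health = {
        'has_ecc': True,
        'status': 'WARNING',
        'ecc_errors': ["Correctable ECC errors detected in mc0: 12"],
    }
    issues = []
    if memory_health.get('has_ecc') and memory_health.get('ecc_errors'):
        issues.extend(memory_health['ecc_errors'])  # extend, not append: one issue per message
    print(issues)  # ['Correctable ECC errors detected in mc0: 12']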
@@ -291,7 +336,7 @@ class SystemHealthMonitor:
                 if len(parts) >= 10:
                     temp = int(parts[9])
                     smart_health['temp'] = temp
-                    if temp > 65:
+                    if temp > self.CONFIG['THRESHOLDS']['TEMPERATURE_WARNING']:
                         smart_health['issues'].append(f"High drive temperature: {temp}°C")
         except Exception as e:
@@ -318,9 +363,9 @@ class SystemHealthMonitor:
             # Check disk usage
             usage = psutil.disk_usage(partition.mountpoint)
             disk_usage_status = 'NORMAL'
-            if usage.percent > 90:
+            if usage.percent > self.CONFIG['THRESHOLDS']['DISK_CRITICAL']:
                 disk_usage_status = 'CRITICAL_HIGH_USAGE'
-            elif usage.percent > 80:
+            elif usage.percent > self.CONFIG['THRESHOLDS']['DISK_WARNING']:
                 disk_usage_status = 'WARNING_HIGH_USAGE'
             drive_report.update({
@@ -372,19 +417,68 @@ class SystemHealthMonitor:
     def _check_memory_usage(self) -> Dict[str, Any]:
         """
-        Check memory usage and return health metrics.
-        :return: Dictionary with memory health metrics.
+        Check for ECC memory errors if ECC memory is present.
+        :return: Dictionary with memory health metrics and ECC status.
         """
-        memory_info = psutil.virtual_memory()
         memory_health = {
-            'total_memory': self._convert_bytes(memory_info.total),
-            'used_memory': self._convert_bytes(memory_info.used),
-            'memory_percent': memory_info.percent,
-            'status': 'OK' if memory_info.percent < 90 else 'WARNING'
+            'has_ecc': False,
+            'ecc_errors': [],
+            'status': 'OK'
         }
+        try:
+            # Check if ECC memory is present by looking at edac_mc
+            result = subprocess.run(
+                ['ls', '/sys/devices/system/edac/mc'],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True
+            )
+            if result.returncode == 0:
+                memory_health['has_ecc'] = True
+                # Check for ECC errors in mcX/csrowY/ue_count and ce_count files
+                for mc_dir in glob.glob('/sys/devices/system/edac/mc/mc[0-9]*'):
+                    mc_name = os.path.basename(mc_dir)
+                    # Check uncorrectable errors
+                    ue_count = self._read_ecc_count(f"{mc_dir}/csrow0/ue_count")
+                    if ue_count > 0:
+                        memory_health['status'] = 'CRITICAL'
+                        memory_health['ecc_errors'].append(
+                            f"Uncorrectable ECC errors detected in {mc_name}: {ue_count}"
+                        )
+                    # Check correctable errors
+                    ce_count = self._read_ecc_count(f"{mc_dir}/csrow0/ce_count")
+                    if ce_count > 0:
+                        if memory_health['status'] != 'CRITICAL':
+                            memory_health['status'] = 'WARNING'
+                        memory_health['ecc_errors'].append(
+                            f"Correctable ECC errors detected in {mc_name}: {ce_count}"
+                        )
+        except Exception as e:
+            memory_health['status'] = 'ERROR'
+            memory_health['ecc_errors'].append(f"Error checking ECC status: {str(e)}")
         return memory_health
+    def _read_ecc_count(self, filepath: str) -> int:
+        """
+        Read ECC error count from a file.
+        :param filepath: Path to the ECC count file
+        :return: Number of ECC errors
+        """
+        try:
+            with open(filepath, 'r') as f:
+                return int(f.read().strip())
+        except:
+            return 0
     def _check_cpu_usage(self) -> Dict[str, Any]:
         """
         Check CPU usage and return health metrics.
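One caveat on the ECC check above: it reads counters only from csrow0 of each memory controller. EDAC typically exposes one csrowN directory per chip-select row (and newer kernels report per-DIMM dimm*/rank* entries instead), so rows beyond the first are not inspected. A standalone sketch of a fuller sweep, under that sysfs-layout assumption (not part of the commit):

    import glob

    def read_count(path: str) -> int:
        # Counter files hold a single integer; treat unreadable files as zero.
        try:
            with open(path) as f:
                return int(f.read().strip())
        except (OSError, ValueError):
            return 0

    # [uc]e_count matches both ue_count (uncorrectable) and ce_count (correctable).
    for counter in glob.glob('/sys/devices/system/edac/mc/mc*/csrow*/[uc]e_count'):
        print(counter, read_count(counter))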
@@ -394,7 +488,7 @@ class SystemHealthMonitor:
         cpu_usage_percent = psutil.cpu_percent(interval=1)
         cpu_health = {
             'cpu_usage_percent': cpu_usage_percent,
-            'status': 'OK' if cpu_usage_percent < 90 else 'WARNING'
+            'status': 'OK' if cpu_usage_percent < self.CONFIG['THRESHOLDS']['CPU_WARNING'] else 'WARNING'
         }
         return cpu_health
@@ -405,30 +499,65 @@ class SystemHealthMonitor:
         :return: Dictionary containing network health metrics and any issues found.
         """
         network_health = {
-            'management_network': {'issues': []},
-            'ceph_network': {'issues': []}
+            'management_network': {
+                'issues': [],
+                'status': 'OK',
+                'latency': None
+            },
+            'ceph_network': {
+                'issues': [],
+                'status': 'OK',
+                'latency': None
+            }
         }
         try:
             # Check management network connectivity
-            proc = subprocess.run(["ping", "-c", "1", "10.10.10.1"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-            if proc.returncode != 0:
+            mgmt_result = subprocess.run(
+                [
+                    "ping",
+                    "-c", str(self.CONFIG['NETWORKS']['PING_COUNT']),
+                    "-W", str(self.CONFIG['NETWORKS']['PING_TIMEOUT']),
+                    self.CONFIG['NETWORKS']['MANAGEMENT']
+                ],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True
+            )
+            if mgmt_result.returncode != 0:
+                network_health['management_network']['status'] = 'CRITICAL'
                 network_health['management_network']['issues'].append(
-                    "Management network is unreachable."
+                    "Management network is unreachable"
                 )
             # Check Ceph network connectivity
-            proc = subprocess.run(["ping", "-c", "1", "10.10.90.1"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-            if proc.returncode != 0:
+            ceph_result = subprocess.run(
+                [
+                    "ping",
+                    "-c", str(self.CONFIG['NETWORKS']['PING_COUNT']),
+                    "-W", str(self.CONFIG['NETWORKS']['PING_TIMEOUT']),
+                    self.CONFIG['NETWORKS']['CEPH']
+                ],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True
+            )
+            if ceph_result.returncode != 0:
+                network_health['ceph_network']['status'] = 'CRITICAL'
                 network_health['ceph_network']['issues'].append(
-                    "Ceph network is unreachable."
+                    "Ceph network is unreachable"
                 )
             return network_health
         except Exception as e:
-            print(f"Network health check failed: {e}")
-            return {'error': str(e)}
+            logger.error(f"Network health check failed: {e}")
+            return {
+                'status': 'ERROR',
+                'error': str(e)
+            }
 def main():
     try:
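On the ping flags used above: with Linux iputils ping, -c sets the probe count and -W the per-reply timeout in seconds, so PING_COUNT=1 with PING_TIMEOUT=1 fails fast instead of hanging on a dead network. A minimal standalone version of the same check (editor's sketch, not part of the commit):

    import subprocess

    def host_reachable(addr: str, timeout_s: int = 1) -> bool:
        # True when a single ICMP echo gets a reply within timeout_s seconds.
        result = subprocess.run(
            ["ping", "-c", "1", "-W", str(timeout_s), addr],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        return result.returncode == 0

    print(host_reachable("10.10.10.1"))  # management gateway from CONFIG above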
@@ -447,9 +576,9 @@ def main():
         # Instantiate the SystemHealthMonitor class
         monitor = SystemHealthMonitor(
-            ticket_api_url=ticket_api_url,
-            state_file=state_file,
-            dry_run=args.dry_run  # Pass the dry-run flag
+            ticket_api_url=SystemHealthMonitor.CONFIG['TICKET_API_URL'],
+            state_file=SystemHealthMonitor.CONFIG['STATE_FILE'],
+            dry_run=args.dry_run
         )
         # Run the health checks
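After this change, main() sources the API URL and state-file path from SystemHealthMonitor.CONFIG rather than local variables, so those values are fixed at class level while --dry-run remains a runtime flag. A dry run (script filename assumed for illustration; it is not shown in this diff) would look like: python3 system_health_monitor.py --dry-run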