Added ECC support and sorted code
This commit is contained in:
225
hwmonDaemon.py
225
hwmonDaemon.py
@ -1,5 +1,5 @@
|
||||
#!/usr/bin/env python3
|
||||
import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re
|
||||
import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re, glob
|
||||
from typing import Dict, Any, List
|
||||
|
||||
# Create a logger
|
||||
@ -20,6 +20,49 @@ console_handler.setFormatter(formatter)
|
||||
logger.addHandler(console_handler)
|
||||
|
||||
class SystemHealthMonitor:
|
||||
# Ticket priority codes keyed by severity name. The values are strings;
# '1' is the most severe level (CRITICAL) and '4' the least (LOW).
PRIORITIES = {
    'CRITICAL': '1',
    'HIGH': '2',
    'MEDIUM': '3',
    'LOW': '4'
}
# Priority code for each issue category, expressed through PRIORITIES so the
# severity levels are defined in one place.
ISSUE_PRIORITIES = {
    'SMART_FAILURE': PRIORITIES['HIGH'],
    'DISK_CRITICAL': PRIORITIES['HIGH'],
    'DISK_WARNING': PRIORITIES['MEDIUM'],
    'UNCORRECTABLE_ECC': PRIORITIES['HIGH'],
    'CORRECTABLE_ECC': PRIORITIES['MEDIUM'],
    'CPU_HIGH': PRIORITIES['MEDIUM'],
    'NETWORK_FAILURE': PRIORITIES['HIGH']
}
# Runtime settings shared by the health checks.
CONFIG = {
    # Endpoint and state-file path handed to the monitor's constructor.
    'TICKET_API_URL': 'http://10.10.10.45/create_ticket_api.php',
    'STATE_FILE': '/tmp/last_health_check.json',
    # Limits the checks compare against: the DISK_* and CPU_* values are
    # compared with usage percentages, TEMPERATURE_WARNING with the drive
    # temperature that is reported in degrees Celsius.
    'THRESHOLDS': {
        'DISK_CRITICAL': 90,
        'DISK_WARNING': 80,
        'CPU_WARNING': 80,
        'TEMPERATURE_WARNING': 65
    },
    # Addresses pinged by the network check, plus the values passed to
    # ping's -W (timeout) and -c (count) options.
    'NETWORKS': {
        'MANAGEMENT': '10.10.10.1',
        'CEPH': '10.10.90.1',
        'PING_TIMEOUT': 1,  # seconds
        'PING_COUNT': 1
    }
}
# Bracketed tags concatenated into the standardized ticket title, plus the
# default category and issue type applied to every issue.
TICKET_TEMPLATES = {
    'ACTION_TYPE': '[auto]',
    'ENVIRONMENT': '[production]',
    'TICKET_TYPE': '[maintenance]',
    'HARDWARE_TYPE': '[hardware]',
    'NETWORK_TYPE': '[network]',
    'SCOPE_SINGLE': '[single-node]',
    'SCOPE_CLUSTER': '[cluster-wide]',
    'DEFAULT_CATEGORY': 'Hardware',
    'DEFAULT_ISSUE_TYPE': 'Problem'
}
|
||||
|
||||
def __init__(self,
|
||||
ticket_api_url: str = 'http://10.10.10.45/create_ticket_api.php',
|
||||
state_file: str = '/tmp/last_health_check.json',
|
||||
@ -34,7 +77,7 @@ class SystemHealthMonitor:
|
||||
self.ticket_api_url = ticket_api_url
|
||||
self.state_file = state_file
|
||||
self.dry_run = dry_run
|
||||
|
||||
|
||||
def run(self):
|
||||
"""
|
||||
Perform a one-shot health check of the system.
|
||||
@ -127,33 +170,35 @@ class SystemHealthMonitor:
|
||||
return
|
||||
|
||||
hostname = socket.gethostname()
|
||||
action_type = "[auto]"
|
||||
environment = "[production]"
|
||||
ticket_type = "[maintenance]"
|
||||
action_type = self.TICKET_TEMPLATES['ACTION_TYPE']
|
||||
environment = self.TICKET_TEMPLATES['ENVIRONMENT']
|
||||
ticket_type = self.TICKET_TEMPLATES['TICKET_TYPE']
|
||||
|
||||
for issue in issues:
|
||||
# Set default values
|
||||
priority = ""
|
||||
category = "Hardware"
|
||||
issue_type = "Problem"
|
||||
scope = "[single-node]"
|
||||
priority = self.PRIORITIES['MEDIUM'] # default
|
||||
category = self.TICKET_TEMPLATES['DEFAULT_CATEGORY']
|
||||
issue_type = self.TICKET_TEMPLATES['DEFAULT_ISSUE_TYPE']
|
||||
scope = self.TICKET_TEMPLATES['SCOPE_SINGLE']
|
||||
|
||||
if "Disk" in issue:
|
||||
hardware_type = "[hardware]"
|
||||
hardware_type = self.TICKET_TEMPLATES['HARDWARE_TYPE']
|
||||
if "CRITICAL" in issue or "SMART failure" in issue:
|
||||
priority = "2"
|
||||
priority = self.ISSUE_PRIORITIES['DISK_CRITICAL']
|
||||
elif "WARNING" in issue:
|
||||
priority = "3"
|
||||
elif "Memory" in issue:
|
||||
priority = self.ISSUE_PRIORITIES['DISK_WARNING']
|
||||
elif "Network" in issue:
|
||||
hardware_type = self.TICKET_TEMPLATES['NETWORK_TYPE']
|
||||
priority = self.ISSUE_PRIORITIES['NETWORK_FAILURE']
|
||||
scope = self.TICKET_TEMPLATES['SCOPE_CLUSTER']
|
||||
elif "Uncorrectable ECC" in issue:
|
||||
hardware_type = "[hardware]"
|
||||
priority = "2"
|
||||
elif "Correctable ECC" in issue:
|
||||
hardware_type = "[hardware]"
|
||||
priority = "3"
|
||||
elif "CPU" in issue:
|
||||
hardware_type = "[hardware]"
|
||||
priority = "3"
|
||||
elif "Network" in issue:
|
||||
hardware_type = "[network]"
|
||||
priority = "2"
|
||||
scope = "[cluster-wide]"
|
||||
|
||||
# Create standardized ticket title
|
||||
ticket_title = f"[{hostname}]{action_type}{hardware_type} {issue} {scope}{environment}{ticket_type}"
|
||||
@ -212,15 +257,15 @@ class SystemHealthMonitor:
|
||||
if partition.get('smart_status') == 'UNHEALTHY':
|
||||
issues.append(f"Disk {partition['mountpoint']} has an unhealthy SMART status")
|
||||
|
||||
# Check for memory-related issues
|
||||
# Check for ECC memory errors
|
||||
memory_health = health_report.get('memory_health', {})
|
||||
if memory_health and memory_health.get('memory_percent', 0) > 80:
|
||||
issues.append("Memory usage is above 80%")
|
||||
if memory_health.get('has_ecc') and memory_health.get('ecc_errors'):
|
||||
issues.extend(memory_health['ecc_errors'])
|
||||
|
||||
# Check for CPU-related issues
|
||||
cpu_health = health_report.get('cpu_health', {})
|
||||
if cpu_health and cpu_health.get('cpu_usage_percent', 0) > 80:
|
||||
issues.append("CPU usage is above 80%")
|
||||
if cpu_health and cpu_health.get('cpu_usage_percent', 0) > self.CONFIG['THRESHOLDS']['CPU_WARNING']:
|
||||
issues.append("CPU usage is above threshold")
|
||||
|
||||
# Check for network-related issues
|
||||
network_health = health_report.get('network_health', {})
|
||||
@ -291,7 +336,7 @@ class SystemHealthMonitor:
|
||||
if len(parts) >= 10:
|
||||
temp = int(parts[9])
|
||||
smart_health['temp'] = temp
|
||||
if temp > 65:
|
||||
if temp > self.CONFIG['THRESHOLDS']['TEMPERATURE_WARNING']:
|
||||
smart_health['issues'].append(f"High drive temperature: {temp}°C")
|
||||
|
||||
except Exception as e:
|
||||
@ -318,9 +363,9 @@ class SystemHealthMonitor:
|
||||
# Check disk usage
|
||||
usage = psutil.disk_usage(partition.mountpoint)
|
||||
disk_usage_status = 'NORMAL'
|
||||
if usage.percent > 90:
|
||||
if usage.percent > self.CONFIG['THRESHOLDS']['DISK_CRITICAL']:
|
||||
disk_usage_status = 'CRITICAL_HIGH_USAGE'
|
||||
elif usage.percent > 80:
|
||||
elif usage.percent > self.CONFIG['THRESHOLDS']['DISK_WARNING']:
|
||||
disk_usage_status = 'WARNING_HIGH_USAGE'
|
||||
|
||||
drive_report.update({
|
||||
@ -372,19 +417,68 @@ class SystemHealthMonitor:
|
||||
|
||||
def _check_memory_usage(self) -> Dict[str, Any]:
    """
    Collect memory usage metrics and check for ECC memory errors.

    ECC support is detected through the kernel EDAC sysfs tree
    (``/sys/devices/system/edac/mc``). For every memory controller found
    there, the controller-wide ``ue_count`` (uncorrectable) and
    ``ce_count`` (correctable) totals are read, so errors on any
    chip-select row are reported rather than only those on ``csrow0``.

    :return: Dictionary with ``total_memory``, ``used_memory``,
             ``memory_percent``, ``has_ecc``, ``ecc_errors`` (list of
             human-readable messages) and ``status``, which is ``'OK'``,
             ``'WARNING'`` (correctable errors only), ``'CRITICAL'``
             (uncorrectable errors) or ``'ERROR'`` (the check itself failed).
    """
    edac_root = '/sys/devices/system/edac/mc'

    memory_info = psutil.virtual_memory()
    memory_health = {
        'total_memory': self._convert_bytes(memory_info.total),
        'used_memory': self._convert_bytes(memory_info.used),
        'memory_percent': memory_info.percent,
        'has_ecc': False,
        'ecc_errors': [],
        'status': 'OK'
    }

    try:
        # The EDAC directory only exists when an ECC-capable memory
        # controller driver is loaded; no need to spawn `ls` to find out.
        if not os.path.isdir(edac_root):
            return memory_health

        memory_health['has_ecc'] = True

        # Sorted so the reported messages come out in a stable order.
        for mc_dir in sorted(glob.glob(os.path.join(edac_root, 'mc[0-9]*'))):
            mc_name = os.path.basename(mc_dir)

            # Controller-level totals cover every csrow/DIMM behind this
            # controller, not just the first row.
            ue_count = self._read_ecc_count(os.path.join(mc_dir, 'ue_count'))
            if ue_count > 0:
                memory_health['status'] = 'CRITICAL'
                memory_health['ecc_errors'].append(
                    f"Uncorrectable ECC errors detected in {mc_name}: {ue_count}"
                )

            ce_count = self._read_ecc_count(os.path.join(mc_dir, 'ce_count'))
            if ce_count > 0:
                # Never downgrade a CRITICAL raised by uncorrectable errors.
                if memory_health['status'] != 'CRITICAL':
                    memory_health['status'] = 'WARNING'
                memory_health['ecc_errors'].append(
                    f"Correctable ECC errors detected in {mc_name}: {ce_count}"
                )

    except Exception as e:
        # Best-effort boundary: a failed ECC probe is reported in the
        # result instead of aborting the whole health check.
        memory_health['status'] = 'ERROR'
        memory_health['ecc_errors'].append(f"Error checking ECC status: {str(e)}")

    return memory_health
|
||||
|
||||
def _read_ecc_count(self, filepath: str) -> int:
|
||||
"""
|
||||
Read ECC error count from a file.
|
||||
|
||||
:param filepath: Path to the ECC count file
|
||||
:return: Number of ECC errors
|
||||
"""
|
||||
try:
|
||||
with open(filepath, 'r') as f:
|
||||
return int(f.read().strip())
|
||||
except:
|
||||
return 0
|
||||
|
||||
def _check_cpu_usage(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Check CPU usage and return health metrics.
|
||||
@ -394,7 +488,7 @@ class SystemHealthMonitor:
|
||||
cpu_usage_percent = psutil.cpu_percent(interval=1)
|
||||
cpu_health = {
|
||||
'cpu_usage_percent': cpu_usage_percent,
|
||||
'status': 'OK' if cpu_usage_percent < 90 else 'WARNING'
|
||||
'status': 'OK' if cpu_usage_percent < self.CONFIG['THRESHOLDS']['CPU_WARNING'] else 'WARNING'
|
||||
}
|
||||
return cpu_health
|
||||
|
||||
@ -405,30 +499,65 @@ class SystemHealthMonitor:
|
||||
:return: Dictionary containing network health metrics and any issues found.
|
||||
"""
|
||||
network_health = {
|
||||
'management_network': {'issues': []},
|
||||
'ceph_network': {'issues': []}
|
||||
'management_network': {
|
||||
'issues': [],
|
||||
'status': 'OK',
|
||||
'latency': None
|
||||
},
|
||||
'ceph_network': {
|
||||
'issues': [],
|
||||
'status': 'OK',
|
||||
'latency': None
|
||||
}
|
||||
}
|
||||
|
||||
try:
|
||||
# Check management network connectivity
|
||||
proc = subprocess.run(["ping", "-c", "1", "10.10.10.1"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||
if proc.returncode != 0:
|
||||
mgmt_result = subprocess.run(
|
||||
[
|
||||
"ping",
|
||||
"-c", str(self.CONFIG['NETWORKS']['PING_COUNT']),
|
||||
"-W", str(self.CONFIG['NETWORKS']['PING_TIMEOUT']),
|
||||
self.CONFIG['NETWORKS']['MANAGEMENT']
|
||||
],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True
|
||||
)
|
||||
|
||||
if mgmt_result.returncode != 0:
|
||||
network_health['management_network']['status'] = 'CRITICAL'
|
||||
network_health['management_network']['issues'].append(
|
||||
"Management network is unreachable."
|
||||
"Management network is unreachable"
|
||||
)
|
||||
|
||||
# Check Ceph network connectivity
|
||||
proc = subprocess.run(["ping", "-c", "1", "10.10.90.1"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||
if proc.returncode != 0:
|
||||
ceph_result = subprocess.run(
|
||||
[
|
||||
"ping",
|
||||
"-c", str(self.CONFIG['NETWORKS']['PING_COUNT']),
|
||||
"-W", str(self.CONFIG['NETWORKS']['PING_TIMEOUT']),
|
||||
self.CONFIG['NETWORKS']['CEPH']
|
||||
],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True
|
||||
)
|
||||
|
||||
if ceph_result.returncode != 0:
|
||||
network_health['ceph_network']['status'] = 'CRITICAL'
|
||||
network_health['ceph_network']['issues'].append(
|
||||
"Ceph network is unreachable."
|
||||
"Ceph network is unreachable"
|
||||
)
|
||||
|
||||
return network_health
|
||||
|
||||
except Exception as e:
|
||||
print(f"Network health check failed: {e}")
|
||||
return {'error': str(e)}
|
||||
logger.error(f"Network health check failed: {e}")
|
||||
return {
|
||||
'status': 'ERROR',
|
||||
'error': str(e)
|
||||
}
|
||||
|
||||
def main():
|
||||
try:
|
||||
@ -447,9 +576,9 @@ def main():
|
||||
|
||||
# Instantiate the SystemHealthMonitor class
|
||||
monitor = SystemHealthMonitor(
|
||||
ticket_api_url=ticket_api_url,
|
||||
state_file=state_file,
|
||||
dry_run=args.dry_run # Pass the dry-run flag
|
||||
ticket_api_url=SystemHealthMonitor.CONFIG['TICKET_API_URL'],
|
||||
state_file=SystemHealthMonitor.CONFIG['STATE_FILE'],
|
||||
dry_run=args.dry_run
|
||||
)
|
||||
|
||||
# Run the health checks
|
||||
@ -476,4 +605,4 @@ if __name__ == "__main__":
|
||||
# Set dry-run mode if specified
|
||||
dry_run_mode = args.dry_run
|
||||
|
||||
main()
|
||||
main()
|
||||
|
||||
Reference in New Issue
Block a user