Added ECC support and sorted code
This commit is contained in:
@ -3,6 +3,7 @@ Description=Run System Health Monitoring Daemon Daily
|
|||||||
|
|
||||||
[Timer]
|
[Timer]
|
||||||
OnCalendar=daily
|
OnCalendar=daily
|
||||||
|
RandomizedDelaySec=1h
|
||||||
Persistent=true
|
Persistent=true
|
||||||
|
|
||||||
[Install]
|
[Install]
|
||||||
|
|||||||
221
hwmonDaemon.py
221
hwmonDaemon.py
@ -1,5 +1,5 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re
|
import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re, glob
|
||||||
from typing import Dict, Any, List
|
from typing import Dict, Any, List
|
||||||
|
|
||||||
# Create a logger
|
# Create a logger
|
||||||
@ -20,6 +20,49 @@ console_handler.setFormatter(formatter)
|
|||||||
logger.addHandler(console_handler)
|
logger.addHandler(console_handler)
|
||||||
|
|
||||||
class SystemHealthMonitor:
|
class SystemHealthMonitor:
|
||||||
|
PRIORITIES = {
|
||||||
|
'CRITICAL': '1',
|
||||||
|
'HIGH': '2',
|
||||||
|
'MEDIUM': '3',
|
||||||
|
'LOW': '4'
|
||||||
|
}
|
||||||
|
ISSUE_PRIORITIES = {
|
||||||
|
'SMART_FAILURE': PRIORITIES['HIGH'],
|
||||||
|
'DISK_CRITICAL': PRIORITIES['HIGH'],
|
||||||
|
'DISK_WARNING': PRIORITIES['MEDIUM'],
|
||||||
|
'UNCORRECTABLE_ECC': PRIORITIES['HIGH'],
|
||||||
|
'CORRECTABLE_ECC': PRIORITIES['MEDIUM'],
|
||||||
|
'CPU_HIGH': PRIORITIES['MEDIUM'],
|
||||||
|
'NETWORK_FAILURE': PRIORITIES['HIGH']
|
||||||
|
}
|
||||||
|
CONFIG = {
|
||||||
|
'TICKET_API_URL': 'http://10.10.10.45/create_ticket_api.php',
|
||||||
|
'STATE_FILE': '/tmp/last_health_check.json',
|
||||||
|
'THRESHOLDS': {
|
||||||
|
'DISK_CRITICAL': 90,
|
||||||
|
'DISK_WARNING': 80,
|
||||||
|
'CPU_WARNING': 80,
|
||||||
|
'TEMPERATURE_WARNING': 65
|
||||||
|
},
|
||||||
|
'NETWORKS': {
|
||||||
|
'MANAGEMENT': '10.10.10.1',
|
||||||
|
'CEPH': '10.10.90.1',
|
||||||
|
'PING_TIMEOUT': 1, # seconds
|
||||||
|
'PING_COUNT': 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
TICKET_TEMPLATES = {
|
||||||
|
'ACTION_TYPE': '[auto]',
|
||||||
|
'ENVIRONMENT': '[production]',
|
||||||
|
'TICKET_TYPE': '[maintenance]',
|
||||||
|
'HARDWARE_TYPE': '[hardware]',
|
||||||
|
'NETWORK_TYPE': '[network]',
|
||||||
|
'SCOPE_SINGLE': '[single-node]',
|
||||||
|
'SCOPE_CLUSTER': '[cluster-wide]',
|
||||||
|
'DEFAULT_CATEGORY': 'Hardware',
|
||||||
|
'DEFAULT_ISSUE_TYPE': 'Problem'
|
||||||
|
}
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
ticket_api_url: str = 'http://10.10.10.45/create_ticket_api.php',
|
ticket_api_url: str = 'http://10.10.10.45/create_ticket_api.php',
|
||||||
state_file: str = '/tmp/last_health_check.json',
|
state_file: str = '/tmp/last_health_check.json',
|
||||||
@ -127,33 +170,35 @@ class SystemHealthMonitor:
|
|||||||
return
|
return
|
||||||
|
|
||||||
hostname = socket.gethostname()
|
hostname = socket.gethostname()
|
||||||
action_type = "[auto]"
|
action_type = self.TICKET_TEMPLATES['ACTION_TYPE']
|
||||||
environment = "[production]"
|
environment = self.TICKET_TEMPLATES['ENVIRONMENT']
|
||||||
ticket_type = "[maintenance]"
|
ticket_type = self.TICKET_TEMPLATES['TICKET_TYPE']
|
||||||
|
|
||||||
for issue in issues:
|
for issue in issues:
|
||||||
# Set default values
|
priority = self.PRIORITIES['MEDIUM'] # default
|
||||||
priority = ""
|
category = self.TICKET_TEMPLATES['DEFAULT_CATEGORY']
|
||||||
category = "Hardware"
|
issue_type = self.TICKET_TEMPLATES['DEFAULT_ISSUE_TYPE']
|
||||||
issue_type = "Problem"
|
scope = self.TICKET_TEMPLATES['SCOPE_SINGLE']
|
||||||
scope = "[single-node]"
|
|
||||||
|
|
||||||
if "Disk" in issue:
|
if "Disk" in issue:
|
||||||
hardware_type = "[hardware]"
|
hardware_type = self.TICKET_TEMPLATES['HARDWARE_TYPE']
|
||||||
if "CRITICAL" in issue or "SMART failure" in issue:
|
if "CRITICAL" in issue or "SMART failure" in issue:
|
||||||
priority = "2"
|
priority = self.ISSUE_PRIORITIES['DISK_CRITICAL']
|
||||||
elif "WARNING" in issue:
|
elif "WARNING" in issue:
|
||||||
priority = "3"
|
priority = self.ISSUE_PRIORITIES['DISK_WARNING']
|
||||||
elif "Memory" in issue:
|
elif "Network" in issue:
|
||||||
|
hardware_type = self.TICKET_TEMPLATES['NETWORK_TYPE']
|
||||||
|
priority = self.ISSUE_PRIORITIES['NETWORK_FAILURE']
|
||||||
|
scope = self.TICKET_TEMPLATES['SCOPE_CLUSTER']
|
||||||
|
elif "Uncorrectable ECC" in issue:
|
||||||
|
hardware_type = "[hardware]"
|
||||||
|
priority = "2"
|
||||||
|
elif "Correctable ECC" in issue:
|
||||||
hardware_type = "[hardware]"
|
hardware_type = "[hardware]"
|
||||||
priority = "3"
|
priority = "3"
|
||||||
elif "CPU" in issue:
|
elif "CPU" in issue:
|
||||||
hardware_type = "[hardware]"
|
hardware_type = "[hardware]"
|
||||||
priority = "3"
|
priority = "3"
|
||||||
elif "Network" in issue:
|
|
||||||
hardware_type = "[network]"
|
|
||||||
priority = "2"
|
|
||||||
scope = "[cluster-wide]"
|
|
||||||
|
|
||||||
# Create standardized ticket title
|
# Create standardized ticket title
|
||||||
ticket_title = f"[{hostname}]{action_type}{hardware_type} {issue} {scope}{environment}{ticket_type}"
|
ticket_title = f"[{hostname}]{action_type}{hardware_type} {issue} {scope}{environment}{ticket_type}"
|
||||||
@ -212,15 +257,15 @@ class SystemHealthMonitor:
|
|||||||
if partition.get('smart_status') == 'UNHEALTHY':
|
if partition.get('smart_status') == 'UNHEALTHY':
|
||||||
issues.append(f"Disk {partition['mountpoint']} has an unhealthy SMART status")
|
issues.append(f"Disk {partition['mountpoint']} has an unhealthy SMART status")
|
||||||
|
|
||||||
# Check for memory-related issues
|
# Check for ECC memory errors
|
||||||
memory_health = health_report.get('memory_health', {})
|
memory_health = health_report.get('memory_health', {})
|
||||||
if memory_health and memory_health.get('memory_percent', 0) > 80:
|
if memory_health.get('has_ecc') and memory_health.get('ecc_errors'):
|
||||||
issues.append("Memory usage is above 80%")
|
issues.extend(memory_health['ecc_errors'])
|
||||||
|
|
||||||
# Check for CPU-related issues
|
# Check for CPU-related issues
|
||||||
cpu_health = health_report.get('cpu_health', {})
|
cpu_health = health_report.get('cpu_health', {})
|
||||||
if cpu_health and cpu_health.get('cpu_usage_percent', 0) > 80:
|
if cpu_health and cpu_health.get('cpu_usage_percent', 0) > self.CONFIG['THRESHOLDS']['CPU_WARNING']:
|
||||||
issues.append("CPU usage is above 80%")
|
issues.append("CPU usage is above threshold")
|
||||||
|
|
||||||
# Check for network-related issues
|
# Check for network-related issues
|
||||||
network_health = health_report.get('network_health', {})
|
network_health = health_report.get('network_health', {})
|
||||||
@ -291,7 +336,7 @@ class SystemHealthMonitor:
|
|||||||
if len(parts) >= 10:
|
if len(parts) >= 10:
|
||||||
temp = int(parts[9])
|
temp = int(parts[9])
|
||||||
smart_health['temp'] = temp
|
smart_health['temp'] = temp
|
||||||
if temp > 65:
|
if temp > self.CONFIG['THRESHOLDS']['TEMPERATURE_WARNING']:
|
||||||
smart_health['issues'].append(f"High drive temperature: {temp}°C")
|
smart_health['issues'].append(f"High drive temperature: {temp}°C")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@ -318,9 +363,9 @@ class SystemHealthMonitor:
|
|||||||
# Check disk usage
|
# Check disk usage
|
||||||
usage = psutil.disk_usage(partition.mountpoint)
|
usage = psutil.disk_usage(partition.mountpoint)
|
||||||
disk_usage_status = 'NORMAL'
|
disk_usage_status = 'NORMAL'
|
||||||
if usage.percent > 90:
|
if usage.percent > self.CONFIG['THRESHOLDS']['DISK_CRITICAL']:
|
||||||
disk_usage_status = 'CRITICAL_HIGH_USAGE'
|
disk_usage_status = 'CRITICAL_HIGH_USAGE'
|
||||||
elif usage.percent > 80:
|
elif usage.percent > self.CONFIG['THRESHOLDS']['DISK_WARNING']:
|
||||||
disk_usage_status = 'WARNING_HIGH_USAGE'
|
disk_usage_status = 'WARNING_HIGH_USAGE'
|
||||||
|
|
||||||
drive_report.update({
|
drive_report.update({
|
||||||
@ -372,19 +417,68 @@ class SystemHealthMonitor:
|
|||||||
|
|
||||||
def _check_memory_usage(self) -> Dict[str, Any]:
|
def _check_memory_usage(self) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Check memory usage and return health metrics.
|
Check for ECC memory errors if ECC memory is present.
|
||||||
|
|
||||||
:return: Dictionary with memory health metrics.
|
:return: Dictionary with memory health metrics and ECC status.
|
||||||
"""
|
"""
|
||||||
memory_info = psutil.virtual_memory()
|
|
||||||
memory_health = {
|
memory_health = {
|
||||||
'total_memory': self._convert_bytes(memory_info.total),
|
'has_ecc': False,
|
||||||
'used_memory': self._convert_bytes(memory_info.used),
|
'ecc_errors': [],
|
||||||
'memory_percent': memory_info.percent,
|
'status': 'OK'
|
||||||
'status': 'OK' if memory_info.percent < 90 else 'WARNING'
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Check if ECC memory is present by looking at edac_mc
|
||||||
|
result = subprocess.run(
|
||||||
|
['ls', '/sys/devices/system/edac/mc'],
|
||||||
|
stdout=subprocess.PIPE,
|
||||||
|
stderr=subprocess.PIPE,
|
||||||
|
text=True
|
||||||
|
)
|
||||||
|
|
||||||
|
if result.returncode == 0:
|
||||||
|
memory_health['has_ecc'] = True
|
||||||
|
|
||||||
|
# Check for ECC errors in mcX/csrowY/ue_count and ce_count files
|
||||||
|
for mc_dir in glob.glob('/sys/devices/system/edac/mc/mc[0-9]*'):
|
||||||
|
mc_name = os.path.basename(mc_dir)
|
||||||
|
|
||||||
|
# Check uncorrectable errors
|
||||||
|
ue_count = self._read_ecc_count(f"{mc_dir}/csrow0/ue_count")
|
||||||
|
if ue_count > 0:
|
||||||
|
memory_health['status'] = 'CRITICAL'
|
||||||
|
memory_health['ecc_errors'].append(
|
||||||
|
f"Uncorrectable ECC errors detected in {mc_name}: {ue_count}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Check correctable errors
|
||||||
|
ce_count = self._read_ecc_count(f"{mc_dir}/csrow0/ce_count")
|
||||||
|
if ce_count > 0:
|
||||||
|
if memory_health['status'] != 'CRITICAL':
|
||||||
|
memory_health['status'] = 'WARNING'
|
||||||
|
memory_health['ecc_errors'].append(
|
||||||
|
f"Correctable ECC errors detected in {mc_name}: {ce_count}"
|
||||||
|
)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
memory_health['status'] = 'ERROR'
|
||||||
|
memory_health['ecc_errors'].append(f"Error checking ECC status: {str(e)}")
|
||||||
|
|
||||||
return memory_health
|
return memory_health
|
||||||
|
|
||||||
|
def _read_ecc_count(self, filepath: str) -> int:
|
||||||
|
"""
|
||||||
|
Read ECC error count from a file.
|
||||||
|
|
||||||
|
:param filepath: Path to the ECC count file
|
||||||
|
:return: Number of ECC errors
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
with open(filepath, 'r') as f:
|
||||||
|
return int(f.read().strip())
|
||||||
|
except:
|
||||||
|
return 0
|
||||||
|
|
||||||
def _check_cpu_usage(self) -> Dict[str, Any]:
|
def _check_cpu_usage(self) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Check CPU usage and return health metrics.
|
Check CPU usage and return health metrics.
|
||||||
@ -394,7 +488,7 @@ class SystemHealthMonitor:
|
|||||||
cpu_usage_percent = psutil.cpu_percent(interval=1)
|
cpu_usage_percent = psutil.cpu_percent(interval=1)
|
||||||
cpu_health = {
|
cpu_health = {
|
||||||
'cpu_usage_percent': cpu_usage_percent,
|
'cpu_usage_percent': cpu_usage_percent,
|
||||||
'status': 'OK' if cpu_usage_percent < 90 else 'WARNING'
|
'status': 'OK' if cpu_usage_percent < self.CONFIG['THRESHOLDS']['CPU_WARNING'] else 'WARNING'
|
||||||
}
|
}
|
||||||
return cpu_health
|
return cpu_health
|
||||||
|
|
||||||
@ -405,30 +499,65 @@ class SystemHealthMonitor:
|
|||||||
:return: Dictionary containing network health metrics and any issues found.
|
:return: Dictionary containing network health metrics and any issues found.
|
||||||
"""
|
"""
|
||||||
network_health = {
|
network_health = {
|
||||||
'management_network': {'issues': []},
|
'management_network': {
|
||||||
'ceph_network': {'issues': []}
|
'issues': [],
|
||||||
|
'status': 'OK',
|
||||||
|
'latency': None
|
||||||
|
},
|
||||||
|
'ceph_network': {
|
||||||
|
'issues': [],
|
||||||
|
'status': 'OK',
|
||||||
|
'latency': None
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Check management network connectivity
|
# Check management network connectivity
|
||||||
proc = subprocess.run(["ping", "-c", "1", "10.10.10.1"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
mgmt_result = subprocess.run(
|
||||||
if proc.returncode != 0:
|
[
|
||||||
|
"ping",
|
||||||
|
"-c", str(self.CONFIG['NETWORKS']['PING_COUNT']),
|
||||||
|
"-W", str(self.CONFIG['NETWORKS']['PING_TIMEOUT']),
|
||||||
|
self.CONFIG['NETWORKS']['MANAGEMENT']
|
||||||
|
],
|
||||||
|
stdout=subprocess.PIPE,
|
||||||
|
stderr=subprocess.PIPE,
|
||||||
|
text=True
|
||||||
|
)
|
||||||
|
|
||||||
|
if mgmt_result.returncode != 0:
|
||||||
|
network_health['management_network']['status'] = 'CRITICAL'
|
||||||
network_health['management_network']['issues'].append(
|
network_health['management_network']['issues'].append(
|
||||||
"Management network is unreachable."
|
"Management network is unreachable"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Check Ceph network connectivity
|
# Check Ceph network connectivity
|
||||||
proc = subprocess.run(["ping", "-c", "1", "10.10.90.1"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
ceph_result = subprocess.run(
|
||||||
if proc.returncode != 0:
|
[
|
||||||
|
"ping",
|
||||||
|
"-c", str(self.CONFIG['NETWORKS']['PING_COUNT']),
|
||||||
|
"-W", str(self.CONFIG['NETWORKS']['PING_TIMEOUT']),
|
||||||
|
self.CONFIG['NETWORKS']['CEPH']
|
||||||
|
],
|
||||||
|
stdout=subprocess.PIPE,
|
||||||
|
stderr=subprocess.PIPE,
|
||||||
|
text=True
|
||||||
|
)
|
||||||
|
|
||||||
|
if ceph_result.returncode != 0:
|
||||||
|
network_health['ceph_network']['status'] = 'CRITICAL'
|
||||||
network_health['ceph_network']['issues'].append(
|
network_health['ceph_network']['issues'].append(
|
||||||
"Ceph network is unreachable."
|
"Ceph network is unreachable"
|
||||||
)
|
)
|
||||||
|
|
||||||
return network_health
|
return network_health
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Network health check failed: {e}")
|
logger.error(f"Network health check failed: {e}")
|
||||||
return {'error': str(e)}
|
return {
|
||||||
|
'status': 'ERROR',
|
||||||
|
'error': str(e)
|
||||||
|
}
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
try:
|
try:
|
||||||
@ -447,9 +576,9 @@ def main():
|
|||||||
|
|
||||||
# Instantiate the SystemHealthMonitor class
|
# Instantiate the SystemHealthMonitor class
|
||||||
monitor = SystemHealthMonitor(
|
monitor = SystemHealthMonitor(
|
||||||
ticket_api_url=ticket_api_url,
|
ticket_api_url=SystemHealthMonitor.CONFIG['TICKET_API_URL'],
|
||||||
state_file=state_file,
|
state_file=SystemHealthMonitor.CONFIG['STATE_FILE'],
|
||||||
dry_run=args.dry_run # Pass the dry-run flag
|
dry_run=args.dry_run
|
||||||
)
|
)
|
||||||
|
|
||||||
# Run the health checks
|
# Run the health checks
|
||||||
|
|||||||
Reference in New Issue
Block a user