Adjusted network logic; added IP-to-hostname maps for both subnets

This commit is contained in:
2024-12-04 21:14:47 -05:00
parent 8e5cda287d
commit 0b26b67019
2 changed files with 43 additions and 33 deletions

View File

@ -4,7 +4,7 @@ After=network.target
[Service] [Service]
Type=simple Type=simple
ExecStart=/usr/bin/env python3 -c "import urllib.request; exec(urllib.request.urlopen('https://10.10.10.58/JWS/hwmonDaemon/raw/branch/main/hwmonDaemon.py').read().decode('utf-8'))" ExecStart=/usr/bin/env python3 -c "import urllib.request; exec(urllib.request.urlopen('https://10.10.10.110/JWS/hwmonDaemon/raw/branch/main/hwmonDaemon.py').read().decode('utf-8'))"
Restart=always Restart=always
User=root User=root
Group=root Group=root

View File

@ -5,6 +5,8 @@ import json
import datetime import datetime
import requests import requests
import psutil import psutil
import socket
import subprocess
from typing import Dict, Any, List from typing import Dict, Any, List
class SystemHealthMonitor: class SystemHealthMonitor:
@ -62,7 +64,7 @@ class SystemHealthMonitor:
priority = "P4" # Default to low priority priority = "P4" # Default to low priority
categories = set() # To accumulate unique categories categories = set() # To accumulate unique categories
issue_types = set() # To accumulate unique issue types issue_types = set() # To accumulate unique issue types
hostname = "medium1" # Replace with actual logic to determine the hostname hostname = socket.gethostname()
action_type = "[auto]" action_type = "[auto]"
scope = "[cluster-wide]" scope = "[cluster-wide]"
environment = "[production]" environment = "[production]"
@ -246,7 +248,7 @@ class SystemHealthMonitor:
def _check_drive_smart_status(self) -> List[Dict[str, Any]]: def _check_drive_smart_status(self) -> List[Dict[str, Any]]:
""" """
Check SMART status of drives Check SMART status of drives using smartctl.
:return: List of SMART status for drives :return: List of SMART status for drives
""" """
@ -254,38 +256,27 @@ class SystemHealthMonitor:
try: try:
for disk in psutil.disk_partitions(): for disk in psutil.disk_partitions():
drive = disk.device drive = disk.device
# Example placeholder: SMART status retrieval would need smartmontools try:
drives.append({'drive': drive, 'status': 'HEALTHY'}) # Use smartctl to check the drive's SMART status
return drives result = subprocess.run(
['smartctl', '-H', drive],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True
)
output = result.stdout + result.stderr
status = 'HEALTHY' if 'PASSED' in output else 'UNHEALTHY'
drives.append({'drive': drive, 'status': status})
except Exception as e:
drives.append({'drive': drive, 'status': 'ERROR', 'error': str(e)})
except Exception as e: except Exception as e:
print(f"SMART status check failed: {e}") print(f"SMART status check failed: {e}")
return [{'error': str(e)}] return [{'error': str(e)}]
return drives
def _check_system_temperatures(self) -> Dict[str, Any]:
    """
    Collect temperature readings from psutil, grouped by sensor name.

    :return: Dict mapping each sensor group name to a list of reading
             dicts with ``label``, ``current``, ``high`` and ``critical``
             keys; or a single-key ``{'error': ...}`` dict when psutil
             returns no data or the query raises.
    """
    try:
        sensor_groups = psutil.sensors_temperatures()
        if sensor_groups:
            # Flatten each psutil reading into a plain dict, per group.
            return {
                group: [
                    {
                        'label': reading.label,
                        'current': reading.current,
                        'high': reading.high,
                        'critical': reading.critical,
                    }
                    for reading in readings
                ]
                for group, readings in sensor_groups.items()
            }
        return {'error': 'No temperature data available'}
    except Exception as exc:
        # Best-effort check: report the failure to the caller rather than raise.
        print(f"Temperature health check failed: {exc}")
        return {'error': str(exc)}
def _check_network_status(self) -> Dict[str, Any]: def _check_network_status(self) -> Dict[str, Any]:
""" """
Check network connectivity between nodes Check network connectivity between nodes and include detailed identifiers.
:return: Network health report :return: Network health report
""" """
@ -294,8 +285,21 @@ class SystemHealthMonitor:
'ceph_network': {'status': 'UNKNOWN', 'issues': []} 'ceph_network': {'status': 'UNKNOWN', 'issues': []}
} }
management_ips = ['10.10.10.2', '10.10.10.10', '10.10.10.4', '10.10.10.8', '10.10.10.9'] # IP-to-hostname mapping
ceph_ips = ['10.10.90.10', '10.10.90.4', '10.10.90.3', '10.10.90.2', '10.10.90.6'] management_mapping = {
'10.10.10.2': 'large1',
'10.10.10.10': 'medium1',
'10.10.10.4': 'medium2',
'10.10.10.8': 'micro1',
'10.10.10.9': 'micro2'
}
ceph_mapping = {
'10.10.90.10': 'large1',
'10.10.90.4': 'medium1',
'10.10.90.3': 'medium2',
'10.10.90.2': 'micro1',
'10.10.90.6': 'micro2'
}
def _ping_device(ip: str) -> bool: def _ping_device(ip: str) -> bool:
try: try:
@ -306,17 +310,23 @@ class SystemHealthMonitor:
return False return False
# Check management network # Check management network
management_ips = list(management_mapping.keys())
for source_ip in management_ips: for source_ip in management_ips:
for target_ip in management_ips: for target_ip in management_ips:
if source_ip != target_ip and not _ping_device(target_ip): if source_ip != target_ip and not _ping_device(target_ip):
issue = f"{source_ip} cannot reach {target_ip} in Management Network" source_host = management_mapping[source_ip]
target_host = management_mapping[target_ip]
issue = f"{source_host} ({source_ip}) cannot reach {target_host} ({target_ip}) in Management Network"
network_health['management_network']['issues'].append(issue) network_health['management_network']['issues'].append(issue)
# Check Ceph network # Check Ceph network
ceph_ips = list(ceph_mapping.keys())
for source_ip in ceph_ips: for source_ip in ceph_ips:
for target_ip in ceph_ips: for target_ip in ceph_ips:
if source_ip != target_ip and not _ping_device(target_ip): if source_ip != target_ip and not _ping_device(target_ip):
issue = f"{source_ip} cannot reach {target_ip} in Ceph Network" source_host = ceph_mapping[source_ip]
target_host = ceph_mapping[target_ip]
issue = f"{source_host} ({source_ip}) cannot reach {target_host} ({target_ip}) in Ceph Network"
network_health['ceph_network']['issues'].append(issue) network_health['ceph_network']['issues'].append(issue)
# Update statuses # Update statuses