Fix critical reliability and security issues in hwmonDaemon
Critical fixes implemented:
- Add 10MB storage limit with automatic cleanup of old history files
- Add file locking (fcntl) to prevent race conditions in history writes
- Disable SMART monitoring for unreliable Ridata drives
- Fix bare except clause in _read_ecc_count() to properly catch errors
- Add timeouts to all network and subprocess calls (10s for API, 30s for subprocess)
- Fix unchecked regex in ticket creation to prevent AttributeError
- Add JSON decode error handling for ticket API responses

Service configuration improvements:
- hwmon.timer: Reduce jitter from 300s to 60s, add Persistent=true
- hwmon.service: Add Restart=on-failure, TimeoutStartSec=300, logging to journal

These changes improve reliability, prevent hung processes, eliminate race conditions, and add proper error handling throughout the daemon.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
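Reviewer note: the heart of the history-write fix is a read-modify-write cycle performed while holding an exclusive `fcntl.flock` on the history file, so two overlapping daemon runs cannot interleave their reads and writes. Below is a minimal standalone sketch of that pattern, condensed from the diff further down; the `append_entry` helper and the `/tmp/history.json` path are illustrative only and are not part of the daemon.

```python
import fcntl, json, os

def append_entry(path: str, entry: dict) -> None:
    """Append an entry to a JSON list on disk under an exclusive lock (illustrative sketch)."""
    # 'r+' keeps existing contents; 'w+' creates the file if it is missing.
    mode = 'r+' if os.path.exists(path) else 'w+'
    with open(path, mode) as f:
        fcntl.flock(f.fileno(), fcntl.LOCK_EX)  # block until we own the file
        try:
            data = []
            if os.path.getsize(path) > 0:
                f.seek(0)
                try:
                    data = json.load(f)
                except json.JSONDecodeError:
                    data = []  # corrupted file: start fresh rather than crash
            data.append(entry)
            f.seek(0)
            f.truncate()       # rewrite in place while still holding the lock
            json.dump(data, f)
            f.flush()
        finally:
            fcntl.flock(f.fileno(), fcntl.LOCK_UN)

# Two concurrent runs calling this serialize on the lock instead of clobbering each other.
append_entry('/tmp/history.json', {'ts': '2024-01-01T00:00:00', 'value': 1})
```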
hwmon.service
@@ -7,6 +7,11 @@ Type=oneshot
 ExecStart=/usr/bin/env python3 -c "import urllib.request; exec(urllib.request.urlopen('http://10.10.10.63:3000/LotusGuild/hwmonDaemon/raw/branch/main/hwmonDaemon.py').read().decode('utf-8'))"
 User=root
 Group=root
+Restart=on-failure
+RestartSec=60
+TimeoutStartSec=300
+StandardOutput=journal
+StandardError=journal
 
 [Install]
 WantedBy=multi-user.target
hwmon.timer
@@ -1,9 +1,10 @@
 [Unit]
-Description=Run System Health Monitoring Daemon Daily
+Description=Run System Health Monitoring Daemon Hourly
 
 [Timer]
 OnCalendar=hourly
-RandomizedDelaySec=300
+RandomizedDelaySec=60
+Persistent=true
 
 [Install]
 WantedBy=timers.target
hwmonDaemon.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re, glob, datetime
+import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re, glob, datetime, fcntl
 from typing import Dict, Any, List
 
 # =============================================================================
@@ -526,6 +526,51 @@ class SystemHealthMonitor:
         # Ensure history directory exists
         os.makedirs(self.CONFIG['HISTORY_DIR'], exist_ok=True)
 
+    def _enforce_storage_limit(self, history_dir: str, max_bytes: int = 10485760):
+        """
+        Delete oldest history files if directory exceeds size limit (default 10MB).
+
+        :param history_dir: Directory containing history files
+        :param max_bytes: Maximum directory size in bytes (default 10MB)
+        """
+        if not os.path.exists(history_dir):
+            return
+
+        try:
+            total_size = 0
+            files_with_mtime = []
+
+            # Calculate total size and collect file metadata
+            for f in os.listdir(history_dir):
+                filepath = os.path.join(history_dir, f)
+                if f.startswith('smart_history_') and f.endswith('.json'):
+                    try:
+                        stat = os.stat(filepath)
+                        total_size += stat.st_size
+                        files_with_mtime.append((filepath, stat.st_mtime, stat.st_size))
+                    except (IOError, OSError) as e:
+                        logger.debug(f"Could not stat file {filepath}: {e}")
+
+            # If over limit, delete oldest files first
+            if total_size > max_bytes:
+                # Sort by modification time (oldest first)
+                files_with_mtime.sort(key=lambda x: x[1])
+
+                logger.info(f"History directory size ({total_size} bytes) exceeds limit ({max_bytes} bytes), cleaning up...")
+
+                for filepath, mtime, file_size in files_with_mtime:
+                    if total_size <= max_bytes:
+                        break
+                    try:
+                        os.remove(filepath)
+                        total_size -= file_size
+                        logger.info(f"Removed old history file {os.path.basename(filepath)} (saved {file_size} bytes)")
+                    except (IOError, OSError) as e:
+                        logger.warning(f"Could not remove history file {filepath}: {e}")
+
+        except Exception as e:
+            logger.error(f"Error enforcing storage limit: {e}")
+
     # =============================================================================
     # MAIN EXECUTION METHODS
     # =============================================================================
@@ -599,65 +644,86 @@ class SystemHealthMonitor:
     def _analyze_smart_trends(self, device: str, current_attributes: dict) -> List[str]:
         """Analyze SMART attribute trends to predict failures."""
         issues = []
 
         # Create safe filename from device path
         device_safe = device.replace('/', '_').replace('-', '_')
         historical_file = os.path.join(self.CONFIG['HISTORY_DIR'], f"smart_history_{device_safe}.json")
 
         try:
-            # Load historical data
-            if os.path.exists(historical_file):
-                with open(historical_file, 'r') as f:
-                    history = json.load(f)
-            else:
-                history = []
-
-            # Add current reading
-            current_reading = {
-                'timestamp': datetime.datetime.now().isoformat(),
-                'attributes': current_attributes
-            }
-            history.append(current_reading)
-
-            # Keep only recent data
-            cutoff_date = datetime.datetime.now() - datetime.timedelta(days=self.CONFIG['HISTORY_RETENTION_DAYS'])
-            history = [h for h in history if datetime.datetime.fromisoformat(h['timestamp']) > cutoff_date]
-
-            # Analyze trends for critical attributes
-            if len(history) >= 3:  # Need at least 3 data points
-                critical_attrs = ['Reallocated_Sector_Ct', 'Current_Pending_Sector', 'Reported_Uncorrect',
-                                  'Offline_Uncorrectable', 'Program_Fail_Count', 'Erase_Fail_Count']
-
-                for attr in critical_attrs:
-                    if attr in current_attributes:
-                        # Get last week's values
-                        recent_history = history[-7:] if len(history) >= 7 else history
-                        values = [h['attributes'].get(attr, 0) for h in recent_history]
-
-                        if len(values) >= 3:
-                            # Check for rapid increase
-                            recent_increase = values[-1] - values[0]
-                            if recent_increase > 0:
-                                rate = recent_increase / len(values)
-
-                                # Different thresholds for different attributes
-                                if attr in ['Reallocated_Sector_Ct', 'Current_Pending_Sector']:
-                                    if rate > 0.5:  # More than 0.5 sectors per check
-                                        issues.append(f"TREND ALERT: Rapid increase in {attr}: +{recent_increase} in {len(values)} checks")
-                                elif attr in ['Reported_Uncorrect', 'Offline_Uncorrectable']:
-                                    if rate > 0.2:  # Any consistent increase is concerning
-                                        issues.append(f"TREND ALERT: Increasing {attr}: +{recent_increase} in {len(values)} checks")
-                                else:  # Program/Erase fail counts
-                                    if rate > 1:  # More than 1 error per check
-                                        issues.append(f"TREND ALERT: Rapid increase in {attr}: +{recent_increase} in {len(values)} checks")
-
-            # Save updated history
-            with open(historical_file, 'w') as f:
-                json.dump(history, f, indent=2)
-
+            # Enforce storage limit before writing
+            self._enforce_storage_limit(self.CONFIG['HISTORY_DIR'])
+
+            # Load historical data with file locking
+            history = []
+            file_mode = 'r+' if os.path.exists(historical_file) else 'w+'
+
+            with open(historical_file, file_mode) as f:
+                # Acquire exclusive lock
+                fcntl.flock(f.fileno(), fcntl.LOCK_EX)
+                try:
+                    # Read existing data if file is not empty
+                    if os.path.getsize(historical_file) > 0:
+                        f.seek(0)
+                        try:
+                            history = json.load(f)
+                        except json.JSONDecodeError as e:
+                            logger.warning(f"Corrupted history file {historical_file}, starting fresh: {e}")
+                            history = []
+
+                    # Add current reading
+                    current_reading = {
+                        'timestamp': datetime.datetime.now().isoformat(),
+                        'attributes': current_attributes
+                    }
+                    history.append(current_reading)
+
+                    # Keep only recent data (30 days default)
+                    cutoff_date = datetime.datetime.now() - datetime.timedelta(days=self.CONFIG['HISTORY_RETENTION_DAYS'])
+                    history = [h for h in history if datetime.datetime.fromisoformat(h['timestamp']) > cutoff_date]
+
+                    # Analyze trends for critical attributes
+                    if len(history) >= 3:  # Need at least 3 data points for trend analysis
+                        critical_attrs = ['Reallocated_Sector_Ct', 'Current_Pending_Sector', 'Reported_Uncorrect',
+                                          'Offline_Uncorrectable', 'Program_Fail_Count', 'Erase_Fail_Count']
+
+                        for attr in critical_attrs:
+                            if attr in current_attributes:
+                                # Get last week's values
+                                recent_history = history[-7:] if len(history) >= 7 else history
+                                values = [h['attributes'].get(attr, 0) for h in recent_history]
+
+                                if len(values) >= 3:
+                                    # Check for rapid increase
+                                    recent_increase = values[-1] - values[0]
+                                    if recent_increase > 0:
+                                        rate = recent_increase / len(values)
+
+                                        # Different thresholds for different attributes
+                                        if attr in ['Reallocated_Sector_Ct', 'Current_Pending_Sector']:
+                                            if rate > 0.5:  # More than 0.5 sectors per check
+                                                issues.append(f"TREND ALERT: Rapid increase in {attr}: +{recent_increase} in {len(values)} checks")
+                                        elif attr in ['Reported_Uncorrect', 'Offline_Uncorrectable']:
+                                            if rate > 0.2:  # Any consistent increase is concerning
+                                                issues.append(f"TREND ALERT: Increasing {attr}: +{recent_increase} in {len(values)} checks")
+                                        else:  # Program/Erase fail counts
+                                            if rate > 1:  # More than 1 error per check
+                                                issues.append(f"TREND ALERT: Rapid increase in {attr}: +{recent_increase} in {len(values)} checks")
+
+                    # Write updated history atomically
+                    f.seek(0)
+                    f.truncate()
+                    json.dump(history, f, indent=2)
+                    f.flush()
+
+                finally:
+                    # Release lock
+                    fcntl.flock(f.fileno(), fcntl.LOCK_UN)
+
+        except (IOError, OSError) as e:
+            logger.debug(f"I/O error analyzing trends for {device}: {e}")
         except Exception as e:
-            logger.debug(f"Error analyzing trends for {device}: {e}")
+            logger.error(f"Unexpected error analyzing trends for {device}: {e}")
 
         return issues
 
     def _check_thermal_health(self, device: str, temperature: int, drive_type: str = 'HDD') -> List[str]:
@@ -1198,10 +1264,14 @@ class SystemHealthMonitor:
 
             drive_size = ""
             if "Drive" in issue and "/dev/" in issue:
-                device = re.search(r'/dev/[a-zA-Z0-9]+', issue).group(0)
-                drive_details = self._get_drive_details(device)
-                if drive_details['capacity']:
-                    drive_size = f"[{drive_details['capacity']}] "
+                device_match = re.search(r'/dev/[a-zA-Z0-9]+', issue)
+                if device_match:
+                    device = device_match.group(0)
+                    drive_details = self._get_drive_details(device)
+                    if drive_details['capacity']:
+                        drive_size = f"[{drive_details['capacity']}] "
+                else:
+                    logger.warning(f"Could not extract device from issue: {issue}")
 
             # Determine if this is a hardware or software issue
             issue_category = 'SOFTWARE' if 'LXC' in issue else 'HARDWARE'
@@ -1240,10 +1310,15 @@ class SystemHealthMonitor:
                 headers = {
                     'Content-Type': 'application/json',
                     'Authorization': f'Bearer {self.CONFIG["TICKET_API_KEY"]}'
-                }
+                },
+                timeout=10  # 10 second timeout for API calls
             )
 
-            response_data = response.json()
+            try:
+                response_data = response.json()
+            except json.JSONDecodeError as e:
+                logger.error(f"Invalid JSON response from ticket API: {e}")
+                continue
 
             if response_data.get('success'):
                 logger.info(f"Ticket created successfully: {ticket_title}")
@@ -1742,6 +1817,14 @@ class SystemHealthMonitor:
             smart_health['issues'].append("Unable to read device information")
             return smart_health
 
+        # Skip Ridata drives entirely - unreliable and being replaced
+        manufacturer = self._detect_manufacturer(drive_details.get('model', ''))
+        if manufacturer == 'Ridata':
+            smart_health['status'] = 'SKIPPED'
+            smart_health['issues'].append("Ridata drive - monitoring disabled (unreliable hardware)")
+            logger.debug(f"Skipping SMART monitoring for Ridata drive {device}")
+            return smart_health
+
         logger.debug(f"Drive details for {device}: {drive_details}")
 
         manufacturer_profile = self._get_manufacturer_profile(
@@ -2276,14 +2359,15 @@ class SystemHealthMonitor:
     def _read_ecc_count(self, filepath: str) -> int:
         """
         Read ECC error count from a file.
 
         :param filepath: Path to the ECC count file
         :return: Number of ECC errors
         """
         try:
             with open(filepath, 'r') as f:
                 return int(f.read().strip())
-        except:
+        except (IOError, OSError, ValueError) as e:
+            logger.debug(f"Could not read ECC count from {filepath}: {e}")
             return 0
 
     def _check_cpu_usage(self) -> Dict[str, Any]:
@@ -2322,16 +2406,17 @@ class SystemHealthMonitor:
             # Check management network connectivity
             mgmt_result = subprocess.run(
                 [
                     "ping",
                     "-c", str(self.CONFIG['NETWORKS']['PING_COUNT']),
                     "-W", str(self.CONFIG['NETWORKS']['PING_TIMEOUT']),
                     self.CONFIG['NETWORKS']['MANAGEMENT']
                 ],
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,
-                text=True
+                text=True,
+                timeout=30  # 30 second timeout for subprocess
             )
 
             if mgmt_result.returncode != 0:
                 network_health['management_network']['status'] = 'CRITICAL'
                 network_health['management_network']['issues'].append(
@@ -2348,7 +2433,8 @@ class SystemHealthMonitor:
                 ],
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,
-                text=True
+                text=True,
+                timeout=30  # 30 second timeout for subprocess
             )
 
             if ceph_result.returncode != 0:
@@ -2382,28 +2468,30 @@ class SystemHealthMonitor:
                 ['pct', 'list'],
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,
-                text=True
+                text=True,
+                timeout=30  # 30 second timeout
             )
             logger.debug(f"pct list output:\n{result.stdout}")
 
             for line in result.stdout.split('\n')[1:]:
                 if not line.strip():
                     continue
 
                 parts = line.split()
                 if len(parts) < 2:
                     logger.debug(f"Skipping invalid line: {line}")
                     continue
 
                 vmid, status = parts[0], parts[1]
 
                 if status.lower() == 'running':
                     logger.debug(f"Checking container {vmid} disk usage")
                     disk_info = subprocess.run(
                         ['pct', 'df', vmid],
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
-                        text=True
+                        text=True,
+                        timeout=30  # 30 second timeout per container
                     )
 
                     container_info = {