Fix critical reliability and security issues in hwmonDaemon

Critical fixes implemented:
- Add 10MB storage limit with automatic cleanup of old history files
- Add file locking (fcntl) to prevent race conditions in history writes
- Disable SMART monitoring for unreliable Ridata drives
- Replace the bare except clause in _read_ecc_count() with specific exception handling
- Add timeouts to all network and subprocess calls (10s for API, 30s for subprocess); see the sketch after this list for how a timed-out call surfaces
- Check the regex match in ticket creation before calling .group() to prevent AttributeError
- Add JSON decode error handling for ticket API responses
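
The diff only adds the `timeout=` arguments; the sketch below illustrates how a timed-out call would surface and be handled. It is a minimal illustration, not the daemon's code: the helper names and the None-on-failure behaviour are assumptions, and the daemon's own callers may treat these exceptions differently.

```python
import subprocess
from typing import Optional

import requests


def run_with_timeout(cmd) -> Optional[str]:
    """Run a command, treating a hang as a soft failure instead of blocking forever."""
    try:
        result = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=30,  # same bound the diff applies to subprocess calls
        )
        return result.stdout
    except subprocess.TimeoutExpired:
        # Raised when the child does not finish within 30s; run() kills the child first.
        return None


def post_with_timeout(url, payload) -> Optional[dict]:
    """POST JSON with a 10s timeout, the bound used for the ticket API calls."""
    try:
        response = requests.post(url, json=payload, timeout=10)
        return response.json()
    except requests.exceptions.Timeout:
        # Connecting or reading took longer than 10s.
        return None
    except ValueError:
        # Response body was not valid JSON (json.JSONDecodeError subclasses ValueError).
        return None
```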

Service configuration improvements:
- hwmon.timer: Reduce jitter from 300s to 60s, add Persistent=true
- hwmon.service: Add Restart=on-failure, TimeoutStartSec=300, logging to journal

These changes improve reliability, prevent hung processes, eliminate race
conditions, and add proper error handling throughout the daemon.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
commit fe832c42f3 (parent 0577c7fc1b)
Date: 2026-01-06 16:55:48 -05:00
3 changed files with 170 additions and 76 deletions

hwmon.service

@@ -7,6 +7,11 @@ Type=oneshot
 ExecStart=/usr/bin/env python3 -c "import urllib.request; exec(urllib.request.urlopen('http://10.10.10.63:3000/LotusGuild/hwmonDaemon/raw/branch/main/hwmonDaemon.py').read().decode('utf-8'))"
 User=root
 Group=root
+Restart=on-failure
+RestartSec=60
+TimeoutStartSec=300
+StandardOutput=journal
+StandardError=journal
 
 [Install]
 WantedBy=multi-user.target

hwmon.timer

@@ -1,9 +1,10 @@
 [Unit]
-Description=Run System Health Monitoring Daemon Daily
+Description=Run System Health Monitoring Daemon Hourly
 
 [Timer]
 OnCalendar=hourly
-RandomizedDelaySec=300
+RandomizedDelaySec=60
+Persistent=true
 
 [Install]
 WantedBy=timers.target

hwmonDaemon.py

@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re, glob, datetime
+import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re, glob, datetime, fcntl
 from typing import Dict, Any, List
 
 # =============================================================================
@@ -526,6 +526,51 @@ class SystemHealthMonitor:
         # Ensure history directory exists
         os.makedirs(self.CONFIG['HISTORY_DIR'], exist_ok=True)
 
+    def _enforce_storage_limit(self, history_dir: str, max_bytes: int = 10485760):
+        """
+        Delete oldest history files if directory exceeds size limit (default 10MB).
+
+        :param history_dir: Directory containing history files
+        :param max_bytes: Maximum directory size in bytes (default 10MB)
+        """
+        if not os.path.exists(history_dir):
+            return
+
+        try:
+            total_size = 0
+            files_with_mtime = []
+
+            # Calculate total size and collect file metadata
+            for f in os.listdir(history_dir):
+                filepath = os.path.join(history_dir, f)
+                if f.startswith('smart_history_') and f.endswith('.json'):
+                    try:
+                        stat = os.stat(filepath)
+                        total_size += stat.st_size
+                        files_with_mtime.append((filepath, stat.st_mtime, stat.st_size))
+                    except (IOError, OSError) as e:
+                        logger.debug(f"Could not stat file {filepath}: {e}")
+
+            # If over limit, delete oldest files first
+            if total_size > max_bytes:
+                # Sort by modification time (oldest first)
+                files_with_mtime.sort(key=lambda x: x[1])
+                logger.info(f"History directory size ({total_size} bytes) exceeds limit ({max_bytes} bytes), cleaning up...")
+
+                for filepath, mtime, file_size in files_with_mtime:
+                    if total_size <= max_bytes:
+                        break
+                    try:
+                        os.remove(filepath)
+                        total_size -= file_size
+                        logger.info(f"Removed old history file {os.path.basename(filepath)} (saved {file_size} bytes)")
+                    except (IOError, OSError) as e:
+                        logger.warning(f"Could not remove history file {filepath}: {e}")
+
+        except Exception as e:
+            logger.error(f"Error enforcing storage limit: {e}")
+
     # =============================================================================
     # MAIN EXECUTION METHODS
     # =============================================================================
@@ -605,11 +650,24 @@ class SystemHealthMonitor:
         historical_file = os.path.join(self.CONFIG['HISTORY_DIR'], f"smart_history_{device_safe}.json")
         try:
-            # Load historical data
-            if os.path.exists(historical_file):
-                with open(historical_file, 'r') as f:
+            # Enforce storage limit before writing
+            self._enforce_storage_limit(self.CONFIG['HISTORY_DIR'])
+
+            # Load historical data with file locking
+            history = []
+            file_mode = 'r+' if os.path.exists(historical_file) else 'w+'
+            with open(historical_file, file_mode) as f:
+                # Acquire exclusive lock
+                fcntl.flock(f.fileno(), fcntl.LOCK_EX)
+                try:
+                    # Read existing data if file is not empty
+                    if os.path.getsize(historical_file) > 0:
+                        f.seek(0)
+                        try:
+                            history = json.load(f)
-            else:
+                        except json.JSONDecodeError as e:
+                            logger.warning(f"Corrupted history file {historical_file}, starting fresh: {e}")
+                            history = []
 
                     # Add current reading
@@ -619,12 +677,12 @@ class SystemHealthMonitor:
                     }
                     history.append(current_reading)
 
-            # Keep only recent data
+                    # Keep only recent data (30 days default)
                     cutoff_date = datetime.datetime.now() - datetime.timedelta(days=self.CONFIG['HISTORY_RETENTION_DAYS'])
                     history = [h for h in history if datetime.datetime.fromisoformat(h['timestamp']) > cutoff_date]
 
                     # Analyze trends for critical attributes
-            if len(history) >= 3:  # Need at least 3 data points
+                    if len(history) >= 3:  # Need at least 3 data points for trend analysis
                         critical_attrs = ['Reallocated_Sector_Ct', 'Current_Pending_Sector', 'Reported_Uncorrect',
                                           'Offline_Uncorrectable', 'Program_Fail_Count', 'Erase_Fail_Count']
@@ -651,12 +709,20 @@ class SystemHealthMonitor:
                                 if rate > 1:  # More than 1 error per check
                                     issues.append(f"TREND ALERT: Rapid increase in {attr}: +{recent_increase} in {len(values)} checks")
 
-            # Save updated history
-            with open(historical_file, 'w') as f:
+                    # Write updated history atomically
+                    f.seek(0)
+                    f.truncate()
+                    json.dump(history, f, indent=2)
+                    f.flush()
+                finally:
+                    # Release lock
+                    fcntl.flock(f.fileno(), fcntl.LOCK_UN)
+        except (IOError, OSError) as e:
+            logger.debug(f"I/O error analyzing trends for {device}: {e}")
         except Exception as e:
-            logger.debug(f"Error analyzing trends for {device}: {e}")
+            logger.error(f"Unexpected error analyzing trends for {device}: {e}")
 
         return issues
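
Taken together, the three hunks above turn the history update into a single locked read-modify-write on one file handle: the exclusive flock is taken right after open and only released after the rewrite, which is what removes the race between overlapping daemon runs. A condensed standalone sketch of that flow, with the path, payload, and pruning simplified to placeholders (the daemon's version also runs trend analysis between load and save):

```python
import fcntl
import json
import os


def update_history(path: str, reading: dict, keep: int = 100) -> None:
    """Locked read-modify-write of a JSON history file (simplified sketch)."""
    mode = 'r+' if os.path.exists(path) else 'w+'
    with open(path, mode) as f:
        fcntl.flock(f.fileno(), fcntl.LOCK_EX)  # block other flock-aware writers
        try:
            history = []
            if os.path.getsize(path) > 0:
                f.seek(0)
                try:
                    history = json.load(f)
                except json.JSONDecodeError:
                    history = []  # corrupted file: start fresh
            history.append(reading)
            history = history[-keep:]  # crude stand-in for the date-based pruning
            f.seek(0)
            f.truncate()
            json.dump(history, f, indent=2)
            f.flush()
        finally:
            fcntl.flock(f.fileno(), fcntl.LOCK_UN)
```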
@@ -1198,10 +1264,14 @@ class SystemHealthMonitor:
             drive_size = ""
             if "Drive" in issue and "/dev/" in issue:
-                device = re.search(r'/dev/[a-zA-Z0-9]+', issue).group(0)
+                device_match = re.search(r'/dev/[a-zA-Z0-9]+', issue)
+                if device_match:
+                    device = device_match.group(0)
                     drive_details = self._get_drive_details(device)
                     if drive_details['capacity']:
                         drive_size = f"[{drive_details['capacity']}] "
+                else:
+                    logger.warning(f"Could not extract device from issue: {issue}")
 
             # Determine if this is a hardware or software issue
             issue_category = 'SOFTWARE' if 'LXC' in issue else 'HARDWARE'
@@ -1240,10 +1310,15 @@ class SystemHealthMonitor:
                     headers = {
                         'Content-Type': 'application/json',
                         'Authorization': f'Bearer {self.CONFIG["TICKET_API_KEY"]}'
-                    }
+                    },
+                    timeout=10  # 10 second timeout for API calls
                 )
 
+                try:
                     response_data = response.json()
+                except json.JSONDecodeError as e:
+                    logger.error(f"Invalid JSON response from ticket API: {e}")
+                    continue
 
                 if response_data.get('success'):
                     logger.info(f"Ticket created successfully: {ticket_title}")
@@ -1742,6 +1817,14 @@ class SystemHealthMonitor:
             smart_health['issues'].append("Unable to read device information")
             return smart_health
 
+        # Skip Ridata drives entirely - unreliable and being replaced
+        manufacturer = self._detect_manufacturer(drive_details.get('model', ''))
+        if manufacturer == 'Ridata':
+            smart_health['status'] = 'SKIPPED'
+            smart_health['issues'].append("Ridata drive - monitoring disabled (unreliable hardware)")
+            logger.debug(f"Skipping SMART monitoring for Ridata drive {device}")
+            return smart_health
+
         logger.debug(f"Drive details for {device}: {drive_details}")
         manufacturer_profile = self._get_manufacturer_profile(
@@ -2283,7 +2366,8 @@ class SystemHealthMonitor:
         try:
             with open(filepath, 'r') as f:
                 return int(f.read().strip())
-        except:
+        except (IOError, OSError, ValueError) as e:
+            logger.debug(f"Could not read ECC count from {filepath}: {e}")
             return 0
 
     def _check_cpu_usage(self) -> Dict[str, Any]:
@@ -2329,7 +2413,8 @@ class SystemHealthMonitor:
                 ],
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,
-                text=True
+                text=True,
+                timeout=30  # 30 second timeout for subprocess
             )
 
             if mgmt_result.returncode != 0:
@@ -2348,7 +2433,8 @@ class SystemHealthMonitor:
                 ],
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,
-                text=True
+                text=True,
+                timeout=30  # 30 second timeout for subprocess
             )
 
             if ceph_result.returncode != 0:
@@ -2382,7 +2468,8 @@ class SystemHealthMonitor:
                 ['pct', 'list'],
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,
-                text=True
+                text=True,
+                timeout=30  # 30 second timeout
             )
 
             logger.debug(f"pct list output:\n{result.stdout}")
@@ -2403,7 +2490,8 @@ class SystemHealthMonitor:
                 ['pct', 'df', vmid],
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,
-                text=True
+                text=True,
+                timeout=30  # 30 second timeout per container
             )
 
             container_info = {