Fix critical reliability and security issues in hwmonDaemon
Critical fixes implemented:
- Add 10MB storage limit with automatic cleanup of old history files
- Add file locking (fcntl) to prevent race conditions in history writes
- Disable SMART monitoring for unreliable Ridata drives
- Fix bare except clause in _read_ecc_count() to properly catch errors
- Add timeouts to all network and subprocess calls (10s for API, 30s for subprocess)
- Fix unchecked regex in ticket creation to prevent AttributeError
- Add JSON decode error handling for ticket API responses

Service configuration improvements:
- hwmon.timer: Reduce jitter from 300s to 60s, add Persistent=true
- hwmon.service: Add Restart=on-failure, TimeoutStartSec=300, logging to journal

These changes improve reliability, prevent hung processes, eliminate race conditions, and add proper error handling throughout the daemon.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
hwmon.service
@@ -7,6 +7,11 @@ Type=oneshot
 ExecStart=/usr/bin/env python3 -c "import urllib.request; exec(urllib.request.urlopen('http://10.10.10.63:3000/LotusGuild/hwmonDaemon/raw/branch/main/hwmonDaemon.py').read().decode('utf-8'))"
 User=root
 Group=root
+Restart=on-failure
+RestartSec=60
+TimeoutStartSec=300
+StandardOutput=journal
+StandardError=journal
 
 [Install]
 WantedBy=multi-user.target
hwmon.timer
@@ -1,9 +1,10 @@
 [Unit]
-Description=Run System Health Monitoring Daemon Daily
+Description=Run System Health Monitoring Daemon Hourly
 
 [Timer]
 OnCalendar=hourly
-RandomizedDelaySec=300
+RandomizedDelaySec=60
+Persistent=true
 
 [Install]
 WantedBy=timers.target
hwmonDaemon.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re, glob, datetime
+import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re, glob, datetime, fcntl
 from typing import Dict, Any, List
 
 # =============================================================================
@@ -526,6 +526,51 @@ class SystemHealthMonitor:
         # Ensure history directory exists
         os.makedirs(self.CONFIG['HISTORY_DIR'], exist_ok=True)
 
+    def _enforce_storage_limit(self, history_dir: str, max_bytes: int = 10485760):
+        """
+        Delete oldest history files if directory exceeds size limit (default 10MB).
+
+        :param history_dir: Directory containing history files
+        :param max_bytes: Maximum directory size in bytes (default 10MB)
+        """
+        if not os.path.exists(history_dir):
+            return
+
+        try:
+            total_size = 0
+            files_with_mtime = []
+
+            # Calculate total size and collect file metadata
+            for f in os.listdir(history_dir):
+                filepath = os.path.join(history_dir, f)
+                if f.startswith('smart_history_') and f.endswith('.json'):
+                    try:
+                        stat = os.stat(filepath)
+                        total_size += stat.st_size
+                        files_with_mtime.append((filepath, stat.st_mtime, stat.st_size))
+                    except (IOError, OSError) as e:
+                        logger.debug(f"Could not stat file {filepath}: {e}")
+
+            # If over limit, delete oldest files first
+            if total_size > max_bytes:
+                # Sort by modification time (oldest first)
+                files_with_mtime.sort(key=lambda x: x[1])
+
+                logger.info(f"History directory size ({total_size} bytes) exceeds limit ({max_bytes} bytes), cleaning up...")
+
+                for filepath, mtime, file_size in files_with_mtime:
+                    if total_size <= max_bytes:
+                        break
+                    try:
+                        os.remove(filepath)
+                        total_size -= file_size
+                        logger.info(f"Removed old history file {os.path.basename(filepath)} (saved {file_size} bytes)")
+                    except (IOError, OSError) as e:
+                        logger.warning(f"Could not remove history file {filepath}: {e}")
+
+        except Exception as e:
+            logger.error(f"Error enforcing storage limit: {e}")
+
     # =============================================================================
     # MAIN EXECUTION METHODS
     # =============================================================================
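For illustration, the same oldest-first, size-capped cleanup can be exercised outside the daemon. This is a hedged sketch, not the daemon's code: the temp directory, the dummy files, and the 100-byte cap are invented for the demo (the daemon's default cap is 10 MB).

```python
# Illustrative only: exercises the oldest-first cleanup idea on a throwaway
# directory. Filenames, sizes, and the 100-byte cap are invented for the demo.
import json
import os
import tempfile
import time


def enforce_storage_limit(history_dir: str, max_bytes: int) -> None:
    """Delete oldest smart_history_*.json files until the directory fits."""
    entries = []
    total = 0
    for name in os.listdir(history_dir):
        if name.startswith('smart_history_') and name.endswith('.json'):
            path = os.path.join(history_dir, name)
            st = os.stat(path)
            entries.append((st.st_mtime, st.st_size, path))
            total += st.st_size
    for mtime, size, path in sorted(entries):      # oldest first
        if total <= max_bytes:
            break
        os.remove(path)
        total -= size


with tempfile.TemporaryDirectory() as d:
    for i in range(5):
        with open(os.path.join(d, f'smart_history_sd{i}.json'), 'w') as fh:
            json.dump({'attributes': {'Reallocated_Sector_Ct': i}}, fh)
        time.sleep(0.01)                           # distinct mtimes
    enforce_storage_limit(d, max_bytes=100)        # tiny cap for the demo
    print(sorted(os.listdir(d)))                   # only the newest files survive
```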
@@ -605,58 +650,79 @@ class SystemHealthMonitor:
         historical_file = os.path.join(self.CONFIG['HISTORY_DIR'], f"smart_history_{device_safe}.json")
 
         try:
-            # Load historical data
-            if os.path.exists(historical_file):
-                with open(historical_file, 'r') as f:
-                    history = json.load(f)
-            else:
-                history = []
+            # Enforce storage limit before writing
+            self._enforce_storage_limit(self.CONFIG['HISTORY_DIR'])
 
-            # Add current reading
-            current_reading = {
-                'timestamp': datetime.datetime.now().isoformat(),
-                'attributes': current_attributes
-            }
-            history.append(current_reading)
+            # Load historical data with file locking
+            history = []
+            file_mode = 'r+' if os.path.exists(historical_file) else 'w+'
 
-            # Keep only recent data
-            cutoff_date = datetime.datetime.now() - datetime.timedelta(days=self.CONFIG['HISTORY_RETENTION_DAYS'])
-            history = [h for h in history if datetime.datetime.fromisoformat(h['timestamp']) > cutoff_date]
+            with open(historical_file, file_mode) as f:
+                # Acquire exclusive lock
+                fcntl.flock(f.fileno(), fcntl.LOCK_EX)
+                try:
+                    # Read existing data if file is not empty
+                    if os.path.getsize(historical_file) > 0:
+                        f.seek(0)
+                        try:
+                            history = json.load(f)
+                        except json.JSONDecodeError as e:
+                            logger.warning(f"Corrupted history file {historical_file}, starting fresh: {e}")
+                            history = []
 
-            # Analyze trends for critical attributes
-            if len(history) >= 3:  # Need at least 3 data points
-                critical_attrs = ['Reallocated_Sector_Ct', 'Current_Pending_Sector', 'Reported_Uncorrect',
-                                  'Offline_Uncorrectable', 'Program_Fail_Count', 'Erase_Fail_Count']
+                    # Add current reading
+                    current_reading = {
+                        'timestamp': datetime.datetime.now().isoformat(),
+                        'attributes': current_attributes
+                    }
+                    history.append(current_reading)
 
-                for attr in critical_attrs:
-                    if attr in current_attributes:
-                        # Get last week's values
-                        recent_history = history[-7:] if len(history) >= 7 else history
-                        values = [h['attributes'].get(attr, 0) for h in recent_history]
+                    # Keep only recent data (30 days default)
+                    cutoff_date = datetime.datetime.now() - datetime.timedelta(days=self.CONFIG['HISTORY_RETENTION_DAYS'])
+                    history = [h for h in history if datetime.datetime.fromisoformat(h['timestamp']) > cutoff_date]
 
-                        if len(values) >= 3:
-                            # Check for rapid increase
-                            recent_increase = values[-1] - values[0]
-                            if recent_increase > 0:
-                                rate = recent_increase / len(values)
+                    # Analyze trends for critical attributes
+                    if len(history) >= 3:  # Need at least 3 data points for trend analysis
+                        critical_attrs = ['Reallocated_Sector_Ct', 'Current_Pending_Sector', 'Reported_Uncorrect',
+                                          'Offline_Uncorrectable', 'Program_Fail_Count', 'Erase_Fail_Count']
 
-                                # Different thresholds for different attributes
-                                if attr in ['Reallocated_Sector_Ct', 'Current_Pending_Sector']:
-                                    if rate > 0.5:  # More than 0.5 sectors per check
-                                        issues.append(f"TREND ALERT: Rapid increase in {attr}: +{recent_increase} in {len(values)} checks")
-                                elif attr in ['Reported_Uncorrect', 'Offline_Uncorrectable']:
-                                    if rate > 0.2:  # Any consistent increase is concerning
-                                        issues.append(f"TREND ALERT: Increasing {attr}: +{recent_increase} in {len(values)} checks")
-                                else:  # Program/Erase fail counts
-                                    if rate > 1:  # More than 1 error per check
-                                        issues.append(f"TREND ALERT: Rapid increase in {attr}: +{recent_increase} in {len(values)} checks")
+                        for attr in critical_attrs:
+                            if attr in current_attributes:
+                                # Get last week's values
+                                recent_history = history[-7:] if len(history) >= 7 else history
+                                values = [h['attributes'].get(attr, 0) for h in recent_history]
 
-            # Save updated history
-            with open(historical_file, 'w') as f:
-                json.dump(history, f, indent=2)
+                                if len(values) >= 3:
+                                    # Check for rapid increase
+                                    recent_increase = values[-1] - values[0]
+                                    if recent_increase > 0:
+                                        rate = recent_increase / len(values)
+
+                                        # Different thresholds for different attributes
+                                        if attr in ['Reallocated_Sector_Ct', 'Current_Pending_Sector']:
+                                            if rate > 0.5:  # More than 0.5 sectors per check
+                                                issues.append(f"TREND ALERT: Rapid increase in {attr}: +{recent_increase} in {len(values)} checks")
+                                        elif attr in ['Reported_Uncorrect', 'Offline_Uncorrectable']:
+                                            if rate > 0.2:  # Any consistent increase is concerning
+                                                issues.append(f"TREND ALERT: Increasing {attr}: +{recent_increase} in {len(values)} checks")
+                                        else:  # Program/Erase fail counts
+                                            if rate > 1:  # More than 1 error per check
+                                                issues.append(f"TREND ALERT: Rapid increase in {attr}: +{recent_increase} in {len(values)} checks")
+
+                    # Write updated history atomically
+                    f.seek(0)
+                    f.truncate()
+                    json.dump(history, f, indent=2)
+                    f.flush()
+
+                finally:
+                    # Release lock
+                    fcntl.flock(f.fileno(), fcntl.LOCK_UN)
+
+        except (IOError, OSError) as e:
+            logger.debug(f"I/O error analyzing trends for {device}: {e}")
         except Exception as e:
-            logger.debug(f"Error analyzing trends for {device}: {e}")
+            logger.error(f"Unexpected error analyzing trends for {device}: {e}")
 
         return issues
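Stripped of the trend analysis, the hunk above follows a lock-then-read-modify-rewrite pattern on a single file handle. A minimal sketch of that pattern, assuming a standalone script (the `history.json` path and the appended record are placeholders, not the daemon's data):

```python
# Minimal sketch of the flock read-modify-rewrite pattern used above.
# 'history.json' and the appended record are placeholders for illustration.
import fcntl
import json
import os

path = 'history.json'
mode = 'r+' if os.path.exists(path) else 'w+'

with open(path, mode) as f:
    fcntl.flock(f.fileno(), fcntl.LOCK_EX)      # block until we own the file
    try:
        data = []
        if os.path.getsize(path) > 0:
            f.seek(0)
            try:
                data = json.load(f)
            except json.JSONDecodeError:
                data = []                        # corrupted file: start fresh
        data.append({'example': True})           # modify under the lock
        f.seek(0)
        f.truncate()                             # rewrite in place
        json.dump(data, f, indent=2)
        f.flush()
    finally:
        fcntl.flock(f.fileno(), fcntl.LOCK_UN)   # lock is also released on close
```

The lock is advisory, so it only coordinates writers that also call `flock` on the same file.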
@@ -1198,10 +1264,14 @@ class SystemHealthMonitor:
 
             drive_size = ""
             if "Drive" in issue and "/dev/" in issue:
-                device = re.search(r'/dev/[a-zA-Z0-9]+', issue).group(0)
-                drive_details = self._get_drive_details(device)
-                if drive_details['capacity']:
-                    drive_size = f"[{drive_details['capacity']}] "
+                device_match = re.search(r'/dev/[a-zA-Z0-9]+', issue)
+                if device_match:
+                    device = device_match.group(0)
+                    drive_details = self._get_drive_details(device)
+                    if drive_details['capacity']:
+                        drive_size = f"[{drive_details['capacity']}] "
+                else:
+                    logger.warning(f"Could not extract device from issue: {issue}")
 
             # Determine if this is a hardware or software issue
             issue_category = 'SOFTWARE' if 'LXC' in issue else 'HARDWARE'
@@ -1240,10 +1310,15 @@ class SystemHealthMonitor:
                 headers = {
                     'Content-Type': 'application/json',
                     'Authorization': f'Bearer {self.CONFIG["TICKET_API_KEY"]}'
-                }
+                },
+                timeout=10  # 10 second timeout for API calls
             )
 
-            response_data = response.json()
+            try:
+                response_data = response.json()
+            except json.JSONDecodeError as e:
+                logger.error(f"Invalid JSON response from ticket API: {e}")
+                continue
 
             if response_data.get('success'):
                 logger.info(f"Ticket created successfully: {ticket_title}")
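Only fragments of the ticket-creation request appear in this hunk. As a rough sketch of the combined pattern under stated assumptions (the function name, URL, payload, and the choice to return rather than `continue` are illustrative, not the daemon's actual flow):

```python
# Hedged sketch of a POST with a timeout and JSON-decode handling.
# The function name, URL, payload, and token are placeholders.
import json
import logging

import requests

logger = logging.getLogger(__name__)


def create_ticket(url: str, payload: dict, api_key: str) -> bool:
    try:
        response = requests.post(
            url,
            json=payload,
            headers={
                'Content-Type': 'application/json',
                'Authorization': f'Bearer {api_key}',
            },
            timeout=10,  # fail fast instead of hanging the daemon
        )
    except requests.exceptions.RequestException as e:  # includes Timeout
        logger.error(f"Ticket API request failed: {e}")
        return False

    try:
        response_data = response.json()
    except ValueError as e:  # non-JSON body; covers json.JSONDecodeError too
        logger.error(f"Invalid JSON response from ticket API: {e}")
        return False

    return bool(response_data.get('success'))
```

With a timeout set, `requests` raises `requests.exceptions.Timeout` (a subclass of `RequestException`) instead of blocking indefinitely.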
@@ -1742,6 +1817,14 @@ class SystemHealthMonitor:
             smart_health['issues'].append("Unable to read device information")
             return smart_health
 
+        # Skip Ridata drives entirely - unreliable and being replaced
+        manufacturer = self._detect_manufacturer(drive_details.get('model', ''))
+        if manufacturer == 'Ridata':
+            smart_health['status'] = 'SKIPPED'
+            smart_health['issues'].append("Ridata drive - monitoring disabled (unreliable hardware)")
+            logger.debug(f"Skipping SMART monitoring for Ridata drive {device}")
+            return smart_health
+
         logger.debug(f"Drive details for {device}: {drive_details}")
 
         manufacturer_profile = self._get_manufacturer_profile(
@@ -2283,7 +2366,8 @@ class SystemHealthMonitor:
         try:
             with open(filepath, 'r') as f:
                 return int(f.read().strip())
-        except:
+        except (IOError, OSError, ValueError) as e:
+            logger.debug(f"Could not read ECC count from {filepath}: {e}")
             return 0
 
     def _check_cpu_usage(self) -> Dict[str, Any]:
@@ -2329,7 +2413,8 @@ class SystemHealthMonitor:
             ],
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE,
-            text=True
+            text=True,
+            timeout=30  # 30 second timeout for subprocess
         )
 
         if mgmt_result.returncode != 0:
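The timeout hunks above and below only add the `timeout=30` argument; what happens when it fires is outside the visible context. A minimal sketch of the pattern, assuming `subprocess.run` is the call being wrapped (the helper name and example command are hypothetical):

```python
# Hedged sketch: catching the TimeoutExpired that subprocess.run raises
# when the 30s limit is hit. The helper and example command are illustrative.
import logging
import subprocess

logger = logging.getLogger(__name__)


def run_checked(cmd: list, timeout: int = 30):
    try:
        result = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=timeout,  # raises TimeoutExpired instead of hanging forever
        )
    except subprocess.TimeoutExpired:
        logger.warning(f"Command timed out after {timeout}s: {' '.join(cmd)}")
        return None
    if result.returncode != 0:
        logger.debug(f"Command failed ({result.returncode}): {result.stderr.strip()}")
    return result


# Example: run_checked(['smartctl', '--scan'])
```

`subprocess.run` raises `subprocess.TimeoutExpired` when the limit is hit, so callers that previously assumed a result object need a branch like the one above.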
@@ -2348,7 +2433,8 @@ class SystemHealthMonitor:
             ],
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE,
-            text=True
+            text=True,
+            timeout=30  # 30 second timeout for subprocess
         )
 
         if ceph_result.returncode != 0:
@@ -2382,7 +2468,8 @@ class SystemHealthMonitor:
                 ['pct', 'list'],
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,
-                text=True
+                text=True,
+                timeout=30  # 30 second timeout
             )
             logger.debug(f"pct list output:\n{result.stdout}")
 
@@ -2403,7 +2490,8 @@ class SystemHealthMonitor:
                 ['pct', 'df', vmid],
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,
-                text=True
+                text=True,
+                timeout=30  # 30 second timeout per container
            )
 
             container_info = {