Fix critical reliability and security issues in hwmonDaemon

Critical fixes implemented:
- Add 10MB storage limit with automatic cleanup of old history files
- Add file locking (fcntl) to prevent race conditions in history writes (pattern sketched after this list)
- Disable SMART monitoring for unreliable Ridata drives
- Fix bare except clause in _read_ecc_count() to properly catch errors
- Add timeouts to all network and subprocess calls (10s for API, 30s for subprocess)
- Fix unchecked regex in ticket creation to prevent AttributeError
- Add JSON decode error handling for ticket API responses
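
The file-locking approach for the history writes follows the read-modify-write pattern sketched below. This is a minimal illustration that mirrors the diff further down; the standalone append_history_entry helper is hypothetical (in the daemon the same logic lives inside _analyze_smart_trends):

    import fcntl, json, os

    def append_history_entry(path: str, entry: dict) -> None:
        """Append an entry to a JSON history file under an exclusive advisory lock."""
        mode = 'r+' if os.path.exists(path) else 'w+'
        with open(path, mode) as f:
            fcntl.flock(f.fileno(), fcntl.LOCK_EX)  # block concurrent writers
            try:
                history = []
                if os.path.getsize(path) > 0:
                    f.seek(0)
                    try:
                        history = json.load(f)
                    except json.JSONDecodeError:
                        history = []  # corrupted file: start fresh
                history.append(entry)
                f.seek(0)
                f.truncate()
                json.dump(history, f, indent=2)
                f.flush()
            finally:
                fcntl.flock(f.fileno(), fcntl.LOCK_UN)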

Service configuration improvements:
- hwmon.timer: Reduce jitter from 300s to 60s, add Persistent=true
- hwmon.service: Add Restart=on-failure, TimeoutStartSec=300, logging to journal

These changes improve reliability, prevent hung processes, eliminate race
conditions, and add proper error handling throughout the daemon.
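
The timeout additions follow the pattern below. This is a hedged sketch, not the daemon's exact code: ping_host is a hypothetical helper, and the subprocess.TimeoutExpired handling shown here is an assumption about how a hung child should be treated (the diff only shows the timeout= arguments being added):

    import subprocess

    def ping_host(host: str, count: int = 3, timeout_s: int = 30) -> bool:
        """Return True if the host answers ping without blocking the caller forever."""
        try:
            result = subprocess.run(
                ["ping", "-c", str(count), host],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                timeout=timeout_s,  # mirrors the 30s subprocess timeout added in this commit
            )
            return result.returncode == 0
        except subprocess.TimeoutExpired:
            # subprocess.run kills the child when the timeout expires;
            # treat it as a failed check instead of hanging the whole run.
            return False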

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-06 16:55:48 -05:00
parent 0577c7fc1b
commit fe832c42f3
3 changed files with 170 additions and 76 deletions

hwmon.service

@@ -7,6 +7,11 @@ Type=oneshot
 ExecStart=/usr/bin/env python3 -c "import urllib.request; exec(urllib.request.urlopen('http://10.10.10.63:3000/LotusGuild/hwmonDaemon/raw/branch/main/hwmonDaemon.py').read().decode('utf-8'))"
 User=root
 Group=root
+Restart=on-failure
+RestartSec=60
+TimeoutStartSec=300
+StandardOutput=journal
+StandardError=journal
 
 [Install]
 WantedBy=multi-user.target

hwmon.timer

@@ -1,9 +1,10 @@
 [Unit]
-Description=Run System Health Monitoring Daemon Daily
+Description=Run System Health Monitoring Daemon Hourly
 
 [Timer]
 OnCalendar=hourly
-RandomizedDelaySec=300
+RandomizedDelaySec=60
+Persistent=true
 
 [Install]
 WantedBy=timers.target

hwmonDaemon.py

@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re, glob, datetime
+import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re, glob, datetime, fcntl
 from typing import Dict, Any, List
 
 # =============================================================================
@@ -526,6 +526,51 @@ class SystemHealthMonitor:
         # Ensure history directory exists
         os.makedirs(self.CONFIG['HISTORY_DIR'], exist_ok=True)
 
+    def _enforce_storage_limit(self, history_dir: str, max_bytes: int = 10485760):
+        """
+        Delete oldest history files if directory exceeds size limit (default 10MB).
+
+        :param history_dir: Directory containing history files
+        :param max_bytes: Maximum directory size in bytes (default 10MB)
+        """
+        if not os.path.exists(history_dir):
+            return
+
+        try:
+            total_size = 0
+            files_with_mtime = []
+
+            # Calculate total size and collect file metadata
+            for f in os.listdir(history_dir):
+                filepath = os.path.join(history_dir, f)
+                if f.startswith('smart_history_') and f.endswith('.json'):
+                    try:
+                        stat = os.stat(filepath)
+                        total_size += stat.st_size
+                        files_with_mtime.append((filepath, stat.st_mtime, stat.st_size))
+                    except (IOError, OSError) as e:
+                        logger.debug(f"Could not stat file {filepath}: {e}")
+
+            # If over limit, delete oldest files first
+            if total_size > max_bytes:
+                # Sort by modification time (oldest first)
+                files_with_mtime.sort(key=lambda x: x[1])
+                logger.info(f"History directory size ({total_size} bytes) exceeds limit ({max_bytes} bytes), cleaning up...")
+
+                for filepath, mtime, file_size in files_with_mtime:
+                    if total_size <= max_bytes:
+                        break
+                    try:
+                        os.remove(filepath)
+                        total_size -= file_size
+                        logger.info(f"Removed old history file {os.path.basename(filepath)} (saved {file_size} bytes)")
+                    except (IOError, OSError) as e:
+                        logger.warning(f"Could not remove history file {filepath}: {e}")
+        except Exception as e:
+            logger.error(f"Error enforcing storage limit: {e}")
+
     # =============================================================================
     # MAIN EXECUTION METHODS
     # =============================================================================
@@ -599,65 +644,86 @@ class SystemHealthMonitor:
     def _analyze_smart_trends(self, device: str, current_attributes: dict) -> List[str]:
         """Analyze SMART attribute trends to predict failures."""
         issues = []
 
         # Create safe filename from device path
        device_safe = device.replace('/', '_').replace('-', '_')
         historical_file = os.path.join(self.CONFIG['HISTORY_DIR'], f"smart_history_{device_safe}.json")
 
         try:
-            # Load historical data
-            if os.path.exists(historical_file):
-                with open(historical_file, 'r') as f:
-                    history = json.load(f)
-            else:
-                history = []
-
-            # Add current reading
-            current_reading = {
-                'timestamp': datetime.datetime.now().isoformat(),
-                'attributes': current_attributes
-            }
-            history.append(current_reading)
-
-            # Keep only recent data
-            cutoff_date = datetime.datetime.now() - datetime.timedelta(days=self.CONFIG['HISTORY_RETENTION_DAYS'])
-            history = [h for h in history if datetime.datetime.fromisoformat(h['timestamp']) > cutoff_date]
-
-            # Analyze trends for critical attributes
-            if len(history) >= 3:  # Need at least 3 data points
-                critical_attrs = ['Reallocated_Sector_Ct', 'Current_Pending_Sector', 'Reported_Uncorrect',
-                                  'Offline_Uncorrectable', 'Program_Fail_Count', 'Erase_Fail_Count']
-
-                for attr in critical_attrs:
-                    if attr in current_attributes:
-                        # Get last week's values
-                        recent_history = history[-7:] if len(history) >= 7 else history
-                        values = [h['attributes'].get(attr, 0) for h in recent_history]
-
-                        if len(values) >= 3:
-                            # Check for rapid increase
-                            recent_increase = values[-1] - values[0]
-                            if recent_increase > 0:
-                                rate = recent_increase / len(values)
-
-                                # Different thresholds for different attributes
-                                if attr in ['Reallocated_Sector_Ct', 'Current_Pending_Sector']:
-                                    if rate > 0.5:  # More than 0.5 sectors per check
-                                        issues.append(f"TREND ALERT: Rapid increase in {attr}: +{recent_increase} in {len(values)} checks")
-                                elif attr in ['Reported_Uncorrect', 'Offline_Uncorrectable']:
-                                    if rate > 0.2:  # Any consistent increase is concerning
-                                        issues.append(f"TREND ALERT: Increasing {attr}: +{recent_increase} in {len(values)} checks")
-                                else:  # Program/Erase fail counts
-                                    if rate > 1:  # More than 1 error per check
-                                        issues.append(f"TREND ALERT: Rapid increase in {attr}: +{recent_increase} in {len(values)} checks")
-
-            # Save updated history
-            with open(historical_file, 'w') as f:
-                json.dump(history, f, indent=2)
-
+            # Enforce storage limit before writing
+            self._enforce_storage_limit(self.CONFIG['HISTORY_DIR'])
+
+            # Load historical data with file locking
+            history = []
+            file_mode = 'r+' if os.path.exists(historical_file) else 'w+'
+            with open(historical_file, file_mode) as f:
+                # Acquire exclusive lock
+                fcntl.flock(f.fileno(), fcntl.LOCK_EX)
+                try:
+                    # Read existing data if file is not empty
+                    if os.path.getsize(historical_file) > 0:
+                        f.seek(0)
+                        try:
+                            history = json.load(f)
+                        except json.JSONDecodeError as e:
+                            logger.warning(f"Corrupted history file {historical_file}, starting fresh: {e}")
+                            history = []
+
+                    # Add current reading
+                    current_reading = {
+                        'timestamp': datetime.datetime.now().isoformat(),
+                        'attributes': current_attributes
+                    }
+                    history.append(current_reading)
+
+                    # Keep only recent data (30 days default)
+                    cutoff_date = datetime.datetime.now() - datetime.timedelta(days=self.CONFIG['HISTORY_RETENTION_DAYS'])
+                    history = [h for h in history if datetime.datetime.fromisoformat(h['timestamp']) > cutoff_date]
+
+                    # Analyze trends for critical attributes
+                    if len(history) >= 3:  # Need at least 3 data points for trend analysis
+                        critical_attrs = ['Reallocated_Sector_Ct', 'Current_Pending_Sector', 'Reported_Uncorrect',
+                                          'Offline_Uncorrectable', 'Program_Fail_Count', 'Erase_Fail_Count']
+
+                        for attr in critical_attrs:
+                            if attr in current_attributes:
+                                # Get last week's values
+                                recent_history = history[-7:] if len(history) >= 7 else history
+                                values = [h['attributes'].get(attr, 0) for h in recent_history]
+
+                                if len(values) >= 3:
+                                    # Check for rapid increase
+                                    recent_increase = values[-1] - values[0]
+                                    if recent_increase > 0:
+                                        rate = recent_increase / len(values)
+
+                                        # Different thresholds for different attributes
+                                        if attr in ['Reallocated_Sector_Ct', 'Current_Pending_Sector']:
+                                            if rate > 0.5:  # More than 0.5 sectors per check
+                                                issues.append(f"TREND ALERT: Rapid increase in {attr}: +{recent_increase} in {len(values)} checks")
+                                        elif attr in ['Reported_Uncorrect', 'Offline_Uncorrectable']:
+                                            if rate > 0.2:  # Any consistent increase is concerning
+                                                issues.append(f"TREND ALERT: Increasing {attr}: +{recent_increase} in {len(values)} checks")
+                                        else:  # Program/Erase fail counts
+                                            if rate > 1:  # More than 1 error per check
+                                                issues.append(f"TREND ALERT: Rapid increase in {attr}: +{recent_increase} in {len(values)} checks")
+
+                    # Write updated history atomically
+                    f.seek(0)
+                    f.truncate()
+                    json.dump(history, f, indent=2)
+                    f.flush()
+                finally:
+                    # Release lock
+                    fcntl.flock(f.fileno(), fcntl.LOCK_UN)
+        except (IOError, OSError) as e:
+            logger.debug(f"I/O error analyzing trends for {device}: {e}")
         except Exception as e:
-            logger.debug(f"Error analyzing trends for {device}: {e}")
+            logger.error(f"Unexpected error analyzing trends for {device}: {e}")
 
         return issues
 
     def _check_thermal_health(self, device: str, temperature: int, drive_type: str = 'HDD') -> List[str]:
@@ -1198,10 +1264,14 @@ class SystemHealthMonitor:
             drive_size = ""
             if "Drive" in issue and "/dev/" in issue:
-                device = re.search(r'/dev/[a-zA-Z0-9]+', issue).group(0)
-                drive_details = self._get_drive_details(device)
-                if drive_details['capacity']:
-                    drive_size = f"[{drive_details['capacity']}] "
+                device_match = re.search(r'/dev/[a-zA-Z0-9]+', issue)
+                if device_match:
+                    device = device_match.group(0)
+                    drive_details = self._get_drive_details(device)
+                    if drive_details['capacity']:
+                        drive_size = f"[{drive_details['capacity']}] "
+                else:
+                    logger.warning(f"Could not extract device from issue: {issue}")
 
             # Determine if this is a hardware or software issue
             issue_category = 'SOFTWARE' if 'LXC' in issue else 'HARDWARE'
@@ -1240,10 +1310,15 @@ class SystemHealthMonitor:
                 headers = {
                     'Content-Type': 'application/json',
                     'Authorization': f'Bearer {self.CONFIG["TICKET_API_KEY"]}'
-                }
+                },
+                timeout=10  # 10 second timeout for API calls
             )
 
-            response_data = response.json()
+            try:
+                response_data = response.json()
+            except json.JSONDecodeError as e:
+                logger.error(f"Invalid JSON response from ticket API: {e}")
+                continue
 
             if response_data.get('success'):
                 logger.info(f"Ticket created successfully: {ticket_title}")
@@ -1742,6 +1817,14 @@ class SystemHealthMonitor:
             smart_health['issues'].append("Unable to read device information")
             return smart_health
 
+        # Skip Ridata drives entirely - unreliable and being replaced
+        manufacturer = self._detect_manufacturer(drive_details.get('model', ''))
+        if manufacturer == 'Ridata':
+            smart_health['status'] = 'SKIPPED'
+            smart_health['issues'].append("Ridata drive - monitoring disabled (unreliable hardware)")
+            logger.debug(f"Skipping SMART monitoring for Ridata drive {device}")
+            return smart_health
+
         logger.debug(f"Drive details for {device}: {drive_details}")
 
         manufacturer_profile = self._get_manufacturer_profile(
@@ -2276,14 +2359,15 @@ class SystemHealthMonitor:
     def _read_ecc_count(self, filepath: str) -> int:
        """
         Read ECC error count from a file.
 
         :param filepath: Path to the ECC count file
         :return: Number of ECC errors
         """
         try:
             with open(filepath, 'r') as f:
                 return int(f.read().strip())
-        except:
+        except (IOError, OSError, ValueError) as e:
+            logger.debug(f"Could not read ECC count from {filepath}: {e}")
             return 0
 
     def _check_cpu_usage(self) -> Dict[str, Any]:
@@ -2322,16 +2406,17 @@ class SystemHealthMonitor:
             # Check management network connectivity
             mgmt_result = subprocess.run(
                 [
                     "ping",
                     "-c", str(self.CONFIG['NETWORKS']['PING_COUNT']),
                     "-W", str(self.CONFIG['NETWORKS']['PING_TIMEOUT']),
                     self.CONFIG['NETWORKS']['MANAGEMENT']
                 ],
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,
-                text=True
+                text=True,
+                timeout=30  # 30 second timeout for subprocess
             )
 
             if mgmt_result.returncode != 0:
                 network_health['management_network']['status'] = 'CRITICAL'
                 network_health['management_network']['issues'].append(
@@ -2348,7 +2433,8 @@ class SystemHealthMonitor:
                 ],
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,
-                text=True
+                text=True,
+                timeout=30  # 30 second timeout for subprocess
             )
 
             if ceph_result.returncode != 0:
@@ -2382,28 +2468,30 @@ class SystemHealthMonitor:
                 ['pct', 'list'],
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,
-                text=True
+                text=True,
+                timeout=30  # 30 second timeout
             )
             logger.debug(f"pct list output:\n{result.stdout}")
 
             for line in result.stdout.split('\n')[1:]:
                 if not line.strip():
                     continue
 
                 parts = line.split()
                 if len(parts) < 2:
                     logger.debug(f"Skipping invalid line: {line}")
                     continue
 
                 vmid, status = parts[0], parts[1]
                 if status.lower() == 'running':
                     logger.debug(f"Checking container {vmid} disk usage")
                     disk_info = subprocess.run(
                         ['pct', 'df', vmid],
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
-                        text=True
+                        text=True,
+                        timeout=30  # 30 second timeout per container
                     )
 
                     container_info = {