Fix critical reliability and security issues in hwmonDaemon

Critical fixes implemented:
- Add 10MB storage limit with automatic cleanup of old history files
- Add file locking (fcntl) to prevent race conditions in history writes (pattern sketched after this list)
- Disable SMART monitoring for unreliable Ridata drives
- Fix bare except clause in _read_ecc_count() to properly catch errors
- Add timeouts to all network and subprocess calls (10s for API, 30s for subprocess)
- Fix unchecked regex in ticket creation to prevent AttributeError
- Add JSON decode error handling for ticket API responses
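
The file-locking approach for the history writes follows the read-modify-write pattern sketched below. This is a minimal illustration that mirrors the diff further down; the standalone append_history_entry helper is hypothetical (in the daemon the same logic lives inside _analyze_smart_trends):

    import fcntl, json, os

    def append_history_entry(path: str, entry: dict) -> None:
        """Append an entry to a JSON history file under an exclusive advisory lock."""
        mode = 'r+' if os.path.exists(path) else 'w+'
        with open(path, mode) as f:
            fcntl.flock(f.fileno(), fcntl.LOCK_EX)  # block concurrent writers
            try:
                history = []
                if os.path.getsize(path) > 0:
                    f.seek(0)
                    try:
                        history = json.load(f)
                    except json.JSONDecodeError:
                        history = []  # corrupted file: start fresh
                history.append(entry)
                f.seek(0)
                f.truncate()
                json.dump(history, f, indent=2)
                f.flush()
            finally:
                fcntl.flock(f.fileno(), fcntl.LOCK_UN)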

Service configuration improvements:
- hwmon.timer: Reduce jitter from 300s to 60s, add Persistent=true
- hwmon.service: Add Restart=on-failure, TimeoutStartSec=300, logging to journal

These changes improve reliability, prevent hung processes, eliminate race
conditions, and add proper error handling throughout the daemon.
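
The timeout additions follow the pattern below. This is a hedged sketch, not the daemon's exact code: ping_host is a hypothetical helper, and the subprocess.TimeoutExpired handling shown here is an assumption about how a hung child should be treated (the diff only shows the timeout= arguments being added):

    import subprocess

    def ping_host(host: str, count: int = 3, timeout_s: int = 30) -> bool:
        """Return True if the host answers ping without blocking the caller forever."""
        try:
            result = subprocess.run(
                ["ping", "-c", str(count), host],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                timeout=timeout_s,  # mirrors the 30s subprocess timeout added in this commit
            )
            return result.returncode == 0
        except subprocess.TimeoutExpired:
            # subprocess.run kills the child when the timeout expires;
            # treat it as a failed check instead of hanging the whole run.
            return False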

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-06 16:55:48 -05:00
parent 0577c7fc1b
commit fe832c42f3
3 changed files with 170 additions and 76 deletions

hwmon.service

@@ -7,6 +7,11 @@ Type=oneshot
 ExecStart=/usr/bin/env python3 -c "import urllib.request; exec(urllib.request.urlopen('http://10.10.10.63:3000/LotusGuild/hwmonDaemon/raw/branch/main/hwmonDaemon.py').read().decode('utf-8'))"
 User=root
 Group=root
+Restart=on-failure
+RestartSec=60
+TimeoutStartSec=300
+StandardOutput=journal
+StandardError=journal
 
 [Install]
 WantedBy=multi-user.target

hwmon.timer

@@ -1,9 +1,10 @@
 [Unit]
-Description=Run System Health Monitoring Daemon Daily
+Description=Run System Health Monitoring Daemon Hourly
 
 [Timer]
 OnCalendar=hourly
-RandomizedDelaySec=300
+RandomizedDelaySec=60
+Persistent=true
 
 [Install]
 WantedBy=timers.target

hwmonDaemon.py

@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re, glob, datetime
+import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re, glob, datetime, fcntl
 from typing import Dict, Any, List
 
 # =============================================================================
@@ -526,6 +526,51 @@ class SystemHealthMonitor:
         # Ensure history directory exists
         os.makedirs(self.CONFIG['HISTORY_DIR'], exist_ok=True)
 
+    def _enforce_storage_limit(self, history_dir: str, max_bytes: int = 10485760):
+        """
+        Delete oldest history files if directory exceeds size limit (default 10MB).
+
+        :param history_dir: Directory containing history files
+        :param max_bytes: Maximum directory size in bytes (default 10MB)
+        """
+        if not os.path.exists(history_dir):
+            return
+
+        try:
+            total_size = 0
+            files_with_mtime = []
+
+            # Calculate total size and collect file metadata
+            for f in os.listdir(history_dir):
+                filepath = os.path.join(history_dir, f)
+                if f.startswith('smart_history_') and f.endswith('.json'):
+                    try:
+                        stat = os.stat(filepath)
+                        total_size += stat.st_size
+                        files_with_mtime.append((filepath, stat.st_mtime, stat.st_size))
+                    except (IOError, OSError) as e:
+                        logger.debug(f"Could not stat file {filepath}: {e}")
+
+            # If over limit, delete oldest files first
+            if total_size > max_bytes:
+                # Sort by modification time (oldest first)
+                files_with_mtime.sort(key=lambda x: x[1])
+                logger.info(f"History directory size ({total_size} bytes) exceeds limit ({max_bytes} bytes), cleaning up...")
+
+                for filepath, mtime, file_size in files_with_mtime:
+                    if total_size <= max_bytes:
+                        break
+                    try:
+                        os.remove(filepath)
+                        total_size -= file_size
+                        logger.info(f"Removed old history file {os.path.basename(filepath)} (saved {file_size} bytes)")
+                    except (IOError, OSError) as e:
+                        logger.warning(f"Could not remove history file {filepath}: {e}")
+        except Exception as e:
+            logger.error(f"Error enforcing storage limit: {e}")
+
     # =============================================================================
     # MAIN EXECUTION METHODS
     # =============================================================================
@@ -599,65 +644,86 @@ class SystemHealthMonitor:
     def _analyze_smart_trends(self, device: str, current_attributes: dict) -> List[str]:
         """Analyze SMART attribute trends to predict failures."""
         issues = []
 
         # Create safe filename from device path
        device_safe = device.replace('/', '_').replace('-', '_')
         historical_file = os.path.join(self.CONFIG['HISTORY_DIR'], f"smart_history_{device_safe}.json")
 
         try:
-            # Load historical data
-            if os.path.exists(historical_file):
-                with open(historical_file, 'r') as f:
-                    history = json.load(f)
-            else:
-                history = []
-
-            # Add current reading
-            current_reading = {
-                'timestamp': datetime.datetime.now().isoformat(),
-                'attributes': current_attributes
-            }
-            history.append(current_reading)
-
-            # Keep only recent data
-            cutoff_date = datetime.datetime.now() - datetime.timedelta(days=self.CONFIG['HISTORY_RETENTION_DAYS'])
-            history = [h for h in history if datetime.datetime.fromisoformat(h['timestamp']) > cutoff_date]
-
-            # Analyze trends for critical attributes
-            if len(history) >= 3:  # Need at least 3 data points
-                critical_attrs = ['Reallocated_Sector_Ct', 'Current_Pending_Sector', 'Reported_Uncorrect',
-                                  'Offline_Uncorrectable', 'Program_Fail_Count', 'Erase_Fail_Count']
-
-                for attr in critical_attrs:
-                    if attr in current_attributes:
-                        # Get last week's values
-                        recent_history = history[-7:] if len(history) >= 7 else history
-                        values = [h['attributes'].get(attr, 0) for h in recent_history]
-
-                        if len(values) >= 3:
-                            # Check for rapid increase
-                            recent_increase = values[-1] - values[0]
-                            if recent_increase > 0:
-                                rate = recent_increase / len(values)
-
-                                # Different thresholds for different attributes
-                                if attr in ['Reallocated_Sector_Ct', 'Current_Pending_Sector']:
-                                    if rate > 0.5:  # More than 0.5 sectors per check
-                                        issues.append(f"TREND ALERT: Rapid increase in {attr}: +{recent_increase} in {len(values)} checks")
-                                elif attr in ['Reported_Uncorrect', 'Offline_Uncorrectable']:
-                                    if rate > 0.2:  # Any consistent increase is concerning
-                                        issues.append(f"TREND ALERT: Increasing {attr}: +{recent_increase} in {len(values)} checks")
-                                else:  # Program/Erase fail counts
-                                    if rate > 1:  # More than 1 error per check
-                                        issues.append(f"TREND ALERT: Rapid increase in {attr}: +{recent_increase} in {len(values)} checks")
-
-            # Save updated history
-            with open(historical_file, 'w') as f:
-                json.dump(history, f, indent=2)
-
+            # Enforce storage limit before writing
+            self._enforce_storage_limit(self.CONFIG['HISTORY_DIR'])
+
+            # Load historical data with file locking
+            history = []
+            file_mode = 'r+' if os.path.exists(historical_file) else 'w+'
+            with open(historical_file, file_mode) as f:
+                # Acquire exclusive lock
+                fcntl.flock(f.fileno(), fcntl.LOCK_EX)
+                try:
+                    # Read existing data if file is not empty
+                    if os.path.getsize(historical_file) > 0:
+                        f.seek(0)
+                        try:
+                            history = json.load(f)
+                        except json.JSONDecodeError as e:
+                            logger.warning(f"Corrupted history file {historical_file}, starting fresh: {e}")
+                            history = []
+
+                    # Add current reading
+                    current_reading = {
+                        'timestamp': datetime.datetime.now().isoformat(),
+                        'attributes': current_attributes
+                    }
+                    history.append(current_reading)
+
+                    # Keep only recent data (30 days default)
+                    cutoff_date = datetime.datetime.now() - datetime.timedelta(days=self.CONFIG['HISTORY_RETENTION_DAYS'])
+                    history = [h for h in history if datetime.datetime.fromisoformat(h['timestamp']) > cutoff_date]
+
+                    # Analyze trends for critical attributes
+                    if len(history) >= 3:  # Need at least 3 data points for trend analysis
+                        critical_attrs = ['Reallocated_Sector_Ct', 'Current_Pending_Sector', 'Reported_Uncorrect',
+                                          'Offline_Uncorrectable', 'Program_Fail_Count', 'Erase_Fail_Count']
+
+                        for attr in critical_attrs:
+                            if attr in current_attributes:
+                                # Get last week's values
+                                recent_history = history[-7:] if len(history) >= 7 else history
+                                values = [h['attributes'].get(attr, 0) for h in recent_history]
+
+                                if len(values) >= 3:
+                                    # Check for rapid increase
+                                    recent_increase = values[-1] - values[0]
+                                    if recent_increase > 0:
+                                        rate = recent_increase / len(values)
+
+                                        # Different thresholds for different attributes
+                                        if attr in ['Reallocated_Sector_Ct', 'Current_Pending_Sector']:
+                                            if rate > 0.5:  # More than 0.5 sectors per check
+                                                issues.append(f"TREND ALERT: Rapid increase in {attr}: +{recent_increase} in {len(values)} checks")
+                                        elif attr in ['Reported_Uncorrect', 'Offline_Uncorrectable']:
+                                            if rate > 0.2:  # Any consistent increase is concerning
+                                                issues.append(f"TREND ALERT: Increasing {attr}: +{recent_increase} in {len(values)} checks")
+                                        else:  # Program/Erase fail counts
+                                            if rate > 1:  # More than 1 error per check
+                                                issues.append(f"TREND ALERT: Rapid increase in {attr}: +{recent_increase} in {len(values)} checks")
+
+                    # Write updated history atomically
+                    f.seek(0)
+                    f.truncate()
+                    json.dump(history, f, indent=2)
+                    f.flush()
+                finally:
+                    # Release lock
+                    fcntl.flock(f.fileno(), fcntl.LOCK_UN)
+        except (IOError, OSError) as e:
+            logger.debug(f"I/O error analyzing trends for {device}: {e}")
         except Exception as e:
-            logger.debug(f"Error analyzing trends for {device}: {e}")
+            logger.error(f"Unexpected error analyzing trends for {device}: {e}")
 
         return issues
 
     def _check_thermal_health(self, device: str, temperature: int, drive_type: str = 'HDD') -> List[str]:
@@ -1198,10 +1264,14 @@ class SystemHealthMonitor:
             drive_size = ""
             if "Drive" in issue and "/dev/" in issue:
-                device = re.search(r'/dev/[a-zA-Z0-9]+', issue).group(0)
-                drive_details = self._get_drive_details(device)
-                if drive_details['capacity']:
-                    drive_size = f"[{drive_details['capacity']}] "
+                device_match = re.search(r'/dev/[a-zA-Z0-9]+', issue)
+                if device_match:
+                    device = device_match.group(0)
+                    drive_details = self._get_drive_details(device)
+                    if drive_details['capacity']:
+                        drive_size = f"[{drive_details['capacity']}] "
+                else:
+                    logger.warning(f"Could not extract device from issue: {issue}")
 
             # Determine if this is a hardware or software issue
             issue_category = 'SOFTWARE' if 'LXC' in issue else 'HARDWARE'
@@ -1240,10 +1310,15 @@ class SystemHealthMonitor:
                 headers = {
                     'Content-Type': 'application/json',
                     'Authorization': f'Bearer {self.CONFIG["TICKET_API_KEY"]}'
-                }
+                },
+                timeout=10  # 10 second timeout for API calls
             )
 
-            response_data = response.json()
+            try:
+                response_data = response.json()
+            except json.JSONDecodeError as e:
+                logger.error(f"Invalid JSON response from ticket API: {e}")
+                continue
 
             if response_data.get('success'):
                 logger.info(f"Ticket created successfully: {ticket_title}")
@@ -1742,6 +1817,14 @@ class SystemHealthMonitor:
             smart_health['issues'].append("Unable to read device information")
             return smart_health
 
+        # Skip Ridata drives entirely - unreliable and being replaced
+        manufacturer = self._detect_manufacturer(drive_details.get('model', ''))
+        if manufacturer == 'Ridata':
+            smart_health['status'] = 'SKIPPED'
+            smart_health['issues'].append("Ridata drive - monitoring disabled (unreliable hardware)")
+            logger.debug(f"Skipping SMART monitoring for Ridata drive {device}")
+            return smart_health
+
         logger.debug(f"Drive details for {device}: {drive_details}")
 
         manufacturer_profile = self._get_manufacturer_profile(
@@ -2276,14 +2359,15 @@ class SystemHealthMonitor:
     def _read_ecc_count(self, filepath: str) -> int:
        """
         Read ECC error count from a file.
 
         :param filepath: Path to the ECC count file
         :return: Number of ECC errors
         """
         try:
             with open(filepath, 'r') as f:
                 return int(f.read().strip())
-        except:
+        except (IOError, OSError, ValueError) as e:
+            logger.debug(f"Could not read ECC count from {filepath}: {e}")
             return 0
 
     def _check_cpu_usage(self) -> Dict[str, Any]:
@@ -2322,16 +2406,17 @@ class SystemHealthMonitor:
             # Check management network connectivity
             mgmt_result = subprocess.run(
                 [
                     "ping",
                     "-c", str(self.CONFIG['NETWORKS']['PING_COUNT']),
                     "-W", str(self.CONFIG['NETWORKS']['PING_TIMEOUT']),
                     self.CONFIG['NETWORKS']['MANAGEMENT']
                 ],
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,
-                text=True
+                text=True,
+                timeout=30  # 30 second timeout for subprocess
             )
 
             if mgmt_result.returncode != 0:
                 network_health['management_network']['status'] = 'CRITICAL'
                 network_health['management_network']['issues'].append(
@@ -2348,7 +2433,8 @@ class SystemHealthMonitor:
                 ],
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,
-                text=True
+                text=True,
+                timeout=30  # 30 second timeout for subprocess
             )
 
             if ceph_result.returncode != 0:
@@ -2382,28 +2468,30 @@ class SystemHealthMonitor:
                 ['pct', 'list'],
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,
-                text=True
+                text=True,
+                timeout=30  # 30 second timeout
             )
             logger.debug(f"pct list output:\n{result.stdout}")
 
             for line in result.stdout.split('\n')[1:]:
                 if not line.strip():
                     continue
 
                 parts = line.split()
                 if len(parts) < 2:
                     logger.debug(f"Skipping invalid line: {line}")
                     continue
 
                 vmid, status = parts[0], parts[1]
                 if status.lower() == 'running':
                     logger.debug(f"Checking container {vmid} disk usage")
                     disk_info = subprocess.run(
                         ['pct', 'df', vmid],
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
-                        text=True
+                        text=True,
+                        timeout=30  # 30 second timeout per container
                     )
 
                     container_info = {