Fix critical reliability and security issues in hwmonDaemon
Critical fixes implemented:
- Add 10MB storage limit with automatic cleanup of old history files
- Add file locking (fcntl) to prevent race conditions in history writes
- Disable SMART monitoring for unreliable Ridata drives
- Fix bare except clause in _read_ecc_count() to properly catch errors
- Add timeouts to all network and subprocess calls (10s for API, 30s for subprocess)
- Fix unchecked regex in ticket creation to prevent AttributeError
- Add JSON decode error handling for ticket API responses

Service configuration improvements:
- hwmon.timer: Reduce jitter from 300s to 60s, add Persistent=true
- hwmon.service: Add Restart=on-failure, TimeoutStartSec=300, logging to journal

These changes improve reliability, prevent hung processes, eliminate race conditions, and add proper error handling throughout the daemon.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
hwmon.service
@@ -7,6 +7,11 @@ Type=oneshot
 ExecStart=/usr/bin/env python3 -c "import urllib.request; exec(urllib.request.urlopen('http://10.10.10.63:3000/LotusGuild/hwmonDaemon/raw/branch/main/hwmonDaemon.py').read().decode('utf-8'))"
 User=root
 Group=root
+Restart=on-failure
+RestartSec=60
+TimeoutStartSec=300
+StandardOutput=journal
+StandardError=journal
 
 [Install]
 WantedBy=multi-user.target
hwmon.timer
@@ -1,9 +1,10 @@
 [Unit]
-Description=Run System Health Monitoring Daemon Daily
+Description=Run System Health Monitoring Daemon Hourly
 
 [Timer]
 OnCalendar=hourly
-RandomizedDelaySec=300
+RandomizedDelaySec=60
+Persistent=true
 
 [Install]
 WantedBy=timers.target
hwmonDaemon.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re, glob, datetime
+import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re, glob, datetime, fcntl
 from typing import Dict, Any, List
 
 # =============================================================================
@@ -526,6 +526,51 @@ class SystemHealthMonitor:
         # Ensure history directory exists
         os.makedirs(self.CONFIG['HISTORY_DIR'], exist_ok=True)
 
+    def _enforce_storage_limit(self, history_dir: str, max_bytes: int = 10485760):
+        """
+        Delete oldest history files if directory exceeds size limit (default 10MB).
+
+        :param history_dir: Directory containing history files
+        :param max_bytes: Maximum directory size in bytes (default 10MB)
+        """
+        if not os.path.exists(history_dir):
+            return
+
+        try:
+            total_size = 0
+            files_with_mtime = []
+
+            # Calculate total size and collect file metadata
+            for f in os.listdir(history_dir):
+                filepath = os.path.join(history_dir, f)
+                if f.startswith('smart_history_') and f.endswith('.json'):
+                    try:
+                        stat = os.stat(filepath)
+                        total_size += stat.st_size
+                        files_with_mtime.append((filepath, stat.st_mtime, stat.st_size))
+                    except (IOError, OSError) as e:
+                        logger.debug(f"Could not stat file {filepath}: {e}")
+
+            # If over limit, delete oldest files first
+            if total_size > max_bytes:
+                # Sort by modification time (oldest first)
+                files_with_mtime.sort(key=lambda x: x[1])
+
+                logger.info(f"History directory size ({total_size} bytes) exceeds limit ({max_bytes} bytes), cleaning up...")
+
+                for filepath, mtime, file_size in files_with_mtime:
+                    if total_size <= max_bytes:
+                        break
+                    try:
+                        os.remove(filepath)
+                        total_size -= file_size
+                        logger.info(f"Removed old history file {os.path.basename(filepath)} (saved {file_size} bytes)")
+                    except (IOError, OSError) as e:
+                        logger.warning(f"Could not remove history file {filepath}: {e}")
+
+        except Exception as e:
+            logger.error(f"Error enforcing storage limit: {e}")
+
     # =============================================================================
     # MAIN EXECUTION METHODS
     # =============================================================================
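For illustration, the same oldest-first, size-capped cleanup can be exercised outside the daemon. This is a hedged sketch, not the daemon's code: the temp directory, the dummy files, and the 100-byte cap are invented for the demo (the daemon's default cap is 10 MB).

```python
# Illustrative only: exercises the oldest-first cleanup idea on a throwaway
# directory. Filenames, sizes, and the 100-byte cap are invented for the demo.
import json
import os
import tempfile
import time


def enforce_storage_limit(history_dir: str, max_bytes: int) -> None:
    """Delete oldest smart_history_*.json files until the directory fits."""
    entries = []
    total = 0
    for name in os.listdir(history_dir):
        if name.startswith('smart_history_') and name.endswith('.json'):
            path = os.path.join(history_dir, name)
            st = os.stat(path)
            entries.append((st.st_mtime, st.st_size, path))
            total += st.st_size
    for mtime, size, path in sorted(entries):      # oldest first
        if total <= max_bytes:
            break
        os.remove(path)
        total -= size


with tempfile.TemporaryDirectory() as d:
    for i in range(5):
        with open(os.path.join(d, f'smart_history_sd{i}.json'), 'w') as fh:
            json.dump({'attributes': {'Reallocated_Sector_Ct': i}}, fh)
        time.sleep(0.01)                           # distinct mtimes
    enforce_storage_limit(d, max_bytes=100)        # tiny cap for the demo
    print(sorted(os.listdir(d)))                   # only the newest files survive
```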
@@ -605,58 +650,79 @@ class SystemHealthMonitor:
         historical_file = os.path.join(self.CONFIG['HISTORY_DIR'], f"smart_history_{device_safe}.json")
 
         try:
-            # Load historical data
-            if os.path.exists(historical_file):
-                with open(historical_file, 'r') as f:
-                    history = json.load(f)
-            else:
-                history = []
+            # Enforce storage limit before writing
+            self._enforce_storage_limit(self.CONFIG['HISTORY_DIR'])
 
-            # Add current reading
-            current_reading = {
-                'timestamp': datetime.datetime.now().isoformat(),
-                'attributes': current_attributes
-            }
-            history.append(current_reading)
+            # Load historical data with file locking
+            history = []
+            file_mode = 'r+' if os.path.exists(historical_file) else 'w+'
 
-            # Keep only recent data
-            cutoff_date = datetime.datetime.now() - datetime.timedelta(days=self.CONFIG['HISTORY_RETENTION_DAYS'])
-            history = [h for h in history if datetime.datetime.fromisoformat(h['timestamp']) > cutoff_date]
+            with open(historical_file, file_mode) as f:
+                # Acquire exclusive lock
+                fcntl.flock(f.fileno(), fcntl.LOCK_EX)
+                try:
+                    # Read existing data if file is not empty
+                    if os.path.getsize(historical_file) > 0:
+                        f.seek(0)
+                        try:
+                            history = json.load(f)
+                        except json.JSONDecodeError as e:
+                            logger.warning(f"Corrupted history file {historical_file}, starting fresh: {e}")
+                            history = []
 
-            # Analyze trends for critical attributes
-            if len(history) >= 3:  # Need at least 3 data points
-                critical_attrs = ['Reallocated_Sector_Ct', 'Current_Pending_Sector', 'Reported_Uncorrect',
-                                  'Offline_Uncorrectable', 'Program_Fail_Count', 'Erase_Fail_Count']
+                    # Add current reading
+                    current_reading = {
+                        'timestamp': datetime.datetime.now().isoformat(),
+                        'attributes': current_attributes
+                    }
+                    history.append(current_reading)
 
-                for attr in critical_attrs:
-                    if attr in current_attributes:
-                        # Get last week's values
-                        recent_history = history[-7:] if len(history) >= 7 else history
-                        values = [h['attributes'].get(attr, 0) for h in recent_history]
+                    # Keep only recent data (30 days default)
+                    cutoff_date = datetime.datetime.now() - datetime.timedelta(days=self.CONFIG['HISTORY_RETENTION_DAYS'])
+                    history = [h for h in history if datetime.datetime.fromisoformat(h['timestamp']) > cutoff_date]
 
-                        if len(values) >= 3:
-                            # Check for rapid increase
-                            recent_increase = values[-1] - values[0]
-                            if recent_increase > 0:
-                                rate = recent_increase / len(values)
+                    # Analyze trends for critical attributes
+                    if len(history) >= 3:  # Need at least 3 data points for trend analysis
+                        critical_attrs = ['Reallocated_Sector_Ct', 'Current_Pending_Sector', 'Reported_Uncorrect',
+                                          'Offline_Uncorrectable', 'Program_Fail_Count', 'Erase_Fail_Count']
 
-                                # Different thresholds for different attributes
-                                if attr in ['Reallocated_Sector_Ct', 'Current_Pending_Sector']:
-                                    if rate > 0.5:  # More than 0.5 sectors per check
-                                        issues.append(f"TREND ALERT: Rapid increase in {attr}: +{recent_increase} in {len(values)} checks")
-                                elif attr in ['Reported_Uncorrect', 'Offline_Uncorrectable']:
-                                    if rate > 0.2:  # Any consistent increase is concerning
-                                        issues.append(f"TREND ALERT: Increasing {attr}: +{recent_increase} in {len(values)} checks")
-                                else:  # Program/Erase fail counts
-                                    if rate > 1:  # More than 1 error per check
-                                        issues.append(f"TREND ALERT: Rapid increase in {attr}: +{recent_increase} in {len(values)} checks")
+                        for attr in critical_attrs:
+                            if attr in current_attributes:
+                                # Get last week's values
+                                recent_history = history[-7:] if len(history) >= 7 else history
+                                values = [h['attributes'].get(attr, 0) for h in recent_history]
 
-            # Save updated history
-            with open(historical_file, 'w') as f:
-                json.dump(history, f, indent=2)
+                                if len(values) >= 3:
+                                    # Check for rapid increase
+                                    recent_increase = values[-1] - values[0]
+                                    if recent_increase > 0:
+                                        rate = recent_increase / len(values)
+
+                                        # Different thresholds for different attributes
+                                        if attr in ['Reallocated_Sector_Ct', 'Current_Pending_Sector']:
+                                            if rate > 0.5:  # More than 0.5 sectors per check
+                                                issues.append(f"TREND ALERT: Rapid increase in {attr}: +{recent_increase} in {len(values)} checks")
+                                        elif attr in ['Reported_Uncorrect', 'Offline_Uncorrectable']:
+                                            if rate > 0.2:  # Any consistent increase is concerning
+                                                issues.append(f"TREND ALERT: Increasing {attr}: +{recent_increase} in {len(values)} checks")
+                                        else:  # Program/Erase fail counts
+                                            if rate > 1:  # More than 1 error per check
+                                                issues.append(f"TREND ALERT: Rapid increase in {attr}: +{recent_increase} in {len(values)} checks")
+
+                    # Write updated history atomically
+                    f.seek(0)
+                    f.truncate()
+                    json.dump(history, f, indent=2)
+                    f.flush()
+
+                finally:
+                    # Release lock
+                    fcntl.flock(f.fileno(), fcntl.LOCK_UN)
+
+        except (IOError, OSError) as e:
+            logger.debug(f"I/O error analyzing trends for {device}: {e}")
         except Exception as e:
-            logger.debug(f"Error analyzing trends for {device}: {e}")
+            logger.error(f"Unexpected error analyzing trends for {device}: {e}")
 
         return issues
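Stripped of the trend analysis, the hunk above follows a lock-then-read-modify-rewrite pattern on a single file handle. A minimal sketch of that pattern, assuming a standalone script (the `history.json` path and the appended record are placeholders, not the daemon's data):

```python
# Minimal sketch of the flock read-modify-rewrite pattern used above.
# 'history.json' and the appended record are placeholders for illustration.
import fcntl
import json
import os

path = 'history.json'
mode = 'r+' if os.path.exists(path) else 'w+'

with open(path, mode) as f:
    fcntl.flock(f.fileno(), fcntl.LOCK_EX)      # block until we own the file
    try:
        data = []
        if os.path.getsize(path) > 0:
            f.seek(0)
            try:
                data = json.load(f)
            except json.JSONDecodeError:
                data = []                        # corrupted file: start fresh
        data.append({'example': True})           # modify under the lock
        f.seek(0)
        f.truncate()                             # rewrite in place
        json.dump(data, f, indent=2)
        f.flush()
    finally:
        fcntl.flock(f.fileno(), fcntl.LOCK_UN)   # lock is also released on close
```

The lock is advisory, so it only coordinates writers that also call `flock` on the same file.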
@@ -1198,10 +1264,14 @@ class SystemHealthMonitor:
 
             drive_size = ""
             if "Drive" in issue and "/dev/" in issue:
-                device = re.search(r'/dev/[a-zA-Z0-9]+', issue).group(0)
-                drive_details = self._get_drive_details(device)
-                if drive_details['capacity']:
-                    drive_size = f"[{drive_details['capacity']}] "
+                device_match = re.search(r'/dev/[a-zA-Z0-9]+', issue)
+                if device_match:
+                    device = device_match.group(0)
+                    drive_details = self._get_drive_details(device)
+                    if drive_details['capacity']:
+                        drive_size = f"[{drive_details['capacity']}] "
+                else:
+                    logger.warning(f"Could not extract device from issue: {issue}")
 
             # Determine if this is a hardware or software issue
             issue_category = 'SOFTWARE' if 'LXC' in issue else 'HARDWARE'
@@ -1240,10 +1310,15 @@ class SystemHealthMonitor:
                 headers = {
                     'Content-Type': 'application/json',
                     'Authorization': f'Bearer {self.CONFIG["TICKET_API_KEY"]}'
-                }
+                },
+                timeout=10  # 10 second timeout for API calls
             )
 
-            response_data = response.json()
+            try:
+                response_data = response.json()
+            except json.JSONDecodeError as e:
+                logger.error(f"Invalid JSON response from ticket API: {e}")
+                continue
 
             if response_data.get('success'):
                 logger.info(f"Ticket created successfully: {ticket_title}")
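Only fragments of the ticket-creation request appear in this hunk. As a rough sketch of the combined pattern under stated assumptions (the function name, URL, payload, and the choice to return rather than `continue` are illustrative, not the daemon's actual flow):

```python
# Hedged sketch of a POST with a timeout and JSON-decode handling.
# The function name, URL, payload, and token are placeholders.
import json
import logging

import requests

logger = logging.getLogger(__name__)


def create_ticket(url: str, payload: dict, api_key: str) -> bool:
    try:
        response = requests.post(
            url,
            json=payload,
            headers={
                'Content-Type': 'application/json',
                'Authorization': f'Bearer {api_key}',
            },
            timeout=10,  # fail fast instead of hanging the daemon
        )
    except requests.exceptions.RequestException as e:  # includes Timeout
        logger.error(f"Ticket API request failed: {e}")
        return False

    try:
        response_data = response.json()
    except ValueError as e:  # non-JSON body; covers json.JSONDecodeError too
        logger.error(f"Invalid JSON response from ticket API: {e}")
        return False

    return bool(response_data.get('success'))
```

With a timeout set, `requests` raises `requests.exceptions.Timeout` (a subclass of `RequestException`) instead of blocking indefinitely.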
@@ -1742,6 +1817,14 @@ class SystemHealthMonitor:
             smart_health['issues'].append("Unable to read device information")
             return smart_health
 
+        # Skip Ridata drives entirely - unreliable and being replaced
+        manufacturer = self._detect_manufacturer(drive_details.get('model', ''))
+        if manufacturer == 'Ridata':
+            smart_health['status'] = 'SKIPPED'
+            smart_health['issues'].append("Ridata drive - monitoring disabled (unreliable hardware)")
+            logger.debug(f"Skipping SMART monitoring for Ridata drive {device}")
+            return smart_health
+
         logger.debug(f"Drive details for {device}: {drive_details}")
 
         manufacturer_profile = self._get_manufacturer_profile(
@@ -2283,7 +2366,8 @@ class SystemHealthMonitor:
         try:
             with open(filepath, 'r') as f:
                 return int(f.read().strip())
-        except:
+        except (IOError, OSError, ValueError) as e:
+            logger.debug(f"Could not read ECC count from {filepath}: {e}")
             return 0
 
     def _check_cpu_usage(self) -> Dict[str, Any]:
@@ -2329,7 +2413,8 @@ class SystemHealthMonitor:
             ],
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE,
-            text=True
+            text=True,
+            timeout=30  # 30 second timeout for subprocess
         )
 
         if mgmt_result.returncode != 0:
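The timeout hunks above and below only add the `timeout=30` argument; what happens when it fires is outside the visible context. A minimal sketch of the pattern, assuming `subprocess.run` is the call being wrapped (the helper name and example command are hypothetical):

```python
# Hedged sketch: catching the TimeoutExpired that subprocess.run raises
# when the 30s limit is hit. The helper and example command are illustrative.
import logging
import subprocess

logger = logging.getLogger(__name__)


def run_checked(cmd: list, timeout: int = 30):
    try:
        result = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=timeout,  # raises TimeoutExpired instead of hanging forever
        )
    except subprocess.TimeoutExpired:
        logger.warning(f"Command timed out after {timeout}s: {' '.join(cmd)}")
        return None
    if result.returncode != 0:
        logger.debug(f"Command failed ({result.returncode}): {result.stderr.strip()}")
    return result


# Example: run_checked(['smartctl', '--scan'])
```

`subprocess.run` raises `subprocess.TimeoutExpired` when the limit is hit, so callers that previously assumed a result object need a branch like the one above.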
@@ -2348,7 +2433,8 @@ class SystemHealthMonitor:
             ],
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE,
-            text=True
+            text=True,
+            timeout=30  # 30 second timeout for subprocess
         )
 
         if ceph_result.returncode != 0:
@@ -2382,7 +2468,8 @@ class SystemHealthMonitor:
                 ['pct', 'list'],
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,
-                text=True
+                text=True,
+                timeout=30  # 30 second timeout
             )
             logger.debug(f"pct list output:\n{result.stdout}")
 
@@ -2403,7 +2490,8 @@ class SystemHealthMonitor:
                 ['pct', 'df', vmid],
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,
-                text=True
+                text=True,
+                timeout=30  # 30 second timeout per container
            )
 
             container_info = {