diff --git a/hwmon.service b/hwmon.service
index 6ecd647..e7a966e 100644
--- a/hwmon.service
+++ b/hwmon.service
@@ -7,6 +7,11 @@ Type=oneshot
 ExecStart=/usr/bin/env python3 -c "import urllib.request; exec(urllib.request.urlopen('http://10.10.10.63:3000/LotusGuild/hwmonDaemon/raw/branch/main/hwmonDaemon.py').read().decode('utf-8'))"
 User=root
 Group=root
+Restart=on-failure
+RestartSec=60
+TimeoutStartSec=300
+StandardOutput=journal
+StandardError=journal
 
 [Install]
 WantedBy=multi-user.target
\ No newline at end of file
diff --git a/hwmon.timer b/hwmon.timer
index be07167..1b8b054 100644
--- a/hwmon.timer
+++ b/hwmon.timer
@@ -1,9 +1,10 @@
 [Unit]
-Description=Run System Health Monitoring Daemon Daily
+Description=Run System Health Monitoring Daemon Hourly
 
 [Timer]
 OnCalendar=hourly
-RandomizedDelaySec=300
+RandomizedDelaySec=60
+Persistent=true
 
 [Install]
 WantedBy=timers.target
\ No newline at end of file
diff --git a/hwmonDaemon.py b/hwmonDaemon.py
index 5c0b105..d480fda 100644
--- a/hwmonDaemon.py
+++ b/hwmonDaemon.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re, glob, datetime
+import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re, glob, datetime, fcntl
 from typing import Dict, Any, List
 
 # =============================================================================
@@ -526,6 +526,51 @@ class SystemHealthMonitor:
         # Ensure history directory exists
         os.makedirs(self.CONFIG['HISTORY_DIR'], exist_ok=True)
 
+    def _enforce_storage_limit(self, history_dir: str, max_bytes: int = 10485760):
+        """
+        Delete oldest history files if directory exceeds size limit (default 10MB).
+
+        :param history_dir: Directory containing history files
+        :param max_bytes: Maximum directory size in bytes (default 10MB)
+        """
+        if not os.path.exists(history_dir):
+            return
+
+        try:
+            total_size = 0
+            files_with_mtime = []
+
+            # Calculate total size and collect file metadata
+            for f in os.listdir(history_dir):
+                filepath = os.path.join(history_dir, f)
+                if f.startswith('smart_history_') and f.endswith('.json'):
+                    try:
+                        stat = os.stat(filepath)
+                        total_size += stat.st_size
+                        files_with_mtime.append((filepath, stat.st_mtime, stat.st_size))
+                    except (IOError, OSError) as e:
+                        logger.debug(f"Could not stat file {filepath}: {e}")
+
+            # If over limit, delete oldest files first
+            if total_size > max_bytes:
+                # Sort by modification time (oldest first)
+                files_with_mtime.sort(key=lambda x: x[1])
+
+                logger.info(f"History directory size ({total_size} bytes) exceeds limit ({max_bytes} bytes), cleaning up...")
+
+                for filepath, mtime, file_size in files_with_mtime:
+                    if total_size <= max_bytes:
+                        break
+                    try:
+                        os.remove(filepath)
+                        total_size -= file_size
+                        logger.info(f"Removed old history file {os.path.basename(filepath)} (saved {file_size} bytes)")
+                    except (IOError, OSError) as e:
+                        logger.warning(f"Could not remove history file {filepath}: {e}")
+
+        except Exception as e:
+            logger.error(f"Error enforcing storage limit: {e}")
+
     # =============================================================================
     # MAIN EXECUTION METHODS
     # =============================================================================
@@ -599,65 +644,86 @@ class SystemHealthMonitor:
     def _analyze_smart_trends(self, device: str, current_attributes: dict) -> List[str]:
         """Analyze SMART attribute trends to predict failures."""
         issues = []
-        
+
         # Create safe filename from device path
         device_safe = device.replace('/', '_').replace('-', '_')
         historical_file = os.path.join(self.CONFIG['HISTORY_DIR'], f"smart_history_{device_safe}.json")
-        
+
         try:
-            # Load historical data
-            if os.path.exists(historical_file):
-                with open(historical_file, 'r') as f:
-                    history = json.load(f)
-            else:
-                history = []
-
-            # Add current reading
-            current_reading = {
-                'timestamp': datetime.datetime.now().isoformat(),
-                'attributes': current_attributes
-            }
-            history.append(current_reading)
-
-            # Keep only recent data
-            cutoff_date = datetime.datetime.now() - datetime.timedelta(days=self.CONFIG['HISTORY_RETENTION_DAYS'])
-            history = [h for h in history if datetime.datetime.fromisoformat(h['timestamp']) > cutoff_date]
-
-            # Analyze trends for critical attributes
-            if len(history) >= 3:  # Need at least 3 data points
-                critical_attrs = ['Reallocated_Sector_Ct', 'Current_Pending_Sector', 'Reported_Uncorrect',
-                                  'Offline_Uncorrectable', 'Program_Fail_Count', 'Erase_Fail_Count']
-
-                for attr in critical_attrs:
-                    if attr in current_attributes:
-                        # Get last week's values
-                        recent_history = history[-7:] if len(history) >= 7 else history
-                        values = [h['attributes'].get(attr, 0) for h in recent_history]
-
-                        if len(values) >= 3:
-                            # Check for rapid increase
-                            recent_increase = values[-1] - values[0]
-                            if recent_increase > 0:
-                                rate = recent_increase / len(values)
-
-                                # Different thresholds for different attributes
-                                if attr in ['Reallocated_Sector_Ct', 'Current_Pending_Sector']:
-                                    if rate > 0.5:  # More than 0.5 sectors per check
-                                        issues.append(f"TREND ALERT: Rapid increase in {attr}: +{recent_increase} in {len(values)} checks")
-                                elif attr in ['Reported_Uncorrect', 'Offline_Uncorrectable']:
-                                    if rate > 0.2:  # Any consistent increase is concerning
-                                        issues.append(f"TREND ALERT: Increasing {attr}: +{recent_increase} in {len(values)} checks")
-                                else:  # Program/Erase fail counts
-                                    if rate > 1:  # More than 1 error per check
-                                        issues.append(f"TREND ALERT: Rapid increase in {attr}: +{recent_increase} in {len(values)} checks")
-
-            # Save updated history
-            with open(historical_file, 'w') as f:
-                json.dump(history, f, indent=2)
-
+            # Enforce storage limit before writing
+            self._enforce_storage_limit(self.CONFIG['HISTORY_DIR'])
+
+            # Load historical data with file locking
+            history = []
+            file_mode = 'r+' if os.path.exists(historical_file) else 'w+'
+
+            with open(historical_file, file_mode) as f:
+                # Acquire exclusive lock
+                fcntl.flock(f.fileno(), fcntl.LOCK_EX)
+                try:
+                    # Read existing data if file is not empty
+                    if os.path.getsize(historical_file) > 0:
+                        f.seek(0)
+                        try:
+                            history = json.load(f)
+                        except json.JSONDecodeError as e:
+                            logger.warning(f"Corrupted history file {historical_file}, starting fresh: {e}")
+                            history = []
+
+                    # Add current reading
+                    current_reading = {
+                        'timestamp': datetime.datetime.now().isoformat(),
+                        'attributes': current_attributes
+                    }
+                    history.append(current_reading)
+
+                    # Keep only recent data (30 days default)
+                    cutoff_date = datetime.datetime.now() - datetime.timedelta(days=self.CONFIG['HISTORY_RETENTION_DAYS'])
+                    history = [h for h in history if datetime.datetime.fromisoformat(h['timestamp']) > cutoff_date]
+
+                    # Analyze trends for critical attributes
+                    if len(history) >= 3:  # Need at least 3 data points for trend analysis
+                        critical_attrs = ['Reallocated_Sector_Ct', 'Current_Pending_Sector', 'Reported_Uncorrect',
+                                          'Offline_Uncorrectable', 'Program_Fail_Count', 'Erase_Fail_Count']
+
+                        for attr in critical_attrs:
+                            if attr in current_attributes:
+                                # Get last week's values
+                                recent_history = history[-7:] if len(history) >= 7 else history
+                                values = [h['attributes'].get(attr, 0) for h in recent_history]
+
+                                if len(values) >= 3:
+                                    # Check for rapid increase
+                                    recent_increase = values[-1] - values[0]
+                                    if recent_increase > 0:
+                                        rate = recent_increase / len(values)
+
+                                        # Different thresholds for different attributes
+                                        if attr in ['Reallocated_Sector_Ct', 'Current_Pending_Sector']:
+                                            if rate > 0.5:  # More than 0.5 sectors per check
+                                                issues.append(f"TREND ALERT: Rapid increase in {attr}: +{recent_increase} in {len(values)} checks")
+                                        elif attr in ['Reported_Uncorrect', 'Offline_Uncorrectable']:
+                                            if rate > 0.2:  # Any consistent increase is concerning
+                                                issues.append(f"TREND ALERT: Increasing {attr}: +{recent_increase} in {len(values)} checks")
+                                        else:  # Program/Erase fail counts
+                                            if rate > 1:  # More than 1 error per check
+                                                issues.append(f"TREND ALERT: Rapid increase in {attr}: +{recent_increase} in {len(values)} checks")
+
+                    # Write updated history atomically
+                    f.seek(0)
+                    f.truncate()
+                    json.dump(history, f, indent=2)
+                    f.flush()
+
+                finally:
+                    # Release lock
+                    fcntl.flock(f.fileno(), fcntl.LOCK_UN)
+
+        except (IOError, OSError) as e:
+            logger.debug(f"I/O error analyzing trends for {device}: {e}")
         except Exception as e:
-            logger.debug(f"Error analyzing trends for {device}: {e}")
-        
+            logger.error(f"Unexpected error analyzing trends for {device}: {e}")
+
         return issues
 
     def _check_thermal_health(self, device: str, temperature: int, drive_type: str = 'HDD') -> List[str]:
@@ -1198,10 +1264,14 @@ class SystemHealthMonitor:
 
             drive_size = ""
             if "Drive" in issue and "/dev/" in issue:
-                device = re.search(r'/dev/[a-zA-Z0-9]+', issue).group(0)
-                drive_details = self._get_drive_details(device)
-                if drive_details['capacity']:
-                    drive_size = f"[{drive_details['capacity']}] "
+                device_match = re.search(r'/dev/[a-zA-Z0-9]+', issue)
+                if device_match:
+                    device = device_match.group(0)
+                    drive_details = self._get_drive_details(device)
+                    if drive_details['capacity']:
+                        drive_size = f"[{drive_details['capacity']}] "
+                else:
+                    logger.warning(f"Could not extract device from issue: {issue}")
 
             # Determine if this is a hardware or software issue
             issue_category = 'SOFTWARE' if 'LXC' in issue else 'HARDWARE'
@@ -1240,10 +1310,15 @@ class SystemHealthMonitor:
                 headers = {
                     'Content-Type': 'application/json',
                     'Authorization': f'Bearer {self.CONFIG["TICKET_API_KEY"]}'
-                }
+                },
+                timeout=10  # 10 second timeout for API calls
             )
-            
-            response_data = response.json()
+
+            try:
+                response_data = response.json()
+            except json.JSONDecodeError as e:
+                logger.error(f"Invalid JSON response from ticket API: {e}")
+                continue
 
             if response_data.get('success'):
                 logger.info(f"Ticket created successfully: {ticket_title}")
@@ -1742,6 +1817,14 @@ class SystemHealthMonitor:
             smart_health['issues'].append("Unable to read device information")
             return smart_health
 
+        # Skip Ridata drives entirely - unreliable and being replaced
+        manufacturer = self._detect_manufacturer(drive_details.get('model', ''))
+        if manufacturer == 'Ridata':
+            smart_health['status'] = 'SKIPPED'
+            smart_health['issues'].append("Ridata drive - monitoring disabled (unreliable hardware)")
+            logger.debug(f"Skipping SMART monitoring for Ridata drive {device}")
+            return smart_health
+
         logger.debug(f"Drive details for {device}: {drive_details}")
 
         manufacturer_profile = self._get_manufacturer_profile(
@@ -2276,14 +2359,15 @@ class SystemHealthMonitor:
     def _read_ecc_count(self, filepath: str) -> int:
         """
         Read ECC error count from a file.
-        
+
        :param filepath: Path to the ECC count file
        :return: Number of ECC errors
        """
        try:
            with open(filepath, 'r') as f:
                return int(f.read().strip())
-        except:
+        except (IOError, OSError, ValueError) as e:
+            logger.debug(f"Could not read ECC count from {filepath}: {e}")
            return 0
 
     def _check_cpu_usage(self) -> Dict[str, Any]:
@@ -2322,16 +2406,17 @@ class SystemHealthMonitor:
         # Check management network connectivity
         mgmt_result = subprocess.run(
             [
-                "ping", 
+                "ping",
                 "-c", str(self.CONFIG['NETWORKS']['PING_COUNT']),
                 "-W", str(self.CONFIG['NETWORKS']['PING_TIMEOUT']),
                 self.CONFIG['NETWORKS']['MANAGEMENT']
             ],
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE,
-            text=True
+            text=True,
+            timeout=30  # 30 second timeout for subprocess
         )
-        
+
         if mgmt_result.returncode != 0:
             network_health['management_network']['status'] = 'CRITICAL'
             network_health['management_network']['issues'].append(
@@ -2348,7 +2433,8 @@
             ],
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE,
-            text=True
+            text=True,
+            timeout=30  # 30 second timeout for subprocess
         )
 
         if ceph_result.returncode != 0:
@@ -2382,28 +2468,30 @@
             ['pct', 'list'],
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE,
-            text=True
+            text=True,
+            timeout=30  # 30 second timeout
         )
         logger.debug(f"pct list output:\n{result.stdout}")
-        
+
         for line in result.stdout.split('\n')[1:]:
             if not line.strip():
                 continue
-            
+
             parts = line.split()
             if len(parts) < 2:
                 logger.debug(f"Skipping invalid line: {line}")
                 continue
-            
+
             vmid, status = parts[0], parts[1]
-            
+
             if status.lower() == 'running':
                 logger.debug(f"Checking container {vmid} disk usage")
                 disk_info = subprocess.run(
                     ['pct', 'df', vmid],
                     stdout=subprocess.PIPE,
                     stderr=subprocess.PIPE,
-                    text=True
+                    text=True,
+                    timeout=30  # 30 second timeout per container
                 )
                 container_info = {