diff --git a/hwmon.service b/hwmon.service
index 6ecd647..e7a966e 100644
--- a/hwmon.service
+++ b/hwmon.service
@@ -7,6 +7,11 @@ Type=oneshot
 ExecStart=/usr/bin/env python3 -c "import urllib.request; exec(urllib.request.urlopen('http://10.10.10.63:3000/LotusGuild/hwmonDaemon/raw/branch/main/hwmonDaemon.py').read().decode('utf-8'))"
 User=root
 Group=root
+Restart=on-failure
+RestartSec=60
+TimeoutStartSec=300
+StandardOutput=journal
+StandardError=journal
 
 [Install]
 WantedBy=multi-user.target
\ No newline at end of file
diff --git a/hwmon.timer b/hwmon.timer
index be07167..1b8b054 100644
--- a/hwmon.timer
+++ b/hwmon.timer
@@ -1,9 +1,10 @@
 [Unit]
-Description=Run System Health Monitoring Daemon Daily
+Description=Run System Health Monitoring Daemon Hourly
 
 [Timer]
 OnCalendar=hourly
-RandomizedDelaySec=300
+RandomizedDelaySec=60
+Persistent=true
 
 [Install]
 WantedBy=timers.target
\ No newline at end of file
diff --git a/hwmonDaemon.py b/hwmonDaemon.py
index 5c0b105..d480fda 100644
--- a/hwmonDaemon.py
+++ b/hwmonDaemon.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re, glob, datetime
+import os, sys, json, requests, psutil, socket, subprocess, logging, argparse, urllib.request, re, glob, datetime, fcntl
 from typing import Dict, Any, List
 
 # =============================================================================
@@ -526,6 +526,51 @@ class SystemHealthMonitor:
         # Ensure history directory exists
         os.makedirs(self.CONFIG['HISTORY_DIR'], exist_ok=True)
 
+    def _enforce_storage_limit(self, history_dir: str, max_bytes: int = 10485760):
+        """
+        Delete oldest history files if directory exceeds size limit (default 10MB).
+
+        :param history_dir: Directory containing history files
+        :param max_bytes: Maximum directory size in bytes (default 10MB)
+        """
+        if not os.path.exists(history_dir):
+            return
+
+        try:
+            total_size = 0
+            files_with_mtime = []
+
+            # Calculate total size and collect file metadata
+            for f in os.listdir(history_dir):
+                filepath = os.path.join(history_dir, f)
+                if f.startswith('smart_history_') and f.endswith('.json'):
+                    try:
+                        stat = os.stat(filepath)
+                        total_size += stat.st_size
+                        files_with_mtime.append((filepath, stat.st_mtime, stat.st_size))
+                    except (IOError, OSError) as e:
+                        logger.debug(f"Could not stat file {filepath}: {e}")
+
+            # If over limit, delete oldest files first
+            if total_size > max_bytes:
+                # Sort by modification time (oldest first)
+                files_with_mtime.sort(key=lambda x: x[1])
+
+                logger.info(f"History directory size ({total_size} bytes) exceeds limit ({max_bytes} bytes), cleaning up...")
+
+                for filepath, mtime, file_size in files_with_mtime:
+                    if total_size <= max_bytes:
+                        break
+                    try:
+                        os.remove(filepath)
+                        total_size -= file_size
+                        logger.info(f"Removed old history file {os.path.basename(filepath)} (saved {file_size} bytes)")
+                    except (IOError, OSError) as e:
+                        logger.warning(f"Could not remove history file {filepath}: {e}")
+
+        except Exception as e:
+            logger.error(f"Error enforcing storage limit: {e}")
+
     # =============================================================================
     # MAIN EXECUTION METHODS
     # =============================================================================
@@ -599,65 +644,86 @@ class SystemHealthMonitor:
     def _analyze_smart_trends(self, device: str, current_attributes: dict) -> List[str]:
         """Analyze SMART attribute trends to predict failures."""
         issues = []
-        
+
         # Create safe filename from device path
         device_safe = device.replace('/', '_').replace('-', '_')
         historical_file = os.path.join(self.CONFIG['HISTORY_DIR'], f"smart_history_{device_safe}.json")
-        
+
         try:
-            # Load historical data
-            if os.path.exists(historical_file):
-                with open(historical_file, 'r') as f:
-                    history = json.load(f)
-            else:
-                history = []
-
-            # Add current reading
-            current_reading = {
-                'timestamp': datetime.datetime.now().isoformat(),
-                'attributes': current_attributes
-            }
-            history.append(current_reading)
-
-            # Keep only recent data
-            cutoff_date = datetime.datetime.now() - datetime.timedelta(days=self.CONFIG['HISTORY_RETENTION_DAYS'])
-            history = [h for h in history if datetime.datetime.fromisoformat(h['timestamp']) > cutoff_date]
-
-            # Analyze trends for critical attributes
-            if len(history) >= 3:  # Need at least 3 data points
-                critical_attrs = ['Reallocated_Sector_Ct', 'Current_Pending_Sector', 'Reported_Uncorrect',
-                                  'Offline_Uncorrectable', 'Program_Fail_Count', 'Erase_Fail_Count']
-
-                for attr in critical_attrs:
-                    if attr in current_attributes:
-                        # Get last week's values
-                        recent_history = history[-7:] if len(history) >= 7 else history
-                        values = [h['attributes'].get(attr, 0) for h in recent_history]
-
-                        if len(values) >= 3:
-                            # Check for rapid increase
-                            recent_increase = values[-1] - values[0]
-                            if recent_increase > 0:
-                                rate = recent_increase / len(values)
-
-                                # Different thresholds for different attributes
-                                if attr in ['Reallocated_Sector_Ct', 'Current_Pending_Sector']:
-                                    if rate > 0.5:  # More than 0.5 sectors per check
-                                        issues.append(f"TREND ALERT: Rapid increase in {attr}: +{recent_increase} in {len(values)} checks")
-                                elif attr in ['Reported_Uncorrect', 'Offline_Uncorrectable']:
-                                    if rate > 0.2:  # Any consistent increase is concerning
-                                        issues.append(f"TREND ALERT: Increasing {attr}: +{recent_increase} in {len(values)} checks")
-                                else:  # Program/Erase fail counts
-                                    if rate > 1:  # More than 1 error per check
-                                        issues.append(f"TREND ALERT: Rapid increase in {attr}: +{recent_increase} in {len(values)} checks")
-
-            # Save updated history
-            with open(historical_file, 'w') as f:
-                json.dump(history, f, indent=2)
-
+            # Enforce storage limit before writing
+            self._enforce_storage_limit(self.CONFIG['HISTORY_DIR'])
+
+            # Load historical data with file locking
+            history = []
+            file_mode = 'r+' if os.path.exists(historical_file) else 'w+'
+
+            with open(historical_file, file_mode) as f:
+                # Acquire exclusive lock
+                fcntl.flock(f.fileno(), fcntl.LOCK_EX)
+                try:
+                    # Read existing data if file is not empty
+                    if os.path.getsize(historical_file) > 0:
+                        f.seek(0)
+                        try:
+                            history = json.load(f)
+                        except json.JSONDecodeError as e:
+                            logger.warning(f"Corrupted history file {historical_file}, starting fresh: {e}")
+                            history = []
+
+                    # Add current reading
+                    current_reading = {
+                        'timestamp': datetime.datetime.now().isoformat(),
+                        'attributes': current_attributes
+                    }
+                    history.append(current_reading)
+
+                    # Keep only recent data (30 days default)
+                    cutoff_date = datetime.datetime.now() - datetime.timedelta(days=self.CONFIG['HISTORY_RETENTION_DAYS'])
+                    history = [h for h in history if datetime.datetime.fromisoformat(h['timestamp']) > cutoff_date]
+
+                    # Analyze trends for critical attributes
+                    if len(history) >= 3:  # Need at least 3 data points for trend analysis
+                        critical_attrs = ['Reallocated_Sector_Ct', 'Current_Pending_Sector', 'Reported_Uncorrect',
+                                          'Offline_Uncorrectable', 'Program_Fail_Count', 'Erase_Fail_Count']
+
+                        for attr in critical_attrs:
+                            if attr in current_attributes:
+                                # Get last week's values
+                                recent_history = history[-7:] if len(history) >= 7 else history
+                                values = [h['attributes'].get(attr, 0) for h in recent_history]
+
+                                if len(values) >= 3:
+                                    # Check for rapid increase
+                                    recent_increase = values[-1] - values[0]
+                                    if recent_increase > 0:
+                                        rate = recent_increase / len(values)
+
+                                        # Different thresholds for different attributes
+                                        if attr in ['Reallocated_Sector_Ct', 'Current_Pending_Sector']:
+                                            if rate > 0.5:  # More than 0.5 sectors per check
+                                                issues.append(f"TREND ALERT: Rapid increase in {attr}: +{recent_increase} in {len(values)} checks")
+                                        elif attr in ['Reported_Uncorrect', 'Offline_Uncorrectable']:
+                                            if rate > 0.2:  # Any consistent increase is concerning
+                                                issues.append(f"TREND ALERT: Increasing {attr}: +{recent_increase} in {len(values)} checks")
+                                        else:  # Program/Erase fail counts
+                                            if rate > 1:  # More than 1 error per check
+                                                issues.append(f"TREND ALERT: Rapid increase in {attr}: +{recent_increase} in {len(values)} checks")
+
+                    # Write updated history atomically
+                    f.seek(0)
+                    f.truncate()
+                    json.dump(history, f, indent=2)
+                    f.flush()
+
+                finally:
+                    # Release lock
+                    fcntl.flock(f.fileno(), fcntl.LOCK_UN)
+
+        except (IOError, OSError) as e:
+            logger.debug(f"I/O error analyzing trends for {device}: {e}")
         except Exception as e:
-            logger.debug(f"Error analyzing trends for {device}: {e}")
-        
+            logger.error(f"Unexpected error analyzing trends for {device}: {e}")
+
         return issues
 
     def _check_thermal_health(self, device: str, temperature: int, drive_type: str = 'HDD') -> List[str]:
@@ -1198,10 +1264,14 @@ class SystemHealthMonitor:
 
             drive_size = ""
             if "Drive" in issue and "/dev/" in issue:
-                device = re.search(r'/dev/[a-zA-Z0-9]+', issue).group(0)
-                drive_details = self._get_drive_details(device)
-                if drive_details['capacity']:
-                    drive_size = f"[{drive_details['capacity']}] "
+                device_match = re.search(r'/dev/[a-zA-Z0-9]+', issue)
+                if device_match:
+                    device = device_match.group(0)
+                    drive_details = self._get_drive_details(device)
+                    if drive_details['capacity']:
+                        drive_size = f"[{drive_details['capacity']}] "
+                else:
+                    logger.warning(f"Could not extract device from issue: {issue}")
 
             # Determine if this is a hardware or software issue
             issue_category = 'SOFTWARE' if 'LXC' in issue else 'HARDWARE'
@@ -1240,10 +1310,15 @@ class SystemHealthMonitor:
                 headers = {
                     'Content-Type': 'application/json',
                     'Authorization': f'Bearer {self.CONFIG["TICKET_API_KEY"]}'
-                }
+                },
+                timeout=10  # 10 second timeout for API calls
             )
-            
-            response_data = response.json()
+
+            try:
+                response_data = response.json()
+            except json.JSONDecodeError as e:
+                logger.error(f"Invalid JSON response from ticket API: {e}")
+                continue
 
             if response_data.get('success'):
                 logger.info(f"Ticket created successfully: {ticket_title}")
@@ -1742,6 +1817,14 @@ class SystemHealthMonitor:
             smart_health['issues'].append("Unable to read device information")
             return smart_health
 
+        # Skip Ridata drives entirely - unreliable and being replaced
+        manufacturer = self._detect_manufacturer(drive_details.get('model', ''))
+        if manufacturer == 'Ridata':
+            smart_health['status'] = 'SKIPPED'
+            smart_health['issues'].append("Ridata drive - monitoring disabled (unreliable hardware)")
+            logger.debug(f"Skipping SMART monitoring for Ridata drive {device}")
+            return smart_health
+
         logger.debug(f"Drive details for {device}: {drive_details}")
 
         manufacturer_profile = self._get_manufacturer_profile(
@@ -2276,14 +2359,15 @@ class SystemHealthMonitor:
     def _read_ecc_count(self, filepath: str) -> int:
         """
         Read ECC error count from a file.
-        
+
        :param filepath: Path to the ECC count file
        :return: Number of ECC errors
        """
        try:
            with open(filepath, 'r') as f:
                return int(f.read().strip())
-        except:
+        except (IOError, OSError, ValueError) as e:
+            logger.debug(f"Could not read ECC count from {filepath}: {e}")
            return 0
 
     def _check_cpu_usage(self) -> Dict[str, Any]:
@@ -2322,16 +2406,17 @@ class SystemHealthMonitor:
         # Check management network connectivity
         mgmt_result = subprocess.run(
             [
-                "ping", 
+                "ping",
                 "-c", str(self.CONFIG['NETWORKS']['PING_COUNT']),
                 "-W", str(self.CONFIG['NETWORKS']['PING_TIMEOUT']),
                 self.CONFIG['NETWORKS']['MANAGEMENT']
             ],
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE,
-            text=True
+            text=True,
+            timeout=30  # 30 second timeout for subprocess
         )
-        
+
         if mgmt_result.returncode != 0:
             network_health['management_network']['status'] = 'CRITICAL'
             network_health['management_network']['issues'].append(
@@ -2348,7 +2433,8 @@
             ],
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE,
-            text=True
+            text=True,
+            timeout=30  # 30 second timeout for subprocess
         )
 
         if ceph_result.returncode != 0:
@@ -2382,28 +2468,30 @@
             ['pct', 'list'],
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE,
-            text=True
+            text=True,
+            timeout=30  # 30 second timeout
         )
         logger.debug(f"pct list output:\n{result.stdout}")
-        
+
         for line in result.stdout.split('\n')[1:]:
             if not line.strip():
                 continue
-            
+
             parts = line.split()
             if len(parts) < 2:
                 logger.debug(f"Skipping invalid line: {line}")
                 continue
-            
+
             vmid, status = parts[0], parts[1]
-            
+
             if status.lower() == 'running':
                 logger.debug(f"Checking container {vmid} disk usage")
                 disk_info = subprocess.run(
                     ['pct', 'df', vmid],
                     stdout=subprocess.PIPE,
                     stderr=subprocess.PIPE,
-                    text=True
+                    text=True,
+                    timeout=30  # 30 second timeout per container
                 )
                 container_info = {