Better identification of RiData drives
hwmonDaemon.py (490 lines changed)
@@ -109,8 +109,9 @@ class SystemHealthMonitor:
     }
     MANUFACTURER_SMART_PROFILES = {
         'Ridata': {
-            'aliases': ['Ridata', 'Ritek', 'RIDATA', 'RITEK'],
-            'wear_leveling_behavior': 'countup',  # Based on your data, it counts up
+            'aliases': ['Ridata', 'Ritek', 'RIDATA', 'RITEK', 'SSD 512GB'],  # Add the generic model
+            'firmware_patterns': ['HT3618B7', 'HT36'],  # Add firmware pattern matching
+            'wear_leveling_behavior': 'countup',
             'wear_leveling_baseline': 0,
             'wear_leveling_thresholds': {
                 'warning': 500000,  # Much higher threshold for countup behavior
@@ -1105,12 +1106,20 @@ class SystemHealthMonitor:
                 logger.debug(f"Could not parse SMART value: {raw_value}")
                 return 0

-    def _get_manufacturer_profile(self, model: str, manufacturer: str = None) -> Dict[str, Any]:
+    def _get_manufacturer_profile(self, model: str, manufacturer: str = None, firmware: str = None) -> Dict[str, Any]:
         """
-        Get manufacturer-specific SMART profile based on drive model/manufacturer.
+        Get manufacturer-specific SMART profile based on drive model/manufacturer/firmware.
         """
         # Check each manufacturer profile
         for mfg, profile in self.MANUFACTURER_SMART_PROFILES.items():
+            # Check firmware patterns first (most specific for OEM drives)
+            if firmware and 'firmware_patterns' in profile:
+                for pattern in profile['firmware_patterns']:
+                    if pattern in firmware:
+                        logger.debug(f"Matched manufacturer profile: {mfg} for firmware: {firmware}")
+                        return profile
+
+            # Check model/manufacturer aliases
             for alias in profile['aliases']:
                 if alias.lower() in model.lower() or (manufacturer and alias.lower() in manufacturer.lower()):
                     logger.debug(f"Matched manufacturer profile: {mfg} for model: {model}")
@@ -1124,7 +1133,7 @@ class SystemHealthMonitor:
         """
        Determine if a drive is considered "new" based on power-on hours.
         """
-        return power_on_hours < 168  # Less than 1 week of runtime
+        return power_on_hours < 720  # Less than 30 days of runtime

     def _check_smart_health(self, device: str) -> Dict[str, Any]:
         """
@@ -1302,473 +1311,4 @@ class SystemHealthMonitor:
             logger.debug(f"=== SMART Health Check for {device} ===")
             logger.debug(f"Manufacturer profile: {manufacturer_profile.get('aliases', ['Unknown'])[0] if manufacturer_profile else 'None'}")
             logger.debug("Raw SMART attributes:")
             for attr, value in smart_health['attributes'].items():
-                logger.debug(f"{attr}: {value}")
-            logger.debug(f"Temperature: {smart_health['temp']}°C")
-            logger.debug(f"Is new drive: {is_new_drive}")
-            logger.debug(f"Detected Issues: {smart_health['issues']}")
-            logger.debug("=== End SMART Check ===\n")
-
-            # Special handling for NVMe drives
-            if 'nvme' in device:
-                try:
-                    nvme_result = subprocess.run(
-                        ['nvme', 'smart-log', device],
-                        stdout=subprocess.PIPE,
-                        stderr=subprocess.PIPE,
-                        text=True,
-                        timeout=10
-                    )
-                    logger.debug(f"NVMe smart-log raw output for {device}:")
-                    logger.debug(nvme_result.stdout)
-
-                    # Initialize the temperature attribute
-                    if smart_health['temp'] is None:
-                        smart_health['attributes']['Temperature_Celsius'] = None
-
-                    for line in nvme_result.stdout.split('\n'):
-                        # Fix the NoneType error by checking if line exists and has content
-                        if line and line.strip() and 'temperature' in line.lower():
-                            try:
-                                temp_str = line.split(':')[1].strip() if ':' in line else line.strip()
-                                logger.debug(f"Raw temperature string: {temp_str}")
-
-                                # Extract first temperature value more safely
-                                digits = ''.join(c for c in temp_str if c.isdigit())
-                                if len(digits) >= 2:
-                                    temp_value = int(digits[:2])
-                                    logger.debug(f"Parsed temperature value: {temp_value}")
-
-                                    # Set both temperature fields
-                                    smart_health['temp'] = temp_value
-                                    smart_health['attributes']['Temperature_Celsius'] = temp_value
-
-                                    logger.debug(f"Final temperature recorded: {smart_health['temp']}")
-                                    break
-                            except (ValueError, IndexError, AttributeError) as e:
-                                logger.debug(f"Error parsing NVMe temperature from line '{line}': {e}")
-                                continue
-                except subprocess.TimeoutExpired:
-                    logger.debug(f"NVMe smart-log for {device} timed out")
-                except Exception as e:
-                    logger.debug(f"Error getting NVMe smart data for {device}: {e}")
-
-        except subprocess.TimeoutExpired:
-            smart_health['status'] = 'ERROR'
-            smart_health['issues'].append("SMART check timed out")
-        except Exception as e:
-            smart_health['status'] = 'ERROR'
-            smart_health['severity'] = 'UNKNOWN'
-            smart_health['issues'].append(f"Error checking SMART: {str(e)}")
-            logger.debug(f"Exception in _check_smart_health for {device}: {e}")
-            import traceback
-            logger.debug(traceback.format_exc())
-
-        return smart_health
-
-    def _check_drives_health(self) -> Dict[str, Any]:
-        drives_health = {'overall_status': 'NORMAL', 'drives': []}
-
-        try:
-            # Get only valid physical disks
-            physical_disks = self._get_all_disks()
-            logger.debug(f"Checking physical disks: {physical_disks}")
-
-            if not physical_disks:
-                logger.warning("No valid physical disks found for monitoring")
-                drives_health['overall_status'] = 'WARNING'
-                return drives_health
-
-            # Get ALL partition information including device mapper
-            partitions = psutil.disk_partitions(all=True)
-
-            # Create mapping of base devices to their partitions
-            device_partitions = {}
-            for part in partitions:
-                # Extract base device (e.g., /dev/sda from /dev/sda1)
-                base_device = re.match(r'(/dev/[a-z]+)', part.device)
-                if base_device:
-                    base_dev = base_device.group(1)
-                    if base_dev not in device_partitions:
-                        device_partitions[base_dev] = []
-                    device_partitions[base_dev].append(part)
-
-            overall_status = 'NORMAL'
-            for disk in physical_disks:
-                drive_report = {
-                    'device': disk,
-                    'partitions': [],
-                    'smart_status': 'UNKNOWN',
-                    'usage_percent': 0
-                }
-
-                # Add partition information if available
-                if disk in device_partitions:
-                    total_used = 0
-                    total_space = 0
-                    for partition in device_partitions[disk]:
-                        try:
-                            usage = psutil.disk_usage(partition.mountpoint)
-                            total_used += usage.used
-                            total_space += usage.total
-                            part_info = {
-                                'device': partition.device,
-                                'mountpoint': partition.mountpoint,
-                                'fstype': partition.fstype,
-                                'total_space': self._convert_bytes(usage.total),
-                                'used_space': self._convert_bytes(usage.used),
-                                'free_space': self._convert_bytes(usage.free),
-                                'usage_percent': usage.percent
-                            }
-                            drive_report['partitions'].append(part_info)
-                        except Exception as e:
-                            logger.debug(f"Error getting partition usage for {partition.device}: {e}")
-
-                    # Calculate overall drive usage percentage
-                    if total_space > 0:
-                        drive_report['usage_percent'] = (total_used / total_space) * 100
-
-                # Check SMART health
-                smart_health = self._check_smart_health(disk)
-                drive_report.update({
-                    'smart_status': smart_health['status'],
-                    'smart_issues': smart_health['issues'],
-                    'temperature': smart_health['temp'],
-                    'smart_attributes': smart_health['attributes']
-                })
-
-                # Only report issues for drives that should be monitored
-                if smart_health['status'] == 'UNHEALTHY':
-                    overall_status = 'CRITICAL'
-                elif smart_health['status'] == 'ERROR':
-                    # Don't escalate overall status for ERROR drives (might be virtual)
-                    logger.debug(f"Drive {disk} returned ERROR status, skipping from issue detection")
-                elif smart_health['issues'] and smart_health['status'] not in ['ERROR', 'NOT_SUPPORTED']:
-                    if overall_status != 'CRITICAL':
-                        overall_status = 'WARNING'
-
-                drives_health['drives'].append(drive_report)
-
-            drives_health['overall_status'] = overall_status
-
-        except Exception as e:
-            logger.error(f"Error checking drives health: {str(e)}")
-
-        return drives_health
-
-    @staticmethod
-    def _convert_bytes(bytes_value: int, suffix: str = 'B') -> str:
-        """
-        Convert bytes to a human-readable format.
-
-        :param bytes_value: Number of bytes to convert.
-        :param suffix: Suffix to append (default is 'B' for bytes).
-        :return: Formatted string with the size in human-readable form.
-        """
-        for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']:
-            if abs(bytes_value) < 1024.0:
-                return f"{bytes_value:.1f}{unit}{suffix}"
-            bytes_value /= 1024.0
-        return f"{bytes_value:.1f}Y{suffix}"
-
-    def _convert_size_to_bytes(self, size_str: str) -> float:
-        """Convert size string with units to bytes"""
-        units = {'B': 1, 'K': 1024, 'M': 1024**2, 'G': 1024**3, 'T': 1024**4}
-        size = float(size_str[:-1])
-        unit = size_str[-1].upper()
-        return size * units[unit]
-
-    def _check_memory_usage(self) -> Dict[str, Any]:
-        """
-        Check for ECC memory errors if ECC memory is present.
-        """
-        memory_health = {
-            'has_ecc': False,
-            'ecc_errors': [],
-            'status': 'OK',
-            'total_memory': self._convert_bytes(psutil.virtual_memory().total),
-            'used_memory': self._convert_bytes(psutil.virtual_memory().used),
-            'memory_percent': psutil.virtual_memory().percent
-        }
-
-        try:
-            # First check using dmidecode
-            result = subprocess.run(
-                ['dmidecode', '--type', 'memory'],
-                stdout=subprocess.PIPE,
-                stderr=subprocess.PIPE,
-                text=True
-            )
-            if 'Error Correction Type: Multi-bit ECC' in result.stdout:
-                memory_health['has_ecc'] = True
-
-            # If dmidecode didn't find ECC, try the edac method as backup
-            if not memory_health['has_ecc']:
-                edac_path = '/sys/devices/system/edac/mc'
-                if os.path.exists(edac_path) and os.listdir(edac_path):
-                    for mc_dir in glob.glob('/sys/devices/system/edac/mc/mc[0-9]*'):
-                        if os.path.exists(f"{mc_dir}/csrow0"):
-                            memory_health['has_ecc'] = True
-                            break
-
-            # If ECC is present, check for errors
-            if memory_health['has_ecc']:
-                for mc_dir in glob.glob('/sys/devices/system/edac/mc/mc[0-9]*'):
-                    if os.path.exists(f"{mc_dir}/csrow0"):
-                        ue_count = self._read_ecc_count(f"{mc_dir}/csrow0/ue_count")
-                        if ue_count > 0:
-                            memory_health['status'] = 'CRITICAL'
-                            memory_health['ecc_errors'].append(
-                                f"Uncorrectable ECC errors detected in {os.path.basename(mc_dir)}: {ue_count}"
-                            )
-
-                        ce_count = self._read_ecc_count(f"{mc_dir}/csrow0/ce_count")
-                        if ce_count > 0:
-                            if memory_health['status'] != 'CRITICAL':
-                                memory_health['status'] = 'WARNING'
-                            memory_health['ecc_errors'].append(
-                                f"Correctable ECC errors detected in {os.path.basename(mc_dir)}: {ce_count}"
-                            )
-
-        except Exception as e:
-            memory_health['status'] = 'ERROR'
-            memory_health['ecc_errors'].append(f"Error checking ECC status: {str(e)}")
-
-        return memory_health
-
-    def _read_ecc_count(self, filepath: str) -> int:
-        """
-        Read ECC error count from a file.
-
-        :param filepath: Path to the ECC count file
-        :return: Number of ECC errors
-        """
-        try:
-            with open(filepath, 'r') as f:
-                return int(f.read().strip())
-        except:
-            return 0
-
-    def _check_cpu_usage(self) -> Dict[str, Any]:
-        """
-        Check CPU usage and return health metrics.
-
-        :return: Dictionary with CPU health metrics.
-        """
-        cpu_usage_percent = psutil.cpu_percent(interval=1)
-        cpu_health = {
-            'cpu_usage_percent': cpu_usage_percent,
-            'status': 'OK' if cpu_usage_percent < self.CONFIG['THRESHOLDS']['CPU_WARNING'] else 'WARNING'
-        }
-        return cpu_health
-
-    def _check_network_status(self) -> Dict[str, Any]:
-        """
-        Check the status of network interfaces and report any issues.
-
-        :return: Dictionary containing network health metrics and any issues found.
-        """
-        network_health = {
-            'management_network': {
-                'issues': [],
-                'status': 'OK',
-                'latency': None
-            },
-            'ceph_network': {
-                'issues': [],
-                'status': 'OK',
-                'latency': None
-            }
-        }
-
-        try:
-            # Check management network connectivity
-            mgmt_result = subprocess.run(
-                [
-                    "ping",
-                    "-c", str(self.CONFIG['NETWORKS']['PING_COUNT']),
-                    "-W", str(self.CONFIG['NETWORKS']['PING_TIMEOUT']),
-                    self.CONFIG['NETWORKS']['MANAGEMENT']
-                ],
-                stdout=subprocess.PIPE,
-                stderr=subprocess.PIPE,
-                text=True
-            )
-
-            if mgmt_result.returncode != 0:
-                network_health['management_network']['status'] = 'CRITICAL'
-                network_health['management_network']['issues'].append(
-                    "Management network is unreachable"
-                )
-
-            # Check Ceph network connectivity
-            ceph_result = subprocess.run(
-                [
-                    "ping",
-                    "-c", str(self.CONFIG['NETWORKS']['PING_COUNT']),
-                    "-W", str(self.CONFIG['NETWORKS']['PING_TIMEOUT']),
-                    self.CONFIG['NETWORKS']['CEPH']
-                ],
-                stdout=subprocess.PIPE,
-                stderr=subprocess.PIPE,
-                text=True
-            )
-
-            if ceph_result.returncode != 0:
-                network_health['ceph_network']['status'] = 'CRITICAL'
-                network_health['ceph_network']['issues'].append(
-                    "Ceph network is unreachable"
-                )
-
-            return network_health
-
-        except Exception as e:
-            logger.error(f"Network health check failed: {e}")
-            return {
-                'status': 'ERROR',
-                'error': str(e)
-            }
-
-    def _check_lxc_storage(self) -> Dict[str, Any]:
-        """
-        Check storage utilization for all running LXC containers
-        """
-        logger.debug("Starting LXC storage check")
-        lxc_health = {
-            'status': 'OK',
-            'containers': [],
-            'issues': []
-        }
-
-        try:
-            result = subprocess.run(
-                ['pct', 'list'],
-                stdout=subprocess.PIPE,
-                stderr=subprocess.PIPE,
-                text=True
-            )
-            logger.debug(f"pct list output:\n{result.stdout}")
-
-            for line in result.stdout.split('\n')[1:]:
-                if not line.strip():
-                    continue
-
-                parts = line.split()
-                if len(parts) < 2:
-                    logger.debug(f"Skipping invalid line: {line}")
-                    continue
-
-                vmid, status = parts[0], parts[1]
-
-                if status.lower() == 'running':
-                    logger.debug(f"Checking container {vmid} disk usage")
-                    disk_info = subprocess.run(
-                        ['pct', 'df', vmid],
-                        stdout=subprocess.PIPE,
-                        stderr=subprocess.PIPE,
-                        text=True
-                    )
-
-                    container_info = {
-                        'vmid': vmid,
-                        'filesystems': []
-                    }
-
-                    for fs_line in disk_info.stdout.split('\n')[1:]:
-                        if not fs_line.strip() or 'MP' in fs_line:
-                            continue
-
-                        columns = line.split()
-                        logger.debug(f"Split parts: {parts}")
-                        if len(columns) >= 6:
-                            try:
-                                # Skip excluded mounts
-                                if parts[0].startswith('appPool:') or '/mnt/pve/mediaf' in parts[0]:
-                                    continue
-
-                                # Get the mountpoint (last column)
-                                if len(parts) > 5:
-                                    # The mountpoint is the last column
-                                    mountpoint = columns[-1]
-                                else:
-                                    mountpoint = "/"
-
-                                # Skip excluded mountpoints
-                                if self._is_excluded_mount(mountpoint):
-                                    logger.debug(f"Skipping excluded mount: {mountpoint}")
-                                    continue
-
-                                # Parse size values safely
-                                total_space = self._parse_size(columns[-5])
-                                used_space = self._parse_size(columns[-4])
-                                available_space = self._parse_size(columns[-3])
-
-                                # Parse percentage safely
-                                try:
-                                    usage_percent = float(columns[-2].rstrip('%'))
-                                except (ValueError, IndexError):
-                                    # Calculate percentage if parsing fails
-                                    usage_percent = (used_space / total_space * 100) if total_space > 0 else 0
-
-                                filesystem = {
-                                    'mountpoint': mountpoint,
-                                    'total_space': total_space,
-                                    'used_space': used_space,
-                                    'available': available_space,
-                                    'usage_percent': usage_percent
-                                }
-                                container_info['filesystems'].append(filesystem)
-
-                                # Check thresholds
-                                if usage_percent >= self.CONFIG['THRESHOLDS']['LXC_CRITICAL']:
-                                    lxc_health['status'] = 'CRITICAL'
-                                    issue = f"LXC {vmid} critical storage usage: {usage_percent:.1f}% on {mountpoint}"
-                                    lxc_health['issues'].append(issue)
-                                elif usage_percent >= self.CONFIG['THRESHOLDS']['LXC_WARNING']:
-                                    if lxc_health['status'] != 'CRITICAL':
-                                        lxc_health['status'] = 'WARNING'
-                                    issue = f"LXC {vmid} high storage usage: {usage_percent:.1f}% on {mountpoint}"
-                                    lxc_health['issues'].append(issue)
-
-                                logger.debug(f"Filesystem details: {filesystem}")
-                            except Exception as e:
-                                logger.debug(f"Error processing line: {str(e)}")
-                                logger.debug(f"Full exception: {repr(e)}")
-                                continue
-
-                    # Only add container info if we have filesystem data
-                    if container_info['filesystems']:
-                        lxc_health['containers'].append(container_info)
-                        logger.debug(f"Added container info for VMID {vmid}")
-
-            logger.debug("=== LXC Storage Check Summary ===")
-            logger.debug(f"Status: {lxc_health['status']}")
-            logger.debug(f"Total containers checked: {len(lxc_health['containers'])}")
-            logger.debug(f"Issues found: {len(lxc_health['issues'])}")
-            logger.debug("=== End LXC Storage Check ===")
-
-        except Exception as e:
-            logger.debug(f"Critical error during LXC storage check: {str(e)}")
-            lxc_health['status'] = 'ERROR'
-            error_msg = f"Error checking LXC storage: {str(e)}"
-            lxc_health['issues'].append(error_msg)
-
-        return lxc_health
-
-def main():
-    parser = argparse.ArgumentParser(description="System Health Monitor")
-    parser.add_argument(
-        "--dry-run",
-        action="store_true",
-        help="Enable dry-run mode (simulate ticket creation without actual API calls)."
-    )
-    args = parser.parse_args()
-
-    monitor = SystemHealthMonitor(
-        ticket_api_url=SystemHealthMonitor.CONFIG['TICKET_API_URL'],
-        dry_run=args.dry_run
-    )
-    monitor.run()
-
-if __name__ == "__main__":
-    main()
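
Below is a minimal, self-contained sketch of how the firmware-based matching introduced in this commit is expected to behave. It mirrors the names used in the diff, but as a module-level function with a trimmed profile table; the model and firmware strings in the demo calls at the bottom are hypothetical examples of what smartctl might report, not values taken from the repository.

import logging
from typing import Any, Dict, Optional

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# Trimmed copy of the Ridata profile from the diff above.
MANUFACTURER_SMART_PROFILES: Dict[str, Dict[str, Any]] = {
    'Ridata': {
        'aliases': ['Ridata', 'Ritek', 'RIDATA', 'RITEK', 'SSD 512GB'],
        'firmware_patterns': ['HT3618B7', 'HT36'],
        'wear_leveling_behavior': 'countup',
    },
}


def get_manufacturer_profile(model: str,
                             manufacturer: Optional[str] = None,
                             firmware: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """Standalone sketch of the lookup added in this commit."""
    for mfg, profile in MANUFACTURER_SMART_PROFILES.items():
        # Firmware patterns are checked first: OEM drives often report a generic
        # model string ('SSD 512GB'), so the firmware revision is the stronger signal.
        if firmware and 'firmware_patterns' in profile:
            for pattern in profile['firmware_patterns']:
                if pattern in firmware:
                    logger.debug(f"Matched manufacturer profile: {mfg} for firmware: {firmware}")
                    return profile
        # Fall back to model/manufacturer alias matching.
        for alias in profile['aliases']:
            if alias.lower() in model.lower() or (manufacturer and alias.lower() in manufacturer.lower()):
                logger.debug(f"Matched manufacturer profile: {mfg} for model: {model}")
                return profile
    return None


# Hypothetical identify data: a generically labelled drive resolves via its firmware,
# a drive with a recognisable model string still resolves via the alias list.
print(get_manufacturer_profile('SSD 512GB', firmware='HT3618B7') is not None)  # True
print(get_manufacturer_profile('Ritek R5 256GB') is not None)                  # True

Checking firmware_patterns before the alias list is what lets a drive that only reports the generic 'SSD 512GB' model string still be recognised as a Ridata/Ritek unit.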