ceph_osd_analyzer.py

#!/usr/bin/env python3
"""
Advanced Ceph OSD Replacement Candidate Analyzer

This script identifies the best OSD replacement candidates by analyzing:
- SMART health data (wear, errors, temperature) from ALL cluster nodes
- Capacity utilization and imbalance
- Host-level distribution and resilience
- Age and performance metrics
- PG distribution balance

Usage: sudo python3 ceph_osd_analyzer.py [--class hdd|nvme] [--min-size 8] [--debug]
"""

import json
import subprocess
import sys
import argparse
from collections import defaultdict
from datetime import datetime
import re

DEBUG = False

class Colors:
    RED = '\033[91m'
    YELLOW = '\033[93m'
    GREEN = '\033[92m'
    BLUE = '\033[94m'
    CYAN = '\033[96m'
    BOLD = '\033[1m'
    END = '\033[0m'

def run_command(cmd, parse_json=False, host=None):
    """Execute shell command locally or via SSH and return output"""
    try:
        if host:
            cmd = f"ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5 {host} '{cmd}'"
        
        if DEBUG:
            print(f"{Colors.CYAN}DEBUG: Running: {cmd}{Colors.END}")
        
        result = subprocess.run(cmd, shell=True, capture_output=True, text=True, check=True)
        if parse_json:
            return json.loads(result.stdout)
        return result.stdout.strip()
    except subprocess.CalledProcessError as e:
        if DEBUG:
            print(f"{Colors.YELLOW}DEBUG: Command failed: {cmd}{Colors.END}")
            if e.stderr:
                print(f"{Colors.YELLOW}DEBUG: stderr: {e.stderr[:200]}{Colors.END}")
        return None if parse_json else ""
    except json.JSONDecodeError as e:
        if DEBUG:
            print(f"{Colors.RED}Error parsing JSON from: {cmd}{Colors.END}")
        return None

def get_osd_tree():
    """Get OSD tree structure"""
    return run_command("ceph osd tree -f json", parse_json=True)

def get_osd_df():
    """Get OSD disk usage statistics"""
    return run_command("ceph osd df -f json", parse_json=True)

def get_osd_metadata(osd_id):
    """Get metadata for specific OSD"""
    return run_command(f"ceph osd metadata osd.{osd_id} -f json", parse_json=True)

# Performance metrics removed for simplicity

def get_osd_host_mapping(osd_tree):
    """Build mapping of OSD ID to hostname"""
    osd_to_host = {}
    
    for node in osd_tree['nodes']:
        if node['type'] == 'host':
            host_name = node['name']
            for child_id in node.get('children', []):
                osd_to_host[child_id] = host_name
    
    return osd_to_host

def get_device_path_for_osd(osd_id, hostname):
    """Get the physical device path for an OSD on a host (resolve dm devices)."""
    metadata = get_osd_metadata(osd_id)
    if metadata:
        # Try 'bluestore_bdev_devices' first
        phys_dev = metadata.get('bluestore_bdev_devices')
        if phys_dev:
            device = f"/dev/{phys_dev.strip()}"
            if DEBUG:
                print(f"{Colors.GREEN}DEBUG: Found physical device from metadata: {device}{Colors.END}")
            return device

        # Also try devices field which sometimes has the info
        devices = metadata.get('devices')
        if devices:
            # devices might be comma-separated
            first_dev = devices.split(',')[0].strip()
            if first_dev and not first_dev.startswith('dm-'):
                device = f"/dev/{first_dev}" if not first_dev.startswith('/dev/') else first_dev
                if DEBUG:
                    print(f"{Colors.GREEN}DEBUG: Found device from metadata.devices: {device}{Colors.END}")
                return device

    # Fallback: follow the symlink
    result = run_command(f"readlink -f /var/lib/ceph/osd/ceph-{osd_id}/block", host=hostname)
    if result and result.startswith('/dev/'):
        # Check if it is a dm device, try to find underlying
        if '/dev/dm-' in result or '/dev/mapper/' in result:
            # Try multiple methods to resolve dm device
            base = run_command(f"lsblk -no pkname {result}", host=hostname)
            if not base:
                # Alternative: use ls -l on /dev/mapper
                base = run_command(f"ls -l {result} | awk '{{print $NF}}' | xargs basename", host=hostname)
            if base:
                device = f"/dev/{base.strip()}"
                if DEBUG:
                    print(f"{Colors.GREEN}DEBUG: Resolved dm device {result} -> {device}{Colors.END}")
                return device
        else:
            if DEBUG:
                print(f"{Colors.GREEN}DEBUG: Using device symlink {result}{Colors.END}")
            return result

    # Try alternative: lsblk with PKNAME (parent kernel name)
    result = run_command(f"lsblk -no pkname /var/lib/ceph/osd/ceph-{osd_id}/block 2>/dev/null", host=hostname)
    if result:
        device = f"/dev/{result.strip()}"
        if DEBUG:
            print(f"{Colors.GREEN}DEBUG: Found device from lsblk pkname: {device}{Colors.END}")
        return device

    # Last resort: try to get from ceph-volume lvm list
    result = run_command(f"ceph-volume lvm list | grep -A 20 'osd id.*{osd_id}' | grep 'devices' | awk '{{print $2}}'", host=hostname)
    if result:
        device = result.strip()
        if DEBUG:
            print(f"{Colors.GREEN}DEBUG: Found device from ceph-volume: {device}{Colors.END}")
        return device

    if DEBUG:
        print(f"{Colors.RED}DEBUG: Could not determine device for osd.{osd_id}{Colors.END}")
    return None

def get_smart_data_remote(device_path, hostname):
    """Get SMART data from a remote host with multiple fallback methods"""
    if not device_path:
        return None

    # Determine device type
    tran = run_command(f"lsblk -no tran {device_path} 2>/dev/null", host=hostname)
    tran = tran.strip() if tran else ""

    # Try different command variations based on device type
    commands_to_try = []

    if tran == "nvme" or "nvme" in device_path:
        commands_to_try = [
            f"sudo smartctl -a -j {device_path} -d nvme",
            f"smartctl -a -j {device_path} -d nvme",  # Try without sudo
            f"sudo smartctl -a -j {device_path}",
        ]
    elif tran == "usb":
        # USB-connected drives need special device type flags
        commands_to_try = [
            f"sudo smartctl -a -j {device_path} -d sat",      # SAT (SCSI-ATA Translation)
            f"sudo smartctl -a -j {device_path} -d usbjmicron", # JMicron USB bridge
            f"sudo smartctl -a -j {device_path} -d usbcypress", # Cypress USB bridge
            f"sudo smartctl -a -j {device_path} -d usb",       # Generic USB
            f"sudo smartctl -a -j {device_path} -d scsi",      # SCSI passthrough
            f"sudo smartctl -a -j {device_path}",              # Auto-detect
        ]
    elif tran == "sata":
        commands_to_try = [
            f"sudo smartctl -a -j {device_path}",
            f"smartctl -a -j {device_path}",
            f"sudo smartctl -a -j {device_path} -d ata",
        ]
    else:
        # Unknown or no transport, try generic approaches including USB
        commands_to_try = [
            f"sudo smartctl -a -j {device_path}",
            f"smartctl -a -j {device_path}",
            f"sudo smartctl -a -j {device_path} -d sat",      # Try USB/SAT
            f"sudo smartctl -a -j {device_path} -d auto",
        ]

    # Try each command until one succeeds
    for cmd in commands_to_try:
        result = run_command(f"{cmd} 2>/dev/null", host=hostname, parse_json=True)
        if result and ('ata_smart_attributes' in result or 'nvme_smart_health_information_log' in result):
            if DEBUG:
                print(f"{Colors.GREEN}DEBUG: SMART success with: {cmd}{Colors.END}")
            return result

    if DEBUG:
        print(f"{Colors.RED}DEBUG: All SMART methods failed for {device_path} on {hostname}{Colors.END}")
        print(f"{Colors.YELLOW}DEBUG: Transport type detected: {tran if tran else 'unknown'}{Colors.END}")

    return None

def get_device_health(osd_id, hostname):
    """Get device SMART health metrics from the appropriate host"""
    if DEBUG:
        print(f"{Colors.CYAN}DEBUG: Getting health for osd.{osd_id} on {hostname}{Colors.END}")

    # First try ceph's built-in health metrics
    data = run_command(f"ceph device query-daemon-health-metrics osd.{osd_id} -f json 2>/dev/null", parse_json=True)

    if data:
        # Ceph returns data nested under device ID, extract it
        if isinstance(data, dict) and len(data) > 0:
            # Get the first (and usually only) device entry
            device_data = next(iter(data.values())) if data else None
            if device_data and ('ata_smart_attributes' in device_data or 'nvme_smart_health_information_log' in device_data):
                if DEBUG:
                    print(f"{Colors.GREEN}DEBUG: Got SMART data from ceph device query (nested format){Colors.END}")
                return device_data

        # Also check if data is already in the right format (backward compatibility)
        if 'ata_smart_attributes' in data or 'nvme_smart_health_information_log' in data:
            if DEBUG:
                print(f"{Colors.GREEN}DEBUG: Got SMART data from ceph device query (direct format){Colors.END}")
            return data
    
    # If that fails, get device path and query via SSH
    device_path = get_device_path_for_osd(osd_id, hostname)
    if DEBUG:
        print(f"{Colors.CYAN}DEBUG: Device path for osd.{osd_id}: {device_path}{Colors.END}")
    
    if device_path:
        smart_data = get_smart_data_remote(device_path, hostname)
        if smart_data and DEBUG:
            print(f"{Colors.GREEN}DEBUG: Got SMART data via SSH from {hostname}{Colors.END}")
        return smart_data
    
    return None

def parse_smart_health(smart_data):
    """Parse SMART data and calculate health score"""
    score = 100.0
    issues = []
    metrics = {}

    if not smart_data:
        # CRITICAL: Failed SMART reads are a red flag - could indicate drive issues
        return 0.0, ["CRITICAL: No SMART data available - drive may be failing"], metrics
    
    # Check for HDD SMART data
    if 'ata_smart_attributes' in smart_data:
        attrs = smart_data['ata_smart_attributes'].get('table', [])
        
        for attr in attrs:
            attr_id = attr.get('id')
            name = attr.get('name', '')
            value = attr.get('value', 0)
            raw_value = attr.get('raw', {}).get('value', 0)
            
            # Reallocated Sectors (5) - CRITICAL indicator of imminent failure
            if attr_id == 5:
                metrics['reallocated_sectors'] = raw_value
                if raw_value > 0:
                    # ANY reallocated sectors is a severe problem
                    if raw_value >= 10:
                        score -= 95  # Drive is failing, near-zero health
                    elif raw_value >= 5:
                        score -= 85  # Critical failure imminent
                    else:
                        score -= 70  # Even 1-4 sectors is very serious
                    issues.append(f"CRITICAL: Reallocated sectors: {raw_value} - DRIVE FAILING")
            
            # Spin Retry Count (10) - CRITICAL
            elif attr_id == 10:
                metrics['spin_retry'] = raw_value
                if raw_value > 0:
                    score -= min(40, raw_value * 10)
                    issues.append(f"CRITICAL: Spin retry count: {raw_value}")

            # Pending Sectors (197) - CRITICAL
            elif attr_id == 197:
                metrics['pending_sectors'] = raw_value
                if raw_value > 0:
                    score -= min(60, raw_value * 10)
                    issues.append(f"CRITICAL: Pending sectors: {raw_value}")

            # Uncorrectable Sectors (198) - CRITICAL
            elif attr_id == 198:
                metrics['uncorrectable_sectors'] = raw_value
                if raw_value > 0:
                    score -= min(70, raw_value * 15)
                    issues.append(f"CRITICAL: Uncorrectable sectors: {raw_value}")
            
            # Temperature (190, 194)
            elif attr_id in [190, 194]:
                # Only use valid temperature values
                if isinstance(raw_value, int) and 0 < raw_value < 100:
                    metrics['temperature'] = raw_value
                    if raw_value > 60:
                        score -= min(10, (raw_value - 60) * 2)
                        issues.append(f"High temperature: {raw_value}°C")
            
            # Power On Hours (9)
            elif attr_id == 9:
                metrics['power_on_hours'] = raw_value
                age_years = raw_value / 8760
                metrics['age_years'] = age_years
                if age_years > 5:
                    score -= min(15, (age_years - 5) * 3)
                    issues.append(f"Drive age: {age_years:.1f} years")
    
    # Check for NVMe SMART data
    elif 'nvme_smart_health_information_log' in smart_data:
        nvme_health = smart_data['nvme_smart_health_information_log']
        
        # Available spare
        spare = nvme_health.get('available_spare', 100)
        if spare < 50:
            score -= (100 - spare) * 0.5
            issues.append(f"Low available spare: {spare}%")
        
        # Percentage used
        pct_used = nvme_health.get('percentage_used', 0)
        metrics['percentage_used'] = pct_used
        if pct_used > 80:
            score -= min(30, (pct_used - 80) * 1.5)
            issues.append(f"High wear: {pct_used}%")
        
        # Media errors - CRITICAL for NVMe
        media_errors = nvme_health.get('media_errors', 0)
        if media_errors > 0:
            score -= min(60, media_errors * 10)
            issues.append(f"CRITICAL: Media errors: {media_errors}")
        
        # Temperature
        temp = nvme_health.get('temperature', 0)
        if 0 < temp < 150:  # Valid temperature range
            metrics['temperature'] = temp
            if temp > 70:
                score -= min(10, (temp - 70) * 2)
                issues.append(f"High temperature: {temp}°C")
    
    return max(0, score), issues, metrics

def calculate_capacity_score(osd_data, host_osds_data, osd_class):
    """Calculate score based on capacity optimization potential"""
    score = 0.0
    factors = []
    
    weight = osd_data.get('crush_weight', 0)
    utilization = osd_data.get('utilization', 0)
    
    # Small drives are better candidates
    if weight < 2:
        score += 40
        factors.append(f"Very small drive ({weight:.1f}TB) - high capacity gain")
    elif weight < 5:
        score += 30
        factors.append(f"Small drive ({weight:.1f}TB) - good capacity gain")
    elif weight < 10:
        score += 15
        factors.append(f"Medium drive ({weight:.1f}TB)")
    else:
        score += 5
        factors.append(f"Large drive ({weight:.1f}TB) - lower priority")
    
    # High utilization drives are harder to replace
    if utilization > 70:
        score -= 15
        factors.append(f"High utilization ({utilization:.1f}%) - requires data migration")
    elif utilization > 50:
        score -= 8
        factors.append(f"Medium utilization ({utilization:.1f}%)")
    
    # Host balance consideration
    same_class_osds = [o for o in host_osds_data if o.get('device_class') == osd_class]
    if same_class_osds:
        host_total_weight = sum(o.get('crush_weight', 0) for o in same_class_osds)
        host_avg_weight = host_total_weight / len(same_class_osds)
        
        if weight < host_avg_weight * 0.5:
            score += 15
            factors.append(f"Below host average ({host_avg_weight:.1f}TB) - improves balance")
    
    return score, factors

def calculate_resilience_score(osd_data, host_name, all_hosts_data, osd_tree):
    """Calculate score based on cluster resilience improvement"""
    score = 0.0
    factors = []
    
    osd_class = osd_data.get('device_class', 'hdd')
    
    # Count OSDs per host by class
    host_class_counts = {}
    for host_node in [n for n in osd_tree['nodes'] if n['type'] == 'host']:
        h_name = host_node['name']
        host_osds = [osd_tree['nodes'][i] for i in range(len(osd_tree['nodes'])) 
                     if osd_tree['nodes'][i].get('id') in host_node.get('children', []) 
                     and osd_tree['nodes'][i].get('type') == 'osd']
        
        host_class_counts[h_name] = {
            'hdd': len([o for o in host_osds if o.get('device_class') == 'hdd' and o.get('status') == 'up']),
            'nvme': len([o for o in host_osds if o.get('device_class') == 'nvme' and o.get('status') == 'up'])
        }
    
    if host_name not in host_class_counts:
        return 0, ["Host not found in cluster"]
    
    current_count = host_class_counts[host_name][osd_class]
    avg_count = sum(h[osd_class] for h in host_class_counts.values()) / len(host_class_counts)
    
    # Hosts with more OSDs are better candidates
    if current_count > avg_count * 1.2:
        score += 20
        factors.append(f"Host has {current_count} {osd_class} OSDs (above average {avg_count:.1f})")
    elif current_count > avg_count:
        score += 10
        factors.append(f"Host slightly above average {osd_class} count")
    
    # Check for down OSDs on same host
    host_node = next((n for n in osd_tree['nodes'] if n['type'] == 'host' and n['name'] == host_name), None)
    if host_node:
        down_osds = [osd_tree['nodes'][i] for i in range(len(osd_tree['nodes']))
                     if osd_tree['nodes'][i].get('id') in host_node.get('children', [])
                     and osd_tree['nodes'][i].get('status') == 'down']
        if down_osds:
            score += 15
            factors.append(f"Host has {len(down_osds)} down OSD(s) - may have hardware issues")
    
    return score, factors

# Performance metrics removed for simplicity

def analyze_cluster():
    """Main analysis function"""
    print(f"{Colors.BOLD}{Colors.CYAN}=== Ceph OSD Replacement Candidate Analyzer ==={Colors.END}\n")
    
    # Gather data
    print("Gathering cluster data...")
    osd_tree = get_osd_tree()
    osd_df = get_osd_df()
    
    if not osd_tree or not osd_df:
        print(f"{Colors.RED}Failed to gather cluster data{Colors.END}")
        return
    
    # Build OSD to host mapping
    osd_to_host = get_osd_host_mapping(osd_tree)
    
    # Parse OSD data
    osd_df_map = {node['id']: node for node in osd_df['nodes']}
    
    # Build host data map
    host_osds_map = defaultdict(list)
    for node in osd_tree['nodes']:
        if node['type'] == 'osd' and node.get('status') == 'up':
            host_name = osd_to_host.get(node['id'])
            if host_name:
                osd_df_data = osd_df_map.get(node['id'], {})
                host_osds_map[host_name].append({
                    'id': node['id'],
                    'device_class': node.get('device_class', 'hdd'),
                    'crush_weight': osd_df_data.get('crush_weight', 0)
                })
    
    # Analyze each OSD
    candidates = []
    failed_smart = []
    
    print("Analyzing OSDs across all cluster nodes...\n")
    
    total_osds = len([n for n in osd_tree['nodes'] if n['type'] == 'osd' and n.get('status') == 'up'])
    current_osd = 0
    
    for node in osd_tree['nodes']:
        if node['type'] != 'osd' or node.get('status') != 'up':
            continue
        
        current_osd += 1
        osd_id = node['id']
        osd_name = node['name']
        device_class = node.get('device_class', 'hdd')
        host_name = osd_to_host.get(osd_id, 'unknown')
        
        print(f"[{current_osd}/{total_osds}] Analyzing {osd_name} on {host_name} ({device_class})...".ljust(80), end='\r')
        
        # Get OSD data
        osd_df_data = osd_df_map.get(osd_id, {})
        
        # SMART health analysis
        health_data = get_device_health(osd_id, host_name)
        if not health_data:
            failed_smart.append((osd_name, host_name))
        
        health_score, health_issues, health_metrics = parse_smart_health(health_data)
        
        # Capacity optimization score
        capacity_score, capacity_factors = calculate_capacity_score(
            osd_df_data, host_osds_map.get(host_name, []), device_class
        )
        
        # Resilience score
        resilience_score, resilience_factors = calculate_resilience_score(
            node, host_name, host_osds_map, osd_tree
        )
        
        # Calculate total score with revised weights
        # Priority: Failed drives > Small failing drives > Small drives > Any failing
        has_health_issues = len(health_issues) > 0
        has_critical_issues = any('CRITICAL:' in issue and ('Reallocated' in issue or 'Uncorrectable' in issue or 'Pending' in issue)
                                  for issue in health_issues)
        is_small = osd_df_data.get('crush_weight', 0) < 5

        # Base scoring: 80% health, 15% capacity, 5% resilience
        base_score = (
            (100 - health_score) * 0.80 +   # Health is critical
            capacity_score * 0.15 +          # Capacity matters for small drives
            resilience_score * 0.05          # Cluster resilience (minor)
        )

        # Apply multipliers for priority combinations
        if health_score == 0:  # Failed SMART reads
            if is_small:
                base_score += 30  # Failed SMART + small = top priority
            else:
                base_score += 20  # Failed SMART alone is still critical
        elif has_critical_issues:  # Reallocated/pending/uncorrectable sectors
            if is_small:
                base_score += 25  # Critical issues + small drive
            else:
                base_score += 20  # Critical issues alone
        elif has_health_issues and is_small:
            base_score += 15  # Small + beginning to fail

        total_score = min(100, base_score)  # Cap at 100
        
        candidates.append({
            'osd_id': osd_id,
            'osd_name': osd_name,
            'host': host_name,
            'device_class': device_class,
            'weight': osd_df_data.get('crush_weight', 0),
            'size': osd_df_data.get('kb', 0) / 1024 / 1024 / 1024,  # TB
            'utilization': osd_df_data.get('utilization', 0),
            'total_score': total_score,
            'health_score': health_score,
            'health_issues': health_issues,
            'health_metrics': health_metrics,
            'capacity_factors': capacity_factors,
            'resilience_factors': resilience_factors,
        })
    
    print(" " * 80, end='\r')
    
    # Show SMART failures if any
    if failed_smart:
        print(f"\n{Colors.YELLOW}Note: Unable to retrieve SMART data for {len(failed_smart)} OSDs:{Colors.END}")
        for osd_name, host in failed_smart[:5]:
            print(f"  - {osd_name} on {host}")
        if len(failed_smart) > 5:
            print(f"  ... and {len(failed_smart) - 5} more")
        print()
    
    # Sort by total score
    candidates.sort(key=lambda x: x['total_score'], reverse=True)
    
    # Display results
    print(f"\n{Colors.BOLD}{Colors.CYAN}=== TOP REPLACEMENT CANDIDATES (ALL HOSTS) ==={Colors.END}\n")
    
    for rank, candidate in enumerate(candidates[:15], 1):
        score_color = Colors.RED if candidate['total_score'] > 50 else Colors.YELLOW if candidate['total_score'] > 30 else Colors.GREEN
        health_color = Colors.GREEN if candidate['health_score'] > 80 else Colors.YELLOW if candidate['health_score'] > 60 else Colors.RED
        
        print(f"{Colors.BOLD}#{rank} - {candidate['osd_name']} ({candidate['device_class'].upper()}){Colors.END}")
        print(f"  Host: {candidate['host']}")
        print(f"  Size: {candidate['size']:.2f} TB (weight: {candidate['weight']:.2f})")
        print(f"  Utilization: {candidate['utilization']:.1f}%")
        print(f"  {score_color}Replacement Score: {candidate['total_score']:.1f}/100{Colors.END}")
        print(f"  {health_color}Health Score: {candidate['health_score']:.1f}/100{Colors.END}")
        
        if candidate['health_issues']:
            print(f"  {Colors.RED}Health Issues:{Colors.END}")
            for issue in candidate['health_issues'][:3]:
                print(f"    - {issue}")
        
        if candidate['capacity_factors']:
            print(f"  Capacity Optimization:")
            for factor in candidate['capacity_factors'][:2]:
                print(f"    • {factor}")
        
        if candidate['resilience_factors']:
            print(f"  Host Distribution:")
            for factor in candidate['resilience_factors'][:2]:
                print(f"    • {factor}")
        
        print()
    
    # Summary by class
    print(f"\n{Colors.BOLD}{Colors.CYAN}=== SUMMARY BY DEVICE CLASS ==={Colors.END}\n")
    for device_class in ['hdd', 'nvme']:
        class_candidates = [c for c in candidates if c['device_class'] == device_class]
        if class_candidates:
            top_candidate = class_candidates[0]
            print(f"{Colors.BOLD}{device_class.upper()}:{Colors.END}")
            print(f"  Top candidate: {top_candidate['osd_name']} (score: {top_candidate['total_score']:.1f})")
            print(f"  Host: {top_candidate['host']}")
            print(f"  Capacity gain potential: {top_candidate['weight']:.2f} TB")
            print()
    
    # Summary by host
    print(f"\n{Colors.BOLD}{Colors.CYAN}=== TOP CANDIDATES BY HOST ==={Colors.END}\n")
    hosts_seen = set()
    for candidate in candidates:
        if candidate['host'] not in hosts_seen and len(hosts_seen) < 5:
            hosts_seen.add(candidate['host'])
            print(f"{Colors.BOLD}{candidate['host']}:{Colors.END}")
            print(f"  Top candidate: {candidate['osd_name']} (score: {candidate['total_score']:.1f})")
            print(f"  {candidate['device_class'].upper()}, {candidate['weight']:.2f} TB, {candidate['utilization']:.1f}% used")
            if candidate['health_issues']:
                print(f"  Issues: {candidate['health_issues'][0]}")
            print()

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Analyze Ceph OSDs for replacement candidates across entire cluster')
    parser.add_argument('--class', dest='device_class', choices=['hdd', 'nvme'], 
                        help='Filter by device class')
    parser.add_argument('--min-size', type=float, default=0,
                        help='Minimum OSD size in TB to consider')
    parser.add_argument('--debug', action='store_true',
                        help='Enable debug output')
    
    args = parser.parse_args()
    
    if args.debug:
        DEBUG = True
    
    try:
        analyze_cluster()
    except KeyboardInterrupt:
        print(f"\n{Colors.YELLOW}Analysis interrupted{Colors.END}")
        sys.exit(0)
    except Exception as e:
        print(f"{Colors.RED}Error: {e}{Colors.END}")
        import traceback
        traceback.print_exc()
        sys.exit(1)