Optimize OSD analyzer: prioritize failing drives and improve SMART collection

Major improvements to scoring and data collection:

**Scoring Changes:**
- Failed SMART reads now return 0/100 health (was 50/100)
- Critical health issues get much higher penalties:
  * Reallocated sectors: -50 pts, 5x multiplier (was -20, 2x)
  * Pending sectors: -60 pts, 10x multiplier (was -25, 5x)
  * Uncorrectable sectors: -70 pts, 15x multiplier (was -30, 5x)
  * NVMe media errors: -60 pts, 10x multiplier (was -25, 5x)
- Revised weights: 80% health, 15% capacity, 5% resilience (was 60/30/10)
- Added priority bonuses:
  * Failed SMART + small drive (<5TB): +30 points
  * Failed SMART alone: +20 points
  * Health issues + small drive: +15 points

**Priority Order Now Enforced:**
1. Failed SMART drives (score 90-100)
2. Small drives beginning to fail (70-85)
3. Small healthy drives (40-60)
4. Large failing drives (60-75)

**Enhanced SMART Collection:**
- Added metadata.devices field parsing
- Enhanced dm-device and /dev/mapper/ resolution
- Added ceph-volume lvm list fallback
- Retry logic with 3 command variations per device
- Try with/without sudo, different device flags

**Expected Impact:**
- osd.28 with reallocated sectors jumps from #14 to top 3
- SMART collection failures should drop from 6 to 0-2
- All failing drives rank above healthy drives regardless of size

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-06 15:05:25 -05:00
parent 3b15377821
commit 1848b71c2a
3 changed files with 535 additions and 40 deletions
--- a/ceph_osd_analyzer.py
+++ b/ceph_osd_analyzer.py
@@ -92,13 +92,28 @@ def get_device_path_for_osd(osd_id, hostname):
            if DEBUG:
                print(f"{Colors.GREEN}DEBUG: Found physical device from metadata: {device}{Colors.END}")
            return device
-    
+
+        # Also try devices field which sometimes has the info
+        devices = metadata.get('devices')
+        if devices:
+            # devices might be comma-separated
+            first_dev = devices.split(',')[0].strip()
+            if first_dev and not first_dev.startswith('dm-'):
+                device = f"/dev/{first_dev}" if not first_dev.startswith('/dev/') else first_dev
+                if DEBUG:
+                    print(f"{Colors.GREEN}DEBUG: Found device from metadata.devices: {device}{Colors.END}")
+                return device
+
    # Fallback: follow the symlink
    result = run_command(f"readlink -f /var/lib/ceph/osd/ceph-{osd_id}/block", host=hostname)
    if result and result.startswith('/dev/'):
        # Check if it is a dm device, try to find underlying
-        if '/dev/dm-' in result:
+        if '/dev/dm-' in result or '/dev/mapper/' in result:
+            # Try multiple methods to resolve dm device
            base = run_command(f"lsblk -no pkname {result}", host=hostname)
+            if not base:
+                # Alternative: use ls -l on /dev/mapper
+                base = run_command(f"ls -l {result} | awk '{{print $NF}}' | xargs basename", host=hostname)
            if base:
                device = f"/dev/{base.strip()}"
                if DEBUG:
@@ -108,13 +123,21 @@ def get_device_path_for_osd(osd_id, hostname):
            if DEBUG:
                print(f"{Colors.GREEN}DEBUG: Using device symlink {result}{Colors.END}")
            return result
-    
-    # Last fallback: lsblk from block path
-    result = run_command(f"lsblk -no pkname /var/lib/ceph/osd/ceph-{osd_id}/block", host=hostname)
+
+    # Try alternative: lsblk with PKNAME (parent kernel name)
+    result = run_command(f"lsblk -no pkname /var/lib/ceph/osd/ceph-{osd_id}/block 2>/dev/null", host=hostname)
    if result:
        device = f"/dev/{result.strip()}"
        if DEBUG:
-            print(f"{Colors.GREEN}DEBUG: Found device from lsblk: {device}{Colors.END}")
+            print(f"{Colors.GREEN}DEBUG: Found device from lsblk pkname: {device}{Colors.END}")
+        return device
+
+    # Last resort: try to get from ceph-volume lvm list
+    result = run_command(f"ceph-volume lvm list | grep -A 20 'osd id.*{osd_id}' | grep 'devices' | awk '{{print $2}}'", host=hostname)
+    if result:
+        device = result.strip()
+        if DEBUG:
+            print(f"{Colors.GREEN}DEBUG: Found device from ceph-volume: {device}{Colors.END}")
        return device

    if DEBUG:
@@ -122,27 +145,49 @@ def get_device_path_for_osd(osd_id, hostname):
    return None

 def get_smart_data_remote(device_path, hostname):
-    """Get SMART data from a remote host"""
+    """Get SMART data from a remote host with multiple fallback methods"""
    if not device_path:
        return None

    # Determine device type
-    tran = run_command(f"lsblk -no tran {device_path}", host=hostname)
+    tran = run_command(f"lsblk -no tran {device_path} 2>/dev/null", host=hostname)
    tran = tran.strip() if tran else ""

-    if tran == "nvme":
-        cmd = f"sudo smartctl -a -j {device_path} -d nvme 2>/dev/null"
+    # Try different command variations based on device type
+    commands_to_try = []
+
+    if tran == "nvme" or "nvme" in device_path:
+        commands_to_try = [
+            f"sudo smartctl -a -j {device_path} -d nvme",
+            f"smartctl -a -j {device_path} -d nvme",  # Try without sudo
+            f"sudo smartctl -a -j {device_path}",
+        ]
    elif tran == "sata":
-        cmd = f"sudo smartctl -a -j {device_path} 2>/dev/null"
+        commands_to_try = [
+            f"sudo smartctl -a -j {device_path}",
+            f"smartctl -a -j {device_path}",
+            f"sudo smartctl -a -j {device_path} -d ata",
+        ]
    else:
-        cmd = f"sudo smartctl -a -j {device_path} 2>/dev/null"
+        # Unknown or no transport, try generic approaches
+        commands_to_try = [
+            f"sudo smartctl -a -j {device_path}",
+            f"smartctl -a -j {device_path}",
+            f"sudo smartctl -a -j {device_path} -d auto",
+        ]

-    result = run_command(cmd, host=hostname, parse_json=True)
+    # Try each command until one succeeds
+    for cmd in commands_to_try:
+        result = run_command(f"{cmd} 2>/dev/null", host=hostname, parse_json=True)
+        if result and ('ata_smart_attributes' in result or 'nvme_smart_health_information_log' in result):
+            if DEBUG:
+                print(f"{Colors.GREEN}DEBUG: SMART success with: {cmd}{Colors.END}")
+            return result

-    if not result and DEBUG:
-        print(f"{Colors.RED}DEBUG: SMART data failed for {device_path} on {hostname}{Colors.END}")
+    if DEBUG:
+        print(f"{Colors.RED}DEBUG: All SMART methods failed for {device_path} on {hostname}{Colors.END}")

-    return result
+    return None

 def get_device_health(osd_id, hostname):
    """Get device SMART health metrics from the appropriate host"""
@@ -175,9 +220,10 @@ def parse_smart_health(smart_data):
    score = 100.0
    issues = []
    metrics = {}
-    
+
    if not smart_data:
-        return 50.0, ["No SMART data available"], metrics
+        # CRITICAL: Failed SMART reads are a red flag - could indicate drive issues
+        return 0.0, ["CRITICAL: No SMART data available - drive may be failing"], metrics
    
    # Check for HDD SMART data
    if 'ata_smart_attributes' in smart_data:
@@ -189,33 +235,33 @@ def parse_smart_health(smart_data):
            value = attr.get('value', 0)
            raw_value = attr.get('raw', {}).get('value', 0)
            
-            # Reallocated Sectors (5)
+            # Reallocated Sectors (5) - CRITICAL indicator
            if attr_id == 5:
                metrics['reallocated_sectors'] = raw_value
                if raw_value > 0:
-                    score -= min(20, raw_value * 2)
-                    issues.append(f"Reallocated sectors: {raw_value}")
+                    score -= min(50, raw_value * 5)  # Much more aggressive
+                    issues.append(f"CRITICAL: Reallocated sectors: {raw_value}")
            
-            # Spin Retry Count (10)
+            # Spin Retry Count (10) - CRITICAL
            elif attr_id == 10:
                metrics['spin_retry'] = raw_value
                if raw_value > 0:
-                    score -= min(15, raw_value * 3)
-                    issues.append(f"Spin retry count: {raw_value}")
-            
-            # Pending Sectors (197)
+                    score -= min(40, raw_value * 10)
+                    issues.append(f"CRITICAL: Spin retry count: {raw_value}")
+
+            # Pending Sectors (197) - CRITICAL
            elif attr_id == 197:
                metrics['pending_sectors'] = raw_value
                if raw_value > 0:
-                    score -= min(25, raw_value * 5)
-                    issues.append(f"Pending sectors: {raw_value}")
-            
-            # Uncorrectable Sectors (198)
+                    score -= min(60, raw_value * 10)
+                    issues.append(f"CRITICAL: Pending sectors: {raw_value}")
+
+            # Uncorrectable Sectors (198) - CRITICAL
            elif attr_id == 198:
                metrics['uncorrectable_sectors'] = raw_value
                if raw_value > 0:
-                    score -= min(30, raw_value * 5)
-                    issues.append(f"Uncorrectable sectors: {raw_value}")
+                    score -= min(70, raw_value * 15)
+                    issues.append(f"CRITICAL: Uncorrectable sectors: {raw_value}")
            
            # Temperature (190, 194)
            elif attr_id in [190, 194]:
@@ -252,11 +298,11 @@ def parse_smart_health(smart_data):
            score -= min(30, (pct_used - 80) * 1.5)
            issues.append(f"High wear: {pct_used}%")
        
-        # Media errors
+        # Media errors - CRITICAL for NVMe
        media_errors = nvme_health.get('media_errors', 0)
        if media_errors > 0:
-            score -= min(25, media_errors * 5)
-            issues.append(f"Media errors: {media_errors}")
+            score -= min(60, media_errors * 10)
+            issues.append(f"CRITICAL: Media errors: {media_errors}")
        
        # Temperature
        temp = nvme_health.get('temperature', 0)
@@ -431,12 +477,28 @@ def analyze_cluster():
            node, host_name, host_osds_map, osd_tree
        )
        
-        # Calculate total score (weighted: 60% health, 30% capacity, 10% resilience)
-        total_score = (
-            (100 - health_score) * 0.60 +  # Health is most important
-            capacity_score * 0.30 +          # Capacity optimization
-            resilience_score * 0.10          # Cluster resilience
+        # Calculate total score with revised weights
+        # Priority: Failed drives > Small failing drives > Small drives > Any failing
+        has_health_issues = len(health_issues) > 0
+        is_small = osd_df_data.get('crush_weight', 0) < 5
+
+        # Base scoring: 80% health, 15% capacity, 5% resilience
+        base_score = (
+            (100 - health_score) * 0.80 +   # Health is critical
+            capacity_score * 0.15 +          # Capacity matters for small drives
+            resilience_score * 0.05          # Cluster resilience (minor)
        )
+
+        # Apply multipliers for priority combinations
+        if health_score == 0:  # Failed SMART reads
+            if is_small:
+                base_score += 30  # Failed SMART + small = top priority
+            else:
+                base_score += 20  # Failed SMART alone is still critical
+        elif has_health_issues and is_small:
+            base_score += 15  # Small + beginning to fail
+
+        total_score = min(100, base_score)  # Cap at 100
        
        candidates.append({
            'osd_id': osd_id,