From db757345fb996b2e502cceb6ba4fb627c7642beb Mon Sep 17 00:00:00 2001
From: Jared Vititoe
Date: Mon, 22 Dec 2025 17:08:13 -0500
Subject: [PATCH] Better patterns and error handling

---
 ceph_osd_analyzer.py | 98 ++++++++++++++++++++------------------------
 1 file changed, 44 insertions(+), 54 deletions(-)

diff --git a/ceph_osd_analyzer.py b/ceph_osd_analyzer.py
index 5b72751..db1a9e1 100644
--- a/ceph_osd_analyzer.py
+++ b/ceph_osd_analyzer.py
@@ -31,7 +31,7 @@ class Colors:
     BOLD = '\033[1m'
     END = '\033[0m'
 
-def run_command(cmd, parse_json=False, host=None, suppress_warnings=False):
+def run_command(cmd, parse_json=False, host=None):
     """Execute shell command locally or via SSH and return output"""
     try:
         if host:
@@ -47,7 +47,8 @@
     except subprocess.CalledProcessError as e:
         if DEBUG:
             print(f"{Colors.YELLOW}DEBUG: Command failed: {cmd}{Colors.END}")
-            print(f"{Colors.YELLOW}DEBUG: stderr: {e.stderr}{Colors.END}")
+            if e.stderr:
+                print(f"{Colors.YELLOW}DEBUG: stderr: {e.stderr[:200]}{Colors.END}")
         return None if parse_json else ""
     except json.JSONDecodeError as e:
         if DEBUG:
@@ -70,10 +71,6 @@ def get_osd_perf():
     """Get OSD performance statistics"""
     return run_command("ceph osd perf -f json", parse_json=True)
 
-def get_pg_dump():
-    """Get PG dump for distribution analysis"""
-    return run_command("ceph pg dump -f json 2>/dev/null", parse_json=True)
-
 def get_osd_host_mapping(osd_tree):
     """Build mapping of OSD ID to hostname"""
     osd_to_host = {}
@@ -88,36 +85,36 @@ def get_device_path_for_osd(osd_id, hostname):
     """Get the device path for an OSD on a specific host"""
-    # Try to get from ceph metadata first
+    # Method 1: Try ceph metadata
     metadata = get_osd_metadata(osd_id)
     if metadata:
-        # Get device path from metadata
         devices = metadata.get('devices', '')
-
         if devices:
             device = devices.split(',')[0] if ',' in devices else devices
-            # Ensure it has /dev/ prefix
             if device and not device.startswith('/dev/'):
                 device = f"/dev/{device}"
             if device and device != '/dev/':
+                if DEBUG:
+                    print(f"{Colors.GREEN}DEBUG: Found device from metadata: {device}{Colors.END}")
                 return device
 
-    # Fallback: query the OSD symlink on the remote host
+    # Method 2: Query symlink on remote host
     result = run_command(f"readlink -f /var/lib/ceph/osd/ceph-{osd_id}/block 2>/dev/null", host=hostname)
     if result and result.startswith('/dev/'):
+        if DEBUG:
+            print(f"{Colors.GREEN}DEBUG: Found device from symlink: {result}{Colors.END}")
         return result
 
-    # Try to get from ceph-volume
-    cmd = f"ceph-volume lvm list {osd_id} -f json 2>/dev/null"
-    result = run_command(cmd, host=hostname, parse_json=True)
-
+    # Method 3: Try lsblk
+    result = run_command(f"lsblk -no pkname /var/lib/ceph/osd/ceph-{osd_id}/block 2>/dev/null", host=hostname)
     if result:
-        # Parse ceph-volume output
-        for osd_key, osd_info in result.items():
-            if isinstance(osd_info, list) and len(osd_info) > 0:
-                block_device = osd_info[0].get('devices', [])
-                if block_device:
-                    return block_device[0] if isinstance(block_device, list) else block_device
+        device = f"/dev/{result.strip()}"
+        if DEBUG:
+            print(f"{Colors.GREEN}DEBUG: Found device from lsblk: {device}{Colors.END}")
+        return device
+
+    if DEBUG:
+        print(f"{Colors.RED}DEBUG: Could not find device for osd.{osd_id}{Colors.END}")
 
     return None
@@ -126,12 +123,11 @@ def get_smart_data_remote(device_path, hostname):
     """Get SMART data from a device on a remote host"""
     if not device_path:
         return None
-    # Strip partition suffix:
-    # /dev/sda1 -> /dev/sda
-    # /dev/nvme0n1p1 -> /dev/nvme0n1
+    # Strip partition suffix
     base_device = re.sub(r'p?\d+$', '', device_path)
 
-    cmd = f"smartctl -a -j {base_device} 2>/dev/null"
+    # Use sudo for smartctl
+    cmd = f"sudo smartctl -a -j {base_device} 2>/dev/null"
     result = run_command(cmd, host=hostname, parse_json=True)
 
     return result
@@ -171,7 +167,7 @@
     if not smart_data:
         return 50.0, ["No SMART data available"], metrics
 
-    # Check for different SMART data formats
+    # Check for HDD SMART data
    if 'ata_smart_attributes' in smart_data:
         attrs = smart_data['ata_smart_attributes'].get('table', [])
 
@@ -179,7 +175,6 @@
             attr_id = attr.get('id')
             name = attr.get('name', '')
             value = attr.get('value', 0)
-            worst = attr.get('worst', 0)
             raw_value = attr.get('raw', {}).get('value', 0)
 
             # Reallocated Sectors (5)
@@ -212,10 +207,12 @@
 
             # Temperature (190, 194)
             elif attr_id in [190, 194]:
-                metrics['temperature'] = raw_value
-                if raw_value > 60:
-                    score -= min(10, (raw_value - 60) * 2)
-                    issues.append(f"High temperature: {raw_value}°C")
+                # Only use valid temperature values
+                if isinstance(raw_value, int) and 0 < raw_value < 100:
+                    metrics['temperature'] = raw_value
+                    if raw_value > 60:
+                        score -= min(10, (raw_value - 60) * 2)
+                        issues.append(f"High temperature: {raw_value}°C")
 
             # Power On Hours (9)
             elif attr_id == 9:
@@ -225,16 +222,8 @@
                 if age_years > 5:
                     score -= min(15, (age_years - 5) * 3)
                     issues.append(f"Drive age: {age_years:.1f} years")
-
-            # Wear leveling (for SSDs, 177)
-            elif attr_id == 177 and value < worst:
-                metrics['wear_leveling'] = value
-                wear_percent = 100 - value
-                if wear_percent > 20:
-                    score -= min(20, wear_percent)
-                    issues.append(f"Wear level: {wear_percent}%")
 
-    # NVMe SMART data
+    # Check for NVMe SMART data
     elif 'nvme_smart_health_information_log' in smart_data:
         nvme_health = smart_data['nvme_smart_health_information_log']
 
@@ -259,10 +248,11 @@
 
         # Temperature
         temp = nvme_health.get('temperature', 0)
-        metrics['temperature'] = temp
-        if temp > 70:
-            score -= min(10, (temp - 70) * 2)
-            issues.append(f"High temperature: {temp}°C")
+        if 0 < temp < 150:  # Valid temperature range
+            metrics['temperature'] = temp
+            if temp > 70:
+                score -= min(10, (temp - 70) * 2)
+                issues.append(f"High temperature: {temp}°C")
 
     return max(0, score), issues, metrics
 
@@ -274,19 +264,19 @@
     weight = osd_data.get('crush_weight', 0)
     utilization = osd_data.get('utilization', 0)
 
-    # Small drives are better candidates (more capacity gain)
+    # Small drives are better candidates
     if weight < 2:
         score += 40
-        factors.append(f"Very small drive ({weight}TB) - high capacity gain")
+        factors.append(f"Very small drive ({weight:.1f}TB) - high capacity gain")
     elif weight < 5:
         score += 30
-        factors.append(f"Small drive ({weight}TB) - good capacity gain")
+        factors.append(f"Small drive ({weight:.1f}TB) - good capacity gain")
     elif weight < 10:
         score += 15
-        factors.append(f"Medium drive ({weight}TB)")
+        factors.append(f"Medium drive ({weight:.1f}TB)")
     else:
         score += 5
-        factors.append(f"Large drive ({weight}TB) - lower priority")
+        factors.append(f"Large drive ({weight:.1f}TB) - lower priority")
 
     # High utilization drives are harder to replace
     if utilization > 70:
@@ -334,7 +324,7 @@
     current_count = host_class_counts[host_name][osd_class]
     avg_count = sum(h[osd_class] for h in host_class_counts.values()) / len(host_class_counts)
 
-    # Hosts with more OSDs are better candidates for reduction
+    # Hosts with more OSDs are better candidates
     if current_count > avg_count * 1.2:
         score += 20
         factors.append(f"Host has {current_count} {osd_class} OSDs (above average {avg_count:.1f})")
@@ -342,7 +332,7 @@
         score += 10
         factors.append(f"Host slightly above average {osd_class} count")
 
-    # Check for down OSDs on same host (indicates potential issues)
+    # Check for down OSDs on same host
     host_node = next((n for n in osd_tree['nodes'] if n['type'] == 'host' and n['name'] == host_name), None)
     if host_node:
         down_osds = [osd_tree['nodes'][i] for i in range(len(osd_tree['nodes']))
@@ -450,7 +440,7 @@
         osd_df_data = osd_df_map.get(osd_id, {})
         osd_perf_data = osd_perf_map.get(osd_id, {})
 
-        # SMART health analysis - query from the correct host
+        # SMART health analysis
         health_data = get_device_health(osd_id, host_name)
         if not health_data:
             failed_smart.append((osd_name, host_name))
@@ -498,7 +488,7 @@
             'performance_factors': performance_factors,
         })
 
-    print(" " * 80, end='\r')  # Clear the line
+    print(" " * 80, end='\r')
 
     # Show SMART failures if any
     if failed_smart:
@@ -509,7 +499,7 @@
             print(f" ... and {len(failed_smart) - 5} more")
         print()
 
-    # Sort by total score (descending)
+    # Sort by total score
     candidates.sort(key=lambda x: x['total_score'], reverse=True)
 
     # Display results
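Note (not part of the patch): the new three-step fallback in get_device_path_for_osd is easy to exercise in isolation. Below is a minimal sketch, assuming the script can be imported as ceph_osd_analyzer without side effects; the OSD id 3, the host name "ceph-node-1", and the "/dev/sdb" return value are made-up test values, not anything from this patch. Method 1 (ceph metadata) is forced to fail so the readlink symlink fallback (Method 2) has to supply the device path.

from unittest import mock

import ceph_osd_analyzer as coa

# Force Method 1 to return nothing and stub the remote call so Method 2 answers.
with mock.patch.object(coa, "get_osd_metadata", return_value=None), \
     mock.patch.object(coa, "run_command", return_value="/dev/sdb") as run_cmd:
    device = coa.get_device_path_for_osd(3, "ceph-node-1")

assert device == "/dev/sdb"                                # came from the symlink fallback
assert "readlink -f" in run_cmd.call_args_list[0].args[0]  # first fallback command issued

The same pattern (patching get_osd_metadata and run_command) can be extended to check that the lsblk path (Method 3) only fires when the first two methods return nothing.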