From db757345fb996b2e502cceb6ba4fb627c7642beb Mon Sep 17 00:00:00 2001
From: Jared Vititoe
Date: Mon, 22 Dec 2025 17:08:13 -0500
Subject: [PATCH] Better patterns and error handling

---
 ceph_osd_analyzer.py | 98 ++++++++++++++++++++------------------------
 1 file changed, 44 insertions(+), 54 deletions(-)

diff --git a/ceph_osd_analyzer.py b/ceph_osd_analyzer.py
index 5b72751..db1a9e1 100644
--- a/ceph_osd_analyzer.py
+++ b/ceph_osd_analyzer.py
@@ -31,7 +31,7 @@ class Colors:
     BOLD = '\033[1m'
     END = '\033[0m'
 
-def run_command(cmd, parse_json=False, host=None, suppress_warnings=False):
+def run_command(cmd, parse_json=False, host=None):
     """Execute shell command locally or via SSH and return output"""
     try:
         if host:
@@ -47,7 +47,8 @@
     except subprocess.CalledProcessError as e:
         if DEBUG:
             print(f"{Colors.YELLOW}DEBUG: Command failed: {cmd}{Colors.END}")
-            print(f"{Colors.YELLOW}DEBUG: stderr: {e.stderr}{Colors.END}")
+            if e.stderr:
+                print(f"{Colors.YELLOW}DEBUG: stderr: {e.stderr[:200]}{Colors.END}")
         return None if parse_json else ""
     except json.JSONDecodeError as e:
         if DEBUG:
@@ -70,10 +71,6 @@ def get_osd_perf():
     """Get OSD performance statistics"""
     return run_command("ceph osd perf -f json", parse_json=True)
 
-def get_pg_dump():
-    """Get PG dump for distribution analysis"""
-    return run_command("ceph pg dump -f json 2>/dev/null", parse_json=True)
-
 def get_osd_host_mapping(osd_tree):
     """Build mapping of OSD ID to hostname"""
     osd_to_host = {}
@@ -88,36 +85,36 @@ def get_device_path_for_osd(osd_id, hostname):
     """Get the device path for an OSD on a specific host"""
-    # Try to get from ceph metadata first
+    # Method 1: Try ceph metadata
     metadata = get_osd_metadata(osd_id)
     if metadata:
-        # Get device path from metadata
         devices = metadata.get('devices', '')
-
         if devices:
             device = devices.split(',')[0] if ',' in devices else devices
-            # Ensure it has /dev/ prefix
             if device and not device.startswith('/dev/'):
                 device = f"/dev/{device}"
             if device and device != '/dev/':
+                if DEBUG:
+                    print(f"{Colors.GREEN}DEBUG: Found device from metadata: {device}{Colors.END}")
                 return device
 
-    # Fallback: query the OSD symlink on the remote host
+    # Method 2: Query symlink on remote host
     result = run_command(f"readlink -f /var/lib/ceph/osd/ceph-{osd_id}/block 2>/dev/null", host=hostname)
     if result and result.startswith('/dev/'):
+        if DEBUG:
+            print(f"{Colors.GREEN}DEBUG: Found device from symlink: {result}{Colors.END}")
         return result
 
-    # Try to get from ceph-volume
-    cmd = f"ceph-volume lvm list {osd_id} -f json 2>/dev/null"
-    result = run_command(cmd, host=hostname, parse_json=True)
-
+    # Method 3: Try lsblk
+    result = run_command(f"lsblk -no pkname /var/lib/ceph/osd/ceph-{osd_id}/block 2>/dev/null", host=hostname)
     if result:
-        # Parse ceph-volume output
-        for osd_key, osd_info in result.items():
-            if isinstance(osd_info, list) and len(osd_info) > 0:
-                block_device = osd_info[0].get('devices', [])
-                if block_device:
-                    return block_device[0] if isinstance(block_device, list) else block_device
+        device = f"/dev/{result.strip()}"
+        if DEBUG:
+            print(f"{Colors.GREEN}DEBUG: Found device from lsblk: {device}{Colors.END}")
+        return device
+
+    if DEBUG:
+        print(f"{Colors.RED}DEBUG: Could not find device for osd.{osd_id}{Colors.END}")
 
     return None
@@ -126,12 +123,11 @@ def get_smart_data_remote(device_path, hostname):
     """Get SMART data from a device on a remote host"""
     if not device_path:
         return None
-    # Strip partition suffix:
-    # /dev/sda1 -> /dev/sda
-    # /dev/nvme0n1p1 -> /dev/nvme0n1
+    # Strip partition suffix
     base_device = re.sub(r'p?\d+$', '', device_path)
 
-    cmd = f"smartctl -a -j {base_device} 2>/dev/null"
+    # Use sudo for smartctl
+    cmd = f"sudo smartctl -a -j {base_device} 2>/dev/null"
     result = run_command(cmd, host=hostname, parse_json=True)
 
     return result
@@ -171,7 +167,7 @@
     if not smart_data:
         return 50.0, ["No SMART data available"], metrics
 
-    # Check for different SMART data formats
+    # Check for HDD SMART data
    if 'ata_smart_attributes' in smart_data:
         attrs = smart_data['ata_smart_attributes'].get('table', [])
 
@@ -179,7 +175,6 @@
             attr_id = attr.get('id')
             name = attr.get('name', '')
             value = attr.get('value', 0)
-            worst = attr.get('worst', 0)
             raw_value = attr.get('raw', {}).get('value', 0)
 
             # Reallocated Sectors (5)
@@ -212,10 +207,12 @@
 
             # Temperature (190, 194)
             elif attr_id in [190, 194]:
-                metrics['temperature'] = raw_value
-                if raw_value > 60:
-                    score -= min(10, (raw_value - 60) * 2)
-                    issues.append(f"High temperature: {raw_value}°C")
+                # Only use valid temperature values
+                if isinstance(raw_value, int) and 0 < raw_value < 100:
+                    metrics['temperature'] = raw_value
+                    if raw_value > 60:
+                        score -= min(10, (raw_value - 60) * 2)
+                        issues.append(f"High temperature: {raw_value}°C")
 
             # Power On Hours (9)
             elif attr_id == 9:
@@ -225,16 +222,8 @@
                 if age_years > 5:
                     score -= min(15, (age_years - 5) * 3)
                     issues.append(f"Drive age: {age_years:.1f} years")
-
-            # Wear leveling (for SSDs, 177)
-            elif attr_id == 177 and value < worst:
-                metrics['wear_leveling'] = value
-                wear_percent = 100 - value
-                if wear_percent > 20:
-                    score -= min(20, wear_percent)
-                    issues.append(f"Wear level: {wear_percent}%")
 
-    # NVMe SMART data
+    # Check for NVMe SMART data
     elif 'nvme_smart_health_information_log' in smart_data:
         nvme_health = smart_data['nvme_smart_health_information_log']
 
@@ -259,10 +248,11 @@
 
         # Temperature
         temp = nvme_health.get('temperature', 0)
-        metrics['temperature'] = temp
-        if temp > 70:
-            score -= min(10, (temp - 70) * 2)
-            issues.append(f"High temperature: {temp}°C")
+        if 0 < temp < 150:  # Valid temperature range
+            metrics['temperature'] = temp
+            if temp > 70:
+                score -= min(10, (temp - 70) * 2)
+                issues.append(f"High temperature: {temp}°C")
 
     return max(0, score), issues, metrics
 
@@ -274,19 +264,19 @@
     weight = osd_data.get('crush_weight', 0)
     utilization = osd_data.get('utilization', 0)
 
-    # Small drives are better candidates (more capacity gain)
+    # Small drives are better candidates
     if weight < 2:
         score += 40
-        factors.append(f"Very small drive ({weight}TB) - high capacity gain")
+        factors.append(f"Very small drive ({weight:.1f}TB) - high capacity gain")
     elif weight < 5:
         score += 30
-        factors.append(f"Small drive ({weight}TB) - good capacity gain")
+        factors.append(f"Small drive ({weight:.1f}TB) - good capacity gain")
     elif weight < 10:
         score += 15
-        factors.append(f"Medium drive ({weight}TB)")
+        factors.append(f"Medium drive ({weight:.1f}TB)")
     else:
         score += 5
-        factors.append(f"Large drive ({weight}TB) - lower priority")
+        factors.append(f"Large drive ({weight:.1f}TB) - lower priority")
 
     # High utilization drives are harder to replace
     if utilization > 70:
@@ -334,7 +324,7 @@
     current_count = host_class_counts[host_name][osd_class]
     avg_count = sum(h[osd_class] for h in host_class_counts.values()) / len(host_class_counts)
 
-    # Hosts with more OSDs are better candidates for reduction
+    # Hosts with more OSDs are better candidates
     if current_count > avg_count * 1.2:
         score += 20
         factors.append(f"Host has {current_count} {osd_class} OSDs (above average {avg_count:.1f})")
@@ -342,7 +332,7 @@
         score += 10
         factors.append(f"Host slightly above average {osd_class} count")
 
-    # Check for down OSDs on same host (indicates potential issues)
+    # Check for down OSDs on same host
     host_node = next((n for n in osd_tree['nodes'] if n['type'] == 'host' and n['name'] == host_name), None)
     if host_node:
         down_osds = [osd_tree['nodes'][i] for i in range(len(osd_tree['nodes']))
@@ -450,7 +440,7 @@
         osd_df_data = osd_df_map.get(osd_id, {})
         osd_perf_data = osd_perf_map.get(osd_id, {})
 
-        # SMART health analysis - query from the correct host
+        # SMART health analysis
         health_data = get_device_health(osd_id, host_name)
         if not health_data:
             failed_smart.append((osd_name, host_name))
@@ -498,7 +488,7 @@
             'performance_factors': performance_factors,
         })
 
-    print(" " * 80, end='\r')  # Clear the line
+    print(" " * 80, end='\r')
 
     # Show SMART failures if any
     if failed_smart:
@@ -509,7 +499,7 @@
             print(f" ... and {len(failed_smart) - 5} more")
         print()
 
-    # Sort by total score (descending)
+    # Sort by total score
     candidates.sort(key=lambda x: x['total_score'], reverse=True)
 
     # Display results
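Note (not part of the patch): the new three-step fallback in get_device_path_for_osd is easy to exercise in isolation. Below is a minimal sketch, assuming the script can be imported as ceph_osd_analyzer without side effects; the OSD id 3, the host name "ceph-node-1", and the "/dev/sdb" return value are made-up test values, not anything from this patch. Method 1 (ceph metadata) is forced to fail so the readlink symlink fallback (Method 2) has to supply the device path.

from unittest import mock

import ceph_osd_analyzer as coa

# Force Method 1 to return nothing and stub the remote call so Method 2 answers.
with mock.patch.object(coa, "get_osd_metadata", return_value=None), \
     mock.patch.object(coa, "run_command", return_value="/dev/sdb") as run_cmd:
    device = coa.get_device_path_for_osd(3, "ceph-node-1")

assert device == "/dev/sdb"                                # came from the symlink fallback
assert "readlink -f" in run_cmd.call_args_list[0].args[0]  # first fallback command issued

The same pattern (patching get_osd_metadata and run_command) can be extended to check that the lsblk path (Method 3) only fires when the first two methods return nothing.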