diff --git a/ceph_osd_analyzer.py b/ceph_osd_analyzer.py
index d276161..1e4383b 100644
--- a/ceph_osd_analyzer.py
+++ b/ceph_osd_analyzer.py
@@ -3,7 +3,7 @@
 Advanced Ceph OSD Replacement Candidate Analyzer
 
 This script identifies the best OSD replacement candidates by analyzing:
-- SMART health data (wear, errors, temperature)
+- SMART health data (wear, errors, temperature) from ALL cluster nodes
 - Capacity utilization and imbalance
 - Host-level distribution and resilience
 - Age and performance metrics
@@ -29,16 +29,19 @@ class Colors:
     BOLD = '\033[1m'
     END = '\033[0m'
 
-def run_command(cmd, parse_json=False):
-    """Execute shell command and return output"""
+def run_command(cmd, parse_json=False, host=None):
+    """Execute shell command locally or via SSH and return output"""
     try:
+        if host:
+            cmd = f"ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5 {host} '{cmd}'"
+
         result = subprocess.run(cmd, shell=True, capture_output=True, text=True, check=True)
         if parse_json:
             return json.loads(result.stdout)
         return result.stdout.strip()
     except subprocess.CalledProcessError as e:
-        print(f"{Colors.RED}Error executing: {cmd}{Colors.END}")
-        print(f"{Colors.RED}{e.stderr}{Colors.END}")
+        if host:
+            print(f"{Colors.YELLOW}Warning: Failed to execute on {host}: {cmd}{Colors.END}")
         return None if parse_json else ""
     except json.JSONDecodeError as e:
        print(f"{Colors.RED}Error parsing JSON from: {cmd}{Colors.END}")
@@ -64,38 +67,91 @@ def get_pg_dump():
     """Get PG dump for distribution analysis"""
     return run_command("ceph pg dump -f json 2>/dev/null", parse_json=True)
 
-def get_device_health(osd_id):
-    """Get device SMART health metrics"""
-    data = run_command(f"ceph device query-daemon-health-metrics osd.{osd_id} -f json 2>/dev/null", parse_json=True)
-    if not data:
-        # Try alternative method
-        metadata = get_osd_metadata(osd_id)
-        if metadata and 'device_ids' in metadata:
-            device = metadata['device_ids']
-            return get_smart_data_direct(device)
-    return data
+def get_osd_host_mapping(osd_tree):
+    """Build mapping of OSD ID to hostname"""
+    osd_to_host = {}
+
+    for node in osd_tree['nodes']:
+        if node['type'] == 'host':
+            host_name = node['name']
+            for child_id in node.get('children', []):
+                osd_to_host[child_id] = host_name
+
+    return osd_to_host
 
-def get_smart_data_direct(device_path):
-    """Fallback: Get SMART data directly from device"""
-    # Try to find the device
-    try:
-        result = run_command(f"smartctl -a -j {device_path} 2>/dev/null", parse_json=True)
-        return result
-    except:
+def get_device_path_for_osd(osd_id, hostname):
+    """Get the device path for an OSD on a specific host"""
+    # Try to get from ceph metadata first
+    metadata = get_osd_metadata(osd_id)
+    if metadata:
+        # Get device path from metadata
+        devices = metadata.get('devices', '')
+        device_ids = metadata.get('device_ids', '')
+
+        if devices:
+            # 'devices' is a comma-separated list of kernel names (e.g. "sda"),
+            # so normalize the first entry to a full /dev path
+            device = devices.split(',')[0]
+            return device if device.startswith('/dev/') else f"/dev/{device}"
+        elif device_ids:
+            # device_ids format is like "VENDOR_MODEL_SERIAL"
+            # We need to find the actual device path on the host
+            pass
+
+    # Fallback: query the OSD on the remote host for its device
+    cmd = f"ceph-volume lvm list osd.{osd_id} -f json 2>/dev/null || ceph-volume simple scan /var/lib/ceph/osd/ceph-{osd_id} 2>/dev/null"
+    result = run_command(cmd, host=hostname, parse_json=False)
+
+    if result:
+        # Try to extract device path from output
+        for line in result.split('\n'):
+            if 'block_device' in line or 'device' in line:
+                match = re.search(r'/dev/[a-z0-9]+', line)
+                if match:
+                    return match.group(0)
+
+    # Last resort: try common patterns
+    common_paths = [
+        f"/dev/disk/by-partuuid/$(readlink /var/lib/ceph/osd/ceph-{osd_id}/block | xargs basename)",
+        f"/var/lib/ceph/osd/ceph-{osd_id}/block"
+    ]
+
+    for path in common_paths:
+        result = run_command(f"readlink -f {path} 2>/dev/null", host=hostname)
+        if result and result.startswith('/dev/'):
+            return result
+
+    return None
+
+def get_smart_data_remote(device_path, hostname):
+    """Get SMART data from a remote host"""
+    if not device_path:
         return None
+
+    # Strip a partition suffix if present (sda2 -> sda, nvme0n1p1 -> nvme0n1);
+    # a bare r'p?\d+$' would also mangle whole NVMe devices like nvme0n1
+    if 'nvme' in device_path:
+        base_device = re.sub(r'p\d+$', '', device_path)
+    else:
+        base_device = re.sub(r'\d+$', '', device_path)
+
+    cmd = f"smartctl -a -j {base_device} 2>/dev/null"
+    result = run_command(cmd, host=hostname, parse_json=True)
+
+    return result
 
-def get_osd_pool_mapping():
-    """Get which pools each OSD belongs to"""
-    pg_data = get_pg_dump()
-    osd_pools = defaultdict(set)
+def get_device_health(osd_id, hostname):
+    """Get device SMART health metrics from the appropriate host"""
+    # First try ceph's built-in health metrics
+    data = run_command(f"ceph device query-daemon-health-metrics osd.{osd_id} -f json 2>/dev/null", parse_json=True)
 
-    if pg_data and 'pg_map' in pg_data and 'pg_stats' in pg_data['pg_map']:
-        for pg in pg_data['pg_map']['pg_stats']:
-            pool_id = pg['pgid'].split('.')[0]
-            for osd in pg.get('acting', []):
-                osd_pools[osd].add(int(pool_id))
+    if data and ('ata_smart_attributes' in data or 'nvme_smart_health_information_log' in data):
+        return data
 
-    return osd_pools
+    # If that fails, get device path and query via SSH
+    device_path = get_device_path_for_osd(osd_id, hostname)
+    if device_path:
+        return get_smart_data_remote(device_path, hostname)
+
+    return None
 
 def parse_smart_health(smart_data):
     """Parse SMART data and calculate health score"""
@@ -201,7 +252,7 @@ def parse_smart_health(smart_data):
 
     return max(0, score), issues, metrics
 
-def calculate_capacity_score(osd_data, host_osds, osd_class):
+def calculate_capacity_score(osd_data, host_osds_data, osd_class):
     """Calculate score based on capacity optimization potential"""
     score = 0.0
     factors = []
@@ -232,31 +283,40 @@ def calculate_capacity_score(osd_data, host_osds, osd_class):
         factors.append(f"Medium utilization ({utilization:.1f}%)")
 
     # Host balance consideration
-    host_total_weight = sum(o.get('crush_weight', 0) for o in host_osds if o.get('device_class') == osd_class)
-    host_avg_weight = host_total_weight / len([o for o in host_osds if o.get('device_class') == osd_class]) if host_osds else 0
-
-    if weight < host_avg_weight * 0.5:
-        score += 15
-        factors.append(f"Below host average ({host_avg_weight:.1f}TB) - improves balance")
+    same_class_osds = [o for o in host_osds_data if o.get('device_class') == osd_class]
+    if same_class_osds:
+        host_total_weight = sum(o.get('crush_weight', 0) for o in same_class_osds)
+        host_avg_weight = host_total_weight / len(same_class_osds)
+
+        if weight < host_avg_weight * 0.5:
+            score += 15
+            factors.append(f"Below host average ({host_avg_weight:.1f}TB) - improves balance")
 
     return score, factors
 
-def calculate_resilience_score(osd_data, host_data, all_hosts):
+def calculate_resilience_score(osd_data, host_name, all_hosts_data, osd_tree):
     """Calculate score based on cluster resilience improvement"""
     score = 0.0
     factors = []
 
-    host_name = host_data['name']
     osd_class = osd_data.get('device_class', 'hdd')
 
     # Count OSDs per host by class
     host_class_counts = {}
-    for host in all_hosts:
-        host_class_counts[host['name']] = {
-            'hdd': len([o for o in host.get('children', []) if o.get('device_class') == 'hdd' and o.get('status') == 'up']),
-            'nvme': len([o for o in host.get('children', []) if o.get('device_class') == 'nvme' and o.get('status') == 'up'])
+    for host_node in [n for n in osd_tree['nodes'] if n['type'] == 'host']:
+        h_name = host_node['name']
+        host_osds = [n for n in osd_tree['nodes']
+                     if n.get('id') in host_node.get('children', [])
+                     and n.get('type') == 'osd']
+
+        host_class_counts[h_name] = {
+            'hdd': len([o for o in host_osds if o.get('device_class') == 'hdd' and o.get('status') == 'up']),
+            'nvme': len([o for o in host_osds if o.get('device_class') == 'nvme' and o.get('status') == 'up'])
         }
 
+    if host_name not in host_class_counts:
+        return 0, ["Host not found in cluster"]
+
     current_count = host_class_counts[host_name][osd_class]
     avg_count = sum(h[osd_class] for h in host_class_counts.values()) / len(host_class_counts)
 
@@ -269,10 +329,14 @@ def calculate_resilience_score(osd_data, host_data, all_hosts):
         factors.append(f"Host slightly above average {osd_class} count")
 
     # Check for down OSDs on same host (indicates potential issues)
-    down_osds = [o for o in host_data.get('children', []) if o.get('status') == 'down']
-    if down_osds:
-        score += 15
-        factors.append(f"Host has {len(down_osds)} down OSD(s) - may have hardware issues")
+    host_node = next((n for n in osd_tree['nodes'] if n['type'] == 'host' and n['name'] == host_name), None)
+    if host_node:
+        down_osds = [n for n in osd_tree['nodes']
+                     if n.get('id') in host_node.get('children', [])
+                     and n.get('status') == 'down']
+        if down_osds:
+            score += 15
+            factors.append(f"Host has {len(down_osds)} down OSD(s) - may have hardware issues")
 
     return score, factors
 
@@ -323,6 +387,9 @@ def analyze_cluster():
         print(f"{Colors.RED}Failed to gather cluster data{Colors.END}")
         return
 
+    # Build OSD to host mapping
+    osd_to_host = get_osd_host_mapping(osd_tree)
+
     # Parse OSD data
     osd_df_map = {node['id']: node for node in osd_df['nodes']}
     osd_perf_map = {p['id']: p for p in osd_perf.get('osd_perf_infos', [])} if osd_perf else {}
@@ -331,77 +398,87 @@ def analyze_cluster():
     pg_counts = [node['pgs'] for node in osd_df['nodes'] if node.get('pgs', 0) > 0]
     avg_pg_count = sum(pg_counts) / len(pg_counts) if pg_counts else 0
 
-    # Build host map
-    hosts = [node for node in osd_tree['nodes'] if node['type'] == 'host']
-    host_map = {host['id']: host for host in hosts}
+    # Build host data map
+    host_osds_map = defaultdict(list)
+    for node in osd_tree['nodes']:
+        if node['type'] == 'osd' and node.get('status') == 'up':
+            host_name = osd_to_host.get(node['id'])
+            if host_name:
+                osd_df_data = osd_df_map.get(node['id'], {})
+                host_osds_map[host_name].append({
+                    'id': node['id'],
+                    'device_class': node.get('device_class', 'hdd'),
+                    'crush_weight': osd_df_data.get('crush_weight', 0)
+                })
 
     # Analyze each OSD
     candidates = []
-    print("Analyzing OSDs...\n")
+    print("Analyzing OSDs across all cluster nodes...\n")
 
-    for host in hosts:
-        host_osds = [node for node in osd_tree['nodes'] if node.get('id') in [c for c in host.get('children', [])]]
+    total_osds = len([n for n in osd_tree['nodes'] if n['type'] == 'osd' and n.get('status') == 'up'])
+    current_osd = 0
+
+    for node in osd_tree['nodes']:
+        if node['type'] != 'osd' or node.get('status') != 'up':
+            continue
 
-        for osd_id in host.get('children', []):
-            osd_node = next((n for n in osd_tree['nodes'] if n['id'] == osd_id), None)
-            if not osd_node or osd_node.get('status') != 'up':
-                continue
-
-            osd_id_num = osd_node['id']
-            osd_name = osd_node['name']
-            device_class = osd_node.get('device_class', 'hdd')
-
-            print(f"Analyzing {osd_name} ({device_class})...", end='\r')
-
-            # Get OSD data
-            osd_df_data = osd_df_map.get(osd_id_num, {})
-            osd_perf_data = osd_perf_map.get(osd_id_num, {})
-
-            # SMART health analysis
-            health_data = get_device_health(osd_id_num)
-            health_score, health_issues, health_metrics = parse_smart_health(health_data)
-
-            # Capacity optimization score
-            capacity_score, capacity_factors = calculate_capacity_score(
-                osd_df_data, host_osds, device_class
-            )
-
-            # Resilience score
-            resilience_score, resilience_factors = calculate_resilience_score(
-                osd_node, host, hosts
-            )
-
-            # Performance score
-            performance_score, performance_factors = calculate_performance_score(
-                osd_perf_data, osd_df_data.get('pgs', 0), avg_pg_count
-            )
-
-            # Calculate total score (weighted)
-            total_score = (
-                (100 - health_score) * 0.40 +  # Health is most important
-                capacity_score * 0.30 +  # Capacity optimization
-                resilience_score * 0.20 +  # Cluster resilience
-                performance_score * 0.10  # Performance issues
-            )
-
-            candidates.append({
-                'osd_id': osd_id_num,
-                'osd_name': osd_name,
-                'host': host['name'],
-                'device_class': device_class,
-                'weight': osd_df_data.get('crush_weight', 0),
-                'size': osd_df_data.get('kb', 0) / 1024 / 1024 / 1024,  # TB
-                'utilization': osd_df_data.get('utilization', 0),
-                'pgs': osd_df_data.get('pgs', 0),
-                'total_score': total_score,
-                'health_score': health_score,
-                'health_issues': health_issues,
-                'health_metrics': health_metrics,
-                'capacity_factors': capacity_factors,
-                'resilience_factors': resilience_factors,
-                'performance_factors': performance_factors,
-            })
+        current_osd += 1
+        osd_id = node['id']
+        osd_name = node['name']
+        device_class = node.get('device_class', 'hdd')
+        host_name = osd_to_host.get(osd_id, 'unknown')
+
+        print(f"[{current_osd}/{total_osds}] Analyzing {osd_name} on {host_name} ({device_class})...".ljust(80), end='\r')
+
+        # Get OSD data
+        osd_df_data = osd_df_map.get(osd_id, {})
+        osd_perf_data = osd_perf_map.get(osd_id, {})
+
+        # SMART health analysis - query from the correct host
+        health_data = get_device_health(osd_id, host_name)
+        health_score, health_issues, health_metrics = parse_smart_health(health_data)
+
+        # Capacity optimization score
+        capacity_score, capacity_factors = calculate_capacity_score(
+            osd_df_data, host_osds_map.get(host_name, []), device_class
+        )
+
+        # Resilience score
+        resilience_score, resilience_factors = calculate_resilience_score(
+            node, host_name, host_osds_map, osd_tree
+        )
+
+        # Performance score
+        performance_score, performance_factors = calculate_performance_score(
+            osd_perf_data, osd_df_data.get('pgs', 0), avg_pg_count
+        )
+
+        # Calculate total score (weighted)
+        total_score = (
+            (100 - health_score) * 0.40 +  # Health is most important
+            capacity_score * 0.30 +  # Capacity optimization
+            resilience_score * 0.20 +  # Cluster resilience
+            performance_score * 0.10  # Performance issues
+        )
+
+        candidates.append({
+            'osd_id': osd_id,
+            'osd_name': osd_name,
+            'host': host_name,
+            'device_class': device_class,
+            'weight': osd_df_data.get('crush_weight', 0),
+            'size': osd_df_data.get('kb', 0) / 1024 / 1024 / 1024,  # TB
+            'utilization': osd_df_data.get('utilization', 0),
+            'pgs': osd_df_data.get('pgs', 0),
+            'total_score': total_score,
+            'health_score': health_score,
+            'health_issues': health_issues,
+            'health_metrics': health_metrics,
+            'capacity_factors': capacity_factors,
+            'resilience_factors': resilience_factors,
+            'performance_factors': performance_factors,
+        })
 
     print(" " * 80, end='\r')  # Clear the line
 
@@ -409,7 +486,7 @@ def analyze_cluster():
     candidates.sort(key=lambda x: x['total_score'], reverse=True)
 
     # Display results
-    print(f"\n{Colors.BOLD}{Colors.CYAN}=== TOP REPLACEMENT CANDIDATES ==={Colors.END}\n")
+    print(f"\n{Colors.BOLD}{Colors.CYAN}=== TOP REPLACEMENT CANDIDATES (ALL HOSTS) ==={Colors.END}\n")
 
     for rank, candidate in enumerate(candidates[:15], 1):
         score_color = Colors.RED if candidate['total_score'] > 50 else Colors.YELLOW if candidate['total_score'] > 30 else Colors.GREEN
@@ -455,9 +532,22 @@ def analyze_cluster():
         print(f" Host: {top_candidate['host']}")
         print(f" Capacity gain potential: {top_candidate['weight']:.2f} TB")
         print()
+
+    # Summary by host
+    print(f"\n{Colors.BOLD}{Colors.CYAN}=== TOP CANDIDATES BY HOST ==={Colors.END}\n")
+    hosts_seen = set()
+    for candidate in candidates:
+        if candidate['host'] not in hosts_seen and len(hosts_seen) < 5:
+            hosts_seen.add(candidate['host'])
+            print(f"{Colors.BOLD}{candidate['host']}:{Colors.END}")
+            print(f" Top candidate: {candidate['osd_name']} (score: {candidate['total_score']:.1f})")
+            print(f" {candidate['device_class'].upper()}, {candidate['weight']:.2f} TB, {candidate['utilization']:.1f}% used")
+            if candidate['health_issues']:
+                print(f" Issues: {candidate['health_issues'][0]}")
+            print()
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description='Analyze Ceph OSDs for replacement candidates')
+    parser = argparse.ArgumentParser(description='Analyze Ceph OSDs for replacement candidates across entire cluster')
     parser.add_argument('--class', dest='device_class', choices=['hdd', 'nvme'],
                         help='Filter by device class')
     parser.add_argument('--min-size', type=float, default=0,
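
Reviewer note: the new get_osd_host_mapping() assumes the `ceph osd tree -f json`
payload is a flat "nodes" list in which each host entry carries its OSDs' numeric
ids in "children". A minimal sketch of that shape (the hostnames and ids below are
fabricated sample data, not output from a real cluster):

    # Hypothetical tree shaped like `ceph osd tree -f json` output
    sample_tree = {
        "nodes": [
            {"id": -1, "type": "root", "name": "default", "children": [-2, -3]},
            {"id": -2, "type": "host", "name": "node-a", "children": [0, 1]},
            {"id": -3, "type": "host", "name": "node-b", "children": [2]},
            {"id": 0, "type": "osd", "name": "osd.0", "status": "up"},
            {"id": 1, "type": "osd", "name": "osd.1", "status": "up"},
            {"id": 2, "type": "osd", "name": "osd.2", "status": "down"},
        ]
    }

    # Only host-type nodes are walked, so the root's children (host ids) are
    # ignored, and down OSDs still get a host mapping.
    assert get_osd_host_mapping(sample_tree) == {0: "node-a", 1: "node-a", 2: "node-b"}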
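Reviewer note: the composite ranking inverts the health score (an unhealthy disk
should rank high for replacement) and weights the four components 40/30/20/10.
A worked example with hypothetical component scores:

    # Hypothetical inputs: a worn disk with moderate secondary signals
    health_score = 35       # out of 100; lower = worse health
    capacity_score = 20
    resilience_score = 10
    performance_score = 5

    total_score = (
        (100 - health_score) * 0.40 +  # 65 * 0.40 = 26.0
        capacity_score * 0.30 +        # 20 * 0.30 =  6.0
        resilience_score * 0.20 +      # 10 * 0.20 =  2.0
        performance_score * 0.10       #  5 * 0.10 =  0.5
    )
    assert abs(total_score - 34.5) < 1e-9  # > 30, so it renders in the YELLOW band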