From 1610aa2606815756a59119eed92228a24516f324 Mon Sep 17 00:00:00 2001 From: Jared Vititoe Date: Mon, 22 Dec 2025 17:14:02 -0500 Subject: [PATCH] removed pg and latency counters --- ceph_osd_analyzer.py | 67 +++++--------------------------------------- 1 file changed, 7 insertions(+), 60 deletions(-) diff --git a/ceph_osd_analyzer.py b/ceph_osd_analyzer.py index db1a9e1..4b90d39 100644 --- a/ceph_osd_analyzer.py +++ b/ceph_osd_analyzer.py @@ -67,9 +67,7 @@ def get_osd_metadata(osd_id): """Get metadata for specific OSD""" return run_command(f"ceph osd metadata osd.{osd_id} -f json", parse_json=True) -def get_osd_perf(): - """Get OSD performance statistics""" - return run_command("ceph osd perf -f json", parse_json=True) +# Performance metrics removed for simplicity def get_osd_host_mapping(osd_tree): """Build mapping of OSD ID to hostname""" @@ -344,38 +342,7 @@ def calculate_resilience_score(osd_data, host_name, all_hosts_data, osd_tree): return score, factors -def calculate_performance_score(osd_perf_data, pg_count, avg_pg_count): - """Calculate score based on performance metrics""" - score = 0.0 - factors = [] - - if not osd_perf_data: - return 0, ["No performance data available"] - - commit_latency = osd_perf_data.get('commit_latency_ms', 0) - apply_latency = osd_perf_data.get('apply_latency_ms', 0) - - # High latency indicates slow drive - if commit_latency > 50: - score += 15 - factors.append(f"High commit latency ({commit_latency}ms)") - elif commit_latency > 30: - score += 8 - factors.append(f"Elevated commit latency ({commit_latency}ms)") - - if apply_latency > 50: - score += 15 - factors.append(f"High apply latency ({apply_latency}ms)") - - # PG imbalance - if pg_count > avg_pg_count * 1.3: - score += 10 - factors.append(f"High PG count ({pg_count} vs avg {avg_pg_count:.0f})") - elif pg_count < avg_pg_count * 0.7: - score -= 5 - factors.append(f"Low PG count ({pg_count}) - already underutilized") - - return score, factors +# Performance metrics removed for simplicity def analyze_cluster(): """Main analysis function""" @@ -385,7 +352,6 @@ def analyze_cluster(): print("Gathering cluster data...") osd_tree = get_osd_tree() osd_df = get_osd_df() - osd_perf = get_osd_perf() if not osd_tree or not osd_df: print(f"{Colors.RED}Failed to gather cluster data{Colors.END}") @@ -396,11 +362,6 @@ def analyze_cluster(): # Parse OSD data osd_df_map = {node['id']: node for node in osd_df['nodes']} - osd_perf_map = {p['id']: p for p in osd_perf.get('osd_perf_infos', [])} if osd_perf else {} - - # Calculate average PG count - pg_counts = [node['pgs'] for node in osd_df['nodes'] if node.get('pgs', 0) > 0] - avg_pg_count = sum(pg_counts) / len(pg_counts) if pg_counts else 0 # Build host data map host_osds_map = defaultdict(list) @@ -438,7 +399,6 @@ def analyze_cluster(): # Get OSD data osd_df_data = osd_df_map.get(osd_id, {}) - osd_perf_data = osd_perf_map.get(osd_id, {}) # SMART health analysis health_data = get_device_health(osd_id, host_name) @@ -457,17 +417,11 @@ def analyze_cluster(): node, host_name, host_osds_map, osd_tree ) - # Performance score - performance_score, performance_factors = calculate_performance_score( - osd_perf_data, osd_df_data.get('pgs', 0), avg_pg_count - ) - - # Calculate total score (weighted) + # Calculate total score (weighted: 60% health, 30% capacity, 10% resilience) total_score = ( - (100 - health_score) * 0.40 + # Health is most important + (100 - health_score) * 0.60 + # Health is most important capacity_score * 0.30 + # Capacity optimization - resilience_score * 0.20 + # Cluster resilience - performance_score * 0.10 # Performance issues + resilience_score * 0.10 # Cluster resilience ) candidates.append({ @@ -478,14 +432,12 @@ def analyze_cluster(): 'weight': osd_df_data.get('crush_weight', 0), 'size': osd_df_data.get('kb', 0) / 1024 / 1024 / 1024, # TB 'utilization': osd_df_data.get('utilization', 0), - 'pgs': osd_df_data.get('pgs', 0), 'total_score': total_score, 'health_score': health_score, 'health_issues': health_issues, 'health_metrics': health_metrics, 'capacity_factors': capacity_factors, 'resilience_factors': resilience_factors, - 'performance_factors': performance_factors, }) print(" " * 80, end='\r') @@ -512,7 +464,7 @@ def analyze_cluster(): print(f"{Colors.BOLD}#{rank} - {candidate['osd_name']} ({candidate['device_class'].upper()}){Colors.END}") print(f" Host: {candidate['host']}") print(f" Size: {candidate['size']:.2f} TB (weight: {candidate['weight']:.2f})") - print(f" Utilization: {candidate['utilization']:.1f}% | PGs: {candidate['pgs']}") + print(f" Utilization: {candidate['utilization']:.1f}%") print(f" {score_color}Replacement Score: {candidate['total_score']:.1f}/100{Colors.END}") print(f" {health_color}Health Score: {candidate['health_score']:.1f}/100{Colors.END}") @@ -527,15 +479,10 @@ def analyze_cluster(): print(f" • {factor}") if candidate['resilience_factors']: - print(f" Resilience Impact:") + print(f" Host Distribution:") for factor in candidate['resilience_factors'][:2]: print(f" • {factor}") - if candidate['performance_factors']: - print(f" Performance Metrics:") - for factor in candidate['performance_factors'][:2]: - print(f" • {factor}") - print() # Summary by class