Removed PG and latency counters

This commit is contained in:
2025-12-22 17:14:02 -05:00
parent db757345fb
commit 1610aa2606

View File

@@ -67,9 +67,7 @@ def get_osd_metadata(osd_id):
"""Get metadata for specific OSD"""
return run_command(f"ceph osd metadata osd.{osd_id} -f json", parse_json=True)
def get_osd_perf():
    """Fetch per-OSD performance statistics from the cluster.

    Runs ``ceph osd perf`` and returns its parsed JSON output (or
    whatever ``run_command`` yields on failure).
    """
    cmd = "ceph osd perf -f json"
    return run_command(cmd, parse_json=True)
# Performance metrics removed for simplicity
def get_osd_host_mapping(osd_tree):
"""Build mapping of OSD ID to hostname"""
@@ -344,38 +342,7 @@ def calculate_resilience_score(osd_data, host_name, all_hosts_data, osd_tree):
return score, factors
def calculate_performance_score(osd_perf_data, pg_count, avg_pg_count):
    """Score an OSD on performance signals (latency and PG balance).

    A higher score marks a stronger replacement candidate.  Latency
    thresholds flag slow drives; comparing the OSD's PG count against the
    cluster-wide average flags placement-group imbalance.

    Returns a ``(score, factors)`` tuple, where ``factors`` is a list of
    human-readable strings explaining each contribution.
    """
    # Without perf data there is nothing to score.
    if not osd_perf_data:
        return 0, ["No performance data available"]

    total = 0.0
    reasons = []

    commit_ms = osd_perf_data.get('commit_latency_ms', 0)
    apply_ms = osd_perf_data.get('apply_latency_ms', 0)

    # Slow drives show up as elevated commit/apply latency.
    if commit_ms > 50:
        total += 15
        reasons.append(f"High commit latency ({commit_ms}ms)")
    elif commit_ms > 30:
        total += 8
        reasons.append(f"Elevated commit latency ({commit_ms}ms)")

    if apply_ms > 50:
        total += 15
        reasons.append(f"High apply latency ({apply_ms}ms)")

    # PG imbalance relative to the cluster average (30% bands either way).
    if pg_count > avg_pg_count * 1.3:
        total += 10
        reasons.append(f"High PG count ({pg_count} vs avg {avg_pg_count:.0f})")
    elif pg_count < avg_pg_count * 0.7:
        # An underloaded OSD is a less urgent replacement target.
        total -= 5
        reasons.append(f"Low PG count ({pg_count}) - already underutilized")

    return total, reasons
# Performance metrics removed for simplicity
def analyze_cluster():
"""Main analysis function"""
@@ -385,7 +352,6 @@ def analyze_cluster():
print("Gathering cluster data...")
osd_tree = get_osd_tree()
osd_df = get_osd_df()
osd_perf = get_osd_perf()
if not osd_tree or not osd_df:
print(f"{Colors.RED}Failed to gather cluster data{Colors.END}")
@@ -396,11 +362,6 @@ def analyze_cluster():
# Parse OSD data
osd_df_map = {node['id']: node for node in osd_df['nodes']}
osd_perf_map = {p['id']: p for p in osd_perf.get('osd_perf_infos', [])} if osd_perf else {}
# Calculate average PG count
pg_counts = [node['pgs'] for node in osd_df['nodes'] if node.get('pgs', 0) > 0]
avg_pg_count = sum(pg_counts) / len(pg_counts) if pg_counts else 0
# Build host data map
host_osds_map = defaultdict(list)
@@ -438,7 +399,6 @@ def analyze_cluster():
# Get OSD data
osd_df_data = osd_df_map.get(osd_id, {})
osd_perf_data = osd_perf_map.get(osd_id, {})
# SMART health analysis
health_data = get_device_health(osd_id, host_name)
@@ -457,17 +417,11 @@ def analyze_cluster():
node, host_name, host_osds_map, osd_tree
)
# Performance score
performance_score, performance_factors = calculate_performance_score(
osd_perf_data, osd_df_data.get('pgs', 0), avg_pg_count
)
# Calculate total score (weighted)
# Calculate total score (weighted: 60% health, 30% capacity, 10% resilience)
total_score = (
(100 - health_score) * 0.40 + # Health is most important
(100 - health_score) * 0.60 + # Health is most important
capacity_score * 0.30 + # Capacity optimization
resilience_score * 0.20 + # Cluster resilience
performance_score * 0.10 # Performance issues
resilience_score * 0.10 # Cluster resilience
)
candidates.append({
@@ -478,14 +432,12 @@ def analyze_cluster():
'weight': osd_df_data.get('crush_weight', 0),
'size': osd_df_data.get('kb', 0) / 1024 / 1024 / 1024, # TB
'utilization': osd_df_data.get('utilization', 0),
'pgs': osd_df_data.get('pgs', 0),
'total_score': total_score,
'health_score': health_score,
'health_issues': health_issues,
'health_metrics': health_metrics,
'capacity_factors': capacity_factors,
'resilience_factors': resilience_factors,
'performance_factors': performance_factors,
})
print(" " * 80, end='\r')
@@ -512,7 +464,7 @@ def analyze_cluster():
print(f"{Colors.BOLD}#{rank} - {candidate['osd_name']} ({candidate['device_class'].upper()}){Colors.END}")
print(f" Host: {candidate['host']}")
print(f" Size: {candidate['size']:.2f} TB (weight: {candidate['weight']:.2f})")
print(f" Utilization: {candidate['utilization']:.1f}% | PGs: {candidate['pgs']}")
print(f" Utilization: {candidate['utilization']:.1f}%")
print(f" {score_color}Replacement Score: {candidate['total_score']:.1f}/100{Colors.END}")
print(f" {health_color}Health Score: {candidate['health_score']:.1f}/100{Colors.END}")
@@ -527,15 +479,10 @@ def analyze_cluster():
print(f"{factor}")
if candidate['resilience_factors']:
print(f" Resilience Impact:")
print(f" Host Distribution:")
for factor in candidate['resilience_factors'][:2]:
print(f"{factor}")
if candidate['performance_factors']:
print(f" Performance Metrics:")
for factor in candidate['performance_factors'][:2]:
print(f"{factor}")
print()
# Summary by class