removed pg and latency counters
This commit is contained in:
@@ -67,9 +67,7 @@ def get_osd_metadata(osd_id):
|
|||||||
"""Get metadata for specific OSD"""
|
"""Get metadata for specific OSD"""
|
||||||
return run_command(f"ceph osd metadata osd.{osd_id} -f json", parse_json=True)
|
return run_command(f"ceph osd metadata osd.{osd_id} -f json", parse_json=True)
|
||||||
|
|
||||||
def get_osd_perf():
|
# Performance metrics removed for simplicity
|
||||||
"""Get OSD performance statistics"""
|
|
||||||
return run_command("ceph osd perf -f json", parse_json=True)
|
|
||||||
|
|
||||||
def get_osd_host_mapping(osd_tree):
|
def get_osd_host_mapping(osd_tree):
|
||||||
"""Build mapping of OSD ID to hostname"""
|
"""Build mapping of OSD ID to hostname"""
|
||||||
@@ -344,38 +342,7 @@ def calculate_resilience_score(osd_data, host_name, all_hosts_data, osd_tree):
|
|||||||
|
|
||||||
return score, factors
|
return score, factors
|
||||||
|
|
||||||
def calculate_performance_score(osd_perf_data, pg_count, avg_pg_count):
|
# Performance metrics removed for simplicity
|
||||||
"""Calculate score based on performance metrics"""
|
|
||||||
score = 0.0
|
|
||||||
factors = []
|
|
||||||
|
|
||||||
if not osd_perf_data:
|
|
||||||
return 0, ["No performance data available"]
|
|
||||||
|
|
||||||
commit_latency = osd_perf_data.get('commit_latency_ms', 0)
|
|
||||||
apply_latency = osd_perf_data.get('apply_latency_ms', 0)
|
|
||||||
|
|
||||||
# High latency indicates slow drive
|
|
||||||
if commit_latency > 50:
|
|
||||||
score += 15
|
|
||||||
factors.append(f"High commit latency ({commit_latency}ms)")
|
|
||||||
elif commit_latency > 30:
|
|
||||||
score += 8
|
|
||||||
factors.append(f"Elevated commit latency ({commit_latency}ms)")
|
|
||||||
|
|
||||||
if apply_latency > 50:
|
|
||||||
score += 15
|
|
||||||
factors.append(f"High apply latency ({apply_latency}ms)")
|
|
||||||
|
|
||||||
# PG imbalance
|
|
||||||
if pg_count > avg_pg_count * 1.3:
|
|
||||||
score += 10
|
|
||||||
factors.append(f"High PG count ({pg_count} vs avg {avg_pg_count:.0f})")
|
|
||||||
elif pg_count < avg_pg_count * 0.7:
|
|
||||||
score -= 5
|
|
||||||
factors.append(f"Low PG count ({pg_count}) - already underutilized")
|
|
||||||
|
|
||||||
return score, factors
|
|
||||||
|
|
||||||
def analyze_cluster():
|
def analyze_cluster():
|
||||||
"""Main analysis function"""
|
"""Main analysis function"""
|
||||||
@@ -385,7 +352,6 @@ def analyze_cluster():
|
|||||||
print("Gathering cluster data...")
|
print("Gathering cluster data...")
|
||||||
osd_tree = get_osd_tree()
|
osd_tree = get_osd_tree()
|
||||||
osd_df = get_osd_df()
|
osd_df = get_osd_df()
|
||||||
osd_perf = get_osd_perf()
|
|
||||||
|
|
||||||
if not osd_tree or not osd_df:
|
if not osd_tree or not osd_df:
|
||||||
print(f"{Colors.RED}Failed to gather cluster data{Colors.END}")
|
print(f"{Colors.RED}Failed to gather cluster data{Colors.END}")
|
||||||
@@ -396,11 +362,6 @@ def analyze_cluster():
|
|||||||
|
|
||||||
# Parse OSD data
|
# Parse OSD data
|
||||||
osd_df_map = {node['id']: node for node in osd_df['nodes']}
|
osd_df_map = {node['id']: node for node in osd_df['nodes']}
|
||||||
osd_perf_map = {p['id']: p for p in osd_perf.get('osd_perf_infos', [])} if osd_perf else {}
|
|
||||||
|
|
||||||
# Calculate average PG count
|
|
||||||
pg_counts = [node['pgs'] for node in osd_df['nodes'] if node.get('pgs', 0) > 0]
|
|
||||||
avg_pg_count = sum(pg_counts) / len(pg_counts) if pg_counts else 0
|
|
||||||
|
|
||||||
# Build host data map
|
# Build host data map
|
||||||
host_osds_map = defaultdict(list)
|
host_osds_map = defaultdict(list)
|
||||||
@@ -438,7 +399,6 @@ def analyze_cluster():
|
|||||||
|
|
||||||
# Get OSD data
|
# Get OSD data
|
||||||
osd_df_data = osd_df_map.get(osd_id, {})
|
osd_df_data = osd_df_map.get(osd_id, {})
|
||||||
osd_perf_data = osd_perf_map.get(osd_id, {})
|
|
||||||
|
|
||||||
# SMART health analysis
|
# SMART health analysis
|
||||||
health_data = get_device_health(osd_id, host_name)
|
health_data = get_device_health(osd_id, host_name)
|
||||||
@@ -457,17 +417,11 @@ def analyze_cluster():
|
|||||||
node, host_name, host_osds_map, osd_tree
|
node, host_name, host_osds_map, osd_tree
|
||||||
)
|
)
|
||||||
|
|
||||||
# Performance score
|
# Calculate total score (weighted: 60% health, 30% capacity, 10% resilience)
|
||||||
performance_score, performance_factors = calculate_performance_score(
|
|
||||||
osd_perf_data, osd_df_data.get('pgs', 0), avg_pg_count
|
|
||||||
)
|
|
||||||
|
|
||||||
# Calculate total score (weighted)
|
|
||||||
total_score = (
|
total_score = (
|
||||||
(100 - health_score) * 0.40 + # Health is most important
|
(100 - health_score) * 0.60 + # Health is most important
|
||||||
capacity_score * 0.30 + # Capacity optimization
|
capacity_score * 0.30 + # Capacity optimization
|
||||||
resilience_score * 0.20 + # Cluster resilience
|
resilience_score * 0.10 # Cluster resilience
|
||||||
performance_score * 0.10 # Performance issues
|
|
||||||
)
|
)
|
||||||
|
|
||||||
candidates.append({
|
candidates.append({
|
||||||
@@ -478,14 +432,12 @@ def analyze_cluster():
|
|||||||
'weight': osd_df_data.get('crush_weight', 0),
|
'weight': osd_df_data.get('crush_weight', 0),
|
||||||
'size': osd_df_data.get('kb', 0) / 1024 / 1024 / 1024, # TB
|
'size': osd_df_data.get('kb', 0) / 1024 / 1024 / 1024, # TB
|
||||||
'utilization': osd_df_data.get('utilization', 0),
|
'utilization': osd_df_data.get('utilization', 0),
|
||||||
'pgs': osd_df_data.get('pgs', 0),
|
|
||||||
'total_score': total_score,
|
'total_score': total_score,
|
||||||
'health_score': health_score,
|
'health_score': health_score,
|
||||||
'health_issues': health_issues,
|
'health_issues': health_issues,
|
||||||
'health_metrics': health_metrics,
|
'health_metrics': health_metrics,
|
||||||
'capacity_factors': capacity_factors,
|
'capacity_factors': capacity_factors,
|
||||||
'resilience_factors': resilience_factors,
|
'resilience_factors': resilience_factors,
|
||||||
'performance_factors': performance_factors,
|
|
||||||
})
|
})
|
||||||
|
|
||||||
print(" " * 80, end='\r')
|
print(" " * 80, end='\r')
|
||||||
@@ -512,7 +464,7 @@ def analyze_cluster():
|
|||||||
print(f"{Colors.BOLD}#{rank} - {candidate['osd_name']} ({candidate['device_class'].upper()}){Colors.END}")
|
print(f"{Colors.BOLD}#{rank} - {candidate['osd_name']} ({candidate['device_class'].upper()}){Colors.END}")
|
||||||
print(f" Host: {candidate['host']}")
|
print(f" Host: {candidate['host']}")
|
||||||
print(f" Size: {candidate['size']:.2f} TB (weight: {candidate['weight']:.2f})")
|
print(f" Size: {candidate['size']:.2f} TB (weight: {candidate['weight']:.2f})")
|
||||||
print(f" Utilization: {candidate['utilization']:.1f}% | PGs: {candidate['pgs']}")
|
print(f" Utilization: {candidate['utilization']:.1f}%")
|
||||||
print(f" {score_color}Replacement Score: {candidate['total_score']:.1f}/100{Colors.END}")
|
print(f" {score_color}Replacement Score: {candidate['total_score']:.1f}/100{Colors.END}")
|
||||||
print(f" {health_color}Health Score: {candidate['health_score']:.1f}/100{Colors.END}")
|
print(f" {health_color}Health Score: {candidate['health_score']:.1f}/100{Colors.END}")
|
||||||
|
|
||||||
@@ -527,15 +479,10 @@ def analyze_cluster():
|
|||||||
print(f" • {factor}")
|
print(f" • {factor}")
|
||||||
|
|
||||||
if candidate['resilience_factors']:
|
if candidate['resilience_factors']:
|
||||||
print(f" Resilience Impact:")
|
print(f" Host Distribution:")
|
||||||
for factor in candidate['resilience_factors'][:2]:
|
for factor in candidate['resilience_factors'][:2]:
|
||||||
print(f" • {factor}")
|
print(f" • {factor}")
|
||||||
|
|
||||||
if candidate['performance_factors']:
|
|
||||||
print(f" Performance Metrics:")
|
|
||||||
for factor in candidate['performance_factors'][:2]:
|
|
||||||
print(f" • {factor}")
|
|
||||||
|
|
||||||
print()
|
print()
|
||||||
|
|
||||||
# Summary by class
|
# Summary by class
|
||||||
|
|||||||
Reference in New Issue
Block a user