Enable SSH to all hosts to gather SMART data

This commit is contained in:
2025-12-22 16:50:04 -05:00
parent a861276013
commit 43d35feb46

View File

@@ -3,7 +3,7 @@
Advanced Ceph OSD Replacement Candidate Analyzer Advanced Ceph OSD Replacement Candidate Analyzer
This script identifies the best OSD replacement candidates by analyzing: This script identifies the best OSD replacement candidates by analyzing:
- SMART health data (wear, errors, temperature) - SMART health data (wear, errors, temperature) from ALL cluster nodes
- Capacity utilization and imbalance - Capacity utilization and imbalance
- Host-level distribution and resilience - Host-level distribution and resilience
- Age and performance metrics - Age and performance metrics
@@ -29,16 +29,19 @@ class Colors:
BOLD = '\033[1m' BOLD = '\033[1m'
END = '\033[0m' END = '\033[0m'
def run_command(cmd, parse_json=False): def run_command(cmd, parse_json=False, host=None):
"""Execute shell command and return output""" """Execute shell command locally or via SSH and return output"""
try: try:
if host:
cmd = f"ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5 {host} '{cmd}'"
result = subprocess.run(cmd, shell=True, capture_output=True, text=True, check=True) result = subprocess.run(cmd, shell=True, capture_output=True, text=True, check=True)
if parse_json: if parse_json:
return json.loads(result.stdout) return json.loads(result.stdout)
return result.stdout.strip() return result.stdout.strip()
except subprocess.CalledProcessError as e: except subprocess.CalledProcessError as e:
print(f"{Colors.RED}Error executing: {cmd}{Colors.END}") if host:
print(f"{Colors.RED}{e.stderr}{Colors.END}") print(f"{Colors.YELLOW}Warning: Failed to execute on {host}: {cmd}{Colors.END}")
return None if parse_json else "" return None if parse_json else ""
except json.JSONDecodeError as e: except json.JSONDecodeError as e:
print(f"{Colors.RED}Error parsing JSON from: {cmd}{Colors.END}") print(f"{Colors.RED}Error parsing JSON from: {cmd}{Colors.END}")
@@ -64,38 +67,86 @@ def get_pg_dump():
"""Get PG dump for distribution analysis""" """Get PG dump for distribution analysis"""
return run_command("ceph pg dump -f json 2>/dev/null", parse_json=True) return run_command("ceph pg dump -f json 2>/dev/null", parse_json=True)
def get_osd_host_mapping(osd_tree):
    """Build a mapping of OSD ID to the name of the host that lists it.

    Args:
        osd_tree: Parsed tree dict with a ``'nodes'`` list. Nodes whose
            ``'type'`` is ``'host'`` contribute their ``'name'`` for every
            ID in their ``'children'`` list.

    Returns:
        dict mapping each child ID found under a host node to that host's
        name. Empty when ``osd_tree`` is falsy or contains no host nodes.
    """
    osd_to_host = {}
    if not osd_tree:
        return osd_to_host

    for node in osd_tree.get('nodes') or []:
        # Non-host nodes (and nodes without a type) never own OSDs directly.
        if node.get('type') != 'host':
            continue
        host_name = node['name']
        # 'children' may be absent or None for a host with no OSDs.
        for child_id in node.get('children') or []:
            osd_to_host[child_id] = host_name

    return osd_to_host
def get_device_path_for_osd(osd_id, hostname):
    """Resolve the block device path backing an OSD on a specific host.

    Three strategies are tried in order, returning the first hit:

    1. The ``devices`` field of the OSD metadata (via ``get_osd_metadata``).
    2. The output of ``ceph-volume`` executed on ``hostname``.
    3. Resolving the OSD's ``block`` symlink on ``hostname``.

    Args:
        osd_id: Numeric OSD identifier.
        hostname: Host on which the fallback commands are executed.

    Returns:
        A device path string, or None when no strategy produced one.
    """
    # Strategy 1: ask Ceph itself.
    metadata = get_osd_metadata(osd_id)
    if metadata:
        devices = metadata.get('devices', '')
        if devices:
            # Several devices may be listed comma-separated; use the first.
            first_device = devices.split(',')[0].strip()
            if first_device:
                # Ceph reports bare kernel names (e.g. "sdb"); smartctl
                # needs a real path, so add the /dev/ prefix when missing.
                if not first_device.startswith('/'):
                    first_device = f"/dev/{first_device}"
                return first_device
        # 'device_ids' holds vendor/model/serial identifiers, not a path,
        # so it cannot be used here; fall through to the remote lookups.

    # Strategy 2: query ceph-volume on the remote host.
    cmd = (
        f"ceph-volume lvm list osd.{osd_id} -f json 2>/dev/null"
        f" || ceph-volume simple scan /var/lib/ceph/osd/ceph-{osd_id} 2>/dev/null"
    )
    result = run_command(cmd, host=hostname, parse_json=False)
    if result:
        for line in result.split('\n'):
            # 'device' also covers lines mentioning 'block_device'.
            if 'device' not in line:
                continue
            match = re.search(r'/dev/[a-z0-9]+', line)
            if match:
                return match.group(0)

    # Strategy 3: resolve the OSD's block symlink on the remote host.
    common_paths = [
        f"/dev/disk/by-partuuid/$(readlink /var/lib/ceph/osd/ceph-{osd_id}/block | xargs basename)",
        f"/var/lib/ceph/osd/ceph-{osd_id}/block",
    ]
    for path in common_paths:
        result = run_command(f"readlink -f {path} 2>/dev/null", host=hostname)
        if result and result.startswith('/dev/'):
            return result

    return None
# Partitions of disks whose own name ends in a digit use a "p<N>" suffix
# (e.g. /dev/nvme0n1p2, /dev/mmcblk0p1).
_NUMBERED_DISK_PARTITION = re.compile(r'^(/dev/(?:nvme\d+n\d+|mmcblk\d+))p\d+$')
# Partitions of letter-named disks append the number directly (e.g. /dev/sda1).
_LETTERED_DISK_PARTITION = re.compile(r'^(/dev/(?:sd|hd|vd|xvd)[a-z]+)\d+$')


def _strip_partition_suffix(device_path):
    """Return the whole-disk path for a partition path, else the path as-is."""
    for pattern in (_NUMBERED_DISK_PARTITION, _LETTERED_DISK_PARTITION):
        match = pattern.match(device_path)
        if match:
            return match.group(1)
    # Not a recognised partition name (whole disk, dm-N, mapper path, ...):
    # leave it untouched rather than chopping trailing digits off the name.
    return device_path


def get_smart_data_remote(device_path, hostname):
    """Get SMART data for a device by running smartctl on a remote host.

    Args:
        device_path: Device or partition path on ``hostname``; a partition
            path is reduced to its parent disk before querying.
        hostname: Host passed to ``run_command`` for execution.

    Returns:
        Whatever ``run_command`` returns for the smartctl JSON query, or
        None when ``device_path`` is empty.
    """
    if not device_path:
        return None

    # Blindly stripping trailing digits would turn the whole-disk name
    # /dev/nvme0n1 into /dev/nvme0n, so only known partition forms are cut.
    base_device = _strip_partition_suffix(device_path)

    cmd = f"smartctl -a -j {base_device} 2>/dev/null"
    return run_command(cmd, host=hostname, parse_json=True)
def get_device_health(osd_id, hostname):
    """Get device SMART health metrics for an OSD from the appropriate host.

    First asks Ceph for the daemon's health metrics; if that yields no
    recognisable SMART payload, resolves the OSD's device path and runs
    smartctl on ``hostname`` instead.

    Args:
        osd_id: Numeric OSD identifier.
        hostname: Host that carries the OSD, used for the SSH fallback.

    Returns:
        Parsed SMART JSON, or None when neither source produced data.
    """
    # First try ceph's built-in health metrics.
    data = run_command(
        f"ceph device query-daemon-health-metrics osd.{osd_id} -f json 2>/dev/null",
        parse_json=True,
    )
    # Parenthesised on purpose: without the grouping, a failed query
    # (data is None) would evaluate `'nvme_...' in None` and raise TypeError.
    # NOTE(review): this only inspects top-level keys; confirm the ceph
    # output is not nested under a device-id key.
    if isinstance(data, dict) and (
        'ata_smart_attributes' in data
        or 'nvme_smart_health_information_log' in data
    ):
        return data

    # If that fails, get device path and query via SSH.
    device_path = get_device_path_for_osd(osd_id, hostname)
    if device_path:
        return get_smart_data_remote(device_path, hostname)

    return None
def parse_smart_health(smart_data): def parse_smart_health(smart_data):
"""Parse SMART data and calculate health score""" """Parse SMART data and calculate health score"""
@@ -201,7 +252,7 @@ def parse_smart_health(smart_data):
return max(0, score), issues, metrics return max(0, score), issues, metrics
def calculate_capacity_score(osd_data, host_osds, osd_class): def calculate_capacity_score(osd_data, host_osds_data, osd_class):
"""Calculate score based on capacity optimization potential""" """Calculate score based on capacity optimization potential"""
score = 0.0 score = 0.0
factors = [] factors = []
@@ -232,31 +283,40 @@ def calculate_capacity_score(osd_data, host_osds, osd_class):
factors.append(f"Medium utilization ({utilization:.1f}%)") factors.append(f"Medium utilization ({utilization:.1f}%)")
# Host balance consideration # Host balance consideration
host_total_weight = sum(o.get('crush_weight', 0) for o in host_osds if o.get('device_class') == osd_class) same_class_osds = [o for o in host_osds_data if o.get('device_class') == osd_class]
host_avg_weight = host_total_weight / len([o for o in host_osds if o.get('device_class') == osd_class]) if host_osds else 0 if same_class_osds:
host_total_weight = sum(o.get('crush_weight', 0) for o in same_class_osds)
if weight < host_avg_weight * 0.5: host_avg_weight = host_total_weight / len(same_class_osds)
score += 15
factors.append(f"Below host average ({host_avg_weight:.1f}TB) - improves balance") if weight < host_avg_weight * 0.5:
score += 15
factors.append(f"Below host average ({host_avg_weight:.1f}TB) - improves balance")
return score, factors return score, factors
def calculate_resilience_score(osd_data, host_data, all_hosts): def calculate_resilience_score(osd_data, host_name, all_hosts_data, osd_tree):
"""Calculate score based on cluster resilience improvement""" """Calculate score based on cluster resilience improvement"""
score = 0.0 score = 0.0
factors = [] factors = []
host_name = host_data['name']
osd_class = osd_data.get('device_class', 'hdd') osd_class = osd_data.get('device_class', 'hdd')
# Count OSDs per host by class # Count OSDs per host by class
host_class_counts = {} host_class_counts = {}
for host in all_hosts: for host_node in [n for n in osd_tree['nodes'] if n['type'] == 'host']:
host_class_counts[host['name']] = { h_name = host_node['name']
'hdd': len([o for o in host.get('children', []) if o.get('device_class') == 'hdd' and o.get('status') == 'up']), host_osds = [osd_tree['nodes'][i] for i in range(len(osd_tree['nodes']))
'nvme': len([o for o in host.get('children', []) if o.get('device_class') == 'nvme' and o.get('status') == 'up']) if osd_tree['nodes'][i].get('id') in host_node.get('children', [])
and osd_tree['nodes'][i].get('type') == 'osd']
host_class_counts[h_name] = {
'hdd': len([o for o in host_osds if o.get('device_class') == 'hdd' and o.get('status') == 'up']),
'nvme': len([o for o in host_osds if o.get('device_class') == 'nvme' and o.get('status') == 'up'])
} }
if host_name not in host_class_counts:
return 0, ["Host not found in cluster"]
current_count = host_class_counts[host_name][osd_class] current_count = host_class_counts[host_name][osd_class]
avg_count = sum(h[osd_class] for h in host_class_counts.values()) / len(host_class_counts) avg_count = sum(h[osd_class] for h in host_class_counts.values()) / len(host_class_counts)
@@ -269,10 +329,14 @@ def calculate_resilience_score(osd_data, host_data, all_hosts):
factors.append(f"Host slightly above average {osd_class} count") factors.append(f"Host slightly above average {osd_class} count")
# Check for down OSDs on same host (indicates potential issues) # Check for down OSDs on same host (indicates potential issues)
down_osds = [o for o in host_data.get('children', []) if o.get('status') == 'down'] host_node = next((n for n in osd_tree['nodes'] if n['type'] == 'host' and n['name'] == host_name), None)
if down_osds: if host_node:
score += 15 down_osds = [osd_tree['nodes'][i] for i in range(len(osd_tree['nodes']))
factors.append(f"Host has {len(down_osds)} down OSD(s) - may have hardware issues") if osd_tree['nodes'][i].get('id') in host_node.get('children', [])
and osd_tree['nodes'][i].get('status') == 'down']
if down_osds:
score += 15
factors.append(f"Host has {len(down_osds)} down OSD(s) - may have hardware issues")
return score, factors return score, factors
@@ -323,6 +387,9 @@ def analyze_cluster():
print(f"{Colors.RED}Failed to gather cluster data{Colors.END}") print(f"{Colors.RED}Failed to gather cluster data{Colors.END}")
return return
# Build OSD to host mapping
osd_to_host = get_osd_host_mapping(osd_tree)
# Parse OSD data # Parse OSD data
osd_df_map = {node['id']: node for node in osd_df['nodes']} osd_df_map = {node['id']: node for node in osd_df['nodes']}
osd_perf_map = {p['id']: p for p in osd_perf.get('osd_perf_infos', [])} if osd_perf else {} osd_perf_map = {p['id']: p for p in osd_perf.get('osd_perf_infos', [])} if osd_perf else {}
@@ -331,77 +398,87 @@ def analyze_cluster():
pg_counts = [node['pgs'] for node in osd_df['nodes'] if node.get('pgs', 0) > 0] pg_counts = [node['pgs'] for node in osd_df['nodes'] if node.get('pgs', 0) > 0]
avg_pg_count = sum(pg_counts) / len(pg_counts) if pg_counts else 0 avg_pg_count = sum(pg_counts) / len(pg_counts) if pg_counts else 0
# Build host map # Build host data map
hosts = [node for node in osd_tree['nodes'] if node['type'] == 'host'] host_osds_map = defaultdict(list)
host_map = {host['id']: host for host in hosts} for node in osd_tree['nodes']:
if node['type'] == 'osd' and node.get('status') == 'up':
host_name = osd_to_host.get(node['id'])
if host_name:
osd_df_data = osd_df_map.get(node['id'], {})
host_osds_map[host_name].append({
'id': node['id'],
'device_class': node.get('device_class', 'hdd'),
'crush_weight': osd_df_data.get('crush_weight', 0)
})
# Analyze each OSD # Analyze each OSD
candidates = [] candidates = []
print("Analyzing OSDs...\n") print("Analyzing OSDs across all cluster nodes...\n")
for host in hosts: total_osds = len([n for n in osd_tree['nodes'] if n['type'] == 'osd' and n.get('status') == 'up'])
host_osds = [node for node in osd_tree['nodes'] if node.get('id') in [c for c in host.get('children', [])]] current_osd = 0
for node in osd_tree['nodes']:
if node['type'] != 'osd' or node.get('status') != 'up':
continue
for osd_id in host.get('children', []): current_osd += 1
osd_node = next((n for n in osd_tree['nodes'] if n['id'] == osd_id), None) osd_id = node['id']
if not osd_node or osd_node.get('status') != 'up': osd_name = node['name']
continue device_class = node.get('device_class', 'hdd')
host_name = osd_to_host.get(osd_id, 'unknown')
osd_id_num = osd_node['id']
osd_name = osd_node['name'] print(f"[{current_osd}/{total_osds}] Analyzing {osd_name} on {host_name} ({device_class})...".ljust(80), end='\r')
device_class = osd_node.get('device_class', 'hdd')
# Get OSD data
print(f"Analyzing {osd_name} ({device_class})...", end='\r') osd_df_data = osd_df_map.get(osd_id, {})
osd_perf_data = osd_perf_map.get(osd_id, {})
# Get OSD data
osd_df_data = osd_df_map.get(osd_id_num, {}) # SMART health analysis - query from the correct host
osd_perf_data = osd_perf_map.get(osd_id_num, {}) health_data = get_device_health(osd_id, host_name)
health_score, health_issues, health_metrics = parse_smart_health(health_data)
# SMART health analysis
health_data = get_device_health(osd_id_num) # Capacity optimization score
health_score, health_issues, health_metrics = parse_smart_health(health_data) capacity_score, capacity_factors = calculate_capacity_score(
osd_df_data, host_osds_map.get(host_name, []), device_class
# Capacity optimization score )
capacity_score, capacity_factors = calculate_capacity_score(
osd_df_data, host_osds, device_class # Resilience score
) resilience_score, resilience_factors = calculate_resilience_score(
node, host_name, host_osds_map, osd_tree
# Resilience score )
resilience_score, resilience_factors = calculate_resilience_score(
osd_node, host, hosts # Performance score
) performance_score, performance_factors = calculate_performance_score(
osd_perf_data, osd_df_data.get('pgs', 0), avg_pg_count
# Performance score )
performance_score, performance_factors = calculate_performance_score(
osd_perf_data, osd_df_data.get('pgs', 0), avg_pg_count # Calculate total score (weighted)
) total_score = (
(100 - health_score) * 0.40 + # Health is most important
# Calculate total score (weighted) capacity_score * 0.30 + # Capacity optimization
total_score = ( resilience_score * 0.20 + # Cluster resilience
(100 - health_score) * 0.40 + # Health is most important performance_score * 0.10 # Performance issues
capacity_score * 0.30 + # Capacity optimization )
resilience_score * 0.20 + # Cluster resilience
performance_score * 0.10 # Performance issues candidates.append({
) 'osd_id': osd_id,
'osd_name': osd_name,
candidates.append({ 'host': host_name,
'osd_id': osd_id_num, 'device_class': device_class,
'osd_name': osd_name, 'weight': osd_df_data.get('crush_weight', 0),
'host': host['name'], 'size': osd_df_data.get('kb', 0) / 1024 / 1024 / 1024, # TB
'device_class': device_class, 'utilization': osd_df_data.get('utilization', 0),
'weight': osd_df_data.get('crush_weight', 0), 'pgs': osd_df_data.get('pgs', 0),
'size': osd_df_data.get('kb', 0) / 1024 / 1024 / 1024, # TB 'total_score': total_score,
'utilization': osd_df_data.get('utilization', 0), 'health_score': health_score,
'pgs': osd_df_data.get('pgs', 0), 'health_issues': health_issues,
'total_score': total_score, 'health_metrics': health_metrics,
'health_score': health_score, 'capacity_factors': capacity_factors,
'health_issues': health_issues, 'resilience_factors': resilience_factors,
'health_metrics': health_metrics, 'performance_factors': performance_factors,
'capacity_factors': capacity_factors, })
'resilience_factors': resilience_factors,
'performance_factors': performance_factors,
})
print(" " * 80, end='\r') # Clear the line print(" " * 80, end='\r') # Clear the line
@@ -409,7 +486,7 @@ def analyze_cluster():
candidates.sort(key=lambda x: x['total_score'], reverse=True) candidates.sort(key=lambda x: x['total_score'], reverse=True)
# Display results # Display results
print(f"\n{Colors.BOLD}{Colors.CYAN}=== TOP REPLACEMENT CANDIDATES ==={Colors.END}\n") print(f"\n{Colors.BOLD}{Colors.CYAN}=== TOP REPLACEMENT CANDIDATES (ALL HOSTS) ==={Colors.END}\n")
for rank, candidate in enumerate(candidates[:15], 1): for rank, candidate in enumerate(candidates[:15], 1):
score_color = Colors.RED if candidate['total_score'] > 50 else Colors.YELLOW if candidate['total_score'] > 30 else Colors.GREEN score_color = Colors.RED if candidate['total_score'] > 50 else Colors.YELLOW if candidate['total_score'] > 30 else Colors.GREEN
@@ -455,9 +532,22 @@ def analyze_cluster():
print(f" Host: {top_candidate['host']}") print(f" Host: {top_candidate['host']}")
print(f" Capacity gain potential: {top_candidate['weight']:.2f} TB") print(f" Capacity gain potential: {top_candidate['weight']:.2f} TB")
print() print()
# Summary by host
print(f"\n{Colors.BOLD}{Colors.CYAN}=== TOP CANDIDATES BY HOST ==={Colors.END}\n")
hosts_seen = set()
for candidate in candidates:
if candidate['host'] not in hosts_seen and len(hosts_seen) < 5:
hosts_seen.add(candidate['host'])
print(f"{Colors.BOLD}{candidate['host']}:{Colors.END}")
print(f" Top candidate: {candidate['osd_name']} (score: {candidate['total_score']:.1f})")
print(f" {candidate['device_class'].upper()}, {candidate['weight']:.2f} TB, {candidate['utilization']:.1f}% used")
if candidate['health_issues']:
print(f" Issues: {candidate['health_issues'][0]}")
print()
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Analyze Ceph OSDs for replacement candidates') parser = argparse.ArgumentParser(description='Analyze Ceph OSDs for replacement candidates across entire cluster')
parser.add_argument('--class', dest='device_class', choices=['hdd', 'nvme'], parser.add_argument('--class', dest='device_class', choices=['hdd', 'nvme'],
help='Filter by device class') help='Filter by device class')
parser.add_argument('--min-size', type=float, default=0, parser.add_argument('--min-size', type=float, default=0,