Better patterns and error handling
@@ -31,7 +31,7 @@ class Colors:
     BOLD = '\033[1m'
     END = '\033[0m'

-def run_command(cmd, parse_json=False, host=None, suppress_warnings=False):
+def run_command(cmd, parse_json=False, host=None):
     """Execute shell command locally or via SSH and return output"""
     try:
         if host:
@@ -47,7 +47,8 @@ def run_command(cmd, parse_json=False, host=None, suppress_warnings=False):
     except subprocess.CalledProcessError as e:
         if DEBUG:
             print(f"{Colors.YELLOW}DEBUG: Command failed: {cmd}{Colors.END}")
-            print(f"{Colors.YELLOW}DEBUG: stderr: {e.stderr}{Colors.END}")
+            if e.stderr:
+                print(f"{Colors.YELLOW}DEBUG: stderr: {e.stderr[:200]}{Colors.END}")
         return None if parse_json else ""
     except json.JSONDecodeError as e:
         if DEBUG:
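
For context, a minimal sketch of how the revised handler might sit inside run_command. Everything above the except blocks (the ssh wrapping and the subprocess.run call) is an assumption reconstructed from the surrounding hunks, and the Colors markup is dropped to keep the sketch self-contained:

import json
import subprocess

DEBUG = False

def run_command(cmd, parse_json=False, host=None):
    """Execute shell command locally or via SSH and return output"""
    try:
        if host:
            # Assumed remote invocation; the real script may quote differently
            cmd = f"ssh {host} {cmd!r}"
        output = subprocess.run(cmd, shell=True, capture_output=True,
                                text=True, check=True).stdout.strip()
        return json.loads(output) if parse_json else output
    except subprocess.CalledProcessError as e:
        if DEBUG:
            print(f"DEBUG: Command failed: {cmd}")
            if e.stderr:
                # New behavior: guard against empty stderr and cap the noise
                print(f"DEBUG: stderr: {e.stderr[:200]}")
        return None if parse_json else ""
    except json.JSONDecodeError:
        return None
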
@@ -70,10 +71,6 @@ def get_osd_perf():
     """Get OSD performance statistics"""
     return run_command("ceph osd perf -f json", parse_json=True)

-def get_pg_dump():
-    """Get PG dump for distribution analysis"""
-    return run_command("ceph pg dump -f json 2>/dev/null", parse_json=True)
-
 def get_osd_host_mapping(osd_tree):
     """Build mapping of OSD ID to hostname"""
     osd_to_host = {}
@@ -88,36 +85,36 @@ def get_osd_host_mapping(osd_tree):

 def get_device_path_for_osd(osd_id, hostname):
     """Get the device path for an OSD on a specific host"""
-    # Try to get from ceph metadata first
+    # Method 1: Try ceph metadata
     metadata = get_osd_metadata(osd_id)
     if metadata:
-        # Get device path from metadata
         devices = metadata.get('devices', '')

         if devices:
             device = devices.split(',')[0] if ',' in devices else devices
-            # Ensure it has /dev/ prefix
             if device and not device.startswith('/dev/'):
                 device = f"/dev/{device}"
             if device and device != '/dev/':
+                if DEBUG:
+                    print(f"{Colors.GREEN}DEBUG: Found device from metadata: {device}{Colors.END}")
                 return device

-    # Fallback: query the OSD symlink on the remote host
+    # Method 2: Query symlink on remote host
     result = run_command(f"readlink -f /var/lib/ceph/osd/ceph-{osd_id}/block 2>/dev/null", host=hostname)
     if result and result.startswith('/dev/'):
+        if DEBUG:
+            print(f"{Colors.GREEN}DEBUG: Found device from symlink: {result}{Colors.END}")
         return result

-    # Try to get from ceph-volume
-    cmd = f"ceph-volume lvm list {osd_id} -f json 2>/dev/null"
-    result = run_command(cmd, host=hostname, parse_json=True)
+    # Method 3: Try lsblk
+    result = run_command(f"lsblk -no pkname /var/lib/ceph/osd/ceph-{osd_id}/block 2>/dev/null", host=hostname)

     if result:
-        # Parse ceph-volume output
-        for osd_key, osd_info in result.items():
-            if isinstance(osd_info, list) and len(osd_info) > 0:
-                block_device = osd_info[0].get('devices', [])
-                if block_device:
-                    return block_device[0] if isinstance(block_device, list) else block_device
+        device = f"/dev/{result.strip()}"
+        if DEBUG:
+            print(f"{Colors.GREEN}DEBUG: Found device from lsblk: {device}{Colors.END}")
+        return device
+
+    if DEBUG:
+        print(f"{Colors.RED}DEBUG: Could not find device for osd.{osd_id}{Colors.END}")

     return None

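
The lsblk fallback replaces a nested ceph-volume JSON parse with a single lookup: lsblk -no pkname prints the parent kernel device of whatever backs the given path, which resolves an LVM-backed OSD block symlink straight to its underlying disk. A hypothetical call, with the OSD id, hostname, and device name invented for illustration:

# Method 3 by hand, assuming osd.12 sits on an LVM volume backed by /dev/sdc
out = run_command(
    "lsblk -no pkname /var/lib/ceph/osd/ceph-12/block 2>/dev/null",
    host="ceph-node-03",
)
print(f"/dev/{out.strip()}" if out else None)  # -> /dev/sdc (illustrative)
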
@@ -126,12 +123,11 @@ def get_smart_data_remote(device_path, hostname):
     if not device_path:
         return None

-    # Strip partition suffix:
-    # /dev/sda1 -> /dev/sda
-    # /dev/nvme0n1p1 -> /dev/nvme0n1
+    # Strip partition suffix
     base_device = re.sub(r'p?\d+$', '', device_path)

-    cmd = f"smartctl -a -j {base_device} 2>/dev/null"
+    # Use sudo for smartctl
+    cmd = f"sudo smartctl -a -j {base_device} 2>/dev/null"
     result = run_command(cmd, host=hostname, parse_json=True)

     return result
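
The partition-stripping regex is worth a worked example; the first two inputs are the shapes the deleted comments documented, the third is a caveat the pattern does not cover:

import re

re.sub(r'p?\d+$', '', '/dev/sda1')       # -> '/dev/sda'
re.sub(r'p?\d+$', '', '/dev/nvme0n1p1')  # -> '/dev/nvme0n1'
# Caveat: a whole-disk NVMe path also ends in a digit, so an unpartitioned
# '/dev/nvme0n1' would come back as '/dev/nvme0n'.
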
@@ -171,7 +167,7 @@ def parse_smart_health(smart_data):
     if not smart_data:
         return 50.0, ["No SMART data available"], metrics

-    # Check for different SMART data formats
+    # Check for HDD SMART data
     if 'ata_smart_attributes' in smart_data:
         attrs = smart_data['ata_smart_attributes'].get('table', [])

@@ -179,7 +175,6 @@ def parse_smart_health(smart_data):
             attr_id = attr.get('id')
             name = attr.get('name', '')
             value = attr.get('value', 0)
-            worst = attr.get('worst', 0)
             raw_value = attr.get('raw', {}).get('value', 0)

             # Reallocated Sectors (5)
@@ -212,10 +207,12 @@ def parse_smart_health(smart_data):

             # Temperature (190, 194)
             elif attr_id in [190, 194]:
-                metrics['temperature'] = raw_value
-                if raw_value > 60:
-                    score -= min(10, (raw_value - 60) * 2)
-                    issues.append(f"High temperature: {raw_value}°C")
+                # Only use valid temperature values
+                if isinstance(raw_value, int) and 0 < raw_value < 100:
+                    metrics['temperature'] = raw_value
+                    if raw_value > 60:
+                        score -= min(10, (raw_value - 60) * 2)
+                        issues.append(f"High temperature: {raw_value}°C")

             # Power On Hours (9)
             elif attr_id == 9:
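
The isinstance/range guard matters because some drives pack minimum and maximum temperatures into the upper bytes of the raw attribute value, so smartctl's JSON can report an integer far outside any plausible Celsius reading. A small sketch with illustrative values:

metrics = {}
for raw_value in (34, 197737087007, 0):
    # Only a plausible Celsius reading is recorded; packed or zeroed raw
    # values fall through instead of skewing the temperature metric.
    if isinstance(raw_value, int) and 0 < raw_value < 100:
        metrics['temperature'] = raw_value
print(metrics)  # {'temperature': 34}
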
@@ -226,15 +223,7 @@ def parse_smart_health(smart_data):
                     score -= min(15, (age_years - 5) * 3)
                     issues.append(f"Drive age: {age_years:.1f} years")

-            # Wear leveling (for SSDs, 177)
-            elif attr_id == 177 and value < worst:
-                metrics['wear_leveling'] = value
-                wear_percent = 100 - value
-                if wear_percent > 20:
-                    score -= min(20, wear_percent)
-                    issues.append(f"Wear level: {wear_percent}%")
-
-    # NVMe SMART data
+    # Check for NVMe SMART data
     elif 'nvme_smart_health_information_log' in smart_data:
         nvme_health = smart_data['nvme_smart_health_information_log']

@@ -259,10 +248,11 @@ def parse_smart_health(smart_data):

         # Temperature
         temp = nvme_health.get('temperature', 0)
-        metrics['temperature'] = temp
-        if temp > 70:
-            score -= min(10, (temp - 70) * 2)
-            issues.append(f"High temperature: {temp}°C")
+        if 0 < temp < 150:  # Valid temperature range
+            metrics['temperature'] = temp
+            if temp > 70:
+                score -= min(10, (temp - 70) * 2)
+                issues.append(f"High temperature: {temp}°C")

     return max(0, score), issues, metrics

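
The NVMe branch gets the same treatment. smartctl's JSON reports the NVMe temperature in Celsius, and a missing field defaults to 0, which the new lower bound skips instead of recording. A hand-rolled fragment with an invented field value:

smart_data = {"nvme_smart_health_information_log": {"temperature": 36}}
nvme_health = smart_data["nvme_smart_health_information_log"]
temp = nvme_health.get('temperature', 0)
if 0 < temp < 150:  # Valid temperature range
    print(f"recorded {temp}°C")
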
@@ -274,19 +264,19 @@ def calculate_capacity_score(osd_data, host_osds_data, osd_class):
     weight = osd_data.get('crush_weight', 0)
     utilization = osd_data.get('utilization', 0)

-    # Small drives are better candidates (more capacity gain)
+    # Small drives are better candidates
     if weight < 2:
         score += 40
-        factors.append(f"Very small drive ({weight}TB) - high capacity gain")
+        factors.append(f"Very small drive ({weight:.1f}TB) - high capacity gain")
     elif weight < 5:
         score += 30
-        factors.append(f"Small drive ({weight}TB) - good capacity gain")
+        factors.append(f"Small drive ({weight:.1f}TB) - good capacity gain")
     elif weight < 10:
         score += 15
-        factors.append(f"Medium drive ({weight}TB)")
+        factors.append(f"Medium drive ({weight:.1f}TB)")
     else:
         score += 5
-        factors.append(f"Large drive ({weight}TB) - lower priority")
+        factors.append(f"Large drive ({weight:.1f}TB) - lower priority")

     # High utilization drives are harder to replace
     if utilization > 70:
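
The :.1f change is cosmetic but worth showing: CRUSH weights are floats (roughly the capacity in TiB), so the unformatted interpolation leaked long fractions into the report. Weight value invented for illustration:

weight = 1.81929
f"Very small drive ({weight}TB)"      # 'Very small drive (1.81929TB)'
f"Very small drive ({weight:.1f}TB)"  # 'Very small drive (1.8TB)'
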
@@ -334,7 +324,7 @@ def calculate_resilience_score(osd_data, host_name, all_hosts_data, osd_tree):
     current_count = host_class_counts[host_name][osd_class]
     avg_count = sum(h[osd_class] for h in host_class_counts.values()) / len(host_class_counts)

-    # Hosts with more OSDs are better candidates for reduction
+    # Hosts with more OSDs are better candidates
     if current_count > avg_count * 1.2:
         score += 20
         factors.append(f"Host has {current_count} {osd_class} OSDs (above average {avg_count:.1f})")
@@ -342,7 +332,7 @@ def calculate_resilience_score(osd_data, host_name, all_hosts_data, osd_tree):
         score += 10
         factors.append(f"Host slightly above average {osd_class} count")

-    # Check for down OSDs on same host (indicates potential issues)
+    # Check for down OSDs on same host
     host_node = next((n for n in osd_tree['nodes'] if n['type'] == 'host' and n['name'] == host_name), None)
     if host_node:
         down_osds = [osd_tree['nodes'][i] for i in range(len(osd_tree['nodes']))
@@ -450,7 +440,7 @@ def analyze_cluster():
         osd_df_data = osd_df_map.get(osd_id, {})
         osd_perf_data = osd_perf_map.get(osd_id, {})

-        # SMART health analysis - query from the correct host
+        # SMART health analysis
         health_data = get_device_health(osd_id, host_name)
         if not health_data:
             failed_smart.append((osd_name, host_name))
@@ -498,7 +488,7 @@ def analyze_cluster():
             'performance_factors': performance_factors,
         })

-    print(" " * 80, end='\r') # Clear the line
+    print(" " * 80, end='\r')

     # Show SMART failures if any
     if failed_smart:
@@ -509,7 +499,7 @@ def analyze_cluster():
             print(f" ... and {len(failed_smart) - 5} more")
     print()

-    # Sort by total score (descending)
+    # Sort by total score
     candidates.sort(key=lambda x: x['total_score'], reverse=True)

     # Display results