diff --git a/NVME_TROUBLESHOOTING.md b/NVME_TROUBLESHOOTING.md
new file mode 100644
index 0000000..5f0bea5
--- /dev/null
+++ b/NVME_TROUBLESHOOTING.md
@@ -0,0 +1,121 @@
+# NVMe SMART Data Collection Troubleshooting
+
+## Issue Observed
+
+All NVMe drives (osd.0, osd.10, osd.22, osd.23) are failing SMART data collection with this error:
+```
+DEBUG: All SMART methods failed for /dev/nvme0n1 on
+```
+
+## Commands Attempted (All Failed)
+
+1. `sudo smartctl -a -j /dev/nvme0n1 -d nvme`
+2. `smartctl -a -j /dev/nvme0n1 -d nvme` (without sudo)
+3. `sudo smartctl -a -j /dev/nvme0n1` (without the `-d` flag)
+
+## Possible Causes
+
+### 1. Smartctl Version Too Old
+JSON output requires smartctl 7.0+ (the `-j` flag was introduced in smartmontools 7.0). Check the version:
+```bash
+ssh large1 "smartctl --version | head -1"
+```
+
+If the version is below 7.0, `-j` is not supported at all, so every JSON attempt will fail regardless of device type.
+
+### 2. NVMe Admin Passthrough Permission
+Reading NVMe SMART data goes through the NVMe admin passthrough interface, which requires the CAP_SYS_ADMIN capability. Running smartctl via `ssh ... sudo ...` may not provide it on every host, depending on the sudo configuration.
+
+### 3. NVMe Device Naming
+Some tools expect the controller device `/dev/nvme0` rather than the namespace block device `/dev/nvme0n1` for SMART queries.
+
+## Recommended Fixes
+
+### Option 1: Try Without JSON Flag for NVMe
+Modify the script to fall back to non-JSON output for NVMe (plain smartctl text or `nvme smart-log`) and parse it:
+
+```python
+# For NVMe, if JSON output fails, fall back to a plain-text query
+if "nvme" in device_path:
+    result = run_command(f"sudo nvme smart-log {device_path}", host=hostname)
+    # Parse text output
+```
+
+### Option 2: Use nvme-cli Tool
+The `nvme` command often works better than smartctl for NVMe devices:
+
+```bash
+ssh large1 "sudo nvme smart-log /dev/nvme0 -o json"
+```
+
+### Option 3: Check Ceph's Built-in Metrics First
+The script already tries `ceph device query-daemon-health-metrics` first, which should work for NVMe as long as the OSD daemon can read the device. Verify:
+
+```bash
+ceph device query-daemon-health-metrics osd.0 -f json
+```
+
+If this works when run by hand but the script still falls through to SSH, the script may be mishandling the response format (see the `get_device_health` change below) or hitting a permission issue.
+
+## Testing Commands
+
+### Test on compute-storage-01 (osd.0)
+```bash
+# Check smartctl version
+ssh compute-storage-01 "smartctl --version"
+
+# Try direct smartctl
+ssh compute-storage-01 "sudo smartctl -a /dev/nvme0n1"
+
+# Try nvme-cli
+ssh compute-storage-01 "sudo nvme smart-log /dev/nvme0"
+
+# Try from Ceph directly
+ceph device query-daemon-health-metrics osd.0 -f json
+```
+
+### Test on large1 (osd.10, osd.23)
+```bash
+# Two NVMe devices on this host
+ssh large1 "sudo smartctl -a /dev/nvme0n1"
+ssh large1 "sudo smartctl -a /dev/nvme1n1"
+
+# Try nvme-cli
+ssh large1 "sudo nvme list"
+ssh large1 "sudo nvme smart-log /dev/nvme0"
+ssh large1 "sudo nvme smart-log /dev/nvme1"
+```
+
+## Workaround for Now
+
+Since the 6 OSDs with failed SMART collection all score 100/100 and rank at the top, the prioritization at least fails safe. However, we need to differentiate between:
+
+1. **Truly failed/unreadable drives** (hardware problem)
+2. **SMART collection failures** (script/permission issue)
+
+If these NVMe drives are actually healthy and we simply can't read their SMART data, they shouldn't all be #1 priority.
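+
+## Possible nvme-cli Fallback for the Script
+
+If nvme-cli turns out to be available on the hosts (Option 2), its JSON output could be wired in as one more fallback. The sketch below is illustrative only: it reuses the script's existing `run_command`, and the controller-path handling and the key it returns under are assumptions that would need checking against real output.
+
+```python
+import json
+import re
+
+def nvme_cli_fallback(device_path, hostname):
+    """Sketch: read NVMe SMART via nvme-cli when smartctl has failed."""
+    # nvme smart-log is usually pointed at the controller device (/dev/nvme0),
+    # so strip a trailing namespace suffix such as "n1" if the path has one.
+    controller_path = re.sub(r"n\d+$", "", device_path)
+    raw = run_command(f"sudo nvme smart-log {controller_path} -o json", host=hostname)
+    if not raw:
+        return None
+    try:
+        nvme_log = json.loads(raw)
+    except (json.JSONDecodeError, TypeError):
+        return None
+    # nvme-cli's field names differ slightly from smartctl's JSON (for example
+    # percent_used vs percentage_used), so the scoring code would still need a
+    # small mapping before treating this like smartctl output.
+    return {"nvme_smart_health_information_log": nvme_log}
+```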
+
+## Quick Fix: Check if Drive is Actually Accessible
+
+Add a health check before marking SMART as failed:
+
+```python
+# Before returning None, check whether the device node is even present
+health_check = run_command(f"test -e {device_path} && echo 'OK'", host=hostname)
+if health_check == "OK":
+    # Device exists but SMART failed - might be permissions
+    return {"status": "smart_read_failed", "device_accessible": True}
+else:
+    # Device doesn't exist or is dead
+    return {"status": "device_failed", "device_accessible": False}
+```
+
+This would let us score SMART-read failures differently from truly dead drives.
+
+## Action Items
+
+1. Test the smartctl version on all nodes
+2. Test nvme-cli availability
+3. Verify Ceph daemon health metrics work locally
+4. Consider adding the device accessibility check above
+5. May need to add nvme-cli as a fallback method (sketched above)
diff --git a/ceph_osd_analyzer.py b/ceph_osd_analyzer.py
index 4790200..2c534f5 100644
--- a/ceph_osd_analyzer.py
+++ b/ceph_osd_analyzer.py
@@ -193,14 +193,25 @@ def get_device_health(osd_id, hostname):
     """Get device SMART health metrics from the appropriate host"""
     if DEBUG:
         print(f"{Colors.CYAN}DEBUG: Getting health for osd.{osd_id} on {hostname}{Colors.END}")
-    
+
     # First try ceph's built-in health metrics
     data = run_command(f"ceph device query-daemon-health-metrics osd.{osd_id} -f json 2>/dev/null", parse_json=True)
-    
-    if data and ('ata_smart_attributes' in data or 'nvme_smart_health_information_log' in data):
-        if DEBUG:
-            print(f"{Colors.GREEN}DEBUG: Got SMART data from ceph device query{Colors.END}")
-        return data
+
+    if data:
+        # Ceph returns data nested under device ID, extract it
+        if isinstance(data, dict) and len(data) > 0:
+            # Get the first (and usually only) device entry
+            device_data = next(iter(data.values())) if data else None
+            if device_data and ('ata_smart_attributes' in device_data or 'nvme_smart_health_information_log' in device_data):
+                if DEBUG:
+                    print(f"{Colors.GREEN}DEBUG: Got SMART data from ceph device query (nested format){Colors.END}")
+                return device_data
+
+        # Also check if data is already in the right format (backward compatibility)
+        if 'ata_smart_attributes' in data or 'nvme_smart_health_information_log' in data:
+            if DEBUG:
+                print(f"{Colors.GREEN}DEBUG: Got SMART data from ceph device query (direct format){Colors.END}")
+            return data
 
     # If that fails, get device path and query via SSH
     device_path = get_device_path_for_osd(osd_id, hostname)
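
Note: a quick way to confirm the nested reply shape this change expects (Action Item 3) is to run the query by hand wherever the ceph CLI has admin access. The snippet below is only an illustration of that check, assuming Python 3.7+ and a reachable cluster; it is not part of the analyzer.

```python
import json
import subprocess

# Confirm that the reply is keyed by device ID, one level above the SMART
# sections - the nesting the patched get_device_health() now unwraps.
out = subprocess.run(
    ["ceph", "device", "query-daemon-health-metrics", "osd.0", "-f", "json"],
    capture_output=True, text=True, check=True,
).stdout
reply = json.loads(out)
print(list(reply.keys()))  # expected: a single device ID
inner = next(iter(reply.values()))
print("nvme_smart_health_information_log" in inner or "ata_smart_attributes" in inner)
```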