#!/bin/bash
#==============================================================================
# Drive Atlas - Server Drive Mapping Tool
# Maps physical drive bays to logical device names using PCI paths
#==============================================================================

#------------------------------------------------------------------------------
# Dependency Checks
# Verifies required commands are available before running
#------------------------------------------------------------------------------

# Required dependencies (script will not function without these)
REQUIRED_DEPS=(lsblk lspci readlink hostname)

# Optional dependencies (enhanced functionality)
OPTIONAL_DEPS=(smartctl ceph ceph-volume bc nvme)

# NOTE(review): this URL is plain HTTP and the hints below suggest piping it
# straight into bash; the download is unauthenticated. Confirm this is only
# ever used on a trusted network.
FRESH_START_URL="http://10.10.10.63:3000/LotusGuild/freshStartScript/raw/branch/main/freshStart.sh"

#------------------------------------------------------------------------------
# check_dependencies
#
# Verifies required and optional commands are available.
# Exits with status 1 if any required dependency is missing.
# Warns (on stderr) about missing optional dependencies and about missing
# passwordless sudo when smartctl is installed.
#
# Globals: REQUIRED_DEPS, OPTIONAL_DEPS, FRESH_START_URL (read)
#------------------------------------------------------------------------------
check_dependencies() {
    local cmd
    local missing_required=()
    local missing_optional=()

    # Check required dependencies
    for cmd in "${REQUIRED_DEPS[@]}"; do
        if ! command -v "$cmd" &>/dev/null; then
            missing_required+=("$cmd")
        fi
    done

    # Check optional dependencies
    for cmd in "${OPTIONAL_DEPS[@]}"; do
        if ! command -v "$cmd" &>/dev/null; then
            missing_optional+=("$cmd")
        fi
    done

    # Report missing required dependencies and exit
    if [[ ${#missing_required[@]} -gt 0 ]]; then
        echo "ERROR: Missing required dependencies: ${missing_required[*]}" >&2
        echo "" >&2
        echo "Please install the missing packages or run the fresh start script:" >&2
        echo " curl -s $FRESH_START_URL | bash" >&2
        echo "" >&2
        exit 1
    fi

    # Warn about missing optional dependencies
    if [[ ${#missing_optional[@]} -gt 0 ]]; then
        echo "Note: Some optional features unavailable. Missing: ${missing_optional[*]}" >&2
        echo " Install them or run: curl -s $FRESH_START_URL | bash" >&2
        echo "" >&2
    fi

    # Check for sudo access (needed for smartctl)
    if command -v smartctl &>/dev/null && ! sudo -n true 2>/dev/null; then
        echo "Note: SMART data requires sudo access. Run with sudo for full functionality." >&2
    fi
}

# Run dependency check at script start
check_dependencies

#------------------------------------------------------------------------------
# Chassis Type Definitions
# These define the physical layout and display formatting for each chassis type
#
# NOTE(review): several box-drawing literals below contain only a single space
# of padding, so content rows are narrower than the top/bottom borders. This
# looks like whitespace lost in a paste; the literals are kept exactly as
# found - confirm against the intended layout before relying on alignment.
#------------------------------------------------------------------------------

#------------------------------------------------------------------------------
# generate_10bay_layout
#
# Prints the 10-bay chassis diagram: title, storage controllers, the M.2 slot
# (only when DRIVE_MAP has an m2-1 entry) and bays 1-10.
#
# Args:
#   $1 - Host name shown in the title row
# Globals: DRIVE_MAP (rebuilt via build_drive_map, then read)
#------------------------------------------------------------------------------
generate_10bay_layout() {
    local hostname=$1
    local bay ctrl

    build_drive_map

    # Main chassis section
    printf "┌────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐\n"
    printf "│ %-126s │\n" "$hostname - Sliger CX4712 (10x 3.5\" Hot-swap)"
    printf "│ │\n"

    # Show storage controllers
    printf "│ Storage Controllers: │\n"
    while IFS= read -r ctrl; do
        [[ -n "$ctrl" ]] && printf "│ %-126s│\n" "$ctrl"
    done < <(get_storage_controllers)
    printf "│ │\n"

    # M.2 NVMe slot if present
    if [[ -n "${DRIVE_MAP[m2-1]}" ]]; then
        printf "│ M.2 NVMe: %-10s │\n" "${DRIVE_MAP[m2-1]}"
        printf "│ │\n"
    fi

    printf "│ Front Hot-swap Bays: │\n"
    printf "│ │\n"

    # Bay top borders
    printf "│ "
    for bay in {1..10}; do
        printf "┌──────────┐ "
    done
    printf " │\n"

    # Bay contents (unmapped bays are shown as EMPTY)
    printf "│ "
    for bay in {1..10}; do
        printf "│%-2d:%-7s│ " "$bay" "${DRIVE_MAP[$bay]:-EMPTY}"
    done
    printf " │\n"

    # Bay bottom borders
    printf "│ "
    for bay in {1..10}; do
        printf "└──────────┘ "
    done
    printf " │\n"

    printf "└────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘\n"
}

#------------------------------------------------------------------------------
# generate_micro_layout
#
# Prints the single-board-computer diagram: title, storage controllers, the
# onboard eMMC (only when /dev/mmcblk0 is a block device) and SATA ports 1-2.
#
# Args:
#   $1 - Host name shown in the title row
# Globals: DRIVE_MAP (rebuilt via build_drive_map, then read)
#------------------------------------------------------------------------------
generate_micro_layout() {
    local hostname=$1
    local ctrl
    local emmc_device=""
    local emmc_size=""

    build_drive_map

    # Check for eMMC storage
    if [[ -b /dev/mmcblk0 ]]; then
        emmc_device="mmcblk0"
    fi

    printf "┌─────────────────────────────────────────────────────────────┐\n"
    printf "│ %-57s │\n" "$hostname - Micro SBC"
    printf "│ │\n"
    printf "│ Storage Controllers: │\n"
    while IFS= read -r ctrl; do
        [[ -n "$ctrl" ]] && printf "│ %-57s│\n" "$ctrl"
    done < <(get_storage_controllers)
    printf "│ │\n"

    # Show eMMC if present
    if [[ -n "$emmc_device" ]]; then
        # xargs is used only to trim surrounding whitespace from the size
        emmc_size=$(lsblk -d -n -o SIZE "/dev/$emmc_device" 2>/dev/null | xargs)
        printf "│ ┌─────────────────────────────────────────────────────┐ │\n"
        printf "│ │ Onboard eMMC: %-10s (%s) │ │\n" "$emmc_device" "$emmc_size"
        printf "│ └─────────────────────────────────────────────────────┘ │\n"
        printf "│ │\n"
    fi

    printf "│ SATA Ports (rear): │\n"
    printf "│ ┌──────────────┐ ┌──────────────┐ │\n"
    printf "│ │ 1: %-9s │ │ 2: %-9s │ │\n" "${DRIVE_MAP[1]:-EMPTY}" "${DRIVE_MAP[2]:-EMPTY}"
    printf "│ └──────────────┘ └──────────────┘ │\n"
    printf "└─────────────────────────────────────────────────────────────┘\n"
}

#------------------------------------------------------------------------------
# generate_large1_layout
#
# Prints the large1 tower diagram: title, storage controllers, both M.2 slots
# and bays 1-15 drawn as a grid of three columns by five rows.
#
# Args:
#   $1 - Host name shown in the title row
# Globals: DRIVE_MAP (rebuilt via build_drive_map, then read)
#------------------------------------------------------------------------------
generate_large1_layout() {
    local hostname=$1
    local ctrl

    build_drive_map

    # large1 has 3 stacks of 5 bays at front (15 total) + 2 M.2 slots
    # Physical bay mapping TBD - current mapping is by controller order
    printf "┌─────────────────────────────────────────────────────────────────────────┐\n"
    printf "│ %-69s │\n" "$hostname - Rosewill RSV-L4500U (15x 3.5\" Bays)"
    printf "│ │\n"
    printf "│ Storage Controllers: │\n"
    while IFS= read -r ctrl; do
        [[ -n "$ctrl" ]] && printf "│ %-69s│\n" "$ctrl"
    done < <(get_storage_controllers)
    printf "│ │\n"

    printf "│ M.2 NVMe: M1: %-10s M2: %-10s │\n" "${DRIVE_MAP[m2-1]:-EMPTY}" "${DRIVE_MAP[m2-2]:-EMPTY}"
    printf "│ │\n"

    printf "│ Front Bays (3 stacks x 5 rows): [Bay mapping TBD] │\n"
    printf "│ Stack A Stack B Stack C │\n"
    printf "│ ┌──────────┐ ┌──────────┐ ┌──────────┐ │\n"
    printf "│ │1:%-8s│ │2:%-8s│ │3:%-8s│ │\n" "${DRIVE_MAP[1]:-EMPTY}" "${DRIVE_MAP[2]:-EMPTY}" "${DRIVE_MAP[3]:-EMPTY}"
    printf "│ ├──────────┤ ├──────────┤ ├──────────┤ │\n"
    printf "│ │4:%-8s│ │5:%-8s│ │6:%-8s│ │\n" "${DRIVE_MAP[4]:-EMPTY}" "${DRIVE_MAP[5]:-EMPTY}" "${DRIVE_MAP[6]:-EMPTY}"
    printf "│ ├──────────┤ ├──────────┤ ├──────────┤ │\n"
    printf "│ │7:%-8s│ │8:%-8s│ │9:%-8s│ │\n" "${DRIVE_MAP[7]:-EMPTY}" "${DRIVE_MAP[8]:-EMPTY}" "${DRIVE_MAP[9]:-EMPTY}"
    printf "│ ├──────────┤ ├──────────┤ ├──────────┤ │\n"
    printf "│ │10:%-7s│ │11:%-7s│ │12:%-7s│ │\n" "${DRIVE_MAP[10]:-EMPTY}" "${DRIVE_MAP[11]:-EMPTY}" "${DRIVE_MAP[12]:-EMPTY}"
    printf "│ ├──────────┤ ├──────────┤ ├──────────┤ │\n"
    printf "│ │13:%-7s│ │14:%-7s│ │15:%-7s│ │\n" "${DRIVE_MAP[13]:-EMPTY}" "${DRIVE_MAP[14]:-EMPTY}" "${DRIVE_MAP[15]:-EMPTY}"
    printf "│ └──────────┘ └──────────┘ └──────────┘ │\n"
    printf "└─────────────────────────────────────────────────────────────────────────┘\n"
}

#------------------------------------------------------------------------------
# Server-Specific Drive Mappings
# Maps PCI paths to physical bay numbers for each server
# Format: "pci-path bay-number", one pair per line (build_drive_map reads the
# value line by line, so two pairs on one line would be misparsed)
#------------------------------------------------------------------------------
declare -A SERVER_MAPPINGS=(
    # compute-storage-01 (formerly medium2)
    # Motherboard: B650D4U3-2Q/BCM with AMD SATA controller
    # HBA: LSI SAS3008 at 01:00.0 (mini-SAS HD ports)
    # Cable mapping from user notes:
    # - Mobo SATA: top-right=bay1, bottom-right=bay2, bottom-left=bay3, top-left=bay4
    # - HBA bottom mini-SAS: bays 5,6,7,8
    # - HBA top mini-SAS: bays 9,10
    ["compute-storage-01"]="
        pci-0000:0d:00.0-ata-2 1
        pci-0000:0d:00.0-ata-1 2
        pci-0000:0d:00.0-ata-3 3
        pci-0000:0d:00.0-ata-4 4
        pci-0000:01:00.0-sas-phy6-lun-0 5
        pci-0000:01:00.0-sas-phy7-lun-0 6
        pci-0000:01:00.0-sas-phy5-lun-0 7
        pci-0000:01:00.0-sas-phy2-lun-0 8
        pci-0000:01:00.0-sas-phy4-lun-0 9
        pci-0000:01:00.0-sas-phy3-lun-0 10
        pci-0000:0e:00.0-nvme-1 m2-1
    "

    # compute-storage-gpu-01
    # Motherboard: ASUS PRIME B550-PLUS with AMD SATA controller at 02:00.1
    # 5 SATA ports + 1 M.2 NVMe slot
    # sdf is USB/card reader - not mapped
    ["compute-storage-gpu-01"]="
        pci-0000:02:00.1-ata-1 1
        pci-0000:02:00.1-ata-2 2
        pci-0000:02:00.1-ata-3 3
        pci-0000:02:00.1-ata-4 4
        pci-0000:02:00.1-ata-5 5
        pci-0000:0c:00.0-nvme-1 m2-1
    "

    # storage-01
    # Motherboard: ASRock A320M-HDV R4.0 with AMD SATA controller at 02:00.1
    # 4 SATA ports used (ata-1, ata-2, ata-5, ata-6) - ata-3/4 empty
    ["storage-01"]="
        pci-0000:02:00.1-ata-1 1
        pci-0000:02:00.1-ata-2 2
        pci-0000:02:00.1-ata-5 3
        pci-0000:02:00.1-ata-6 4
    "

    # large1
    # Custom tower with multiple controllers:
    # - HBA: LSI SAS2008 at 10:00.0 (7 drives)
    # - AMD SATA at 16:00.1 (3 drives)
    # - ASMedia SATA at 25:00.0 (2 drives)
    # - 2x NVMe slots
    ["large1"]="
        pci-0000:10:00.0-sas-phy0-lun-0 1
        pci-0000:10:00.0-sas-phy1-lun-0 2
        pci-0000:10:00.0-sas-phy3-lun-0 3
        pci-0000:10:00.0-sas-phy4-lun-0 4
        pci-0000:10:00.0-sas-phy5-lun-0 5
        pci-0000:10:00.0-sas-phy6-lun-0 6
        pci-0000:10:00.0-sas-phy7-lun-0 7
        pci-0000:16:00.1-ata-3 8
        pci-0000:16:00.1-ata-7 9
        pci-0000:16:00.1-ata-8 10
        pci-0000:25:00.0-ata-1 11
        pci-0000:25:00.0-ata-2 12
        pci-0000:2a:00.0-nvme-1 m2-1
        pci-0000:26:00.0-nvme-1 m2-2
    "

    # micro1
    # ZimaBoard 832 - Single board computer
    # 2 SATA ports on rear (currently unused)
    # Boot from onboard eMMC (mmcblk0)
    # SATA controller at 00:12.0
    ["micro1"]="
    "

    # monitor-02
    # ZimaBoard 832 - Single board computer
    # 2 SATA ports on rear (currently unused)
    # Boot from onboard eMMC (mmcblk0)
    # SATA controller would be at a specific PCI address when drives connected
    ["monitor-02"]="
    "
)

# Host name -> chassis layout selector used by the main display logic
declare -A CHASSIS_TYPES=(
    ["compute-storage-01"]="10bay"
    ["compute-storage-gpu-01"]="10bay"
    ["storage-01"]="10bay"
    ["large1"]="large1"
    ["micro1"]="micro"     # ZimaBoard 832
    ["monitor-02"]="micro" # ZimaBoard 832
)

#------------------------------------------------------------------------------
# Core Functions
#------------------------------------------------------------------------------

#------------------------------------------------------------------------------
# get_storage_controllers
#
# Prints one " <pci-address>: <description>" line per lspci entry whose text
# matches SAS, SATA, RAID, "Mass storage" or NVMe (case-insensitive).
# Prints nothing when lspci fails or nothing matches.
#------------------------------------------------------------------------------
get_storage_controllers() {
    local pci_addr desc

    # read splits each lspci line into its first word (the PCI address) and
    # the remainder (the description), so no per-line awk/sed is needed.
    lspci 2>/dev/null | grep -iE "SAS|SATA|RAID|Mass storage|NVMe" |
        while read -r pci_addr desc; do
            printf ' %s: %s\n' "$pci_addr" "$desc"
        done
}

#------------------------------------------------------------------------------
# build_drive_map
#
# Builds a global associative array mapping physical bay numbers to device names.
# Uses PCI paths from SERVER_MAPPINGS to resolve current device assignments.
# Only paths that currently exist as symlinks under /dev/disk/by-path are
# recorded, so unpopulated bays have no entry.
#
# Sets: DRIVE_MAP (global associative array, reset on every call)
#   Keys:   Bay identifiers (1, 2, ..., m2-1, m2-2, etc.)
#   Values: Device names (sda, nvme0n1, etc.)
#------------------------------------------------------------------------------
build_drive_map() {
    local host mapping path slot drive

    host="$(hostname)"
    mapping="${SERVER_MAPPINGS[$host]}"

    # Declare global array directly instead of copying from local
    declare -g -A DRIVE_MAP=()

    [[ -n "$mapping" ]] || return 0

    while read -r path slot; do
        # Skip blank lines and lines without a bay identifier
        [[ -z "$path" || -z "$slot" ]] && continue

        if [[ -L "/dev/disk/by-path/$path" ]]; then
            drive="$(readlink -f "/dev/disk/by-path/$path")"
            # Keep only the final path component (e.g. /dev/sda -> sda)
            DRIVE_MAP[$slot]="${drive##*/}"
        fi
    done <<< "$mapping"
}

#------------------------------------------------------------------------------
# get_drive_smart_info
#
# Retrieves SMART data for a given device via "sudo smartctl -A -i -H".
#
# Args:
#   $1 - Device name (e.g., sda, nvme0n1)
#
# Returns: Pipe-delimited string on stdout: TYPE|TEMP|HEALTH|MODEL|SERIAL
#   TYPE:   SSD, HDD, or NVMe (HDD when nothing indicates otherwise)
#   TEMP:   Temperature with a °C suffix (or "-" if unavailable)
#   HEALTH: ✓ for passed, ✗ otherwise (including when smartctl gives no data)
#   MODEL:  Drive model string (or "-")
#   SERIAL: Drive serial number (or "-")
#------------------------------------------------------------------------------
get_drive_smart_info() {
    local device="$1"
    local smart_info attr_line temp_line rotation_line
    local temp="-"
    local type="HDD"
    local health="✗"
    local model="-"
    local serial="-"
    local temp_display="-"

    smart_info="$(sudo smartctl -A -i -H "/dev/$device" 2>/dev/null)"

    # Temperature parsing - handles multiple formats:
    # - ATA attribute: "194 Temperature_Celsius ... 35 (Min/Max 20/45)"
    # - "Temperature: 42 Celsius"
    # - "Current Temperature: 35 Celsius"
    # - "Current Drive Temperature: 35 C"
    attr_line="$(grep -m1 "Temperature_Celsius" <<< "$smart_info")"
    temp_line="$(grep -m1 -E "^(Current (Drive )?)?Temperature:" <<< "$smart_info")"
    if [[ -n "$attr_line" ]]; then
        # Drop any trailing parenthesised group first: it can hold bare
        # numbers (e.g. "(0 20 0 0 0)") that are not the current reading.
        # Then take the last purely numeric field, which is the raw value.
        temp="$(awk '{
            sub(/[[:space:]]*\(.*$/, "")
            for (i = NF; i > 0; i--) if ($i ~ /^[0-9]+$/) { print $i; exit }
        }' <<< "$attr_line")"
    elif [[ -n "$temp_line" ]]; then
        # Take the first integer after the colon; a fixed field index would
        # return the word "Temperature:" for the "Current ..." variants.
        temp="$(awk -F: '{
            if (match($2, /[0-9]+/)) print substr($2, RSTART, RLENGTH)
        }' <<< "$temp_line")"
    fi

    # Device type detection - handles SSD, HDD, and NVMe
    rotation_line="$(grep -m1 "Rotation Rate" <<< "$smart_info")"
    if [[ "$device" == nvme* ]]; then
        type="NVMe"
    elif [[ -n "$rotation_line" ]]; then
        # The non-digit guard keeps "7200 rpm" / "5400 rpm" from matching
        # the "0 rpm" pattern and being reported as SSD.
        if grep -qiE "solid state|(^|[^0-9])0 rpm" <<< "$rotation_line"; then
            type="SSD"
        else
            type="HDD"
        fi
    elif grep -qiE "SSD|Solid State" <<< "$smart_info"; then
        type="SSD"
    fi

    # Health status
    if grep -q "SMART overall-health.*PASSED" <<< "$smart_info"; then
        health="✓"
    elif grep -q "SMART Health Status.*OK" <<< "$smart_info"; then
        # Wording smartctl uses for SCSI/SAS devices
        health="✓"
    fi

    # Model - try multiple field names. "-f2-" keeps values that themselves
    # contain a colon; awk '{$1=$1}' trims and collapses whitespace without
    # choking on quote characters the way xargs does.
    model="$(grep -m1 -E "^(Device Model|Model Number|Product):" <<< "$smart_info" |
        cut -d: -f2- | awk '{$1=$1; print}')"
    [[ -z "$model" ]] && model="-"

    # Serial number - capture everything after the first colon
    serial="$(grep -m1 -E "^Serial [Nn]umber:" <<< "$smart_info" |
        cut -d: -f2- | awk '{$1=$1; print}')"
    [[ -z "$serial" ]] && serial="-"

    # Format temperature with unit if we have a value
    if [[ -n "$temp" && "$temp" != "-" ]]; then
        temp_display="${temp}°C"
    fi

    echo "${type}|${temp_display}|${health}|${model}|${serial}"
}

#------------------------------------------------------------------------------
# Main Display Logic
#------------------------------------------------------------------------------

HOSTNAME=$(hostname)
CHASSIS_TYPE=${CHASSIS_TYPES[$HOSTNAME]:-"unknown"}

# Display chassis layout. Each generate_*_layout call also (re)builds
# DRIVE_MAP; for an unknown host DRIVE_MAP is never built, so the bay table
# below stays empty.
case "$CHASSIS_TYPE" in
    "10bay")
        generate_10bay_layout "$HOSTNAME"
        ;;
    "large1")
        generate_large1_layout "$HOSTNAME"
        ;;
    "micro")
        generate_micro_layout "$HOSTNAME"
        ;;
    *)
        echo "┌─────────────────────────────────────────────────────────┐"
        echo "│ Unknown server: $HOSTNAME"
        echo "│ No chassis mapping defined yet"
        echo "│ Run diagnose-drives.sh to gather PCI path information"
        echo "└─────────────────────────────────────────────────────────┘"
        ;;
esac

#------------------------------------------------------------------------------
# Drive Details Section
#------------------------------------------------------------------------------
printf '\n=== Drive Details with SMART Status (by Bay Position) ===\n'
printf "%-5s %-15s %-10s %-8s %-8s %-8s %-30s %-20s %-12s %-10s %-10s\n" "BAY" "DEVICE" "SIZE" "TYPE" "TEMP" "HEALTH" "MODEL" "SERIAL" "CEPH OSD" "STATUS" "USAGE"
echo "----------------------------------------------------------------------------------------------------------------------------------------------------"

# Build reverse map: device -> bay
# NOTE(review): nothing in this section reads DEVICE_TO_BAY; it is kept in
# case something outside this view relies on it - confirm and remove if not.
declare -A DEVICE_TO_BAY
for bay in "${!DRIVE_MAP[@]}"; do
    device="${DRIVE_MAP[$bay]}"
    if [[ -n "$device" && "$device" != "EMPTY" ]]; then
        DEVICE_TO_BAY[$device]=$bay
    fi
done

# Ceph command output is cached on first use: both commands are slow and their
# output does not depend on the drive being inspected. They are loaded lazily
# so hosts without mapped drives / OSDs never invoke them, as before.
ceph_volume_listing=""
ceph_volume_loaded=0
ceph_tree_listing=""
ceph_tree_loaded=0

# Sort drives by bay position (numeric bays only; m2-* entries are listed in
# the NVMe section below)
for bay in $(printf '%s\n' "${!DRIVE_MAP[@]}" | grep -E '^[0-9]+$' | sort -n); do
    device="${DRIVE_MAP[$bay]}"
    if [[ -n "$device" && "$device" != "EMPTY" && -b "/dev/$device" ]]; then
        size=$(lsblk -d -n -o SIZE "/dev/$device" 2>/dev/null)
        smart_info=$(get_drive_smart_info "$device")
        IFS='|' read -r type temp health model serial <<< "$smart_info"

        # Check for Ceph OSD
        if (( ! ceph_volume_loaded )); then
            ceph_volume_listing=$(ceph-volume lvm list 2>/dev/null)
            ceph_volume_loaded=1
        fi
        osd_id=$(grep -B 20 "/dev/$device" <<< "$ceph_volume_listing" |
            grep "osd id" | awk '{print "osd."$3}' | head -1)

        # Get Ceph status if OSD exists
        ceph_status="-"
        if [[ -n "$osd_id" ]]; then
            if (( ! ceph_tree_loaded )); then
                ceph_tree_listing=$(ceph osd tree 2>/dev/null)
                ceph_tree_loaded=1
            fi

            # Get in/out and up/down status from ceph osd tree
            osd_num="${osd_id#osd.}"

            # Parse ceph osd tree output - column 5 is STATUS (up/down),
            # column 6 is REWEIGHT (1.0 = in, 0 = out)
            tree_line=$(grep -E "^\s*${osd_num}\s+" <<< "$ceph_tree_listing" | grep "osd.${osd_num}")
            up_status=$(awk '{print $5}' <<< "$tree_line")
            reweight=$(awk '{print $6}' <<< "$tree_line")

            # Default to unknown if we can't parse
            [[ -z "$up_status" ]] && up_status="unknown"
            [[ -z "$reweight" ]] && reweight="0"

            # Determine in/out based on reweight (1.0 = in, 0 = out).
            # awk handles the floating point comparison; the value is passed
            # with -v so it is treated as data, never as awk program text.
            if awk -v rw="$reweight" 'BEGIN { exit !(rw + 0 > 0) }'; then
                in_status="in"
            else
                in_status="out"
            fi
            ceph_status="${up_status}/${in_status}"
        else
            osd_id="-"
        fi

        # Check if boot drive: use the first mount whose source starts with
        # this device name (whole disk or one of its partitions)
        usage="-"
        mount_point=$(mount | awk -v dev="/dev/${device}" 'index($1, dev) == 1 { print $3; exit }')
        if [[ -n "$mount_point" ]]; then
            if [[ "$mount_point" == "/" ]]; then
                usage="BOOT"
            else
                usage="$mount_point"
            fi
        fi

        printf "%-5s %-15s %-10s %-8s %-8s %-8s %-30s %-20s %-12s %-10s %-10s\n" "$bay" "/dev/$device" "$size" "$type" "$temp" "$health" "$model" "$serial" "$osd_id" "$ceph_status" "$usage"
    fi
done

# NVMe drives (stderr is silenced on lsblk, the command that can actually fail)
nvme_devices=$(lsblk -d -n -o NAME,SIZE 2>/dev/null | grep "^nvme")
if [[ -n "$nvme_devices" ]]; then
    printf '\n=== NVMe Drives ===\n'
    printf "%-15s %-10s %-10s %-40s %-25s\n" "DEVICE" "SIZE" "TYPE" "MODEL" "SERIAL"
    echo "------------------------------------------------------------------------------------------------------"
    echo "$nvme_devices" | while read -r name size; do
        device="/dev/$name"
        # Get model and serial from smartctl for accuracy. "-f2-" keeps values
        # containing a colon; awk trims whitespace without tripping on quotes.
        smart_info=$(sudo smartctl -i "$device" 2>/dev/null)
        model=$(grep -m1 "Model Number" <<< "$smart_info" | cut -d: -f2- | awk '{$1=$1; print}')
        serial=$(grep -m1 "Serial Number" <<< "$smart_info" | cut -d: -f2- | awk '{$1=$1; print}')
        [[ -z "$model" ]] && model="-"
        [[ -z "$serial" ]] && serial="-"
        printf "%-15s %-10s %-10s %-40s %-25s\n" "$device" "$size" "NVMe" "$model" "$serial"
    done
fi

#------------------------------------------------------------------------------
# Optional sections
#------------------------------------------------------------------------------

# Ceph RBD Devices
rbd_devices=$(lsblk -d -n -o NAME,SIZE,TYPE 2>/dev/null | grep "rbd" | sort -V)
if [[ -n "$rbd_devices" ]]; then
    printf '\n=== Ceph RBD Devices ===\n'
    printf "%-15s %-10s %-10s %-30s\n" "DEVICE" "SIZE" "TYPE" "MOUNTPOINT"
    echo "------------------------------------------------------------"
    echo "$rbd_devices" | while read -r name size type; do
        # Get mountpoint if any
        mountpoint=$(lsblk -n -o MOUNTPOINT "/dev/$name" 2>/dev/null | head -1)
        [[ -z "$mountpoint" ]] && mountpoint="-"
        printf "%-15s %-10s %-10s %-30s\n" "/dev/$name" "$size" "$type" "$mountpoint"
    done
fi

# Show mapping diagnostic info if DEBUG is set
if [[ -n "$DEBUG" ]]; then
    printf '\n=== DEBUG: Drive Mappings ===\n'
    # Every line starts with "Bay ", so a numeric sort would see no leading
    # number and fall back to byte order (Bay 10 before Bay 2). Version sort
    # orders the embedded bay numbers naturally.
    for key in "${!DRIVE_MAP[@]}"; do
        echo "Bay $key: ${DRIVE_MAP[$key]}"
    done | sort -V
fi