#!/bin/bash
#==============================================================================
# Drive Atlas - Server Drive Mapping Tool
# Maps physical drive bays to logical device names using PCI paths
#==============================================================================

# Shell safety options:
#   -o pipefail: Exit status of pipe is rightmost non-zero exit code
#   Note: Not using -e (errexit) to allow graceful degradation when tools fail
#   Note: Not using -u (nounset) as script uses ${var:-default} patterns
set -o pipefail

VERSION="1.1.0"

#------------------------------------------------------------------------------
# Path Constants
# Centralized path definitions to avoid hardcoding throughout the script
#------------------------------------------------------------------------------
readonly DISK_BY_PATH="/dev/disk/by-path"

#------------------------------------------------------------------------------
# show_usage
#
# Displays help message with usage information and available options.
# The here-document delimiter is unquoted on purpose so that ${VERSION} and
# $(basename "$0") are expanded when the help text is printed.
#------------------------------------------------------------------------------
show_usage() {
    cat << EOF
Drive Atlas v${VERSION} - Server Drive Mapping Tool

Maps physical drive bays to logical device names using PCI paths.
Displays visual chassis layouts and comprehensive drive information.

USAGE:
    $(basename "$0") [OPTIONS]

OPTIONS:
    -h, --help        Show this help message and exit
    -v, --version     Show version information
    -d, --debug       Enable debug output (show drive mappings)
    -s, --skip-smart  Skip SMART data collection (faster)
    -c, --color       Enable colored output
    --verbose         Show detailed error messages and warnings
    --no-ceph         Skip Ceph OSD information
    --show-pci        Show PCI paths in output

EXAMPLES:
    $(basename "$0")                # Normal run with all features
    $(basename "$0") --skip-smart   # Fast run without SMART data
    $(basename "$0") --color        # Run with colored output
    $(basename "$0") --verbose      # Show all errors and warnings
    $(basename "$0") --debug        # Show mapping debug info

ENVIRONMENT VARIABLES:
    DEBUG=1           Same as --debug flag

For more information, see:
    https://code.lotusguild.org/LotusGuild/driveAtlas
EOF
}

#------------------------------------------------------------------------------
# Command Line Argument Parsing
#
# Every recognised flag toggles one global; --help and --version print and
# exit 0; anything unrecognised is reported on stderr and exits 1.
#------------------------------------------------------------------------------
SKIP_SMART=false
SKIP_CEPH=false
SHOW_PCI=false
USE_COLOR=false
VERBOSE=false

while [[ $# -gt 0 ]]; do
    case "$1" in
        -h|--help)
            show_usage
            exit 0
            ;;
        -v|--version)
            echo "Drive Atlas v${VERSION}"
            exit 0
            ;;
        -d|--debug)
            DEBUG=1
            shift
            ;;
        -s|--skip-smart)
            SKIP_SMART=true
            shift
            ;;
        --no-ceph)
            SKIP_CEPH=true
            shift
            ;;
        --show-pci)
            SHOW_PCI=true
            shift
            ;;
        -c|--color)
            USE_COLOR=true
            shift
            ;;
        --verbose)
            VERBOSE=true
            shift
            ;;
        *)
            echo "Unknown option: $1" >&2
            echo "Use --help for usage information." >&2
            exit 1
            ;;
    esac
done

#------------------------------------------------------------------------------
# Color Definitions
# ANSI escape codes for terminal colors. The values are stored with a literal
# backslash and must be emitted through printf's %b (or echo -e) to take
# effect. With colors disabled every variable is the empty string.
#------------------------------------------------------------------------------
if [[ "$USE_COLOR" == true ]]; then
    COLOR_RESET='\033[0m'
    COLOR_RED='\033[0;31m'
    COLOR_GREEN='\033[0;32m'
    COLOR_YELLOW='\033[0;33m'
    COLOR_BLUE='\033[0;34m'
    COLOR_CYAN='\033[0;36m'
    COLOR_BOLD='\033[1m'
else
    COLOR_RESET=''
    COLOR_RED=''
    COLOR_GREEN=''
    COLOR_YELLOW=''
    COLOR_BLUE=''
    COLOR_CYAN=''
    COLOR_BOLD=''
fi

#------------------------------------------------------------------------------
# colorize_health
#
# Returns health indicator with appropriate color (no trailing newline).
# Args: $1 - health status: ✓ (green), ⚠ (yellow), "-" or empty (left
#            uncolored, it is only a placeholder), anything else (red)
#------------------------------------------------------------------------------
colorize_health() {
    local health="$1"
    local color

    if [[ "$USE_COLOR" != true ]]; then
        printf '%s' "$health"
        return
    fi

    case "$health" in
        "✓")
            color="$COLOR_GREEN"
            ;;
        "⚠")
            # Passed the SMART self-assessment but tripped a threshold.
            color="$COLOR_YELLOW"
            ;;
        ""|"-")
            # "No data" placeholder: nothing to highlight.
            printf '%s' "$health"
            return
            ;;
        *)
            color="$COLOR_RED"
            ;;
    esac

    printf '%b%s%b' "$color" "$health" "$COLOR_RESET"
}

#------------------------------------------------------------------------------
# colorize_temp
#
# Returns temperature with color based on value (no trailing newline).
# Thresholds follow SMART_TEMP_WARN / SMART_TEMP_CRIT when those are set and
# fall back to 50 / 60 otherwise.
# Args: $1 - temperature string (e.g., "45°C")
#------------------------------------------------------------------------------
colorize_temp() {
    local temp_str="$1"
    local temp_val
    local warn_at="${SMART_TEMP_WARN:-50}"
    local crit_at="${SMART_TEMP_CRIT:-60}"

    if [[ "$USE_COLOR" != true || "$temp_str" == "-" ]]; then
        printf '%s' "$temp_str"
        return
    fi

    # Extract numeric value
    temp_val="${temp_str%°C}"

    if [[ ! "$temp_val" =~ ^[0-9]+$ ]]; then
        # Not a plain number: pass it through untouched.
        printf '%s' "$temp_str"
    elif [[ "$temp_val" -ge "$crit_at" ]]; then
        printf '%b%s%b' "$COLOR_RED" "$temp_str" "$COLOR_RESET"
    elif [[ "$temp_val" -ge "$warn_at" ]]; then
        printf '%b%s%b' "$COLOR_YELLOW" "$temp_str" "$COLOR_RESET"
    else
        printf '%b%s%b' "$COLOR_GREEN" "$temp_str" "$COLOR_RESET"
    fi
}

#------------------------------------------------------------------------------
# colorize_header
#
# Returns header text in blue/bold
# Args: $1 - header text
#------------------------------------------------------------------------------
colorize_header() {
    local text="$1"

    if [[ "$USE_COLOR" == true ]]; then
        printf '%b%b%s%b\n' "$COLOR_BLUE" "$COLOR_BOLD" "$text" "$COLOR_RESET"
    else
        printf '%s\n' "$text"
    fi
}

#------------------------------------------------------------------------------
# log_error
#
# Logs an error message to stderr. Always shown regardless of verbose mode.
# Args: $1 - error message
#------------------------------------------------------------------------------
log_error() {
    local message="$1"

    if [[ "$USE_COLOR" == true ]]; then
        printf '%bERROR:%b %s\n' "$COLOR_RED" "$COLOR_RESET" "$message" >&2
    else
        printf 'ERROR: %s\n' "$message" >&2
    fi
}

#------------------------------------------------------------------------------
# log_warn
#
# Logs a warning message to stderr. Only shown in verbose mode.
# Args: $1 - warning message
#------------------------------------------------------------------------------
log_warn() {
    local message="$1"

    # Quiet unless --verbose was given.
    [[ "$VERBOSE" == true ]] || return 0

    if [[ "$USE_COLOR" == true ]]; then
        printf '%bWARN:%b %s\n' "$COLOR_YELLOW" "$COLOR_RESET" "$message" >&2
    else
        printf 'WARN: %s\n' "$message" >&2
    fi
}

#------------------------------------------------------------------------------
# log_info
#
# Logs an informational message to stderr. Only shown in verbose mode.
# Args: $1 - info message
#------------------------------------------------------------------------------
log_info() {
    local message="$1"

    # Quiet unless --verbose was given.
    [[ "$VERBOSE" == true ]] || return 0

    if [[ "$USE_COLOR" == true ]]; then
        printf '%bINFO:%b %s\n' "$COLOR_CYAN" "$COLOR_RESET" "$message" >&2
    else
        printf 'INFO: %s\n' "$message" >&2
    fi
}

#------------------------------------------------------------------------------
# Dependency Checks
# Verifies required commands are available before running
#------------------------------------------------------------------------------

# Required dependencies (script will not function without these)
REQUIRED_DEPS=(lsblk lspci readlink hostname)

# Optional dependencies (enhanced functionality)
OPTIONAL_DEPS=(smartctl ceph ceph-volume bc nvme)

# NOTE(review): plain-HTTP URL on a private address that the hints below
# suggest piping straight into bash. Anyone able to tamper with that
# connection controls what gets executed - consider HTTPS or a checksum.
FRESH_START_URL="http://10.10.10.63:3000/LotusGuild/freshStartScript/raw/branch/main/freshStart.sh"

#------------------------------------------------------------------------------
# check_dependencies
#
# Verifies required and optional commands are available.
# Exits with error if required dependencies are missing.
# Warns about missing optional dependencies.
#
# Globals read: REQUIRED_DEPS, OPTIONAL_DEPS, FRESH_START_URL, SKIP_SMART
# Output:       diagnostics on stderr only
# Exit:         terminates the script with status 1 when a required command
#               is missing; returns normally otherwise
#------------------------------------------------------------------------------
check_dependencies() {
    local cmd
    local missing_required=()
    local missing_optional=()

    # Check required dependencies
    for cmd in "${REQUIRED_DEPS[@]}"; do
        if ! command -v "$cmd" &>/dev/null; then
            missing_required+=("$cmd")
        fi
    done

    # Check optional dependencies
    for cmd in "${OPTIONAL_DEPS[@]}"; do
        if ! command -v "$cmd" &>/dev/null; then
            missing_optional+=("$cmd")
        fi
    done

    # Report missing required dependencies and exit
    if [[ ${#missing_required[@]} -gt 0 ]]; then
        echo "ERROR: Missing required dependencies: ${missing_required[*]}" >&2
        echo "" >&2
        echo "Please install the missing packages or run the fresh start script:" >&2
        echo "  curl -s $FRESH_START_URL | bash" >&2
        echo "" >&2
        exit 1
    fi

    # Warn about missing optional dependencies
    if [[ ${#missing_optional[@]} -gt 0 ]]; then
        echo "Note: Some optional features unavailable. Missing: ${missing_optional[*]}" >&2
        echo "      Install them or run: curl -s $FRESH_START_URL | bash" >&2
        echo "" >&2
    fi

    # Check for sudo access (needed for smartctl). Pointless when the user
    # asked to skip SMART collection, so stay quiet in that case.
    if [[ "${SKIP_SMART:-false}" != true ]] \
        && command -v smartctl &>/dev/null \
        && ! sudo -n true 2>/dev/null; then
        echo "Note: SMART data requires sudo access. Run with sudo for full functionality." >&2
    fi
}

# Run dependency check at script start
check_dependencies

#------------------------------------------------------------------------------
# Chassis Layout Generator Functions
# These define the physical layout and display formatting for each chassis type
#------------------------------------------------------------------------------

#------------------------------------------------------------------------------
# generate_10bay_layout
#
# Generates ASCII art representation of a 10-bay hot-swap chassis (Sliger CX4712).
# Shows storage controllers, M.2 NVMe slot, and 10 front hot-swap bays.
#
# Args:
#   $1 - Hostname to display in the layout header
#
# Side effects: Calls build_drive_map() to populate DRIVE_MAP
#
# Every row is derived from the bay geometry declared at the top of the
# function, so the right-hand border always lines up with the frame. Text that
# would not fit (long controller descriptions, long device names) is
# truncated instead of pushing the border out.
#------------------------------------------------------------------------------
generate_10bay_layout() {
    local hostname="$1"
    local -r bay_count=10
    local -r cell_width=12          # one bay cell: "│NN:device │"
    # One column of left margin, then every cell followed by one space.
    local -r inner_width=$(( 1 + bay_count * (cell_width + 1) ))
    local -r text_width=$(( inner_width - 2 ))
    local frame_rule cell_rule ctrl bay

    build_drive_map

    # Horizontal rules: pad with spaces to the wanted width, then swap them.
    printf -v frame_rule '%*s' "$inner_width" ''
    frame_rule="${frame_rule// /─}"
    printf -v cell_rule '%*s' "$(( cell_width - 2 ))" ''
    cell_rule="${cell_rule// /─}"

    # Main chassis section
    printf '┌%s┐\n' "$frame_rule"
    printf '│ %-*.*s │\n' "$text_width" "$text_width" \
        "$hostname - Sliger CX4712 (10x 3.5\" Hot-swap)"
    printf '│ %-*s │\n' "$text_width" ''

    # Show storage controllers
    printf '│ %-*.*s │\n' "$text_width" "$text_width" 'Storage Controllers:'
    while IFS= read -r ctrl; do
        if [[ -n "$ctrl" ]]; then
            printf '│ %-*.*s │\n' "$text_width" "$text_width" "$ctrl"
        fi
    done < <(get_storage_controllers)
    printf '│ %-*s │\n' "$text_width" ''

    # M.2 NVMe slot if present
    if [[ -n "${DRIVE_MAP[m2-1]}" ]]; then
        printf '│ %-*.*s │\n' "$text_width" "$text_width" \
            "M.2 NVMe: ${DRIVE_MAP[m2-1]}"
        printf '│ %-*s │\n' "$text_width" ''
    fi

    printf '│ %-*.*s │\n' "$text_width" "$text_width" 'Front Hot-swap Bays:'
    printf '│ %-*s │\n' "$text_width" ''

    # Bay top borders
    printf '│ '
    for (( bay = 1; bay <= bay_count; bay++ )); do
        printf '┌%s┐ ' "$cell_rule"
    done
    printf '│\n'

    # Bay contents (2-column bay number, 7-column device name)
    printf '│ '
    for (( bay = 1; bay <= bay_count; bay++ )); do
        printf '│%-2d:%-7.7s│ ' "$bay" "${DRIVE_MAP[$bay]:-EMPTY}"
    done
    printf '│\n'

    # Bay bottom borders
    printf '│ '
    for (( bay = 1; bay <= bay_count; bay++ )); do
        printf '└%s┘ ' "$cell_rule"
    done
    printf '│\n'

    printf '└%s┘\n' "$frame_rule"
}

#------------------------------------------------------------------------------
# generate_micro_layout
#
# Generates ASCII art representation of a micro SBC (e.g., ZimaBoard).
# Shows storage controllers, onboard eMMC (if present), and 2 SATA ports.
#
# Args:
#   $1 - Hostname to display in the layout header
#
# Side effects: Calls build_drive_map() to populate DRIVE_MAP
#
# All rows are padded from a single frame width, so the right-hand border
# stays aligned; over-long text is truncated to fit.
#------------------------------------------------------------------------------
generate_micro_layout() {
    local hostname="$1"
    local -r inner_width=61
    local -r text_width=$(( inner_width - 2 ))
    local -r port_width=16          # one SATA port cell: "│ N: device    │"
    local -r port_gap=2
    # Columns left over to the right of the two port cells.
    local -r port_pad=$(( inner_width - 1 - 2 * port_width - port_gap ))
    local frame_rule emmc_rule port_rule ctrl
    local emmc_device="" emmc_size emmc_text port1 port2

    build_drive_map

    # Check for eMMC storage
    if [[ -b /dev/mmcblk0 ]]; then
        emmc_device="mmcblk0"
    fi

    # Horizontal rules: pad with spaces to the wanted width, then swap them.
    printf -v frame_rule '%*s' "$inner_width" ''
    frame_rule="${frame_rule// /─}"
    printf -v emmc_rule '%*s' "$(( text_width - 2 ))" ''
    emmc_rule="${emmc_rule// /─}"
    printf -v port_rule '%*s' "$(( port_width - 2 ))" ''
    port_rule="${port_rule// /─}"

    printf '┌%s┐\n' "$frame_rule"
    printf '│ %-*.*s │\n' "$text_width" "$text_width" "$hostname - Micro SBC"
    printf '│ %-*s │\n' "$text_width" ''

    printf '│ %-*.*s │\n' "$text_width" "$text_width" 'Storage Controllers:'
    while IFS= read -r ctrl; do
        if [[ -n "$ctrl" ]]; then
            printf '│ %-*.*s │\n' "$text_width" "$text_width" "$ctrl"
        fi
    done < <(get_storage_controllers)
    printf '│ %-*s │\n' "$text_width" ''

    # Show eMMC if present
    if [[ -n "$emmc_device" ]]; then
        emmc_size="$(lsblk -d -n -o SIZE "/dev/$emmc_device" 2>/dev/null | xargs)"
        printf -v emmc_text 'Onboard eMMC: %-10s (%s)' "$emmc_device" "$emmc_size"
        printf '│ ┌%s┐ │\n' "$emmc_rule"
        printf '│ │ %-*.*s │ │\n' \
            "$(( text_width - 4 ))" "$(( text_width - 4 ))" "$emmc_text"
        printf '│ └%s┘ │\n' "$emmc_rule"
        printf '│ %-*s │\n' "$text_width" ''
    fi

    printf '│ %-*.*s │\n' "$text_width" "$text_width" 'SATA Ports (rear):'

    # Two port cells side by side, then spaces up to the right border.
    printf -v port1 '│ 1: %-9.9s │' "${DRIVE_MAP[1]:-EMPTY}"
    printf -v port2 '│ 2: %-9.9s │' "${DRIVE_MAP[2]:-EMPTY}"
    printf '│ ┌%s┐%*s┌%s┐%*s│\n' \
        "$port_rule" "$port_gap" '' "$port_rule" "$port_pad" ''
    printf '│ %s%*s%s%*s│\n' \
        "$port1" "$port_gap" '' "$port2" "$port_pad" ''
    printf '│ └%s┘%*s└%s┘%*s│\n' \
        "$port_rule" "$port_gap" '' "$port_rule" "$port_pad" ''

    printf '└%s┘\n' "$frame_rule"
}

#------------------------------------------------------------------------------
# generate_large1_layout
#
# Generates ASCII art representation of a large1 chassis (Rosewill RSV-L4500U).
# Shows storage controllers, 2 M.2 NVMe slots, and 15 front bays in 3x5 grid.
#
# Args:
#   $1 - Hostname to display in the layout header
#
# Side effects: Calls build_drive_map() to populate DRIVE_MAP
#
# Bays are numbered row by row across the three stacks (row 1 holds bays
# 1-3, row 2 holds 4-6, ...). All rows are padded from a single frame width
# so the right-hand border stays aligned; over-long text is truncated.
#------------------------------------------------------------------------------
generate_large1_layout() {
    local hostname="$1"
    local -r inner_width=73
    local -r text_width=$(( inner_width - 2 ))
    local -r stacks=3
    local -r rows=5
    local -r cell_width=12          # one bay cell: "│NN:device │"
    local -r gap=2
    # Columns left over to the right of the three stacks.
    local -r grid_pad=$(( inner_width - 1 - stacks * cell_width - (stacks - 1) * gap ))
    local frame_rule cell_rule ctrl text row col bay cell
    local -a cells

    build_drive_map

    # large1 has 3 stacks of 5 bays at front (15 total) + 2 M.2 slots
    # Physical bay mapping TBD - current mapping is by controller order

    # Horizontal rules: pad with spaces to the wanted width, then swap them.
    printf -v frame_rule '%*s' "$inner_width" ''
    frame_rule="${frame_rule// /─}"
    printf -v cell_rule '%*s' "$(( cell_width - 2 ))" ''
    cell_rule="${cell_rule// /─}"

    printf '┌%s┐\n' "$frame_rule"
    printf '│ %-*.*s │\n' "$text_width" "$text_width" \
        "$hostname - Rosewill RSV-L4500U (15x 3.5\" Bays)"
    printf '│ %-*s │\n' "$text_width" ''

    printf '│ %-*.*s │\n' "$text_width" "$text_width" 'Storage Controllers:'
    while IFS= read -r ctrl; do
        if [[ -n "$ctrl" ]]; then
            printf '│ %-*.*s │\n' "$text_width" "$text_width" "$ctrl"
        fi
    done < <(get_storage_controllers)
    printf '│ %-*s │\n' "$text_width" ''

    printf -v text 'M.2 NVMe: M1: %-10s M2: %-10s' \
        "${DRIVE_MAP[m2-1]:-EMPTY}" "${DRIVE_MAP[m2-2]:-EMPTY}"
    printf '│ %-*.*s │\n' "$text_width" "$text_width" "$text"
    printf '│ %-*s │\n' "$text_width" ''

    printf '│ %-*.*s │\n' "$text_width" "$text_width" \
        'Front Bays (3 stacks x 5 rows): [Bay mapping TBD]'

    # Column captions sit directly above their stack.
    printf -v text '%-*s%*s%-*s%*s%-*s' \
        "$cell_width" 'Stack A' "$gap" '' \
        "$cell_width" 'Stack B' "$gap" '' \
        "$cell_width" 'Stack C'
    printf '│ %-*.*s │\n' "$text_width" "$text_width" "$text"

    printf '│ ┌%s┐%*s┌%s┐%*s┌%s┐%*s│\n' \
        "$cell_rule" "$gap" '' "$cell_rule" "$gap" '' "$cell_rule" "$grid_pad" ''

    for (( row = 0; row < rows; row++ )); do
        cells=()
        for (( col = 1; col <= stacks; col++ )); do
            bay=$(( row * stacks + col ))
            # "N:device" padded to the 10 columns between the cell borders.
            printf -v cell '│%-10.10s│' "${bay}:${DRIVE_MAP[$bay]:-EMPTY}"
            cells+=("$cell")
        done
        printf '│ %s%*s%s%*s%s%*s│\n' \
            "${cells[0]}" "$gap" '' "${cells[1]}" "$gap" '' "${cells[2]}" "$grid_pad" ''

        # Divider between rows, but not after the last one.
        if (( row < rows - 1 )); then
            printf '│ ├%s┤%*s├%s┤%*s├%s┤%*s│\n' \
                "$cell_rule" "$gap" '' "$cell_rule" "$gap" '' "$cell_rule" "$grid_pad" ''
        fi
    done

    printf '│ └%s┘%*s└%s┘%*s└%s┘%*s│\n' \
        "$cell_rule" "$gap" '' "$cell_rule" "$gap" '' "$cell_rule" "$grid_pad" ''
    printf '└%s┘\n' "$frame_rule"
}
#------------------------------------------------------------------------------
# Server-Specific Drive Mappings
# Maps PCI paths to physical bay numbers for each server
# Format: "pci-path bay-number"
#
# Each value is a multi-line string with exactly ONE "pci-path bay" record per
# line: build_drive_map() consumes it with `read -r path slot`, so records
# must not share a line. Blank lines are ignored.
#------------------------------------------------------------------------------
declare -A SERVER_MAPPINGS=(
    # compute-storage-01 (formerly medium2)
    # Motherboard: B650D4U3-2Q/BCM with AMD SATA controller
    # HBA: LSI SAS3008 at 01:00.0 (mini-SAS HD ports)
    # Cable mapping from user notes:
    #   - Mobo SATA: top-right=bay1, bottom-right=bay2, bottom-left=bay3, top-left=bay4
    #   - HBA bottom mini-SAS: bays 5,6,7,8
    #   - HBA top mini-SAS: bays 9,10
    ["compute-storage-01"]="
pci-0000:0d:00.0-ata-2 1
pci-0000:0d:00.0-ata-1 2
pci-0000:0d:00.0-ata-3 3
pci-0000:0d:00.0-ata-4 4
pci-0000:01:00.0-sas-phy6-lun-0 5
pci-0000:01:00.0-sas-phy7-lun-0 6
pci-0000:01:00.0-sas-phy5-lun-0 7
pci-0000:01:00.0-sas-phy2-lun-0 8
pci-0000:01:00.0-sas-phy4-lun-0 9
pci-0000:01:00.0-sas-phy3-lun-0 10
pci-0000:0e:00.0-nvme-1 m2-1
"

    # compute-storage-gpu-01
    # Motherboard: ASUS PRIME B550-PLUS with AMD SATA controller at 02:00.1
    # 5 SATA ports + 1 M.2 NVMe slot
    # sdf is USB/card reader - not mapped
    ["compute-storage-gpu-01"]="
pci-0000:02:00.1-ata-1 1
pci-0000:02:00.1-ata-2 2
pci-0000:02:00.1-ata-3 3
pci-0000:02:00.1-ata-4 4
pci-0000:02:00.1-ata-5 5
pci-0000:0c:00.0-nvme-1 m2-1
"

    # storage-01
    # Motherboard: ASRock A320M-HDV R4.0 with AMD SATA controller at 02:00.1
    # 4 SATA ports used (ata-1, ata-2, ata-5, ata-6) - ata-3/4 empty
    ["storage-01"]="
pci-0000:02:00.1-ata-1 1
pci-0000:02:00.1-ata-2 2
pci-0000:02:00.1-ata-5 3
pci-0000:02:00.1-ata-6 4
"

    # large1
    # Custom tower with multiple controllers:
    #   - HBA: LSI SAS2008 at 10:00.0 (7 drives)
    #   - AMD SATA at 16:00.1 (3 drives)
    #   - ASMedia SATA at 25:00.0 (2 drives)
    #   - 2x NVMe slots
    ["large1"]="
pci-0000:10:00.0-sas-phy0-lun-0 1
pci-0000:10:00.0-sas-phy1-lun-0 2
pci-0000:10:00.0-sas-phy3-lun-0 3
pci-0000:10:00.0-sas-phy4-lun-0 4
pci-0000:10:00.0-sas-phy5-lun-0 5
pci-0000:10:00.0-sas-phy6-lun-0 6
pci-0000:10:00.0-sas-phy7-lun-0 7
pci-0000:16:00.1-ata-3 8
pci-0000:16:00.1-ata-7 9
pci-0000:16:00.1-ata-8 10
pci-0000:25:00.0-ata-1 11
pci-0000:25:00.0-ata-2 12
pci-0000:2a:00.0-nvme-1 m2-1
pci-0000:26:00.0-nvme-1 m2-2
"

    # micro1
    # ZimaBoard 832 - Single board computer
    # 2 SATA ports on rear (currently unused)
    # Boot from onboard eMMC (mmcblk0)
    # SATA controller at 00:12.0
    ["micro1"]="
"

    # monitor-02
    # ZimaBoard 832 - Single board computer
    # 2 SATA ports on rear (currently unused)
    # Boot from onboard eMMC (mmcblk0)
    # SATA controller would be at a specific PCI address when drives connected
    ["monitor-02"]="
"
)

# Chassis layout used to draw each known host; hosts that are not listed here
# are treated as "unknown" by the display logic.
declare -A CHASSIS_TYPES=(
    ["compute-storage-01"]="10bay"
    ["compute-storage-gpu-01"]="10bay"
    ["storage-01"]="10bay"
    ["large1"]="large1"
    ["micro1"]="micro"        # ZimaBoard 832
    ["monitor-02"]="micro"    # ZimaBoard 832
)

#------------------------------------------------------------------------------
# Core Functions
#------------------------------------------------------------------------------

# Cache for lspci output (populated on first call)
LSPCI_CACHE=""

#------------------------------------------------------------------------------
# get_storage_controllers
#
# Returns a formatted list of storage controllers found via lspci.
# Uses cached output if available to avoid redundant lspci calls.
#
# Output Format: "  PCI_ADDR: DESCRIPTION" (one per line)
#
# Note: when this function runs inside a subshell (command or process
# substitution) the cache it fills is lost when that subshell exits.
#------------------------------------------------------------------------------
get_storage_controllers() {
    local line pci_addr desc

    # Cache lspci output on first call
    if [[ -z "$LSPCI_CACHE" ]]; then
        LSPCI_CACHE="$(lspci 2>/dev/null | grep -iE "SAS|SATA|RAID|Mass storage|NVMe")"
    fi

    # Format and return cached output
    while read -r line; do
        [[ -z "$line" ]] && continue
        pci_addr="${line%% *}"      # first whitespace-delimited field
        desc="${line#* }"           # everything after the PCI address
        printf '  %s: %s\n' "$pci_addr" "$desc"
    done <<< "$LSPCI_CACHE"
}

#------------------------------------------------------------------------------
# build_drive_map
#
# Builds a global associative array mapping physical bay numbers to device names.
# Uses PCI paths from SERVER_MAPPINGS to resolve current device assignments.
#
# Sets:
#   DRIVE_MAP (global associative array)
#     Keys: Bay identifiers (1, 2, ..., m2-1, m2-2, etc.)
#     Values: Device names (sda, nvme0n1, etc.)
#   BAY_TO_PCI_PATH (global associative array)
#     Keys: Bay identifiers
#     Values: PCI path strings (for --show-pci option)
#
# Bays whose by-path symlink does not exist are recorded in BAY_TO_PCI_PATH
# only and stay absent from DRIVE_MAP.
#------------------------------------------------------------------------------
build_drive_map() {
    local host mapping path slot drive
    local mapped_count=0
    local empty_count=0

    host="$(hostname)"
    mapping="${SERVER_MAPPINGS[$host]}"

    # Declare global arrays directly
    declare -g -A DRIVE_MAP=()
    declare -g -A BAY_TO_PCI_PATH=()

    if [[ -z "$mapping" ]]; then
        log_warn "No drive mapping found for host '$host'. Run diagnose-drives.sh to create one."
        return
    fi

    # One "pci-path bay" record per line; blank or partial lines are skipped.
    while read -r path slot; do
        [[ -z "$path" || -z "$slot" ]] && continue

        BAY_TO_PCI_PATH[$slot]="$path"

        if [[ -L "${DISK_BY_PATH}/$path" ]]; then
            drive="$(readlink -f "${DISK_BY_PATH}/$path")"
            DRIVE_MAP[$slot]="${drive##*/}"     # keep the device name only
            ((mapped_count++))
        else
            log_info "Bay $slot: No device at PCI path $path"
            ((empty_count++))
        fi
    done <<< "$mapping"

    log_info "Mapped $mapped_count drives, $empty_count empty bays"
}

#------------------------------------------------------------------------------
# build_ceph_cache
#
# Queries Ceph once and builds lookup tables for OSD information.
# This is much more efficient than querying ceph-volume per device.
#
# Sets global associative arrays:
#   CEPH_DEVICE_TO_OSD - Maps device names to OSD IDs (e.g., sda -> osd.5)
#   CEPH_OSD_STATUS    - Maps OSD numbers to up/down status
#   CEPH_OSD_IN        - Maps OSD numbers to in/out status
#
# The arrays are always (re)declared, so they exist but stay empty when the
# Ceph tools are missing or print nothing.
#------------------------------------------------------------------------------
build_ceph_cache() {
    local line dev_name osd_num status reweight
    local current_osd=""
    local osd_count=0

    declare -g -A CEPH_DEVICE_TO_OSD=()
    declare -g -A CEPH_OSD_STATUS=()
    declare -g -A CEPH_OSD_IN=()

    # Skip if ceph-volume is not available
    if ! command -v ceph-volume &>/dev/null; then
        log_info "ceph-volume not found, skipping Ceph OSD detection"
        return
    fi

    log_info "Querying Ceph OSD information..."

    # Parse ceph-volume lvm list output
    # Format: blocks starting with "====== osd.X =======" followed by device info
    while IFS= read -r line; do
        # Match OSD header: "====== osd.5 ======="
        if [[ "$line" =~ ======[[:space:]]+osd\.([0-9]+)[[:space:]]+======= ]]; then
            current_osd="osd.${BASH_REMATCH[1]}"
        # Match "devices" line which has the actual physical device: " devices /dev/sda"
        # This is more reliable than "block device" which may show LVM paths
        elif [[ -n "$current_osd" && "$line" =~ devices[[:space:]]+/dev/(sd[a-z]+|nvme[0-9]+n[0-9]+) ]]; then
            dev_name="${BASH_REMATCH[1]}"
            CEPH_DEVICE_TO_OSD["$dev_name"]="$current_osd"
            ((osd_count++))
            log_info "Found $current_osd on $dev_name"
            current_osd=""  # Reset to avoid duplicate matches
        fi
    done < <(ceph-volume lvm list 2>/dev/null)

    log_info "Cached $osd_count Ceph OSDs"

    # Skip if ceph command is not available
    if ! command -v ceph &>/dev/null; then
        log_info "ceph CLI not found, skipping OSD status detection"
        return
    fi

    log_info "Querying Ceph OSD status..."

    # Parse ceph osd tree for status
    # Format: ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT
    while IFS= read -r line; do
        # Match OSD lines: " 5 hdd 3.63660 osd.5 up 1.00000"
        if [[ "$line" =~ ^[[:space:]]*([0-9]+)[[:space:]]+.*osd\.([0-9]+)[[:space:]]+(up|down)[[:space:]]+([0-9.]+) ]]; then
            osd_num="${BASH_REMATCH[1]}"
            status="${BASH_REMATCH[3]}"
            reweight="${BASH_REMATCH[4]}"
            CEPH_OSD_STATUS[$osd_num]="$status"
            # Determine in/out based on reweight: the captured value consists
            # of digits and dots only, so it is greater than zero exactly
            # when it contains a non-zero digit.
            if [[ "$reweight" =~ [1-9] ]]; then
                CEPH_OSD_IN[$osd_num]="in"
            else
                CEPH_OSD_IN[$osd_num]="out"
            fi
        fi
    done < <(ceph osd tree 2>/dev/null)
}

# SMART warning thresholds
readonly SMART_TEMP_WARN=50               # Temperature warning threshold (°C)
readonly SMART_TEMP_CRIT=60               # Temperature critical threshold (°C)
readonly SMART_REALLOCATED_WARN=1         # Reallocated sectors warning threshold
readonly SMART_PENDING_WARN=1             # Pending sectors warning threshold
readonly SMART_CRC_ERROR_WARN=100         # UDMA CRC error warning threshold
readonly SMART_POWER_ON_HOURS_WARN=43800  # ~5 years of continuous use

#------------------------------------------------------------------------------
# _smart_attr_raw (private helper of parse_smart_data)
#
# Prints the leading integer of the RAW_VALUE column (10th field) of the first
# ATA attribute row whose name starts with the given prefix. Reading that
# column instead of "the last number on the line" keeps decorated raw values
# such as "35 (0 18 0 0 0)" or "12345h+10m" usable. Prints nothing if no row
# matches or the raw value does not start with a digit.
#
# Args:
#   $1 - Raw smartctl output
#   $2 - Attribute ID to require, or "" to match on the name only
#   $3 - Attribute name prefix (e.g., Reallocated_Sector)
#------------------------------------------------------------------------------
_smart_attr_raw() {
    awk -v id="$2" -v name="$3" '
        (id == "" || $1 == id) && index($2, name) == 1 {
            if (match($10, /^[0-9]+/)) {
                # "+ 0" drops leading zeros so the shell never sees octal.
                print substr($10, RSTART, RLENGTH) + 0
            }
            exit
        }' <<< "$1"
}

#------------------------------------------------------------------------------
# _smart_field (private helper of parse_smart_data)
#
# Prints the value of the first "Label: value" line matching the given regex,
# with surrounding whitespace removed and inner whitespace runs collapsed.
#
# Args:
#   $1 - Raw smartctl output
#   $2 - Extended regex matching the label, including its colon
#------------------------------------------------------------------------------
_smart_field() {
    awk -v pat="$2" '
        $0 ~ pat {
            sub(/^[^:]*:[ \t]*/, "")
            gsub(/[ \t]+/, " ")
            sub(/ $/, "")
            print
            exit
        }' <<< "$1"
}

#------------------------------------------------------------------------------
# parse_smart_data
#
# Parses raw SMART data and returns formatted info string.
#
# Args:
#   $1 - Device name (e.g., sda, nvme0n1)
#   $2 - Raw smartctl output string
#
# Returns: Pipe-delimited string: TYPE|TEMP|HEALTH|MODEL|SERIAL|WARNINGS
#   TYPE     - NVMe, SSD or HDD
#   TEMP     - "<n>°C", or "-" when no temperature was found
#   HEALTH   - ✓ passed, ⚠ passed but a threshold was exceeded, ✗ otherwise
#   MODEL    - model string, or "-"
#   SERIAL   - serial number, or "-"
#   WARNINGS - comma-separated tags (TEMP_CRIT, TEMP_WARN, REALLOC:n,
#              PENDING:n, CRC:n, HOURS:n); empty when there are none
#   Empty input yields "HDD|-|✗|-|-|".
#------------------------------------------------------------------------------
parse_smart_data() {
    local device="$1"
    local smart_info="$2"
    local temp="-"
    local type="HDD"
    local health="✗"
    local model="-"
    local serial="-"
    local warnings=""
    local rotation_rate reallocated pending crc_errors power_hours temp_display
    local warn_list=()

    if [[ -z "$smart_info" ]]; then
        echo "HDD|-|✗|-|-|"
        return
    fi

    # Temperature parsing - handles multiple formats:
    #   - ATA attribute row: "194 Temperature_Celsius ... 35 (0 18 0 0 0)"
    #   - "Temperature: 42 Celsius"
    #   - "Current Temperature: 35 Celsius"
    #   - "Current Drive Temperature: 35 C"
    temp="$(_smart_attr_raw "$smart_info" "" "Temperature_Celsius")"
    if [[ -z "$temp" ]]; then
        # First integer after the label's colon, wherever the label ends.
        temp="$(awk '
            /^(Current )?(Drive )?Temperature:/ {
                sub(/^[^:]*:/, "")
                if (match($0, /[0-9]+/)) {
                    print substr($0, RSTART, RLENGTH) + 0
                }
                exit
            }' <<< "$smart_info")"
    fi
    [[ "$temp" =~ ^[0-9]+$ ]] || temp="-"

    # Device type detection - handles SSD, HDD, and NVMe
    # Priority: 1) NVMe by name, 2) Rotation Rate field, 3) Model name hints, 4) Default HDD
    if [[ "$device" == nvme* ]]; then
        type="NVMe"
    elif rotation_rate="$(grep -m1 -E "Rotation Rate:" <<< "$smart_info")"; then
        # An rpm figure or an unrecognised value both mean "treat as HDD".
        if grep -qiE "solid state" <<< "$rotation_rate"; then
            type="SSD"
        else
            type="HDD"
        fi
    elif grep -qE "Device Model:.*SSD|Model Number:.*SSD" <<< "$smart_info"; then
        # Match SSD in the model name field
        type="SSD"
    else
        # Default to HDD for spinning rust
        type="HDD"
    fi

    # Health status (basic SMART check): either the "overall-health ... PASSED"
    # line or the "SMART Health Status: OK" line counts as healthy.
    if grep -q "SMART overall-health.*PASSED" <<< "$smart_info" \
        || grep -q "SMART Health Status.*OK" <<< "$smart_info"; then
        health="✓"
    fi

    # Model - try multiple field names
    model="$(_smart_field "$smart_info" '^(Device Model|Model Number|Product):')"
    [[ -z "$model" ]] && model="-"

    # Serial number - capture everything after the colon to handle spaces
    serial="$(_smart_field "$smart_info" '^Serial [Nn]umber:')"
    [[ -z "$serial" ]] && serial="-"

    # SMART threshold warnings - check for concerning values

    # Temperature thresholds
    if [[ "$temp" != "-" ]]; then
        if [[ "$temp" -ge "$SMART_TEMP_CRIT" ]]; then
            warn_list+=("TEMP_CRIT")
        elif [[ "$temp" -ge "$SMART_TEMP_WARN" ]]; then
            warn_list+=("TEMP_WARN")
        fi
    fi

    # Reallocated sectors (SMART attribute 5)
    reallocated="$(_smart_attr_raw "$smart_info" 5 "Reallocated_Sector")"
    if [[ -n "$reallocated" && "$reallocated" -ge "$SMART_REALLOCATED_WARN" ]]; then
        warn_list+=("REALLOC:$reallocated")
    fi

    # Current pending sectors (SMART attribute 197)
    pending="$(_smart_attr_raw "$smart_info" 197 "Current_Pending")"
    if [[ -n "$pending" && "$pending" -ge "$SMART_PENDING_WARN" ]]; then
        warn_list+=("PENDING:$pending")
    fi

    # UDMA CRC errors (SMART attribute 199)
    crc_errors="$(_smart_attr_raw "$smart_info" 199 "UDMA_CRC_Error")"
    if [[ -n "$crc_errors" && "$crc_errors" -ge "$SMART_CRC_ERROR_WARN" ]]; then
        warn_list+=("CRC:$crc_errors")
    fi

    # Power-on hours (SMART attribute 9)
    power_hours="$(_smart_attr_raw "$smart_info" 9 "Power_On_Hours")"
    if [[ -n "$power_hours" && "$power_hours" -ge "$SMART_POWER_ON_HOURS_WARN" ]]; then
        warn_list+=("HOURS:$power_hours")
    fi

    # Join warnings
    if [[ ${#warn_list[@]} -gt 0 ]]; then
        warnings="$(IFS=','; echo "${warn_list[*]}")"
        # Change health indicator to warning if SMART passed but has warnings
        if [[ "$health" == "✓" ]]; then
            health="⚠"
        fi
    fi

    # Format temperature with unit if we have a value
    if [[ "$temp" != "-" ]]; then
        temp_display="${temp}°C"
    else
        temp_display="-"
    fi

    echo "${type}|${temp_display}|${health}|${model}|${serial}|${warnings}"
}

#------------------------------------------------------------------------------
# get_drive_smart_info
#
# Retrieves SMART data for a
# given device (fetches and parses).
#
# Args:
#   $1 - Device name (e.g., sda, nvme0n1)
#
# Returns: Pipe-delimited string: TYPE|TEMP|HEALTH|MODEL|SERIAL|WARNINGS
#          (whatever parse_smart_data prints; smartctl errors are discarded,
#          so a failed query is parsed as empty input)
#------------------------------------------------------------------------------
get_drive_smart_info() {
    local device="$1"
    local smart_info

    smart_info="$(sudo smartctl -A -i -H "/dev/$device" 2>/dev/null)"
    parse_smart_data "$device" "$smart_info"
}

#------------------------------------------------------------------------------
# Main Display Logic
#------------------------------------------------------------------------------
HOSTNAME=$(hostname)
CHASSIS_TYPE=${CHASSIS_TYPES[$HOSTNAME]:-"unknown"}

# Display chassis layout
case "$CHASSIS_TYPE" in
    "10bay")
        generate_10bay_layout "$HOSTNAME"
        ;;
    "large1")
        generate_large1_layout "$HOSTNAME"
        ;;
    "micro")
        generate_micro_layout "$HOSTNAME"
        ;;
    *)
        # Closed box: the rule and the text rows share one width and long
        # hostnames are truncated rather than breaking the frame.
        printf -v unknown_rule '%*s' 57 ''
        unknown_rule="${unknown_rule// /─}"
        printf '┌%s┐\n' "$unknown_rule"
        printf '│ %-55.55s │\n' \
            "Unknown server: $HOSTNAME" \
            "No chassis mapping defined yet" \
            "Run diagnose-drives.sh to gather PCI path information"
        printf '└%s┘\n' "$unknown_rule"
        ;;
esac

#------------------------------------------------------------------------------
# Drive Details Section
#------------------------------------------------------------------------------

# Build Ceph OSD cache (single query instead of per-device)
if [[ "$SKIP_CEPH" != true ]]; then
    build_ceph_cache
fi

printf "\n"
colorize_header '=== Drive Details with SMART Status (by Bay Position) ==='

# Column titles and widths of the details table. The header format string and
# the rule underneath are both derived from these, so they cannot drift apart.
table_titles=("BAY" "DEVICE" "SIZE" "TYPE" "TEMP" "HEALTH" "MODEL" "SERIAL" "CEPH OSD" "STATUS" "USAGE" "WARNINGS")
table_widths=(5 15 10 8 8 8 30 20 12 10 10 30)
if [[ "$SHOW_PCI" == true ]]; then
    table_titles+=("PCI PATH")
    table_widths+=(40)
fi

table_header_fmt=""
table_rule_len=-1       # n columns are separated by n-1 single spaces
for table_col_width in "${table_widths[@]}"; do
    table_header_fmt+="${table_header_fmt:+ }%-${table_col_width}s"
    table_rule_len=$(( table_rule_len + table_col_width + 1 ))
done

# shellcheck disable=SC2059 # format is assembled from the fixed widths above
printf "${table_header_fmt}\n" "${table_titles[@]}"
printf -v table_rule '%*s' "$table_rule_len" ''
printf '%s\n' "${table_rule// /-}"

# Build reverse map: device -> bay
declare -A DEVICE_TO_BAY=()
for bay in "${!DRIVE_MAP[@]}"; do
    device="${DRIVE_MAP[$bay]}"
    if [[ -n "$device" && "$device" != "EMPTY" ]]; then
        DEVICE_TO_BAY["$device"]="$bay"
    fi
done

# Sort drives by bay position (numeric bays first, then m2 slots)
# Combine numeric bays (sorted numerically) with m2 slots (sorted alphanumerically)
all_bays="$(
    printf '%s\n' "${!DRIVE_MAP[@]}" | grep -E '^[0-9]+$' | sort -n
    printf '%s\n' "${!DRIVE_MAP[@]}" | grep -E '^m2-' | sort
)"

# Cache lsblk data to reduce redundant calls
# Get device sizes (whole disk only)
declare -A LSBLK_SIZE=()
declare -A LSBLK_MOUNTS=()

log_info "Caching block device information..."
#------------------------------------------------------------------------------
# _parent_disk_name NAME
#
# Prints the whole-disk name for a kernel block device name by stripping a
# trailing partition suffix (sda1 -> sda, nvme0n1p1 -> nvme0n1).
# A name that matches neither pattern is printed unchanged.
#------------------------------------------------------------------------------
_parent_disk_name() {
    local name="$1"
    if [[ "$name" =~ ^(nvme[0-9]+n[0-9]+)p[0-9]+$ ]]; then
        printf '%s\n' "${BASH_REMATCH[1]}"
    elif [[ "$name" =~ ^([a-z]+)[0-9]+$ ]]; then
        printf '%s\n' "${BASH_REMATCH[1]}"
    else
        printf '%s\n' "$name"
    fi
}

#------------------------------------------------------------------------------
# _usage_label MOUNTS
#
# Turns a comma-separated list of mount points into the USAGE column value:
#   - "-"     when the list is empty
#   - "BOOT"  when any entry is exactly "/" (checked against the full list,
#             so /boot or /home entries and list position do not hide it)
#   - otherwise the first three mount points, comma-separated
#------------------------------------------------------------------------------
_usage_label() {
    local mounts="$1"
    local -a parts=()
    local shown="" i

    if [[ -z "$mounts" ]]; then
        printf '%s\n' "-"
        return 0
    fi

    # Wrap in commas so "/" is matched only as a whole list element.
    if [[ ",${mounts}," == *",/,"* ]]; then
        printf '%s\n' "BOOT"
        return 0
    fi

    IFS=',' read -r -a parts <<< "$mounts"
    for (( i = 0; i < ${#parts[@]} && i < 3; i++ )); do
        shown+="${shown:+,}${parts[i]}"
    done
    printf '%s\n' "$shown"
}

#------------------------------------------------------------------------------
# _smart_id_field TEXT LABEL
#
# Prints the value of the first "LABEL: value" line found in TEXT (smartctl -i
# output), with surrounding whitespace removed and inner runs of blanks
# collapsed. Prints nothing when LABEL is absent.
# awk is used instead of xargs for trimming because xargs aborts on values
# that contain an unmatched quote character.
#------------------------------------------------------------------------------
_smart_id_field() {
    local text="$1" label="$2"
    awk -F: -v label="$label" '
        index($0, label) {
            value = $2
            gsub(/^[ \t]+|[ \t]+$/, "", value)
            gsub(/[ \t]+/, " ", value)
            print value
            exit
        }' <<< "$text"
}

# Get sizes for whole disks only
while read -r name size; do
    [[ -z "$name" ]] && continue
    LSBLK_SIZE["$name"]="$size"
done < <(lsblk -dn -o NAME,SIZE 2>/dev/null)

# Get mount points (including partitions) and map back to parent device
while read -r name mounts; do
    [[ -z "$name" || -z "$mounts" ]] && continue
    parent="$(_parent_disk_name "$name")"
    if [[ -n "${LSBLK_MOUNTS[$parent]:-}" ]]; then
        LSBLK_MOUNTS["$parent"]+=",${mounts}"
    else
        LSBLK_MOUNTS["$parent"]="$mounts"
    fi
done < <(lsblk -rn -o NAME,MOUNTPOINT 2>/dev/null | grep -v '^ ')

# Parallel SMART data collection for faster execution
# Collect raw smartctl output in background jobs, parse later
if [[ "$SKIP_SMART" != true ]]; then
    # Without a usable cache directory the redirect below would target
    # "/<device>.raw", so SMART collection is skipped if mktemp fails.
    if SMART_CACHE_DIR="$(mktemp -d)" && [[ -d "$SMART_CACHE_DIR" ]]; then
        log_info "Collecting SMART data in parallel..."
        # $all_bays is a newline-separated list; word splitting is intended.
        for bay in $all_bays; do
            device="${DRIVE_MAP[$bay]}"
            if [[ -n "$device" && "$device" != "EMPTY" && -b "/dev/$device" ]]; then
                # Launch background job to collect raw smartctl data
                (sudo smartctl -A -i -H "/dev/$device" > "$SMART_CACHE_DIR/${device}.raw" 2>/dev/null) &
            fi
        done
        # Wait for all background SMART queries to complete
        wait
        log_info "SMART data collection complete"
    else
        SMART_CACHE_DIR=""
        echo "Warning: could not create temporary directory; SMART data will not be shown." >&2
    fi
fi

# Emit one table row per populated bay whose device node exists.
for bay in $all_bays; do
    device="${DRIVE_MAP[$bay]}"
    if [[ -n "$device" && "$device" != "EMPTY" && -b "/dev/$device" ]]; then
        # Use cached lsblk data
        size="${LSBLK_SIZE[$device]:-}"

        # SMART columns default to placeholders; they are replaced only when
        # cached smartctl output exists for this device.
        type="-"
        temp="-"
        health="-"
        model="-"
        serial="-"
        warnings=""
        if [[ "$SKIP_SMART" != true && -n "${SMART_CACHE_DIR:-}" ]]; then
            raw_smart=""
            if [[ -f "$SMART_CACHE_DIR/${device}.raw" ]]; then
                raw_smart="$(cat "$SMART_CACHE_DIR/${device}.raw")"
            fi
            if [[ -n "$raw_smart" ]]; then
                # parse_smart_data output is read as '|'-separated fields.
                smart_info="$(parse_smart_data "$device" "$raw_smart")"
                IFS='|' read -r type temp health model serial warnings <<< "$smart_info"
            fi
        fi

        # Check for Ceph OSD using cached data
        osd_id="-"
        ceph_status="-"
        if [[ "$SKIP_CEPH" != true && -n "${CEPH_DEVICE_TO_OSD[$device]:-}" ]]; then
            osd_id="${CEPH_DEVICE_TO_OSD[$device]}"
            # Get status from cached OSD tree data
            osd_num="${osd_id#osd.}"
            up_status="${CEPH_OSD_STATUS[$osd_num]:-unknown}"
            in_status="${CEPH_OSD_IN[$osd_num]:-out}"
            ceph_status="${up_status}/${in_status}"
        fi

        # Check mount points using cached lsblk data
        # This includes both whole-device mounts and partition mounts
        usage="$(_usage_label "${LSBLK_MOUNTS[$device]:-}")"

        # Apply colors if enabled
        colored_temp="$(colorize_temp "$temp")"
        colored_health="$(colorize_health "$health")"

        # Colorize warnings if present
        colored_warnings="${warnings:--}"
        if [[ "$USE_COLOR" == true && -n "$warnings" ]]; then
            colored_warnings="${COLOR_YELLOW}${warnings}${COLOR_RESET}"
        fi

        if [[ "$SHOW_PCI" == true ]]; then
            pci_path="${BAY_TO_PCI_PATH[$bay]:-}"
            printf "%-5s %-15s %-10s %-8s %-8b %-8b %-30s %-20s %-12s %-10s %-10s %-30b %-40s\n" "$bay" "/dev/$device" "$size" "$type" "$colored_temp" "$colored_health" "$model" "$serial" "$osd_id" "$ceph_status" "$usage" "$colored_warnings" "$pci_path"
        else
            printf "%-5s %-15s %-10s %-8s %-8b %-8b %-30s %-20s %-12s %-10s %-10s %-30b\n" "$bay" "/dev/$device" "$size" "$type" "$colored_temp" "$colored_health" "$model" "$serial" "$osd_id" "$ceph_status" "$usage" "$colored_warnings"
        fi
    fi
done

# Clean up SMART cache directory
if [[ -n "${SMART_CACHE_DIR:-}" && -d "$SMART_CACHE_DIR" ]]; then
    rm -rf -- "$SMART_CACHE_DIR"
fi

# NVMe drives (only show unmapped ones - mapped NVMe drives appear in main table)
# stderr is silenced on lsblk itself, the command that can actually fail.
nvme_devices="$(lsblk -d -n -o NAME,SIZE 2>/dev/null | grep "^nvme")"
if [[ -n "$nvme_devices" ]]; then
    # Filter out already-mapped NVMe devices
    unmapped_nvme=""
    while read -r name size; do
        if [[ -z "${DEVICE_TO_BAY[$name]:-}" ]]; then
            unmapped_nvme+="$name $size"$'\n'
        fi
    done <<< "$nvme_devices"

    if [[ -n "$unmapped_nvme" ]]; then
        printf "\n"
        colorize_header '=== Unmapped NVMe Drives ==='
        printf "%-15s %-10s %-10s %-40s %-25s\n" "DEVICE" "SIZE" "TYPE" "MODEL" "SERIAL"
        echo "------------------------------------------------------------------------------------------------------"
        # The pipe keeps the loop in a subshell, so its variables (device,
        # model, serial, ...) do not overwrite those of the main shell.
        echo "$unmapped_nvme" | while read -r name size; do
            [[ -z "$name" ]] && continue
            device="/dev/$name"
            # Get model and serial from smartctl for accuracy
            smart_info="$(sudo smartctl -i "$device" 2>/dev/null)"
            model="$(_smart_id_field "$smart_info" "Model Number")"
            serial="$(_smart_id_field "$smart_info" "Serial Number")"
            [[ -z "$model" ]] && model="-"
            [[ -z "$serial" ]] && serial="-"
            printf "%-15s %-10s %-10s %-40s %-25s\n" "$device" "$size" "NVMe" "$model" "$serial"
        done
    fi
fi

#------------------------------------------------------------------------------
# Optional sections
#------------------------------------------------------------------------------

# Ceph RBD Devices
# Anchored to the start of the line so only the NAME column can match.
rbd_devices="$(lsblk -d -n -o NAME,SIZE,TYPE 2>/dev/null | grep "^rbd" | sort -V)"
if [[ -n "$rbd_devices" ]]; then
    printf "\n"
    colorize_header '=== Ceph RBD Devices ==='
    printf "%-15s %-10s %-10s %-30s\n" "DEVICE" "SIZE" "TYPE" "MOUNTPOINT"
    echo "------------------------------------------------------------"
    echo "$rbd_devices" | while read -r name size type; do
        # Get mountpoint if any
        mountpoint="$(lsblk -n -o MOUNTPOINT "/dev/$name" 2>/dev/null | head -1)"
        [[ -z "$mountpoint" ]] && mountpoint="-"
        printf "%-15s %-10s %-10s %-30s\n" "/dev/$name" "$size" "$type" "$mountpoint"
    done
fi

# Show mapping diagnostic info if DEBUG is set
if [[ -n "${DEBUG:-}" ]]; then
    printf "\n"
    colorize_header '=== DEBUG: Drive Mappings ==='
    # Sort on the bay key (second space-separated field) in version order;
    # a plain "sort -n" sees the leading "Bay" and puts bay 10 before bay 2.
    for key in "${!DRIVE_MAP[@]}"; do
        echo "Bay $key: ${DRIVE_MAP[$key]}"
    done | sort -t ' ' -k2,2V
fi