Files
driveAtlas/driveAtlas.sh
Jared Vititoe 4a98a6f6f8 Add storage-01 HBA bay 5 mapping (phy9)
Verified via ls -la /dev/disk/by-path/ and physical inspection
that HBA SAS3416 phy9 maps to bay 5 (C0 SATA breakout).
Remaining C0 bays 6-8 and C1 bays 9-10 still need drives to verify.

Ref #25

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-06 18:20:44 -05:00

1175 lines
47 KiB
Bash

#!/bin/bash
#==============================================================================
# Drive Atlas - Server Drive Mapping Tool
#
# Resolves the PCI paths recorded for each known server to the block devices
# currently attached, so physical drive bays can be matched to device names.
#==============================================================================

# Only pipefail is switched on: a pipeline reports the rightmost non-zero exit
# status of its stages. errexit (-e) stays off so the script can degrade
# gracefully when a tool fails, and nounset (-u) stays off because the script
# relies on ${var:-default} expansions.
set -o pipefail

# declare -g -A (global associative arrays) needs Bash 4.2 or newer.
if (( BASH_VERSINFO[0] * 100 + BASH_VERSINFO[1] < 402 )); then
  printf '%s\n' "ERROR: This script requires Bash 4.2 or higher (current: $BASH_VERSION)" >&2
  exit 1
fi

VERSION="1.1.0"
#------------------------------------------------------------------------------
# Cleanup Trap
# Ensures temporary directories are removed on exit or interruption
#------------------------------------------------------------------------------
# Remove the temporary SMART cache directory if one was created.
# Globals: SMART_CACHE_DIR (read) - path of the temporary directory, may be unset
cleanup() {
  if [[ -n "${SMART_CACHE_DIR:-}" && -d "$SMART_CACHE_DIR" ]]; then
    # "--" keeps a path that starts with "-" from being read as an option.
    rm -rf -- "$SMART_CACHE_DIR"
  fi
}
# The EXIT trap does the actual cleanup. INT and TERM must terminate the
# script explicitly: a handler that only runs cleanup would return control to
# the interrupted script, which would then carry on without its cache
# directory. Exiting from the handler fires the EXIT trap.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM
#------------------------------------------------------------------------------
# Path Constants
# Centralized path definitions to avoid hardcoding throughout the script
#------------------------------------------------------------------------------
# Directory whose entries are symlinks named after PCI paths; build_drive_map
# resolves them with readlink to find the device currently at each path.
readonly DISK_BY_PATH="/dev/disk/by-path"
#------------------------------------------------------------------------------
# show_usage
#
# Displays help message with usage information and available options.
#------------------------------------------------------------------------------
show_usage() {
  # Resolve the script name once; the help text repeats it several times.
  local prog
  prog="$(basename "$0")"

  # Unquoted delimiter: ${VERSION} and ${prog} are expanded inside the text.
  cat << EOF
Drive Atlas v${VERSION} - Server Drive Mapping Tool
Maps physical drive bays to logical device names using PCI paths.
Displays visual chassis layouts and comprehensive drive information.
USAGE:
${prog} [OPTIONS]
OPTIONS:
-h, --help Show this help message and exit
-v, --version Show version information
-d, --debug Enable debug output (show drive mappings)
-s, --skip-smart Skip SMART data collection (faster)
-c, --color Enable colored output
--verbose Show detailed error messages and warnings
--no-ceph Skip Ceph OSD information
--show-pci Show PCI paths in output
EXAMPLES:
${prog} # Normal run with all features
${prog} --skip-smart # Fast run without SMART data
${prog} --color # Run with colored output
${prog} --verbose # Show all errors and warnings
${prog} --debug # Show mapping debug info
ENVIRONMENT VARIABLES:
DEBUG=1 Same as --debug flag
For more information, see: https://code.lotusguild.org/LotusGuild/driveAtlas
EOF
}
#------------------------------------------------------------------------------
# Command Line Argument Parsing
#------------------------------------------------------------------------------
# Option defaults; each flag below flips exactly one of them.
SKIP_SMART=false
SKIP_CEPH=false
SHOW_PCI=false
USE_COLOR=false
VERBOSE=false

# Consume the arguments one at a time. Branches that do not exit fall through
# to the single shift at the bottom of the loop. DEBUG is only assigned when
# the flag is given.
while (( $# > 0 )); do
  case "$1" in
    -h|--help)
      show_usage
      exit 0
      ;;
    -v|--version)
      echo "Drive Atlas v${VERSION}"
      exit 0
      ;;
    -d|--debug)      DEBUG=1 ;;
    -s|--skip-smart) SKIP_SMART=true ;;
    --no-ceph)       SKIP_CEPH=true ;;
    --show-pci)      SHOW_PCI=true ;;
    -c|--color)      USE_COLOR=true ;;
    --verbose)       VERBOSE=true ;;
    *)
      echo "Unknown option: $1" >&2
      echo "Use --help for usage information." >&2
      exit 1
      ;;
  esac
  shift
done
#------------------------------------------------------------------------------
# Color Definitions
# ANSI escape codes for terminal colors
#------------------------------------------------------------------------------
# Start with every color empty, so the variables are always defined, then fill
# in the ANSI escape sequences only when colored output was requested.
COLOR_RESET=''
COLOR_RED=''
COLOR_GREEN=''
COLOR_YELLOW=''
COLOR_BLUE=''
COLOR_CYAN=''
COLOR_BOLD=''
if [[ "$USE_COLOR" == true ]]; then
  # Stored with a literal backslash; consumers print them through printf '%b'.
  COLOR_RESET='\033[0m'
  COLOR_RED='\033[0;31m'
  COLOR_GREEN='\033[0;32m'
  COLOR_YELLOW='\033[0;33m'
  COLOR_BLUE='\033[0;34m'
  COLOR_CYAN='\033[0;36m'
  COLOR_BOLD='\033[1m'
fi
#------------------------------------------------------------------------------
# colorize_health
#
# Returns health indicator with appropriate color
# Args: $1 - health status (✓ or ✗)
#------------------------------------------------------------------------------
# Print a health indicator, colored when color output is enabled.
# Args:    $1 - health indicator ("✓", "⚠", or anything else for failure)
# Globals: USE_COLOR, COLOR_GREEN, COLOR_YELLOW, COLOR_RED, COLOR_RESET (read)
# Outputs: the indicator on stdout, without a trailing newline
colorize_health() {
  local health="$1"
  local color

  if [[ "$USE_COLOR" != true ]]; then
    printf '%s' "$health"
    return
  fi

  case "$health" in
    "✓") color="$COLOR_GREEN" ;;
    # parse_smart_data reports "passed, but with threshold warnings" as "⚠";
    # show it as a warning rather than in the failure color.
    "⚠") color="$COLOR_YELLOW" ;;
    *)   color="$COLOR_RED" ;;
  esac
  printf '%b%s%b' "$color" "$health" "$COLOR_RESET"
}
#------------------------------------------------------------------------------
# colorize_temp
#
# Returns temperature with color based on value
# Args: $1 - temperature string (e.g., "45°C")
#------------------------------------------------------------------------------
# Print a temperature string, colored by severity when color is enabled.
# Args:    $1 - temperature string such as "45°C", or "-" when unknown
# Globals: USE_COLOR, COLOR_RED, COLOR_YELLOW, COLOR_GREEN, COLOR_RESET (read)
#          SMART_TEMP_WARN, SMART_TEMP_CRIT (read; default to 50 and 60 when
#          unset, so coloring uses the same limits as the SMART warnings)
# Outputs: the string on stdout. The uncolored early-return path ends with a
#          newline (echo); the other paths do not.
colorize_temp() {
  local temp_str="$1"
  local temp_val color
  local -r warn_at="${SMART_TEMP_WARN:-50}" crit_at="${SMART_TEMP_CRIT:-60}"

  if [[ "$USE_COLOR" != true || "$temp_str" == "-" ]]; then
    echo "$temp_str"
    return
  fi

  # Extract numeric value
  temp_val="${temp_str%°C}"
  if [[ ! "$temp_val" =~ ^[0-9]+$ ]]; then
    printf '%s' "$temp_str"
    return
  fi

  # 10# forces base 10: a zero-padded value such as "08" would otherwise be
  # read as invalid octal and produce an arithmetic error.
  if (( 10#$temp_val >= crit_at )); then
    color="$COLOR_RED"
  elif (( 10#$temp_val >= warn_at )); then
    color="$COLOR_YELLOW"
  else
    color="$COLOR_GREEN"
  fi
  printf '%b%s%b' "$color" "$temp_str" "$COLOR_RESET"
}
#------------------------------------------------------------------------------
# colorize_header
#
# Returns header text in blue/bold
# Args: $1 - header text
#------------------------------------------------------------------------------
# Print a section header on its own line; blue and bold when color is enabled.
# Args: $1 - header text
colorize_header() {
  local text="$1"

  # Plain output first, so the colored form is the only thing left below.
  if [[ "$USE_COLOR" != true ]]; then
    printf '%s\n' "$text"
    return
  fi
  printf '%b%b%s%b\n' "$COLOR_BLUE" "$COLOR_BOLD" "$text" "$COLOR_RESET"
}
#------------------------------------------------------------------------------
# log_error
#
# Logs an error message to stderr. Always shown regardless of verbose mode.
# Args: $1 - error message
#------------------------------------------------------------------------------
# Write an error message to stderr; not gated on verbose mode.
# Args: $1 - error message
log_error() {
  case "$USE_COLOR" in
    true) printf '%bERROR:%b %s\n' "$COLOR_RED" "$COLOR_RESET" "$1" ;;
    *)    printf 'ERROR: %s\n' "$1" ;;
  esac >&2
}
#------------------------------------------------------------------------------
# log_warn
#
# Logs a warning message to stderr. Only shown in verbose mode.
# Args: $1 - warning message
#------------------------------------------------------------------------------
# Write a warning to stderr, but only when verbose mode is on.
# Args: $1 - warning message
log_warn() {
  # Silent (and successful) unless --verbose was given.
  [[ "$VERBOSE" == true ]] || return 0

  case "$USE_COLOR" in
    true) printf '%bWARN:%b %s\n' "$COLOR_YELLOW" "$COLOR_RESET" "$1" ;;
    *)    printf 'WARN: %s\n' "$1" ;;
  esac >&2
}
#------------------------------------------------------------------------------
# log_info
#
# Logs an informational message to stderr. Only shown in verbose mode.
# Args: $1 - info message
#------------------------------------------------------------------------------
# Write an informational message to stderr, but only when verbose mode is on.
# Args: $1 - info message
log_info() {
  # Silent (and successful) unless --verbose was given.
  [[ "$VERBOSE" == true ]] || return 0

  case "$USE_COLOR" in
    true) printf '%bINFO:%b %s\n' "$COLOR_CYAN" "$COLOR_RESET" "$1" ;;
    *)    printf 'INFO: %s\n' "$1" ;;
  esac >&2
}
#------------------------------------------------------------------------------
# Dependency Checks
# Verifies required commands are available before running
#------------------------------------------------------------------------------
# Required dependencies (script will not function without these)
# Commands the script cannot work without; check_dependencies exits if any
# of them is missing.
REQUIRED_DEPS=(
  lsblk
  lspci
  readlink
  hostname
)
# Commands that only add functionality; a missing one produces a note.
OPTIONAL_DEPS=(
  smartctl
  ceph
  ceph-volume
  bc
  nvme
)
# Bootstrap script suggested to the user when dependencies are missing.
FRESH_START_URL="http://10.10.10.63:3000/LotusGuild/freshStartScript/raw/branch/main/freshStart.sh"
#------------------------------------------------------------------------------
# check_dependencies
#
# Verifies required and optional commands are available.
# Exits with error if required dependencies are missing.
# Warns about missing optional dependencies.
#------------------------------------------------------------------------------
# Verify that the commands the script relies on are installed.
# Globals: REQUIRED_DEPS, OPTIONAL_DEPS, FRESH_START_URL, SKIP_SMART (read)
# Outputs: diagnostics on stderr
# Exits:   status 1 when any required command is missing
check_dependencies() {
  local cmd  # keep the loop variable out of the global namespace
  local missing_required=()
  local missing_optional=()

  # Check required dependencies
  for cmd in "${REQUIRED_DEPS[@]}"; do
    command -v "$cmd" &>/dev/null || missing_required+=("$cmd")
  done

  # Check optional dependencies
  for cmd in "${OPTIONAL_DEPS[@]}"; do
    command -v "$cmd" &>/dev/null || missing_optional+=("$cmd")
  done

  # Report missing required dependencies and exit
  if [[ ${#missing_required[@]} -gt 0 ]]; then
    echo "ERROR: Missing required dependencies: ${missing_required[*]}" >&2
    echo "" >&2
    echo "Please install the missing packages or run the fresh start script:" >&2
    echo " curl -s $FRESH_START_URL | bash" >&2
    echo "" >&2
    exit 1
  fi

  # Warn about missing optional dependencies
  if [[ ${#missing_optional[@]} -gt 0 ]]; then
    echo "Note: Some optional features unavailable. Missing: ${missing_optional[*]}" >&2
    echo " Install them or run: curl -s $FRESH_START_URL | bash" >&2
    echo "" >&2
  fi

  # Check for sudo access (needed for smartctl). Skipped with --skip-smart:
  # no SMART data is collected then, so neither the probe nor the note applies.
  if [[ "${SKIP_SMART:-false}" != true ]] \
    && command -v smartctl &>/dev/null \
    && ! sudo -n true 2>/dev/null; then
    echo "Note: SMART data requires sudo access. Run with sudo for full functionality." >&2
  fi
}
# Run dependency check at script start
check_dependencies
#------------------------------------------------------------------------------
# Chassis Layout Generator Functions
# These define the physical layout and display formatting for each chassis type
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
# generate_10bay_layout
#
# Generates ASCII art representation of a 10-bay hot-swap chassis (Sliger CX4712).
# Shows storage controllers, M.2 NVMe slot, and 10 front hot-swap bays.
#
# Args:
# $1 - Hostname to display in the layout header
#
# Side effects: Calls build_drive_map() to populate DRIVE_MAP
#------------------------------------------------------------------------------
# Draw the 10-bay hot-swap chassis (Sliger CX4712) as box art on stdout.
# Args:    $1 - hostname shown in the header line
# Globals: DRIVE_MAP (read after build_drive_map repopulates it)
# Calls:   build_drive_map, get_storage_controllers
generate_10bay_layout() {
  local hostname="$1"
  # Loop variables are local so they do not overwrite same-named globals.
  local ctrl bay

  build_drive_map

  # Main chassis section
  printf "┌────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐\n"
  printf "│ %-126s │\n" "$hostname - Sliger CX4712 (10x 3.5\" Hot-swap)"
  printf "│ │\n"

  # Show storage controllers
  printf "│ Storage Controllers: │\n"
  while IFS= read -r ctrl; do
    [[ -n "$ctrl" ]] && printf "│ %-126s│\n" "$ctrl"
  done < <(get_storage_controllers)
  printf "│ │\n"

  # M.2 NVMe slot if present
  if [[ -n "${DRIVE_MAP[m2-1]:-}" ]]; then
    printf "│ M.2 NVMe: %-10s │\n" "${DRIVE_MAP[m2-1]}"
    printf "│ │\n"
  fi

  printf "│ Front Hot-swap Bays: │\n"
  printf "│ │\n"

  # Bay top borders
  printf "│ "
  for bay in {1..10}; do
    printf "┌──────────┐ "
  done
  printf " │\n"

  # Bay contents: bays without a resolved device are shown as EMPTY
  printf "│ "
  for bay in {1..10}; do
    printf "│%-2d:%-7s│ " "$bay" "${DRIVE_MAP[$bay]:-EMPTY}"
  done
  printf " │\n"

  # Bay bottom borders
  printf "│ "
  for bay in {1..10}; do
    printf "└──────────┘ "
  done
  printf " │\n"
  printf "└────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘\n"
}
#------------------------------------------------------------------------------
# generate_micro_layout
#
# Generates ASCII art representation of a micro SBC (e.g., ZimaBoard).
# Shows storage controllers, onboard eMMC (if present), and 2 SATA ports.
#
# Args:
# $1 - Hostname to display in the layout header
#
# Side effects: Calls build_drive_map() to populate DRIVE_MAP
#------------------------------------------------------------------------------
generate_micro_layout() {
# Draws the micro SBC layout as box art on stdout.
# $1 is the hostname shown in the header line.
local hostname="$1"
# Repopulate the global DRIVE_MAP that the SATA port row below reads.
build_drive_map
# Check for eMMC storage: only /dev/mmcblk0 is probed
local emmc_device=""
if [[ -b /dev/mmcblk0 ]]; then
emmc_device="mmcblk0"
fi
printf "┌─────────────────────────────────────────────────────────────┐\n"
printf "│ %-57s │\n" "$hostname - Micro SBC"
printf "│ │\n"
printf "│ Storage Controllers: │\n"
# One row per controller line; blank lines from the helper are skipped.
while IFS= read -r ctrl; do
[[ -n "$ctrl" ]] && printf "│ %-57s│\n" "$ctrl"
done < <(get_storage_controllers)
printf "│ │\n"
# Show eMMC if present
if [[ -n "$emmc_device" ]]; then
# Size of the whole device only (-d), without a header (-n); xargs trims
# the surrounding whitespace.
local emmc_size=$(lsblk -d -n -o SIZE "/dev/$emmc_device" 2>/dev/null | xargs)
printf "│ ┌─────────────────────────────────────────────────────┐ │\n"
printf "│ │ Onboard eMMC: %-10s (%s) │ │\n" "$emmc_device" "$emmc_size"
printf "│ └─────────────────────────────────────────────────────┘ │\n"
printf "│ │\n"
fi
printf "│ SATA Ports (rear): │\n"
printf "│ ┌──────────────┐ ┌──────────────┐ │\n"
# Ports without a resolved device are shown as EMPTY.
printf "│ │ 1: %-9s │ │ 2: %-9s │ │\n" "${DRIVE_MAP[1]:-EMPTY}" "${DRIVE_MAP[2]:-EMPTY}"
printf "│ └──────────────┘ └──────────────┘ │\n"
printf "└─────────────────────────────────────────────────────────────┘\n"
}
#------------------------------------------------------------------------------
# generate_large1_layout
#
# Generates ASCII art representation of a large1 chassis (Rosewill RSV-L4500U).
# Shows storage controllers, 2 M.2 NVMe slots, and 15 front bays in 3x5 grid.
#
# Args:
# $1 - Hostname to display in the layout header
#
# Side effects: Calls build_drive_map() to populate DRIVE_MAP
#------------------------------------------------------------------------------
generate_large1_layout() {
# Draws the large1 chassis (Rosewill RSV-L4500U) as box art on stdout.
# $1 is the hostname shown in the header line.
local hostname="$1"
# Repopulate the global DRIVE_MAP that the bay grid below reads.
build_drive_map
# large1 has 3 stacks of 5 bays at front (15 total) + 2 M.2 slots
# Physical bay mapping TBD - current mapping is by controller order
printf "┌─────────────────────────────────────────────────────────────────────────┐\n"
printf "│ %-69s │\n" "$hostname - Rosewill RSV-L4500U (15x 3.5\" Bays)"
printf "│ │\n"
printf "│ Storage Controllers: │\n"
# One row per controller line; blank lines from the helper are skipped.
while IFS= read -r ctrl; do
[[ -n "$ctrl" ]] && printf "│ %-69s│\n" "$ctrl"
done < <(get_storage_controllers)
printf "│ │\n"
# Both M.2 slots are always printed; an unmapped slot shows EMPTY.
printf "│ M.2 NVMe: M1: %-10s M2: %-10s │\n" "${DRIVE_MAP[m2-1]:-EMPTY}" "${DRIVE_MAP[m2-2]:-EMPTY}"
printf "│ │\n"
printf "│ Front Bays (3 stacks x 5 rows): [Bay mapping TBD] │\n"
printf "│ Stack A Stack B Stack C │\n"
# Bays are numbered row by row: each printed row holds three consecutive
# bay numbers, one per stack. Two-digit bay numbers use a narrower name
# field (%-7s instead of %-8s) so the cells keep the same width.
printf "│ ┌──────────┐ ┌──────────┐ ┌──────────┐ │\n"
printf "│ │1:%-8s│ │2:%-8s│ │3:%-8s│ │\n" "${DRIVE_MAP[1]:-EMPTY}" "${DRIVE_MAP[2]:-EMPTY}" "${DRIVE_MAP[3]:-EMPTY}"
printf "│ ├──────────┤ ├──────────┤ ├──────────┤ │\n"
printf "│ │4:%-8s│ │5:%-8s│ │6:%-8s│ │\n" "${DRIVE_MAP[4]:-EMPTY}" "${DRIVE_MAP[5]:-EMPTY}" "${DRIVE_MAP[6]:-EMPTY}"
printf "│ ├──────────┤ ├──────────┤ ├──────────┤ │\n"
printf "│ │7:%-8s│ │8:%-8s│ │9:%-8s│ │\n" "${DRIVE_MAP[7]:-EMPTY}" "${DRIVE_MAP[8]:-EMPTY}" "${DRIVE_MAP[9]:-EMPTY}"
printf "│ ├──────────┤ ├──────────┤ ├──────────┤ │\n"
printf "│ │10:%-7s│ │11:%-7s│ │12:%-7s│ │\n" "${DRIVE_MAP[10]:-EMPTY}" "${DRIVE_MAP[11]:-EMPTY}" "${DRIVE_MAP[12]:-EMPTY}"
printf "│ ├──────────┤ ├──────────┤ ├──────────┤ │\n"
printf "│ │13:%-7s│ │14:%-7s│ │15:%-7s│ │\n" "${DRIVE_MAP[13]:-EMPTY}" "${DRIVE_MAP[14]:-EMPTY}" "${DRIVE_MAP[15]:-EMPTY}"
printf "│ └──────────┘ └──────────┘ └──────────┘ │\n"
printf "└─────────────────────────────────────────────────────────────────────────┘\n"
}
#------------------------------------------------------------------------------
# Server-Specific Drive Mappings
# Maps PCI paths to physical bay numbers for each server
# Format: "pci-path bay-number"
#------------------------------------------------------------------------------
# Keyed by hostname. Each value is a newline-separated list of
# "pci-path bay" pairs; build_drive_map reads it one line at a time and skips
# lines where either field is empty.
declare -A SERVER_MAPPINGS=(
# compute-storage-01 (formerly medium2)
# Motherboard: B650D4U3-2Q/BCM with AMD SATA controller
# HBA: LSI SAS3008 at 01:00.0 (mini-SAS HD ports)
# Cable mapping from user notes:
# - Mobo SATA: top-right=bay1, bottom-right=bay2, bottom-left=bay3, top-left=bay4
# - HBA bottom mini-SAS: bays 5,6,7,8
# - HBA top mini-SAS: bays 9,10
["compute-storage-01"]="
pci-0000:0d:00.0-ata-2 1
pci-0000:0d:00.0-ata-1 2
pci-0000:0d:00.0-ata-3 3
pci-0000:0d:00.0-ata-4 4
pci-0000:01:00.0-sas-phy6-lun-0 5
pci-0000:01:00.0-sas-phy7-lun-0 6
pci-0000:01:00.0-sas-phy5-lun-0 7
pci-0000:01:00.0-sas-phy2-lun-0 8
pci-0000:01:00.0-sas-phy4-lun-0 9
pci-0000:01:00.0-sas-phy3-lun-0 10
pci-0000:0e:00.0-nvme-1 m2-1
"
# compute-storage-gpu-01
# Motherboard: ASUS PRIME B550-PLUS with AMD SATA controller at 02:00.1
# 5 SATA ports + 1 M.2 NVMe slot
# sdf is USB/card reader - not mapped
["compute-storage-gpu-01"]="
pci-0000:02:00.1-ata-1 1
pci-0000:02:00.1-ata-2 2
pci-0000:02:00.1-ata-3 3
pci-0000:02:00.1-ata-4 4
pci-0000:02:00.1-ata-5 5
pci-0000:0c:00.0-nvme-1 m2-1
"
# storage-01
# Motherboard: ASRock A320M-HDV R4.0
# AMD SATA controller at 02:00.1 (bays 1-4)
# Mobo SATA physical layout:
# top-left=bay 1, bottom-left=bay 2, top-right=bay 3, bottom-right=bay 4
# HBA: LSI SAS3416 at 01:00.0 (4x Mini-SAS HD ports, top=C0 to bottom=C3)
# C0 (top): 4x SATA breakout → bays 5-8
#   verified: phy9=bay5 (remaining phy8/10/11 → bays 6-8 TBD)
# C1: 4x SATA breakout → bays 9-10 (2 of 4 ports used)
#   PHY-to-bay mapping TBD
# C2: U.2 NVMe (serial ends in 0d66) → u2-1 (needs FW update)
# C3: U.2 NVMe (serial ends in 0d4f) → u2-2 (needs FW update)
# Only the verified paths are listed below; the TBD bays and the U.2 drives
# have no entry yet.
# Also present: 09:00.0 AMD FCH SATA Controller [AHCI mode]
["storage-01"]="
pci-0000:02:00.1-ata-1 1
pci-0000:02:00.1-ata-2 2
pci-0000:02:00.1-ata-5 3
pci-0000:02:00.1-ata-6 4
pci-0000:01:00.0-sas-phy9-lun-0 5
"
# large1
# Custom tower with multiple controllers:
# - HBA: LSI SAS2008 at 10:00.0 (7 drives)
# - AMD SATA at 16:00.1 (3 drives)
# - ASMedia SATA at 25:00.0 (2 drives)
# - 2x NVMe slots
["large1"]="
pci-0000:10:00.0-sas-phy0-lun-0 1
pci-0000:10:00.0-sas-phy1-lun-0 2
pci-0000:10:00.0-sas-phy3-lun-0 3
pci-0000:10:00.0-sas-phy4-lun-0 4
pci-0000:10:00.0-sas-phy5-lun-0 5
pci-0000:10:00.0-sas-phy6-lun-0 6
pci-0000:10:00.0-sas-phy7-lun-0 7
pci-0000:16:00.1-ata-3 8
pci-0000:16:00.1-ata-7 9
pci-0000:16:00.1-ata-8 10
pci-0000:25:00.0-ata-1 11
pci-0000:25:00.0-ata-2 12
pci-0000:2a:00.0-nvme-1 m2-1
pci-0000:26:00.0-nvme-1 m2-2
"
# micro1
# ZimaBoard 832 - Single board computer
# 2 SATA ports on rear (currently unused)
# Boot from onboard eMMC (mmcblk0)
# SATA controller at 00:12.0
# No paths are listed yet, so the value holds only a newline.
["micro1"]="
"
# monitor-02
# ZimaBoard 832 - Single board computer
# 2 SATA ports on rear (currently unused)
# Boot from onboard eMMC (mmcblk0)
# SATA controller would be at a specific PCI address when drives connected
# No paths are listed yet, so the value holds only a newline.
["monitor-02"]="
"
)
# Hostname -> chassis layout name. The main display logic uses the value to
# choose which generate_*_layout function draws the host.
declare -A CHASSIS_TYPES=()
CHASSIS_TYPES["compute-storage-01"]="10bay"
CHASSIS_TYPES["compute-storage-gpu-01"]="10bay"
CHASSIS_TYPES["storage-01"]="10bay"
CHASSIS_TYPES["large1"]="large1"
# Both ZimaBoard 832 hosts share the micro layout.
CHASSIS_TYPES["micro1"]="micro"
CHASSIS_TYPES["monitor-02"]="micro"
#------------------------------------------------------------------------------
# Core Functions
#------------------------------------------------------------------------------
# Cache for lspci output (populated on first call)
# Filtered lspci output, filled in by get_storage_controllers when it is empty.
LSPCI_CACHE=""
#------------------------------------------------------------------------------
# get_storage_controllers
#
# Prints the storage controllers reported by lspci, one per line, in the form
# " PCI_ADDR: DESCRIPTION".
#
# LSPCI_CACHE is consulted first and only populated when it is empty.
#------------------------------------------------------------------------------
get_storage_controllers() {
  local entry pci_addr desc
  # Leading PCI address (hex digits, ':' and '.') followed by one space.
  local -r addr_prefix='^[0-9a-f:.]+ '

  if [[ -z "$LSPCI_CACHE" ]]; then
    LSPCI_CACHE="$(lspci 2>/dev/null | grep -iE "SAS|SATA|RAID|Mass storage|NVMe")"
  fi

  # read trims surrounding whitespace, so the first field starts at column 0.
  while read -r entry; do
    [[ -z "$entry" ]] && continue
    pci_addr="${entry%%[[:space:]]*}"
    # Drop the address prefix when present; otherwise keep the whole line.
    if [[ "$entry" =~ $addr_prefix ]]; then
      desc="${entry:${#BASH_REMATCH[0]}}"
    else
      desc="$entry"
    fi
    echo " $pci_addr: $desc"
  done <<< "$LSPCI_CACHE"
}
#------------------------------------------------------------------------------
# build_drive_map
#
# Builds a global associative array mapping physical bay numbers to device names.
# Uses PCI paths from SERVER_MAPPINGS to resolve current device assignments.
#
# Sets:
# DRIVE_MAP (global associative array)
# Keys: Bay identifiers (1, 2, ..., m2-1, m2-2, etc.)
# Values: Device names (sda, nvme0n1, etc.)
# BAY_TO_PCI_PATH (global associative array)
# Keys: Bay identifiers
# Values: PCI path strings (for --show-pci option)
#------------------------------------------------------------------------------
# Rebuild the bay -> device lookup tables for the current host.
# Globals: SERVER_MAPPINGS, DISK_BY_PATH (read)
#          DRIVE_MAP        (written) bay id -> device name, resolved bays only
#          BAY_TO_PCI_PATH  (written) bay id -> PCI path, every configured bay
# Returns: early, with both tables empty, when the host has no mapping
build_drive_map() {
  # Declared separately from the assignments so "local" cannot mask a failure,
  # and so the read-loop variables do not leak into the global namespace.
  local host mapping path slot resolved
  local mapped_count=0
  local empty_count=0

  host="$(hostname | tr -cd '[:alnum:]-_.')"
  mapping="${SERVER_MAPPINGS[$host]}"

  # Declare global arrays directly
  declare -g -A DRIVE_MAP=()
  declare -g -A BAY_TO_PCI_PATH=()

  if [[ -z "$mapping" ]]; then
    log_warn "No drive mapping found for host '$host'. Run diagnose-drives.sh to create one."
    return
  fi

  while read -r path slot; do
    # Blank lines and incomplete pairs carry no mapping.
    [[ -z "$path" || -z "$slot" ]] && continue
    BAY_TO_PCI_PATH[$slot]="$path"
    if [[ -L "${DISK_BY_PATH}/$path" ]]; then
      # Follow the symlink and keep only the final path component.
      resolved="$(readlink -f "${DISK_BY_PATH}/$path")"
      DRIVE_MAP[$slot]="${resolved##*/}"
      mapped_count=$((mapped_count + 1))
    else
      log_info "Bay $slot: No device at PCI path $path"
      empty_count=$((empty_count + 1))
    fi
  done <<< "$mapping"

  log_info "Mapped $mapped_count drives, $empty_count empty bays"
}
#------------------------------------------------------------------------------
# build_ceph_cache
#
# Queries Ceph once and builds lookup tables for OSD information.
# This is much more efficient than querying ceph-volume per device.
#
# Sets global associative arrays:
# CEPH_DEVICE_TO_OSD - Maps device names to OSD IDs (e.g., sda -> osd.5)
# CEPH_OSD_STATUS - Maps OSD numbers to up/down status
# CEPH_OSD_IN - Maps OSD numbers to in/out status
#------------------------------------------------------------------------------
# Query Ceph once and fill three global lookup tables:
#   CEPH_DEVICE_TO_OSD  device name -> "osd.N"
#   CEPH_OSD_STATUS     OSD number  -> up/down
#   CEPH_OSD_IN         OSD number  -> in/out (derived from the reweight value)
# Returns early when ceph-volume, and later ceph, is not installed; tables
# that were not reached stay empty.
build_ceph_cache() {
  declare -g -A CEPH_DEVICE_TO_OSD=()
  declare -g -A CEPH_OSD_STATUS=()
  declare -g -A CEPH_OSD_IN=()

  # "====== osd.5 =======": the number of trailing '=' varies with the OSD id.
  local -r osd_header_re='======[[:space:]]+osd\.([0-9]+)[[:space:]]+======'
  # " devices /dev/sda": the physical device, unlike "block device", which may
  # show an LVM path.
  local -r devices_re='devices[[:space:]]+/dev/(sd[a-z]+|nvme[0-9]+n[0-9]+)'
  # " 5 hdd 3.63660 osd.5 up 1.00000": id, name, status, reweight.
  local -r tree_row_re='^[[:space:]]*([0-9]+)[[:space:]]+.*osd\.([0-9]+)[[:space:]]+(up|down)[[:space:]]+([0-9.]+)'

  local row active_osd="" found=0
  local dev osd_id state weight

  # Skip if ceph-volume is not available
  if ! command -v ceph-volume &>/dev/null; then
    log_info "ceph-volume not found, skipping Ceph OSD detection"
    return
  fi

  log_info "Querying Ceph OSD information..."
  while IFS= read -r row; do
    if [[ "$row" =~ $osd_header_re ]]; then
      active_osd="osd.${BASH_REMATCH[1]}"
      continue
    fi
    # Only a "devices" line that follows an OSD header is of interest.
    [[ -n "$active_osd" && "$row" =~ $devices_re ]] || continue
    dev="${BASH_REMATCH[1]}"
    CEPH_DEVICE_TO_OSD["$dev"]="$active_osd"
    found=$((found + 1))
    log_info "Found $active_osd on $dev"
    active_osd=""  # one device per OSD block; ignore further matches
  done < <(ceph-volume lvm list 2>/dev/null)
  log_info "Cached $found Ceph OSDs"

  # Skip if ceph command is not available
  if ! command -v ceph &>/dev/null; then
    log_info "ceph CLI not found, skipping OSD status detection"
    return
  fi

  log_info "Querying Ceph OSD status..."
  # Columns: ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT
  while IFS= read -r row; do
    [[ "$row" =~ $tree_row_re ]] || continue
    osd_id="${BASH_REMATCH[1]}"
    state="${BASH_REMATCH[3]}"
    weight="${BASH_REMATCH[4]}"
    CEPH_OSD_STATUS[$osd_id]="$state"
    # A reweight above zero means the OSD is "in".
    if awk "BEGIN {exit !($weight > 0)}"; then
      CEPH_OSD_IN[$osd_id]="in"
    else
      CEPH_OSD_IN[$osd_id]="out"
    fi
  done < <(ceph osd tree 2>/dev/null)
}
# SMART warning thresholds
# SMART warning thresholds, compared with >= by parse_smart_data:
#   SMART_TEMP_WARN            temperature warning threshold (°C)
#   SMART_TEMP_CRIT            temperature critical threshold (°C)
#   SMART_REALLOCATED_WARN     reallocated sectors warning threshold
#   SMART_PENDING_WARN         pending sectors warning threshold
#   SMART_CRC_ERROR_WARN       UDMA CRC error warning threshold
#   SMART_POWER_ON_HOURS_WARN  ~5 years of continuous use
readonly \
  SMART_TEMP_WARN=50 \
  SMART_TEMP_CRIT=60 \
  SMART_REALLOCATED_WARN=1 \
  SMART_PENDING_WARN=1 \
  SMART_CRC_ERROR_WARN=100 \
  SMART_POWER_ON_HOURS_WARN=43800
#------------------------------------------------------------------------------
# parse_smart_data
#
# Parses raw SMART data and returns formatted info string.
#
# Args:
# $1 - Device name (e.g., sda, nvme0n1)
# $2 - Raw smartctl output string
#
# Returns: Pipe-delimited string: TYPE|TEMP|HEALTH|MODEL|SERIAL|WARNINGS
#------------------------------------------------------------------------------
# Print the value of the first "Label: value" line matching an ERE.
# Everything after the first colon is kept (values may contain colons);
# surrounding whitespace is trimmed and inner runs are squeezed to one space.
# Args: $1 - smartctl output, $2 - ERE selecting the line
_smart_field_value() {
  grep -m1 -E "$2" <<< "$1" \
    | sed -E 's/^[^:]*:[[:space:]]*//; s/[[:space:]]+$//; s/[[:space:]]+/ /g'
}

# Print the last column of the SMART attribute row with the given id and name.
# Args: $1 - smartctl output, $2 - attribute id, $3 - attribute name prefix
_smart_attr_raw() {
  grep -E "^\s*${2}\s+${3}" <<< "$1" | awk '{print $NF}'
}

# Succeed when $1 is an unsigned integer that is >= $2.
# 10# forces base 10 so zero-padded values are not parsed as octal.
_smart_at_least() {
  [[ "$1" =~ ^[0-9]+$ ]] && (( 10#$1 >= $2 ))
}

# Parse raw smartctl output into one summary line.
# Args:    $1 - device name (e.g. sda, nvme0n1)
#          $2 - raw smartctl output
# Globals: SMART_TEMP_WARN, SMART_TEMP_CRIT, SMART_REALLOCATED_WARN,
#          SMART_PENDING_WARN, SMART_CRC_ERROR_WARN,
#          SMART_POWER_ON_HOURS_WARN (read)
# Outputs: TYPE|TEMP|HEALTH|MODEL|SERIAL|WARNINGS on stdout. WARNINGS is a
#          comma-separated list and may be empty. HEALTH is "✓", "✗", or "⚠"
#          when the health check passed but a threshold was exceeded.
parse_smart_data() {
  local device="$1"
  local smart_info="$2"
  local temp="-" type="HDD" health="✗" model serial warnings=""
  local temp_line rotation_rate value
  local warn_list=()
  # awk program: print the first purely numeric field of a line.
  local -r first_number='{for(i=1;i<=NF;i++) if($i ~ /^[0-9]+$/) {print $i; exit}}'

  if [[ -z "$smart_info" ]]; then
    echo "HDD|-|✗|-|-|"
    return
  fi

  # Temperature - the first matching format wins:
  #   "194 Temperature_Celsius ... 26 (0 14 0 0 0)"  ATA attribute table
  #   "Current Drive Temperature: 35 C"              SAS
  #   "Temperature: 42 Celsius" / "Current Temperature: 35 Celsius"
  if temp_line="$(grep -m1 "Temperature_Celsius" <<< "$smart_info")"; then
    # Strip parenthetical data such as "(Min/Max 20/53)", then take the last
    # purely numeric field.
    temp="$(sed 's/([^)]*)//g' <<< "$temp_line" \
      | awk '{for(i=NF;i>0;i--) if($i ~ /^[0-9]+$/) {print $i; exit}}')"
  elif temp_line="$(grep -m1 -E "Current Drive Temperature:" <<< "$smart_info")"; then
    temp="$(awk "$first_number" <<< "$temp_line")"
  elif temp_line="$(grep -m1 -E "(Current )?Temperature:" <<< "$smart_info")"; then
    temp="$(awk "$first_number" <<< "$temp_line")"
  fi

  # Device type. Priority: NVMe by device name, then the Rotation Rate field,
  # then "SSD" in the model field, otherwise HDD.
  if [[ "$device" == nvme* ]]; then
    type="NVMe"
  elif rotation_rate="$(grep -m1 -E "Rotation Rate:" <<< "$smart_info")"; then
    # Anything other than "Solid State Device" (an rpm value, or an
    # unrecognized value) is treated as a spinning disk.
    if grep -qiE "solid state" <<< "$rotation_rate"; then
      type="SSD"
    else
      type="HDD"
    fi
  elif grep -qE "Device Model:.*SSD|Model Number:.*SSD" <<< "$smart_info"; then
    type="SSD"
  else
    type="HDD"
  fi

  # Health status (basic SMART check)
  if grep -q "SMART overall-health.*PASSED" <<< "$smart_info"; then
    health="✓"
  elif grep -q "SMART Health Status.*OK" <<< "$smart_info"; then
    # SCSI/SAS wording of the same check
    health="✓"
  fi

  # Model - try multiple field names
  model="$(_smart_field_value "$smart_info" '^(Device Model|Model Number|Product):')"
  [[ -z "$model" ]] && model="-"

  # Serial number - everything after the first colon
  serial="$(_smart_field_value "$smart_info" '^Serial [Nn]umber:')"
  [[ -z "$serial" ]] && serial="-"

  # Temperature thresholds
  if _smart_at_least "$temp" "$SMART_TEMP_CRIT"; then
    warn_list+=("TEMP_CRIT")
  elif _smart_at_least "$temp" "$SMART_TEMP_WARN"; then
    warn_list+=("TEMP_WARN")
  fi

  # Reallocated sectors (SMART attribute 5)
  value="$(_smart_attr_raw "$smart_info" 5 "Reallocated_Sector")"
  if _smart_at_least "$value" "$SMART_REALLOCATED_WARN"; then
    warn_list+=("REALLOC:$value")
  fi

  # Current pending sectors (SMART attribute 197)
  value="$(_smart_attr_raw "$smart_info" 197 "Current_Pending")"
  if _smart_at_least "$value" "$SMART_PENDING_WARN"; then
    warn_list+=("PENDING:$value")
  fi

  # UDMA CRC errors (SMART attribute 199)
  value="$(_smart_attr_raw "$smart_info" 199 "UDMA_CRC_Error")"
  if _smart_at_least "$value" "$SMART_CRC_ERROR_WARN"; then
    warn_list+=("CRC:$value")
  fi

  # Power-on hours (SMART attribute 9)
  value="$(_smart_attr_raw "$smart_info" 9 "Power_On_Hours")"
  if _smart_at_least "$value" "$SMART_POWER_ON_HOURS_WARN"; then
    warn_list+=("HOURS:$value")
  fi

  # Join warnings
  if [[ ${#warn_list[@]} -gt 0 ]]; then
    # The IFS change is confined to the command-substitution subshell.
    warnings="$(IFS=','; echo "${warn_list[*]}")"
    # A passing health check with warnings is downgraded to the warning sign.
    if [[ "$health" == "✓" ]]; then
      health="⚠"
    fi
  fi

  # Format temperature with unit if we have a value
  local temp_display="-"
  if [[ -n "$temp" && "$temp" != "-" ]]; then
    temp_display="${temp}°C"
  fi

  printf '%s|%s|%s|%s|%s|%s\n' \
    "$type" "$temp_display" "$health" "$model" "$serial" "$warnings"
}
#------------------------------------------------------------------------------
# get_drive_smart_info
#
# Retrieves SMART data for a given device (fetches and parses).
#
# Args:
# $1 - Device name (e.g., sda, nvme0n1)
#
# Returns: Pipe-delimited string: TYPE|TEMP|HEALTH|MODEL|SERIAL|WARNINGS
#------------------------------------------------------------------------------
# Run smartctl (through sudo) on one device and print the parsed summary.
# Args:    $1 - device name without the /dev/ prefix (e.g. sda, nvme0n1)
# Outputs: whatever parse_smart_data prints for that device
get_drive_smart_info() {
  # smartctl's stderr is discarded; on failure the parser receives whatever
  # was written to stdout, possibly nothing.
  parse_smart_data "$1" "$(sudo smartctl -A -i -H "/dev/$1" 2>/dev/null)"
}
#------------------------------------------------------------------------------
# Main Display Logic
#------------------------------------------------------------------------------
# Sanitized hostname: only alphanumerics, '-', '_' and '.' are kept.
HOSTNAME="$(hostname | tr -cd '[:alnum:]-_.')"
# Hosts without an entry in CHASSIS_TYPES fall back to "unknown".
CHASSIS_TYPE="${CHASSIS_TYPES[$HOSTNAME]:-unknown}"

# Display chassis layout: each known chassis type has a matching
# generate_<type>_layout function.
case "$CHASSIS_TYPE" in
  "10bay"|"large1"|"micro")
    "generate_${CHASSIS_TYPE}_layout" "$HOSTNAME"
    ;;
  *)
    printf '%s\n' \
      "┌─────────────────────────────────────────────────────────┐" \
      "│ Unknown server: $HOSTNAME" \
      "│ No chassis mapping defined yet" \
      "│ Run diagnose-drives.sh to gather PCI path information" \
      "└─────────────────────────────────────────────────────────┘"
    ;;
esac
#------------------------------------------------------------------------------
# Drive Details Section
#------------------------------------------------------------------------------
# Build Ceph OSD cache (single query instead of per-device)
# Fill the Ceph lookup tables with one query, unless --no-ceph was given.
[[ "$SKIP_CEPH" == true ]] || build_ceph_cache

printf "\n"
colorize_header '=== Drive Details with SMART Status (by Bay Position) ==='

# Column headings and ruler; --show-pci appends a PCI PATH column.
if [[ "$SHOW_PCI" != true ]]; then
  printf "%-5s %-15s %-10s %-8s %-8s %-8s %-30s %-20s %-12s %-10s %-10s %-30s\n" "BAY" "DEVICE" "SIZE" "TYPE" "TEMP" "HEALTH" "MODEL" "SERIAL" "CEPH OSD" "STATUS" "USAGE" "WARNINGS"
  printf '%s\n' "----------------------------------------------------------------------------------------------------------------------------------------------------------------------"
else
  printf "%-5s %-15s %-10s %-8s %-8s %-8s %-30s %-20s %-12s %-10s %-10s %-30s %-40s\n" "BAY" "DEVICE" "SIZE" "TYPE" "TEMP" "HEALTH" "MODEL" "SERIAL" "CEPH OSD" "STATUS" "USAGE" "WARNINGS" "PCI PATH"
  printf '%s\n' "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"
fi
# Build reverse map: device -> bay
# Print bay identifiers one per line: purely numeric bays first in numeric
# order, then every named bay (m2-1, u2-1, ...) in sort order. Named bays are
# selected by "not purely numeric" rather than by an m2- prefix, so other slot
# families are not dropped. Empty input produces no output.
# Args: bay identifiers
_sorted_bay_ids() {
  printf '%s\n' "$@" | grep -E '^[0-9]+$' | sort -n
  # '^[0-9]*$' also rejects the empty line printf emits when given no ids.
  printf '%s\n' "$@" | grep -vE '^[0-9]*$' | sort
}

# Build reverse map: device -> bay
declare -A DEVICE_TO_BAY
for bay in "${!DRIVE_MAP[@]}"; do
  device="${DRIVE_MAP[$bay]}"
  if [[ -n "$device" && "$device" != "EMPTY" ]]; then
    DEVICE_TO_BAY["$device"]="$bay"
  fi
done

# Bay positions in display order (numeric bays first, then named slots)
all_bays="$(_sorted_bay_ids "${!DRIVE_MAP[@]}")"
# Query lsblk once up front and keep the answers in two lookup tables:
#   LSBLK_SIZE[disk]   - size string of each whole disk
#   LSBLK_MOUNTS[disk] - comma-separated mount points of the disk itself and
#                        of all of its partitions
declare -A LSBLK_SIZE=()
declare -A LSBLK_MOUNTS=()
log_info "Caching block device information..."

# Whole disks only (-d): remember NAME -> SIZE.
while read -r name size; do
    if [[ -n "$name" ]]; then
        LSBLK_SIZE["$name"]="$size"
    fi
done < <(lsblk -dn -o NAME,SIZE 2>/dev/null)

# Partition names are folded back onto their disk:
#   nvme0n1p1 -> nvme0n1   (NVMe namespace + "p<N>")
#   sda1      -> sda       (letters + trailing digits)
# Any other name is used unchanged.
nvme_partition_re='^(nvme[0-9]+n[0-9]+)p[0-9]+$'
plain_partition_re='^([a-z]+)[0-9]+$'

while read -r name mounts; do
    # Rows without a name or without a mount point carry no information.
    if [[ -z "$name" || -z "$mounts" ]]; then
        continue
    fi

    parent="$name"
    if [[ "$name" =~ $nvme_partition_re || "$name" =~ $plain_partition_re ]]; then
        parent="${BASH_REMATCH[1]}"
    fi

    # Append to an existing list with a comma, or start a new list.
    LSBLK_MOUNTS["$parent"]="${LSBLK_MOUNTS[$parent]:+${LSBLK_MOUNTS[$parent]},}${mounts}"
done < <(lsblk -rn -o NAME,MOUNTPOINT 2>/dev/null | grep -v '^ ')
# Parallel SMART data collection for faster execution
# Raw smartctl output is written to one file per device
# ("$SMART_CACHE_DIR/<device>.raw") by background jobs and parsed later.
if [[ "$SKIP_SMART" != true ]]; then
    # Without a usable cache directory the redirections below would end up
    # at "/<device>.raw", so collection is skipped entirely in that case.
    if SMART_CACHE_DIR="$(mktemp -d)" && [[ -n "$SMART_CACHE_DIR" && -d "$SMART_CACHE_DIR" ]]; then
        log_info "Collecting SMART data in parallel..."
        max_parallel_jobs=10
        job_count=0

        # "wait -n" (reap exactly one job) exists from bash 4.3 on. Decide by
        # version rather than by its exit status: wait -n returns the exit
        # status of the reaped job, and smartctl exits non-zero for many
        # ordinary conditions, so a non-zero status does not mean that
        # wait -n is unsupported.
        have_wait_n=false
        if ((BASH_VERSINFO[0] > 4 || (BASH_VERSINFO[0] == 4 && BASH_VERSINFO[1] >= 3))); then
            have_wait_n=true
        fi

        for bay in $all_bays; do
            device="${DRIVE_MAP[$bay]}"
            if [[ -n "$device" && "$device" != "EMPTY" && -b "/dev/$device" ]]; then
                # Launch background job to collect raw smartctl data
                (sudo smartctl -A -i -H "/dev/$device" > "$SMART_CACHE_DIR/${device}.raw" 2>/dev/null) &
                ((job_count++))
                if ((job_count >= max_parallel_jobs)); then
                    if [[ "$have_wait_n" == true ]]; then
                        # Exactly one job finished; its exit status is
                        # deliberately ignored.
                        wait -n 2>/dev/null
                        ((job_count--))
                    else
                        # Plain wait drains every running job, so the
                        # counter restarts from zero.
                        wait
                        job_count=0
                    fi
                fi
            fi
        done

        # Wait for all remaining background SMART queries to complete
        wait
        log_info "SMART data collection complete"
    else
        SMART_CACHE_DIR=""
        echo "WARNING: could not create a temporary directory; SMART data will not be collected" >&2
    fi
fi
#------------------------------------------------------------------------------
# summarize_mount_usage
#
# Turns a comma-separated list of mount points into the text shown in the
# USAGE column of the drive table.
#
# Arguments:
#   $1 - comma-separated mount points of one drive (may be empty)
# Outputs (stdout):
#   "-"     when the list is empty
#   "BOOT"  when one entry is exactly "/" (the root filesystem), regardless
#           of its position in the list or of other mounts on the same drive
#   otherwise the first three entries, comma-separated
#------------------------------------------------------------------------------
summarize_mount_usage() {
    local mounts="$1"
    local -a entries=()

    if [[ -z "$mounts" ]]; then
        printf '%s\n' "-"
        return 0
    fi

    # Wrapping the list in commas lets "/" be matched as a whole entry only,
    # so "/boot" or "/home" alone never count as the root filesystem.
    if [[ ",${mounts}," == *",/,"* ]]; then
        printf '%s\n' "BOOT"
        return 0
    fi

    # Limit to first 3 mount points for display
    IFS=',' read -r -a entries <<< "$mounts"
    entries=("${entries[@]:0:3}")
    local IFS=','
    printf '%s\n' "${entries[*]}"
}

# One table row per bay that currently holds a present block device.
for bay in $all_bays; do
    device="${DRIVE_MAP[$bay]}"
    if [[ -z "$device" || "$device" == "EMPTY" || ! -b "/dev/$device" ]]; then
        continue
    fi

    # Use cached lsblk data
    size="${LSBLK_SIZE[$device]:-}"

    # SMART columns default to placeholders; they are replaced only when
    # SMART collection ran and produced output for this device.
    type="-"
    temp="-"
    health="-"
    model="-"
    serial="-"
    warnings=""
    if [[ "$SKIP_SMART" != true ]]; then
        # Read the cached raw smartctl output. An empty SMART_CACHE_DIR means
        # there is no cache, so no path is built from it.
        raw_smart=""
        if [[ -n "${SMART_CACHE_DIR:-}" && -f "$SMART_CACHE_DIR/${device}.raw" ]]; then
            raw_smart="$(cat "$SMART_CACHE_DIR/${device}.raw")"
        fi
        if [[ -n "$raw_smart" ]]; then
            # parse_smart_data output is split on '|' into six fields.
            smart_info="$(parse_smart_data "$device" "$raw_smart")"
            IFS='|' read -r type temp health model serial warnings <<< "$smart_info"
        fi
    fi

    # Check for Ceph OSD using cached data
    osd_id="-"
    ceph_status="-"
    if [[ "$SKIP_CEPH" != true ]]; then
        osd_id="${CEPH_DEVICE_TO_OSD[$device]:-}"
        if [[ -n "$osd_id" ]]; then
            # Get status from cached OSD tree data
            osd_num="${osd_id#osd.}"
            up_status="${CEPH_OSD_STATUS[$osd_num]:-unknown}"
            in_status="${CEPH_OSD_IN[$osd_num]:-out}"
            ceph_status="${up_status}/${in_status}"
        else
            osd_id="-"
        fi
    fi

    # Check mount points using cached lsblk data
    # This includes both whole-device mounts and partition mounts
    mount_points="${LSBLK_MOUNTS[$device]:-}"
    usage="$(summarize_mount_usage "$mount_points")"

    # Apply colors if enabled
    colored_temp="$(colorize_temp "$temp")"
    colored_health="$(colorize_health "$health")"

    # Colorize warnings if present; an empty warning list is shown as "-"
    colored_warnings="${warnings:--}"
    if [[ "$USE_COLOR" == true && -n "$warnings" ]]; then
        colored_warnings="${COLOR_YELLOW}${warnings}${COLOR_RESET}"
    fi

    # %b is used for the colored fields so their escape sequences are
    # interpreted by printf.
    if [[ "$SHOW_PCI" == true ]]; then
        pci_path="${BAY_TO_PCI_PATH[$bay]:-}"
        printf "%-5s %-15s %-10s %-8s %-8b %-8b %-30s %-20s %-12s %-10s %-10s %-30b %-40s\n" "$bay" "/dev/$device" "$size" "$type" "$colored_temp" "$colored_health" "$model" "$serial" "$osd_id" "$ceph_status" "$usage" "$colored_warnings" "$pci_path"
    else
        printf "%-5s %-15s %-10s %-8s %-8b %-8b %-30s %-20s %-12s %-10s %-10s %-30b\n" "$bay" "/dev/$device" "$size" "$type" "$colored_temp" "$colored_health" "$model" "$serial" "$osd_id" "$ceph_status" "$usage" "$colored_warnings"
    fi
done
# NVMe drives (only show unmapped ones - mapped NVMe drives appear in main table)
# lsblk's own stderr is silenced here (a redirect on grep would not do that).
nvme_devices="$(lsblk -d -n -o NAME,SIZE 2>/dev/null | grep "^nvme")"
if [[ -n "$nvme_devices" ]]; then
    # Filter out already-mapped NVMe devices
    unmapped_nvme=""
    while read -r name size; do
        [[ -z "$name" ]] && continue
        if [[ -z "${DEVICE_TO_BAY[$name]:-}" ]]; then
            unmapped_nvme+="$name $size"$'\n'
        fi
    done <<< "$nvme_devices"

    if [[ -n "$unmapped_nvme" ]]; then
        printf "\n"
        colorize_header '=== Unmapped NVMe Drives ==='
        # Render the header once, then derive a rule of exactly the same
        # width from it so the two cannot get out of sync.
        printf -v nvme_header "%-15s %-10s %-10s %-40s %-25s" "DEVICE" "SIZE" "TYPE" "MODEL" "SERIAL"
        printf '%s\n' "$nvme_header"
        printf '%s\n' "${nvme_header//?/-}"

        # The loop runs on the right-hand side of a pipe, i.e. in a subshell,
        # so the variables it assigns do not leak into the rest of the script.
        printf '%s' "$unmapped_nvme" | while read -r name size; do
            [[ -z "$name" ]] && continue
            device="/dev/$name"
            # Get model and serial from smartctl for accuracy
            smart_info="$(sudo smartctl -i "$device" 2>/dev/null)"
            model="$(echo "$smart_info" | grep "Model Number" | cut -d: -f2 | xargs)"
            serial="$(echo "$smart_info" | grep "Serial Number" | cut -d: -f2 | xargs)"
            [[ -z "$model" ]] && model="-"
            [[ -z "$serial" ]] && serial="-"
            printf "%-15s %-10s %-10s %-40s %-25s\n" "$device" "$size" "NVMe" "$model" "$serial"
        done
    fi
fi
#------------------------------------------------------------------------------
# Optional sections
#------------------------------------------------------------------------------
# Ceph RBD Devices
# The match is anchored to the start of the NAME column so that only devices
# named rbd* are listed; an unanchored "rbd" would also match drbd* devices.
rbd_devices="$(lsblk -d -n -o NAME,SIZE,TYPE 2>/dev/null | grep "^rbd" | sort -V)"
if [[ -n "$rbd_devices" ]]; then
    printf "\n"
    colorize_header '=== Ceph RBD Devices ==='
    # Render the header once, then derive a rule of exactly the same width
    # from it so the two cannot get out of sync.
    printf -v rbd_header "%-15s %-10s %-10s %-30s" "DEVICE" "SIZE" "TYPE" "MOUNTPOINT"
    printf '%s\n' "$rbd_header"
    printf '%s\n' "${rbd_header//?/-}"

    # The loop runs on the right-hand side of a pipe, i.e. in a subshell,
    # so the variables it assigns do not leak into the rest of the script.
    printf '%s\n' "$rbd_devices" | while read -r name size type; do
        [[ -z "$name" ]] && continue
        # Get mountpoint if any (first line of lsblk's output for the device)
        mountpoint="$(lsblk -n -o MOUNTPOINT "/dev/$name" 2>/dev/null | head -1)"
        [[ -z "$mountpoint" ]] && mountpoint="-"
        printf "%-15s %-10s %-10s %-30s\n" "/dev/$name" "$size" "$type" "$mountpoint"
    done
fi
#------------------------------------------------------------------------------
# print_debug_drive_mappings
#
# Prints one "Bay <key>: <device>" line per DRIVE_MAP entry.
# The keys themselves are sorted (version sort), which yields numeric bays in
# numeric order (1, 2, 10) followed by the "m2-" slots. Sorting the finished
# lines numerically does not work because every line starts with "Bay".
#
# Globals:
#   DRIVE_MAP (read)
# Outputs:
#   Mapping lines on stdout; nothing when DRIVE_MAP is empty
#------------------------------------------------------------------------------
print_debug_drive_mappings() {
    local key
    while IFS= read -r key; do
        # printf emits one empty line when DRIVE_MAP has no keys; skip it.
        [[ -z "$key" ]] && continue
        echo "Bay $key: ${DRIVE_MAP[$key]}"
    done < <(printf '%s\n' "${!DRIVE_MAP[@]}" | sort -V)
}

# Show mapping diagnostic info if DEBUG is set
if [[ -n "${DEBUG:-}" ]]; then
    printf "\n"
    colorize_header '=== DEBUG: Drive Mappings ==='
    print_debug_drive_mappings
fi