Files
driveAtlas/driveAtlas.sh
Jared Vititoe 05d7fa7e37 Implement parallel SMART data collection for faster execution
SMART queries are now run in parallel using background jobs:
1. First pass launches background jobs for all devices
2. Each job writes to a temp file in SMART_CACHE_DIR
3. Wait for all jobs to complete
4. Second pass reads cached data for display

This significantly reduces script runtime when multiple drives
are present, as SMART queries can take 1-2 seconds each.

Cache directory is automatically cleaned up after use.

Fixes: #15

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-05 11:40:20 -05:00

1016 lines
41 KiB
Bash

#!/bin/bash
#==============================================================================
# Drive Atlas - Server Drive Mapping Tool
# Maps physical drive bays to logical device names using PCI paths
#==============================================================================
# Script version, shown by --version and in the help text. Marked readonly so
# it cannot be reassigned by accident later in the script.
readonly VERSION="1.1.0"
#------------------------------------------------------------------------------
# show_usage
#
# Writes the help text to stdout: synopsis, supported options, examples and
# the DEBUG environment variable. The program name shown is basename of $0.
#------------------------------------------------------------------------------
show_usage() {
  local prog
  prog="$(basename "$0")"

  cat << EOF
Drive Atlas v${VERSION} - Server Drive Mapping Tool
Maps physical drive bays to logical device names using PCI paths.
Displays visual chassis layouts and comprehensive drive information.
USAGE:
${prog} [OPTIONS]
OPTIONS:
-h, --help Show this help message and exit
-v, --version Show version information
-d, --debug Enable debug output (show drive mappings)
-s, --skip-smart Skip SMART data collection (faster)
-c, --color Enable colored output
--verbose Show detailed error messages and warnings
--no-ceph Skip Ceph OSD information
--show-pci Show PCI paths in output
EXAMPLES:
${prog} # Normal run with all features
${prog} --skip-smart # Fast run without SMART data
${prog} --color # Run with colored output
${prog} --verbose # Show all errors and warnings
${prog} --debug # Show mapping debug info
ENVIRONMENT VARIABLES:
DEBUG=1 Same as --debug flag
For more information, see: https://code.lotusguild.org/LotusGuild/driveAtlas
EOF
}
#------------------------------------------------------------------------------
# Command Line Argument Parsing
#
# Every feature flag starts as "false" and is switched to "true" by its
# option. --help and --version print and exit 0; an unrecognised argument
# prints a hint to stderr and exits 1.
#------------------------------------------------------------------------------
SKIP_SMART=false
SKIP_CEPH=false
SHOW_PCI=false
USE_COLOR=false
VERBOSE=false

while (( $# > 0 )); do
  case "$1" in
    -h|--help)
      show_usage
      exit 0
      ;;
    -v|--version)
      echo "Drive Atlas v${VERSION}"
      exit 0
      ;;
    -d|--debug)      DEBUG=1 ;;
    -s|--skip-smart) SKIP_SMART=true ;;
    --no-ceph)       SKIP_CEPH=true ;;
    --show-pci)      SHOW_PCI=true ;;
    -c|--color)      USE_COLOR=true ;;
    --verbose)       VERBOSE=true ;;
    *)
      echo "Unknown option: $1" >&2
      echo "Use --help for usage information." >&2
      exit 1
      ;;
  esac
  # Every arm that reaches this point consumed exactly one argument.
  shift
done
#------------------------------------------------------------------------------
# Color Definitions
#
# ANSI escape sequences kept as literal backslash strings (they are expanded
# later by echo -e / printf %b). All of them are empty unless --color is set.
#------------------------------------------------------------------------------
COLOR_RESET=''
COLOR_RED=''
COLOR_GREEN=''
COLOR_YELLOW=''
COLOR_BLUE=''
COLOR_CYAN=''
COLOR_BOLD=''

if [[ "$USE_COLOR" == true ]]; then
  COLOR_RESET='\033[0m'
  COLOR_RED='\033[0;31m'
  COLOR_GREEN='\033[0;32m'
  COLOR_YELLOW='\033[0;33m'
  COLOR_BLUE='\033[0;34m'
  COLOR_CYAN='\033[0;36m'
  COLOR_BOLD='\033[1m'
fi
#------------------------------------------------------------------------------
# colorize_health
#
# Prints a health indicator, wrapped in a color when --color is active.
#
# Args:
#   $1 - health indicator: ✓ (passed), ⚠ (passed with warnings), ✗ (failed),
#        or "-" (no SMART data collected)
#
# Output (stdout): the indicator followed by a newline.
#   ✓ -> green, ⚠ -> yellow, "-" -> never colored, anything else -> red.
#------------------------------------------------------------------------------
colorize_health() {
  local health="$1"
  local color

  # Plain output when colors are off; the "-" placeholder is not a failure,
  # so it is never painted red.
  if [[ "$USE_COLOR" != true || "$health" == "-" ]]; then
    printf '%s\n' "$health"
    return
  fi

  case "$health" in
    "✓") color="$COLOR_GREEN" ;;
    "⚠") color="$COLOR_YELLOW" ;;
    *)   color="$COLOR_RED" ;;
  esac

  # %b expands the escape sequences stored in the COLOR_* variables while the
  # indicator itself is printed verbatim.
  printf '%b%s%b\n' "$color" "$health" "$COLOR_RESET"
}
#------------------------------------------------------------------------------
# colorize_temp
#
# Prints a temperature string, colored by severity when --color is active.
#
# Args:
#   $1 - temperature string such as "45°C", or "-" when unknown
#
# Globals (read):
#   USE_COLOR, COLOR_RED, COLOR_YELLOW, COLOR_GREEN, COLOR_RESET
#   SMART_TEMP_CRIT / SMART_TEMP_WARN - thresholds in °C; 60 / 50 are used
#   when they are unset, so the coloring agrees with the SMART warnings.
#
# Output (stdout): red at or above the critical threshold, yellow at or above
# the warning threshold, green below it. Strings that are not "<digits>°C"
# are printed unchanged.
#------------------------------------------------------------------------------
colorize_temp() {
  local temp_str="$1"
  local temp_val color
  local crit="${SMART_TEMP_CRIT:-60}"
  local warn="${SMART_TEMP_WARN:-50}"

  if [[ "$USE_COLOR" != true || "$temp_str" == "-" ]]; then
    printf '%s\n' "$temp_str"
    return
  fi

  # Strip the unit to get the numeric part
  temp_val="${temp_str%°C}"
  if [[ ! "$temp_val" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "$temp_str"
    return
  fi

  # 10# forces base 10 so a value like "08" is not parsed as octal
  if (( 10#$temp_val >= crit )); then
    color="$COLOR_RED"
  elif (( 10#$temp_val >= warn )); then
    color="$COLOR_YELLOW"
  else
    color="$COLOR_GREEN"
  fi
  printf '%b%s%b\n' "$color" "$temp_str" "$COLOR_RESET"
}
#------------------------------------------------------------------------------
# colorize_header
#
# Prints a section heading; bold blue when --color is active, plain otherwise.
# Args: $1 - heading text
#------------------------------------------------------------------------------
colorize_header() {
  if [[ "$USE_COLOR" != true ]]; then
    echo "$1"
    return
  fi
  echo -e "${COLOR_BLUE}${COLOR_BOLD}$1${COLOR_RESET}"
}
#------------------------------------------------------------------------------
# log_error
#
# Writes "ERROR: <message>" to stderr, independent of --verbose. The prefix
# is red when --color is active.
# Args: $1 - error message
#------------------------------------------------------------------------------
log_error() {
  if [[ "$USE_COLOR" != true ]]; then
    echo "ERROR: $1" >&2
    return
  fi
  echo -e "${COLOR_RED}ERROR:${COLOR_RESET} $1" >&2
}
#------------------------------------------------------------------------------
# log_warn
#
# Writes "WARN: <message>" to stderr, but only when --verbose is active.
# The prefix is yellow when --color is active.
# Args: $1 - warning message
#------------------------------------------------------------------------------
log_warn() {
  # Silent (and successful) unless verbose output was requested
  [[ "$VERBOSE" == true ]] || return 0

  if [[ "$USE_COLOR" != true ]]; then
    echo "WARN: $1" >&2
    return
  fi
  echo -e "${COLOR_YELLOW}WARN:${COLOR_RESET} $1" >&2
}
#------------------------------------------------------------------------------
# log_info
#
# Writes "INFO: <message>" to stderr, but only when --verbose is active.
# The prefix is cyan when --color is active.
# Args: $1 - info message
#------------------------------------------------------------------------------
log_info() {
  # Silent (and successful) unless verbose output was requested
  [[ "$VERBOSE" == true ]] || return 0

  if [[ "$USE_COLOR" != true ]]; then
    echo "INFO: $1" >&2
    return
  fi
  echo -e "${COLOR_CYAN}INFO:${COLOR_RESET} $1" >&2
}
#------------------------------------------------------------------------------
# Dependency Checks
# Command lists consulted by check_dependencies before anything else runs
#------------------------------------------------------------------------------
# Commands the script cannot work without
REQUIRED_DEPS=(
  lsblk
  lspci
  readlink
  hostname
)
# Commands that only add extra information when present
OPTIONAL_DEPS=(
  smartctl
  ceph
  ceph-volume
  bc
  nvme
)
# Bootstrap script suggested to the user when dependencies are missing
FRESH_START_URL="http://10.10.10.63:3000/LotusGuild/freshStartScript/raw/branch/main/freshStart.sh"
#------------------------------------------------------------------------------
# check_dependencies
#
# Verifies that the commands in REQUIRED_DEPS and OPTIONAL_DEPS are available.
#
# Globals (read): REQUIRED_DEPS, OPTIONAL_DEPS, FRESH_START_URL
#
# Behavior:
#   - Any missing required command: prints an error to stderr and exits 1.
#   - Missing optional commands: prints a note to stderr and continues.
#   - smartctl present but passwordless sudo unavailable: prints a note.
#
# NOTE(review): the hint printed to the user pipes a plain-http download into
# bash; consider serving the bootstrap script over https.
#------------------------------------------------------------------------------
check_dependencies() {
  # Loop variable is local so the function does not clobber a global "cmd"
  local cmd
  local -a missing_required=()
  local -a missing_optional=()

  for cmd in "${REQUIRED_DEPS[@]}"; do
    command -v "$cmd" &>/dev/null || missing_required+=("$cmd")
  done

  for cmd in "${OPTIONAL_DEPS[@]}"; do
    command -v "$cmd" &>/dev/null || missing_optional+=("$cmd")
  done

  # Required commands missing: nothing useful can be done, so stop here
  if (( ${#missing_required[@]} > 0 )); then
    echo "ERROR: Missing required dependencies: ${missing_required[*]}" >&2
    echo "" >&2
    echo "Please install the missing packages or run the fresh start script:" >&2
    echo " curl -s $FRESH_START_URL | bash" >&2
    echo "" >&2
    exit 1
  fi

  # Optional commands missing: inform the user and carry on
  if (( ${#missing_optional[@]} > 0 )); then
    echo "Note: Some optional features unavailable. Missing: ${missing_optional[*]}" >&2
    echo " Install them or run: curl -s $FRESH_START_URL | bash" >&2
    echo "" >&2
  fi

  # smartctl is invoked through sudo; "sudo -n" probes without prompting
  if command -v smartctl &>/dev/null && ! sudo -n true 2>/dev/null; then
    echo "Note: SMART data requires sudo access. Run with sudo for full functionality." >&2
  fi
}
# Run dependency check at script start
check_dependencies
#------------------------------------------------------------------------------
# Chassis Type Definitions
# These define the physical layout and display formatting for each chassis type
#------------------------------------------------------------------------------
# generate_10bay_layout
#
# Draws the Sliger CX4712 diagram: title, storage controllers, the optional
# M.2 slot (only when DRIVE_MAP[m2-1] is set) and ten front bays in one row.
# Unpopulated bays are shown as EMPTY.
#
# Args: $1 - hostname printed in the title line
# Calls: build_drive_map (refreshes DRIVE_MAP), get_storage_controllers
# Output: box-drawing diagram on stdout
generate_10bay_layout() {
  local hostname=$1
  # Loop variables are local so the function leaves no globals behind
  local bay ctrl

  build_drive_map

  # Main chassis section
  printf "┌────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐\n"
  printf "│ %-126s │\n" "$hostname - Sliger CX4712 (10x 3.5\" Hot-swap)"
  printf "│ │\n"

  # Show storage controllers
  printf "│ Storage Controllers: │\n"
  while IFS= read -r ctrl; do
    if [[ -n "$ctrl" ]]; then
      printf "│ %-126s│\n" "$ctrl"
    fi
  done < <(get_storage_controllers)
  printf "│ │\n"

  # M.2 NVMe slot if present
  if [[ -n "${DRIVE_MAP[m2-1]}" ]]; then
    printf "│ M.2 NVMe: %-10s │\n" "${DRIVE_MAP[m2-1]}"
    printf "│ │\n"
  fi

  printf "│ Front Hot-swap Bays: │\n"
  printf "│ │\n"

  # Bay top borders
  printf "│ "
  for bay in {1..10}; do
    printf "┌──────────┐ "
  done
  printf " │\n"

  # Bay contents: "<bay>:<device>" or EMPTY
  printf "│ "
  for bay in {1..10}; do
    printf "│%-2d:%-7s│ " "$bay" "${DRIVE_MAP[$bay]:-EMPTY}"
  done
  printf " │\n"

  # Bay bottom borders
  printf "│ "
  for bay in {1..10}; do
    printf "└──────────┘ "
  done
  printf " │\n"
  printf "└────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘\n"
}
# generate_micro_layout
#
# Draws the "Micro SBC" diagram: title, storage controllers, the onboard eMMC
# (only when /dev/mmcblk0 is a block device) and the two rear SATA ports.
# Unpopulated ports are shown as EMPTY.
#
# Args: $1 - hostname printed in the title line
# Calls: build_drive_map (refreshes DRIVE_MAP), get_storage_controllers
# Output: box-drawing diagram on stdout
generate_micro_layout() {
  local hostname=$1
  # Declared up front so nothing leaks and command status is not masked
  local ctrl emmc_size
  local emmc_device=""

  build_drive_map

  # Check for eMMC storage
  if [[ -b /dev/mmcblk0 ]]; then
    emmc_device="mmcblk0"
  fi

  printf "┌─────────────────────────────────────────────────────────────┐\n"
  printf "│ %-57s │\n" "$hostname - Micro SBC"
  printf "│ │\n"
  printf "│ Storage Controllers: │\n"
  while IFS= read -r ctrl; do
    if [[ -n "$ctrl" ]]; then
      printf "│ %-57s│\n" "$ctrl"
    fi
  done < <(get_storage_controllers)
  printf "│ │\n"

  # Show eMMC if present
  if [[ -n "$emmc_device" ]]; then
    # xargs trims the padding lsblk puts around the size
    emmc_size=$(lsblk -d -n -o SIZE "/dev/$emmc_device" 2>/dev/null | xargs)
    printf "│ ┌─────────────────────────────────────────────────────┐ │\n"
    printf "│ │ Onboard eMMC: %-10s (%s) │ │\n" "$emmc_device" "$emmc_size"
    printf "│ └─────────────────────────────────────────────────────┘ │\n"
    printf "│ │\n"
  fi

  printf "│ SATA Ports (rear): │\n"
  printf "│ ┌──────────────┐ ┌──────────────┐ │\n"
  printf "│ │ 1: %-9s │ │ 2: %-9s │ │\n" "${DRIVE_MAP[1]:-EMPTY}" "${DRIVE_MAP[2]:-EMPTY}"
  printf "│ └──────────────┘ └──────────────┘ │\n"
  printf "└─────────────────────────────────────────────────────────────┘\n"
}
# generate_large1_layout
#
# Draws the Rosewill RSV-L4500U diagram: title, storage controllers, two M.2
# slots and fifteen front bays laid out as three stacks of five rows.
# Unpopulated bays/slots are shown as EMPTY.
#
# Args: $1 - hostname printed in the title line
# Calls: build_drive_map (refreshes DRIVE_MAP), get_storage_controllers
# Output: box-drawing diagram on stdout
generate_large1_layout() {
  local hostname=$1
  # Loop variable is local so the function leaves no global behind
  local ctrl

  build_drive_map

  # large1 has 3 stacks of 5 bays at front (15 total) + 2 M.2 slots
  # Physical bay mapping TBD - current mapping is by controller order
  printf "┌─────────────────────────────────────────────────────────────────────────┐\n"
  printf "│ %-69s │\n" "$hostname - Rosewill RSV-L4500U (15x 3.5\" Bays)"
  printf "│ │\n"
  printf "│ Storage Controllers: │\n"
  while IFS= read -r ctrl; do
    if [[ -n "$ctrl" ]]; then
      printf "│ %-69s│\n" "$ctrl"
    fi
  done < <(get_storage_controllers)
  printf "│ │\n"
  printf "│ M.2 NVMe: M1: %-10s M2: %-10s │\n" "${DRIVE_MAP[m2-1]:-EMPTY}" "${DRIVE_MAP[m2-2]:-EMPTY}"
  printf "│ │\n"
  printf "│ Front Bays (3 stacks x 5 rows): [Bay mapping TBD] │\n"
  printf "│ Stack A Stack B Stack C │\n"
  printf "│ ┌──────────┐ ┌──────────┐ ┌──────────┐ │\n"
  printf "│ │1:%-8s│ │2:%-8s│ │3:%-8s│ │\n" "${DRIVE_MAP[1]:-EMPTY}" "${DRIVE_MAP[2]:-EMPTY}" "${DRIVE_MAP[3]:-EMPTY}"
  printf "│ ├──────────┤ ├──────────┤ ├──────────┤ │\n"
  printf "│ │4:%-8s│ │5:%-8s│ │6:%-8s│ │\n" "${DRIVE_MAP[4]:-EMPTY}" "${DRIVE_MAP[5]:-EMPTY}" "${DRIVE_MAP[6]:-EMPTY}"
  printf "│ ├──────────┤ ├──────────┤ ├──────────┤ │\n"
  printf "│ │7:%-8s│ │8:%-8s│ │9:%-8s│ │\n" "${DRIVE_MAP[7]:-EMPTY}" "${DRIVE_MAP[8]:-EMPTY}" "${DRIVE_MAP[9]:-EMPTY}"
  printf "│ ├──────────┤ ├──────────┤ ├──────────┤ │\n"
  printf "│ │10:%-7s│ │11:%-7s│ │12:%-7s│ │\n" "${DRIVE_MAP[10]:-EMPTY}" "${DRIVE_MAP[11]:-EMPTY}" "${DRIVE_MAP[12]:-EMPTY}"
  printf "│ ├──────────┤ ├──────────┤ ├──────────┤ │\n"
  printf "│ │13:%-7s│ │14:%-7s│ │15:%-7s│ │\n" "${DRIVE_MAP[13]:-EMPTY}" "${DRIVE_MAP[14]:-EMPTY}" "${DRIVE_MAP[15]:-EMPTY}"
  printf "│ └──────────┘ └──────────┘ └──────────┘ │\n"
  printf "└─────────────────────────────────────────────────────────────────────────┘\n"
}
#------------------------------------------------------------------------------
# Server-Specific Drive Mappings
#
# One entry per hostname. Each value is a newline-separated list of
# "pci-path bay-number" pairs, where pci-path is a name looked up under
# /dev/disk/by-path and bay-number is either a number or an m2-N slot id.
#------------------------------------------------------------------------------
declare -A SERVER_MAPPINGS=()

# compute-storage-01 (formerly medium2)
# Motherboard: B650D4U3-2Q/BCM with AMD SATA controller
# HBA: LSI SAS3008 at 01:00.0 (mini-SAS HD ports)
# Cable mapping from user notes:
# - Mobo SATA: top-right=bay1, bottom-right=bay2, bottom-left=bay3, top-left=bay4
# - HBA bottom mini-SAS: bays 5,6,7,8
# - HBA top mini-SAS: bays 9,10
SERVER_MAPPINGS["compute-storage-01"]="
pci-0000:0d:00.0-ata-2 1
pci-0000:0d:00.0-ata-1 2
pci-0000:0d:00.0-ata-3 3
pci-0000:0d:00.0-ata-4 4
pci-0000:01:00.0-sas-phy6-lun-0 5
pci-0000:01:00.0-sas-phy7-lun-0 6
pci-0000:01:00.0-sas-phy5-lun-0 7
pci-0000:01:00.0-sas-phy2-lun-0 8
pci-0000:01:00.0-sas-phy4-lun-0 9
pci-0000:01:00.0-sas-phy3-lun-0 10
pci-0000:0e:00.0-nvme-1 m2-1
"

# compute-storage-gpu-01
# Motherboard: ASUS PRIME B550-PLUS with AMD SATA controller at 02:00.1
# 5 SATA ports + 1 M.2 NVMe slot
# sdf is USB/card reader - not mapped
SERVER_MAPPINGS["compute-storage-gpu-01"]="
pci-0000:02:00.1-ata-1 1
pci-0000:02:00.1-ata-2 2
pci-0000:02:00.1-ata-3 3
pci-0000:02:00.1-ata-4 4
pci-0000:02:00.1-ata-5 5
pci-0000:0c:00.0-nvme-1 m2-1
"

# storage-01
# Motherboard: ASRock A320M-HDV R4.0 with AMD SATA controller at 02:00.1
# 4 SATA ports used (ata-1, ata-2, ata-5, ata-6) - ata-3/4 empty
SERVER_MAPPINGS["storage-01"]="
pci-0000:02:00.1-ata-1 1
pci-0000:02:00.1-ata-2 2
pci-0000:02:00.1-ata-5 3
pci-0000:02:00.1-ata-6 4
"

# large1
# Custom tower with multiple controllers:
# - HBA: LSI SAS2008 at 10:00.0 (7 drives)
# - AMD SATA at 16:00.1 (3 drives)
# - ASMedia SATA at 25:00.0 (2 drives)
# - 2x NVMe slots
SERVER_MAPPINGS["large1"]="
pci-0000:10:00.0-sas-phy0-lun-0 1
pci-0000:10:00.0-sas-phy1-lun-0 2
pci-0000:10:00.0-sas-phy3-lun-0 3
pci-0000:10:00.0-sas-phy4-lun-0 4
pci-0000:10:00.0-sas-phy5-lun-0 5
pci-0000:10:00.0-sas-phy6-lun-0 6
pci-0000:10:00.0-sas-phy7-lun-0 7
pci-0000:16:00.1-ata-3 8
pci-0000:16:00.1-ata-7 9
pci-0000:16:00.1-ata-8 10
pci-0000:25:00.0-ata-1 11
pci-0000:25:00.0-ata-2 12
pci-0000:2a:00.0-nvme-1 m2-1
pci-0000:26:00.0-nvme-1 m2-2
"

# micro1
# ZimaBoard 832 - Single board computer
# 2 SATA ports on rear (currently unused)
# Boot from onboard eMMC (mmcblk0)
# SATA controller at 00:12.0
SERVER_MAPPINGS["micro1"]="
"

# monitor-02
# ZimaBoard 832 - Single board computer
# 2 SATA ports on rear (currently unused)
# Boot from onboard eMMC (mmcblk0)
# SATA controller would be at a specific PCI address when drives connected
SERVER_MAPPINGS["monitor-02"]="
"
# Chassis layout used for each known hostname. The value selects which
# generate_*_layout function draws the diagram.
declare -A CHASSIS_TYPES=()
CHASSIS_TYPES["compute-storage-01"]="10bay"
CHASSIS_TYPES["compute-storage-gpu-01"]="10bay"
CHASSIS_TYPES["storage-01"]="10bay"
CHASSIS_TYPES["large1"]="large1"
CHASSIS_TYPES["micro1"]="micro"      # ZimaBoard 832
CHASSIS_TYPES["monitor-02"]="micro"  # ZimaBoard 832
#------------------------------------------------------------------------------
# Core Functions
#------------------------------------------------------------------------------
# get_storage_controllers
#
# Lists the storage controllers reported by lspci (lines mentioning SAS, SATA,
# RAID, Mass storage or NVMe, case-insensitive).
#
# Output (stdout): one line per controller, " <pci-address>: <description>".
# Prints nothing when lspci fails or nothing matches.
get_storage_controllers() {
  local line pci_addr desc
  # Leading PCI address (hex digits, ':' and '.') followed by one space
  local addr_re='^[0-9a-f:.]+ (.*)$'

  while read -r line; do
    # First whitespace-delimited word is the PCI address
    pci_addr="${line%%[[:space:]]*}"
    # Description is the line with the address stripped; parsed with shell
    # builtins so no awk/sed process is forked per controller
    if [[ "$line" =~ $addr_re ]]; then
      desc="${BASH_REMATCH[1]}"
    else
      desc="$line"
    fi
    printf '%s\n' " $pci_addr: $desc"
  done < <(lspci 2>/dev/null | grep -iE "SAS|SATA|RAID|Mass storage|NVMe")
}
#------------------------------------------------------------------------------
# build_drive_map
#
# Builds the bay -> device lookup for the current host from SERVER_MAPPINGS.
#
# Globals (read):
#   SERVER_MAPPINGS - per-host lists of "pci-path bay" pairs
#
# Sets (both are reset on every call):
#   DRIVE_MAP (global associative array)
#     Keys: Bay identifiers (1, 2, ..., m2-1, m2-2, etc.)
#     Values: Device names (sda, nvme0n1, etc.); only bays whose by-path
#             symlink exists get an entry
#   BAY_TO_PCI_PATH (global associative array)
#     Keys: Bay identifiers
#     Values: PCI path strings (for --show-pci option)
#
# When the host has no mapping, both arrays stay empty and a warning is logged.
#------------------------------------------------------------------------------
build_drive_map() {
  local host mapping
  # Loop variables are local so they do not leak into the caller's scope
  local path slot link target
  local mapped_count=0
  local empty_count=0

  host="$(hostname)"
  mapping="${SERVER_MAPPINGS[$host]}"

  # Declare global arrays directly
  declare -g -A DRIVE_MAP=()
  declare -g -A BAY_TO_PCI_PATH=()

  if [[ -z "$mapping" ]]; then
    log_warn "No drive mapping found for host '$host'. Run diagnose-drives.sh to create one."
    return
  fi

  while read -r path slot; do
    # Skip blank or incomplete lines
    if [[ -z "$path" || -z "$slot" ]]; then
      continue
    fi

    BAY_TO_PCI_PATH[$slot]="$path"
    link="/dev/disk/by-path/$path"

    if [[ -L "$link" ]]; then
      # Resolve the symlink and keep only the device name (e.g. sda)
      target="$(readlink -f "$link")"
      DRIVE_MAP[$slot]="${target##*/}"
      mapped_count=$((mapped_count + 1))
    else
      log_info "Bay $slot: No device at PCI path $path"
      empty_count=$((empty_count + 1))
    fi
  done <<< "$mapping"

  log_info "Mapped $mapped_count drives, $empty_count empty bays"
}
#------------------------------------------------------------------------------
# build_ceph_cache
#
# Queries Ceph once and builds lookup tables for OSD information, instead of
# querying per device.
#
# Sets global associative arrays (reset on every call):
#   CEPH_DEVICE_TO_OSD - device path below /dev -> OSD name (e.g. sda -> osd.5)
#   CEPH_OSD_STATUS    - OSD number -> up/down
#   CEPH_OSD_IN        - OSD number -> in/out (in when REWEIGHT > 0)
#
# Sources:
#   "ceph-volume lvm list": within each "====== osd.N =======" section both
#   the "block device" line and the "devices" line are recorded. For
#   LVM-backed OSDs "block device" is a logical-volume path, so the physical
#   disks are taken from the "devices" line (comma-separated).
#   "ceph osd tree": provides the up/down state and reweight of each OSD.
#
# Either step is skipped (with an info message) when its command is missing.
#------------------------------------------------------------------------------
build_ceph_cache() {
  declare -g -A CEPH_DEVICE_TO_OSD=()
  declare -g -A CEPH_OSD_STATUS=()
  declare -g -A CEPH_OSD_IN=()

  local line dev osd_num status reweight
  local current_osd=""
  local -a devs=()
  # "====== osd.5 ======="
  local osd_header_re='======[[:space:]]+osd\.([0-9]+)[[:space:]]+======='
  # "block device /dev/sda"
  local block_dev_re='block[[:space:]]device[[:space:]]+/dev/([^[:space:]]+)'
  # "devices /dev/sda" or "devices /dev/sda,/dev/sdb"
  local devices_re='^[[:space:]]*devices[[:space:]]+([^[:space:]].*)$'
  # " 5 hdd 3.63660 osd.5 up 1.00000"
  local tree_re='^[[:space:]]*([0-9]+)[[:space:]]+.*osd\.([0-9]+)[[:space:]]+(up|down)[[:space:]]+([0-9.]+)'

  # Skip if ceph-volume is not available
  if ! command -v ceph-volume &>/dev/null; then
    log_info "ceph-volume not found, skipping Ceph OSD detection"
    return
  fi

  log_info "Querying Ceph OSD information..."
  while IFS= read -r line; do
    if [[ "$line" =~ $osd_header_re ]]; then
      current_osd="osd.${BASH_REMATCH[1]}"
    elif [[ -n "$current_osd" && "$line" =~ $block_dev_re ]]; then
      CEPH_DEVICE_TO_OSD[${BASH_REMATCH[1]}]="$current_osd"
    elif [[ -n "$current_osd" && "$line" =~ $devices_re ]]; then
      # Physical device(s) backing this OSD
      IFS=', ' read -r -a devs <<< "${BASH_REMATCH[1]}"
      for dev in "${devs[@]}"; do
        if [[ "$dev" == /dev/* ]]; then
          CEPH_DEVICE_TO_OSD[${dev#/dev/}]="$current_osd"
        fi
      done
    fi
  done < <(ceph-volume lvm list 2>/dev/null)

  # Skip if ceph command is not available
  if ! command -v ceph &>/dev/null; then
    log_info "ceph CLI not found, skipping OSD status detection"
    return
  fi

  log_info "Querying Ceph OSD status..."
  # Format: ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT
  while IFS= read -r line; do
    if [[ "$line" =~ $tree_re ]]; then
      osd_num="${BASH_REMATCH[1]}"
      status="${BASH_REMATCH[3]}"
      reweight="${BASH_REMATCH[4]}"
      CEPH_OSD_STATUS[$osd_num]="$status"
      # Reweight is a decimal, so compare it in awk; the value is passed with
      # -v rather than being spliced into the awk program text.
      if awk -v r="$reweight" 'BEGIN { exit !(r > 0) }'; then
        CEPH_OSD_IN[$osd_num]="in"
      else
        CEPH_OSD_IN[$osd_num]="out"
      fi
    fi
  done < <(ceph osd tree 2>/dev/null)
}
# SMART warning thresholds (read-only). A value at or above its threshold
# adds a warning code to the drive's WARNINGS column.
declare -r SMART_TEMP_WARN=50                # Temperature warning threshold (°C)
declare -r SMART_TEMP_CRIT=60                # Temperature critical threshold (°C)
declare -r SMART_REALLOCATED_WARN=1          # Reallocated sectors warning threshold
declare -r SMART_PENDING_WARN=1              # Pending sectors warning threshold
declare -r SMART_CRC_ERROR_WARN=100          # UDMA CRC error warning threshold
declare -r SMART_POWER_ON_HOURS_WARN=43800   # ~5 years of continuous use
#------------------------------------------------------------------------------
# _smart_attr_raw
#
# Prints the RAW_VALUE column (field 10) of the first SMART attribute row
# whose name starts with $2, and only when that column is a plain integer.
# Args: $1 - smartctl output, $2 - attribute name prefix,
#       $3 - attribute ID (optional; any ID when omitted)
#------------------------------------------------------------------------------
_smart_attr_raw() {
  awk -v name="$2" -v id="${3:-}" '
    $1 ~ /^[0-9]+$/ && index($2, name) == 1 && (id == "" || $1 == id) {
      if ($10 ~ /^[0-9]+$/) print $10
      exit
    }' <<< "$1"
}
#------------------------------------------------------------------------------
# _smart_text_field
#
# Prints the value of the first "Label: value" line whose label matches the
# ERE in $2: everything after the first colon, whitespace-normalised, with
# any "|" replaced by a space so it cannot break the pipe-delimited record.
# Args: $1 - smartctl output, $2 - ERE matching the label (without colon)
#------------------------------------------------------------------------------
_smart_text_field() {
  awk -v label="$2" '
    $0 ~ ("^(" label "):") {
      sub(/^[^:]*:/, "")
      gsub(/\|/, " ")
      $1 = $1
      print
      exit
    }' <<< "$1"
}
#------------------------------------------------------------------------------
# _smart_text_temp
#
# Prints the temperature from the first "Temperature:", "Current Temperature:"
# or "Current Drive Temperature:" line whose value starts with an integer.
# Args: $1 - smartctl output
#------------------------------------------------------------------------------
_smart_text_temp() {
  awk '
    /^(Current (Drive )?)?Temperature:/ {
      sub(/^[^:]*:/, "")
      if ($1 ~ /^[0-9]+$/) { print $1; exit }
    }' <<< "$1"
}
#------------------------------------------------------------------------------
# _smart_at_least
#
# Succeeds when $1 and $2 are both decimal integers and $1 >= $2.
#------------------------------------------------------------------------------
_smart_at_least() {
  # 10# forces base 10 so leading zeros are not treated as octal
  [[ "$1" =~ ^[0-9]+$ && "$2" =~ ^[0-9]+$ ]] && (( 10#$1 >= 10#$2 ))
}
#------------------------------------------------------------------------------
# get_drive_smart_info
#
# Retrieves SMART data for a given device via "sudo smartctl -A -i -H".
#
# Args:
#   $1 - Device name (e.g., sda, nvme0n1)
#
# Globals (read): SMART_TEMP_WARN, SMART_TEMP_CRIT, SMART_REALLOCATED_WARN,
#   SMART_PENDING_WARN, SMART_CRC_ERROR_WARN, SMART_POWER_ON_HOURS_WARN
#
# Returns: Pipe-delimited string: TYPE|TEMP|HEALTH|MODEL|SERIAL|WARNINGS
#   TYPE: SSD, HDD, or NVMe
#   TEMP: Temperature as "<n>°C" (or "-" if unavailable)
#   HEALTH: ✓ for passed, ✗ for failed/unknown, ⚠ for passed with warnings
#   MODEL: Drive model string (or "-")
#   SERIAL: Drive serial number (or "-")
#   WARNINGS: Comma-separated warning codes (or empty)
# When smartctl produces no output at all, prints "HDD|-|✗|-|-|".
#------------------------------------------------------------------------------
get_drive_smart_info() {
  local device="$1"
  local smart_info smart_stderr smart_exit
  local temp rotation model serial
  local reallocated pending crc_errors power_hours
  local type="HDD"
  local health="✗"
  local warnings=""
  local temp_display="-"
  local -a warn_list=()

  # Capture stderr separately so the first line can be reported on failure.
  # If no temp file can be created, still run the query with stderr dropped.
  if smart_stderr="$(mktemp 2>/dev/null)"; then
    smart_info="$(sudo smartctl -A -i -H "/dev/$device" 2>"$smart_stderr")"
    smart_exit=$?
    if [[ $smart_exit -ne 0 && -s "$smart_stderr" ]]; then
      log_warn "SMART query failed for $device: $(head -1 "$smart_stderr")"
    fi
    rm -f -- "$smart_stderr"
  else
    smart_info="$(sudo smartctl -A -i -H "/dev/$device" 2>/dev/null)"
  fi

  if [[ -z "$smart_info" ]]; then
    log_info "No SMART data available for $device"
    echo "HDD|-|✗|-|-|"
    return
  fi

  # Temperature: prefer the raw value of the Temperature_Celsius attribute.
  # The raw column may be followed by extras such as "(Min/Max 20/45)" or
  # "(0 20 0 0 0)", which is why only field 10 is read. Otherwise fall back
  # to the "Temperature: NN ..." style lines.
  temp="$(_smart_attr_raw "$smart_info" "Temperature_Celsius")"
  if [[ -z "$temp" ]]; then
    temp="$(_smart_text_temp "$smart_info")"
  fi

  # Device type. The rotation value is compared as a whole: "0 rpm" must not
  # match inside "7200 rpm".
  rotation="$(_smart_text_field "$smart_info" "Rotation Rate")"
  if [[ "$device" == nvme* ]]; then
    type="NVMe"
  elif [[ -n "$rotation" ]]; then
    if [[ "${rotation,,}" == *"solid state"* || "${rotation,,}" =~ ^0[[:space:]]*rpm ]]; then
      type="SSD"
    else
      type="HDD"
    fi
  elif grep -qiE "SSD|Solid State" <<< "$smart_info"; then
    type="SSD"
  fi

  # Health status: "overall-health ... PASSED" or "SMART Health Status: OK"
  if grep -qE "SMART overall-health.*PASSED|SMART Health Status.*OK" <<< "$smart_info"; then
    health="✓"
  fi

  # Model - try multiple field names
  model="$(_smart_text_field "$smart_info" "Device Model|Model Number|Product")"
  [[ -z "$model" ]] && model="-"

  # Serial number - everything after the first colon
  serial="$(_smart_text_field "$smart_info" "Serial [Nn]umber")"
  [[ -z "$serial" ]] && serial="-"

  # Temperature thresholds
  if _smart_at_least "$temp" "$SMART_TEMP_CRIT"; then
    warn_list+=("TEMP_CRIT")
  elif _smart_at_least "$temp" "$SMART_TEMP_WARN"; then
    warn_list+=("TEMP_WARN")
  fi

  # Reallocated sectors (SMART attribute 5)
  reallocated="$(_smart_attr_raw "$smart_info" "Reallocated_Sector" 5)"
  if _smart_at_least "$reallocated" "$SMART_REALLOCATED_WARN"; then
    warn_list+=("REALLOC:$reallocated")
  fi

  # Current pending sectors (SMART attribute 197)
  pending="$(_smart_attr_raw "$smart_info" "Current_Pending" 197)"
  if _smart_at_least "$pending" "$SMART_PENDING_WARN"; then
    warn_list+=("PENDING:$pending")
  fi

  # UDMA CRC errors (SMART attribute 199)
  crc_errors="$(_smart_attr_raw "$smart_info" "UDMA_CRC_Error" 199)"
  if _smart_at_least "$crc_errors" "$SMART_CRC_ERROR_WARN"; then
    warn_list+=("CRC:$crc_errors")
  fi

  # Power-on hours (SMART attribute 9)
  power_hours="$(_smart_attr_raw "$smart_info" "Power_On_Hours" 9)"
  if _smart_at_least "$power_hours" "$SMART_POWER_ON_HOURS_WARN"; then
    warn_list+=("HOURS:$power_hours")
  fi

  # Join warnings; a passing drive with warnings is downgraded to ⚠
  if (( ${#warn_list[@]} > 0 )); then
    warnings="$(IFS=','; echo "${warn_list[*]}")"
    if [[ "$health" == "✓" ]]; then
      health="⚠"
    fi
  fi

  # Format temperature with unit if we have a value
  if [[ -n "$temp" ]]; then
    temp_display="${temp}°C"
  fi

  printf '%s|%s|%s|%s|%s|%s\n' "$type" "$temp_display" "$health" "$model" "$serial" "$warnings"
}
#------------------------------------------------------------------------------
# Main Display Logic
#
# Looks up this host's chassis type and draws the matching diagram. Hosts
# without an entry in CHASSIS_TYPES get a short "unknown server" notice.
#------------------------------------------------------------------------------
HOSTNAME=$(hostname)
CHASSIS_TYPE=${CHASSIS_TYPES[$HOSTNAME]:-"unknown"}

# Display chassis layout
if [[ "$CHASSIS_TYPE" == "10bay" ]]; then
  generate_10bay_layout "$HOSTNAME"
elif [[ "$CHASSIS_TYPE" == "large1" ]]; then
  generate_large1_layout "$HOSTNAME"
elif [[ "$CHASSIS_TYPE" == "micro" ]]; then
  generate_micro_layout "$HOSTNAME"
else
  echo "┌─────────────────────────────────────────────────────────┐"
  echo "│ Unknown server: $HOSTNAME"
  echo "│ No chassis mapping defined yet"
  echo "│ Run diagnose-drives.sh to gather PCI path information"
  echo "└─────────────────────────────────────────────────────────┘"
fi
#------------------------------------------------------------------------------
# Drive Details Section
#
# Prints the heading, column titles and separator rule of the per-bay table.
#------------------------------------------------------------------------------
# Build Ceph OSD cache (single query instead of per-device)
# Skipped entirely when --no-ceph was given.
if [[ "$SKIP_CEPH" != true ]]; then
build_ceph_cache
fi
# Blank line between the chassis diagram and the table
printf "\n"
echo -e "$(colorize_header '=== Drive Details with SMART Status (by Bay Position) ===')"
# Column titles followed by a dashed rule; --show-pci appends a PCI PATH
# column and uses a longer rule.
if [[ "$SHOW_PCI" == true ]]; then
printf "%-5s %-15s %-10s %-8s %-8s %-8s %-30s %-20s %-12s %-10s %-10s %-30s %-40s\n" "BAY" "DEVICE" "SIZE" "TYPE" "TEMP" "HEALTH" "MODEL" "SERIAL" "CEPH OSD" "STATUS" "USAGE" "WARNINGS" "PCI PATH"
echo "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"
else
printf "%-5s %-15s %-10s %-8s %-8s %-8s %-30s %-20s %-12s %-10s %-10s %-30s\n" "BAY" "DEVICE" "SIZE" "TYPE" "TEMP" "HEALTH" "MODEL" "SERIAL" "CEPH OSD" "STATUS" "USAGE" "WARNINGS"
echo "----------------------------------------------------------------------------------------------------------------------------------------------------------------------"
fi
# Reverse lookup: device name -> bay, for every populated bay in DRIVE_MAP
declare -A DEVICE_TO_BAY
for bay in "${!DRIVE_MAP[@]}"; do
  device="${DRIVE_MAP[$bay]}"
  if [[ -z "$device" || "$device" == "EMPTY" ]]; then
    continue
  fi
  DEVICE_TO_BAY[$device]=$bay
done

# Display order, one bay id per line: numeric bays in numeric order first,
# followed by the m2-* slots in lexical order.
bay_keys=$(printf '%s\n' "${!DRIVE_MAP[@]}")
all_bays=$(
  grep -E '^[0-9]+$' <<< "$bay_keys" | sort -n
  grep -E '^m2-' <<< "$bay_keys" | sort
)
# Parallel SMART data collection for faster execution
# One background job per device writes its result to a file named after the
# device inside SMART_CACHE_DIR; the table below reads those files.
if [[ "$SKIP_SMART" != true ]]; then
  if SMART_CACHE_DIR="$(mktemp -d)" && [[ -d "$SMART_CACHE_DIR" ]]; then
    # Remove the cache directory when the script exits, including exits that
    # happen before the regular cleanup is reached.
    trap '[[ -n "${SMART_CACHE_DIR:-}" ]] && rm -rf -- "$SMART_CACHE_DIR"' EXIT

    log_info "Collecting SMART data in parallel..."
    for bay in $all_bays; do
      device="${DRIVE_MAP[$bay]}"
      if [[ -n "$device" && "$device" != "EMPTY" && -b "/dev/$device" ]]; then
        # Launch background job for each device
        (get_drive_smart_info "$device" > "$SMART_CACHE_DIR/$device") &
      fi
    done
    # Wait for all background SMART queries to complete
    wait
    log_info "SMART data collection complete"
  else
    # Without a cache directory the jobs would write to "/<device>", so fall
    # back to the same output as --skip-smart instead.
    log_error "Could not create temporary directory; skipping SMART data collection"
    SMART_CACHE_DIR=""
    SKIP_SMART=true
  fi
fi
# One table row per bay whose mapped device exists as a block device
for bay in $all_bays; do
  device="${DRIVE_MAP[$bay]}"
  if [[ -z "$device" || "$device" == "EMPTY" || ! -b "/dev/$device" ]]; then
    continue
  fi

  size="$(lsblk -d -n -o SIZE "/dev/$device" 2>/dev/null)"

  # Get SMART info from cache (or defaults if skipped)
  if [[ "$SKIP_SMART" == true ]]; then
    type="-"
    temp="-"
    health="-"
    model="-"
    serial="-"
    warnings=""
  else
    # Read from cached SMART data; a missing file leaves every field empty
    smart_info=""
    if [[ -f "$SMART_CACHE_DIR/$device" ]]; then
      smart_info="$(<"$SMART_CACHE_DIR/$device")"
    fi
    IFS='|' read -r type temp health model serial warnings <<< "$smart_info"
  fi

  # Check for Ceph OSD using cached data
  osd_id="-"
  ceph_status="-"
  if [[ "$SKIP_CEPH" != true ]]; then
    osd_id="${CEPH_DEVICE_TO_OSD[$device]:-}"
    if [[ -n "$osd_id" ]]; then
      # Get status from cached OSD tree data
      osd_num="${osd_id#osd.}"
      up_status="${CEPH_OSD_STATUS[$osd_num]:-unknown}"
      in_status="${CEPH_OSD_IN[$osd_num]:-out}"
      ceph_status="${up_status}/${in_status}"
    else
      osd_id="-"
    fi
  fi

  # Check mount points using lsblk (includes partitions)
  # This catches both whole-device mounts and partition mounts (e.g., /dev/sda1)
  usage="-"
  mount_points="$(lsblk -n -o MOUNTPOINT "/dev/$device" 2>/dev/null | grep -v '^$' | head -3 | tr '\n' ',')"
  mount_points="${mount_points%,}" # Remove trailing comma
  if [[ -n "$mount_points" ]]; then
    if [[ "$mount_points" == *"/"* && ! "$mount_points" == *"/boot"* && ! "$mount_points" == *"/home"* ]]; then
      # Root filesystem mounted (but not just /boot or /home)
      if echo "$mount_points" | grep -qE '^/,|^/$|,/$'; then
        usage="BOOT"
      else
        usage="$mount_points"
      fi
    else
      usage="$mount_points"
    fi
  fi

  # Apply colors if enabled
  colored_temp="$(colorize_temp "$temp")"
  colored_health="$(colorize_health "$health")"

  # Warnings column: "-" when there are none. This is a plain assignment
  # because "local" is only valid inside a function; it is reset on every
  # row so a previous drive's warnings can never be shown for this one.
  colored_warnings="${warnings:--}"
  if [[ "$USE_COLOR" == true && -n "$warnings" ]]; then
    colored_warnings="${COLOR_YELLOW}${warnings}${COLOR_RESET}"
  fi

  # %b expands the color escape sequences embedded in the colored fields
  if [[ "$SHOW_PCI" == true ]]; then
    pci_path="${BAY_TO_PCI_PATH[$bay]:-}"
    printf "%-5s %-15s %-10s %-8s %-8b %-8b %-30s %-20s %-12s %-10s %-10s %-30b %-40s\n" "$bay" "/dev/$device" "$size" "$type" "$colored_temp" "$colored_health" "$model" "$serial" "$osd_id" "$ceph_status" "$usage" "$colored_warnings" "$pci_path"
  else
    printf "%-5s %-15s %-10s %-8s %-8b %-8b %-30s %-20s %-12s %-10s %-10s %-30b\n" "$bay" "/dev/$device" "$size" "$type" "$colored_temp" "$colored_health" "$model" "$serial" "$osd_id" "$ceph_status" "$usage" "$colored_warnings"
  fi
done

# Clean up SMART cache directory
if [[ -n "${SMART_CACHE_DIR:-}" && -d "$SMART_CACHE_DIR" ]]; then
  rm -rf "$SMART_CACHE_DIR"
fi
# NVMe drives (only show unmapped ones - mapped NVMe drives appear in main table)
# stderr is silenced on lsblk itself, which is the command that can fail
nvme_devices=$(lsblk -d -n -o NAME,SIZE 2>/dev/null | grep "^nvme")
if [[ -n "$nvme_devices" ]]; then
  # Filter out already-mapped NVMe devices
  unmapped_nvme=""
  while read -r name size; do
    if [[ -z "${DEVICE_TO_BAY[$name]:-}" ]]; then
      unmapped_nvme+="$name $size"$'\n'
    fi
  done <<< "$nvme_devices"

  if [[ -n "$unmapped_nvme" ]]; then
    printf "\n"
    echo -e "$(colorize_header '=== Unmapped NVMe Drives ===')"
    printf "%-15s %-10s %-10s %-40s %-25s\n" "DEVICE" "SIZE" "TYPE" "MODEL" "SERIAL"
    echo "------------------------------------------------------------------------------------------------------"
    echo "$unmapped_nvme" | while read -r name size; do
      [[ -z "$name" ]] && continue
      device="/dev/$name"
      model="-"
      serial="-"
      # Honor --skip-smart here too: no smartctl calls, columns show "-"
      if [[ "$SKIP_SMART" != true ]]; then
        # Get model and serial from smartctl for accuracy
        smart_info="$(sudo smartctl -i "$device" 2>/dev/null)"
        model="$(echo "$smart_info" | grep "Model Number" | cut -d: -f2 | xargs)"
        serial="$(echo "$smart_info" | grep "Serial Number" | cut -d: -f2 | xargs)"
        [[ -z "$model" ]] && model="-"
        [[ -z "$serial" ]] && serial="-"
      fi
      printf "%-15s %-10s %-10s %-40s %-25s\n" "$device" "$size" "NVMe" "$model" "$serial"
    done
  fi
fi
#------------------------------------------------------------------------------
# Optional sections
#------------------------------------------------------------------------------
# Ceph RBD Devices: listed (version-sorted) only when lsblk reports any line
# containing "rbd"
rbd_devices=$(lsblk -d -n -o NAME,SIZE,TYPE 2>/dev/null | grep "rbd" | sort -V)
if [[ -n "$rbd_devices" ]]; then
  printf "\n"
  echo -e "$(colorize_header '=== Ceph RBD Devices ===')"
  printf "%-15s %-10s %-10s %-30s\n" "DEVICE" "SIZE" "TYPE" "MOUNTPOINT"
  echo "------------------------------------------------------------"
  echo "$rbd_devices" | while read -r name size type; do
    # First mountpoint reported for the device, "-" when there is none
    mountpoint=$(lsblk -n -o MOUNTPOINT "/dev/$name" 2>/dev/null | head -1)
    printf "%-15s %-10s %-10s %-30s\n" "/dev/$name" "$size" "$type" "${mountpoint:--}"
  done
fi
# Show mapping diagnostic info if DEBUG is set (via --debug or DEBUG=1)
if [[ -n "$DEBUG" ]]; then
  printf "\n"
  echo -e "$(colorize_header '=== DEBUG: Drive Mappings ===')"
  # Each line is "Bay <id>: <device>". Sort on the second space-separated
  # field with version ordering so bay 2 comes before bay 10 and the m2-*
  # slots come last; a plain "sort -n" sees the leading "Bay" and cannot
  # order by bay number.
  for key in "${!DRIVE_MAP[@]}"; do
    echo "Bay $key: ${DRIVE_MAP[$key]}"
  done | sort -t' ' -k2,2V
fi