Files
driveAtlas/driveAtlas.sh
Jared Vititoe 4a86cdd167 Refactor SMART parsing for parallel collection compatibility
Split SMART data handling into two functions:
- parse_smart_data(): Parses raw smartctl output (no I/O)
- get_drive_smart_info(): Fetches and parses (wrapper)

Changed parallel collection to save raw smartctl output to cache
files, then parse during the display loop. This avoids issues
with function availability in background subshells when running
from process substitution (bash <(curl ...)).

Also fixed:
- Removed orphan code that was outside function scope
- Fixed lsblk caching to use separate calls for SIZE and MOUNTPOINT

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-05 19:57:37 -05:00

1118 lines
44 KiB
Bash

#!/bin/bash
#==============================================================================
# Drive Atlas - Server Drive Mapping Tool
# Maps physical drive bays to logical device names using PCI paths
#==============================================================================
# Shell safety options:
# -o pipefail: a pipeline's exit status is that of the rightmost command that
#              failed (zero only if every stage succeeded)
# Note: Not using -e (errexit) to allow graceful degradation when tools fail
# Note: Not using -u (nounset) because several expansions deliberately read
#       values that may be unset (e.g. ${DRIVE_MAP[m2-1]} for an empty slot)
set -o pipefail
# Version string shown by --version and in the --help banner
VERSION="1.1.0"
#------------------------------------------------------------------------------
# Path Constants
# Centralized path definitions to avoid hardcoding throughout the script
#------------------------------------------------------------------------------
# Directory of by-path symlinks; build_drive_map resolves each mapping entry
# as "${DISK_BY_PATH}/<entry>" to find the current device name
readonly DISK_BY_PATH="/dev/disk/by-path"
#------------------------------------------------------------------------------
# show_usage
#
# Prints the help text to stdout: banner (with VERSION), usage line, option
# list, examples and environment variables. The program name shown is the
# basename of $0.
#------------------------------------------------------------------------------
show_usage() {
  local prog
  prog="$(basename "$0")"
  cat << EOF
Drive Atlas v${VERSION} - Server Drive Mapping Tool
Maps physical drive bays to logical device names using PCI paths.
Displays visual chassis layouts and comprehensive drive information.
USAGE:
${prog} [OPTIONS]
OPTIONS:
-h, --help Show this help message and exit
-v, --version Show version information
-d, --debug Enable debug output (show drive mappings)
-s, --skip-smart Skip SMART data collection (faster)
-c, --color Enable colored output
--verbose Show detailed error messages and warnings
--no-ceph Skip Ceph OSD information
--show-pci Show PCI paths in output
EXAMPLES:
${prog} # Normal run with all features
${prog} --skip-smart # Fast run without SMART data
${prog} --color # Run with colored output
${prog} --verbose # Show all errors and warnings
${prog} --debug # Show mapping debug info
ENVIRONMENT VARIABLES:
DEBUG=1 Same as --debug flag
For more information, see: https://code.lotusguild.org/LotusGuild/driveAtlas
EOF
}
#------------------------------------------------------------------------------
# Command Line Argument Parsing
#
# Every feature flag starts out false; each recognised option flips one flag.
# --help and --version print and exit 0; an unrecognised option prints a hint
# to stderr and exits 1. Flag-setting options fall through to the single
# shift at the bottom of the loop.
#------------------------------------------------------------------------------
SKIP_SMART=false
SKIP_CEPH=false
SHOW_PCI=false
USE_COLOR=false
VERBOSE=false
while (( $# > 0 )); do
  case "$1" in
    -h | --help)
      show_usage
      exit 0
      ;;
    -v | --version)
      echo "Drive Atlas v${VERSION}"
      exit 0
      ;;
    -d | --debug) DEBUG=1 ;;
    -s | --skip-smart) SKIP_SMART=true ;;
    --no-ceph) SKIP_CEPH=true ;;
    --show-pci) SHOW_PCI=true ;;
    -c | --color) USE_COLOR=true ;;
    --verbose) VERBOSE=true ;;
    *)
      echo "Unknown option: $1" >&2
      echo "Use --help for usage information." >&2
      exit 1
      ;;
  esac
  # Only reached by flag-setting options; the other arms have already exited
  shift
done
#------------------------------------------------------------------------------
# Color Definitions
# ANSI escape codes for terminal colors. Every COLOR_* variable is always
# defined; they stay empty unless colored output was requested, so they can
# be passed to printf '%b' unconditionally.
#------------------------------------------------------------------------------
COLOR_RESET=''
COLOR_RED=''
COLOR_GREEN=''
COLOR_YELLOW=''
COLOR_BLUE=''
COLOR_CYAN=''
COLOR_BOLD=''
if [[ "$USE_COLOR" == true ]]; then
  COLOR_RESET='\033[0m'
  COLOR_RED='\033[0;31m'
  COLOR_GREEN='\033[0;32m'
  COLOR_YELLOW='\033[0;33m'
  COLOR_BLUE='\033[0;34m'
  COLOR_CYAN='\033[0;36m'
  COLOR_BOLD='\033[1m'
fi
#------------------------------------------------------------------------------
# colorize_health
#
# Prints a health indicator (no trailing newline), colored when USE_COLOR is
# true:
#   ✓        -> green
#   ⚠        -> yellow (SMART passed but threshold warnings were raised)
#   - / ""   -> left uncolored (no SMART data available / collection skipped)
#   other    -> red (✗ and anything unexpected)
#
# Args: $1 - health status (✓, ⚠, ✗ or -)
#------------------------------------------------------------------------------
colorize_health() {
  local health="$1"
  local tint=""
  if [[ "$USE_COLOR" == true ]]; then
    case "$health" in
      "✓") tint="$COLOR_GREEN" ;;
      "⚠") tint="$COLOR_YELLOW" ;;
      "-" | "") tint="" ;; # placeholder, not a failure: do not paint it red
      *) tint="$COLOR_RED" ;;
    esac
  fi
  if [[ -n "$tint" ]]; then
    printf '%b%s%b' "$tint" "$health" "$COLOR_RESET"
  else
    printf '%s' "$health"
  fi
}
#------------------------------------------------------------------------------
# colorize_temp
#
# Prints a temperature string, colored by its numeric value when USE_COLOR is
# true: >= 60 red, >= 50 yellow, otherwise green. The string is passed through
# unchanged when colors are off, when it is the "-" placeholder, or when the
# part before "°C" is not a plain integer.
#
# Args: $1 - temperature string (e.g., "45°C")
#------------------------------------------------------------------------------
colorize_temp() {
  local temp_str="$1"
  local degrees
  local tint

  # Passthrough case (this branch ends the output with a newline)
  if [[ "$USE_COLOR" != true || "$temp_str" == "-" ]]; then
    echo "$temp_str"
    return
  fi

  # Drop the unit suffix; anything that is not a bare integer is left as-is
  degrees="${temp_str%°C}"
  if ! [[ "$degrees" =~ ^[0-9]+$ ]]; then
    printf '%s' "$temp_str"
    return
  fi

  if [[ "$degrees" -ge 60 ]]; then
    tint="$COLOR_RED"
  elif [[ "$degrees" -ge 50 ]]; then
    tint="$COLOR_YELLOW"
  else
    tint="$COLOR_GREEN"
  fi
  printf '%b%s%b' "$tint" "$temp_str" "$COLOR_RESET"
}
#------------------------------------------------------------------------------
# colorize_header
#
# Prints a section header followed by a newline; blue and bold when USE_COLOR
# is true, plain text otherwise.
#
# Args: $1 - header text
#------------------------------------------------------------------------------
colorize_header() {
  local title="$1"
  if [[ "$USE_COLOR" != true ]]; then
    printf '%s\n' "$title"
    return
  fi
  printf '%b%b%s%b\n' "$COLOR_BLUE" "$COLOR_BOLD" "$title" "$COLOR_RESET"
}
#------------------------------------------------------------------------------
# log_error
#
# Writes "ERROR: <message>" to stderr, regardless of verbose mode. The
# "ERROR:" label is red when USE_COLOR is true.
#
# Args: $1 - error message
#------------------------------------------------------------------------------
log_error() {
  local tint="" reset=""
  if [[ "$USE_COLOR" == true ]]; then
    tint="$COLOR_RED"
    reset="$COLOR_RESET"
  fi
  # Empty %b arguments print nothing, so one format covers both modes
  printf '%bERROR:%b %s\n' "$tint" "$reset" "$1" >&2
}
#------------------------------------------------------------------------------
# log_warn
#
# Writes "WARN: <message>" to stderr, but only when VERBOSE is true. The
# "WARN:" label is yellow when USE_COLOR is true.
#
# Args: $1 - warning message
#------------------------------------------------------------------------------
log_warn() {
  [[ "$VERBOSE" == true ]] || return 0
  local tint="" reset=""
  if [[ "$USE_COLOR" == true ]]; then
    tint="$COLOR_YELLOW"
    reset="$COLOR_RESET"
  fi
  printf '%bWARN:%b %s\n' "$tint" "$reset" "$1" >&2
}
#------------------------------------------------------------------------------
# log_info
#
# Writes "INFO: <message>" to stderr, but only when VERBOSE is true. The
# "INFO:" label is cyan when USE_COLOR is true.
#
# Args: $1 - info message
#------------------------------------------------------------------------------
log_info() {
  [[ "$VERBOSE" == true ]] || return 0
  local tint="" reset=""
  if [[ "$USE_COLOR" == true ]]; then
    tint="$COLOR_CYAN"
    reset="$COLOR_RESET"
  fi
  printf '%bINFO:%b %s\n' "$tint" "$reset" "$1" >&2
}
#------------------------------------------------------------------------------
# Dependency Checks
# Command lists consumed by check_dependencies, which verifies them before
# the rest of the script runs
#------------------------------------------------------------------------------
# Required dependencies: check_dependencies exits with status 1 if any of
# these commands is missing
REQUIRED_DEPS=(lsblk lspci readlink hostname)
# Optional dependencies: a missing command only produces a note on stderr
OPTIONAL_DEPS=(smartctl ceph ceph-volume bc nvme)
# URL printed in the "install missing packages" hints.
# NOTE(review): this is a plain-HTTP address that the hint suggests piping
# into bash; confirm it is only ever reachable on a trusted network.
FRESH_START_URL="http://10.10.10.63:3000/LotusGuild/freshStartScript/raw/branch/main/freshStart.sh"
#------------------------------------------------------------------------------
# check_dependencies
#
# Verifies that the commands in REQUIRED_DEPS and OPTIONAL_DEPS are available.
#
# Globals read: REQUIRED_DEPS, OPTIONAL_DEPS, FRESH_START_URL
# Output: all messages go to stderr
# Exits: with status 1 if any required command is missing
#
# Missing optional commands only produce a note. A further note is printed
# when smartctl exists but passwordless sudo ("sudo -n true") is unavailable.
#------------------------------------------------------------------------------
check_dependencies() {
  # Loop variable is function-local so it does not leak into the caller
  local cmd
  local missing_required=()
  local missing_optional=()

  # Check required dependencies
  for cmd in "${REQUIRED_DEPS[@]}"; do
    if ! command -v "$cmd" &>/dev/null; then
      missing_required+=("$cmd")
    fi
  done

  # Check optional dependencies
  for cmd in "${OPTIONAL_DEPS[@]}"; do
    if ! command -v "$cmd" &>/dev/null; then
      missing_optional+=("$cmd")
    fi
  done

  # Report missing required dependencies and exit
  if [[ ${#missing_required[@]} -gt 0 ]]; then
    echo "ERROR: Missing required dependencies: ${missing_required[*]}" >&2
    echo "" >&2
    echo "Please install the missing packages or run the fresh start script:" >&2
    echo " curl -s $FRESH_START_URL | bash" >&2
    echo "" >&2
    exit 1
  fi

  # Warn about missing optional dependencies
  if [[ ${#missing_optional[@]} -gt 0 ]]; then
    echo "Note: Some optional features unavailable. Missing: ${missing_optional[*]}" >&2
    echo " Install them or run: curl -s $FRESH_START_URL | bash" >&2
    echo "" >&2
  fi

  # Check for sudo access (needed for smartctl); -n never prompts
  if command -v smartctl &>/dev/null && ! sudo -n true 2>/dev/null; then
    echo "Note: SMART data requires sudo access. Run with sudo for full functionality." >&2
  fi
}
# Run dependency check at script start, before any layout or SMART work, so a
# missing required tool aborts the script (exit 1) up front
check_dependencies
#------------------------------------------------------------------------------
# Chassis Layout Generator Functions
# These define the physical layout and display formatting for each chassis type
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
# generate_10bay_layout
#
# Prints an ASCII-art representation of a 10-bay hot-swap chassis
# (Sliger CX4712): storage controllers, the M.2 NVMe slot when bay "m2-1" is
# populated, and the 10 front hot-swap bays. Unpopulated bays show "EMPTY".
#
# Args:
# $1 - Hostname to display in the layout header
#
# Side effects: Calls build_drive_map() to populate DRIVE_MAP
#------------------------------------------------------------------------------
generate_10bay_layout() {
  local hostname="$1"
  # Loop variables are function-local so they do not leak into the caller
  local ctrl bay
  build_drive_map

  # Main chassis section
  printf "┌────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐\n"
  printf "│ %-126s │\n" "$hostname - Sliger CX4712 (10x 3.5\" Hot-swap)"
  printf "│ │\n"

  # Show storage controllers (one line per controller, blank lines skipped)
  printf "│ Storage Controllers: │\n"
  while IFS= read -r ctrl; do
    [[ -n "$ctrl" ]] && printf "│ %-126s│\n" "$ctrl"
  done < <(get_storage_controllers)
  printf "│ │\n"

  # M.2 NVMe slot if present
  if [[ -n "${DRIVE_MAP[m2-1]}" ]]; then
    printf "│ M.2 NVMe: %-10s │\n" "${DRIVE_MAP[m2-1]}"
    printf "│ │\n"
  fi

  printf "│ Front Hot-swap Bays: │\n"
  printf "│ │\n"

  # Bay top borders
  printf "│ "
  for bay in {1..10}; do
    printf "┌──────────┐ "
  done
  printf " │\n"

  # Bay contents: "<bay>:<device>", 10 characters wide to fit the bay box
  printf "│ "
  for bay in {1..10}; do
    printf "│%-2d:%-7s│ " "$bay" "${DRIVE_MAP[$bay]:-EMPTY}"
  done
  printf " │\n"

  # Bay bottom borders
  printf "│ "
  for bay in {1..10}; do
    printf "└──────────┘ "
  done
  printf " │\n"
  printf "└────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘\n"
}
#------------------------------------------------------------------------------
# generate_micro_layout
#
# Prints an ASCII-art representation of a micro SBC (e.g., ZimaBoard):
# storage controllers, the onboard eMMC when /dev/mmcblk0 is a block device,
# and the 2 rear SATA ports. Unpopulated ports show "EMPTY".
#
# Args:
# $1 - Hostname to display in the layout header
#
# Side effects: Calls build_drive_map() to populate DRIVE_MAP
#------------------------------------------------------------------------------
generate_micro_layout() {
  local hostname="$1"
  # Loop variable is function-local so it does not leak into the caller
  local ctrl
  build_drive_map

  # Check for eMMC storage
  local emmc_device=""
  if [[ -b /dev/mmcblk0 ]]; then
    emmc_device="mmcblk0"
  fi

  printf "┌─────────────────────────────────────────────────────────────┐\n"
  printf "│ %-57s │\n" "$hostname - Micro SBC"
  printf "│ │\n"
  printf "│ Storage Controllers: │\n"
  while IFS= read -r ctrl; do
    [[ -n "$ctrl" ]] && printf "│ %-57s│\n" "$ctrl"
  done < <(get_storage_controllers)
  printf "│ │\n"

  # Show eMMC if present
  if [[ -n "$emmc_device" ]]; then
    # Declared separately from the assignment so 'local' does not mask the
    # pipeline's exit status; xargs trims surrounding whitespace
    local emmc_size
    emmc_size="$(lsblk -d -n -o SIZE "/dev/$emmc_device" 2>/dev/null | xargs)"
    printf "│ ┌─────────────────────────────────────────────────────┐ │\n"
    printf "│ │ Onboard eMMC: %-10s (%s) │ │\n" "$emmc_device" "$emmc_size"
    printf "│ └─────────────────────────────────────────────────────┘ │\n"
    printf "│ │\n"
  fi

  printf "│ SATA Ports (rear): │\n"
  printf "│ ┌──────────────┐ ┌──────────────┐ │\n"
  printf "│ │ 1: %-9s │ │ 2: %-9s │ │\n" "${DRIVE_MAP[1]:-EMPTY}" "${DRIVE_MAP[2]:-EMPTY}"
  printf "│ └──────────────┘ └──────────────┘ │\n"
  printf "└─────────────────────────────────────────────────────────────┘\n"
}
#------------------------------------------------------------------------------
# generate_large1_layout
#
# Prints an ASCII-art representation of the large1 chassis
# (Rosewill RSV-L4500U): storage controllers, 2 M.2 NVMe slots, and 15 front
# bays drawn as 3 stacks x 5 rows. Unpopulated slots and bays show "EMPTY".
#
# Args:
# $1 - Hostname to display in the layout header
#
# Side effects: Calls build_drive_map() to populate DRIVE_MAP
#------------------------------------------------------------------------------
generate_large1_layout() {
  local hostname="$1"
  # Loop variable is function-local so it does not leak into the caller
  local ctrl
  build_drive_map

  # large1 has 3 stacks of 5 bays at front (15 total) + 2 M.2 slots
  # Physical bay mapping TBD - current mapping is by controller order
  printf "┌─────────────────────────────────────────────────────────────────────────┐\n"
  printf "│ %-69s │\n" "$hostname - Rosewill RSV-L4500U (15x 3.5\" Bays)"
  printf "│ │\n"
  printf "│ Storage Controllers: │\n"
  while IFS= read -r ctrl; do
    [[ -n "$ctrl" ]] && printf "│ %-69s│\n" "$ctrl"
  done < <(get_storage_controllers)
  printf "│ │\n"
  printf "│ M.2 NVMe: M1: %-10s M2: %-10s │\n" "${DRIVE_MAP[m2-1]:-EMPTY}" "${DRIVE_MAP[m2-2]:-EMPTY}"
  printf "│ │\n"
  printf "│ Front Bays (3 stacks x 5 rows): [Bay mapping TBD] │\n"
  printf "│ Stack A Stack B Stack C │\n"
  printf "│ ┌──────────┐ ┌──────────┐ ┌──────────┐ │\n"
  # Bays are numbered left-to-right, top-to-bottom; two-digit bay numbers use
  # a narrower device field so every cell stays 10 characters wide
  printf "│ │1:%-8s│ │2:%-8s│ │3:%-8s│ │\n" "${DRIVE_MAP[1]:-EMPTY}" "${DRIVE_MAP[2]:-EMPTY}" "${DRIVE_MAP[3]:-EMPTY}"
  printf "│ ├──────────┤ ├──────────┤ ├──────────┤ │\n"
  printf "│ │4:%-8s│ │5:%-8s│ │6:%-8s│ │\n" "${DRIVE_MAP[4]:-EMPTY}" "${DRIVE_MAP[5]:-EMPTY}" "${DRIVE_MAP[6]:-EMPTY}"
  printf "│ ├──────────┤ ├──────────┤ ├──────────┤ │\n"
  printf "│ │7:%-8s│ │8:%-8s│ │9:%-8s│ │\n" "${DRIVE_MAP[7]:-EMPTY}" "${DRIVE_MAP[8]:-EMPTY}" "${DRIVE_MAP[9]:-EMPTY}"
  printf "│ ├──────────┤ ├──────────┤ ├──────────┤ │\n"
  printf "│ │10:%-7s│ │11:%-7s│ │12:%-7s│ │\n" "${DRIVE_MAP[10]:-EMPTY}" "${DRIVE_MAP[11]:-EMPTY}" "${DRIVE_MAP[12]:-EMPTY}"
  printf "│ ├──────────┤ ├──────────┤ ├──────────┤ │\n"
  printf "│ │13:%-7s│ │14:%-7s│ │15:%-7s│ │\n" "${DRIVE_MAP[13]:-EMPTY}" "${DRIVE_MAP[14]:-EMPTY}" "${DRIVE_MAP[15]:-EMPTY}"
  printf "│ └──────────┘ └──────────┘ └──────────┘ │\n"
  printf "└─────────────────────────────────────────────────────────────────────────┘\n"
}
#------------------------------------------------------------------------------
# Server-Specific Drive Mappings
# Maps PCI paths to physical bay numbers for each server
#
# Key:   hostname, as printed by the hostname command
# Value: multi-line string, one entry per line in the form
#          "pci-path bay-id"
#        pci-path is the name of a symlink under /dev/disk/by-path;
#        bay-id is a bay number (1, 2, ...) or an M.2 slot id (m2-1, m2-2).
# build_drive_map reads these lines and resolves each symlink to the current
# device name. An empty value means the host has no mapped bays.
#------------------------------------------------------------------------------
declare -A SERVER_MAPPINGS=(
# compute-storage-01 (formerly medium2)
# Motherboard: B650D4U3-2Q/BCM with AMD SATA controller
# HBA: LSI SAS3008 at 01:00.0 (mini-SAS HD ports)
# Cable mapping from user notes:
# - Mobo SATA: top-right=bay1, bottom-right=bay2, bottom-left=bay3, top-left=bay4
# - HBA bottom mini-SAS: bays 5,6,7,8
# - HBA top mini-SAS: bays 9,10
["compute-storage-01"]="
pci-0000:0d:00.0-ata-2 1
pci-0000:0d:00.0-ata-1 2
pci-0000:0d:00.0-ata-3 3
pci-0000:0d:00.0-ata-4 4
pci-0000:01:00.0-sas-phy6-lun-0 5
pci-0000:01:00.0-sas-phy7-lun-0 6
pci-0000:01:00.0-sas-phy5-lun-0 7
pci-0000:01:00.0-sas-phy2-lun-0 8
pci-0000:01:00.0-sas-phy4-lun-0 9
pci-0000:01:00.0-sas-phy3-lun-0 10
pci-0000:0e:00.0-nvme-1 m2-1
"
# compute-storage-gpu-01
# Motherboard: ASUS PRIME B550-PLUS with AMD SATA controller at 02:00.1
# 5 SATA ports + 1 M.2 NVMe slot
# sdf is USB/card reader - not mapped
["compute-storage-gpu-01"]="
pci-0000:02:00.1-ata-1 1
pci-0000:02:00.1-ata-2 2
pci-0000:02:00.1-ata-3 3
pci-0000:02:00.1-ata-4 4
pci-0000:02:00.1-ata-5 5
pci-0000:0c:00.0-nvme-1 m2-1
"
# storage-01
# Motherboard: ASRock A320M-HDV R4.0 with AMD SATA controller at 02:00.1
# 4 SATA ports used (ata-1, ata-2, ata-5, ata-6) - ata-3/4 empty
["storage-01"]="
pci-0000:02:00.1-ata-1 1
pci-0000:02:00.1-ata-2 2
pci-0000:02:00.1-ata-5 3
pci-0000:02:00.1-ata-6 4
"
# large1
# Custom tower with multiple controllers:
# - HBA: LSI SAS2008 at 10:00.0 (7 drives)
# - AMD SATA at 16:00.1 (3 drives)
# - ASMedia SATA at 25:00.0 (2 drives)
# - 2x NVMe slots
# Only bays 1-12 are mapped here, so bays 13-15 of the large1 layout always
# render as EMPTY
["large1"]="
pci-0000:10:00.0-sas-phy0-lun-0 1
pci-0000:10:00.0-sas-phy1-lun-0 2
pci-0000:10:00.0-sas-phy3-lun-0 3
pci-0000:10:00.0-sas-phy4-lun-0 4
pci-0000:10:00.0-sas-phy5-lun-0 5
pci-0000:10:00.0-sas-phy6-lun-0 6
pci-0000:10:00.0-sas-phy7-lun-0 7
pci-0000:16:00.1-ata-3 8
pci-0000:16:00.1-ata-7 9
pci-0000:16:00.1-ata-8 10
pci-0000:25:00.0-ata-1 11
pci-0000:25:00.0-ata-2 12
pci-0000:2a:00.0-nvme-1 m2-1
pci-0000:26:00.0-nvme-1 m2-2
"
# micro1
# ZimaBoard 832 - Single board computer
# 2 SATA ports on rear (currently unused)
# Boot from onboard eMMC (mmcblk0)
# SATA controller at 00:12.0
["micro1"]="
"
# monitor-02
# ZimaBoard 832 - Single board computer
# 2 SATA ports on rear (currently unused)
# Boot from onboard eMMC (mmcblk0)
# SATA controller would be at a specific PCI address when drives connected
["monitor-02"]="
"
)
# Maps each known hostname to the chassis layout drawn for it ("10bay",
# "large1" or "micro"). A host without an entry is treated as "unknown" by
# the main display logic.
declare -A CHASSIS_TYPES=(
["compute-storage-01"]="10bay"
["compute-storage-gpu-01"]="10bay"
["storage-01"]="10bay"
["large1"]="large1"
["micro1"]="micro" # ZimaBoard 832
["monitor-02"]="micro" # ZimaBoard 832
)
#------------------------------------------------------------------------------
# Core Functions
#------------------------------------------------------------------------------
# Filtered lspci output, filled in by get_storage_controllers on first use
LSPCI_CACHE=""
#------------------------------------------------------------------------------
# get_storage_controllers
#
# Prints the storage controllers reported by lspci (lines mentioning SAS,
# SATA, RAID, Mass storage or NVMe), reformatted one per line.
# lspci is only invoked while LSPCI_CACHE is still empty.
#
# Output Format: " PCI_ADDR: DESCRIPTION" (one per line)
#------------------------------------------------------------------------------
get_storage_controllers() {
  local line pci_addr desc

  # Populate the cache when it is empty
  if [[ -z "$LSPCI_CACHE" ]]; then
    LSPCI_CACHE="$(lspci 2>/dev/null | grep -iE "SAS|SATA|RAID|Mass storage|NVMe")"
  fi

  while read -r line; do
    [[ -z "$line" ]] && continue
    # First whitespace-delimited field is the PCI address
    pci_addr="${line%%[[:space:]]*}"
    # Description is the line with the leading "<address> " removed; a line
    # without such a prefix is printed unchanged
    if [[ "$line" =~ ^[0-9a-f:.]+\ (.*)$ ]]; then
      desc="${BASH_REMATCH[1]}"
    else
      desc="$line"
    fi
    echo " $pci_addr: $desc"
  done <<< "$LSPCI_CACHE"
}
#------------------------------------------------------------------------------
# build_drive_map
#
# Builds global associative arrays mapping physical bay identifiers to device
# names, using the SERVER_MAPPINGS entry for the current hostname. Both
# arrays are reset on every call.
#
# Sets:
# DRIVE_MAP (global associative array)
# Keys: Bay identifiers (1, 2, ..., m2-1, m2-2, etc.)
# Values: Device names (sda, nvme0n1, etc.); only bays whose by-path
#         symlink currently exists get an entry
# BAY_TO_PCI_PATH (global associative array)
# Keys: Bay identifiers
# Values: PCI path strings (for --show-pci option); set for every mapped
#         bay, populated or not
#
# When the host has no mapping, a warning is logged and both arrays are left
# empty.
#------------------------------------------------------------------------------
build_drive_map() {
  # Declared separately from the assignment so 'local' does not mask the
  # status of the command substitution
  local host mapping
  host="$(hostname)"
  mapping="${SERVER_MAPPINGS[$host]}"

  # Declare global arrays directly
  declare -g -A DRIVE_MAP=()
  declare -g -A BAY_TO_PCI_PATH=()

  if [[ -z "$mapping" ]]; then
    log_warn "No drive mapping found for host '$host'. Run diagnose-drives.sh to create one."
    return
  fi

  local mapped_count=0
  local empty_count=0
  # Loop variables are function-local so they do not leak into the caller
  local path slot link target
  while read -r path slot; do
    # Skip blank lines in the mapping string
    [[ -z "$path" || -z "$slot" ]] && continue
    BAY_TO_PCI_PATH[$slot]="$path"
    link="${DISK_BY_PATH}/$path"
    if [[ -L "$link" ]]; then
      # Resolve the symlink and keep only the final path component
      target="$(readlink -f "$link")"
      DRIVE_MAP[$slot]="${target##*/}"
      mapped_count=$((mapped_count + 1))
    else
      log_info "Bay $slot: No device at PCI path $path"
      empty_count=$((empty_count + 1))
    fi
  done <<< "$mapping"
  log_info "Mapped $mapped_count drives, $empty_count empty bays"
}
#------------------------------------------------------------------------------
# build_ceph_cache
#
# Queries Ceph once and builds lookup tables for OSD information.
# This is much more efficient than querying ceph-volume per device.
#
# Sets global associative arrays (reset on every call):
# CEPH_DEVICE_TO_OSD - Maps device names to OSD IDs (e.g., sda -> osd.5)
# CEPH_OSD_STATUS - Maps OSD numbers to up/down status
# CEPH_OSD_IN - Maps OSD numbers to in/out status
#
# Device names come from two lines of each "ceph-volume lvm list" OSD block:
#   - "devices":      the underlying device(s), comma separated; this is the
#                     line that yields plain disk names such as "sda"
#   - "block device": recorded as before (path with "/dev/" stripped); for
#                     LVM-backed OSDs this is a volume path, not a disk name
#
# Returns early (arrays left as filled so far) when ceph-volume or ceph is
# not installed.
#------------------------------------------------------------------------------
build_ceph_cache() {
  declare -g -A CEPH_DEVICE_TO_OSD=()
  declare -g -A CEPH_OSD_STATUS=()
  declare -g -A CEPH_OSD_IN=()

  # Skip if ceph-volume is not available
  if ! command -v ceph-volume &>/dev/null; then
    log_info "ceph-volume not found, skipping Ceph OSD detection"
    return
  fi
  log_info "Querying Ceph OSD information..."

  # Parse ceph-volume lvm list output
  # Format: blocks starting with "====== osd.X =======" followed by device info
  local current_osd=""
  local line dev_name dev_path
  local -a dev_paths
  while IFS= read -r line; do
    # Match OSD header: "====== osd.5 ======="
    if [[ "$line" =~ ======[[:space:]]+osd\.([0-9]+)[[:space:]]+======= ]]; then
      current_osd="osd.${BASH_REMATCH[1]}"
    elif [[ -z "$current_osd" ]]; then
      # Device lines before the first OSD header cannot be attributed
      continue
    # Match block device line: " block device /dev/sda"
    elif [[ "$line" =~ block[[:space:]]device[[:space:]]+/dev/([^[:space:]]+) ]]; then
      dev_name="${BASH_REMATCH[1]}"
      CEPH_DEVICE_TO_OSD[$dev_name]="$current_osd"
    # Match underlying devices line: " devices /dev/sda" or "/dev/sda,/dev/sdb"
    elif [[ "$line" =~ ^[[:space:]]*devices[[:space:]]+([^[:space:]]+) ]]; then
      IFS=',' read -r -a dev_paths <<< "${BASH_REMATCH[1]}"
      for dev_path in "${dev_paths[@]}"; do
        dev_name="${dev_path##*/}"
        [[ -n "$dev_name" ]] && CEPH_DEVICE_TO_OSD[$dev_name]="$current_osd"
      done
    fi
  done < <(ceph-volume lvm list 2>/dev/null)

  # Skip if ceph command is not available
  if ! command -v ceph &>/dev/null; then
    log_info "ceph CLI not found, skipping OSD status detection"
    return
  fi
  log_info "Querying Ceph OSD status..."

  # Parse ceph osd tree for status
  # Format: ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT
  local osd_num status reweight
  while IFS= read -r line; do
    # Match OSD lines: " 5 hdd 3.63660 osd.5 up 1.00000"
    if [[ "$line" =~ ^[[:space:]]*([0-9]+)[[:space:]]+.*osd\.([0-9]+)[[:space:]]+(up|down)[[:space:]]+([0-9.]+) ]]; then
      osd_num="${BASH_REMATCH[1]}"
      status="${BASH_REMATCH[3]}"
      reweight="${BASH_REMATCH[4]}"
      CEPH_OSD_STATUS[$osd_num]="$status"
      # Determine in/out based on reweight (floating point, hence awk); the
      # value is passed with -v rather than spliced into the awk program
      if awk -v r="$reweight" 'BEGIN { exit !(r > 0) }'; then
        CEPH_OSD_IN[$osd_num]="in"
      else
        CEPH_OSD_IN[$osd_num]="out"
      fi
    fi
  done < <(ceph osd tree 2>/dev/null)
}
# SMART warning thresholds
# parse_smart_data adds a warning when a value is greater than or equal to
# its threshold. The temperature limits match the 50/60 cut-offs that
# colorize_temp hard-codes; keep the two in sync when changing them.
readonly SMART_TEMP_WARN=50 # Temperature warning threshold (°C)
readonly SMART_TEMP_CRIT=60 # Temperature critical threshold (°C)
readonly SMART_REALLOCATED_WARN=1 # Reallocated sectors warning threshold
readonly SMART_PENDING_WARN=1 # Pending sectors warning threshold
readonly SMART_CRC_ERROR_WARN=100 # UDMA CRC error warning threshold
readonly SMART_POWER_ON_HOURS_WARN=43800 # ~5 years of continuous use
#------------------------------------------------------------------------------
# _smart_attr_raw
#
# Prints the leading integer of the RAW_VALUE column (10th field) of the
# first SMART attribute row whose name starts with the given prefix. Taking
# only the leading digits copes with annotated raw values such as
# "38 (0 18 0 0 0)" or "1234h+05m+10.000s". Prints nothing if no row matches.
#
# Args:
# $1 - Raw smartctl output string
# $2 - Attribute name prefix (e.g., Reallocated_Sector)
# $3 - Attribute ID that must also match (optional)
#------------------------------------------------------------------------------
_smart_attr_raw() {
  awk -v name="$2" -v id="${3:-}" '
    (id == "" || $1 == id) && index($2, name) == 1 {
      raw = $10
      sub(/[^0-9].*$/, "", raw)
      print raw
      exit
    }
  ' <<< "$1"
}
#------------------------------------------------------------------------------
# parse_smart_data
#
# Parses raw SMART data and returns formatted info string. Performs no I/O on
# the device itself; it only inspects the text passed in.
#
# Args:
# $1 - Device name (e.g., sda, nvme0n1)
# $2 - Raw smartctl output string
#
# Returns: Pipe-delimited string: TYPE|TEMP|HEALTH|MODEL|SERIAL|WARNINGS
# TYPE     - NVMe, SSD or HDD
# TEMP     - "<n>°C", or "-" when no temperature was found
# HEALTH   - ✓ (passed), ⚠ (passed, but warnings present) or ✗
# WARNINGS - comma-separated list (TEMP_CRIT, TEMP_WARN, REALLOC:n,
#            PENDING:n, CRC:n, HOURS:n), empty when there are none
# Empty input yields "HDD|-|✗|-|-|".
#------------------------------------------------------------------------------
parse_smart_data() {
  local device="$1"
  local smart_info="$2"
  local temp="-"
  local type="HDD"
  local health="✗"
  local model="-"
  local serial="-"
  local warnings=""

  if [[ -z "$smart_info" ]]; then
    echo "HDD|-|✗|-|-|"
    return
  fi

  # Temperature parsing - handles multiple formats:
  # - attribute row "Temperature_Celsius" (raw value column)
  # - "Temperature: 42 Celsius"
  # - "Current Temperature: 35 Celsius"
  # - "Current Drive Temperature: 35 C"
  temp="$(_smart_attr_raw "$smart_info" "Temperature_Celsius")"
  if [[ -z "$temp" ]]; then
    # Take the first integer after the colon; a positional field would pick
    # up the word "Temperature:" on the "Current ..." variants
    temp="$(awk -F: '
      /^(Current )?(Drive )?Temperature:/ {
        v = $2
        sub(/^[^0-9]*/, "", v)
        sub(/[^0-9].*$/, "", v)
        print v
        exit
      }
    ' <<< "$smart_info")"
  fi
  [[ -z "$temp" ]] && temp="-"

  # Device type detection - handles SSD, HDD, and NVMe
  if [[ "$device" == nvme* ]]; then
    type="NVMe"
  elif grep -q "Rotation Rate" <<< "$smart_info"; then
    # The rate must be exactly 0 to count as an SSD: an unanchored "0 rpm"
    # would also match "7200 rpm" or "5400 rpm"
    if grep "Rotation Rate" <<< "$smart_info" | grep -qiE "solid state|:[[:space:]]*0 rpm"; then
      type="SSD"
    else
      type="HDD"
    fi
  elif grep -qiE "SSD|Solid State" <<< "$smart_info"; then
    type="SSD"
  fi

  # Health status (basic SMART check)
  if grep -q "SMART overall-health.*PASSED" <<< "$smart_info"; then
    health="✓"
  elif grep -q "SMART Health Status.*OK" <<< "$smart_info"; then
    # Alternate report wording ("SMART Health Status: OK")
    health="✓"
  fi

  # Model - try multiple field names
  model="$(grep -E "^(Device Model|Model Number|Product):" <<< "$smart_info" | head -1 | cut -d: -f2 | xargs)"
  [[ -z "$model" ]] && model="-"

  # Serial number - the text after the colon, whitespace-trimmed by xargs
  serial="$(grep -E "^Serial [Nn]umber:" <<< "$smart_info" | head -1 | cut -d: -f2 | xargs)"
  [[ -z "$serial" ]] && serial="-"

  # SMART threshold warnings - check for concerning values
  local warn_list=()

  # Temperature thresholds
  if [[ "$temp" =~ ^[0-9]+$ ]]; then
    if [[ "$temp" -ge "$SMART_TEMP_CRIT" ]]; then
      warn_list+=("TEMP_CRIT")
    elif [[ "$temp" -ge "$SMART_TEMP_WARN" ]]; then
      warn_list+=("TEMP_WARN")
    fi
  fi

  # Reallocated sectors (SMART attribute 5)
  local reallocated
  reallocated="$(_smart_attr_raw "$smart_info" "Reallocated_Sector" 5)"
  if [[ "$reallocated" =~ ^[0-9]+$ && "$reallocated" -ge "$SMART_REALLOCATED_WARN" ]]; then
    warn_list+=("REALLOC:$reallocated")
  fi

  # Current pending sectors (SMART attribute 197)
  local pending
  pending="$(_smart_attr_raw "$smart_info" "Current_Pending" 197)"
  if [[ "$pending" =~ ^[0-9]+$ && "$pending" -ge "$SMART_PENDING_WARN" ]]; then
    warn_list+=("PENDING:$pending")
  fi

  # UDMA CRC errors (SMART attribute 199)
  local crc_errors
  crc_errors="$(_smart_attr_raw "$smart_info" "UDMA_CRC_Error" 199)"
  if [[ "$crc_errors" =~ ^[0-9]+$ && "$crc_errors" -ge "$SMART_CRC_ERROR_WARN" ]]; then
    warn_list+=("CRC:$crc_errors")
  fi

  # Power-on hours (SMART attribute 9)
  local power_hours
  power_hours="$(_smart_attr_raw "$smart_info" "Power_On_Hours" 9)"
  if [[ "$power_hours" =~ ^[0-9]+$ && "$power_hours" -ge "$SMART_POWER_ON_HOURS_WARN" ]]; then
    warn_list+=("HOURS:$power_hours")
  fi

  # Join warnings
  if [[ ${#warn_list[@]} -gt 0 ]]; then
    warnings="$(IFS=','; echo "${warn_list[*]}")"
    # Change health indicator to warning if SMART passed but has warnings
    if [[ "$health" == "✓" ]]; then
      health="⚠"
    fi
  fi

  # Format temperature with unit if we have a value
  local temp_display
  if [[ -n "$temp" && "$temp" != "-" ]]; then
    temp_display="${temp}°C"
  else
    temp_display="-"
  fi

  echo "${type}|${temp_display}|${health}|${model}|${serial}|${warnings}"
}
#------------------------------------------------------------------------------
# get_drive_smart_info
#
# Convenience wrapper: runs smartctl (attributes, identity and health) on the
# device via sudo and hands the captured text to parse_smart_data. smartctl's
# stderr is discarded, so a failed query is parsed as whatever stdout it
# produced (possibly nothing).
#
# Args:
# $1 - Device name (e.g., sda, nvme0n1)
#
# Returns: Pipe-delimited string: TYPE|TEMP|HEALTH|MODEL|SERIAL|WARNINGS
#------------------------------------------------------------------------------
get_drive_smart_info() {
  local device="$1"
  parse_smart_data "$device" "$(sudo smartctl -A -i -H "/dev/$device" 2>/dev/null)"
}
#------------------------------------------------------------------------------
# Main Display Logic
#
# Looks the current hostname up in CHASSIS_TYPES and draws the matching
# chassis layout. Hosts without an entry get a short notice box instead.
#------------------------------------------------------------------------------
HOSTNAME=$(hostname)
CHASSIS_TYPE=${CHASSIS_TYPES[$HOSTNAME]:-"unknown"}
# Display chassis layout
if [[ "$CHASSIS_TYPE" == "10bay" ]]; then
  generate_10bay_layout "$HOSTNAME"
elif [[ "$CHASSIS_TYPE" == "large1" ]]; then
  generate_large1_layout "$HOSTNAME"
elif [[ "$CHASSIS_TYPE" == "micro" ]]; then
  generate_micro_layout "$HOSTNAME"
else
  printf '%s\n' \
    "┌─────────────────────────────────────────────────────────┐" \
    "│ Unknown server: $HOSTNAME" \
    "│ No chassis mapping defined yet" \
    "│ Run diagnose-drives.sh to gather PCI path information" \
    "└─────────────────────────────────────────────────────────┘"
fi
#------------------------------------------------------------------------------
# Drive Details Section
#
# Prints the section title, the column header row and a dashed separator.
# The column format is defined once; --show-pci appends one more column. The
# separator is generated to the exact width of the header row, so it cannot
# drift out of sync when a column is added or resized.
#------------------------------------------------------------------------------
# Build Ceph OSD cache (single query instead of per-device)
if [[ "$SKIP_CEPH" != true ]]; then
  build_ceph_cache
fi
printf "\n"
colorize_header '=== Drive Details with SMART Status (by Bay Position) ==='
table_header_fmt="%-5s %-15s %-10s %-8s %-8s %-8s %-30s %-20s %-12s %-10s %-10s %-30s"
table_header_cols=("BAY" "DEVICE" "SIZE" "TYPE" "TEMP" "HEALTH" "MODEL" "SERIAL" "CEPH OSD" "STATUS" "USAGE" "WARNINGS")
if [[ "$SHOW_PCI" == true ]]; then
  table_header_fmt+=" %-40s"
  table_header_cols+=("PCI PATH")
fi
# Render the header into a variable first so its length can be measured
printf -v table_header_line "$table_header_fmt" "${table_header_cols[@]}"
printf '%s\n' "$table_header_line"
# One dash per header character
printf '%*s\n' "${#table_header_line}" '' | tr ' ' '-'
# Build reverse map: device -> bay (entries that are empty or the literal
# "EMPTY" are left out)
declare -A DEVICE_TO_BAY
for bay in "${!DRIVE_MAP[@]}"; do
  device="${DRIVE_MAP[$bay]}"
  [[ -n "$device" && "$device" != "EMPTY" ]] || continue
  DEVICE_TO_BAY["$device"]="$bay"
done
# Display order of bays, one identifier per line: purely numeric bays first
# (sorted numerically), followed by the m2-* slots (sorted as text)
all_bays="$(
  printf '%s\n' "${!DRIVE_MAP[@]}" | grep -E '^[0-9]+$' | sort -n
  printf '%s\n' "${!DRIVE_MAP[@]}" | grep -E '^m2-' | sort
)"
# Cache lsblk data up front so the display loop needs no per-device calls
# LSBLK_SIZE:   whole-disk name -> size string
# LSBLK_MOUNTS: parent disk name -> comma-separated mount points of the disk
#               and its partitions
declare -A LSBLK_SIZE=()
declare -A LSBLK_MOUNTS=()
log_info "Caching block device information..."
# Sizes: -d lists whole disks only
while read -r name size; do
  if [[ -n "$name" ]]; then
    LSBLK_SIZE["$name"]="$size"
  fi
done < <(lsblk -dn -o NAME,SIZE 2>/dev/null)
# Mount points: reported per partition, folded back onto the parent disk
while read -r name mounts; do
  # Entries without a mount point contribute nothing
  [[ -n "$name" && -n "$mounts" ]] || continue
  # Strip partition suffix (sda1 -> sda, nvme0n1p1 -> nvme0n1); any other
  # name is used as its own parent
  parent="$name"
  if [[ "$name" =~ ^(nvme[0-9]+n[0-9]+)p[0-9]+$ || "$name" =~ ^([a-z]+)[0-9]+$ ]]; then
    parent="${BASH_REMATCH[1]}"
  fi
  # Append with a comma only when the parent already has an entry
  LSBLK_MOUNTS["$parent"]="${LSBLK_MOUNTS[$parent]:+${LSBLK_MOUNTS[$parent]},}${mounts}"
done < <(lsblk -rn -o NAME,MOUNTPOINT 2>/dev/null | grep -v '^ ')
# Parallel SMART data collection for faster execution
# One background smartctl job per populated bay writes its raw output to
# "$SMART_CACHE_DIR/<device>.raw"; parsing happens later in the display loop.
# If the cache directory cannot be created, SMART collection is switched off
# for this run instead of writing files relative to an empty path.
if [[ "$SKIP_SMART" != true ]]; then
  if SMART_CACHE_DIR="$(mktemp -d)" && [[ -d "$SMART_CACHE_DIR" ]]; then
    log_info "Collecting SMART data in parallel..."
    # $all_bays is a newline-separated list; word splitting is intentional
    for bay in $all_bays; do
      device="${DRIVE_MAP[$bay]}"
      if [[ -n "$device" && "$device" != "EMPTY" && -b "/dev/$device" ]]; then
        # Launch background job to collect raw smartctl data
        (sudo smartctl -A -i -H "/dev/$device" > "$SMART_CACHE_DIR/${device}.raw" 2>/dev/null) &
      fi
    done
    # Wait for all background SMART queries to complete
    wait
    log_info "SMART data collection complete"
  else
    log_error "Could not create a temporary directory for SMART data; skipping SMART collection"
    SKIP_SMART=true
  fi
fi
#------------------------------------------------------------------------------
# summarize_mount_usage
#
# Condenses a comma-separated mount point list into the USAGE column value.
#
# Arguments: $1 - comma-separated mount points (may be empty)
# Outputs:   "-"    when the list is empty
#            "BOOT" when any entry is exactly "/" (root filesystem),
#                   regardless of its position or of sibling mounts such as
#                   /boot/efi or /home on the same disk
#            otherwise the first three entries, comma-joined
#------------------------------------------------------------------------------
summarize_mount_usage() {
    local mount_list="$1"

    if [[ -z "$mount_list" ]]; then
        printf '%s\n' "-"
    elif [[ ",${mount_list}," == *",/,"* ]]; then
        # Wrapping in commas turns "is / one of the entries" into a plain
        # substring test that works for first, middle and last position.
        printf '%s\n' "BOOT"
    else
        # Limit to first 3 mount points for display
        cut -d',' -f1-3 <<< "$mount_list"
    fi
}

# Emit one table row per populated bay, in the order given by all_bays.
for bay in $all_bays; do
    device="${DRIVE_MAP[$bay]}"

    # Skip empty bays and mappings whose block device is not present
    if [[ -z "$device" || "$device" == "EMPTY" || ! -b "/dev/$device" ]]; then
        continue
    fi

    # Use cached lsblk data
    size="${LSBLK_SIZE[$device]:-}"

    # SMART columns: start from placeholders, then overwrite them when raw
    # smartctl output was cached for this device and SMART is not skipped.
    type="-"
    temp="-"
    health="-"
    model="-"
    serial="-"
    warnings=""
    if [[ "$SKIP_SMART" != true ]]; then
        raw_smart=""
        if [[ -f "$SMART_CACHE_DIR/${device}.raw" ]]; then
            raw_smart="$(cat "$SMART_CACHE_DIR/${device}.raw")"
        fi
        if [[ -n "$raw_smart" ]]; then
            # parse_smart_data output is consumed as six pipe-delimited fields
            smart_info="$(parse_smart_data "$device" "$raw_smart")"
            IFS='|' read -r type temp health model serial warnings <<< "$smart_info"
        fi
    fi

    # Check for Ceph OSD using cached data
    osd_id="-"
    ceph_status="-"
    if [[ "$SKIP_CEPH" != true ]]; then
        osd_id="${CEPH_DEVICE_TO_OSD[$device]:-}"
        if [[ -n "$osd_id" ]]; then
            # Get status from cached OSD tree data (keyed by the numeric id)
            osd_num="${osd_id#osd.}"
            up_status="${CEPH_OSD_STATUS[$osd_num]:-unknown}"
            in_status="${CEPH_OSD_IN[$osd_num]:-out}"
            ceph_status="${up_status}/${in_status}"
        else
            osd_id="-"
        fi
    fi

    # Check mount points using cached lsblk data.
    # This includes both whole-device mounts and partition mounts.
    mount_points="${LSBLK_MOUNTS[$device]:-}"
    usage="$(summarize_mount_usage "$mount_points")"

    # Apply colors if enabled
    colored_temp="$(colorize_temp "$temp")"
    colored_health="$(colorize_health "$health")"

    # Colorize warnings if present; show "-" when there are none
    colored_warnings="${warnings:--}"
    if [[ "$USE_COLOR" == true && -n "$warnings" ]]; then
        colored_warnings="${COLOR_YELLOW}${warnings}${COLOR_RESET}"
    fi

    # %b is used for the colored columns so escape sequences are interpreted
    if [[ "$SHOW_PCI" == true ]]; then
        pci_path="${BAY_TO_PCI_PATH[$bay]:-}"
        printf "%-5s %-15s %-10s %-8s %-8b %-8b %-30s %-20s %-12s %-10s %-10s %-30b %-40s\n" "$bay" "/dev/$device" "$size" "$type" "$colored_temp" "$colored_health" "$model" "$serial" "$osd_id" "$ceph_status" "$usage" "$colored_warnings" "$pci_path"
    else
        printf "%-5s %-15s %-10s %-8s %-8b %-8b %-30s %-20s %-12s %-10s %-10s %-30b\n" "$bay" "/dev/$device" "$size" "$type" "$colored_temp" "$colored_health" "$model" "$serial" "$osd_id" "$ceph_status" "$usage" "$colored_warnings"
    fi
done
# Remove the temporary SMART cache directory once the table has been printed.
# "-d" is false for an unset or empty path, so nothing is deleted when SMART
# collection was skipped.
if [[ -d "${SMART_CACHE_DIR:-}" ]]; then
    rm -rf "$SMART_CACHE_DIR"
fi
# NVMe drives (only show unmapped ones - mapped NVMe drives appear in main table)
# stderr is silenced on lsblk itself (not on grep) so a failing lsblk does
# not leak error text into the report, matching the other lsblk calls.
nvme_devices="$(lsblk -d -n -o NAME,SIZE 2>/dev/null | grep "^nvme")"
if [[ -n "$nvme_devices" ]]; then
    # Filter out already-mapped NVMe devices (those with a DEVICE_TO_BAY entry)
    unmapped_nvme=""
    while read -r name size; do
        if [[ -z "${DEVICE_TO_BAY[$name]:-}" ]]; then
            unmapped_nvme+="$name $size"$'\n'
        fi
    done <<< "$nvme_devices"

    # Print the section only when at least one NVMe drive is unmapped
    if [[ -n "$unmapped_nvme" ]]; then
        printf "\n"
        colorize_header '=== Unmapped NVMe Drives ==='
        printf "%-15s %-10s %-10s %-40s %-25s\n" "DEVICE" "SIZE" "TYPE" "MODEL" "SERIAL"
        echo "------------------------------------------------------------------------------------------------------"
        echo "$unmapped_nvme" | while read -r name size; do
            # The list ends with a newline, which yields one empty record
            [[ -z "$name" ]] && continue
            device="/dev/$name"
            # Get model and serial from smartctl for accuracy
            smart_info="$(sudo smartctl -i "$device" 2>/dev/null)"
            model="$(echo "$smart_info" | grep "Model Number" | cut -d: -f2 | xargs)"
            serial="$(echo "$smart_info" | grep "Serial Number" | cut -d: -f2 | xargs)"
            [[ -z "$model" ]] && model="-"
            [[ -z "$serial" ]] && serial="-"
            printf "%-15s %-10s %-10s %-40s %-25s\n" "$device" "$size" "NVMe" "$model" "$serial"
        done
    fi
fi
#------------------------------------------------------------------------------
# Optional sections
#------------------------------------------------------------------------------
# Ceph RBD Devices
# The pattern is anchored to the start of the NAME column so that only
# "rbd*" devices are listed; an unanchored match would also pick up DRBD
# devices ("drbd0") or any row that merely contains the text "rbd".
rbd_devices="$(lsblk -d -n -o NAME,SIZE,TYPE 2>/dev/null | grep "^rbd" | sort -V)"
if [[ -n "$rbd_devices" ]]; then
    printf "\n"
    colorize_header '=== Ceph RBD Devices ==='
    printf "%-15s %-10s %-10s %-30s\n" "DEVICE" "SIZE" "TYPE" "MOUNTPOINT"
    echo "------------------------------------------------------------"
    echo "$rbd_devices" | while read -r name size type; do
        # Get mountpoint if any (first line of lsblk output for the device)
        mountpoint="$(lsblk -n -o MOUNTPOINT "/dev/$name" 2>/dev/null | head -1)"
        [[ -z "$mountpoint" ]] && mountpoint="-"
        printf "%-15s %-10s %-10s %-30s\n" "/dev/$name" "$size" "$type" "$mountpoint"
    done
fi
# Show mapping diagnostic info if DEBUG is set (any non-empty value)
if [[ -n "$DEBUG" ]]; then
    printf "\n"
    colorize_header '=== DEBUG: Drive Mappings ==='
    # Every line starts with the literal "Bay ", so a plain "sort -n" sees no
    # leading number and degrades to lexical order (1, 10, 2, ...).
    # Sort on the second space-separated field (the bay key) in version order
    # instead: numeric bays ascend naturally and "m2-*" keys follow them.
    for key in "${!DRIVE_MAP[@]}"; do
        echo "Bay $key: ${DRIVE_MAP[$key]}"
    done | sort -t ' ' -k2,2V
fi