Compare commits
24 Commits
f5638cad84
...
7db30a7bbf
| Author | SHA1 | Date | |
|---|---|---|---|
| 7db30a7bbf | |||
| 6dc0b00efd | |||
| 09cba482d4 | |||
| 11fc60b38b | |||
| 3edaafa007 | |||
| 7450d79f01 | |||
| 6436e9fbb4 | |||
| 59ecb3998b | |||
| b61a9305ab | |||
| 05d7fa7e37 | |||
| fa7fa296db | |||
| 0eb3e30dba | |||
| 2a23a17072 | |||
| 2befe710d5 | |||
| 71a4e3b1fb | |||
| 9d39332df3 | |||
| 6b4a985b95 | |||
| 3e74a5a047 | |||
| b3cf164cc7 | |||
| 16d9280a38 | |||
| ff1486dfe2 | |||
| d104616861 | |||
| 90055bec81 | |||
| 94c5c7c3b3 |
830
driveAtlas.sh
830
driveAtlas.sh
@@ -5,13 +5,323 @@
|
||||
# Maps physical drive bays to logical device names using PCI paths
|
||||
#==============================================================================
|
||||
|
||||
# Shell safety options:
#   -o pipefail: Exit status of pipe is rightmost non-zero exit code
# Note: Not using -e (errexit) to allow graceful degradation when tools fail
# Note: Not using -u (nounset) as script uses ${var:-default} patterns
set -o pipefail

VERSION="1.1.0"

#------------------------------------------------------------------------------
# Path Constants
# Centralized path definitions to avoid hardcoding throughout the script
#------------------------------------------------------------------------------
readonly DISK_BY_PATH="/dev/disk/by-path"
|
||||
|
||||
#------------------------------------------------------------------------------
# show_usage
#
# Displays help message with usage information and available options.
#
# Globals: VERSION (read)
# Outputs: Help text on stdout
#------------------------------------------------------------------------------
show_usage() {
    # Script name without its directory part; pure parameter expansion so a
    # $0 that begins with "-" cannot be mistaken for an option.
    local prog="${0##*/}"

    cat << EOF
Drive Atlas v${VERSION} - Server Drive Mapping Tool

Maps physical drive bays to logical device names using PCI paths.
Displays visual chassis layouts and comprehensive drive information.

USAGE:
    ${prog} [OPTIONS]

OPTIONS:
    -h, --help          Show this help message and exit
    -v, --version       Show version information
    -d, --debug         Enable debug output (show drive mappings)
    -s, --skip-smart    Skip SMART data collection (faster)
    -c, --color         Enable colored output
    --verbose           Show detailed error messages and warnings
    --no-ceph           Skip Ceph OSD information
    --show-pci          Show PCI paths in output

EXAMPLES:
    ${prog}                 # Normal run with all features
    ${prog} --skip-smart    # Fast run without SMART data
    ${prog} --color         # Run with colored output
    ${prog} --verbose       # Show all errors and warnings
    ${prog} --debug         # Show mapping debug info

ENVIRONMENT VARIABLES:
    DEBUG=1             Same as --debug flag

For more information, see: https://code.lotusguild.org/LotusGuild/driveAtlas
EOF
}
|
||||
|
||||
#------------------------------------------------------------------------------
# Command Line Argument Parsing
#------------------------------------------------------------------------------
SKIP_SMART=false
SKIP_CEPH=false
SHOW_PCI=false
USE_COLOR=false
VERBOSE=false

#------------------------------------------------------------------------------
# parse_args
#
# Applies command line flags to the global option variables above.
# DEBUG is deliberately not given a default here so a value inherited from
# the environment is left as-is unless --debug is passed.
#
# Args:    "$@" - the script's command line arguments
# Sets:    DEBUG, SKIP_SMART, SKIP_CEPH, SHOW_PCI, USE_COLOR, VERBOSE
# Exits:   0 after --help / --version, 1 on an unknown option
#------------------------------------------------------------------------------
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            -h|--help)
                show_usage
                exit 0
                ;;
            -v|--version)
                echo "Drive Atlas v${VERSION}"
                exit 0
                ;;
            -d|--debug)
                DEBUG=1
                ;;
            -s|--skip-smart)
                SKIP_SMART=true
                ;;
            --no-ceph)
                SKIP_CEPH=true
                ;;
            --show-pci)
                SHOW_PCI=true
                ;;
            -c|--color)
                USE_COLOR=true
                ;;
            --verbose)
                VERBOSE=true
                ;;
            *)
                echo "Unknown option: $1" >&2
                echo "Use --help for usage information." >&2
                exit 1
                ;;
        esac
        # Every accepted flag consumes exactly one argument
        shift
    done
}

parse_args "$@"
|
||||
|
||||
#------------------------------------------------------------------------------
# Color Definitions
# ANSI escape codes for terminal colors
#
# The values are stored with a literal backslash ("\033[...") and are meant
# to be expanded by printf's %b conversion, as the colorize_* and log_*
# helpers do. All variables are set to empty strings when color is disabled.
#
# Globals: USE_COLOR (read); COLOR_* (written)
#------------------------------------------------------------------------------
init_colors() {
    if [[ "$USE_COLOR" == true ]]; then
        COLOR_RESET='\033[0m'
        COLOR_RED='\033[0;31m'
        COLOR_GREEN='\033[0;32m'
        COLOR_YELLOW='\033[0;33m'
        COLOR_BLUE='\033[0;34m'
        COLOR_CYAN='\033[0;36m'
        COLOR_BOLD='\033[1m'
    else
        COLOR_RESET=''
        COLOR_RED=''
        COLOR_GREEN=''
        COLOR_YELLOW=''
        COLOR_BLUE=''
        COLOR_CYAN=''
        COLOR_BOLD=''
    fi
}

init_colors
|
||||
|
||||
#------------------------------------------------------------------------------
# colorize_health
#
# Returns health indicator with appropriate color
# Args: $1 - health status (✓ passed, ⚠ passed with warnings, ✗ failed)
#
# ✓ is printed green, ⚠ yellow, anything else red. No trailing newline is
# written. Without --color the indicator is printed unchanged.
#
# Globals: USE_COLOR, COLOR_GREEN, COLOR_YELLOW, COLOR_RED, COLOR_RESET (read)
#------------------------------------------------------------------------------
colorize_health() {
    local health="$1"
    local color

    if [[ "$USE_COLOR" != true ]]; then
        printf '%s' "$health"
        return
    fi

    case "$health" in
        "✓") color="$COLOR_GREEN" ;;
        # "Passed with warnings" is not a failure, so it must not be red
        "⚠") color="$COLOR_YELLOW" ;;
        *)   color="$COLOR_RED" ;;
    esac

    printf '%b%s%b' "$color" "$health" "$COLOR_RESET"
}
|
||||
|
||||
#------------------------------------------------------------------------------
# colorize_temp
#
# Returns temperature with color based on value
# Args: $1 - temperature string (e.g., "45°C")
#
# Red at or above the critical threshold, yellow at or above the warning
# threshold, green below. Thresholds come from SMART_TEMP_CRIT and
# SMART_TEMP_WARN when set, falling back to 60 and 50.
# Strings that are not "<digits>°C" are printed uncolored.
#
# Globals: USE_COLOR, COLOR_*, SMART_TEMP_WARN, SMART_TEMP_CRIT (read)
#------------------------------------------------------------------------------
colorize_temp() {
    local temp_str="$1"
    local temp_val color
    local warn_at="${SMART_TEMP_WARN:-50}"
    local crit_at="${SMART_TEMP_CRIT:-60}"

    if [[ "$USE_COLOR" != true || "$temp_str" == "-" ]]; then
        # This path has always ended with a newline (unlike the colored
        # paths below); kept so existing callers see identical output.
        printf '%s\n' "$temp_str"
        return
    fi

    # Extract numeric value
    temp_val="${temp_str%°C}"
    if [[ ! "$temp_val" =~ ^[0-9]+$ ]]; then
        printf '%s' "$temp_str"
        return
    fi

    # 10# forces base 10 so zero-padded readings such as "08" are not
    # parsed as (invalid) octal.
    if (( 10#$temp_val >= crit_at )); then
        color="$COLOR_RED"
    elif (( 10#$temp_val >= warn_at )); then
        color="$COLOR_YELLOW"
    else
        color="$COLOR_GREEN"
    fi

    printf '%b%s%b' "$color" "$temp_str" "$COLOR_RESET"
}
|
||||
|
||||
#------------------------------------------------------------------------------
# colorize_header
#
# Returns header text in blue/bold
# Args: $1 - header text
#
# Always ends the line with a newline. Without --color the text is printed
# unchanged.
#
# Globals: USE_COLOR, COLOR_BLUE, COLOR_BOLD, COLOR_RESET (read)
#------------------------------------------------------------------------------
colorize_header() {
    local text="$1"

    if [[ "$USE_COLOR" != true ]]; then
        printf '%s\n' "$text"
        return
    fi

    # %b expands the backslash escapes stored in the COLOR_* variables;
    # the header itself goes through %s so it is never interpreted.
    printf '%b%b%s%b\n' "$COLOR_BLUE" "$COLOR_BOLD" "$text" "$COLOR_RESET"
}
|
||||
|
||||
#------------------------------------------------------------------------------
# log_error
#
# Logs an error message to stderr. Always shown regardless of verbose mode.
# Args: $@ - error message (multiple arguments are joined with spaces)
#
# Globals: USE_COLOR, COLOR_RED, COLOR_RESET (read)
#------------------------------------------------------------------------------
log_error() {
    local msg="$*"

    if [[ "$USE_COLOR" == true ]]; then
        printf '%bERROR:%b %s\n' "$COLOR_RED" "$COLOR_RESET" "$msg" >&2
    else
        printf 'ERROR: %s\n' "$msg" >&2
    fi
}
|
||||
|
||||
#------------------------------------------------------------------------------
# log_warn
#
# Logs a warning message to stderr. Only shown in verbose mode.
# Args: $@ - warning message (multiple arguments are joined with spaces)
#
# Globals: VERBOSE, USE_COLOR, COLOR_YELLOW, COLOR_RESET (read)
#------------------------------------------------------------------------------
log_warn() {
    # Quiet (and successful) unless --verbose was given
    [[ "$VERBOSE" == true ]] || return 0

    local msg="$*"

    if [[ "$USE_COLOR" == true ]]; then
        printf '%bWARN:%b %s\n' "$COLOR_YELLOW" "$COLOR_RESET" "$msg" >&2
    else
        printf 'WARN: %s\n' "$msg" >&2
    fi
}
|
||||
|
||||
#------------------------------------------------------------------------------
# log_info
#
# Logs an informational message to stderr. Only shown in verbose mode.
# Args: $@ - info message (multiple arguments are joined with spaces)
#
# Globals: VERBOSE, USE_COLOR, COLOR_CYAN, COLOR_RESET (read)
#------------------------------------------------------------------------------
log_info() {
    # Quiet (and successful) unless --verbose was given
    [[ "$VERBOSE" == true ]] || return 0

    local msg="$*"

    if [[ "$USE_COLOR" == true ]]; then
        printf '%bINFO:%b %s\n' "$COLOR_CYAN" "$COLOR_RESET" "$msg" >&2
    else
        printf 'INFO: %s\n' "$msg" >&2
    fi
}
|
||||
|
||||
#------------------------------------------------------------------------------
|
||||
# Dependency Checks
|
||||
# Verifies required commands are available before running
|
||||
#------------------------------------------------------------------------------
|
||||
|
||||
# Required dependencies (script will not function without these)
REQUIRED_DEPS=(lsblk lspci readlink hostname)

# Optional dependencies (enhanced functionality)
OPTIONAL_DEPS=(smartctl ceph ceph-volume bc nvme)

# Bootstrap script suggested to the user when dependencies are missing.
# NOTE(review): this is a plain-HTTP URL on a private address and the hints
# printed by check_dependencies pipe it straight into bash; consider serving
# it over HTTPS - confirm with the infrastructure owner.
FRESH_START_URL="http://10.10.10.63:3000/LotusGuild/freshStartScript/raw/branch/main/freshStart.sh"
|
||||
|
||||
#------------------------------------------------------------------------------
# check_dependencies
#
# Verifies required and optional commands are available.
# Exits with error if required dependencies are missing.
# Warns about missing optional dependencies.
#
# Globals: REQUIRED_DEPS, OPTIONAL_DEPS, FRESH_START_URL (read)
# Outputs: Diagnostics on stderr only
# Exits:   1 when any command in REQUIRED_DEPS cannot be found
#------------------------------------------------------------------------------
check_dependencies() {
    local missing_required=()
    local missing_optional=()
    local cmd    # keep the loop variable out of the global scope

    # Check required dependencies
    for cmd in "${REQUIRED_DEPS[@]}"; do
        command -v "$cmd" &>/dev/null || missing_required+=("$cmd")
    done

    # Check optional dependencies
    for cmd in "${OPTIONAL_DEPS[@]}"; do
        command -v "$cmd" &>/dev/null || missing_optional+=("$cmd")
    done

    # Report missing required dependencies and exit
    if [[ ${#missing_required[@]} -gt 0 ]]; then
        {
            echo "ERROR: Missing required dependencies: ${missing_required[*]}"
            echo ""
            echo "Please install the missing packages or run the fresh start script:"
            echo " curl -s $FRESH_START_URL | bash"
            echo ""
        } >&2
        exit 1
    fi

    # Warn about missing optional dependencies
    if [[ ${#missing_optional[@]} -gt 0 ]]; then
        {
            echo "Note: Some optional features unavailable. Missing: ${missing_optional[*]}"
            echo " Install them or run: curl -s $FRESH_START_URL | bash"
            echo ""
        } >&2
    fi

    # Check for sudo access (needed for smartctl). Root already has the
    # required privileges, so the hint is only relevant for other users.
    if [[ "${EUID:-1}" -ne 0 ]] \
        && command -v smartctl &>/dev/null \
        && ! sudo -n true 2>/dev/null; then
        echo "Note: SMART data requires sudo access. Run with sudo for full functionality." >&2
    fi
}
|
||||
|
||||
# Run dependency check at script start
|
||||
check_dependencies
|
||||
|
||||
#------------------------------------------------------------------------------
|
||||
# Chassis Layout Generator Functions
|
||||
# These define the physical layout and display formatting for each chassis type
|
||||
#------------------------------------------------------------------------------
|
||||
|
||||
#------------------------------------------------------------------------------
|
||||
# generate_10bay_layout
|
||||
#
|
||||
# Generates ASCII art representation of a 10-bay hot-swap chassis (Sliger CX4712).
|
||||
# Shows storage controllers, M.2 NVMe slot, and 10 front hot-swap bays.
|
||||
#
|
||||
# Args:
|
||||
# $1 - Hostname to display in the layout header
|
||||
#
|
||||
# Side effects: Calls build_drive_map() to populate DRIVE_MAP
|
||||
#------------------------------------------------------------------------------
|
||||
generate_10bay_layout() {
|
||||
local hostname=$1
|
||||
local hostname="$1"
|
||||
build_drive_map
|
||||
|
||||
# Fixed width for consistent box drawing (fits device names like "nvme0n1")
|
||||
@@ -62,8 +372,19 @@ generate_10bay_layout() {
|
||||
printf "└────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘\n"
|
||||
}
|
||||
|
||||
#------------------------------------------------------------------------------
|
||||
# generate_micro_layout
|
||||
#
|
||||
# Generates ASCII art representation of a micro SBC (e.g., ZimaBoard).
|
||||
# Shows storage controllers, onboard eMMC (if present), and 2 SATA ports.
|
||||
#
|
||||
# Args:
|
||||
# $1 - Hostname to display in the layout header
|
||||
#
|
||||
# Side effects: Calls build_drive_map() to populate DRIVE_MAP
|
||||
#------------------------------------------------------------------------------
|
||||
generate_micro_layout() {
|
||||
local hostname=$1
|
||||
local hostname="$1"
|
||||
build_drive_map
|
||||
|
||||
# Check for eMMC storage
|
||||
@@ -97,8 +418,19 @@ generate_micro_layout() {
|
||||
printf "└─────────────────────────────────────────────────────────────┘\n"
|
||||
}
|
||||
|
||||
#------------------------------------------------------------------------------
|
||||
# generate_large1_layout
|
||||
#
|
||||
# Generates ASCII art representation of a large1 chassis (Rosewill RSV-L4500U).
|
||||
# Shows storage controllers, 2 M.2 NVMe slots, and 15 front bays in 3x5 grid.
|
||||
#
|
||||
# Args:
|
||||
# $1 - Hostname to display in the layout header
|
||||
#
|
||||
# Side effects: Calls build_drive_map() to populate DRIVE_MAP
|
||||
#------------------------------------------------------------------------------
|
||||
generate_large1_layout() {
|
||||
local hostname=$1
|
||||
local hostname="$1"
|
||||
build_drive_map
|
||||
|
||||
# large1 has 3 stacks of 5 bays at front (15 total) + 2 M.2 slots
|
||||
@@ -233,50 +565,298 @@ declare -A CHASSIS_TYPES=(
|
||||
# Core Functions
|
||||
#------------------------------------------------------------------------------
|
||||
|
||||
# Cache for lspci output (populated on first call)
|
||||
LSPCI_CACHE=""
|
||||
|
||||
#------------------------------------------------------------------------------
# get_storage_controllers
#
# Returns a formatted list of storage controllers found via lspci.
# Uses cached output if available to avoid redundant lspci calls.
#
# The filtered lspci output is kept in the global LSPCI_CACHE. The cache only
# persists when the function runs in the current shell; inside $(...) the
# assignment is lost with the subshell and lspci is queried again next time.
#
# Globals: LSPCI_CACHE (read/written)
# Output Format: " PCI_ADDR: DESCRIPTION" (one per line)
#------------------------------------------------------------------------------
get_storage_controllers() {
    local pci_addr desc

    # Cache lspci output on first call
    if [[ -z "${LSPCI_CACHE:-}" ]]; then
        LSPCI_CACHE="$(lspci 2>/dev/null | grep -iE "SAS|SATA|RAID|Mass storage|NVMe")"
    fi

    # The first whitespace-separated word is the PCI address, the remainder
    # of the line is the description; read splits them without forking
    # awk/sed once per controller.
    while read -r pci_addr desc; do
        [[ -z "$pci_addr" ]] && continue
        printf ' %s: %s\n' "$pci_addr" "$desc"
    done <<< "$LSPCI_CACHE"

    return 0
}
|
||||
|
||||
#------------------------------------------------------------------------------
# build_drive_map
#
# Builds a global associative array mapping physical bay numbers to device names.
# Uses PCI paths from SERVER_MAPPINGS to resolve current device assignments.
#
# Each line of the host's SERVER_MAPPINGS entry is read as "<pci-path> <bay>".
# A bay whose by-path symlink does not exist is left out of DRIVE_MAP but is
# still recorded in BAY_TO_PCI_PATH.
#
# Globals read: SERVER_MAPPINGS, DISK_BY_PATH
# Sets:
#   DRIVE_MAP (global associative array)
#     Keys: Bay identifiers (1, 2, ..., m2-1, m2-2, etc.)
#     Values: Device names (sda, nvme0n1, etc.)
#   BAY_TO_PCI_PATH (global associative array)
#     Keys: Bay identifiers
#     Values: PCI path strings (for --show-pci option)
#------------------------------------------------------------------------------
build_drive_map() {
    local host mapping path slot resolved
    local mapped_count=0
    local empty_count=0

    # Declare global arrays directly (reset on every call)
    declare -g -A DRIVE_MAP=()
    declare -g -A BAY_TO_PCI_PATH=()

    host="$(hostname)"
    # An empty associative-array subscript is a bash error, so only look the
    # host up when hostname actually returned something.
    if [[ -n "$host" ]]; then
        mapping="${SERVER_MAPPINGS[$host]}"
    fi

    if [[ -z "$mapping" ]]; then
        log_warn "No drive mapping found for host '$host'. Run diagnose-drives.sh to create one."
        return
    fi

    while read -r path slot; do
        [[ -z "$path" || -z "$slot" ]] && continue

        BAY_TO_PCI_PATH[$slot]="$path"

        if [[ -L "${DISK_BY_PATH}/$path" ]]; then
            # Follow the by-path symlink and keep only the device node name
            resolved="$(readlink -f "${DISK_BY_PATH}/$path")"
            DRIVE_MAP[$slot]="${resolved##*/}"
            mapped_count=$((mapped_count + 1))
        else
            log_info "Bay $slot: No device at PCI path $path"
            empty_count=$((empty_count + 1))
        fi
    done <<< "$mapping"

    log_info "Mapped $mapped_count drives, $empty_count empty bays"
}
|
||||
|
||||
get_drive_smart_info() {
|
||||
local device=$1
|
||||
local smart_info=$(sudo smartctl -A -i -H /dev/$device 2>/dev/null)
|
||||
local temp=$(echo "$smart_info" | grep "Temperature" | awk '{print $10}' | head -1)
|
||||
local type=$(echo "$smart_info" | grep "Rotation Rate" | grep -q "Solid State" && echo "SSD" || echo "HDD")
|
||||
local health=$(echo "$smart_info" | grep "SMART overall-health" | grep -q "PASSED" && echo "✓" || echo "✗")
|
||||
local model=$(echo "$smart_info" | grep "Device Model\|Model Number" | cut -d: -f2 | xargs)
|
||||
local serial=$(echo "$smart_info" | grep "Serial Number" | awk '{print $3}')
|
||||
#------------------------------------------------------------------------------
# build_ceph_cache
#
# Queries Ceph once and builds lookup tables for OSD information.
# This is much more efficient than querying ceph-volume per device.
#
# Both the "block device" path and every entry of the "devices" line of a
# ceph-volume OSD block are indexed (with the leading /dev/ removed), so an
# OSD can be found by its LV path as well as by its physical device name.
# NOTE(review): the "devices" line format ("/dev/sdb" or a comma-separated
# list) follows the ceph-volume documentation - confirm against the
# deployed Ceph release.
#
# Sets global associative arrays:
#   CEPH_DEVICE_TO_OSD - Maps device names to OSD IDs (e.g., sda -> osd.5)
#   CEPH_OSD_STATUS    - Maps OSD numbers to up/down status
#   CEPH_OSD_IN        - Maps OSD numbers to in/out status
#------------------------------------------------------------------------------
build_ceph_cache() {
    declare -g -A CEPH_DEVICE_TO_OSD=()
    declare -g -A CEPH_OSD_STATUS=()
    declare -g -A CEPH_OSD_IN=()

    local line dev osd_num status reweight
    local current_osd=""
    local -a devs=()

    # Patterns are kept in variables so [[ =~ ]] treats them as regexes
    # without any quoting ambiguity.
    local re_header='======[[:space:]]+osd\.([0-9]+)[[:space:]]+======='
    local re_block='block[[:space:]]device[[:space:]]+/dev/([^[:space:]]+)'
    local re_devices='^[[:space:]]*devices[[:space:]]+(/dev/[^[:space:]]+)'
    local re_tree='^[[:space:]]*([0-9]+)[[:space:]]+.*osd\.([0-9]+)[[:space:]]+(up|down)[[:space:]]+([0-9.]+)'

    # Skip if ceph-volume is not available
    if ! command -v ceph-volume &>/dev/null; then
        log_info "ceph-volume not found, skipping Ceph OSD detection"
        return
    fi

    log_info "Querying Ceph OSD information..."

    # Parse ceph-volume lvm list output
    # Format: blocks starting with "====== osd.X =======" followed by device info
    while IFS= read -r line; do
        if [[ "$line" =~ $re_header ]]; then
            # Match OSD header: "====== osd.5 ======="
            current_osd="osd.${BASH_REMATCH[1]}"
        elif [[ -z "$current_osd" ]]; then
            # Device lines before the first header belong to no OSD
            continue
        elif [[ "$line" =~ $re_block ]]; then
            # Match block device line: " block device /dev/sda"
            CEPH_DEVICE_TO_OSD[${BASH_REMATCH[1]}]="$current_osd"
        elif [[ "$line" =~ $re_devices ]]; then
            # Match physical devices line: " devices /dev/sdb[,/dev/sdc]"
            IFS=',' read -r -a devs <<< "${BASH_REMATCH[1]}"
            for dev in "${devs[@]}"; do
                dev="${dev#/dev/}"
                [[ -n "$dev" ]] && CEPH_DEVICE_TO_OSD[$dev]="$current_osd"
            done
        fi
    done < <(ceph-volume lvm list 2>/dev/null)

    # Skip if ceph command is not available
    if ! command -v ceph &>/dev/null; then
        log_info "ceph CLI not found, skipping OSD status detection"
        return
    fi

    log_info "Querying Ceph OSD status..."

    # Parse ceph osd tree for status
    # Format: ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT
    while IFS= read -r line; do
        # Match OSD lines: " 5 hdd 3.63660 osd.5 up 1.00000"
        [[ "$line" =~ $re_tree ]] || continue

        osd_num="${BASH_REMATCH[1]}"
        status="${BASH_REMATCH[3]}"
        reweight="${BASH_REMATCH[4]}"

        CEPH_OSD_STATUS[$osd_num]="$status"

        # Determine in/out based on reweight. The value is fractional, so
        # the comparison is delegated to awk; it is passed with -v rather
        # than spliced into the awk program text.
        if awk -v r="$reweight" 'BEGIN { exit !(r > 0) }'; then
            CEPH_OSD_IN[$osd_num]="in"
        else
            CEPH_OSD_IN[$osd_num]="out"
        fi
    done < <(ceph osd tree 2>/dev/null)
}
|
||||
|
||||
# SMART warning thresholds
# A reading at or above a threshold adds a warning code to the drive's
# WARNINGS column (see get_drive_smart_info).
readonly SMART_TEMP_WARN=50               # Temperature warning threshold (°C)
readonly SMART_TEMP_CRIT=60               # Temperature critical threshold (°C)
readonly SMART_REALLOCATED_WARN=1         # Reallocated sectors warning threshold
readonly SMART_PENDING_WARN=1             # Pending sectors warning threshold
readonly SMART_CRC_ERROR_WARN=100         # UDMA CRC error warning threshold
readonly SMART_POWER_ON_HOURS_WARN=43800  # ~5 years of continuous use
|
||||
|
||||
#------------------------------------------------------------------------------
# _smart_info_field (private helper)
#
# Prints the value of the first "Label: value" line whose text matches the
# given ERE: everything after the first colon, with surrounding whitespace
# removed and inner runs of whitespace collapsed to single spaces.
#
# Args:
#   $1 - smartctl output
#   $2 - ERE matching the label (must not contain backslashes)
#------------------------------------------------------------------------------
_smart_info_field() {
    awk -v re="$2" '
        $0 ~ re {
            sub(/^[^:]*:/, "")
            $1 = $1          # re-join the fields: trims and collapses blanks
            print
            exit
        }' <<< "$1"
}

#------------------------------------------------------------------------------
# _smart_attr_raw (private helper)
#
# Prints the last field of the first SMART attribute row with the given ID
# whose attribute name starts with the given prefix.
#
# Args:
#   $1 - smartctl output
#   $2 - attribute ID (e.g., 5)
#   $3 - attribute name prefix (e.g., Reallocated_Sector)
#------------------------------------------------------------------------------
_smart_attr_raw() {
    awk -v id="$2" -v name="$3" '
        $1 == id && index($2, name) == 1 { print $NF; exit }' <<< "$1"
}

#------------------------------------------------------------------------------
# get_drive_smart_info
#
# Retrieves SMART data for a given device.
#
# smartctl is run through sudo unless the script is already running as root.
#
# Args:
#   $1 - Device name (e.g., sda, nvme0n1)
#
# Returns: Pipe-delimited string: TYPE|TEMP|HEALTH|MODEL|SERIAL|WARNINGS
#   TYPE:     SSD, HDD, or NVMe
#   TEMP:     Temperature in Celsius (or "-" if unavailable)
#   HEALTH:   ✓ for passed, ✗ for failed, ⚠ for passed with warnings
#   MODEL:    Drive model string
#   SERIAL:   Drive serial number
#   WARNINGS: Comma-separated warning codes (or empty)
#------------------------------------------------------------------------------
get_drive_smart_info() {
    local device="$1"
    local smart_info smart_stderr smart_exit
    local temp="-"
    local type="HDD"
    local health="✗"
    local model serial
    local warnings=""
    local -a smart_cmd=(smartctl -A -i -H "/dev/$device")

    # Root does not need sudo (and sudo may not even be installed).
    if [[ "${EUID:-1}" -ne 0 ]]; then
        smart_cmd=(sudo "${smart_cmd[@]}")
    fi

    # Capture stderr separately for better error reporting; fall back to
    # discarding it when no temporary file can be created.
    if smart_stderr="$(mktemp 2>/dev/null)"; then
        smart_info="$("${smart_cmd[@]}" 2>"$smart_stderr")"
        smart_exit=$?
        if [[ $smart_exit -ne 0 && -s "$smart_stderr" ]]; then
            log_warn "SMART query failed for $device: $(head -1 "$smart_stderr")"
        fi
        rm -f -- "$smart_stderr"
    else
        smart_info="$("${smart_cmd[@]}" 2>/dev/null)"
    fi

    if [[ -z "$smart_info" ]]; then
        log_info "No SMART data available for $device"
        echo "HDD|-|✗|-|-|"
        return
    fi

    # Temperature parsing - handles multiple formats:
    #   - SATA: "194 Temperature_Celsius ... 35" (value at end of line)
    #   - SATA: "Temperature: 42 Celsius"
    #   - SATA: "Current Temperature: 35 Celsius"
    #   - NVMe: "Temperature: 42 Celsius"
    if grep -q "Temperature_Celsius" <<< "$smart_info"; then
        # SMART attribute format: take the last purely numeric field that
        # precedes any parenthesised annotation, e.g. "35 (Min/Max 20/45)"
        # or "35 (0 18 0 0 0)" both yield 35.
        temp="$(awk '
            /Temperature_Celsius/ {
                sub(/\(.*/, "")
                for (i = NF; i > 0; i--) {
                    if ($i ~ /^[0-9]+$/) { print $i; break }
                }
                exit
            }' <<< "$smart_info")"
    elif grep -qE "^(Current )?Temperature:" <<< "$smart_info"; then
        # "Temperature: XX Celsius" / "Current Temperature: XX Celsius":
        # the reading is the first word after the colon in both spellings.
        temp="$(awk '
            /^(Current )?Temperature:/ {
                sub(/^[^:]*:/, "")
                print $1
                exit
            }' <<< "$smart_info")"
    fi
    # Anything that is not a plain number is reported as unavailable
    [[ "$temp" =~ ^[0-9]+$ ]] || temp="-"

    # Device type detection - handles SSD, HDD, and NVMe
    if [[ "$device" == nvme* ]]; then
        type="NVMe"
    elif grep -q "Rotation Rate" <<< "$smart_info"; then
        # The "0 rpm" alternative is anchored to the colon so that
        # "7200 rpm" / "5400 rpm" are not mistaken for zero rpm.
        if grep "Rotation Rate" <<< "$smart_info" \
            | grep -qiE "solid state|:[[:space:]]*0[[:space:]]*rpm"; then
            type="SSD"
        else
            type="HDD"
        fi
    elif grep -qiE "SSD|Solid State" <<< "$smart_info"; then
        type="SSD"
    fi

    # Health status (basic SMART check)
    if grep -q "SMART overall-health.*PASSED" <<< "$smart_info"; then
        health="✓"
    elif grep -q "SMART Health Status.*OK" <<< "$smart_info"; then
        # "SMART Health Status: OK" report format
        health="✓"
    fi

    # Model - try multiple field names
    model="$(_smart_info_field "$smart_info" "^(Device Model|Model Number|Product):")"
    [[ -z "$model" ]] && model="-"

    # Serial number - capture everything after the colon to handle spaces
    serial="$(_smart_info_field "$smart_info" "^Serial [Nn]umber:")"
    [[ -z "$serial" ]] && serial="-"

    # SMART threshold warnings - check for concerning values
    local warn_list=()

    # Temperature thresholds (10# forces base 10 for zero-padded readings)
    if [[ "$temp" != "-" ]]; then
        if (( 10#$temp >= SMART_TEMP_CRIT )); then
            warn_list+=("TEMP_CRIT")
        elif (( 10#$temp >= SMART_TEMP_WARN )); then
            warn_list+=("TEMP_WARN")
        fi
    fi

    # Attribute checks, in output order. Each spec is
    # "ID:NAME_PREFIX:WARNING_CODE:THRESHOLD":
    #   5   Reallocated sectors
    #   197 Current pending sectors
    #   199 UDMA CRC errors
    #   9   Power-on hours
    local spec attr_id attr_name code limit raw
    for spec in \
        "5:Reallocated_Sector:REALLOC:${SMART_REALLOCATED_WARN}" \
        "197:Current_Pending:PENDING:${SMART_PENDING_WARN}" \
        "199:UDMA_CRC_Error:CRC:${SMART_CRC_ERROR_WARN}" \
        "9:Power_On_Hours:HOURS:${SMART_POWER_ON_HOURS_WARN}"; do
        IFS=':' read -r attr_id attr_name code limit <<< "$spec"
        raw="$(_smart_attr_raw "$smart_info" "$attr_id" "$attr_name")"
        # Raw values that are not plain integers (e.g. "1234h+56m") are skipped
        if [[ "$raw" =~ ^[0-9]+$ ]] && (( 10#$raw >= limit )); then
            warn_list+=("${code}:${raw}")
        fi
    done

    # Join warnings
    if [[ ${#warn_list[@]} -gt 0 ]]; then
        warnings="$(IFS=','; echo "${warn_list[*]}")"
        # Change health indicator to warning if SMART passed but has warnings
        if [[ "$health" == "✓" ]]; then
            health="⚠"
        fi
    fi

    # Format temperature with unit if we have a value
    local temp_display="-"
    if [[ "$temp" != "-" ]]; then
        temp_display="${temp}°C"
    fi

    echo "${type}|${temp_display}|${health}|${model}|${serial}|${warnings}"
}
|
||||
|
||||
#------------------------------------------------------------------------------
|
||||
@@ -310,88 +890,184 @@ esac
|
||||
# Drive Details Section
|
||||
#------------------------------------------------------------------------------
|
||||
|
||||
echo -e "\n=== Drive Details with SMART Status (by Bay Position) ==="
|
||||
printf "%-5s %-15s %-10s %-8s %-8s %-8s %-30s %-20s %-12s %-10s %-10s\n" "BAY" "DEVICE" "SIZE" "TYPE" "TEMP" "HEALTH" "MODEL" "SERIAL" "CEPH OSD" "STATUS" "USAGE"
|
||||
echo "----------------------------------------------------------------------------------------------------------------------------------------------------"
|
||||
# Build Ceph OSD cache (single query instead of per-device)
|
||||
if [[ "$SKIP_CEPH" != true ]]; then
|
||||
build_ceph_cache
|
||||
fi
|
||||
|
||||
printf "\n"
|
||||
colorize_header '=== Drive Details with SMART Status (by Bay Position) ==='
|
||||
if [[ "$SHOW_PCI" == true ]]; then
|
||||
printf "%-5s %-15s %-10s %-8s %-8s %-8s %-30s %-20s %-12s %-10s %-10s %-30s %-40s\n" "BAY" "DEVICE" "SIZE" "TYPE" "TEMP" "HEALTH" "MODEL" "SERIAL" "CEPH OSD" "STATUS" "USAGE" "WARNINGS" "PCI PATH"
|
||||
echo "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"
|
||||
else
|
||||
printf "%-5s %-15s %-10s %-8s %-8s %-8s %-30s %-20s %-12s %-10s %-10s %-30s\n" "BAY" "DEVICE" "SIZE" "TYPE" "TEMP" "HEALTH" "MODEL" "SERIAL" "CEPH OSD" "STATUS" "USAGE" "WARNINGS"
|
||||
echo "----------------------------------------------------------------------------------------------------------------------------------------------------------------------"
|
||||
fi
|
||||
|
||||
# Build reverse map: device -> bay
|
||||
declare -A DEVICE_TO_BAY
|
||||
for bay in "${!DRIVE_MAP[@]}"; do
|
||||
device="${DRIVE_MAP[$bay]}"
|
||||
if [[ -n "$device" && "$device" != "EMPTY" ]]; then
|
||||
DEVICE_TO_BAY[$device]=$bay
|
||||
DEVICE_TO_BAY["$device"]="$bay"
|
||||
fi
|
||||
done
|
||||
|
||||
# Sort drives by bay position
|
||||
for bay in $(printf '%s\n' "${!DRIVE_MAP[@]}" | grep -E '^[0-9]+$' | sort -n); do
|
||||
# Sort drives by bay position (numeric bays first, then m2 slots)
|
||||
# Combine numeric bays (sorted numerically) with m2 slots (sorted alphanumerically)
|
||||
all_bays="$(printf '%s\n' "${!DRIVE_MAP[@]}" | grep -E '^[0-9]+$' | sort -n; printf '%s\n' "${!DRIVE_MAP[@]}" | grep -E '^m2-' | sort)"
|
||||
|
||||
# Cache lsblk data to reduce redundant calls.
# A single lsblk invocation supplies the size of every block device and the
# mount points of each disk, with partition mounts accumulated under the
# whole-disk name so they can be looked up by the device stored in DRIVE_MAP.
declare -A LSBLK_SIZE=()
declare -A LSBLK_MOUNTS=()

#------------------------------------------------------------------------------
# _lsblk_parent_name NAME
#
# Prints the whole-disk name that block device NAME belongs to:
#   sda10 -> sda      nvme0n1p2 -> nvme0n1      mmcblk0p1 -> mmcblk0
# Names that are not recognised as partitions (sda, nvme0n1, md0, dm-0, ...)
# are printed unchanged.
#------------------------------------------------------------------------------
_lsblk_parent_name() {
    local dev_name="$1"
    if [[ "$dev_name" =~ ^(.*[0-9])p[0-9]+$ ]]; then
        # Disk name ends in a digit, so the partition is written "<disk>p<N>"
        printf '%s\n' "${BASH_REMATCH[1]}"
    elif [[ "$dev_name" =~ ^((sd|hd|vd|xvd)[a-z]+)[0-9]+$ ]]; then
        # Letter-suffixed disks: the partition number is appended directly
        printf '%s\n' "${BASH_REMATCH[1]}"
    else
        printf '%s\n' "$dev_name"
    fi
}

log_info "Caching block device information..."
# lsblk -r prints space-separated columns (spaces inside values are escaped
# as \x20), so the default IFS is the correct field splitter here.
while read -r name size mounts; do
    [[ -z "$name" ]] && continue
    LSBLK_SIZE["$name"]="$size"

    # Only devices that are actually mounted contribute to the mount cache
    [[ -z "$mounts" ]] && continue

    # Accumulate mount points under the parent (whole-disk) device
    parent="$(_lsblk_parent_name "$name")"
    if [[ -n "${LSBLK_MOUNTS[$parent]:-}" ]]; then
        LSBLK_MOUNTS["$parent"]+=",${mounts}"
    else
        LSBLK_MOUNTS["$parent"]="$mounts"
    fi
done < <(lsblk -rn -o NAME,SIZE,MOUNTPOINT 2>/dev/null)
|
||||
|
||||
# Parallel SMART data collection for faster execution.
# One background job per mapped drive writes the output of
# get_drive_smart_info to "$SMART_CACHE_DIR/<device>"; the per-bay table loop
# later reads those files instead of querying each drive serially.
if [[ "$SKIP_SMART" != true ]]; then
    if SMART_CACHE_DIR="$(mktemp -d)" && [[ -d "$SMART_CACHE_DIR" ]]; then
        log_info "Collecting SMART data in parallel..."

        # $all_bays is a newline-separated list; word splitting is intentional
        for bay in $all_bays; do
            device="${DRIVE_MAP[$bay]}"
            if [[ -n "$device" && "$device" != "EMPTY" && -b "/dev/$device" ]]; then
                # Launch background job for each device
                (get_drive_smart_info "$device" > "$SMART_CACHE_DIR/$device") &
            fi
        done

        # Wait for all background SMART queries to complete
        wait
        log_info "SMART data collection complete"
    else
        # With an empty cache directory the redirect above would resolve to
        # "/<device>" and drop files into the filesystem root, so skip the
        # collection entirely and report the problem instead.
        SMART_CACHE_DIR=""
        printf 'Warning: could not create temporary directory; SMART data unavailable\n' >&2
    fi
fi
|
||||
|
||||
for bay in $all_bays; do
|
||||
device="${DRIVE_MAP[$bay]}"
|
||||
if [[ -n "$device" && "$device" != "EMPTY" && -b "/dev/$device" ]]; then
|
||||
# Use cached lsblk data
|
||||
size="${LSBLK_SIZE[$device]:-}"
|
||||
|
||||
# Get SMART info from cache (or defaults if skipped)
|
||||
if [[ "$SKIP_SMART" == true ]]; then
|
||||
type="-"
|
||||
temp="-"
|
||||
health="-"
|
||||
model="-"
|
||||
serial="-"
|
||||
warnings=""
|
||||
else
|
||||
# Read from cached SMART data
|
||||
if [[ -f "$SMART_CACHE_DIR/$device" ]]; then
|
||||
smart_info="$(cat "$SMART_CACHE_DIR/$device")"
|
||||
else
|
||||
smart_info=""
|
||||
fi
|
||||
IFS='|' read -r type temp health model serial warnings <<< "$smart_info"
|
||||
fi
|
||||
|
||||
# Check for Ceph OSD using cached data
|
||||
osd_id="-"
|
||||
ceph_status="-"
|
||||
if [[ "$SKIP_CEPH" != true ]]; then
|
||||
osd_id="${CEPH_DEVICE_TO_OSD[$device]:-}"
|
||||
if [[ -n "$osd_id" ]]; then
|
||||
# Get status from cached OSD tree data
|
||||
osd_num="${osd_id#osd.}"
|
||||
up_status="${CEPH_OSD_STATUS[$osd_num]:-unknown}"
|
||||
in_status="${CEPH_OSD_IN[$osd_num]:-out}"
|
||||
ceph_status="${up_status}/${in_status}"
|
||||
else
|
||||
osd_id="-"
|
||||
fi
|
||||
fi
|
||||
|
||||
# Classify how the drive is used, from the cached lsblk mount points
# (these cover whole-device mounts and mounts of any of its partitions).
#   "BOOT"       - one of the mount points is exactly "/"
#   mount list   - otherwise, the first three mount points, comma-separated
#   "-"          - nothing on the drive is mounted
usage="-"
mount_points="${LSBLK_MOUNTS[$device]:-}"
if [[ -n "$mount_points" ]]; then
    # Test the complete list with a delimiter on both sides: "/" is then only
    # matched as a whole entry (never as the prefix of /boot or /home) and is
    # found at any position, including when it would fall outside the
    # three entries kept for display.
    if [[ ",${mount_points}," == *",/,"* ]]; then
        usage="BOOT"
    else
        # Limit to first 3 mount points for display
        IFS=',' read -r -a mount_list <<< "$mount_points"
        usage="$(IFS=','; echo "${mount_list[*]:0:3}")"
    fi
fi
|
||||
|
||||
printf "%-5s %-15s %-10s %-8s %-8s %-8s %-30s %-20s %-12s %-10s %-10s\n" "$bay" "/dev/$device" "$size" "$type" "$temp" "$health" "$model" "$serial" "$osd_id" "$ceph_status" "$usage"
|
||||
# Apply colors if enabled
|
||||
colored_temp="$(colorize_temp "$temp")"
|
||||
colored_health="$(colorize_health "$health")"
|
||||
|
||||
# Colorize warnings if present; show "-" when the drive has none.
# This is a plain assignment on purpose: `local` is only valid inside a
# function and fails elsewhere, which would leave the "-" placeholder unset.
colored_warnings="${warnings:--}"
if [[ "$USE_COLOR" == true && -n "$warnings" ]]; then
    colored_warnings="${COLOR_YELLOW}${warnings}${COLOR_RESET}"
fi
|
||||
|
||||
if [[ "$SHOW_PCI" == true ]]; then
|
||||
pci_path="${BAY_TO_PCI_PATH[$bay]:-}"
|
||||
printf "%-5s %-15s %-10s %-8s %-8b %-8b %-30s %-20s %-12s %-10s %-10s %-30b %-40s\n" "$bay" "/dev/$device" "$size" "$type" "$colored_temp" "$colored_health" "$model" "$serial" "$osd_id" "$ceph_status" "$usage" "$colored_warnings" "$pci_path"
|
||||
else
|
||||
printf "%-5s %-15s %-10s %-8s %-8b %-8b %-30s %-20s %-12s %-10s %-10s %-30b\n" "$bay" "/dev/$device" "$size" "$type" "$colored_temp" "$colored_health" "$model" "$serial" "$osd_id" "$ceph_status" "$usage" "$colored_warnings"
|
||||
fi
|
||||
fi
|
||||
done
|
||||
|
||||
# NVMe drives
|
||||
# Remove the temporary SMART cache directory after the drive table loop.
# Nothing happens when the variable is unset/empty or the directory is gone.
if [[ -d "${SMART_CACHE_DIR:-}" ]]; then
    rm -rf "$SMART_CACHE_DIR"
fi
|
||||
|
||||
# NVMe drives (only show unmapped ones - mapped NVMe drives appear in main table)
|
||||
nvme_devices=$(lsblk -d -n -o NAME,SIZE | grep "^nvme" 2>/dev/null)
|
||||
if [ -n "$nvme_devices" ]; then
|
||||
echo -e "\n=== NVMe Drives ==="
|
||||
if [[ -n "$nvme_devices" ]]; then
|
||||
# Filter out already-mapped NVMe devices
|
||||
unmapped_nvme=""
|
||||
while read -r name size; do
|
||||
if [[ -z "${DEVICE_TO_BAY[$name]:-}" ]]; then
|
||||
unmapped_nvme+="$name $size"$'\n'
|
||||
fi
|
||||
done <<< "$nvme_devices"
|
||||
|
||||
if [[ -n "$unmapped_nvme" ]]; then
|
||||
printf "\n"
|
||||
colorize_header '=== Unmapped NVMe Drives ==='
|
||||
printf "%-15s %-10s %-10s %-40s %-25s\n" "DEVICE" "SIZE" "TYPE" "MODEL" "SERIAL"
|
||||
echo "------------------------------------------------------------------------------------------------------"
|
||||
echo "$nvme_devices" | while read -r name size; do
|
||||
echo "$unmapped_nvme" | while read -r name size; do
|
||||
[[ -z "$name" ]] && continue
|
||||
device="/dev/$name"
|
||||
# Get model and serial from smartctl for accuracy
|
||||
smart_info=$(sudo smartctl -i "$device" 2>/dev/null)
|
||||
model=$(echo "$smart_info" | grep "Model Number" | cut -d: -f2 | xargs)
|
||||
serial=$(echo "$smart_info" | grep "Serial Number" | cut -d: -f2 | xargs)
|
||||
smart_info="$(sudo smartctl -i "$device" 2>/dev/null)"
|
||||
model="$(echo "$smart_info" | grep "Model Number" | cut -d: -f2 | xargs)"
|
||||
serial="$(echo "$smart_info" | grep "Serial Number" | cut -d: -f2 | xargs)"
|
||||
[[ -z "$model" ]] && model="-"
|
||||
[[ -z "$serial" ]] && serial="-"
|
||||
printf "%-15s %-10s %-10s %-40s %-25s\n" "$device" "$size" "NVMe" "$model" "$serial"
|
||||
done
|
||||
fi
|
||||
fi
|
||||
|
||||
#------------------------------------------------------------------------------
|
||||
# Optional sections
|
||||
@@ -400,7 +1076,8 @@ fi
|
||||
# Ceph RBD Devices
|
||||
rbd_devices=$(lsblk -d -n -o NAME,SIZE,TYPE 2>/dev/null | grep "rbd" | sort -V)
|
||||
if [ -n "$rbd_devices" ]; then
|
||||
echo -e "\n=== Ceph RBD Devices ==="
|
||||
printf "\n"
|
||||
colorize_header '=== Ceph RBD Devices ==='
|
||||
printf "%-15s %-10s %-10s %-30s\n" "DEVICE" "SIZE" "TYPE" "MOUNTPOINT"
|
||||
echo "------------------------------------------------------------"
|
||||
echo "$rbd_devices" | while read -r name size type; do
|
||||
@@ -413,7 +1090,8 @@ fi
|
||||
|
||||
# Show mapping diagnostic info if DEBUG is set
|
||||
if [[ -n "$DEBUG" ]]; then
|
||||
echo -e "\n=== DEBUG: Drive Mappings ==="
|
||||
printf "\n"
|
||||
colorize_header '=== DEBUG: Drive Mappings ==='
|
||||
for key in "${!DRIVE_MAP[@]}"; do
|
||||
echo "Bay $key: ${DRIVE_MAP[$key]}"
|
||||
done | sort -n
|
||||
|
||||
Reference in New Issue
Block a user