Compare commits

...

3 Commits

Author SHA1 Message Date
4a86cdd167 Refactor SMART parsing for parallel collection compatibility
Split SMART data handling into two functions:
- parse_smart_data(): Parses raw smartctl output (no I/O)
- get_drive_smart_info(): Fetches and parses (wrapper)

Changed parallel collection to save raw smartctl output to cache
files, then parse during the display loop. This avoids issues
with function availability in background subshells when running
from process substitution (bash <(curl ...)).

Also fixed:
- Removed orphan code that was outside function scope
- Fixed lsblk caching to use separate calls for SIZE and MOUNTPOINT

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-05 19:57:37 -05:00
58897b1f3a Fix lsblk caching to properly parse SIZE and MOUNTPOINT
Split lsblk queries into two separate calls:
1. lsblk -dn for disk sizes (whole disk only, simpler parsing)
2. lsblk -rn for mount points (handles partition-to-parent mapping)

This fixes issues where:
- SIZE was empty due to delimiter confusion
- Mount points with spaces caused parsing errors

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-05 19:54:41 -05:00
fbd9965fb1 Fix 'local' used outside function context
Removed 'local' keyword from colored_warnings variable assignment
in the main script body. The 'local' keyword can only be used
inside functions in bash.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-05 19:53:22 -05:00

View File

@@ -714,24 +714,19 @@ readonly SMART_CRC_ERROR_WARN=100 # UDMA CRC error warning threshold
readonly SMART_POWER_ON_HOURS_WARN=43800 # ~5 years of continuous use readonly SMART_POWER_ON_HOURS_WARN=43800 # ~5 years of continuous use
#------------------------------------------------------------------------------ #------------------------------------------------------------------------------
# get_drive_smart_info # parse_smart_data
# #
# Retrieves SMART data for a given device. # Parses raw SMART data and returns formatted info string.
# #
# Args: # Args:
# $1 - Device name (e.g., sda, nvme0n1) # $1 - Device name (e.g., sda, nvme0n1)
# $2 - Raw smartctl output string
# #
# Returns: Pipe-delimited string: TYPE|TEMP|HEALTH|MODEL|SERIAL|WARNINGS # Returns: Pipe-delimited string: TYPE|TEMP|HEALTH|MODEL|SERIAL|WARNINGS
# TYPE: SSD, HDD, or NVMe
# TEMP: Temperature in Celsius (or "-" if unavailable)
# HEALTH: ✓ for passed, ✗ for failed, ⚠ for passed with warnings
# MODEL: Drive model string
# SERIAL: Drive serial number
# WARNINGS: Comma-separated warning codes (or empty)
#------------------------------------------------------------------------------ #------------------------------------------------------------------------------
get_drive_smart_info() { parse_smart_data() {
local device="$1" local device="$1"
local smart_info local smart_info="$2"
local temp="-" local temp="-"
local type="HDD" local type="HDD"
local health="✗" local health="✗"
@@ -739,19 +734,7 @@ get_drive_smart_info() {
local serial="-" local serial="-"
local warnings="" local warnings=""
# Capture both stdout and stderr for better error reporting
local smart_stderr
smart_stderr="$(mktemp)"
smart_info="$(sudo smartctl -A -i -H "/dev/$device" 2>"$smart_stderr")"
local smart_exit=$?
if [[ $smart_exit -ne 0 && -s "$smart_stderr" ]]; then
log_warn "SMART query failed for $device: $(head -1 "$smart_stderr")"
fi
rm -f "$smart_stderr"
if [[ -z "$smart_info" ]]; then if [[ -z "$smart_info" ]]; then
log_info "No SMART data available for $device"
echo "HDD|-|✗|-|-|" echo "HDD|-|✗|-|-|"
return return
fi fi
@@ -762,11 +745,8 @@ get_drive_smart_info() {
# - SATA: "Current Temperature: 35 Celsius" # - SATA: "Current Temperature: 35 Celsius"
# - NVMe: "Temperature: 42 Celsius" # - NVMe: "Temperature: 42 Celsius"
if echo "$smart_info" | grep -q "Temperature_Celsius"; then if echo "$smart_info" | grep -q "Temperature_Celsius"; then
# SMART attribute format - temperature is typically the 10th field (raw value)
# But we use the last numeric field before any parentheses for reliability
temp="$(echo "$smart_info" | grep "Temperature_Celsius" | head -1 | awk '{for(i=NF;i>0;i--) if($i ~ /^[0-9]+$/) {print $i; exit}}')" temp="$(echo "$smart_info" | grep "Temperature_Celsius" | head -1 | awk '{for(i=NF;i>0;i--) if($i ~ /^[0-9]+$/) {print $i; exit}}')"
elif echo "$smart_info" | grep -qE "^(Current )?Temperature:"; then elif echo "$smart_info" | grep -qE "^(Current )?Temperature:"; then
# Simple "Temperature: XX Celsius" format
temp="$(echo "$smart_info" | grep -E "^(Current )?Temperature:" | head -1 | awk '{print $2}')" temp="$(echo "$smart_info" | grep -E "^(Current )?Temperature:" | head -1 | awk '{print $2}')"
fi fi
@@ -859,6 +839,24 @@ get_drive_smart_info() {
echo "${type}|${temp_display}|${health}|${model}|${serial}|${warnings}" echo "${type}|${temp_display}|${health}|${model}|${serial}|${warnings}"
} }
#------------------------------------------------------------------------------
# get_drive_smart_info
#
# Fetch-and-parse wrapper: runs smartctl against /dev/<device>, captures its
# stdout (stderr is discarded), and hands the captured text to
# parse_smart_data together with the device name.
#
# Args:
#   $1 - Device name (e.g., sda, nvme0n1)
#
# Returns: Whatever parse_smart_data prints for the device, i.e. the
#          pipe-delimited string TYPE|TEMP|HEALTH|MODEL|SERIAL|WARNINGS.
#          The exit status is that of parse_smart_data.
#------------------------------------------------------------------------------
get_drive_smart_info() {
    local dev raw_output
    dev="$1"
    # Declaration and assignment are kept separate so the capture is a plain
    # assignment rather than an argument to 'local'.
    raw_output="$(sudo smartctl -A -i -H "/dev/${dev}" 2>/dev/null)"
    parse_smart_data "${dev}" "${raw_output}"
}
#------------------------------------------------------------------------------ #------------------------------------------------------------------------------
# Main Display Logic # Main Display Logic
#------------------------------------------------------------------------------ #------------------------------------------------------------------------------
@@ -919,26 +917,37 @@ done
all_bays="$(printf '%s\n' "${!DRIVE_MAP[@]}" | grep -E '^[0-9]+$' | sort -n; printf '%s\n' "${!DRIVE_MAP[@]}" | grep -E '^m2-' | sort)" all_bays="$(printf '%s\n' "${!DRIVE_MAP[@]}" | grep -E '^[0-9]+$' | sort -n; printf '%s\n' "${!DRIVE_MAP[@]}" | grep -E '^m2-' | sort)"
# Cache lsblk data to reduce redundant calls # Cache lsblk data to reduce redundant calls
# Single call gets all info we need: size and mount points # Get device sizes (whole disk only)
declare -A LSBLK_SIZE=() declare -A LSBLK_SIZE=()
declare -A LSBLK_MOUNTS=() declare -A LSBLK_MOUNTS=()
log_info "Caching block device information..." log_info "Caching block device information..."
while IFS='|' read -r name size mounts; do
# Get sizes for whole disks only
while read -r name size; do
[[ -z "$name" ]] && continue [[ -z "$name" ]] && continue
LSBLK_SIZE[$name]="$size" LSBLK_SIZE["$name"]="$size"
# Accumulate mount points for parent device done < <(lsblk -dn -o NAME,SIZE 2>/dev/null)
parent="${name%%[0-9]}" # Strip partition number
if [[ -n "$mounts" ]]; then # Get mount points (including partitions) and map back to parent device
if [[ -n "${LSBLK_MOUNTS[$parent]}" ]]; then while read -r name mounts; do
LSBLK_MOUNTS[$parent]+=",${mounts}" [[ -z "$name" || -z "$mounts" ]] && continue
# Strip partition suffix (sda1 -> sda, nvme0n1p1 -> nvme0n1)
if [[ "$name" =~ ^(nvme[0-9]+n[0-9]+)p[0-9]+$ ]]; then
parent="${BASH_REMATCH[1]}"
elif [[ "$name" =~ ^([a-z]+)[0-9]+$ ]]; then
parent="${BASH_REMATCH[1]}"
else else
LSBLK_MOUNTS[$parent]="$mounts" parent="$name"
fi fi
if [[ -n "${LSBLK_MOUNTS[$parent]:-}" ]]; then
LSBLK_MOUNTS["$parent"]+=",${mounts}"
else
LSBLK_MOUNTS["$parent"]="$mounts"
fi fi
done < <(lsblk -rn -o NAME,SIZE,MOUNTPOINT 2>/dev/null) done < <(lsblk -rn -o NAME,MOUNTPOINT 2>/dev/null | grep -v '^ ')
# Parallel SMART data collection for faster execution # Parallel SMART data collection for faster execution
# Collect SMART data in background jobs, store in temp files # Collect raw smartctl output in background jobs, parse later
if [[ "$SKIP_SMART" != true ]]; then if [[ "$SKIP_SMART" != true ]]; then
SMART_CACHE_DIR="$(mktemp -d)" SMART_CACHE_DIR="$(mktemp -d)"
log_info "Collecting SMART data in parallel..." log_info "Collecting SMART data in parallel..."
@@ -946,8 +955,8 @@ if [[ "$SKIP_SMART" != true ]]; then
for bay in $all_bays; do for bay in $all_bays; do
device="${DRIVE_MAP[$bay]}" device="${DRIVE_MAP[$bay]}"
if [[ -n "$device" && "$device" != "EMPTY" && -b "/dev/$device" ]]; then if [[ -n "$device" && "$device" != "EMPTY" && -b "/dev/$device" ]]; then
# Launch background job for each device # Launch background job to collect raw smartctl data
(get_drive_smart_info "$device" > "$SMART_CACHE_DIR/$device") & (sudo smartctl -A -i -H "/dev/$device" > "$SMART_CACHE_DIR/${device}.raw" 2>/dev/null) &
fi fi
done done
@@ -971,13 +980,23 @@ for bay in $all_bays; do
serial="-" serial="-"
warnings="" warnings=""
else else
# Read from cached SMART data # Read from cached raw SMART data and parse it
if [[ -f "$SMART_CACHE_DIR/$device" ]]; then raw_smart=""
smart_info="$(cat "$SMART_CACHE_DIR/$device")" if [[ -f "$SMART_CACHE_DIR/${device}.raw" ]]; then
else raw_smart="$(cat "$SMART_CACHE_DIR/${device}.raw")"
smart_info=""
fi fi
# Parse the cached raw smartctl output with parse_smart_data
if [[ -n "$raw_smart" ]]; then
smart_info="$(parse_smart_data "$device" "$raw_smart")"
IFS='|' read -r type temp health model serial warnings <<< "$smart_info" IFS='|' read -r type temp health model serial warnings <<< "$smart_info"
else
type="-"
temp="-"
health="-"
model="-"
serial="-"
warnings=""
fi
fi fi
# Check for Ceph OSD using cached data # Check for Ceph OSD using cached data
@@ -1020,7 +1039,7 @@ for bay in $all_bays; do
colored_health="$(colorize_health "$health")" colored_health="$(colorize_health "$health")"
# Colorize warnings if present # Colorize warnings if present
local colored_warnings="${warnings:--}" colored_warnings="${warnings:--}"
if [[ "$USE_COLOR" == true && -n "$warnings" ]]; then if [[ "$USE_COLOR" == true && -n "$warnings" ]]; then
colored_warnings="${COLOR_YELLOW}${warnings}${COLOR_RESET}" colored_warnings="${COLOR_YELLOW}${warnings}${COLOR_RESET}"
fi fi