Refactor SMART parsing for parallel collection compatibility

Split SMART data handling into two functions:
- parse_smart_data(): Parses raw smartctl output (no I/O)
- get_drive_smart_info(): Fetches and parses (wrapper)

Changed parallel collection to save raw smartctl output to cache
files, then parse during the display loop. This avoids issues
with function availability in background subshells when running
from process substitution (bash <(curl ...)).

Also fixed:
- Removed orphan code that was outside function scope
- Fixed lsblk caching to use separate calls for SIZE and MOUNTPOINT

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-02-05 19:57:37 -05:00
parent 58897b1f3a
commit 4a86cdd167

View File

@@ -714,24 +714,19 @@ readonly SMART_CRC_ERROR_WARN=100 # UDMA CRC error warning threshold
readonly SMART_POWER_ON_HOURS_WARN=43800 # ~5 years of continuous use readonly SMART_POWER_ON_HOURS_WARN=43800 # ~5 years of continuous use
#------------------------------------------------------------------------------ #------------------------------------------------------------------------------
# get_drive_smart_info # parse_smart_data
# #
# Retrieves SMART data for a given device. # Parses raw SMART data and returns formatted info string.
# #
# Args: # Args:
# $1 - Device name (e.g., sda, nvme0n1) # $1 - Device name (e.g., sda, nvme0n1)
# $2 - Raw smartctl output string
# #
# Returns: Pipe-delimited string: TYPE|TEMP|HEALTH|MODEL|SERIAL|WARNINGS # Returns: Pipe-delimited string: TYPE|TEMP|HEALTH|MODEL|SERIAL|WARNINGS
# TYPE: SSD, HDD, or NVMe
# TEMP: Temperature in Celsius (or "-" if unavailable)
# HEALTH: ✓ for passed, ✗ for failed, ⚠ for passed with warnings
# MODEL: Drive model string
# SERIAL: Drive serial number
# WARNINGS: Comma-separated warning codes (or empty)
#------------------------------------------------------------------------------ #------------------------------------------------------------------------------
get_drive_smart_info() { parse_smart_data() {
local device="$1" local device="$1"
local smart_info local smart_info="$2"
local temp="-" local temp="-"
local type="HDD" local type="HDD"
local health="✗" local health="✗"
@@ -739,19 +734,7 @@ get_drive_smart_info() {
local serial="-" local serial="-"
local warnings="" local warnings=""
# Capture both stdout and stderr for better error reporting
local smart_stderr
smart_stderr="$(mktemp)"
smart_info="$(sudo smartctl -A -i -H "/dev/$device" 2>"$smart_stderr")"
local smart_exit=$?
if [[ $smart_exit -ne 0 && -s "$smart_stderr" ]]; then
log_warn "SMART query failed for $device: $(head -1 "$smart_stderr")"
fi
rm -f "$smart_stderr"
if [[ -z "$smart_info" ]]; then if [[ -z "$smart_info" ]]; then
log_info "No SMART data available for $device"
echo "HDD|-|✗|-|-|" echo "HDD|-|✗|-|-|"
return return
fi fi
@@ -762,11 +745,8 @@ get_drive_smart_info() {
# - SATA: "Current Temperature: 35 Celsius" # - SATA: "Current Temperature: 35 Celsius"
# - NVMe: "Temperature: 42 Celsius" # - NVMe: "Temperature: 42 Celsius"
if echo "$smart_info" | grep -q "Temperature_Celsius"; then if echo "$smart_info" | grep -q "Temperature_Celsius"; then
# SMART attribute format - temperature is typically the 10th field (raw value)
# But we use the last numeric field before any parentheses for reliability
temp="$(echo "$smart_info" | grep "Temperature_Celsius" | head -1 | awk '{for(i=NF;i>0;i--) if($i ~ /^[0-9]+$/) {print $i; exit}}')" temp="$(echo "$smart_info" | grep "Temperature_Celsius" | head -1 | awk '{for(i=NF;i>0;i--) if($i ~ /^[0-9]+$/) {print $i; exit}}')"
elif echo "$smart_info" | grep -qE "^(Current )?Temperature:"; then elif echo "$smart_info" | grep -qE "^(Current )?Temperature:"; then
# Simple "Temperature: XX Celsius" format
temp="$(echo "$smart_info" | grep -E "^(Current )?Temperature:" | head -1 | awk '{print $2}')" temp="$(echo "$smart_info" | grep -E "^(Current )?Temperature:" | head -1 | awk '{print $2}')"
fi fi
@@ -859,6 +839,24 @@ get_drive_smart_info() {
echo "${type}|${temp_display}|${health}|${model}|${serial}|${warnings}" echo "${type}|${temp_display}|${health}|${model}|${serial}|${warnings}"
} }
#------------------------------------------------------------------------------
# get_drive_smart_info
#
# Convenience wrapper: queries smartctl for one device and hands the captured
# output to parse_smart_data. All formatting is done by parse_smart_data; this
# function only performs the fetch.
#
# Args:
#   $1 - Device name without the /dev/ prefix (e.g., sda, nvme0n1)
#
# Outputs: Whatever parse_smart_data prints for the captured output
#          (pipe-delimited: TYPE|TEMP|HEALTH|MODEL|SERIAL|WARNINGS)
#------------------------------------------------------------------------------
get_drive_smart_info() {
  local dev_name="$1"
  local raw_output

  # smartctl's stderr is discarded; only stdout is captured. The assignment is
  # kept separate from the 'local' declaration so its status is not masked.
  raw_output="$(sudo smartctl -A -i -H "/dev/$dev_name" 2>/dev/null)"

  parse_smart_data "$dev_name" "$raw_output"
}
#------------------------------------------------------------------------------ #------------------------------------------------------------------------------
# Main Display Logic # Main Display Logic
#------------------------------------------------------------------------------ #------------------------------------------------------------------------------
@@ -949,7 +947,7 @@ while read -r name mounts; do
done < <(lsblk -rn -o NAME,MOUNTPOINT 2>/dev/null | grep -v '^ ') done < <(lsblk -rn -o NAME,MOUNTPOINT 2>/dev/null | grep -v '^ ')
# Parallel SMART data collection for faster execution # Parallel SMART data collection for faster execution
# Collect SMART data in background jobs, store in temp files # Collect raw smartctl output in background jobs, parse later
if [[ "$SKIP_SMART" != true ]]; then if [[ "$SKIP_SMART" != true ]]; then
SMART_CACHE_DIR="$(mktemp -d)" SMART_CACHE_DIR="$(mktemp -d)"
log_info "Collecting SMART data in parallel..." log_info "Collecting SMART data in parallel..."
@@ -957,8 +955,8 @@ if [[ "$SKIP_SMART" != true ]]; then
for bay in $all_bays; do for bay in $all_bays; do
device="${DRIVE_MAP[$bay]}" device="${DRIVE_MAP[$bay]}"
if [[ -n "$device" && "$device" != "EMPTY" && -b "/dev/$device" ]]; then if [[ -n "$device" && "$device" != "EMPTY" && -b "/dev/$device" ]]; then
# Launch background job for each device # Launch background job to collect raw smartctl data
(get_drive_smart_info "$device" > "$SMART_CACHE_DIR/$device") & (sudo smartctl -A -i -H "/dev/$device" > "$SMART_CACHE_DIR/${device}.raw" 2>/dev/null) &
fi fi
done done
@@ -982,13 +980,23 @@ for bay in $all_bays; do
serial="-" serial="-"
warnings="" warnings=""
else else
# Read from cached SMART data # Read from cached raw SMART data and parse it
if [[ -f "$SMART_CACHE_DIR/$device" ]]; then raw_smart=""
smart_info="$(cat "$SMART_CACHE_DIR/$device")" if [[ -f "$SMART_CACHE_DIR/${device}.raw" ]]; then
else raw_smart="$(cat "$SMART_CACHE_DIR/${device}.raw")"
smart_info="" fi
# Parse the cached raw output with parse_smart_data; fall back to "-" placeholders when it is empty
if [[ -n "$raw_smart" ]]; then
smart_info="$(parse_smart_data "$device" "$raw_smart")"
IFS='|' read -r type temp health model serial warnings <<< "$smart_info"
else
type="-"
temp="-"
health="-"
model="-"
serial="-"
warnings=""
fi fi
IFS='|' read -r type temp health model serial warnings <<< "$smart_info"
fi fi
# Check for Ceph OSD using cached data # Check for Ceph OSD using cached data