Refactor SMART parsing for parallel collection compatibility
Split SMART data handling into two functions:
- parse_smart_data(): parses raw smartctl output (no I/O)
- get_drive_smart_info(): fetches and parses (wrapper)

Changed parallel collection to save raw smartctl output to cache files, then parse it during the display loop. This avoids issues with function availability in background subshells when running from process substitution (bash <(curl ...)).

Also fixed:
- Removed orphan code that was outside function scope
- Fixed lsblk caching to use separate calls for SIZE and MOUNTPOINT

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -714,24 +714,19 @@ readonly SMART_CRC_ERROR_WARN=100 # UDMA CRC error warning threshold
|
||||
readonly SMART_POWER_ON_HOURS_WARN=43800 # ~5 years of continuous use
|
||||
|
||||
#------------------------------------------------------------------------------
|
||||
# get_drive_smart_info
|
||||
# parse_smart_data
|
||||
#
|
||||
# Retrieves SMART data for a given device.
|
||||
# Parses raw SMART data and returns formatted info string.
|
||||
#
|
||||
# Args:
|
||||
# $1 - Device name (e.g., sda, nvme0n1)
|
||||
# $2 - Raw smartctl output string
|
||||
#
|
||||
# Returns: Pipe-delimited string: TYPE|TEMP|HEALTH|MODEL|SERIAL|WARNINGS
|
||||
# TYPE: SSD, HDD, or NVMe
|
||||
# TEMP: Temperature in Celsius (or "-" if unavailable)
|
||||
# HEALTH: ✓ for passed, ✗ for failed, ⚠ for passed with warnings
|
||||
# MODEL: Drive model string
|
||||
# SERIAL: Drive serial number
|
||||
# WARNINGS: Comma-separated warning codes (or empty)
|
||||
#------------------------------------------------------------------------------
|
||||
get_drive_smart_info() {
|
||||
parse_smart_data() {
|
||||
local device="$1"
|
||||
local smart_info
|
||||
local smart_info="$2"
|
||||
local temp="-"
|
||||
local type="HDD"
|
||||
local health="✗"
|
||||
@@ -739,19 +734,7 @@ get_drive_smart_info() {
|
||||
local serial="-"
|
||||
local warnings=""
|
||||
|
||||
# Capture both stdout and stderr for better error reporting
|
||||
local smart_stderr
|
||||
smart_stderr="$(mktemp)"
|
||||
smart_info="$(sudo smartctl -A -i -H "/dev/$device" 2>"$smart_stderr")"
|
||||
local smart_exit=$?
|
||||
|
||||
if [[ $smart_exit -ne 0 && -s "$smart_stderr" ]]; then
|
||||
log_warn "SMART query failed for $device: $(head -1 "$smart_stderr")"
|
||||
fi
|
||||
rm -f "$smart_stderr"
|
||||
|
||||
if [[ -z "$smart_info" ]]; then
|
||||
log_info "No SMART data available for $device"
|
||||
echo "HDD|-|✗|-|-|"
|
||||
return
|
||||
fi
|
||||
@@ -762,11 +745,8 @@ get_drive_smart_info() {
|
||||
# - SATA: "Current Temperature: 35 Celsius"
|
||||
# - NVMe: "Temperature: 42 Celsius"
|
||||
if echo "$smart_info" | grep -q "Temperature_Celsius"; then
|
||||
# SMART attribute format - temperature is typically the 10th field (raw value)
|
||||
# But we use the last numeric field before any parentheses for reliability
|
||||
temp="$(echo "$smart_info" | grep "Temperature_Celsius" | head -1 | awk '{for(i=NF;i>0;i--) if($i ~ /^[0-9]+$/) {print $i; exit}}')"
|
||||
elif echo "$smart_info" | grep -qE "^(Current )?Temperature:"; then
|
||||
# Simple "Temperature: XX Celsius" format
|
||||
temp="$(echo "$smart_info" | grep -E "^(Current )?Temperature:" | head -1 | awk '{print $2}')"
|
||||
fi
|
||||
|
||||
@@ -859,6 +839,24 @@ get_drive_smart_info() {
|
||||
echo "${type}|${temp_display}|${health}|${model}|${serial}|${warnings}"
|
||||
}
|
||||
|
||||
#------------------------------------------------------------------------------
# get_drive_smart_info
#
# Retrieves SMART data for a given device (fetches and parses).
#
# Runs `sudo smartctl -A -i -H` against /dev/<device>, discarding stderr, and
# hands whatever was captured on stdout to parse_smart_data.
#
# Args:
#   $1 - Device name (e.g., sda, nvme0n1)
#
# Returns: Pipe-delimited string: TYPE|TEMP|HEALTH|MODEL|SERIAL|WARNINGS
#          (as printed by parse_smart_data)
#------------------------------------------------------------------------------
get_drive_smart_info() {
    local device="$1"
    local smart_info

    # smartctl's exit status is a bitmask and is often non-zero even when it
    # printed usable output, so the status is deliberately ignored here; the
    # `|| true` keeps this best-effort fetch from aborting callers that run
    # under `set -e`. An empty capture is passed through to parse_smart_data.
    smart_info="$(sudo smartctl -A -i -H "/dev/$device" 2>/dev/null)" || true

    parse_smart_data "$device" "$smart_info"
}
|
||||
|
||||
#------------------------------------------------------------------------------
|
||||
# Main Display Logic
|
||||
#------------------------------------------------------------------------------
|
||||
@@ -949,7 +947,7 @@ while read -r name mounts; do
|
||||
done < <(lsblk -rn -o NAME,MOUNTPOINT 2>/dev/null | grep -v '^ ')
|
||||
|
||||
# Parallel SMART data collection for faster execution
|
||||
# Collect SMART data in background jobs, store in temp files
|
||||
# Collect raw smartctl output in background jobs, parse later
|
||||
if [[ "$SKIP_SMART" != true ]]; then
|
||||
SMART_CACHE_DIR="$(mktemp -d)"
|
||||
log_info "Collecting SMART data in parallel..."
|
||||
@@ -957,8 +955,8 @@ if [[ "$SKIP_SMART" != true ]]; then
|
||||
for bay in $all_bays; do
|
||||
device="${DRIVE_MAP[$bay]}"
|
||||
if [[ -n "$device" && "$device" != "EMPTY" && -b "/dev/$device" ]]; then
|
||||
# Launch background job for each device
|
||||
(get_drive_smart_info "$device" > "$SMART_CACHE_DIR/$device") &
|
||||
# Launch background job to collect raw smartctl data
|
||||
(sudo smartctl -A -i -H "/dev/$device" > "$SMART_CACHE_DIR/${device}.raw" 2>/dev/null) &
|
||||
fi
|
||||
done
|
||||
|
||||
@@ -982,13 +980,23 @@ for bay in $all_bays; do
|
||||
serial="-"
|
||||
warnings=""
|
||||
else
|
||||
# Read from cached SMART data
|
||||
if [[ -f "$SMART_CACHE_DIR/$device" ]]; then
|
||||
smart_info="$(cat "$SMART_CACHE_DIR/$device")"
|
||||
else
|
||||
smart_info=""
|
||||
# Read from cached raw SMART data and parse it
|
||||
raw_smart=""
|
||||
if [[ -f "$SMART_CACHE_DIR/${device}.raw" ]]; then
|
||||
raw_smart="$(cat "$SMART_CACHE_DIR/${device}.raw")"
|
||||
fi
|
||||
# Parse the cached raw output with parse_smart_data (pure parsing, no I/O)
|
||||
if [[ -n "$raw_smart" ]]; then
|
||||
smart_info="$(parse_smart_data "$device" "$raw_smart")"
|
||||
IFS='|' read -r type temp health model serial warnings <<< "$smart_info"
|
||||
else
|
||||
type="-"
|
||||
temp="-"
|
||||
health="-"
|
||||
model="-"
|
||||
serial="-"
|
||||
warnings=""
|
||||
fi
|
||||
fi
|
||||
|
||||
# Check for Ceph OSD using cached data
|
||||
|
||||
Reference in New Issue
Block a user