Add SMART threshold warnings for drive health monitoring

New warning detection for concerning SMART values:
- Temperature: Warning at >= 50°C, Critical at >= 60°C
- Reallocated sectors: Warning at >= 1
- Pending sectors: Warning at >= 1
- UDMA CRC errors: Warning at >= 100
- Power-on hours: Warning at >= 43800 (5 years)

Health indicator now shows ⚠ when SMART passed but has warnings.
Added WARNINGS column to output showing codes like:
TEMP_WARN, TEMP_CRIT, REALLOC:5, PENDING:2, CRC:150, HOURS:50000

Thresholds are configurable via the readonly SMART_* constants defined just above get_drive_smart_info().

Fixes: #12

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-02-05 11:36:06 -05:00
parent 2befe710d5
commit 2a23a17072

View File

@@ -573,6 +573,14 @@ build_ceph_cache() {
done < <(ceph osd tree 2>/dev/null)
}
# SMART warning thresholds.
# A reading at or above its threshold is reported as a warning code.
SMART_TEMP_WARN=50                 # °C - temperature warning level
SMART_TEMP_CRIT=60                 # °C - temperature critical level
SMART_REALLOCATED_WARN=1           # reallocated sector count
SMART_PENDING_WARN=1               # pending sector count
SMART_CRC_ERROR_WARN=100           # UDMA CRC error count
SMART_POWER_ON_HOURS_WARN=43800    # power-on hours (~5 years of continuous use)

# Freeze all thresholds so later code cannot reassign them by accident.
readonly SMART_TEMP_WARN SMART_TEMP_CRIT \
    SMART_REALLOCATED_WARN SMART_PENDING_WARN \
    SMART_CRC_ERROR_WARN SMART_POWER_ON_HOURS_WARN
#------------------------------------------------------------------------------
# get_drive_smart_info
#
@@ -581,12 +589,13 @@ build_ceph_cache() {
# Args:
# $1 - Device name (e.g., sda, nvme0n1)
#
# Returns: Pipe-delimited string: TYPE|TEMP|HEALTH|MODEL|SERIAL
# Returns: Pipe-delimited string: TYPE|TEMP|HEALTH|MODEL|SERIAL|WARNINGS
# TYPE: SSD, HDD, or NVMe
# TEMP: Temperature in Celsius (or "-" if unavailable)
# HEALTH: ✓ for passed, ✗ for failed
# HEALTH: ✓ for passed, ✗ for failed, ⚠ for passed with warnings
# MODEL: Drive model string
# SERIAL: Drive serial number
# WARNINGS: Comma-separated warning codes (or empty)
#------------------------------------------------------------------------------
get_drive_smart_info() {
local device="$1"
@@ -596,6 +605,7 @@ get_drive_smart_info() {
local health="✗"
local model="-"
local serial="-"
local warnings=""
smart_info="$(sudo smartctl -A -i -H "/dev/$device" 2>/dev/null)"
@@ -626,7 +636,7 @@ get_drive_smart_info() {
type="SSD"
fi
# Health status
# Health status (basic SMART check)
if echo "$smart_info" | grep -q "SMART overall-health.*PASSED"; then
health="✓"
elif echo "$smart_info" | grep -q "SMART Health Status.*OK"; then
@@ -642,6 +652,55 @@ get_drive_smart_info() {
serial="$(echo "$smart_info" | grep -E "^Serial [Nn]umber:" | head -1 | cut -d: -f2 | xargs)"
[[ -z "$serial" ]] && serial="-"
# SMART threshold warnings - check for concerning values.
# Collects short warning codes in warn_list, then joins them into $warnings
# and downgrades a passing health indicator to the warning symbol.
local warn_list=()

# Temperature thresholds: critical is tested first, so at most one
# temperature code is ever added.
if [[ -n "$temp" && "$temp" =~ ^[0-9]+$ ]]; then
    if [[ "$temp" -ge "$SMART_TEMP_CRIT" ]]; then
        warn_list+=("TEMP_CRIT")
    elif [[ "$temp" -ge "$SMART_TEMP_WARN" ]]; then
        warn_list+=("TEMP_WARN")
    fi
fi

# The counters below are read from field 10 (RAW_VALUE) of the smartctl -A
# attribute table instead of the last field on the line. A raw value can be
# followed by extra annotations (for example "1234 (5 123 0)") or carry a
# unit suffix (for example "1234h+05m+10.000s"); taking the last field in
# those cases yields a non-numeric token and the warning is silently lost.
# Everything from the first non-digit onward is trimmed, so each variable
# ends up either empty or purely numeric.

# Reallocated sectors (SMART attribute 5)
local reallocated
reallocated="$(awk '$1 == 5 && $2 ~ /^Reallocated_Sector/ { print $10; exit }' <<< "$smart_info")"
reallocated="${reallocated%%[!0-9]*}"
if [[ -n "$reallocated" && "$reallocated" -ge "$SMART_REALLOCATED_WARN" ]]; then
    warn_list+=("REALLOC:$reallocated")
fi

# Current pending sectors (SMART attribute 197)
local pending
pending="$(awk '$1 == 197 && $2 ~ /^Current_Pending/ { print $10; exit }' <<< "$smart_info")"
pending="${pending%%[!0-9]*}"
if [[ -n "$pending" && "$pending" -ge "$SMART_PENDING_WARN" ]]; then
    warn_list+=("PENDING:$pending")
fi

# UDMA CRC errors (SMART attribute 199)
local crc_errors
crc_errors="$(awk '$1 == 199 && $2 ~ /^UDMA_CRC_Error/ { print $10; exit }' <<< "$smart_info")"
crc_errors="${crc_errors%%[!0-9]*}"
if [[ -n "$crc_errors" && "$crc_errors" -ge "$SMART_CRC_ERROR_WARN" ]]; then
    warn_list+=("CRC:$crc_errors")
fi

# Power-on hours (SMART attribute 9)
local power_hours
power_hours="$(awk '$1 == 9 && $2 ~ /^Power_On_Hours/ { print $10; exit }' <<< "$smart_info")"
power_hours="${power_hours%%[!0-9]*}"
if [[ -n "$power_hours" && "$power_hours" -ge "$SMART_POWER_ON_HOURS_WARN" ]]; then
    warn_list+=("HOURS:$power_hours")
fi

# Join warnings into a single comma-separated string
if [[ ${#warn_list[@]} -gt 0 ]]; then
    # IFS is changed only inside the command substitution's subshell
    warnings="$(IFS=','; echo "${warn_list[*]}")"

    # Change health indicator to warning if SMART passed but has warnings;
    # a failed indicator is left untouched.
    if [[ "$health" == "✓" ]]; then
        health="⚠"
    fi
fi
# Format temperature with unit if we have a value
local temp_display
if [[ -n "$temp" && "$temp" != "-" ]]; then
@@ -650,7 +709,7 @@ get_drive_smart_info() {
temp_display="-"
fi
echo "${type}|${temp_display}|${health}|${model}|${serial}"
echo "${type}|${temp_display}|${health}|${model}|${serial}|${warnings}"
}
#------------------------------------------------------------------------------
@@ -692,11 +751,11 @@ fi
printf "\n"
echo -e "$(colorize_header '=== Drive Details with SMART Status (by Bay Position) ===')"
if [[ "$SHOW_PCI" == true ]]; then
printf "%-5s %-15s %-10s %-8s %-8s %-8s %-30s %-20s %-12s %-10s %-10s %-40s\n" "BAY" "DEVICE" "SIZE" "TYPE" "TEMP" "HEALTH" "MODEL" "SERIAL" "CEPH OSD" "STATUS" "USAGE" "PCI PATH"
echo "------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"
printf "%-5s %-15s %-10s %-8s %-8s %-8s %-30s %-20s %-12s %-10s %-10s %-30s %-40s\n" "BAY" "DEVICE" "SIZE" "TYPE" "TEMP" "HEALTH" "MODEL" "SERIAL" "CEPH OSD" "STATUS" "USAGE" "WARNINGS" "PCI PATH"
echo "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"
else
printf "%-5s %-15s %-10s %-8s %-8s %-8s %-30s %-20s %-12s %-10s %-10s\n" "BAY" "DEVICE" "SIZE" "TYPE" "TEMP" "HEALTH" "MODEL" "SERIAL" "CEPH OSD" "STATUS" "USAGE"
echo "----------------------------------------------------------------------------------------------------------------------------------------------------"
printf "%-5s %-15s %-10s %-8s %-8s %-8s %-30s %-20s %-12s %-10s %-10s %-30s\n" "BAY" "DEVICE" "SIZE" "TYPE" "TEMP" "HEALTH" "MODEL" "SERIAL" "CEPH OSD" "STATUS" "USAGE" "WARNINGS"
echo "----------------------------------------------------------------------------------------------------------------------------------------------------------------------"
fi
# Build reverse map: device -> bay
@@ -723,9 +782,10 @@ for bay in $all_bays; do
health="-"
model="-"
serial="-"
warnings=""
else
smart_info="$(get_drive_smart_info "$device")"
IFS='|' read -r type temp health model serial <<< "$smart_info"
IFS='|' read -r type temp health model serial warnings <<< "$smart_info"
fi
# Check for Ceph OSD using cached data
@@ -766,11 +826,17 @@ for bay in $all_bays; do
colored_temp="$(colorize_temp "$temp")"
colored_health="$(colorize_health "$health")"
# Colorize warnings if present; a drive without warnings shows "-".
# Deliberately not declared with `local`: this code appears to run in the
# script's top-level bay loop rather than inside a function, and bash rejects
# `local` there ("can only be used in a function"), which would leave
# colored_warnings unassigned and the WARNINGS column blank.
colored_warnings="${warnings:--}"
if [[ "$USE_COLOR" == true && -n "$warnings" ]]; then
    colored_warnings="${COLOR_YELLOW}${warnings}${COLOR_RESET}"
fi
if [[ "$SHOW_PCI" == true ]]; then
pci_path="${BAY_TO_PCI_PATH[$bay]:-}"
printf "%-5s %-15s %-10s %-8s %-8b %-8b %-30s %-20s %-12s %-10s %-10s %-40s\n" "$bay" "/dev/$device" "$size" "$type" "$colored_temp" "$colored_health" "$model" "$serial" "$osd_id" "$ceph_status" "$usage" "$pci_path"
printf "%-5s %-15s %-10s %-8s %-8b %-8b %-30s %-20s %-12s %-10s %-10s %-30b %-40s\n" "$bay" "/dev/$device" "$size" "$type" "$colored_temp" "$colored_health" "$model" "$serial" "$osd_id" "$ceph_status" "$usage" "$colored_warnings" "$pci_path"
else
printf "%-5s %-15s %-10s %-8s %-8b %-8b %-30s %-20s %-12s %-10s %-10s\n" "$bay" "/dev/$device" "$size" "$type" "$colored_temp" "$colored_health" "$model" "$serial" "$osd_id" "$ceph_status" "$usage"
printf "%-5s %-15s %-10s %-8s %-8b %-8b %-30s %-20s %-12s %-10s %-10s %-30b\n" "$bay" "/dev/$device" "$size" "$type" "$colored_temp" "$colored_health" "$model" "$serial" "$osd_id" "$ceph_status" "$usage" "$colored_warnings"
fi
fi
done