Add SMART threshold warnings for drive health monitoring
New warning detection for concerning SMART values:
- Temperature: Warning at >= 50°C, Critical at >= 60°C
- Reallocated sectors: Warning at >= 1
- Pending sectors: Warning at >= 1
- UDMA CRC errors: Warning at >= 100
- Power-on hours: Warning at >= 43800 (~5 years of continuous use)

The health indicator now shows ⚠ when the SMART check passed but warnings are present.

Added a WARNINGS column to the output showing comma-separated codes such as: TEMP_WARN, TEMP_CRIT, REALLOC:5, PENDING:2, CRC:150, HOURS:50000

Thresholds are configurable via the readonly SMART_* constants defined just above get_drive_smart_info.

Fixes: #12

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -573,6 +573,14 @@ build_ceph_cache() {
|
||||
done < <(ceph osd tree 2>/dev/null)
|
||||
}
|
||||
|
||||
# SMART warning thresholds, read by get_drive_smart_info. Every check uses -ge, so a value equal to its threshold already triggers the warning.
|
||||
readonly SMART_TEMP_WARN=50 # °C; TEMP_WARN is reported at or above this (and below SMART_TEMP_CRIT)
|
||||
readonly SMART_TEMP_CRIT=60 # °C; TEMP_CRIT is reported at or above this
|
||||
readonly SMART_REALLOCATED_WARN=1 # Reallocated sectors (attribute 5); REALLOC:<n> is reported at or above this
|
||||
readonly SMART_PENDING_WARN=1 # Current pending sectors (attribute 197); PENDING:<n> is reported at or above this
|
||||
readonly SMART_CRC_ERROR_WARN=100 # UDMA CRC errors (attribute 199); CRC:<n> is reported at or above this
|
||||
readonly SMART_POWER_ON_HOURS_WARN=43800 # Power-on hours (attribute 9); 5 * 365 * 24 = 43800 h, ~5 years of continuous use
|
||||
|
||||
#------------------------------------------------------------------------------
|
||||
# get_drive_smart_info
|
||||
#
|
||||
@@ -581,12 +589,13 @@ build_ceph_cache() {
|
||||
# Args:
|
||||
# $1 - Device name (e.g., sda, nvme0n1)
|
||||
#
|
||||
# Returns: Pipe-delimited string: TYPE|TEMP|HEALTH|MODEL|SERIAL
|
||||
# Returns: Pipe-delimited string: TYPE|TEMP|HEALTH|MODEL|SERIAL|WARNINGS
|
||||
# TYPE: SSD, HDD, or NVMe
|
||||
# TEMP: Temperature in Celsius (or "-" if unavailable)
|
||||
# HEALTH: ✓ for passed, ✗ for failed
|
||||
# HEALTH: ✓ for passed, ✗ for failed, ⚠ for passed with warnings
|
||||
# MODEL: Drive model string
|
||||
# SERIAL: Drive serial number
|
||||
# WARNINGS: Comma-separated warning codes (or empty)
|
||||
#------------------------------------------------------------------------------
|
||||
get_drive_smart_info() {
|
||||
local device="$1"
|
||||
@@ -596,6 +605,7 @@ get_drive_smart_info() {
|
||||
local health="✗"
|
||||
local model="-"
|
||||
local serial="-"
|
||||
local warnings=""
|
||||
|
||||
smart_info="$(sudo smartctl -A -i -H "/dev/$device" 2>/dev/null)"
|
||||
|
||||
@@ -626,7 +636,7 @@ get_drive_smart_info() {
|
||||
type="SSD"
|
||||
fi
|
||||
|
||||
# Health status
|
||||
# Health status (basic SMART check)
|
||||
if echo "$smart_info" | grep -q "SMART overall-health.*PASSED"; then
|
||||
health="✓"
|
||||
elif echo "$smart_info" | grep -q "SMART Health Status.*OK"; then
|
||||
@@ -642,6 +652,55 @@ get_drive_smart_info() {
|
||||
serial="$(echo "$smart_info" | grep -E "^Serial [Nn]umber:" | head -1 | cut -d: -f2 | xargs)"
|
||||
[[ -z "$serial" ]] && serial="-"
|
||||
|
||||
# SMART threshold warnings - check for concerning values
|
||||
local warn_list=()
|
||||
|
||||
# Temperature thresholds
|
||||
if [[ -n "$temp" && "$temp" =~ ^[0-9]+$ ]]; then
|
||||
if [[ "$temp" -ge "$SMART_TEMP_CRIT" ]]; then
|
||||
warn_list+=("TEMP_CRIT")
|
||||
elif [[ "$temp" -ge "$SMART_TEMP_WARN" ]]; then
|
||||
warn_list+=("TEMP_WARN")
|
||||
fi
|
||||
fi
|
||||
|
||||
# Reallocated sectors (SMART attribute 5)
|
||||
local reallocated
|
||||
reallocated="$(echo "$smart_info" | grep -E "^\s*5\s+Reallocated_Sector" | awk '{print $NF}')"
|
||||
if [[ -n "$reallocated" && "$reallocated" =~ ^[0-9]+$ && "$reallocated" -ge "$SMART_REALLOCATED_WARN" ]]; then
|
||||
warn_list+=("REALLOC:$reallocated")
|
||||
fi
|
||||
|
||||
# Current pending sectors (SMART attribute 197)
|
||||
local pending
|
||||
pending="$(echo "$smart_info" | grep -E "^\s*197\s+Current_Pending" | awk '{print $NF}')"
|
||||
if [[ -n "$pending" && "$pending" =~ ^[0-9]+$ && "$pending" -ge "$SMART_PENDING_WARN" ]]; then
|
||||
warn_list+=("PENDING:$pending")
|
||||
fi
|
||||
|
||||
# UDMA CRC errors (SMART attribute 199)
|
||||
local crc_errors
|
||||
crc_errors="$(echo "$smart_info" | grep -E "^\s*199\s+UDMA_CRC_Error" | awk '{print $NF}')"
|
||||
if [[ -n "$crc_errors" && "$crc_errors" =~ ^[0-9]+$ && "$crc_errors" -ge "$SMART_CRC_ERROR_WARN" ]]; then
|
||||
warn_list+=("CRC:$crc_errors")
|
||||
fi
|
||||
|
||||
# Power-on hours (SMART attribute 9)
|
||||
local power_hours
|
||||
# RAW_VALUE is the 10th column of a smartctl attribute row. Read that column
# (not the last field) and keep only its leading digits, so annotated raw
# values such as "43800 (12 34 0)" or "43800h+12m+30.000s" still yield a plain
# hour count instead of failing the numeric check that follows.
power_hours="$(echo "$smart_info" | grep -E "^\s*9\s+Power_On_Hours" | awk '{print $10}')"
power_hours="${power_hours%%[!0-9]*}"
|
||||
if [[ -n "$power_hours" && "$power_hours" =~ ^[0-9]+$ && "$power_hours" -ge "$SMART_POWER_ON_HOURS_WARN" ]]; then
|
||||
warn_list+=("HOURS:$power_hours")
|
||||
fi
|
||||
|
||||
# Join warnings
|
||||
if [[ ${#warn_list[@]} -gt 0 ]]; then
|
||||
warnings="$(IFS=','; echo "${warn_list[*]}")"
|
||||
# Change health indicator to warning if SMART passed but has warnings
|
||||
if [[ "$health" == "✓" ]]; then
|
||||
health="⚠"
|
||||
fi
|
||||
fi
|
||||
|
||||
# Format temperature with unit if we have a value
|
||||
local temp_display
|
||||
if [[ -n "$temp" && "$temp" != "-" ]]; then
|
||||
@@ -650,7 +709,7 @@ get_drive_smart_info() {
|
||||
temp_display="-"
|
||||
fi
|
||||
|
||||
echo "${type}|${temp_display}|${health}|${model}|${serial}"
|
||||
echo "${type}|${temp_display}|${health}|${model}|${serial}|${warnings}"
|
||||
}
|
||||
|
||||
#------------------------------------------------------------------------------
|
||||
@@ -692,11 +751,11 @@ fi
|
||||
printf "\n"
|
||||
echo -e "$(colorize_header '=== Drive Details with SMART Status (by Bay Position) ===')"
|
||||
if [[ "$SHOW_PCI" == true ]]; then
|
||||
printf "%-5s %-15s %-10s %-8s %-8s %-8s %-30s %-20s %-12s %-10s %-10s %-40s\n" "BAY" "DEVICE" "SIZE" "TYPE" "TEMP" "HEALTH" "MODEL" "SERIAL" "CEPH OSD" "STATUS" "USAGE" "PCI PATH"
|
||||
echo "------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"
|
||||
printf "%-5s %-15s %-10s %-8s %-8s %-8s %-30s %-20s %-12s %-10s %-10s %-30s %-40s\n" "BAY" "DEVICE" "SIZE" "TYPE" "TEMP" "HEALTH" "MODEL" "SERIAL" "CEPH OSD" "STATUS" "USAGE" "WARNINGS" "PCI PATH"
|
||||
echo "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"
|
||||
else
|
||||
printf "%-5s %-15s %-10s %-8s %-8s %-8s %-30s %-20s %-12s %-10s %-10s\n" "BAY" "DEVICE" "SIZE" "TYPE" "TEMP" "HEALTH" "MODEL" "SERIAL" "CEPH OSD" "STATUS" "USAGE"
|
||||
echo "----------------------------------------------------------------------------------------------------------------------------------------------------"
|
||||
printf "%-5s %-15s %-10s %-8s %-8s %-8s %-30s %-20s %-12s %-10s %-10s %-30s\n" "BAY" "DEVICE" "SIZE" "TYPE" "TEMP" "HEALTH" "MODEL" "SERIAL" "CEPH OSD" "STATUS" "USAGE" "WARNINGS"
|
||||
echo "----------------------------------------------------------------------------------------------------------------------------------------------------------------------"
|
||||
fi
|
||||
|
||||
# Build reverse map: device -> bay
|
||||
@@ -723,9 +782,10 @@ for bay in $all_bays; do
|
||||
health="-"
|
||||
model="-"
|
||||
serial="-"
|
||||
warnings=""
|
||||
else
|
||||
smart_info="$(get_drive_smart_info "$device")"
|
||||
IFS='|' read -r type temp health model serial <<< "$smart_info"
|
||||
IFS='|' read -r type temp health model serial warnings <<< "$smart_info"
|
||||
fi
|
||||
|
||||
# Check for Ceph OSD using cached data
|
||||
@@ -766,11 +826,17 @@ for bay in $all_bays; do
|
||||
colored_temp="$(colorize_temp "$temp")"
|
||||
colored_health="$(colorize_health "$health")"
|
||||
|
||||
# Colorize warnings if present
|
||||
# Default to a plain "-" placeholder when there are no warnings; the value is
# replaced with a colorized string just below when USE_COLOR is enabled.
# NOTE(review): deliberately not declared with `local` - bash only permits
# `local` inside a function, and this loop appears to run at script top level
# (confirm). There, `local` would print an error and leave the variable unset.
colored_warnings="${warnings:--}"
|
||||
if [[ "$USE_COLOR" == true && -n "$warnings" ]]; then
|
||||
colored_warnings="${COLOR_YELLOW}${warnings}${COLOR_RESET}"
|
||||
fi
|
||||
|
||||
if [[ "$SHOW_PCI" == true ]]; then
|
||||
pci_path="${BAY_TO_PCI_PATH[$bay]:-}"
|
||||
printf "%-5s %-15s %-10s %-8s %-8b %-8b %-30s %-20s %-12s %-10s %-10s %-40s\n" "$bay" "/dev/$device" "$size" "$type" "$colored_temp" "$colored_health" "$model" "$serial" "$osd_id" "$ceph_status" "$usage" "$pci_path"
|
||||
printf "%-5s %-15s %-10s %-8s %-8b %-8b %-30s %-20s %-12s %-10s %-10s %-30b %-40s\n" "$bay" "/dev/$device" "$size" "$type" "$colored_temp" "$colored_health" "$model" "$serial" "$osd_id" "$ceph_status" "$usage" "$colored_warnings" "$pci_path"
|
||||
else
|
||||
printf "%-5s %-15s %-10s %-8s %-8b %-8b %-30s %-20s %-12s %-10s %-10s\n" "$bay" "/dev/$device" "$size" "$type" "$colored_temp" "$colored_health" "$model" "$serial" "$osd_id" "$ceph_status" "$usage"
|
||||
printf "%-5s %-15s %-10s %-8s %-8b %-8b %-30s %-20s %-12s %-10s %-10s %-30b\n" "$bay" "/dev/$device" "$size" "$type" "$colored_temp" "$colored_health" "$model" "$serial" "$osd_id" "$ceph_status" "$usage" "$colored_warnings"
|
||||
fi
|
||||
fi
|
||||
done
|
||||
|
||||
Reference in New Issue
Block a user