diff --git a/driveAtlas.sh b/driveAtlas.sh
index 9afa374..50ab63c 100644
--- a/driveAtlas.sh
+++ b/driveAtlas.sh
@@ -573,6 +573,14 @@ build_ceph_cache() {
     done < <(ceph osd tree 2>/dev/null)
 }
 
+# SMART warning thresholds
+readonly SMART_TEMP_WARN=50                 # Temperature warning threshold (°C)
+readonly SMART_TEMP_CRIT=60                 # Temperature critical threshold (°C)
+readonly SMART_REALLOCATED_WARN=1           # Reallocated sectors warning threshold
+readonly SMART_PENDING_WARN=1               # Pending sectors warning threshold
+readonly SMART_CRC_ERROR_WARN=100           # UDMA CRC error warning threshold
+readonly SMART_POWER_ON_HOURS_WARN=43800    # ~5 years of continuous use
+
 #------------------------------------------------------------------------------
 # get_drive_smart_info
 #
@@ -581,12 +589,13 @@ build_ceph_cache() {
 # Args:
 #   $1 - Device name (e.g., sda, nvme0n1)
 #
-# Returns: Pipe-delimited string: TYPE|TEMP|HEALTH|MODEL|SERIAL
+# Returns: Pipe-delimited string: TYPE|TEMP|HEALTH|MODEL|SERIAL|WARNINGS
 #   TYPE: SSD, HDD, or NVMe
 #   TEMP: Temperature in Celsius (or "-" if unavailable)
-#   HEALTH: ✓ for passed, ✗ for failed
+#   HEALTH: ✓ for passed, ✗ for failed, ⚠ for passed with warnings
 #   MODEL: Drive model string
 #   SERIAL: Drive serial number
+#   WARNINGS: Comma-separated warning codes (or empty)
 #------------------------------------------------------------------------------
 get_drive_smart_info() {
     local device="$1"
@@ -596,6 +605,7 @@ get_drive_smart_info() {
     local health="✗"
     local model="-"
     local serial="-"
+    local warnings=""
 
     smart_info="$(sudo smartctl -A -i -H "/dev/$device" 2>/dev/null)"
 
@@ -626,7 +636,7 @@ get_drive_smart_info() {
         type="SSD"
     fi
 
-    # Health status
+    # Health status (basic SMART check)
     if echo "$smart_info" | grep -q "SMART overall-health.*PASSED"; then
         health="✓"
     elif echo "$smart_info" | grep -q "SMART Health Status.*OK"; then
@@ -642,6 +652,57 @@ get_drive_smart_info() {
     serial="$(echo "$smart_info" | grep -E "^Serial [Nn]umber:" | head -1 | cut -d: -f2 | xargs)"
     [[ -z "$serial" ]] && serial="-"
 
+    # SMART threshold warnings - check for concerning values.
+    # Attribute rows come from the `smartctl -A` table, where RAW_VALUE starts at
+    # column 10 and may carry trailing notes, so read $10 of the first match.
+    local warn_list=()
+
+    # Temperature thresholds
+    if [[ -n "$temp" && "$temp" =~ ^[0-9]+$ ]]; then
+        if [[ "$temp" -ge "$SMART_TEMP_CRIT" ]]; then
+            warn_list+=("TEMP_CRIT")
+        elif [[ "$temp" -ge "$SMART_TEMP_WARN" ]]; then
+            warn_list+=("TEMP_WARN")
+        fi
+    fi
+
+    # Reallocated sectors (SMART attribute 5)
+    local reallocated
+    reallocated="$(echo "$smart_info" | grep -E "^\s*5\s+Reallocated_Sector" | awk 'NR == 1 {print $10}')"
+    if [[ -n "$reallocated" && "$reallocated" =~ ^[0-9]+$ && "$reallocated" -ge "$SMART_REALLOCATED_WARN" ]]; then
+        warn_list+=("REALLOC:$reallocated")
+    fi
+
+    # Current pending sectors (SMART attribute 197)
+    local pending
+    pending="$(echo "$smart_info" | grep -E "^\s*197\s+Current_Pending" | awk 'NR == 1 {print $10}')"
+    if [[ -n "$pending" && "$pending" =~ ^[0-9]+$ && "$pending" -ge "$SMART_PENDING_WARN" ]]; then
+        warn_list+=("PENDING:$pending")
+    fi
+
+    # UDMA CRC errors (SMART attribute 199)
+    local crc_errors
+    crc_errors="$(echo "$smart_info" | grep -E "^\s*199\s+UDMA_CRC_Error" | awk 'NR == 1 {print $10}')"
+    if [[ -n "$crc_errors" && "$crc_errors" =~ ^[0-9]+$ && "$crc_errors" -ge "$SMART_CRC_ERROR_WARN" ]]; then
+        warn_list+=("CRC:$crc_errors")
+    fi
+
+    # Power-on hours (SMART attribute 9)
+    local power_hours
+    power_hours="$(echo "$smart_info" | grep -E "^\s*9\s+Power_On_Hours" | awk 'NR == 1 {print $10}')"
+    if [[ -n "$power_hours" && "$power_hours" =~ ^[0-9]+$ && "$power_hours" -ge "$SMART_POWER_ON_HOURS_WARN" ]]; then
+        warn_list+=("HOURS:$power_hours")
+    fi
+
+    # Join warnings
+    if [[ ${#warn_list[@]} -gt 0 ]]; then
+        warnings="$(IFS=','; echo "${warn_list[*]}")"
+        # Change health indicator to warning if SMART passed but has warnings
+        if [[ "$health" == "✓" ]]; then
+            health="⚠"
+        fi
+    fi
+
     # Format temperature with unit if we have a value
     local temp_display
     if [[ -n "$temp" && "$temp" != "-" ]]; then
@@ -650,7 +711,7 @@ get_drive_smart_info() {
         temp_display="-"
     fi
 
-    echo "${type}|${temp_display}|${health}|${model}|${serial}"
+    echo "${type}|${temp_display}|${health}|${model}|${serial}|${warnings}"
 }
 
 #------------------------------------------------------------------------------
@@ -692,11 +753,11 @@ fi
 printf "\n"
 echo -e "$(colorize_header '=== Drive Details with SMART Status (by Bay Position) ===')"
 if [[ "$SHOW_PCI" == true ]]; then
-    printf "%-5s %-15s %-10s %-8s %-8s %-8s %-30s %-20s %-12s %-10s %-10s %-40s\n" "BAY" "DEVICE" "SIZE" "TYPE" "TEMP" "HEALTH" "MODEL" "SERIAL" "CEPH OSD" "STATUS" "USAGE" "PCI PATH"
-    echo "------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"
+    printf "%-5s %-15s %-10s %-8s %-8s %-8s %-30s %-20s %-12s %-10s %-10s %-30s %-40s\n" "BAY" "DEVICE" "SIZE" "TYPE" "TEMP" "HEALTH" "MODEL" "SERIAL" "CEPH OSD" "STATUS" "USAGE" "WARNINGS" "PCI PATH"
+    echo "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"
 else
-    printf "%-5s %-15s %-10s %-8s %-8s %-8s %-30s %-20s %-12s %-10s %-10s\n" "BAY" "DEVICE" "SIZE" "TYPE" "TEMP" "HEALTH" "MODEL" "SERIAL" "CEPH OSD" "STATUS" "USAGE"
-    echo "----------------------------------------------------------------------------------------------------------------------------------------------------"
+    printf "%-5s %-15s %-10s %-8s %-8s %-8s %-30s %-20s %-12s %-10s %-10s %-30s\n" "BAY" "DEVICE" "SIZE" "TYPE" "TEMP" "HEALTH" "MODEL" "SERIAL" "CEPH OSD" "STATUS" "USAGE" "WARNINGS"
+    echo "----------------------------------------------------------------------------------------------------------------------------------------------------------------------"
 fi
 
 # Build reverse map: device -> bay
@@ -723,9 +784,10 @@ for bay in $all_bays; do
             health="-"
             model="-"
             serial="-"
+            warnings=""
         else
             smart_info="$(get_drive_smart_info "$device")"
-            IFS='|' read -r type temp health model serial <<< "$smart_info"
+            IFS='|' read -r type temp health model serial warnings <<< "$smart_info"
         fi
 
         # Check for Ceph OSD using cached data
@@ -766,11 +828,18 @@ for bay in $all_bays; do
         colored_temp="$(colorize_temp "$temp")"
         colored_health="$(colorize_health "$health")"
 
+        # Colorize warnings if present. Plain assignment (no 'local'): the bay loop
+        # is top-level code, and 'local' outside a function fails, leaving a stale value.
+        colored_warnings="${warnings:--}"
+        if [[ "$USE_COLOR" == true && -n "$warnings" ]]; then
+            colored_warnings="${COLOR_YELLOW}${warnings}${COLOR_RESET}"
+        fi
+
         if [[ "$SHOW_PCI" == true ]]; then
             pci_path="${BAY_TO_PCI_PATH[$bay]:-}"
-            printf "%-5s %-15s %-10s %-8s %-8b %-8b %-30s %-20s %-12s %-10s %-10s %-40s\n" "$bay" "/dev/$device" "$size" "$type" "$colored_temp" "$colored_health" "$model" "$serial" "$osd_id" "$ceph_status" "$usage" "$pci_path"
+            printf "%-5s %-15s %-10s %-8s %-8b %-8b %-30s %-20s %-12s %-10s %-10s %-30b %-40s\n" "$bay" "/dev/$device" "$size" "$type" "$colored_temp" "$colored_health" "$model" "$serial" "$osd_id" "$ceph_status" "$usage" "$colored_warnings" "$pci_path"
         else
-            printf "%-5s %-15s %-10s %-8s %-8b %-8b %-30s %-20s %-12s %-10s %-10s\n" "$bay" "/dev/$device" "$size" "$type" "$colored_temp" "$colored_health" "$model" "$serial" "$osd_id" "$ceph_status" "$usage"
+            printf "%-5s %-15s %-10s %-8s %-8b %-8b %-30s %-20s %-12s %-10s %-10s %-30b\n" "$bay" "/dev/$device" "$size" "$type" "$colored_temp" "$colored_health" "$model" "$serial" "$osd_id" "$ceph_status" "$usage" "$colored_warnings"
         fi
     fi
 done