Add timeout protection to external commands

Add a configurable CMD_TIMEOUT constant and apply it as a timeout to the smartctl and ceph commands, which can hang on unresponsive disks or during network problems. This prevents the script from blocking indefinitely (#14).

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-05 10:46:26 -05:00
parent 07989c8788
commit eff8eb3a3c
1 changed file with 15 additions and 6 deletions
@@ -2,6 +2,11 @@
 VERSION="1.1.0"
 ###################
 # Timeout Configuration
 ###################
 readonly CMD_TIMEOUT=30  # Default timeout in seconds for external commands
 ###################
 # Color Definitions
 ###################
@@ -105,7 +110,9 @@ get_disk_health() {
        while IFS= read -r disk; do
            [[ -z "$disk" ]] && continue
            echo -e "\nChecking /dev/$disk:"
-            smartctl -H "/dev/$disk"
+            if ! timeout $CMD_TIMEOUT smartctl -H "/dev/$disk"; then
                log_message warn "smartctl timed out or failed for /dev/$disk"
            fi
        done < <(lsblk -d -o name | grep -E '^sd|^nvme')
    else
        log_message warn "smartctl not found. Install smartmontools for disk health monitoring"
@@ -419,16 +426,16 @@ get_ceph_health() {
    echo -e "\n${GREEN}=== Ceph Cluster Health ===${NC}"
    if command -v ceph >/dev/null 2>&1; then
        echo -e "${GREEN}Health Status:${NC}"
-        ceph health detail 2>/dev/null || log_message warn "Cannot connect to Ceph cluster"
+        timeout $CMD_TIMEOUT ceph health detail 2>/dev/null || log_message warn "Cannot connect to Ceph cluster or timed out"
        echo -e "\n${GREEN}=== Ceph OSD Tree ===${NC}"
-        ceph osd tree 2>/dev/null || true
+        timeout $CMD_TIMEOUT ceph osd tree 2>/dev/null || log_message warn "Ceph OSD tree timed out"
        echo -e "\n${GREEN}=== Ceph Pool Usage ===${NC}"
-        ceph df 2>/dev/null || true
+        timeout $CMD_TIMEOUT ceph df 2>/dev/null || log_message warn "Ceph df timed out"
        echo -e "\n${GREEN}=== Ceph OSD Usage ===${NC}"
-        ceph osd df 2>/dev/null || true
+        timeout $CMD_TIMEOUT ceph osd df 2>/dev/null || log_message warn "Ceph OSD df timed out"
    else
        log_message info "Ceph tools not installed on this node"
    fi
@@ -492,9 +499,11 @@ quick_health_check() {
    if command -v smartctl >/dev/null 2>&1; then
        while IFS= read -r disk; do
            [[ -z "$disk" ]] && continue
-            health=$(smartctl -H "/dev/$disk" 2>/dev/null | grep -i "health" | awk -F: '{print $2}' | xargs)
+            health=$(timeout $CMD_TIMEOUT smartctl -H "/dev/$disk" 2>/dev/null | grep -i "health" | awk -F: '{print $2}' | xargs)
            if [[ -n "$health" ]]; then
                echo -e "/dev/$disk: $health"
            else
                echo -e "/dev/$disk: ${YELLOW}check timed out or unavailable${NC}"
            fi
        done < <(lsblk -d -o name | grep -E '^sd|^nvme')
    fi