Add timeout protection to external commands

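Add a CMD_TIMEOUT constant (30 seconds by default, defined near the
top of the script so it can be adjusted) and run the smartctl and ceph
commands under `timeout`. These commands may hang when a disk is
unresponsive or the network is unavailable; with the timeout in place
the script can no longer block indefinitely.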

#14

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-02-05 10:46:26 -05:00
parent 07989c8788
commit eff8eb3a3c

View File

@@ -2,6 +2,11 @@
VERSION="1.1.0" VERSION="1.1.0"
###################
# Timeout Configuration
###################
readonly CMD_TIMEOUT=30 # Default timeout in seconds for external commands
################### ###################
# Color Definitions # Color Definitions
################### ###################
@@ -105,7 +110,9 @@ get_disk_health() {
while IFS= read -r disk; do while IFS= read -r disk; do
[[ -z "$disk" ]] && continue [[ -z "$disk" ]] && continue
echo -e "\nChecking /dev/$disk:" echo -e "\nChecking /dev/$disk:"
smartctl -H "/dev/$disk" if ! timeout $CMD_TIMEOUT smartctl -H "/dev/$disk"; then
log_message warn "smartctl timed out or failed for /dev/$disk"
fi
done < <(lsblk -d -o name | grep -E '^sd|^nvme') done < <(lsblk -d -o name | grep -E '^sd|^nvme')
else else
log_message warn "smartctl not found. Install smartmontools for disk health monitoring" log_message warn "smartctl not found. Install smartmontools for disk health monitoring"
@@ -419,16 +426,16 @@ get_ceph_health() {
echo -e "\n${GREEN}=== Ceph Cluster Health ===${NC}" echo -e "\n${GREEN}=== Ceph Cluster Health ===${NC}"
if command -v ceph >/dev/null 2>&1; then if command -v ceph >/dev/null 2>&1; then
echo -e "${GREEN}Health Status:${NC}" echo -e "${GREEN}Health Status:${NC}"
ceph health detail 2>/dev/null || log_message warn "Cannot connect to Ceph cluster" timeout $CMD_TIMEOUT ceph health detail 2>/dev/null || log_message warn "Cannot connect to Ceph cluster or timed out"
echo -e "\n${GREEN}=== Ceph OSD Tree ===${NC}" echo -e "\n${GREEN}=== Ceph OSD Tree ===${NC}"
ceph osd tree 2>/dev/null || true timeout $CMD_TIMEOUT ceph osd tree 2>/dev/null || log_message warn "Ceph OSD tree timed out"
echo -e "\n${GREEN}=== Ceph Pool Usage ===${NC}" echo -e "\n${GREEN}=== Ceph Pool Usage ===${NC}"
ceph df 2>/dev/null || true timeout $CMD_TIMEOUT ceph df 2>/dev/null || log_message warn "Ceph df timed out"
echo -e "\n${GREEN}=== Ceph OSD Usage ===${NC}" echo -e "\n${GREEN}=== Ceph OSD Usage ===${NC}"
ceph osd df 2>/dev/null || true timeout $CMD_TIMEOUT ceph osd df 2>/dev/null || log_message warn "Ceph OSD df timed out"
else else
log_message info "Ceph tools not installed on this node" log_message info "Ceph tools not installed on this node"
fi fi
@@ -492,9 +499,11 @@ quick_health_check() {
if command -v smartctl >/dev/null 2>&1; then if command -v smartctl >/dev/null 2>&1; then
while IFS= read -r disk; do while IFS= read -r disk; do
[[ -z "$disk" ]] && continue [[ -z "$disk" ]] && continue
health=$(smartctl -H "/dev/$disk" 2>/dev/null | grep -i "health" | awk -F: '{print $2}' | xargs) health=$(timeout $CMD_TIMEOUT smartctl -H "/dev/$disk" 2>/dev/null | grep -i "health" | awk -F: '{print $2}' | xargs)
if [[ -n "$health" ]]; then if [[ -n "$health" ]]; then
echo -e "/dev/$disk: $health" echo -e "/dev/$disk: $health"
else
echo -e "/dev/$disk: ${YELLOW}check timed out or unavailable${NC}"
fi fi
done < <(lsblk -d -o name | grep -E '^sd|^nvme') done < <(lsblk -d -o name | grep -E '^sd|^nvme')
fi fi