Compare commits

...

12 Commits

Author SHA1 Message Date
6125fb9d6b Cache systemctl list-unit-files to avoid repeated calls
Add get_unit_files() and unit_file_exists() helper functions
that cache the output of systemctl list-unit-files. This
avoids running the same command multiple times when checking
for node_exporter and hwmon.timer unit files.

#6

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-05 10:52:10 -05:00
86be5fd1c1 Add efficient process wait utility function
Add wait_for_process() that uses kill -0 instead of ps -p
for checking if a process is running. This is more efficient
as kill -0 only checks process existence without spawning
a new process like ps would.

Includes optional spinner for visual feedback during waits.

#7

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-05 10:51:36 -05:00
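
The `kill -0` idiom this commit relies on can be sketched on its own (a minimal standalone illustration, not the script's exact `wait_for_process` function):

```shell
# kill -0 sends no signal; it only asks the kernel whether the PID exists
# and is signalable. Because kill is a bash builtin, each poll costs no
# fork/exec, unlike spawning `ps -p "$pid"` on every iteration.
sleep 1 &           # stand-in background job to wait on
pid=$!

while kill -0 "$pid" 2>/dev/null; do
    sleep 0.1       # poll interval
done

wait "$pid"         # reap the child and collect its exit status
echo "process $pid has exited"
```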
a491ae4592 Cache disk list to avoid multiple lsblk calls
Add get_disk_list() function that caches the output of lsblk
on first call. Subsequent calls return the cached value,
reducing overhead when multiple functions need to iterate
over disk devices.

#8

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-05 10:51:13 -05:00
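
The cache-on-first-use pattern described here can be sketched generically (the values below are hypothetical stand-ins; the real function runs `lsblk`):

```shell
# Cache-on-first-use: the expensive command runs only on the first call;
# later calls return the stored result from the shell variable.
CALLS=0
DISK_LIST=""

get_disk_list() {
    if [[ -z "$DISK_LIST" ]]; then
        CALLS=$((CALLS + 1))     # count executions of the expensive path
        DISK_LIST="sda nvme0n1"  # stand-in for: lsblk -d -o name | grep -E ...
    fi
    printf '%s\n' "$DISK_LIST"
}

get_disk_list >/dev/null  # first call populates the cache
get_disk_list >/dev/null  # second call is a cache hit
echo "expensive calls: $CALLS"
```

One subtlety of this pattern: calling the function inside command substitution, as in `$(get_disk_list)`, runs it in a subshell, so a cache variable set there does not persist back to the parent shell; the populating call needs to happen in the current shell for later calls to benefit.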
7514e2ba7c Add logging infrastructure without subshell overhead
Add optional logging to file via PROXDOC_LOGFILE environment
variable. Uses exec redirection with tee instead of subshells,
which is more efficient for long-running diagnostics.

Usage: PROXDOC_LOGFILE=/tmp/proxdoc.log ./proxDoc.sh --diags

#9

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-05 10:50:45 -05:00
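
The exec-plus-tee redirection can be sketched as follows (hypothetical log path; process substitution requires bash):

```shell
# One persistent tee process receives all subsequent stdout/stderr via
# exec redirection -- no per-command subshell wrappers needed.
LOGFILE="${PROXDOC_LOGFILE:-/tmp/proxdoc-demo.log}"
: > "$LOGFILE"                      # start with an empty log

exec > >(tee -a "$LOGFILE") 2>&1    # duplicate all later output into the log

echo "diagnostics started"
echo "a warning line" >&2           # stderr is captured too
```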
f7ed682bdb Standardize error handling with cleanup trap
- Add cleanup function called on EXIT trap
- Add ERRORS_OCCURRED and WARNINGS_OCCURRED counters
- Make handle_error support non-fatal errors with optional parameter
- Add proper exit codes for INT (130) and TERM (143) signals
- Add summary of errors/warnings at the end of diagnostics
- Redirect error messages to stderr

#10

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-05 10:50:14 -05:00
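
The signal exit codes follow the common 128+N convention (INT is signal 2, so 130; TERM is signal 15, so 143). A minimal sketch of the trap wiring, separate from the script's own cleanup logic:

```shell
# EXIT fires on any termination path; INT and TERM exit with 128 + signum.
trap 'echo "cleanup ran, exit code was $?"' EXIT
trap 'echo "interrupted"; exit 130' INT
trap 'echo "terminated"; exit 143' TERM

echo "working..."
# On a normal run the EXIT trap reports code 0; Ctrl-C would yield 130.
```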
148a7ac644 Add validation for potentially empty variables
Add fallback values for variables that might be empty when
system information is unavailable. Use parameter expansion
with default values (${var:-Default}) to ensure meaningful
output even when commands fail or return empty results.

Affected functions: get_cpu_info, get_ram_info, get_network_info

#11

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-05 10:49:23 -05:00
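
The `${var:-Default}` expansion substitutes the fallback whenever the variable is unset or empty, which is exactly the state left behind by a failed probe command. A minimal illustration:

```shell
# Simulate a failed probe: grep against a path that does not exist,
# leaving cpu_info empty.
cpu_info=$(grep -m 1 -w 'model name' /proc/does-not-exist 2>/dev/null | awk -F: '{print $2}' | xargs)

# ${var:-Default} falls back when the variable is unset OR empty;
# ${var-Default} (no colon) would fall back only when unset.
echo "CPU Model: ${cpu_info:-Unknown}"
```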
67d4b76324 Extract magic strings into named constants
Define pattern constants at the top of the script for:
- VIRTUAL_IFACE_PATTERN: virtual/firewall interface patterns
- STORAGE_CONTROLLER_PATTERN: HBA/storage controller detection
- DISK_DEVICE_PATTERN: disk device name patterns
- EXCLUDED_PCI_PATTERN: PCI devices to exclude from listing

This improves maintainability and makes patterns easier to modify.

#12

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-05 10:48:51 -05:00
6633a0a9a1 Implement selective checks with --checks option
Add the ability to run only specific diagnostic checks using
--checks=cpu,ram,disk syntax. This allows users to perform
targeted diagnostics without running the full suite.

Supported checks: cpu, ram, memory, storage, disk, network,
hardware, temps, services, ceph, vms, containers

#13

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-05 10:46:59 -05:00
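
Splitting and validating a comma-separated value like `--checks=cpu,ram,disk` can be sketched with `read -ra` and a space-delimited whitelist (a simplified variant of the script's approach; the names below are illustrative):

```shell
VALID_CHECKS="cpu ram memory storage disk network"
checks="cpu,disk"

# Split the comma-separated list into an array without word-splitting side effects.
IFS=',' read -ra check_array <<< "$checks"

for check in "${check_array[@]}"; do
    # Pad both sides with spaces so "ram" cannot match inside a longer name.
    if [[ " $VALID_CHECKS " == *" $check "* ]]; then
        echo "ok: $check"
    else
        echo "unknown: $check"
    fi
done
```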
eff8eb3a3c Add timeout protection to external commands
Add a configurable CMD_TIMEOUT constant and apply timeouts to
smartctl and ceph commands that may hang on unresponsive disks
or network issues. This prevents the script from blocking
indefinitely.

#14

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-05 10:46:26 -05:00
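
The timeout wrapper converts a hung command into a detectable failure; `timeout(1)` from coreutils exits with status 124 when the deadline is hit. A small sketch with a deliberately slow stand-in command:

```shell
CMD_TIMEOUT=1   # seconds; the script's default is 30

# `sleep 5` stands in for a smartctl/ceph call on an unresponsive device.
status=0
timeout "$CMD_TIMEOUT" sleep 5 || status=$?

if [[ $status -eq 124 ]]; then
    echo "command timed out after ${CMD_TIMEOUT}s"
elif [[ $status -ne 0 ]]; then
    echo "command failed with status $status"
fi
```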
07989c8788 Add examples section to help documentation
Expand the help output to include practical usage examples
for common operations like full diagnostics, quick health
checks, service monitoring, and Ceph health checks.

#15

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-05 10:45:46 -05:00
c8fadf924b Add input validation with whitelist of valid options
Implement strict input validation using a whitelist approach.
Only accept options that match the expected pattern and are in
the approved list. This prevents injection attacks and invalid
inputs from being processed.

#16

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-05 10:45:19 -05:00
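
The whitelist idea can be sketched in two layers, a shape check and a membership check (a simplified version; the option names below are illustrative):

```shell
VALID_OPTIONS="--help --diags --quick --checks"

is_valid_option() {
    local input="$1"
    # Shape check: --name or --name=value, lowercase letters/hyphens/commas only.
    [[ "$input" =~ ^--[a-z][-a-z=,]*$ ]] || return 1
    # Membership check: strip any =value, then require the name in the whitelist.
    local opt_name="${input%%=*}"
    [[ " $VALID_OPTIONS " == *" $opt_name "* ]]
}

is_valid_option "--checks=cpu,ram" && echo "accepted: --checks=cpu,ram"
is_valid_option '--quick; rm -rf /' || echo "rejected: shell metacharacters"
```

The shape check rejects spaces, semicolons, and other metacharacters outright, so an attacker-controlled argument never reaches any code path that interprets it.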
c25e3ccc76 Fix variable quoting in disk iteration loops
Replace unsafe for loops with properly quoted while loops when
iterating over disk devices. This prevents word splitting issues
with device names containing special characters.

#17

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-05 10:44:43 -05:00
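
The difference between the unquoted for-loop and the quoted while-read loop can be shown with a name containing a space and a glob character (contrived device names, for illustration only):

```shell
# Two lines, one with a space and one with a glob character.
list=$'disk one\ndisk*two'

# while IFS= read -r delivers each line intact: no word splitting,
# no pathname expansion, backslashes preserved (-r).
while IFS= read -r item; do
    printf '[%s]\n' "$item"
done <<< "$list"
# An unquoted `for item in $list` would instead split on whitespace and
# glob-expand 'disk*two' against files in the current directory.
```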

@@ -2,6 +2,37 @@
 VERSION="1.1.0"
+
+###################
+# Timeout Configuration
+###################
+readonly CMD_TIMEOUT=30  # Default timeout in seconds for external commands
+
+###################
+# Logging Configuration
+###################
+# Optional log file - set via environment variable PROXDOC_LOGFILE
+LOGFILE="${PROXDOC_LOGFILE:-}"
+
+###################
+# Cached Data
+###################
+# Disk list cache - populated on first use
+DISK_LIST=""
+# Unit files cache - populated on first use
+UNIT_FILES=""
+
+###################
+# Pattern Constants
+###################
+# Virtual/firewall interface patterns to skip
+readonly VIRTUAL_IFACE_PATTERN="^(veth|fwbr|fwln|fwpr|tap)"
+# Storage controller patterns for HBA detection
+readonly STORAGE_CONTROLLER_PATTERN="RAID|SAS|SATA|SCSI|Mass storage|Serial Attached|Fibre Channel|NVMe"
+# Disk device patterns
+readonly DISK_DEVICE_PATTERN="^sd|^nvme"
+# PCI devices to exclude from hardware info
+readonly EXCLUDED_PCI_PATTERN="Host bridge|PCI bridge|ISA bridge|SMBus|IOMMU|Dummy|USB controller|Audio device|Encryption controller|Multimedia controller"
+
 ###################
 # Color Definitions
 ###################
@@ -31,18 +62,42 @@ print_header() {
 }

+# Error handling flags
+ERRORS_OCCURRED=0
+WARNINGS_OCCURRED=0
+
+cleanup() {
+    # Cleanup function called on exit
+    local exit_code=$?
+    if [[ $exit_code -ne 0 ]]; then
+        echo -e "\n${RED}Script terminated with exit code: $exit_code${NC}"
+    fi
+    # Add any cleanup tasks here (temp files, etc.)
+}
+
 handle_error() {
-    echo -e "${RED}Error: $1${NC}"
-    exit 1
+    local message="$1"
+    local fatal="${2:-true}"  # Default to fatal error
+
+    echo -e "${RED}Error: $message${NC}" >&2
+    ERRORS_OCCURRED=$((ERRORS_OCCURRED + 1))
+
+    if [[ "$fatal" == "true" ]]; then
+        exit 1
+    fi
 }

 log_message() {
-    local level=$1
-    local message=$2
-    case $level in
+    local level="$1"
+    local message="$2"
+    case "$level" in
         info) echo -e "${GREEN}[INFO]${NC} $message" ;;
-        warn) echo -e "${YELLOW}[WARN]${NC} $message" ;;
-        error) echo -e "${RED}[ERROR]${NC} $message" ;;
+        warn)
+            echo -e "${YELLOW}[WARN]${NC} $message"
+            WARNINGS_OCCURRED=$((WARNINGS_OCCURRED + 1))
+            ;;
+        error)
+            echo -e "${RED}[ERROR]${NC} $message" >&2
+            ERRORS_OCCURRED=$((ERRORS_OCCURRED + 1))
+            ;;
     esac
 }
@@ -67,6 +122,46 @@ checkIfOnHypervisor() {
     command -v pveversion >/dev/null 2>&1
 }

+# Get disk list with caching to avoid multiple lsblk calls
+get_disk_list() {
+    if [[ -z "$DISK_LIST" ]]; then
+        DISK_LIST=$(lsblk -d -o name 2>/dev/null | grep -E "$DISK_DEVICE_PATTERN")
+    fi
+    echo "$DISK_LIST"
+}
+
+# Get systemctl unit files with caching
+get_unit_files() {
+    if [[ -z "$UNIT_FILES" ]]; then
+        UNIT_FILES=$(systemctl list-unit-files 2>/dev/null)
+    fi
+    echo "$UNIT_FILES"
+}
+
+# Check if a unit file exists (uses cached data)
+unit_file_exists() {
+    local unit_name="$1"
+    get_unit_files | grep -q "$unit_name"
+}
+
+# Efficient process wait with optional spinner
+# Usage: wait_for_process $pid [delay]
+# Uses kill -0 instead of ps -p for efficiency
+wait_for_process() {
+    local pid="$1"
+    local delay="${2:-0.1}"
+    local spinner='|/-\'
+    local i=0
+
+    while kill -0 "$pid" 2>/dev/null; do
+        printf "\r%c " "${spinner:i++%${#spinner}:1}"
+        sleep "$delay"
+    done
+    printf "\r \r"  # Clear spinner
+
+    wait "$pid"
+    return $?
+}
+
 ###################
 # System Information Functions
 ###################
@@ -102,35 +197,40 @@ get_temp_info() {
 get_disk_health() {
     echo -e "\n${GREEN}=== Disk Health Status ===${NC}"
     if command -v smartctl >/dev/null 2>&1; then
-        for disk in $(lsblk -d -o name | grep -E '^sd|^nvme'); do
+        while IFS= read -r disk; do
+            [[ -z "$disk" ]] && continue
             echo -e "\nChecking /dev/$disk:"
-            smartctl -H /dev/$disk
-        done
+            if ! timeout $CMD_TIMEOUT smartctl -H "/dev/$disk"; then
+                log_message warn "smartctl timed out or failed for /dev/$disk"
+            fi
+        done <<< "$(get_disk_list)"
     else
         log_message warn "smartctl not found. Install smartmontools for disk health monitoring"
     fi
 }

 get_cpu_info() {
-    cpu_info=$(grep -m 1 -w 'model name' /proc/cpuinfo | awk -F: '{print $2}' | xargs) || {
-        echo -e "${RED}Failed to retrieve CPU model information.${NC}"
-    }
-    cpu_cores=$(lscpu | grep '^CPU(s):' | awk '{print $2}')
-    cpu_mhz=$(lscpu | grep 'MHz' | awk '{print $4}')
-    echo -e "${GREEN}CPU Model:${NC} $cpu_info"
-    echo -e "${GREEN}CPU Cores:${NC} $cpu_cores"
-    echo -e "${GREEN}CPU MHz:${NC} $cpu_mhz"
+    local cpu_info cpu_cores cpu_mhz
+
+    cpu_info=$(grep -m 1 -w 'model name' /proc/cpuinfo 2>/dev/null | awk -F: '{print $2}' | xargs)
+    cpu_cores=$(lscpu 2>/dev/null | grep '^CPU(s):' | awk '{print $2}')
+    cpu_mhz=$(lscpu 2>/dev/null | grep 'MHz' | awk '{print $4}')
+
+    echo -e "${GREEN}CPU Model:${NC} ${cpu_info:-Unknown}"
+    echo -e "${GREEN}CPU Cores:${NC} ${cpu_cores:-Unknown}"
+    echo -e "${GREEN}CPU MHz:${NC} ${cpu_mhz:-Unknown}"
 }

 get_ram_info() {
-    ram_total=$(free -h | grep 'Mem:' | awk '{print $2}')
-    ram_used=$(free -h | grep 'Mem:' | awk '{print $3}')
-    ram_free=$(free -h | grep 'Mem:' | awk '{print $4}')
-    echo -e "${GREEN}Total RAM:${NC} $ram_total"
-    echo -e "${GREEN}Used RAM:${NC} $ram_used"
-    echo -e "${GREEN}Free RAM:${NC} $ram_free"
+    local ram_total ram_used ram_free
+
+    ram_total=$(free -h 2>/dev/null | grep 'Mem:' | awk '{print $2}')
+    ram_used=$(free -h 2>/dev/null | grep 'Mem:' | awk '{print $3}')
+    ram_free=$(free -h 2>/dev/null | grep 'Mem:' | awk '{print $4}')
+
+    echo -e "${GREEN}Total RAM:${NC} ${ram_total:-Unknown}"
+    echo -e "${GREEN}Used RAM:${NC} ${ram_used:-Unknown}"
+    echo -e "${GREEN}Free RAM:${NC} ${ram_free:-Unknown}"
 }
@@ -144,10 +244,13 @@ get_storage_info() {
 }

 get_network_info() {
-    default_gateway=$(ip route | grep default | awk '{print $3}')
-    ip_addresses=$(hostname -I | xargs)
-    echo -e "${GREEN}Default Gateway:${NC} $default_gateway"
-    echo -e "${GREEN}IP Addresses:${NC} $ip_addresses"
+    local default_gateway ip_addresses
+
+    default_gateway=$(ip route 2>/dev/null | grep default | awk '{print $3}')
+    ip_addresses=$(hostname -I 2>/dev/null | xargs)
+
+    echo -e "${GREEN}Default Gateway:${NC} ${default_gateway:-Not configured}"
+    echo -e "${GREEN}IP Addresses:${NC} ${ip_addresses:-None detected}"
 }

 get_detailed_network() {
@@ -173,7 +276,7 @@ get_hardware_info() {
     echo -e "${GREEN}BIOS Version:${NC} $(dmidecode -s bios-version)"
     echo -e "\n${GREEN}=== PCI Devices ===${NC}"
     # Show interesting devices, exclude bridges, infrastructure, and integrated motherboard devices
-    lspci | grep -v -E "Host bridge|PCI bridge|ISA bridge|SMBus|IOMMU|Dummy|USB controller|Audio device|Encryption controller|Multimedia controller"
+    lspci | grep -v -E "$EXCLUDED_PCI_PATTERN"
 }

 get_motherboard_info() {
@@ -350,7 +453,7 @@ get_physical_interfaces() {
         [[ "$iface" == "lo" ]] && continue
         # Skip virtual/firewall interfaces
-        [[ "$iface" =~ ^(veth|fwbr|fwln|fwpr|tap) ]] && continue
+        [[ "$iface" =~ $VIRTUAL_IFACE_PATTERN ]] && continue
         # This is a physical interface
         echo "$iface"
@@ -361,9 +464,9 @@ get_hba_info() {
     echo -e "\n${GREEN}=== HBA/Storage Controller Information ===${NC}"
     # Find RAID, SAS, SATA, SCSI, and storage controllers
-    lspci -vmm 2>/dev/null | awk '
+    lspci -vmm 2>/dev/null | awk -v pattern="$STORAGE_CONTROLLER_PATTERN" '
     BEGIN { RS=""; FS="\n" }
-    /RAID|SAS|SATA|SCSI|Mass storage|Serial Attached|Fibre Channel|NVMe/ {
+    $0 ~ pattern {
         for (i=1; i<=NF; i++) {
             if ($i ~ /^Slot:/) slot = substr($i, 7)
             if ($i ~ /^Class:/) class = substr($i, 8)
@@ -382,7 +485,7 @@ get_hba_info() {
     # Show detailed info for storage controllers
     echo -e "\n${GREEN}=== Storage Controller Details ===${NC}"
-    for ctrl in $(lspci | grep -iE "RAID|SAS|SATA|SCSI|Mass storage|NVMe" | awk '{print $1}'); do
+    for ctrl in $(lspci | grep -iE "$STORAGE_CONTROLLER_PATTERN" | awk '{print $1}'); do
         echo -e "\n${GREEN}Controller $ctrl:${NC}"
         lspci -vvs "$ctrl" 2>/dev/null | grep -E "^\s+(Subsystem|LnkSta|Kernel driver)" | head -5
     done
@@ -418,16 +521,16 @@ get_ceph_health() {
     echo -e "\n${GREEN}=== Ceph Cluster Health ===${NC}"
     if command -v ceph >/dev/null 2>&1; then
         echo -e "${GREEN}Health Status:${NC}"
-        ceph health detail 2>/dev/null || log_message warn "Cannot connect to Ceph cluster"
+        timeout $CMD_TIMEOUT ceph health detail 2>/dev/null || log_message warn "Cannot connect to Ceph cluster or timed out"

         echo -e "\n${GREEN}=== Ceph OSD Tree ===${NC}"
-        ceph osd tree 2>/dev/null || true
+        timeout $CMD_TIMEOUT ceph osd tree 2>/dev/null || log_message warn "Ceph OSD tree timed out"

         echo -e "\n${GREEN}=== Ceph Pool Usage ===${NC}"
-        ceph df 2>/dev/null || true
+        timeout $CMD_TIMEOUT ceph df 2>/dev/null || log_message warn "Ceph df timed out"

         echo -e "\n${GREEN}=== Ceph OSD Usage ===${NC}"
-        ceph osd df 2>/dev/null || true
+        timeout $CMD_TIMEOUT ceph osd df 2>/dev/null || log_message warn "Ceph OSD df timed out"
     else
         log_message info "Ceph tools not installed on this node"
     fi
@@ -444,7 +547,7 @@ get_node_exporter_status() {
         else
             log_message warn "Port 9100 not listening"
         fi
-    elif systemctl list-unit-files 2>/dev/null | grep -q node_exporter; then
+    elif unit_file_exists node_exporter; then
         log_message warn "Node Exporter is installed but not running"
         echo -e "Start with: systemctl start node_exporter"
     else
@@ -459,7 +562,7 @@ get_hwmon_status() {
         systemctl list-timers hwmon.timer --no-pager 2>/dev/null
         echo -e "\n${GREEN}Last Run:${NC}"
         journalctl -u hwmon.service -n 3 --no-pager 2>/dev/null || true
-    elif systemctl list-unit-files 2>/dev/null | grep -q hwmon.timer; then
+    elif unit_file_exists hwmon.timer; then
         log_message warn "hwmon timer is installed but not active"
         echo -e "Enable with: systemctl enable --now hwmon.timer"
     else
@@ -467,6 +570,51 @@ get_hwmon_status() {
     fi
 }

+# Valid check names for selective mode
+readonly VALID_CHECKS="cpu ram memory storage disk network hardware temps services ceph vms containers"
+
+run_selective_checks() {
+    local checks="$1"
+
+    if [[ -z "$checks" ]]; then
+        log_message error "No checks specified. Use --checks=cpu,ram,disk"
+        echo "Valid checks: $VALID_CHECKS"
+        exit 1
+    fi
+
+    # Validate check names
+    IFS=',' read -ra check_array <<< "$checks"
+    for check in "${check_array[@]}"; do
+        if [[ ! " $VALID_CHECKS " =~ " $check " ]]; then
+            log_message error "Unknown check: $check"
+            echo "Valid checks: $VALID_CHECKS"
+            exit 1
+        fi
+    done
+
+    log_message info "Running selective checks: $checks"
+    echo ""
+
+    for check in "${check_array[@]}"; do
+        case "$check" in
+            cpu) log_message info "Checking CPU..."; get_cpu_info ;;
+            ram) log_message info "Checking RAM..."; get_ram_info ;;
+            memory) log_message info "Checking memory details..."; get_memory_details ;;
+            storage) log_message info "Checking storage..."; get_storage_info ;;
+            disk) log_message info "Checking disk health..."; get_disk_health ;;
+            network) log_message info "Checking network..."; get_network_info; get_detailed_network; get_nic_details ;;
+            hardware) log_message info "Checking hardware..."; get_hardware_info; get_motherboard_info; get_hba_info ;;
+            temps) log_message info "Checking temperatures..."; get_temp_info ;;
+            services) log_message info "Checking services..."; check_services ;;
+            ceph) log_message info "Checking Ceph..."; get_ceph_health ;;
+            vms) log_message info "Checking VMs..."; list_vms ;;
+            containers) log_message info "Checking containers..."; list_containers ;;
+        esac
+    done
+
+    echo ""
+    log_message info "Selective checks complete"
+}
+
 quick_health_check() {
     echo -e "\n${GREEN}=== Quick Health Check ===${NC}"
     echo -e "Running quick health assessment...\n"
@@ -480,12 +628,15 @@ quick_health_check() {
     # Disk health (quick)
     echo -e "\n${GREEN}=== Disk Health Summary ===${NC}"
     if command -v smartctl >/dev/null 2>&1; then
-        for disk in $(lsblk -d -o name | grep -E '^sd|^nvme'); do
-            health=$(smartctl -H /dev/$disk 2>/dev/null | grep -i "health" | awk -F: '{print $2}' | xargs)
+        while IFS= read -r disk; do
+            [[ -z "$disk" ]] && continue
+            health=$(timeout $CMD_TIMEOUT smartctl -H "/dev/$disk" 2>/dev/null | grep -i "health" | awk -F: '{print $2}' | xargs)
             if [[ -n "$health" ]]; then
                 echo -e "/dev/$disk: $health"
+            else
+                echo -e "/dev/$disk: ${YELLOW}check timed out or unavailable${NC}"
             fi
-        done
+        done <<< "$(get_disk_list)"
     fi

     # Node Exporter
@@ -574,6 +725,31 @@ help() {
     echo " --vm-list Check VM vitals"
     echo " --ct-list Check container vitals"
     echo " --backup Review backup health"
+    echo " --checks=LIST Run only specific checks (comma-separated)"
+    echo ""
+    echo "Valid checks for --checks option:"
+    echo "  cpu, ram, memory, storage, disk, network, hardware, temps,"
+    echo "  services, ceph, vms, containers"
+    echo ""
+    echo "Examples:"
+    echo "  Run full diagnostics:"
+    echo "    curl -sL \"http://10.10.10.63:3000/LotusGuild/proxDoc/raw/branch/main/proxDoc.sh\" | bash -s -- --diags"
+    echo ""
+    echo "  Quick health check:"
+    echo "    curl -sL \"http://10.10.10.63:3000/LotusGuild/proxDoc/raw/branch/main/proxDoc.sh\" | bash -s -- --quick"
+    echo ""
+    echo "  Check only services and VMs:"
+    echo "    curl -sL \"http://10.10.10.63:3000/LotusGuild/proxDoc/raw/branch/main/proxDoc.sh\" | bash -s -- --services"
+    echo "    curl -sL \"http://10.10.10.63:3000/LotusGuild/proxDoc/raw/branch/main/proxDoc.sh\" | bash -s -- --vm-list"
+    echo ""
+    echo "  View drive bay mapping:"
+    echo "    curl -sL \"http://10.10.10.63:3000/LotusGuild/proxDoc/raw/branch/main/proxDoc.sh\" | bash -s -- --drives"
+    echo ""
+    echo "  Check Ceph cluster health:"
+    echo "    curl -sL \"http://10.10.10.63:3000/LotusGuild/proxDoc/raw/branch/main/proxDoc.sh\" | bash -s -- --ceph"
+    echo ""
+    echo "  Run only CPU and RAM checks:"
+    echo "    curl -sL \"http://10.10.10.63:3000/LotusGuild/proxDoc/raw/branch/main/proxDoc.sh\" | bash -s -- --checks=cpu,ram"
     exit 0
 }
@@ -583,7 +759,28 @@ help() {
 ###################
 # Main Functions
 ###################

+# Setup logging if LOGFILE is specified
+setup_logging() {
+    if [[ -n "$LOGFILE" ]]; then
+        # Create log directory if needed
+        local log_dir
+        log_dir=$(dirname "$LOGFILE")
+        if [[ ! -d "$log_dir" ]]; then
+            mkdir -p "$log_dir" 2>/dev/null || {
+                log_message warn "Cannot create log directory: $log_dir"
+                LOGFILE=""
+                return
+            }
+        fi
+        log_message info "Logging output to: $LOGFILE"
+        # Redirect stdout and stderr to tee (no subshell overhead)
+        exec > >(tee -a "$LOGFILE") 2>&1
+    fi
+}
+
 runDiags() {
+    setup_logging
     log_message info "Beginning system examination..."

     # Check if running on Proxmox
@@ -649,10 +846,49 @@ runDiags() {
     echo ""
     log_message info "Examination complete"
+
+    # Print summary if there were issues
+    if [[ $WARNINGS_OCCURRED -gt 0 || $ERRORS_OCCURRED -gt 0 ]]; then
+        echo -e "\n${YELLOW}=== Summary ===${NC}"
+        [[ $WARNINGS_OCCURRED -gt 0 ]] && echo -e "Warnings: $WARNINGS_OCCURRED"
+        [[ $ERRORS_OCCURRED -gt 0 ]] && echo -e "Errors: $ERRORS_OCCURRED"
+    fi
 }

+# Whitelist of valid command options
+readonly VALID_OPTIONS="--help --diags --quick --drives --ceph --node-exporter --hwmon --services --vm-list --ct-list --backup --checks"
+
+validate_input() {
+    local input="$1"
+    # Check if input matches valid option pattern (starts with -- and contains only alphanumeric, hyphens, equals, commas)
+    if [[ ! "$input" =~ ^--[a-z][-a-z=,]*$ ]]; then
+        return 1
+    fi
+    # Extract the option name (before any = sign)
+    local opt_name="${input%%=*}"
+    # Check against whitelist
+    if [[ ! " $VALID_OPTIONS " =~ " $opt_name " ]]; then
+        return 1
+    fi
+    return 0
+}
+
 checkForInput() {
-    case $1 in
+    local input="$1"
+
+    # Validate input against whitelist
+    if ! validate_input "$input"; then
+        echo -e "${RED}Invalid option: $input${NC}"
+        echo -e "Use --help to see available options."
+        exit 1
+    fi
+
+    # Extract option name and value for --checks=X pattern
+    local opt_name="${input%%=*}"
+    local opt_value="${input#*=}"
+    [[ "$opt_name" == "$opt_value" ]] && opt_value=""
+
+    case "$opt_name" in
         --help) help ;;
         --diags) check_requirements; runDiags ;;
         --quick) quick_health_check ;;
@@ -664,7 +900,7 @@ checkForInput() {
         --vm-list) list_vms ;;
         --ct-list) list_containers ;;
         --backup) echo -e "${GREEN}Backup Status:${NC}"; pvesm status 2>/dev/null || log_message warn "pvesm not available" ;;
-        *) echo -e "${RED}Invalid option: $1${NC}"; help ;;
+        --checks) run_selective_checks "$opt_value" ;;
     esac
 }
@@ -682,8 +918,10 @@ if [[ $EUID -ne 0 ]]; then
     handle_error "This script must be run as root"
 fi

-# Set trap for interrupts
-trap 'echo -e "${RED}Script interrupted.${NC}"; exit 1' INT TERM
+# Set trap for cleanup and interrupts
+trap cleanup EXIT
+trap 'echo -e "\n${RED}Script interrupted by user.${NC}"; exit 130' INT
+trap 'echo -e "\n${RED}Script terminated.${NC}"; exit 143' TERM

 if [[ -n $argOne ]]; then
     checkForInput "$argOne"