From 575c60b1face81efc4b81dbe2cb94761e7a43f38 Mon Sep 17 00:00:00 2001 From: Jared Vititoe Date: Sat, 24 Jan 2026 17:18:53 -0500 Subject: [PATCH] Update to v1.1.0: Add interactive menu, DriveAtlas, and monitoring integrations - Add interactive numbered menu when run without arguments - Add DriveAtlas integration (--drives) for physical drive bay mapping - Add Ceph cluster health monitoring (--ceph) - Add Node Exporter status check (--node-exporter) - Add hwmon daemon status check (--hwmon) - Add quick health check mode (--quick) - Add container list option (--ct-list) - Full diagnostics now includes all monitoring checks - Update README with new features and changelog Co-Authored-By: Claude Opus 4.5 --- README.md | 63 ++++++++-- proxDoc.sh | 354 +++++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 369 insertions(+), 48 deletions(-) diff --git a/README.md b/README.md index 47aac58..4e7ce32 100644 --- a/README.md +++ b/README.md @@ -5,11 +5,21 @@ ProxDoc is a comprehensive diagnostic tool for Proxmox server health monitoring ## Features - Complete system diagnostics and health checks +- **Interactive menu system** when run without arguments +- **DriveAtlas integration** for physical drive bay mapping +- **Ceph cluster health** monitoring (OSD tree, pool usage, disk usage) +- **Node Exporter status** check for Prometheus monitoring +- **hwmon daemon status** for hardware monitoring +- **Quick health check** mode for fast assessments - Temperature monitoring -- Disk health status +- Disk health status via SMART - CPU and RAM information +- Detailed memory DIMM information (slots, speed, manufacturer) +- Motherboard and system information - Storage information including ZFS pools -- Network diagnostics +- Network diagnostics with ethtool integration +- Detailed NIC information (driver, firmware, link speed) +- HBA/Storage controller detection and details - Hardware information - Service status monitoring - VM and Container status @@ -23,7 +33,11 @@ The 
script requires the following tools to be installed: - ip - smartctl - sensors -- netstat +- lspci + +Optional tools for enhanced diagnostics: +- ethtool (for detailed NIC information including link speed and firmware) +- netstat (for network statistics) ## Usage @@ -38,22 +52,39 @@ curl -sL "http://10.10.10.63:3000/LotusGuild/proxDoc/raw/branch/main/proxDoc.sh" - `--help`: Show the help guide - `--diags`: Perform full system examination -- `--connect`: Connect to a remote Proxmox host -- `--services`: Check vital services +- `--quick`: Quick health check (services, temps, disks) +- `--drives`: Show physical drive bay mapping (DriveAtlas) +- `--ceph`: Check Ceph cluster health +- `--node-exporter`: Check Node Exporter status +- `--hwmon`: Check hwmon daemon status +- `--services`: Check vital Proxmox services - `--vm-list`: Check VM vitals +- `--ct-list`: Check container vitals - `--backup`: Review backup health +- `--connect`: Connect to a remote Proxmox host - `--save`: Save examination results to a log file +### Interactive Mode + +Run without arguments to display an interactive menu: + +```bash +./proxDoc.sh +``` + ## Output Information The script provides detailed information about: - System information and Proxmox version - CPU model, cores, and frequency -- RAM usage +- RAM usage and detailed DIMM information +- Motherboard manufacturer, model, and serial number - Storage status - Disk health - Network configuration -- Hardware details +- Detailed NIC information (driver, MAC, speed, firmware) +- HBA/Storage controller details +- Hardware details including PCI devices - System load - Service status - Recent system errors @@ -75,4 +106,20 @@ chmod +x proxdoc.sh ## Version -Current Version: 1.0.0 \ No newline at end of file +Current Version: 1.1.0 + +### Changelog + +#### v1.1.0 +- Added interactive menu system when run without arguments +- Added DriveAtlas integration (`--drives`) for physical drive bay mapping +- Added Ceph cluster health monitoring 
(`--ceph`)
+- Added Node Exporter status check (`--node-exporter`)
+- Added hwmon daemon status check (`--hwmon`)
+- Added quick health check mode (`--quick`)
+- Added container list option (`--ct-list`)
+- Full diagnostics now include all monitoring checks
+- Improved error handling with graceful fallbacks
+
+#### v1.0.0
+- Initial release
\ No newline at end of file
diff --git a/proxDoc.sh b/proxDoc.sh
index 23bf193..b9a7ff1 100755
--- a/proxDoc.sh
+++ b/proxDoc.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-VERSION="1.0.0"
+VERSION="1.1.0"
 SPINNER="/-\|"
 
 ###################
@@ -63,7 +63,7 @@ show_progress() {
 check_requirements() {
     log_message info "Checking medical equipment..."
 
-    local tools=("dmidecode" "lscpu" "ip" "smartctl" "sensors" "netstat")
+    local tools=("dmidecode" "lscpu" "ip" "smartctl" "sensors" "lspci")
     for tool in "${tools[@]}"; do
         if ! command -v "$tool" >/dev/null 2>&1; then
             handle_error "Required instrument '$tool' is missing"
@@ -151,10 +151,16 @@ get_network_info() {
 }
 
 get_detailed_network() {
-    echo -e "\n${GREEN}=== Network Interface Details ===${NC}"
+    echo -e "\n${GREEN}=== Network Interface Statistics ===${NC}"
     ip -s link show
     echo -e "\n${GREEN}=== Network Statistics ===${NC}"
-    netstat -i
+    if command -v netstat >/dev/null 2>&1; then
+        netstat -i
+    elif command -v ss >/dev/null 2>&1; then
+        ss -s
+    else
+        log_message warn "netstat/ss not found for network statistics"
+    fi
 }
 
 get_hardware_info() {
@@ -163,6 +169,132 @@ get_hardware_info() {
     lspci | grep -i -E "vga|ethernet|raid"
 }
 
+get_motherboard_info() {
+    echo -e "\n${GREEN}=== Motherboard Information ===${NC}"
+    echo -e "${GREEN}Manufacturer:${NC} $(dmidecode -s baseboard-manufacturer)"
+    echo -e "${GREEN}Product Name:${NC} $(dmidecode -s baseboard-product-name)"
+    echo -e "${GREEN}Version:${NC} $(dmidecode -s baseboard-version)"
+    echo -e "${GREEN}Serial Number:${NC} $(dmidecode -s baseboard-serial-number)"
+    echo -e "${GREEN}System Manufacturer:${NC} $(dmidecode -s system-manufacturer)"
+    echo -e "${GREEN}System Product:${NC} $(dmidecode -s system-product-name)"
+    echo -e "${GREEN}System Serial:${NC} $(dmidecode -s system-serial-number)"
+}
+
+# Print a table of populated DIMMs (locator, size, type, speed, manufacturer)
+# and a slot summary, all parsed from a single `dmidecode -t memory` run.
+get_memory_details() {
+    echo -e "\n${GREEN}=== Memory DIMM Information ===${NC}"
+    local mem_dump
+    mem_dump=$(dmidecode -t memory)
+
+    # dmidecode prints Locator before Type/Speed/Manufacturer, so a row is
+    # emitted only when the whole "Memory Device" record has been read (blank
+    # line or end of input). Keys are compared exactly so "Bank Locator:",
+    # "Volatile Size:" or "Configured Memory Speed:" cannot clobber a field.
+    awk '
+        function emit_dimm() {
+            if (in_dev && size != "") {
+                printf "%-12s %-10s %-8s %-12s %-20s\n", loc, size, type, speed, mfr
+            }
+            in_dev = 0; loc = ""; size = ""; type = ""; speed = ""; mfr = ""
+        }
+        /^Memory Device/ { emit_dimm(); in_dev = 1; next }
+        /^$/ { emit_dimm(); next }
+        !in_dev { next }
+        $1 == "Size:" && !/No Module Installed/ { size = $2 " " $3 }
+        $1 == "Locator:" { loc = $2 }
+        $1 == "Type:" && $2 != "Unknown" { type = $2 }
+        $1 == "Speed:" && $2 != "Unknown" { speed = $2 " " $3 }
+        $1 == "Manufacturer:" && $2 != "Unknown" { mfr = $2 }
+        END { emit_dimm() }
+    ' <<<"$mem_dump"
+
+    echo -e "\n${GREEN}Memory Summary:${NC}"
+    echo -e " Total Slots: $(grep -c '^Memory Device' <<<"$mem_dump")"
+    echo -e " Populated: $(grep -E '^[[:space:]]+Size:' <<<"$mem_dump" | grep -cv "No Module")"
+    echo -e " Max Capacity: $(grep "Maximum Capacity" <<<"$mem_dump" | head -1 | awk '{print $3" "$4}')"
+}
+
+# Print driver, MAC, link state and (when ethtool is installed) speed/duplex
+# and firmware for every network interface except the loopback device.
+get_nic_details() {
+    echo -e "\n${GREEN}=== Network Interface Details ===${NC}"
+    local iface_path iface driver link_info fw_ver
+    # Glob sysfs rather than parsing ls: the old "grep -v lo" filter also
+    # dropped any interface whose name merely contains "lo" (e.g. wlo1).
+    for iface_path in /sys/class/net/*; do
+        # Interfaces are directories; plain files (bonding_masters) are not.
+        [ -d "$iface_path" ] || continue
+        iface=${iface_path##*/}
+        [ "$iface" = "lo" ] && continue
+        echo -e "\n${GREEN}Interface: $iface${NC}"
+
+        # Get driver info
+        if [ -L "/sys/class/net/$iface/device/driver" ]; then
+            driver=$(basename "$(readlink "/sys/class/net/$iface/device/driver")")
+            echo -e " Driver: $driver"
+        fi
+
+        # Get MAC address
+        if [ -f "/sys/class/net/$iface/address" ]; then
+            echo -e " MAC: $(cat "/sys/class/net/$iface/address")"
+        fi
+
+        # Get link state
+        if [ -f "/sys/class/net/$iface/operstate" ]; then
+            echo -e " State: $(cat "/sys/class/net/$iface/operstate")"
+        fi
+
+        # Use ethtool if available
+        if command -v ethtool >/dev/null 2>&1; then
+            # Get speed and duplex
+            link_info=$(ethtool "$iface" 2>/dev/null | grep -E "Speed:|Duplex:|Link detected:")
+            if [ -n "$link_info" ]; then
+                echo "$link_info" | while 
read -r line; do
+                    echo -e " $line"
+                done
+            fi
+
+            # Get firmware version
+            fw_ver=$(ethtool -i "$iface" 2>/dev/null | grep "firmware-version" | awk '{print $2}')
+            if [ -n "$fw_ver" ]; then
+                echo -e " Firmware: $fw_ver"
+            fi
+        fi
+    done
+}
+
+get_hba_info() {
+    echo -e "\n${GREEN}=== HBA/Storage Controller Information ===${NC}"
+
+    # Find RAID, SAS, SCSI, and storage controllers
+    lspci -vmm 2>/dev/null | awk '
+    BEGIN { RS=""; FS="\n" }
+    /RAID|SAS|SCSI|Mass storage|Serial Attached|Fibre Channel|NVMe/ {
+        for (i=1; i<=NF; i++) {
+            if ($i ~ /^Slot:/) slot = substr($i, 7)
+            if ($i ~ /^Class:/) class = substr($i, 8)
+            if ($i ~ /^Vendor:/) vendor = substr($i, 9)
+            if ($i ~ /^Device:/) device = substr($i, 9)
+            if ($i ~ /^Rev:/) rev = substr($i, 6)
+        }
+        printf "\n%s\n", slot
+        printf " Class: %s\n", class
+        printf " Vendor: %s\n", vendor
+        printf " Device: %s\n", device
+        if (rev) printf " Rev: %s\n", rev
+        slot=""; class=""; vendor=""; device=""; rev=""
+    }
+    '
+
+    # Show detailed info for storage controllers
+    echo -e "\n${GREEN}=== Storage Controller Details ===${NC}"
+    for ctrl in $(lspci | grep -iE "RAID|SAS|SCSI|Mass storage|NVMe" | awk '{print $1}'); do
+        echo -e "\n${GREEN}Controller $ctrl:${NC}"
+        lspci -vvs "$ctrl" 2>/dev/null | grep -E "^\s+(Subsystem|LnkSta|Kernel driver)" | head -5
+    done
+}
+
 get_system_status() {
     echo -e "\n${GREEN}=== System Load ===${NC}"
     uptime
@@ -174,6 +296,126 @@ get_system_status() {
     journalctl -p err -n 5 --no-pager
 }
 
+###################
+# DriveAtlas & Monitoring Functions
+###################
+
+# Download the DriveAtlas script and run it with bash to print the physical
+# drive bay mapping. Every failure is reported as a warning only.
+# NOTE(review): the script is fetched over plain HTTP and executed as root;
+# consider HTTPS or pinning a checksum.
+get_drive_atlas() {
+    echo -e "\n${GREEN}=== Drive Atlas - Physical Bay Mapping ===${NC}"
+    if ! command -v curl >/dev/null 2>&1; then
+        log_message warn "curl not installed - cannot fetch DriveAtlas"
+        return 0
+    fi
+
+    local url="http://10.10.10.63:3000/LotusGuild/driveAtlas/raw/branch/main/driveAtlas.sh"
+    local script_file
+    if ! script_file=$(mktemp); then
+        log_message warn "DriveAtlas failed to execute or server unavailable"
+        return 0
+    fi
+
+    # "bash <(curl ...)" hid download failures: when curl could not fetch the
+    # script, bash received an empty file, exited 0 and the warning never fired.
+    # Fetch to a file first; -f turns HTTP errors into a non-zero exit status.
+    if ! curl -fsL --connect-timeout 10 -o "$script_file" "$url" 2>/dev/null \
+        || [[ ! -s "$script_file" ]] \
+        || ! bash "$script_file" 2>/dev/null; then
+        log_message warn "DriveAtlas failed to execute or server unavailable"
+    fi
+    rm -f -- "$script_file"
+}
+
+get_ceph_health() {
+    echo -e "\n${GREEN}=== Ceph Cluster Health ===${NC}"
+    if command -v ceph >/dev/null 2>&1; then
+        echo -e "${GREEN}Health Status:${NC}"
+        ceph health detail 2>/dev/null || log_message warn "Cannot connect to Ceph cluster"
+
+        echo -e "\n${GREEN}=== Ceph OSD Tree ===${NC}"
+        ceph osd tree 2>/dev/null || true
+
+        echo -e "\n${GREEN}=== Ceph Pool Usage ===${NC}"
+        ceph df 2>/dev/null || true
+
+        echo -e "\n${GREEN}=== Ceph OSD Usage ===${NC}"
+        ceph osd df 2>/dev/null || true
+    else
+        log_message info "Ceph tools not installed on this node"
+    fi
+}
+
+get_node_exporter_status() {
+    echo -e "\n${GREEN}=== Node Exporter Status ===${NC}"
+    if systemctl is-active --quiet node_exporter 2>/dev/null; then
+        echo -e "${GREEN}Service:${NC} Running"
+        local ip=$(hostname -I | awk '{print $1}')
+        echo -e "${GREEN}Metrics URL:${NC} http://${ip}:9100/metrics"
+        if ss -tlnp 2>/dev/null | grep -q ':9100'; then
+            echo -e "${GREEN}Port 9100:${NC} Listening"
+        else
+            log_message warn "Port 9100 not listening"
+        fi
+    elif systemctl list-unit-files 2>/dev/null | grep -q node_exporter; then
+        log_message warn "Node Exporter is installed but not running"
+        echo -e "Start with: systemctl start node_exporter"
+    else
+        log_message info "Node Exporter not installed"
+    fi
+}
+
+get_hwmon_status() {
+    echo -e "\n${GREEN}=== hwmon Daemon Status ===${NC}"
+    if systemctl is-active --quiet hwmon.timer 2>/dev/null; then
+        echo -e "${GREEN}Timer:${NC} Active"
+        systemctl list-timers hwmon.timer --no-pager 2>/dev/null
+        echo -e "\n${GREEN}Last Run:${NC}"
+        journalctl -u hwmon.service -n 3 --no-pager 2>/dev/null || true
+    elif systemctl list-unit-files 2>/dev/null | grep -q hwmon.timer; then
+        log_message warn "hwmon timer is installed but not active"
+        echo -e "Enable with: systemctl enable --now hwmon.timer"
+    else
+        log_message info "hwmon daemon not installed"
+    fi
+}
+
+# Fast triage: runs check_services and get_temp_info, prints the smartctl -H
+# verdict for each sd*/nvme* disk, then Node Exporter and one-line Ceph health.
+quick_health_check() {
+    echo -e "\n${GREEN}=== Quick Health Check ===${NC}"
+    echo -e "Running quick health assessment...\n"
+    local disk health
+
+    # Services
+    check_services
+
+    # Temperatures
+    get_temp_info
+
+    # Disk health (quick)
+    echo -e "\n${GREEN}=== Disk Health Summary ===${NC}"
+    if command -v smartctl >/dev/null 2>&1; then
+        for disk in $(lsblk -d -o name | grep -E '^sd|^nvme'); do
+            health=$(smartctl -H "/dev/$disk" 2>/dev/null | grep -i "health" | awk -F: '{print $2}' | xargs)
+            if [[ -n "$health" ]]; then
+                echo -e "/dev/$disk: $health"
+            fi
+        done
+    fi
+
+    # Node Exporter
+    get_node_exporter_status
+
+    # Ceph quick status
+    if command -v ceph >/dev/null 2>&1; then
+        echo -e "\n${GREEN}=== Ceph Quick Status ===${NC}"
+        ceph health 2>/dev/null || true
+    fi
+}
+
 ###################
 # Proxmox Specific Functions
 ###################
@@ -221,13 +442,21 @@ help() {
     echo "A comprehensive diagnostic tool for Proxmox server health checks."
     echo ""
     echo "Treatment Options:"
-    echo " --help Show this prescription guide"
-    echo " --diags Perform full system examination"
-    echo " --connect Make a house call to a remote Proxmox host"
-    echo " --services Check vital services"
-    echo " --vm-list Check VM vitals"
-    echo " --backup Review backup health"
-    echo " --save Save examination results to medical record"
+    echo " --help Show this prescription guide"
+    echo " --diags Perform full system examination"
+    echo " --quick Quick health check (services, temps, disks)"
+    echo " --drives Show physical drive bay mapping (DriveAtlas)"
+    echo " --ceph Check Ceph cluster health"
+    echo " --node-exporter Check Node Exporter status"
+    echo " --hwmon Check hwmon daemon status"
+    echo " --services Check vital Proxmox services"
+    echo " --vm-list Check VM vitals"
+    echo " --ct-list Check container vitals"
+    echo " --backup Review backup health"
+    echo " --connect Make a house call to a remote Proxmox host"
+    echo " --save Save examination results to medical record"
+    echo ""
+    echo "Interactive mode: Run without arguments for menu"
     exit 0
 }
 
@@ -242,6 +471,52 @@ connectToHost() {
     fi
 }
 
+# Print the numbered menu, read one choice from stdin and dispatch it.
+show_menu() {
+    local choice
+    echo ""
+    echo -e "${GREEN}Select a diagnostic option:${NC}"
+    echo ""
+    echo " 1) Full System Diagnostics"
+    echo " 2) Quick Health Check"
+    echo " 3) Drive Atlas (Physical Bay Mapping)"
+    echo " 4) Check Proxmox Services"
+    echo " 5) VM Status"
+    echo " 6) Container Status"
+    echo " 7) Ceph Cluster Health"
+    echo " 8) Node Exporter Status"
+    echo " 9) hwmon Daemon Status"
+    echo " 10) Backup Status"
+    echo " 11) Connect to Remote Host"
+    echo " 12) Save Full Report to File"
+    echo " 0) Exit"
+    echo ""
+    # read returns non-zero on EOF (closed or exhausted stdin). Without this
+    # exit the empty choice reaches the invalid-option arm, which calls
+    # show_menu again, and the menu recurses without end.
+    if ! read -rp "Enter choice [0-12]: " choice; then
+        echo ""
+        exit 1
+    fi
+
+    case $choice in
+        1) check_requirements; runDiags ;;
+        2) quick_health_check ;;
+        3) get_drive_atlas ;;
+        4) check_services ;;
+        5) list_vms ;;
+        6) list_containers ;;
+        7) get_ceph_health ;;
+        8) get_node_exporter_status ;;
+        9) get_hwmon_status ;;
+        10) echo -e "${GREEN}Backup 
Status:${NC}"; pvesm status 2>/dev/null || log_message warn "pvesm not available" ;;
+        11) connectToHost ;;
+        12) exec 1> >(tee "proxmox_diag_$(date '+%Y%m%d_%H%M%S').log"); check_requirements; runDiags ;; # same preflight as --save
+        0) echo "Goodbye!"; exit 0 ;;
+        *) echo -e "${RED}Invalid option${NC}"; show_menu ;;
+    esac
+}
+
 ###################
 # Main Functions
 ###################
@@ -251,13 +518,22 @@ runDiags() {
         get_system_info
         get_cpu_info
         get_ram_info
+        get_memory_details
         get_storage_info
         get_disk_health
+        get_drive_atlas
         get_network_info
         get_detailed_network
+        get_nic_details
         get_hardware_info
+        get_motherboard_info
+        get_hba_info
         get_temp_info
         get_system_status
+        get_node_exporter_status
+        get_hwmon_status
+        get_ceph_health
+        list_vms
         list_containers
     ) & show_progress $!
     log_message info "Examination complete"
@@ -266,44 +542,47 @@ runDiags() {
 
 checkForInput() {
     case $1 in
-        --help) help ;;
-        --diags) check_requirements; runDiags ;;
-        --connect) connectToHost ;;
-        --services) check_services ;;
-        --vm-list) list_vms ;;
-        --backup) echo -e "${GREEN}Backup Status:${NC}"; pvesm status ;;
-        --save) exec 1> >(tee "proxmox_diag_$(date '+%Y%m%d_%H%M%S').log"); runDiags ;;
-        *) echo -e "${RED}Invalid option: $1${NC}"; help ;;
+        --help) help ;;
+        --diags) check_requirements; runDiags ;;
+        --quick) quick_health_check ;;
+        --drives) get_drive_atlas ;;
+        --ceph) get_ceph_health ;;
+        --node-exporter) get_node_exporter_status ;;
+        --hwmon) get_hwmon_status ;;
+        --connect) connectToHost ;;
+        --services) check_services ;;
+        --vm-list) list_vms ;;
+        --ct-list) list_containers ;;
+        --backup) echo -e "${GREEN}Backup Status:${NC}"; pvesm status 2>/dev/null || log_message warn "pvesm not available" ;;
+        --save) exec 1> >(tee "proxmox_diag_$(date '+%Y%m%d_%H%M%S').log"); check_requirements; runDiags ;;
+        *) echo -e "${RED}Invalid option: $1${NC}"; help ;;
     esac
 }
 
-main() {
-    print_header
-
-    trap 'echo -e "${RED}Script interrupted.${NC}"; exit 1' INT TERM
-
-    if [[ $EUID -ne 0 ]]; then
-        handle_error "This script must be run as root"
-    fi
-
-    if checkIfOnHypervisor; then
-        runDiags
-    else
-        connectToHost
-    fi
-}
 
 ###################
 # Script Execution
 ###################
 
 argOne=$1
+# Show header
+print_header
+
+# --help only prints usage (help exits 0), so answer it before demanding
+# root; previously the root check made the usage text unreachable for
+# unprivileged users.
+if [[ $argOne == "--help" ]]; then
+    help
+fi
+
+# Check root
+if [[ $EUID -ne 0 ]]; then
+    handle_error "This script must be run as root"
+fi
+
+# Set trap for interrupts
+trap 'echo -e "${RED}Script interrupted.${NC}"; exit 1' INT TERM
+
 if [[ -n $argOne ]]; then
     checkForInput "$argOne"
 else
-    echo "Please enter an option:"
-    read -r argOne
-    checkForInput "$argOne"
-fi
-
-main
\ No newline at end of file
+    show_menu
+fi
\ No newline at end of file