Update to v1.1.0: Add interactive menu, DriveAtlas, and monitoring integrations

- Add interactive numbered menu when run without arguments
- Add DriveAtlas integration (--drives) for physical drive bay mapping
- Add Ceph cluster health monitoring (--ceph)
- Add Node Exporter status check (--node-exporter)
- Add hwmon daemon status check (--hwmon)
- Add quick health check mode (--quick)
- Add container list option (--ct-list)
- Full diagnostics now include all monitoring checks
- Update README with new features and changelog

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-24 17:18:53 -05:00
parent 90cb1eee97
commit 575c60b1fa
2 changed files with 369 additions and 48 deletions

View File

@@ -5,11 +5,21 @@ ProxDoc is a comprehensive diagnostic tool for Proxmox server health monitoring
## Features ## Features
- Complete system diagnostics and health checks - Complete system diagnostics and health checks
- **Interactive menu system** when run without arguments
- **DriveAtlas integration** for physical drive bay mapping
- **Ceph cluster health** monitoring (OSD tree, pool usage, disk usage)
- **Node Exporter status** check for Prometheus monitoring
- **hwmon daemon status** for hardware monitoring
- **Quick health check** mode for fast assessments
- Temperature monitoring - Temperature monitoring
- Disk health status - Disk health status via SMART
- CPU and RAM information - CPU and RAM information
- Detailed memory DIMM information (slots, speed, manufacturer)
- Motherboard and system information
- Storage information including ZFS pools - Storage information including ZFS pools
- Network diagnostics - Network diagnostics with ethtool integration
- Detailed NIC information (driver, firmware, link speed)
- HBA/Storage controller detection and details
- Hardware information - Hardware information
- Service status monitoring - Service status monitoring
- VM and Container status - VM and Container status
@@ -23,7 +33,11 @@ The script requires the following tools to be installed:
- ip - ip
- smartctl - smartctl
- sensors - sensors
- netstat - lspci
Optional tools for enhanced diagnostics:
- ethtool (for detailed NIC information including link speed and firmware)
- netstat (for network statistics)
## Usage ## Usage
@@ -38,22 +52,39 @@ curl -sL "http://10.10.10.63:3000/LotusGuild/proxDoc/raw/branch/main/proxDoc.sh"
- `--help`: Show the help guide - `--help`: Show the help guide
- `--diags`: Perform full system examination - `--diags`: Perform full system examination
- `--connect`: Connect to a remote Proxmox host - `--quick`: Quick health check (services, temps, disks)
- `--services`: Check vital services - `--drives`: Show physical drive bay mapping (DriveAtlas)
- `--ceph`: Check Ceph cluster health
- `--node-exporter`: Check Node Exporter status
- `--hwmon`: Check hwmon daemon status
- `--services`: Check vital Proxmox services
- `--vm-list`: Check VM vitals - `--vm-list`: Check VM vitals
- `--ct-list`: Check container vitals
- `--backup`: Review backup health - `--backup`: Review backup health
- `--connect`: Connect to a remote Proxmox host
- `--save`: Save examination results to a log file - `--save`: Save examination results to a log file
### Interactive Mode
Run without arguments to display an interactive menu:
```bash
./proxDoc.sh
```
## Output Information ## Output Information
The script provides detailed information about: The script provides detailed information about:
- System information and Proxmox version - System information and Proxmox version
- CPU model, cores, and frequency - CPU model, cores, and frequency
- RAM usage - RAM usage and detailed DIMM information
- Motherboard manufacturer, model, and serial number
- Storage status - Storage status
- Disk health - Disk health
- Network configuration - Network configuration
- Hardware details - Detailed NIC information (driver, MAC, speed, firmware)
- HBA/Storage controller details
- Hardware details including PCI devices
- System load - System load
- Service status - Service status
- Recent system errors - Recent system errors
@@ -75,4 +106,20 @@ chmod +x proxdoc.sh
## Version ## Version
Current Version: 1.0.0 Current Version: 1.1.0
### Changelog
#### v1.1.0
- Added interactive menu system when run without arguments
- Added DriveAtlas integration (`--drives`) for physical drive bay mapping
- Added Ceph cluster health monitoring (`--ceph`)
- Added Node Exporter status check (`--node-exporter`)
- Added hwmon daemon status check (`--hwmon`)
- Added quick health check mode (`--quick`)
- Added container list option (`--ct-list`)
- Full diagnostics now include all monitoring checks
- Improved error handling with graceful fallbacks
#### v1.0.0
- Initial release

View File

@@ -1,6 +1,6 @@
#!/bin/bash #!/bin/bash
VERSION="1.0.0" VERSION="1.1.0"
SPINNER="/-\|" SPINNER="/-\|"
################### ###################
@@ -63,7 +63,7 @@ show_progress() {
check_requirements() { check_requirements() {
log_message info "Checking medical equipment..." log_message info "Checking medical equipment..."
local tools=("dmidecode" "lscpu" "ip" "smartctl" "sensors" "netstat") local tools=("dmidecode" "lscpu" "ip" "smartctl" "sensors" "lspci")
for tool in "${tools[@]}"; do for tool in "${tools[@]}"; do
if ! command -v "$tool" >/dev/null 2>&1; then if ! command -v "$tool" >/dev/null 2>&1; then
handle_error "Required instrument '$tool' is missing" handle_error "Required instrument '$tool' is missing"
@@ -151,10 +151,16 @@ get_network_info() {
} }
get_detailed_network() { get_detailed_network() {
echo -e "\n${GREEN}=== Network Interface Details ===${NC}" echo -e "\n${GREEN}=== Network Interface Statistics ===${NC}"
ip -s link show ip -s link show
echo -e "\n${GREEN}=== Network Statistics ===${NC}" echo -e "\n${GREEN}=== Network Statistics ===${NC}"
netstat -i if command -v netstat >/dev/null 2>&1; then
netstat -i
elif command -v ss >/dev/null 2>&1; then
ss -s
else
log_message warn "netstat/ss not found for network statistics"
fi
} }
get_hardware_info() { get_hardware_info() {
@@ -163,6 +169,122 @@ get_hardware_info() {
lspci | grep -i -E "vga|ethernet|raid" lspci | grep -i -E "vga|ethernet|raid"
} }
# Print baseboard and system identity strings.
# Every value is obtained from `dmidecode -s <keyword>`; GREEN/NC supply the
# colour escapes used for the labels.
get_motherboard_info() {
  local field

  echo -e "\n${GREEN}=== Motherboard Information ===${NC}"

  # Each entry is "<label>|<dmidecode string keyword>", printed in this order.
  for field in \
    "Manufacturer|baseboard-manufacturer" \
    "Product Name|baseboard-product-name" \
    "Version|baseboard-version" \
    "Serial Number|baseboard-serial-number" \
    "System Manufacturer|system-manufacturer" \
    "System Product|system-product-name" \
    "System Serial|system-serial-number"; do
    echo -e "${GREEN}${field%%|*}:${NC} $(dmidecode -s "${field#*|}")"
  done
}
# Print one table row per populated DIMM, followed by a slot summary.
#
# Row columns: locator, size, type, speed, manufacturer.
# dmidecode is queried once and the captured text is reused for the table and
# for the summary counters.
#
# Parsing notes:
#   * A "Memory Device" record lists Locator BEFORE Type/Speed/Manufacturer,
#     so the row is only printed once the whole record has been read
#     (blank line, next section header, or end of input).
#   * Field names are compared exactly, so "Type Detail", "Bank Locator",
#     "Configured Memory Speed" and "Volatile Size" never overwrite
#     "Type", "Locator", "Speed" or "Size".
#   * Slots whose Size is "No Module Installed" are not listed.
get_memory_details() {
  local mem_dump total_slots populated max_capacity

  mem_dump=$(dmidecode -t memory)

  echo -e "\n${GREEN}=== Memory DIMM Information ===${NC}"
  awk '
    # Print the record collected so far (if it is a populated DIMM) and reset.
    function emit() {
      if (in_device && size != "") {
        printf "%-12s %-10s %-8s %-12s %-20s\n", loc, size, type, speed, mfr
      }
      in_device = 0
      loc = ""; size = ""; type = ""; speed = ""; mfr = ""
    }

    /^Memory Device[ \t]*$/   { emit(); in_device = 1; next }
    /^[ \t]*$/ || /^[^ \t]/   { emit(); next }   # record separator or other header
    !in_device                { next }

    {
      sep = index($0, ":")
      if (sep == 0) next
      key = substr($0, 1, sep - 1)
      val = substr($0, sep + 1)
      gsub(/^[ \t]+|[ \t]+$/, "", key)
      gsub(/^[ \t]+|[ \t]+$/, "", val)

      if (key == "Size") {
        if (val != "" && val !~ /^No Module/) size = val
        next
      }
      if (val == "" || val == "Unknown") next

      if      (key == "Type")         type  = val
      else if (key == "Speed")        speed = val
      else if (key == "Manufacturer") mfr   = val
      else if (key == "Locator")      loc   = val
    }

    END { emit() }
  ' <<<"$mem_dump"

  # grep -c exits non-zero on a zero count while still printing "0";
  # "|| true" keeps that from aborting callers that run under `set -e`.
  total_slots=$(grep -c '^Memory Device[[:space:]]*$' <<<"$mem_dump" || true)
  populated=$(grep -E '^[[:space:]]+Size:' <<<"$mem_dump" | grep -cv 'No Module' || true)
  max_capacity=$(awk '/Maximum Capacity:/ { print $3 " " $4; exit }' <<<"$mem_dump")

  echo -e "\n${GREEN}Memory Summary:${NC}"
  echo -e " Total Slots: ${total_slots}"
  echo -e " Populated: ${populated}"
  echo -e " Max Capacity: ${max_capacity}"
}
# Print driver, MAC, link state and (when ethtool is available) speed, duplex
# and firmware version for every network interface except loopback.
#
# Arguments:
#   $1 - optional sysfs network class directory (default: /sys/class/net)
#
# Interfaces are enumerated with a glob rather than by parsing `ls`, and only
# the interface named exactly "lo" is skipped, so names that merely contain
# "lo" (e.g. wlo1) are still reported. Non-directory entries such as
# bonding_masters are ignored.
get_nic_details() {
  local net_root=${1:-/sys/class/net}
  local iface_path iface driver_link link_info line fw_ver

  echo -e "\n${GREEN}=== Network Interface Details ===${NC}"

  for iface_path in "$net_root"/*; do
    iface=${iface_path##*/}
    # Also covers the unmatched-glob case, where the literal pattern is not a directory.
    [[ -d "$iface_path" && "$iface" != "lo" ]] || continue

    echo -e "\n${GREEN}Interface: $iface${NC}"

    # Driver name is the last path component of the device/driver symlink target.
    if [[ -L "$iface_path/device/driver" ]]; then
      driver_link=$(readlink "$iface_path/device/driver")
      echo -e " Driver: ${driver_link##*/}"
    fi

    if [[ -f "$iface_path/address" ]]; then
      echo -e " MAC: $(cat "$iface_path/address")"
    fi

    if [[ -f "$iface_path/operstate" ]]; then
      echo -e " State: $(cat "$iface_path/operstate")"
    fi

    # Everything below is optional detail that needs ethtool.
    command -v ethtool >/dev/null 2>&1 || continue

    link_info=$(ethtool "$iface" 2>/dev/null | grep -E "Speed:|Duplex:|Link detected:")
    if [[ -n "$link_info" ]]; then
      # Default IFS strips ethtool's leading tab from each line.
      while read -r line; do
        echo -e " $line"
      done <<<"$link_info"
    fi

    # Keep the whole firmware string; it frequently contains several words.
    fw_ver=$(ethtool -i "$iface" 2>/dev/null \
      | sed -n 's/^firmware-version:[[:space:]]*//p')
    if [[ -n "$fw_ver" ]]; then
      echo -e " Firmware: $fw_ver"
    fi
  done
}
# Report PCI storage controllers in two passes:
#   1. a summary (slot, class, vendor, device, revision) built from the
#      blank-line separated records of `lspci -vmm`;
#   2. for each matching controller, the Subsystem / LnkSta / Kernel driver
#      lines of `lspci -vvs <slot>` (at most five lines per controller).
get_hba_info() {
  echo -e "\n${GREEN}=== HBA/Storage Controller Information ===${NC}"

  # Paragraph mode: one record per device, one field per "Tag:<TAB>value" line.
  lspci -vmm 2>/dev/null | awk '
    BEGIN { RS = ""; FS = "\n" }
    /RAID|SAS|SCSI|Mass storage|Serial Attached|Fibre Channel|NVMe/ {
      slot = ""; class = ""; vendor = ""; device = ""; rev = ""
      for (n = 1; n <= NF; n++) {
        # substr offsets skip the tag, its colon and the single separator char.
        if      ($n ~ /^Slot:/)   slot   = substr($n, 7)
        else if ($n ~ /^Class:/)  class  = substr($n, 8)
        else if ($n ~ /^Vendor:/) vendor = substr($n, 9)
        else if ($n ~ /^Device:/) device = substr($n, 9)
        else if ($n ~ /^Rev:/)    rev    = substr($n, 6)
      }
      printf "\n%s\n Class: %s\n Vendor: %s\n Device: %s\n", slot, class, vendor, device
      if (rev) printf " Rev: %s\n", rev
    }
  '

  echo -e "\n${GREEN}=== Storage Controller Details ===${NC}"
  # Case-insensitive match on the one-line listing; first column is the slot.
  for ctrl in $(lspci | awk 'tolower($0) ~ /raid|sas|scsi|mass storage|nvme/ { print $1 }'); do
    echo -e "\n${GREEN}Controller $ctrl:${NC}"
    lspci -vvs "$ctrl" 2>/dev/null \
      | grep -E "^\s+(Subsystem|LnkSta|Kernel driver)" \
      | head -5
  done
}
get_system_status() { get_system_status() {
echo -e "\n${GREEN}=== System Load ===${NC}" echo -e "\n${GREEN}=== System Load ===${NC}"
uptime uptime
@@ -174,6 +296,105 @@ get_system_status() {
journalctl -p err -n 5 --no-pager journalctl -p err -n 5 --no-pager
} }
###################
# DriveAtlas & Monitoring Functions
###################
# Download the DriveAtlas script and run it to show physical drive bay mapping.
#
# The download and the execution are separate steps so that a failed or empty
# download is reported. When curl output is fed straight into bash, a failed
# download gives bash an empty script, which exits 0 and hides the failure.
# curl runs with --fail so an HTTP error page is never executed as a script.
#
# Uses log_message for warnings. Always returns 0 (best-effort diagnostic).
#
# NOTE(review): the script is fetched over plain HTTP and executed with the
# caller's privileges; consider HTTPS and/or pinning a checksum.
get_drive_atlas() {
  local atlas_url="http://10.10.10.63:3000/LotusGuild/driveAtlas/raw/branch/main/driveAtlas.sh"
  local atlas_script

  echo -e "\n${GREEN}=== Drive Atlas - Physical Bay Mapping ===${NC}"

  if ! command -v curl >/dev/null 2>&1; then
    log_message warn "curl not installed - cannot fetch DriveAtlas"
    return 0
  fi

  # A connect timeout keeps an unreachable server from stalling the full run.
  if ! atlas_script=$(curl -fsL --connect-timeout 10 "$atlas_url" 2>/dev/null) \
    || [[ -z "$atlas_script" ]]; then
    log_message warn "DriveAtlas failed to execute or server unavailable"
    return 0
  fi

  # Process substitution (not stdin) so the script can still read the terminal.
  if ! bash <(printf '%s\n' "$atlas_script") 2>/dev/null; then
    log_message warn "DriveAtlas failed to execute or server unavailable"
  fi
  return 0
}
# Show Ceph cluster health followed by the OSD tree, pool usage and per-OSD
# usage. When the ceph CLI is absent an informational message is logged
# instead. Only a failing `ceph health detail` produces a warning; failures of
# the remaining sub-commands are deliberately ignored.
get_ceph_health() {
  local spec
  local -a subcmd

  echo -e "\n${GREEN}=== Ceph Cluster Health ===${NC}"

  if ! command -v ceph >/dev/null 2>&1; then
    log_message info "Ceph tools not installed on this node"
    return
  fi

  echo -e "${GREEN}Health Status:${NC}"
  ceph health detail 2>/dev/null || log_message warn "Cannot connect to Ceph cluster"

  # Each entry is "<section title>|<ceph sub-command words>".
  for spec in \
    "Ceph OSD Tree|osd tree" \
    "Ceph Pool Usage|df" \
    "Ceph OSD Usage|osd df"; do
    IFS=' ' read -r -a subcmd <<<"${spec#*|}"
    echo -e "\n${GREEN}=== ${spec%%|*} ===${NC}"
    ceph "${subcmd[@]}" 2>/dev/null || true
  done
}
# Report whether the node_exporter systemd unit is running.
# When it is, print the metrics URL (built from the first address returned by
# `hostname -I`) and check that something is listening on TCP port 9100.
# When it is not, distinguish "installed but stopped" from "not installed".
get_node_exporter_status() {
  local primary_addr

  echo -e "\n${GREEN}=== Node Exporter Status ===${NC}"

  # Not running: work out whether the unit exists at all, then stop.
  if ! systemctl is-active --quiet node_exporter 2>/dev/null; then
    if systemctl list-unit-files 2>/dev/null | grep -q node_exporter; then
      log_message warn "Node Exporter is installed but not running"
      echo -e "Start with: systemctl start node_exporter"
    else
      log_message info "Node Exporter not installed"
    fi
    return
  fi

  echo -e "${GREEN}Service:${NC} Running"
  primary_addr=$(hostname -I | awk '{print $1}')
  echo -e "${GREEN}Metrics URL:${NC} http://${primary_addr}:9100/metrics"

  if ss -tlnp 2>/dev/null | grep -q ':9100'; then
    echo -e "${GREEN}Port 9100:${NC} Listening"
  else
    log_message warn "Port 9100 not listening"
  fi
}
# Report the state of the hwmon systemd timer.
#   active   -> show the timer schedule and the last three journal lines of
#               hwmon.service
#   inactive -> the unit file exists but the timer is not active
#   missing  -> no hwmon.timer unit file was found
get_hwmon_status() {
  local timer_state="missing"

  echo -e "\n${GREEN}=== hwmon Daemon Status ===${NC}"

  if systemctl is-active --quiet hwmon.timer 2>/dev/null; then
    timer_state="active"
  elif systemctl list-unit-files 2>/dev/null | grep -q hwmon.timer; then
    timer_state="inactive"
  fi

  case "$timer_state" in
    active)
      echo -e "${GREEN}Timer:${NC} Active"
      systemctl list-timers hwmon.timer --no-pager 2>/dev/null
      echo -e "\n${GREEN}Last Run:${NC}"
      # Journal access is best-effort; a failure here is not an error.
      journalctl -u hwmon.service -n 3 --no-pager 2>/dev/null || true
      ;;
    inactive)
      log_message warn "hwmon timer is installed but not active"
      echo -e "Enable with: systemctl enable --now hwmon.timer"
      ;;
    *)
      log_message info "hwmon daemon not installed"
      ;;
  esac
}
# Fast health overview, in this order: vital services, temperatures, a
# one-line SMART verdict per sd*/nvme* disk, Node Exporter status, and (only
# when the ceph CLI exists) the one-line Ceph health.
quick_health_check() {
  echo -e "\n${GREEN}=== Quick Health Check ===${NC}"
  echo -e "Running quick health assessment...\n"

  check_services
  get_temp_info

  echo -e "\n${GREEN}=== Disk Health Summary ===${NC}"
  if command -v smartctl >/dev/null 2>&1; then
    for disk in $(lsblk -d -o name | grep -E '^(sd|nvme)'); do
      # Text after the first ':' on the health line(s); xargs trims whitespace.
      health=$(
        smartctl -H "/dev/${disk}" 2>/dev/null \
          | grep -i "health" \
          | awk -F: '{print $2}' \
          | xargs
      )
      # Disks that gave no health line are left out of the summary.
      [[ -z "$health" ]] || echo -e "/dev/${disk}: ${health}"
    done
  fi

  get_node_exporter_status

  # Ceph is optional; without the CLI there is nothing more to show.
  command -v ceph >/dev/null 2>&1 || return 0
  echo -e "\n${GREEN}=== Ceph Quick Status ===${NC}"
  ceph health 2>/dev/null || true
}
################### ###################
# Proxmox Specific Functions # Proxmox Specific Functions
################### ###################
@@ -221,13 +442,21 @@ help() {
echo "A comprehensive diagnostic tool for Proxmox server health checks." echo "A comprehensive diagnostic tool for Proxmox server health checks."
echo "" echo ""
echo "Treatment Options:" echo "Treatment Options:"
echo " --help Show this prescription guide" echo " --help Show this prescription guide"
echo " --diags Perform full system examination" echo " --diags Perform full system examination"
echo " --connect Make a house call to a remote Proxmox host" echo " --quick Quick health check (services, temps, disks)"
echo " --services Check vital services" echo " --drives Show physical drive bay mapping (DriveAtlas)"
echo " --vm-list Check VM vitals" echo " --ceph Check Ceph cluster health"
echo " --backup Review backup health" echo " --node-exporter Check Node Exporter status"
echo " --save Save examination results to medical record" echo " --hwmon Check hwmon daemon status"
echo " --services Check vital Proxmox services"
echo " --vm-list Check VM vitals"
echo " --ct-list Check container vitals"
echo " --backup Review backup health"
echo " --connect Make a house call to a remote Proxmox host"
echo " --save Save examination results to medical record"
echo ""
echo "Interactive mode: Run without arguments for menu"
exit 0 exit 0
} }
@@ -242,6 +471,44 @@ connectToHost() {
fi fi
} }
# Display the interactive menu, read one choice and run the matching action.
#
# Behaviour:
#   * An invalid entry prints "Invalid option" and the menu is shown again.
#     This is a loop rather than recursion, so repeated bad input cannot grow
#     the call stack.
#   * End of input on stdin (closed pipe, Ctrl-D, non-interactive run) ends the
#     menu with status 1 instead of re-prompting forever.
#   * Option 12 runs check_requirements before runDiags, the same as the
#     --save command-line option.
#   * Option 0 exits the script with status 0.
#
# Returns: the status of the selected action, or 1 when no input is available.
show_menu() {
  local choice

  while true; do
    printf '%s\n' \
      "" \
      "$(echo -e "${GREEN}Select a diagnostic option:${NC}")" \
      "" \
      " 1) Full System Diagnostics" \
      " 2) Quick Health Check" \
      " 3) Drive Atlas (Physical Bay Mapping)" \
      " 4) Check Proxmox Services" \
      " 5) VM Status" \
      " 6) Container Status" \
      " 7) Ceph Cluster Health" \
      " 8) Node Exporter Status" \
      " 9) hwmon Daemon Status" \
      " 10) Backup Status" \
      " 11) Connect to Remote Host" \
      " 12) Save Full Report to File" \
      " 0) Exit" \
      ""

    if ! read -rp "Enter choice [0-12]: " choice; then
      echo -e "\n${RED}No input received - leaving menu${NC}" >&2
      return 1
    fi

    # Accept exactly 0-9 and 10-12; anything else re-displays the menu.
    case "$choice" in
      [0-9] | 1[0-2]) break ;;
    esac
    echo -e "${RED}Invalid option${NC}"
  done

  case "$choice" in
    1) check_requirements; runDiags ;;
    2) quick_health_check ;;
    3) get_drive_atlas ;;
    4) check_services ;;
    5) list_vms ;;
    6) list_containers ;;
    7) get_ceph_health ;;
    8) get_node_exporter_status ;;
    9) get_hwmon_status ;;
    10)
      echo -e "${GREEN}Backup Status:${NC}"
      pvesm status 2>/dev/null || log_message warn "pvesm not available"
      ;;
    11) connectToHost ;;
    12)
      # Duplicate everything written to stdout from here on into a log file.
      exec 1> >(tee "proxmox_diag_$(date '+%Y%m%d_%H%M%S').log")
      check_requirements
      runDiags
      ;;
    0) echo "Goodbye!"; exit 0 ;;
  esac
}
################### ###################
# Main Functions # Main Functions
################### ###################
@@ -251,13 +518,22 @@ runDiags() {
get_system_info get_system_info
get_cpu_info get_cpu_info
get_ram_info get_ram_info
get_memory_details
get_storage_info get_storage_info
get_disk_health get_disk_health
get_drive_atlas
get_network_info get_network_info
get_detailed_network get_detailed_network
get_nic_details
get_hardware_info get_hardware_info
get_motherboard_info
get_hba_info
get_temp_info get_temp_info
get_system_status get_system_status
get_node_exporter_status
get_hwmon_status
get_ceph_health
list_vms
list_containers list_containers
) & show_progress $! ) & show_progress $!
log_message info "Examination complete" log_message info "Examination complete"
@@ -266,44 +542,42 @@ runDiags() {
checkForInput() { checkForInput() {
case $1 in case $1 in
--help) help ;; --help) help ;;
--diags) check_requirements; runDiags ;; --diags) check_requirements; runDiags ;;
--connect) connectToHost ;; --quick) quick_health_check ;;
--services) check_services ;; --drives) get_drive_atlas ;;
--vm-list) list_vms ;; --ceph) get_ceph_health ;;
--backup) echo -e "${GREEN}Backup Status:${NC}"; pvesm status ;; --node-exporter) get_node_exporter_status ;;
--save) exec 1> >(tee "proxmox_diag_$(date '+%Y%m%d_%H%M%S').log"); runDiags ;; --hwmon) get_hwmon_status ;;
*) echo -e "${RED}Invalid option: $1${NC}"; help ;; --connect) connectToHost ;;
--services) check_services ;;
--vm-list) list_vms ;;
--ct-list) list_containers ;;
--backup) echo -e "${GREEN}Backup Status:${NC}"; pvesm status 2>/dev/null || log_message warn "pvesm not available" ;;
--save) exec 1> >(tee "proxmox_diag_$(date '+%Y%m%d_%H%M%S').log"); check_requirements; runDiags ;;
*) echo -e "${RED}Invalid option: $1${NC}"; help ;;
esac esac
} }
main() {
print_header
trap 'echo -e "${RED}Script interrupted.${NC}"; exit 1' INT TERM
if [[ $EUID -ne 0 ]]; then
handle_error "This script must be run as root"
fi
if checkIfOnHypervisor; then
runDiags
else
connectToHost
fi
}
################### ###################
# Script Execution # Script Execution
################### ###################
argOne=$1 argOne=$1
# Show header
print_header
# Check root
if [[ $EUID -ne 0 ]]; then
handle_error "This script must be run as root"
fi
# Set trap for interrupts
trap 'echo -e "${RED}Script interrupted.${NC}"; exit 1' INT TERM
if [[ -n $argOne ]]; then if [[ -n $argOne ]]; then
checkForInput "$argOne" checkForInput "$argOne"
else else
echo "Please enter an option:" show_menu
read -r argOne
checkForInput "$argOne"
fi fi
main