Files
proxDoc/proxDoc.sh
Jared Vititoe d9e546f75d Remove interactive features for remote-only execution
- Remove interactive menu (requires stdin)
- Remove --connect option (requires stdin)
- Remove --save option (not practical for remote execution)
- Show help when run without arguments
- Update help to show curl usage example
- Update README for remote-only usage

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-24 17:22:37 -05:00

533 lines
17 KiB
Bash
Executable File

#!/bin/bash
# Script version; interpolated into the banner and the help text.
VERSION="1.1.0"
# Spinner glyphs. NOTE(review): not referenced anywhere in the visible script
# (show_progress defines its own local glyph string) - confirm before removing.
SPINNER="/-\|"
###################
# Color Definitions
###################
# ANSI SGR sequences kept as literal backslash text; the call sites render
# them with `echo -e`.
NC="\033[00m"        # reset attributes
GREEN="\033[01;32m"  # bold green
RED="\033[01;31m"    # bold red
YELLOW="\033[01;33m" # bold yellow
###################
# Utility Functions
###################
# Print the ASCII-art banner, the tool version, the start timestamp and the
# host name of the machine being examined.
# Globals: VERSION (read)
# Outputs: banner on stdout (first line is intentionally blank)
print_header() {
  local rule="======================================"
  printf '%s\n' \
    '' \
    '____ ____' \
    '/ __ \_________ _ __/ __ \____ _____' \
    '/ /_/ / ___/ __ \| |/_/ / / / __ \/ ___/' \
    '/ ____/ / / /_/ /> </ /_/ / /_/ / /__' \
    '/_/ /_/ \____/_/|_/_____/\____/\___/' \
    'The Proxmox System Doctor' \
    "Version ${VERSION}" \
    "$rule"
  printf '%s\n' "Started at: $(date '+%Y-%m-%d %H:%M:%S')"
  printf '%s\n' "Examining patient: $(hostname)"
  printf '%s\n' "$rule"
}
# Print a fatal error message in red and terminate the script.
# Globals:   RED, NC (read)
# Arguments: $1 - message text
# Exits:     always, with status 1
handle_error() {
  printf '%b\n' "${RED}Error: $1${NC}"
  exit 1
}
# Print a colored, level-tagged log line.
# Globals:   GREEN, YELLOW, RED, NC (read)
# Arguments: $1 - level: info | warn | error (anything else prints nothing)
#            $2 - message text
log_message() {
  local level=$1
  local message=$2
  local color tag
  case "$level" in
    info)  color=$GREEN;  tag="[INFO]" ;;
    warn)  color=$YELLOW; tag="[WARN]" ;;
    error) color=$RED;    tag="[ERROR]" ;;
    *)     return 0 ;;
  esac
  printf '%b\n' "${color}${tag}${NC} ${message}"
}
# Draw a rotating spinner on stdout for as long as the given process exists.
# Arguments: $1 - PID to watch (polled with `ps -p`)
# Outputs:   spinner frames followed by backspaces, then a final erase sequence
show_progress() {
  local pid=$1
  local delay=0.1
  local frames='|/-\'
  while ps -p $pid > /dev/null; do
    # %c prints only the first character, i.e. the current frame.
    printf " [%c] " "$frames"
    # Rotate left by one so the next frame moves to the front.
    frames=${frames:1}${frames:0:1}
    sleep $delay
    printf "\b\b\b\b\b\b"
  done
  printf " \b\b\b\b"
}
# Verify that every external tool needed by the full diagnostic run exists.
# Calls handle_error (which exits) on the first missing tool.
check_requirements() {
  log_message info "Checking medical equipment..."
  local tools=("dmidecode" "lscpu" "ip" "smartctl" "sensors" "lspci")
  for tool in "${tools[@]}"; do
    command -v "$tool" >/dev/null 2>&1 && continue
    handle_error "Required instrument '$tool' is missing"
  done
}
# Succeed (status 0) when the `pveversion` command is available, otherwise
# return 1.
checkIfOnHypervisor() {
  command -v pveversion >/dev/null 2>&1 || return 1
  return 0
}
###################
# System Information Functions
###################
# Print host name, kernel release and the Proxmox version report.
# Globals: GREEN, NC (read)
# Outputs: section text on stdout; "Not available" when pveversion fails
get_system_info() {
  printf '%b\n' "\n${GREEN}=== System Information ===${NC}"
  printf '%b\n' "\n${GREEN}=== Diagnostic Run: $(date '+%Y-%m-%d %H:%M:%S') ===${NC}"
  printf '%b\n' "${GREEN}Hostname:$(uname -n)${NC}"
  printf '%b\n' "${GREEN}Kernel:$(uname -r)${NC}"
  printf '%b\n' "\n${GREEN}=== Proxmox Version ===${NC}"
  if ! pveversion; then
    echo "Not available"
  fi
}
# Print the output of `sensors`, or a warning when the tool is missing.
# Globals: GREEN, NC (read)
get_temp_info() {
  printf '%b\n' "\n${GREEN}=== Temperature Information ===${NC}"
  if ! command -v sensors >/dev/null 2>&1; then
    log_message warn "sensors command not found. Install lm-sensors package for temperature monitoring"
    return
  fi
  sensors
}
# Run `smartctl -H` against every sd*/nvme* block device listed by lsblk.
# Globals: GREEN, NC (read); disk (written, loop variable)
get_disk_health() {
  printf '%b\n' "\n${GREEN}=== Disk Health Status ===${NC}"
  if ! command -v smartctl >/dev/null 2>&1; then
    log_message warn "smartctl not found. Install smartmontools for disk health monitoring"
    return
  fi
  # The anchored pattern also drops the "NAME" header line emitted by lsblk.
  for disk in $(lsblk -d -o name | grep -E '^(sd|nvme)'); do
    printf '%b\n' "\nChecking /dev/$disk:"
    smartctl -H /dev/$disk
  done
}
# Print CPU model, logical CPU count and clock speed.
# Globals: GREEN, RED, NC (read)
# Outputs: three labelled lines on stdout, preceded by an error line when the
#          model name cannot be read from /proc/cpuinfo
get_cpu_info() {
  local cpu_info cpu_cores cpu_mhz lscpu_out

  # cut -f2- keeps model names that themselves contain a colon; xargs trims
  # and collapses whitespace.
  cpu_info=$(grep -m 1 -w 'model name' /proc/cpuinfo 2>/dev/null | cut -d: -f2- | xargs)
  # Test the value rather than the pipeline status: the pipeline reports
  # xargs' status, which is 0 even when grep found nothing.
  if [[ -z $cpu_info ]]; then
    echo -e "${RED}Failed to retrieve CPU model information.${NC}"
  fi

  lscpu_out=$(lscpu 2>/dev/null)
  cpu_cores=$(awk -F': *' '/^CPU\(s\):/ { print $2; exit }' <<<"$lscpu_out")
  # Older lscpu prints "CPU MHz:"; newer releases only print (indented)
  # "CPU max MHz:". Prefer the current clock and fall back to the maximum.
  # Lines such as "CPU min MHz:" or "CPU(s) scaling MHz:" are ignored.
  cpu_mhz=$(awk -F': *' '
    /^[ \t]*CPU MHz:/     { cur = $2 }
    /^[ \t]*CPU max MHz:/ { max = $2 }
    END {
      if (cur != "") print cur
      else if (max != "") print max
    }
  ' <<<"$lscpu_out")

  echo -e "${GREEN}CPU Model:${NC} $cpu_info"
  echo -e "${GREEN}CPU Cores:${NC} $cpu_cores"
  echo -e "${GREEN}CPU MHz:${NC} $cpu_mhz"
}
# Print total, used and free RAM in human-readable units.
# Globals: GREEN, NC (read)
# Outputs: three labelled lines on stdout
get_ram_info() {
  local ram_total ram_used ram_free
  # Call free once so all three figures come from the same snapshot.
  # LC_ALL=C keeps the "Mem:" row label stable across locales.
  read -r ram_total ram_used ram_free \
    < <(LC_ALL=C free -h | awk '/^Mem:/ { print $2, $3, $4; exit }')
  echo -e "${GREEN}Total RAM:${NC} $ram_total"
  echo -e "${GREEN}Used RAM:${NC} $ram_used"
  echo -e "${GREEN}Free RAM:${NC} $ram_free"
}
# Print usage of /dev-backed filesystems and, when zpool exists, ZFS pool
# status.
# Globals: GREEN, NC (read)
get_storage_info() {
  printf '%b\n' "${GREEN}Storage Information:${NC}"
  df -h --output=source,size,used,avail,pcent | grep '^/dev'
  command -v zpool >/dev/null 2>&1 || return 0
  printf '%b\n' "\n${GREEN}=== ZFS Pool Status ===${NC}"
  zpool status
}
# Print the default gateway(s) and the host's IP addresses.
# Globals: GREEN, NC (read); default_gateway, ip_addresses (written)
get_network_info() {
  default_gateway=$(ip route | awk '/default/ {print $3}')
  ip_addresses=$(hostname -I | xargs)
  printf '%b\n' \
    "${GREEN}Default Gateway:${NC} $default_gateway" \
    "${GREEN}IP Addresses:${NC} $ip_addresses"
}
# Print per-interface link counters, then interface/socket statistics from
# netstat, or from ss when netstat is missing.
# Globals: GREEN, NC (read)
get_detailed_network() {
  printf '%b\n' "\n${GREEN}=== Network Interface Statistics ===${NC}"
  ip -s link show
  printf '%b\n' "\n${GREEN}=== Network Statistics ===${NC}"
  if command -v netstat >/dev/null 2>&1; then
    netstat -i
    return
  fi
  if command -v ss >/dev/null 2>&1; then
    ss -s
    return
  fi
  log_message warn "netstat/ss not found for network statistics"
}
# Print the BIOS version and the VGA / Ethernet / RAID entries from lspci.
# Globals: GREEN, NC (read)
get_hardware_info() {
  local bios_version
  bios_version=$(dmidecode -s bios-version)
  printf '%b\n' "${GREEN}BIOS Version:${NC} ${bios_version}"
  printf '%b\n' "\n${GREEN}=== PCI Devices ===${NC}"
  lspci | grep -iE 'vga|ethernet|raid'
}
# Print baseboard and system identification strings reported by dmidecode.
# Globals: GREEN, NC (read)
get_motherboard_info() {
  # "label|dmidecode -s keyword" pairs, in display order.
  local -a fields=(
    "Manufacturer|baseboard-manufacturer"
    "Product Name|baseboard-product-name"
    "Version|baseboard-version"
    "Serial Number|baseboard-serial-number"
    "System Manufacturer|system-manufacturer"
    "System Product|system-product-name"
    "System Serial|system-serial-number"
  )
  local entry
  printf '%b\n' "\n${GREEN}=== Motherboard Information ===${NC}"
  for entry in "${fields[@]}"; do
    printf '%b\n' "${GREEN}${entry%%|*}:${NC} $(dmidecode -s "${entry#*|}")"
  done
}
# Print one row per populated DIMM (locator, size, type, speed, manufacturer)
# followed by a slot summary, all derived from `dmidecode -t memory`.
# Globals: GREEN, NC (read)
# Outputs: table rows and summary lines on stdout
get_memory_details() {
  local dmi
  # Query dmidecode once and reuse the text for the table and the summary.
  dmi=$(dmidecode -t memory 2>/dev/null)

  echo -e "\n${GREEN}=== Memory DIMM Information ===${NC}"
  # Fields of one "Memory Device" record arrive in the order Size, Locator,
  # Type, Speed, Manufacturer, so the row can only be printed once the whole
  # record has been read (at the blank line that ends it, or at end of input).
  # Keys are compared exactly so that "Volatile Size", "Bank Locator" or
  # "Configured Memory Speed" are not mistaken for the primary fields.
  awk '
    function trim(s) { sub(/^[ \t]+/, "", s); sub(/[ \t]+$/, "", s); return s }
    function known(v) {
      return (v != "" && v !~ /^(Unknown|Not Specified|No Module Installed)/)
    }
    function emit_row() {
      if (in_dev && size != "")
        printf "%-12s %-10s %-8s %-12s %-20s\n", loc, size, type, speed, mfr
      in_dev = 0
      loc = ""; size = ""; type = ""; speed = ""; mfr = ""
    }
    /^Memory Device[ \t]*$/ { emit_row(); in_dev = 1; next }
    /^[ \t]*$/              { emit_row(); next }
    in_dev {
      sep = index($0, ":")
      if (sep == 0) next
      key = trim(substr($0, 1, sep - 1))
      val = trim(substr($0, sep + 1))
      if (key == "Size")              { if (known(val)) size = val }
      else if (key == "Locator")      { loc = val }
      else if (key == "Type")         { if (known(val)) type = val }
      else if (key == "Speed")        { if (known(val)) speed = val }
      else if (key == "Manufacturer") { if (known(val)) mfr = val }
    }
    END { emit_row() }
  ' <<<"$dmi"

  echo -e "\n${GREEN}Memory Summary:${NC}"
  echo -e " Total Slots: $(grep -c '^Memory Device[[:space:]]*$' <<<"$dmi")"
  # Anchor on the primary "Size:" field only; newer dmidecode also prints
  # "Volatile Size:", "Cache Size:" etc. for every device.
  echo -e " Populated: $(grep -E '^[[:space:]]*Size:' <<<"$dmi" | grep -cv 'No Module')"
  echo -e " Max Capacity: $(awk '/Maximum Capacity/ { print $3" "$4; exit }' <<<"$dmi")"
}
# Print driver, MAC, link state and (when ethtool exists) speed/duplex and
# firmware version for every network interface except loopback.
# Globals:   GREEN, NC (read)
# Arguments: $1 - optional sysfs network class directory
#                 (default: /sys/class/net)
# Outputs:   one section per interface on stdout
get_nic_details() {
  local net_dir=${1:-/sys/class/net}
  local path iface driver link_info fw_ver line

  echo -e "\n${GREEN}=== Network Interface Details ===${NC}"
  # Glob instead of parsing `ls`, and compare the name exactly: a substring
  # filter on "lo" would also hide interfaces such as wlo1.
  for path in "$net_dir"/*; do
    # Interfaces are directories (or symlinks to them); this skips plain
    # files such as bonding_masters and an unmatched glob.
    [[ -d $path ]] || continue
    iface=${path##*/}
    [[ $iface == "lo" ]] && continue

    echo -e "\n${GREEN}Interface: $iface${NC}"
    # Driver name is the basename of the device/driver symlink target.
    if [[ -L "$path/device/driver" ]]; then
      driver=$(basename "$(readlink "$path/device/driver")")
      echo -e " Driver: $driver"
    fi
    if [[ -f "$path/address" ]]; then
      echo -e " MAC: $(<"$path/address")"
    fi
    if [[ -f "$path/operstate" ]]; then
      echo -e " State: $(<"$path/operstate")"
    fi

    # Everything below needs ethtool.
    command -v ethtool >/dev/null 2>&1 || continue

    link_info=$(ethtool "$iface" 2>/dev/null | grep -E "Speed:|Duplex:|Link detected:")
    if [[ -n $link_info ]]; then
      # Default IFS on purpose: strips ethtool's leading tab from each line.
      while read -r line; do
        echo -e " $line"
      done <<<"$link_info"
    fi

    # Keep the whole value; firmware strings often contain several words.
    fw_ver=$(ethtool -i "$iface" 2>/dev/null \
      | awk '/^firmware-version:/ { sub(/^firmware-version:[ \t]*/, ""); print; exit }')
    if [[ -n $fw_ver ]]; then
      echo -e " Firmware: $fw_ver"
    fi
  done
}
# Print storage controllers found by lspci: a summary built from the
# machine-readable `lspci -vmm` records, then selected verbose lines
# (Subsystem, LnkSta, Kernel driver) for each controller.
# Globals: GREEN, NC (read); ctrl (written, loop variable)
get_hba_info() {
  printf '%b\n' "\n${GREEN}=== HBA/Storage Controller Information ===${NC}"
  # Paragraph mode (RS="") makes each device one record; each line is a field.
  lspci -vmm 2>/dev/null | awk '
    BEGIN { RS = ""; FS = "\n" }
    /RAID|SAS|SCSI|Mass storage|Serial Attached|Fibre Channel|NVMe/ {
      slot = ""; class = ""; vendor = ""; device = ""; rev = ""
      for (i = 1; i <= NF; i++) {
        # substr offsets skip the "Key:" label plus the tab that follows it.
        if ($i ~ /^Slot:/)        slot   = substr($i, 7)
        else if ($i ~ /^Class:/)  class  = substr($i, 8)
        else if ($i ~ /^Vendor:/) vendor = substr($i, 9)
        else if ($i ~ /^Device:/) device = substr($i, 9)
        else if ($i ~ /^Rev:/)    rev    = substr($i, 6)
      }
      printf "\n%s\n Class: %s\n Vendor: %s\n Device: %s\n", slot, class, vendor, device
      if (rev) printf " Rev: %s\n", rev
    }
  '
  printf '%b\n' "\n${GREEN}=== Storage Controller Details ===${NC}"
  for ctrl in $(lspci | awk 'tolower($0) ~ /raid|sas|scsi|mass storage|nvme/ { print $1 }'); do
    printf '%b\n' "\n${GREEN}Controller $ctrl:${NC}"
    lspci -vvs "$ctrl" 2>/dev/null | grep -E "^\s+(Subsystem|LnkSta|Kernel driver)" | head -5
  done
}
# Print load average, the number of running services and the five most recent
# error-level journal entries.
# Globals: GREEN, NC (read)
get_system_status() {
  echo -e "\n${GREEN}=== System Load ===${NC}"
  uptime
  echo -e "\n${GREEN}=== Service Status ===${NC}"
  # --no-legend drops the header and footer lines, which would otherwise be
  # counted as services; grep -c . counts the remaining non-empty lines.
  systemctl list-units --type=service --state=running --no-legend --no-pager \
    | grep -c .
  echo -e "\n${GREEN}=== Recent System Errors ===${NC}"
  journalctl -p err -n 5 --no-pager
}
###################
# DriveAtlas & Monitoring Functions
###################
# Download the DriveAtlas script and run it to show the physical bay mapping.
# Globals: GREEN, NC (read)
# Outputs: DriveAtlas stdout, or a warning when it cannot be fetched or run
# NOTE(review): the script is fetched over plain HTTP and executed as root
# without integrity verification - acceptable only on a trusted network.
get_drive_atlas() {
  local url="http://10.10.10.63:3000/LotusGuild/driveAtlas/raw/branch/main/driveAtlas.sh"
  local script

  echo -e "\n${GREEN}=== Drive Atlas - Physical Bay Mapping ===${NC}"
  if ! command -v curl >/dev/null 2>&1; then
    log_message warn "curl not installed - cannot fetch DriveAtlas"
    return
  fi

  # Download first, execute second. -f makes curl fail on HTTP errors so an
  # error page is never handed to bash; the timeout keeps an unreachable
  # server from stalling the whole run.
  if ! script=$(curl -fsSL --connect-timeout 10 "$url" 2>/dev/null) || [[ -z $script ]]; then
    log_message warn "DriveAtlas failed to execute or server unavailable"
    return
  fi

  # DriveAtlas' own stderr is suppressed, as before.
  if ! bash <(printf '%s\n' "$script") 2>/dev/null; then
    log_message warn "DriveAtlas failed to execute or server unavailable"
  fi
}
# Print Ceph health detail, OSD tree, pool usage and OSD usage when the ceph
# CLI is installed; otherwise log an informational message.
# Globals: GREEN, NC (read)
get_ceph_health() {
  printf '%b\n' "\n${GREEN}=== Ceph Cluster Health ===${NC}"
  if ! command -v ceph >/dev/null 2>&1; then
    log_message info "Ceph tools not installed on this node"
    return
  fi

  printf '%b\n' "${GREEN}Health Status:${NC}"
  if ! ceph health detail 2>/dev/null; then
    log_message warn "Cannot connect to Ceph cluster"
  fi

  # "title|ceph sub-command" pairs; failures are deliberately ignored.
  local section
  local -a ceph_args
  for section in "OSD Tree|osd tree" "Pool Usage|df" "OSD Usage|osd df"; do
    printf '%b\n' "\n${GREEN}=== Ceph ${section%%|*} ===${NC}"
    read -r -a ceph_args <<<"${section#*|}"
    ceph "${ceph_args[@]}" 2>/dev/null || true
  done
}
# Report whether the node_exporter service is running, its metrics URL and
# whether something is listening on port 9100.
# Globals: GREEN, NC (read)
get_node_exporter_status() {
  printf '%b\n' "\n${GREEN}=== Node Exporter Status ===${NC}"

  if ! systemctl is-active --quiet node_exporter 2>/dev/null; then
    # Not running: distinguish "installed but stopped" from "not installed".
    if systemctl list-unit-files 2>/dev/null | grep -q node_exporter; then
      log_message warn "Node Exporter is installed but not running"
      printf '%b\n' "Start with: systemctl start node_exporter"
    else
      log_message info "Node Exporter not installed"
    fi
    return
  fi

  printf '%b\n' "${GREEN}Service:${NC} Running"
  # The first address printed by `hostname -I` is used for the URL.
  local addr=$(hostname -I | awk '{print $1}')
  printf '%b\n' "${GREEN}Metrics URL:${NC} http://${addr}:9100/metrics"
  if ss -tlnp 2>/dev/null | grep -q ':9100'; then
    printf '%b\n' "${GREEN}Port 9100:${NC} Listening"
  else
    log_message warn "Port 9100 not listening"
  fi
}
# Report the state of hwmon.timer and, when it is active, its schedule and
# the last three journal lines of hwmon.service.
# Globals: GREEN, NC (read)
get_hwmon_status() {
  printf '%b\n' "\n${GREEN}=== hwmon Daemon Status ===${NC}"

  if ! systemctl is-active --quiet hwmon.timer 2>/dev/null; then
    # Inactive: distinguish "installed but disabled" from "not installed".
    if systemctl list-unit-files 2>/dev/null | grep -q hwmon.timer; then
      log_message warn "hwmon timer is installed but not active"
      printf '%b\n' "Enable with: systemctl enable --now hwmon.timer"
    else
      log_message info "hwmon daemon not installed"
    fi
    return
  fi

  printf '%b\n' "${GREEN}Timer:${NC} Active"
  systemctl list-timers hwmon.timer --no-pager 2>/dev/null
  printf '%b\n' "\n${GREEN}Last Run:${NC}"
  # Journal access is best effort.
  journalctl -u hwmon.service -n 3 --no-pager 2>/dev/null || true
}
# Condensed health report: critical services, temperatures, one-line SMART
# verdict per disk, Node Exporter status and, when available, Ceph health.
# Globals: GREEN, NC (read); disk, health (written)
quick_health_check() {
  printf '%b\n' "\n${GREEN}=== Quick Health Check ===${NC}"
  printf '%b\n' "Running quick health assessment...\n"

  check_services
  get_temp_info

  printf '%b\n' "\n${GREEN}=== Disk Health Summary ===${NC}"
  if command -v smartctl >/dev/null 2>&1; then
    for disk in $(lsblk -d -o name | grep -E '^(sd|nvme)'); do
      # Keep only the text after the colon on lines that mention "health".
      health=$(smartctl -H /dev/$disk 2>/dev/null \
        | awk -F: 'tolower($0) ~ /health/ {print $2}' | xargs)
      [[ -z "$health" ]] || printf '%b\n' "/dev/$disk: $health"
    done
  fi

  get_node_exporter_status

  command -v ceph >/dev/null 2>&1 || return 0
  printf '%b\n' "\n${GREEN}=== Ceph Quick Status ===${NC}"
  ceph health 2>/dev/null || true
}
###################
# Proxmox Specific Functions
###################
# Print the `systemctl is-active` state of each critical Proxmox service.
# Globals: GREEN, NC (read); services, service, status (written)
# NOTE(review): "pvecluster" does not look like a standard unit name (the
# list also has "pve-cluster") - confirm whether it is intentional.
check_services() {
  printf '%b\n' "${GREEN}Checking critical services:${NC}"
  services=("pvedaemon" "pveproxy" "pvecluster" "pve-cluster" "corosync")
  for service in "${services[@]}"; do
    status=$(systemctl is-active "$service")
    printf '%b\n' "${GREEN}${service}:${NC} ${status}"
  done
}
# Warn when the installed Proxmox VE release is older than a minimum version.
# Arguments: $1 - optional minimum version as MAJOR.MINOR (default: 6.0)
# Outputs:   a warning via log_message when the version is too old; nothing
#            otherwise. A version that cannot be determined counts as 0.0.
check_pve_version() {
  local min_version=${1:-6.0}
  local current_version="0.0"
  local re='pve-manager/([0-9]+)\.([0-9]+)'
  local pve_out

  pve_out=$(pveversion)
  if [[ $pve_out =~ $re ]]; then
    current_version="${BASH_REMATCH[1]}.${BASH_REMATCH[2]}"
  fi

  local cur_major=${current_version%%.*}
  local cur_minor=${current_version#*.}
  local min_major=${min_version%%.*}
  local min_minor=0
  if [[ $min_version == *.* ]]; then
    min_minor=${min_version#*.}
  fi

  # Compare component-wise as integers: a decimal comparison would rank
  # 7.10 below 7.2. The 10# prefix stops a leading zero being read as octal.
  if (( 10#$cur_major < 10#$min_major ||
        (10#$cur_major == 10#$min_major && 10#$cur_minor < 10#$min_minor) )); then
    log_message warn "Proxmox VE version $current_version may not support all features"
  fi
}
# Print the status table of all virtual machines (`qm list`).
# Globals: GREEN, NC (read)
# Returns: status of `qm list`, or 1 when qm is not installed
list_vms() {
  # Warn instead of calling handle_error: a missing qm must not terminate a
  # full diagnostic run before the remaining sections are printed. This
  # matches how list_containers treats a missing pct.
  if ! command -v qm >/dev/null 2>&1; then
    log_message warn "qm command not found"
    return 1
  fi
  echo -e "${GREEN}Virtual Machine Status:${NC}"
  qm list
}
# Print the status table of all LXC containers (`pct list`), or a warning
# when pct is not installed.
# Globals: GREEN, NC (read)
list_containers() {
  if ! command -v pct >/dev/null 2>&1; then
    log_message warn "pct command not found"
    return
  fi
  printf '%b\n' "\n${GREEN}=== LXC Container Status ===${NC}"
  pct list
}
###################
# Command Line Interface Functions
###################
# Print the usage text and terminate the script successfully.
# Globals: VERSION (read)
# Exits:   always, with status 0
help() {
  printf '%s\n' \
    "ProxDoc - The Proxmox System Doctor v${VERSION}" \
    "" \
    'Usage: curl -sL "http://10.10.10.63:3000/LotusGuild/proxDoc/raw/branch/main/proxDoc.sh" | bash -s -- [OPTION]' \
    "" \
    "A comprehensive diagnostic tool for Proxmox server health checks." \
    "" \
    "Treatment Options:" \
    " --help Show this prescription guide" \
    " --diags Perform full system examination" \
    " --quick Quick health check (services, temps, disks)" \
    " --drives Show physical drive bay mapping (DriveAtlas)" \
    " --ceph Check Ceph cluster health" \
    " --node-exporter Check Node Exporter status" \
    " --hwmon Check hwmon daemon status" \
    " --services Check vital Proxmox services" \
    " --vm-list Check VM vitals" \
    " --ct-list Check container vitals" \
    " --backup Review backup health"
  exit 0
}
###################
# Main Functions
###################
# Run every diagnostic section in a background subshell, showing a spinner
# while it works.
# Outputs: all section reports on stdout, framed by two info log lines
runDiags() {
  local diag_pid

  log_message info "Beginning system examination..."
  (
    get_system_info
    get_cpu_info
    get_ram_info
    get_memory_details
    get_storage_info
    get_disk_health
    get_drive_atlas
    get_network_info
    get_detailed_network
    get_nic_details
    get_hardware_info
    get_motherboard_info
    get_hba_info
    get_temp_info
    get_system_status
    get_node_exporter_status
    get_hwmon_status
    get_ceph_health
    list_vms
    list_containers
  ) &
  diag_pid=$!

  # Animate only on a terminal: in a pipe or file the spinner's backspace
  # characters would be written straight into the report.
  if [[ -t 1 ]]; then
    show_progress "$diag_pid"
  fi
  # Reap the job so the final message is printed only after all output.
  wait "$diag_pid"

  log_message info "Examination complete"
}
# Dispatch a single command-line option to its handler.
# Globals:   GREEN, RED, NC (read)
# Arguments: $1 - option such as --diags or --quick
# Exits:     status 1 after printing the usage text when the option is unknown
checkForInput() {
  case "$1" in
    --help) help ;;
    --diags)
      check_requirements
      runDiags
      ;;
    --quick) quick_health_check ;;
    --drives) get_drive_atlas ;;
    --ceph) get_ceph_health ;;
    --node-exporter) get_node_exporter_status ;;
    --hwmon) get_hwmon_status ;;
    --services) check_services ;;
    --vm-list) list_vms ;;
    --ct-list) list_containers ;;
    --backup)
      echo -e "${GREEN}Backup Status:${NC}"
      pvesm status 2>/dev/null || log_message warn "pvesm not available"
      ;;
    *)
      echo -e "${RED}Invalid option: $1${NC}"
      # help terminates with status 0; run it in a subshell so an invalid
      # option can still be reported as a failure to the caller.
      ( help )
      exit 1
      ;;
  esac
}
###################
# Script Execution
###################
# Entry point: show the banner, require root, install the interrupt handler,
# then dispatch the first argument (or print the help text when none given).
argOne=$1

print_header

# Every option, including the help text, is only reachable as root.
(( EUID == 0 )) || handle_error "This script must be run as root"

# Report Ctrl-C / SIGTERM and exit with a failure status.
trap 'echo -e "${RED}Script interrupted.${NC}"; exit 1' INT TERM

case "$argOne" in
  "") help ;;
  *) checkForInput "$argOne" ;;
esac