#!/bin/bash
#
# ProxDoc - The Proxmox System Doctor
#
# Prints hardware, storage, network and service diagnostics for a Proxmox host.
# Intended invocation (see usage()):
#   curl -sL "<server>/LotusGuild/proxDoc/raw/branch/main/proxDoc.sh" | bash -s -- [OPTION]
#
# Strict mode (set -e / pipefail) is deliberately NOT enabled: most probes are
# best-effort and are expected to fail on hosts that lack a given tool.

readonly VERSION="1.1.0"
readonly SPINNER="/-\|"

###################
# Color Definitions
###################
readonly NC="\033[00m"
readonly GREEN="\033[01;32m"
readonly RED="\033[01;31m"
readonly YELLOW="\033[01;33m"

###################
# Utility Functions
###################

# Print the ASCII-art banner.
# NOTE(review): the tail of the banner was damaged in the reviewed copy of this
# file; the last two rows are reconstructed - confirm against the upstream art.
print_header() {
  cat <<'EOF'
    ____                  ____
   / __ \_________  _  __/ __ \____  _____
  / /_/ / ___/ __ \| |/_/ / / / __ \/ ___/
 / ____/ /  / /_/ />  </ /_/ / /_/ / /__
/_/   /_/   \____/_/|_/_____/\____/\___/

EOF
}

# Print a levelled message.
# Arguments: $1 - level (info | warn | error), $2 - message (echo -e escapes
#            such as \n are interpreted, callers rely on that).
# Outputs:   info/warn go to stdout so they stay inside a redirected report;
#            error goes to stderr.
# NOTE(review): the original definition was lost in the reviewed copy; this is
# reconstructed from the call sites - confirm the exact prefix format.
log_message() {
  local level=$1
  local message=$2

  case "$level" in
    info) echo -e "${GREEN}[INFO]${NC} $message" ;;
    warn) echo -e "${YELLOW}[WARN]${NC} $message" ;;
    error) echo -e "${RED}[ERROR]${NC} $message" >&2 ;;
    *) echo -e "$message" ;;
  esac
}

# Report a fatal error and terminate the script with status 1.
# Arguments: $1 - message.
# NOTE(review): reconstructed from call sites (original definition was lost).
handle_error() {
  log_message error "$1"
  exit 1
}

# Show a spinner until the given process has finished.
# Arguments: $1 - PID of a background child of this shell.
# Outputs:   spinner frames on stderr, and only when stderr is a terminal, so
#            that a redirected/piped report never contains backspace noise.
# Returns:   0 always.
# NOTE(review): the head of this function was lost in the reviewed copy; the
# liveness test (kill -0) is a reconstruction.
show_progress() {
  local pid=$1
  local delay=0.1
  local frames=$SPINNER
  local rest

  if [[ -t 2 ]]; then
    while kill -0 "$pid" 2>/dev/null; do
      # Rotate the frame string by one character per tick.
      rest=${frames#?}
      printf ' [%c]  ' "$frames" >&2
      frames=$rest${frames%"$rest"}
      sleep "$delay"
      printf '\b\b\b\b\b\b' >&2
    done
    # Blank out the last frame.
    printf '      \b\b\b\b\b\b' >&2
  fi

  # Reap the child (also blocks until completion when no spinner was shown).
  wait "$pid" 2>/dev/null
  return 0
}

# Verify that every external tool needed by --diags is installed; exits via
# handle_error when any is missing.
check_requirements() {
  log_message info "Checking medical equipment..."
  local tools=("dmidecode" "lscpu" "ip" "smartctl" "sensors" "lspci" "bc")
  local missing=()
  local tool

  for tool in "${tools[@]}"; do
    if ! command -v "$tool" >/dev/null 2>&1; then
      missing+=("$tool")
    fi
  done

  if [[ ${#missing[@]} -gt 0 ]]; then
    handle_error "Missing tools: ${missing[*]}\n Please install with 'curl -s http://10.10.10.63:3000/LotusGuild/freshStartScript/raw/branch/main/freshStart.sh | bash'"
  fi
}

# Returns 0 when the pveversion command is available, 1 otherwise.
checkIfOnHypervisor() {
  command -v pveversion >/dev/null 2>&1
}

# Print the names of whole disks (sd* and nvme*), one per line.
list_block_disks() {
  lsblk -dn -o NAME | grep -E '^(sd|nvme)'
}

# Print the network interfaces worth reporting on, one per line: everything
# except loopback and the virtual/firewall interfaces (veth*, fwbr*, fwln*,
# fwpr*, tap*).
# Arguments: $1 - sysfs network class directory (default: /sys/class/net).
# Only the exact name "lo" is excluded; a substring match would also hide
# names such as wlo1. Non-directory entries (e.g. bonding_masters) are skipped.
list_report_ifaces() {
  local net_dir=${1:-/sys/class/net}
  local path name

  for path in "$net_dir"/*; do
    [[ -d "$path" ]] || continue
    name=${path##*/}
    case "$name" in
      lo | veth* | fwbr* | fwln* | fwpr* | tap*) continue ;;
    esac
    printf '%s\n' "$name"
  done
}

###################
# System Information Functions
###################

# Print timestamp, hostname, kernel and Proxmox version.
get_system_info() {
  echo -e "\n${GREEN}=== System Information ===${NC}"
  echo -e "\n${GREEN}=== Diagnostic Run: $(date '+%Y-%m-%d %H:%M:%S') ===${NC}"
  echo -e "${GREEN}Hostname:$(uname -n)${NC}"
  echo -e "${GREEN}Kernel:$(uname -r)${NC}"
  echo -e "\n${GREEN}=== Proxmox Version ===${NC}"
  # Guard with checkIfOnHypervisor so a missing binary does not emit a
  # "command not found" error into the report.
  { checkIfOnHypervisor && pveversion; } || echo "Not available"
}

# Print the output of `sensors`, or a warning when it is not installed.
get_temp_info() {
  echo -e "\n${GREEN}=== Temperature Information ===${NC}"
  if command -v sensors >/dev/null 2>&1; then
    sensors
  else
    log_message warn "sensors command not found. Install lm-sensors package for temperature monitoring"
  fi
}

# Print the SMART health verdict (smartctl -H) for every sd*/nvme* disk.
get_disk_health() {
  local disk
  local -a disks=()

  echo -e "\n${GREEN}=== Disk Health Status ===${NC}"
  if ! command -v smartctl >/dev/null 2>&1; then
    log_message warn "smartctl not found. Install smartmontools for disk health monitoring"
    return 0
  fi

  mapfile -t disks < <(list_block_disks)
  for disk in "${disks[@]}"; do
    echo -e "\nChecking /dev/$disk:"
    smartctl -H "/dev/$disk"
  done
}

# Print CPU model, logical CPU count and clock speed.
# The clock is lscpu's "CPU max MHz" when present, else "CPU MHz"; lscpu is run
# with LC_ALL=C so the field labels are not localized.
get_cpu_info() {
  local cpu_model cpu_cores cpu_mhz lscpu_out

  cpu_model=$(grep -m 1 -w 'model name' /proc/cpuinfo | awk -F: '{print $2}' | xargs)
  if [[ -z "$cpu_model" ]]; then
    echo -e "${RED}Failed to retrieve CPU model information.${NC}"
  fi

  lscpu_out=$(LC_ALL=C lscpu)
  cpu_cores=$(awk '/^CPU\(s\):/ {print $2; exit}' <<<"$lscpu_out")
  cpu_mhz=$(awk -F: '
    /^CPU max MHz:/ { max = $2 }
    /^CPU MHz:/     { cur = $2 }
    END {
      v = (max != "") ? max : cur
      gsub(/^[ \t]+|[ \t]+$/, "", v)
      print v
    }' <<<"$lscpu_out")

  echo -e "${GREEN}CPU Model:${NC} $cpu_model"
  echo -e "${GREEN}CPU Cores:${NC} $cpu_cores"
  echo -e "${GREEN}CPU MHz:${NC} $cpu_mhz"
}

# Print total/used/free RAM taken from a single `free -h` snapshot.
get_ram_info() {
  local ram_total="" ram_used="" ram_free=""

  read -r ram_total ram_used ram_free \
    < <(free -h | awk '/^Mem:/ {print $2, $3, $4}')

  echo -e "${GREEN}Total RAM:${NC} $ram_total"
  echo -e "${GREEN}Used RAM:${NC} $ram_used"
  echo -e "${GREEN}Free RAM:${NC} $ram_free"
}

# Print usage of /dev-backed filesystems and, when zpool exists, pool status.
get_storage_info() {
  echo -e "${GREEN}Storage Information:${NC}"
  df -h --output=source,size,used,avail,pcent | grep '^/dev'

  if command -v zpool >/dev/null 2>&1; then
    echo -e "\n${GREEN}=== ZFS Pool Status ===${NC}"
    zpool status
  fi
}

# Print the default gateway and the host's IP addresses.
get_network_info() {
  local default_gateway ip_addresses

  default_gateway=$(ip route | grep default | awk '{print $3}')
  ip_addresses=$(hostname -I | xargs)

  echo -e "${GREEN}Default Gateway:${NC} $default_gateway"
  echo -e "${GREEN}IP Addresses:${NC} $ip_addresses"
}

# Print per-interface counters (ip -s link) and socket/interface statistics.
get_detailed_network() {
  local iface
  local -a ifaces=()

  echo -e "\n${GREEN}=== Network Interface Statistics ===${NC}"
  # Show only physical interfaces and bridges, skip virtual/firewall interfaces
  mapfile -t ifaces < <(list_report_ifaces)
  for iface in "${ifaces[@]}"; do
    ip -s link show "$iface" 2>/dev/null
  done

  echo -e "\n${GREEN}=== Network Statistics ===${NC}"
  if command -v ss >/dev/null 2>&1; then
    ss -s
  elif command -v netstat >/dev/null 2>&1; then
    netstat -i
  else
    log_message warn "netstat/ss not found for network statistics"
  fi
}

# Print the BIOS version and the "interesting" PCI devices.
get_hardware_info() {
  echo -e "${GREEN}BIOS Version:${NC} $(dmidecode -s bios-version)"
  echo -e "\n${GREEN}=== PCI Devices ===${NC}"
  # Show interesting devices, exclude bridges, infrastructure, and integrated motherboard devices
  lspci | grep -v -E "Host bridge|PCI bridge|ISA bridge|SMBus|IOMMU|Dummy|USB controller|Audio device|Encryption controller|Multimedia controller"
}

# Print baseboard and system identification strings from dmidecode.
get_motherboard_info() {
  echo -e "\n${GREEN}=== Motherboard Information ===${NC}"
  echo -e "${GREEN}Manufacturer:${NC} $(dmidecode -s baseboard-manufacturer)"
  echo -e "${GREEN}Product Name:${NC} $(dmidecode -s baseboard-product-name)"
  echo -e "${GREEN}Version:${NC} $(dmidecode -s baseboard-version)"
  echo -e "${GREEN}Serial Number:${NC} $(dmidecode -s baseboard-serial-number)"
  echo -e "${GREEN}System Manufacturer:${NC} $(dmidecode -s system-manufacturer)"
  echo -e "${GREEN}System Product:${NC} $(dmidecode -s system-product-name)"
  echo -e "${GREEN}System Serial:${NC} $(dmidecode -s system-serial-number)"
}

# Turn `dmidecode -t memory` text (stdin) into one table row per populated
# DIMM: locator, size, type, speed, manufacturer.
# A row is emitted at the END of each "Memory Device" record, because Type,
# Speed and Manufacturer appear after Locator in dmidecode output. Keys are
# matched exactly, so "Bank Locator", "Configured Memory Speed", "Volatile
# Size" etc. cannot overwrite the real fields.
format_dimm_table() {
  awk '
    function field_value(line,    v) {
      v = substr(line, index(line, ":") + 1)
      gsub(/^[ \t]+|[ \t]+$/, "", v)
      return v
    }
    function emit_dimm() {
      if (in_device && size != "") {
        printf "%-12s %-10s %-8s %-12s %-20s\n", loc, size, type, speed, mfr
      }
      in_device = 0
      loc = ""; size = ""; type = ""; speed = ""; mfr = ""
    }
    /^Memory Device/ { emit_dimm(); in_device = 1; next }
    /^[ \t]*$/       { emit_dimm(); next }
    !in_device       { next }
    {
      key = $0
      sub(/:.*$/, "", key)
      gsub(/^[ \t]+|[ \t]+$/, "", key)
      val = field_value($0)

      if (key == "Size") {
        if (val !~ /^(No Module Installed|Not Installed)/) size = val
      } else if (key == "Locator") {
        loc = val
      } else if (key == "Type") {
        if (val !~ /Unknown|Error/) type = val
      } else if (key == "Speed") {
        if (val !~ /Unknown/) speed = val
      } else if (key == "Manufacturer") {
        if (val !~ /Unknown/) mfr = val
      }
    }
    END { emit_dimm() }
  '
}

# Print the DIMM table plus slot/population/capacity summary.
# dmidecode is queried once and the captured text is reused for every figure.
get_memory_details() {
  local dmi_memory total_slots populated max_capacity

  echo -e "\n${GREEN}=== Memory DIMM Information ===${NC}"
  dmi_memory=$(dmidecode -t memory)
  format_dimm_table <<<"$dmi_memory"

  echo -e "\n${GREEN}Memory Summary:${NC}"
  # Slots: one "Locator:" line per Memory Device ("Bank Locator:" lines do not
  # match because the key must directly follow the leading whitespace).
  total_slots=$(grep -cE '^[[:space:]]+Locator:' <<<"$dmi_memory")
  # Populated slots: Size lines that carry an actual size value.
  populated=$(grep -E '^[[:space:]]+Size:' <<<"$dmi_memory" \
    | grep -cvE 'No Module|Not Installed')
  max_capacity=$(awk '/Maximum Capacity/ {print $3" "$4; exit}' <<<"$dmi_memory")

  echo -e " Total Slots: $total_slots"
  echo -e " Populated: $populated"
  echo -e " Max Capacity: $max_capacity"
}

# Print driver, MAC, link state and (when ethtool exists) speed/duplex/link
# and firmware version for every reported interface.
get_nic_details() {
  local iface sys_path driver_link link_info line fw_ver
  local have_ethtool=0
  local -a ifaces=()

  echo -e "\n${GREEN}=== Network Interface Details ===${NC}"
  command -v ethtool >/dev/null 2>&1 && have_ethtool=1

  # Show only physical interfaces and bridges, skip virtual/firewall interfaces
  mapfile -t ifaces < <(list_report_ifaces)
  for iface in "${ifaces[@]}"; do
    sys_path="/sys/class/net/$iface"
    echo -e "\n${GREEN}Interface: $iface${NC}"

    # Driver name is the basename of the driver symlink target.
    if [[ -L "$sys_path/device/driver" ]]; then
      driver_link=$(readlink "$sys_path/device/driver")
      echo -e " Driver: ${driver_link##*/}"
    fi

    if [[ -f "$sys_path/address" ]]; then
      echo -e " MAC: $(cat "$sys_path/address")"
    fi

    if [[ -f "$sys_path/operstate" ]]; then
      echo -e " State: $(cat "$sys_path/operstate")"
    fi

    (( have_ethtool )) || continue

    link_info=$(ethtool "$iface" 2>/dev/null | grep -E "Speed:|Duplex:|Link detected:")
    if [[ -n "$link_info" ]]; then
      # read (default IFS) strips ethtool's leading tab from each line.
      while read -r line; do
        echo -e " $line"
      done <<<"$link_info"
    fi

    # Take everything after the label so versions containing spaces survive.
    fw_ver=$(ethtool -i "$iface" 2>/dev/null \
      | awk -F': ' '/^firmware-version:/ {print $2}')
    if [[ -n "$fw_ver" ]]; then
      echo -e " Firmware: $fw_ver"
    fi
  done
}

# Print storage controllers (RAID/SAS/SATA/SCSI/NVMe/...) found by lspci,
# followed by subsystem / link status / kernel driver lines for each.
get_hba_info() {
  local ctrl
  local -a controllers=()

  echo -e "\n${GREEN}=== HBA/Storage Controller Information ===${NC}"
  # lspci -vmm emits one blank-line-separated record per device; RS="" makes
  # awk treat each record as one unit and FS="\n" makes each line a field.
  lspci -vmm 2>/dev/null | awk '
    BEGIN { RS=""; FS="\n" }
    /RAID|SAS|SATA|SCSI|Mass storage|Serial Attached|Fibre Channel|NVMe/ {
      for (i = 1; i <= NF; i++) {
        if ($i ~ /^Slot:/)   slot   = substr($i, 7)
        if ($i ~ /^Class:/)  class  = substr($i, 8)
        if ($i ~ /^Vendor:/) vendor = substr($i, 9)
        if ($i ~ /^Device:/) device = substr($i, 9)
        if ($i ~ /^Rev:/)    rev    = substr($i, 6)
      }
      printf "\n%s\n", slot
      printf " Class: %s\n", class
      printf " Vendor: %s\n", vendor
      printf " Device: %s\n", device
      if (rev) printf " Rev: %s\n", rev
      slot=""; class=""; vendor=""; device=""; rev=""
    }
  '

  echo -e "\n${GREEN}=== Storage Controller Details ===${NC}"
  mapfile -t controllers < <(lspci \
    | grep -iE "RAID|SAS|SATA|SCSI|Mass storage|NVMe" \
    | awk '{print $1}')
  for ctrl in "${controllers[@]}"; do
    echo -e "\n${GREEN}Controller $ctrl:${NC}"
    lspci -vvs "$ctrl" 2>/dev/null \
      | grep -E "^[[:space:]]+(Subsystem|LnkSta|Kernel driver)" \
      | head -5
  done
}

# Print load average, number of running services and the latest errors.
get_system_status() {
  echo -e "\n${GREEN}=== System Load ===${NC}"
  uptime
  echo -e "\n${GREEN}=== Service Status ===${NC}"
  # --no-legend drops the header/footer so the count is units only.
  systemctl list-units --type=service --state=running --no-legend --no-pager | wc -l
  echo -e "\n${GREEN}=== Recent System Errors ===${NC}"
  journalctl -p err -n 5 --no-pager
}

###################
# DriveAtlas & Monitoring Functions
###################

# Download and run the DriveAtlas bay-mapping script.
# SECURITY NOTE(review): this executes a script fetched over plain HTTP, and
# the caller normally runs as root. Consider HTTPS and/or pinning a checksum.
# The download is checked (--fail) before anything is executed so an HTTP
# error page is never fed to bash; the connect timeout avoids a long hang
# when the server is unreachable.
get_drive_atlas() {
  local url="http://10.10.10.63:3000/LotusGuild/driveAtlas/raw/branch/main/driveAtlas.sh"
  local atlas_script

  echo -e "\n${GREEN}=== Drive Atlas - Physical Bay Mapping ===${NC}"
  if ! command -v curl >/dev/null 2>&1; then
    log_message warn "curl not installed - cannot fetch DriveAtlas"
    return 0
  fi

  if ! atlas_script=$(curl -fsL --connect-timeout 10 "$url" 2>/dev/null) \
    || [[ -z "$atlas_script" ]]; then
    log_message warn "DriveAtlas failed to execute or server unavailable"
    return 0
  fi

  # Process substitution keeps stdin untouched for the child script.
  if ! bash <(printf '%s\n' "$atlas_script") 2>/dev/null; then
    log_message warn "DriveAtlas failed to execute or server unavailable"
  fi
}

# Print Ceph health, OSD tree, pool usage and OSD usage (best-effort).
get_ceph_health() {
  echo -e "\n${GREEN}=== Ceph Cluster Health ===${NC}"
  if ! command -v ceph >/dev/null 2>&1; then
    log_message info "Ceph tools not installed on this node"
    return 0
  fi

  echo -e "${GREEN}Health Status:${NC}"
  ceph health detail 2>/dev/null || log_message warn "Cannot connect to Ceph cluster"
  echo -e "\n${GREEN}=== Ceph OSD Tree ===${NC}"
  ceph osd tree 2>/dev/null || true
  echo -e "\n${GREEN}=== Ceph Pool Usage ===${NC}"
  ceph df 2>/dev/null || true
  echo -e "\n${GREEN}=== Ceph OSD Usage ===${NC}"
  ceph osd df 2>/dev/null || true
}

# Report whether the node_exporter service runs and listens on port 9100.
get_node_exporter_status() {
  local ip

  echo -e "\n${GREEN}=== Node Exporter Status ===${NC}"
  if systemctl is-active --quiet node_exporter 2>/dev/null; then
    echo -e "${GREEN}Service:${NC} Running"
    ip=$(hostname -I | awk '{print $1}')
    echo -e "${GREEN}Metrics URL:${NC} http://${ip}:9100/metrics"
    if ss -tlnp 2>/dev/null | grep -q ':9100'; then
      echo -e "${GREEN}Port 9100:${NC} Listening"
    else
      log_message warn "Port 9100 not listening"
    fi
  elif systemctl list-unit-files 2>/dev/null | grep -q node_exporter; then
    log_message warn "Node Exporter is installed but not running"
    echo -e "Start with: systemctl start node_exporter"
  else
    log_message info "Node Exporter not installed"
  fi
}

# Report the state of hwmon.timer and the last hwmon.service log lines.
get_hwmon_status() {
  echo -e "\n${GREEN}=== hwmon Daemon Status ===${NC}"
  if systemctl is-active --quiet hwmon.timer 2>/dev/null; then
    echo -e "${GREEN}Timer:${NC} Active"
    systemctl list-timers hwmon.timer --no-pager 2>/dev/null
    echo -e "\n${GREEN}Last Run:${NC}"
    journalctl -u hwmon.service -n 3 --no-pager 2>/dev/null || true
  elif systemctl list-unit-files 2>/dev/null | grep -qF hwmon.timer; then
    log_message warn "hwmon timer is installed but not active"
    echo -e "Enable with: systemctl enable --now hwmon.timer"
  else
    log_message info "hwmon daemon not installed"
  fi
}

# Condensed check: services, temperatures, one-line SMART verdict per disk,
# node exporter and Ceph health.
quick_health_check() {
  local disk health
  local -a disks=()

  echo -e "\n${GREEN}=== Quick Health Check ===${NC}"
  echo -e "Running quick health assessment...\n"

  check_services
  get_temp_info

  echo -e "\n${GREEN}=== Disk Health Summary ===${NC}"
  if command -v smartctl >/dev/null 2>&1; then
    mapfile -t disks < <(list_block_disks)
    for disk in "${disks[@]}"; do
      # Keep only the text after the colon on smartctl's health line.
      health=$(smartctl -H "/dev/$disk" 2>/dev/null \
        | grep -i "health" | awk -F: '{print $2}' | xargs)
      if [[ -n "$health" ]]; then
        echo -e "/dev/$disk: $health"
      fi
    done
  fi

  get_node_exporter_status

  if command -v ceph >/dev/null 2>&1; then
    echo -e "\n${GREEN}=== Ceph Quick Status ===${NC}"
    ceph health 2>/dev/null || true
  fi
}

###################
# Proxmox Specific Functions
###################

# Print `systemctl is-active` for each service in the list.
# NOTE(review): "pvecluster" is listed next to "pve-cluster"; it looks like a
# stray name and will presumably always report inactive - confirm and drop.
check_services() {
  local services=("pvedaemon" "pveproxy" "pvecluster" "pve-cluster" "corosync")
  local service status

  echo -e "${GREEN}Checking critical services:${NC}"
  for service in "${services[@]}"; do
    status=$(systemctl is-active "$service")
    echo -e "${GREEN}$service:${NC} $status"
  done
}

# Warn when the installed pve-manager version is below 6.0.
# Not invoked by any option in this file. Falls back to 0.0 when the version
# cannot be determined.
check_pve_version() {
  local min_version="6.0"
  local current_version

  current_version=$(pveversion 2>/dev/null \
    | grep -oP 'pve-manager/\K[0-9]+\.[0-9]+' | head -n 1)
  current_version=${current_version:-0.0}

  if (( $(echo "$current_version < $min_version" | bc -l) )); then
    log_message warn "Proxmox VE version $current_version may not support all features"
  fi
}

# Print `qm list`.
# Returns: 1 (after a warning) when qm is missing. It must not exit, because
# it runs inside the --diags report and later sections should still print.
list_vms() {
  if ! command -v qm >/dev/null 2>&1; then
    log_message warn "qm command not found"
    return 1
  fi
  echo -e "${GREEN}Virtual Machine Status:${NC}"
  qm list
}

# Print `pct list`, or a warning when pct is missing.
list_containers() {
  if command -v pct >/dev/null 2>&1; then
    echo -e "\n${GREEN}=== LXC Container Status ===${NC}"
    pct list
  else
    log_message warn "pct command not found"
  fi
}

###################
# Command Line Interface Functions
###################

# Print the usage text (no exit).
usage() {
  cat <<EOF
ProxDoc - The Proxmox System Doctor v${VERSION}

Usage: curl -sL "http://10.10.10.63:3000/LotusGuild/proxDoc/raw/branch/main/proxDoc.sh" | bash -s -- [OPTION]

A comprehensive diagnostic tool for Proxmox server health checks.

Treatment Options:
 --help Show this prescription guide
 --diags Perform full system examination
 --quick Quick health check (services, temps, disks)
 --drives Show physical drive bay mapping (DriveAtlas)
 --ceph Check Ceph cluster health
 --node-exporter Check Node Exporter status
 --hwmon Check hwmon daemon status
 --services Check vital Proxmox services
 --vm-list Check VM vitals
 --ct-list Check container vitals
 --backup Review backup health
EOF
}

# Print the usage text and exit successfully.
help() {
  usage
  exit 0
}

###################
# Main Functions
###################

# Run every diagnostic section in a background subshell while a spinner is
# shown; a failure (or exit) in one section only affects that subshell.
runDiags() {
  log_message info "Beginning system examination..."
  (
    get_system_info
    get_cpu_info
    get_ram_info
    get_memory_details
    get_storage_info
    get_drive_atlas
    get_network_info
    get_detailed_network
    get_nic_details
    get_hardware_info
    get_motherboard_info
    get_hba_info
    get_temp_info
    get_system_status
    get_node_exporter_status
    get_hwmon_status
    get_ceph_health
    list_vms
    list_containers
  ) &
  show_progress $!
  log_message info "Examination complete"
}

# Dispatch a single command-line option.
# Arguments: $1 - option (see usage()).
# An unknown option prints the usage text and exits with status 1.
checkForInput() {
  case "$1" in
    --help) help ;;
    --diags)
      check_requirements
      runDiags
      ;;
    --quick) quick_health_check ;;
    --drives) get_drive_atlas ;;
    --ceph) get_ceph_health ;;
    --node-exporter) get_node_exporter_status ;;
    --hwmon) get_hwmon_status ;;
    --services) check_services ;;
    --vm-list) list_vms ;;
    --ct-list) list_containers ;;
    --backup)
      echo -e "${GREEN}Backup Status:${NC}"
      pvesm status 2>/dev/null || log_message warn "pvesm not available"
      ;;
    *)
      echo -e "${RED}Invalid option: $1${NC}" >&2
      usage
      exit 1
      ;;
  esac
}

###################
# Script Execution
###################

# Entry point: banner, usage short-circuit, root check, signal trap, dispatch.
# Arguments: the script's command-line arguments (only $1 is used).
main() {
  print_header

  # Showing the usage text needs no privileges, so handle it before the
  # root check.
  case "${1:-}" in
    '' | --help)
      usage
      return 0
      ;;
  esac

  if [[ $EUID -ne 0 ]]; then
    handle_error "This script must be run as root"
  fi

  # Set trap for interrupts
  trap 'echo -e "${RED}Script interrupted.${NC}"; exit 1' INT TERM

  checkForInput "$1"
}

main "$@"