#!/bin/bash
#
# ProxDoc - The Proxmox System Doctor
#
# Diagnostic tool for Proxmox server health checks. Typical invocation:
#   curl -sL "http://10.10.10.63:3000/LotusGuild/proxDoc/raw/branch/main/proxDoc.sh" | bash -s -- [OPTION]
# Run with --help for the list of options. Must be run as root.

VERSION="1.1.0"

###################
# Color Definitions
###################
NC="\033[00m"
GREEN="\033[01;32m"
RED="\033[01;31m"
YELLOW="\033[01;33m"

###################
# Utility Functions
###################

# Print the ASCII-art banner.
# NOTE(review): the tail of this function was lost in the mangled source; the
# banner below is rebuilt from the surviving fragments (figlet "slant" font,
# text "ProxDoc"). Anything the original printed after the art is unknown.
print_header() {
    cat <<'EOF'
    ____                  ____
   / __ \_________  _  __/ __ \____  _____
  / /_/ / ___/ __ \| |/_/ / / / __ \/ ___/
 / ____/ /  / /_/ />  </ /_/ / /_/ / /__
/_/   /_/   \____/_/|_/_____/\____/\___/
EOF
}

# Print a message tagged with a severity level.
# Arguments: $1 - level (info | warn | error), $2 - message text
# NOTE(review): the original definition was lost in the mangled source; this
# is reconstructed from its call sites (`log_message info|warn "<text>"`).
# The exact prefix/colour scheme of the original is an assumption - confirm.
log_message() {
    local level=$1
    local message=$2

    case "$level" in
        info)  echo -e "${GREEN}[INFO]${NC} $message" ;;
        warn)  echo -e "${YELLOW}[WARN]${NC} $message" ;;
        error) echo -e "${RED}[ERROR]${NC} $message" >&2 ;;
        *)     echo -e "$message" ;;
    esac
}

# Report a fatal error and terminate the script with status 1.
# Arguments: $1 - message text (may contain \n escapes)
# NOTE(review): reconstructed; callers (root check, missing tools) rely on it
# not returning, so it exits. Confirm against the original.
handle_error() {
    log_message error "$1"
    exit 1
}

# Abort via handle_error if any required external tool is not on PATH.
# NOTE(review): the original tool list was lost in the mangled source. The
# list below is the set of commands the --diags path invokes without its own
# availability guard - confirm against the original.
check_requirements() {
    local tools=(dmidecode lscpu lspci ip free df)
    local missing=()
    local tool

    for tool in "${tools[@]}"; do
        if ! command -v "$tool" >/dev/null 2>&1; then
            missing+=("$tool")
        fi
    done

    if [[ ${#missing[@]} -gt 0 ]]; then
        handle_error "Missing tools: ${missing[*]}\n Please install with 'curl -s http://10.10.10.63:3000/LotusGuild/freshStartScript/raw/branch/main/freshStart.sh | bash'"
    fi
}

# Succeeds when the pveversion command is available, which this script uses
# as its "running on Proxmox" test.
checkIfOnHypervisor() {
    command -v pveversion >/dev/null 2>&1
}

# Print the names (without /dev/) of whole disks whose name starts with
# "sd" or "nvme", one per line.
list_disks() {
    lsblk -dn -o NAME 2>/dev/null | grep -E '^(sd|nvme)'
}

###################
# System Information Functions
###################

# Print timestamp, hostname, kernel, and either the Proxmox version or the
# distribution name from /etc/os-release.
get_system_info() {
    echo -e "\n${GREEN}=== System Information ===${NC}"
    echo -e "\n${GREEN}=== Diagnostic Run: $(date '+%Y-%m-%d %H:%M:%S') ===${NC}"
    echo -e "${GREEN}Hostname:${NC} $(uname -n)"
    echo -e "${GREEN}Kernel:${NC} $(uname -r)"

    if checkIfOnHypervisor; then
        echo -e "\n${GREEN}=== Proxmox Version ===${NC}"
        pveversion
    else
        echo -e "\n${GREEN}=== OS Information ===${NC}"
        if [[ -f /etc/os-release ]]; then
            # Read the file in a subshell: sourcing it into this shell would
            # let its variables (os-release files commonly set VERSION)
            # clobber this script's own globals.
            local pretty_name
            pretty_name=$(. /etc/os-release && printf '%s' "$PRETTY_NAME")
            echo -e "${GREEN}Distribution:${NC} $pretty_name"
        else
            echo "OS information not available"
        fi
    fi
}

# Print the output of `sensors`, or a warning when it is not installed.
get_temp_info() {
    echo -e "\n${GREEN}=== Temperature Information ===${NC}"
    if command -v sensors >/dev/null 2>&1; then
        sensors
    else
        log_message warn "sensors command not found. Install lm-sensors package for temperature monitoring"
    fi
}

# Run `smartctl -H` against every sd*/nvme* disk.
get_disk_health() {
    echo -e "\n${GREEN}=== Disk Health Status ===${NC}"
    if ! command -v smartctl >/dev/null 2>&1; then
        log_message warn "smartctl not found. Install smartmontools for disk health monitoring"
        return 0
    fi

    local disk
    while IFS= read -r disk; do
        [[ -z "$disk" ]] && continue
        echo -e "\nChecking /dev/$disk:"
        smartctl -H "/dev/$disk"
    done < <(list_disks)
}

# Print CPU model, logical CPU count and clock speed.
get_cpu_info() {
    local cpu_info cpu_cores cpu_mhz lscpu_out

    cpu_info=$(grep -m 1 -w 'model name' /proc/cpuinfo 2>/dev/null | awk -F: '{print $2}' | xargs)
    # The pipeline's status is that of xargs, so test the result instead.
    if [[ -z "$cpu_info" ]]; then
        echo -e "${RED}Failed to retrieve CPU model information.${NC}"
    fi

    lscpu_out=$(lscpu 2>/dev/null)
    cpu_cores=$(awk '/^CPU\(s\):/ { print $2; exit }' <<<"$lscpu_out")

    # lscpu may print several "MHz" lines (current, max, min, scaling).
    # Prefer the current clock and fall back to the maximum.
    cpu_mhz=$(awk -F: '
        /^CPU MHz:/     { current = $2 }
        /^CPU max MHz:/ { maximum = $2 }
        END {
            mhz = (current != "") ? current : maximum
            gsub(/^[ \t]+|[ \t]+$/, "", mhz)
            print mhz
        }' <<<"$lscpu_out")
    if [[ -z "$cpu_mhz" ]]; then
        cpu_mhz=$(awk -F: '/^cpu MHz/ { gsub(/[ \t]/, "", $2); print $2; exit }' /proc/cpuinfo 2>/dev/null)
    fi

    echo -e "${GREEN}CPU Model:${NC} $cpu_info"
    echo -e "${GREEN}CPU Cores:${NC} $cpu_cores"
    echo -e "${GREEN}CPU MHz:${NC} ${cpu_mhz:-unknown}"
}

# Print total / used / free RAM as reported by `free -h`.
get_ram_info() {
    local ram_total ram_used ram_free

    # One snapshot, so the three numbers are mutually consistent.
    read -r ram_total ram_used ram_free < <(free -h | awk '/^Mem:/ { print $2, $3, $4 }')

    echo -e "${GREEN}Total RAM:${NC} $ram_total"
    echo -e "${GREEN}Used RAM:${NC} $ram_used"
    echo -e "${GREEN}Free RAM:${NC} $ram_free"
}

# Print usage of /dev-backed filesystems and, when zpool exists, pool status.
get_storage_info() {
    echo -e "${GREEN}Storage Information:${NC}"
    df -h --output=source,size,used,avail,pcent | grep '^/dev'

    if command -v zpool >/dev/null 2>&1; then
        echo -e "\n${GREEN}=== ZFS Pool Status ===${NC}"
        zpool status
    fi
}

# Print the default gateway(s) and the host's IP addresses.
get_network_info() {
    local default_gateway ip_addresses

    # Take the token after "via" so gateway-less defaults ("default dev ...")
    # do not yield a device name; xargs joins multiple gateways on one line.
    default_gateway=$(ip route show default 2>/dev/null \
        | awk '{ for (i = 1; i < NF; i++) if ($i == "via") print $(i + 1) }' \
        | xargs)
    ip_addresses=$(hostname -I | xargs)

    echo -e "${GREEN}Default Gateway:${NC} $default_gateway"
    echo -e "${GREEN}IP Addresses:${NC} $ip_addresses"
}

# Print per-interface link counters and a socket/interface summary.
get_detailed_network() {
    echo -e "\n${GREEN}=== Network Interface Statistics ===${NC}"
    local iface
    while IFS= read -r iface; do
        [[ -z "$iface" ]] && continue
        ip -s link show "$iface" 2>/dev/null
    done < <(get_physical_interfaces)

    echo -e "\n${GREEN}=== Network Statistics ===${NC}"
    if command -v ss >/dev/null 2>&1; then
        ss -s
    elif command -v netstat >/dev/null 2>&1; then
        netstat -i
    else
        log_message warn "netstat/ss not found for network statistics"
    fi
}

# Print the BIOS version and the list of "interesting" PCI devices.
get_hardware_info() {
    echo -e "${GREEN}BIOS Version:${NC} $(dmidecode -s bios-version)"
    echo -e "\n${GREEN}=== PCI Devices ===${NC}"
    # Show interesting devices, exclude bridges, infrastructure, and
    # integrated motherboard devices
    lspci | grep -v -E "Host bridge|PCI bridge|ISA bridge|SMBus|IOMMU|Dummy|USB controller|Audio device|Encryption controller|Multimedia controller"
}

# Print baseboard and system identification strings from dmidecode.
get_motherboard_info() {
    echo -e "\n${GREEN}=== Motherboard Information ===${NC}"
    echo -e "${GREEN}Manufacturer:${NC} $(dmidecode -s baseboard-manufacturer)"
    echo -e "${GREEN}Product Name:${NC} $(dmidecode -s baseboard-product-name)"
    echo -e "${GREEN}Version:${NC} $(dmidecode -s baseboard-version)"
    echo -e "${GREEN}Serial Number:${NC} $(dmidecode -s baseboard-serial-number)"
    echo -e "${GREEN}System Manufacturer:${NC} $(dmidecode -s system-manufacturer)"
    echo -e "${GREEN}System Product:${NC} $(dmidecode -s system-product-name)"
    echo -e "${GREEN}System Serial:${NC} $(dmidecode -s system-serial-number)"
}

# Read `dmidecode -t memory` text on stdin and print one row per populated
# "Memory Device" record: locator, size, type, speed, manufacturer.
format_dimm_table() {
    awk '
        # Emit the record collected so far (only if a module is installed),
        # then reset state for the next record.
        function flush_device() {
            if (in_device && size != "") {
                printf "%-12s %-10s %-8s %-12s %-20s\n", locator, size, type, speed, manufacturer
            }
            in_device = 0
            locator = ""; size = ""; type = ""; speed = ""; manufacturer = ""
        }

        /^Memory Device/ { flush_device(); in_device = 1; next }
        /^[ \t]*$/       { flush_device(); next }

        in_device {
            line = $0
            sub(/^[ \t]+/, "", line)
            sep = index(line, ":")
            if (sep == 0) next
            key = substr(line, 1, sep - 1)
            value = substr(line, sep + 1)
            gsub(/^[ \t]+|[ \t]+$/, "", value)

            # Exact key comparison: "Volatile Size", "Bank Locator",
            # "Type Detail", "Configured Memory Speed" must not match.
            if (key == "Size") {
                if (value !~ /^(No Module Installed|Not Installed)/) size = value
            } else if (key == "Locator") {
                locator = value
            } else if (key == "Type") {
                if (value !~ /Unknown|Error/) type = value
            } else if (key == "Speed") {
                if (value !~ /Unknown/) speed = value
            } else if (key == "Manufacturer") {
                if (value !~ /Unknown/) manufacturer = value
            }
        }

        END { flush_device() }
    '
}

# Print a table of installed DIMMs plus slot-count / capacity summary.
get_memory_details() {
    local dmi_memory total_slots populated max_capacity

    # Query dmidecode once and reuse the text for every computation below.
    dmi_memory=$(dmidecode -t memory)

    echo -e "\n${GREEN}=== Memory DIMM Information ===${NC}"
    format_dimm_table <<<"$dmi_memory"

    echo -e "\n${GREEN}Memory Summary:${NC}"
    # Slots: indented "Locator:" lines (the anchor excludes "Bank Locator:").
    total_slots=$(grep -cE '^[[:space:]]+Locator:' <<<"$dmi_memory")
    # Populated: "Size:" lines that carry an actual size.
    populated=$(grep -E '^[[:space:]]+Size:' <<<"$dmi_memory" | grep -cvE 'No Module|Not Installed')
    max_capacity=$(awk '/Maximum Capacity/ { print $3" "$4; exit }' <<<"$dmi_memory")

    echo -e " Total Slots: $total_slots"
    echo -e " Populated: $populated"
    echo -e " Max Capacity: $max_capacity"
}

# Print driver, MAC, link state and (via ethtool, when present) speed,
# duplex and firmware version for each interface.
get_nic_details() {
    echo -e "\n${GREEN}=== Network Interface Details ===${NC}"

    local iface driver link_info line fw_ver
    while IFS= read -r iface; do
        [[ -z "$iface" ]] && continue
        echo -e "\n${GREEN}Interface: $iface${NC}"

        # Get driver info
        if [[ -L "/sys/class/net/$iface/device/driver" ]]; then
            driver=$(basename "$(readlink "/sys/class/net/$iface/device/driver")")
            echo -e " Driver: $driver"
        fi

        # Get MAC address
        if [[ -f "/sys/class/net/$iface/address" ]]; then
            echo -e " MAC: $(cat "/sys/class/net/$iface/address")"
        fi

        # Get link state
        if [[ -f "/sys/class/net/$iface/operstate" ]]; then
            echo -e " State: $(cat "/sys/class/net/$iface/operstate")"
        fi

        # Use ethtool if available
        if command -v ethtool >/dev/null 2>&1; then
            # Get speed and duplex
            link_info=$(ethtool "$iface" 2>/dev/null | grep -E "Speed:|Duplex:|Link detected:")
            if [[ -n "$link_info" ]]; then
                while IFS= read -r line; do
                    echo -e " $line"
                done <<<"$link_info"
            fi

            # Get firmware version; keep the whole value, it may contain spaces.
            fw_ver=$(ethtool -i "$iface" 2>/dev/null \
                | awk '/^firmware-version:/ { sub(/^firmware-version:[ \t]*/, ""); print; exit }')
            if [[ -n "$fw_ver" ]]; then
                echo -e " Firmware: $fw_ver"
            fi
        fi
    done < <(get_physical_interfaces)
}

# Print the names of network interfaces under /sys/class/net, one per line,
# skipping loopback and veth/fwbr/fwln/fwpr/tap interfaces.
get_physical_interfaces() {
    local path iface
    for path in /sys/class/net/*; do
        # Skip an unmatched glob and non-directory entries such as
        # /sys/class/net/bonding_masters, which is not an interface.
        [[ -d "$path" ]] || continue

        iface=${path##*/}

        # Skip loopback
        [[ "$iface" == "lo" ]] && continue

        # Skip virtual/firewall interfaces
        [[ "$iface" =~ ^(veth|fwbr|fwln|fwpr|tap) ]] && continue

        echo "$iface"
    done
}

# Print storage controllers found by lspci, then per-controller subsystem,
# link status and kernel driver lines.
get_hba_info() {
    echo -e "\n${GREEN}=== HBA/Storage Controller Information ===${NC}"

    # Find RAID, SAS, SATA, SCSI, and storage controllers.
    # `lspci -vmm` emits blank-line separated records, hence RS="".
    lspci -vmm 2>/dev/null | awk '
        BEGIN { RS = ""; FS = "\n" }
        /RAID|SAS|SATA|SCSI|Mass storage|Serial Attached|Fibre Channel|NVMe/ {
            for (i = 1; i <= NF; i++) {
                if ($i ~ /^Slot:/)   slot   = substr($i, 7)
                if ($i ~ /^Class:/)  class  = substr($i, 8)
                if ($i ~ /^Vendor:/) vendor = substr($i, 9)
                if ($i ~ /^Device:/) device = substr($i, 9)
                if ($i ~ /^Rev:/)    rev    = substr($i, 6)
            }
            printf "\n%s\n", slot
            printf " Class: %s\n", class
            printf " Vendor: %s\n", vendor
            printf " Device: %s\n", device
            if (rev) printf " Rev: %s\n", rev
            slot = ""; class = ""; vendor = ""; device = ""; rev = ""
        }
    '

    # Show detailed info for storage controllers
    echo -e "\n${GREEN}=== Storage Controller Details ===${NC}"
    local ctrl
    while IFS= read -r ctrl; do
        [[ -z "$ctrl" ]] && continue
        echo -e "\n${GREEN}Controller $ctrl:${NC}"
        lspci -vvs "$ctrl" 2>/dev/null | grep -E "^\s+(Subsystem|LnkSta|Kernel driver)" | head -5
    done < <(lspci | grep -iE "RAID|SAS|SATA|SCSI|Mass storage|NVMe" | awk '{print $1}')
}

# Print load average, the number of running services, and recent errors.
get_system_status() {
    echo -e "\n${GREEN}=== System Load ===${NC}"
    uptime

    echo -e "\n${GREEN}=== Service Status ===${NC}"
    # --no-legend drops the header/footer so the count is units only.
    local running_services
    running_services=$(systemctl list-units --type=service --state=running --no-legend --no-pager 2>/dev/null | wc -l)
    echo -e "${GREEN}Running services:${NC} $running_services"

    echo -e "\n${GREEN}=== Recent System Errors ===${NC}"
    journalctl -p err -n 5 --no-pager
}

###################
# DriveAtlas & Monitoring Functions
###################

# Download and run the DriveAtlas script to show physical bay mapping.
# SECURITY NOTE(review): this executes code fetched over plain HTTP as root.
# Behaviour kept from the original; consider HTTPS and/or checksum pinning.
get_drive_atlas() {
    echo -e "\n${GREEN}=== Drive Atlas - Physical Bay Mapping ===${NC}"
    if ! command -v curl >/dev/null 2>&1; then
        log_message warn "curl not installed - cannot fetch DriveAtlas"
        return 0
    fi

    local atlas_url="http://10.10.10.63:3000/LotusGuild/driveAtlas/raw/branch/main/driveAtlas.sh"
    local atlas_script
    if ! atlas_script=$(mktemp); then
        log_message warn "DriveAtlas failed to execute or server unavailable"
        return 0
    fi

    # Download first (-f: HTTP errors are failures) and only run a non-empty
    # file: `bash <(curl ...)` would "succeed" on an empty download.
    if curl -fsSL --connect-timeout 10 -o "$atlas_script" "$atlas_url" 2>/dev/null \
        && [[ -s "$atlas_script" ]] \
        && bash "$atlas_script" 2>/dev/null; then
        :
    else
        log_message warn "DriveAtlas failed to execute or server unavailable"
    fi

    rm -f -- "$atlas_script"
}

# Print Ceph health, OSD tree, pool usage and OSD usage when ceph is present.
get_ceph_health() {
    echo -e "\n${GREEN}=== Ceph Cluster Health ===${NC}"
    if command -v ceph >/dev/null 2>&1; then
        echo -e "${GREEN}Health Status:${NC}"
        ceph health detail 2>/dev/null || log_message warn "Cannot connect to Ceph cluster"

        # The remaining queries are best-effort; failures are ignored on purpose.
        echo -e "\n${GREEN}=== Ceph OSD Tree ===${NC}"
        ceph osd tree 2>/dev/null || true
        echo -e "\n${GREEN}=== Ceph Pool Usage ===${NC}"
        ceph df 2>/dev/null || true
        echo -e "\n${GREEN}=== Ceph OSD Usage ===${NC}"
        ceph osd df 2>/dev/null || true
    else
        log_message info "Ceph tools not installed on this node"
    fi
}

# Report whether the node_exporter service is running and port 9100 listens.
get_node_exporter_status() {
    echo -e "\n${GREEN}=== Node Exporter Status ===${NC}"
    if systemctl is-active --quiet node_exporter 2>/dev/null; then
        echo -e "${GREEN}Service:${NC} Running"

        local primary_ip
        primary_ip=$(hostname -I | awk '{print $1}')
        echo -e "${GREEN}Metrics URL:${NC} http://${primary_ip}:9100/metrics"

        # Require whitespace after the port so e.g. :91000 does not match.
        if ss -tlnp 2>/dev/null | grep -qE ':9100[[:space:]]'; then
            echo -e "${GREEN}Port 9100:${NC} Listening"
        else
            log_message warn "Port 9100 not listening"
        fi
    elif systemctl list-unit-files 2>/dev/null | grep -q node_exporter; then
        log_message warn "Node Exporter is installed but not running"
        echo -e "Start with: systemctl start node_exporter"
    else
        log_message info "Node Exporter not installed"
    fi
}

# Report the state of hwmon.timer and the last hwmon.service log lines.
get_hwmon_status() {
    echo -e "\n${GREEN}=== hwmon Daemon Status ===${NC}"
    if systemctl is-active --quiet hwmon.timer 2>/dev/null; then
        echo -e "${GREEN}Timer:${NC} Active"
        systemctl list-timers hwmon.timer --no-pager 2>/dev/null
        echo -e "\n${GREEN}Last Run:${NC}"
        journalctl -u hwmon.service -n 3 --no-pager 2>/dev/null || true
    elif systemctl list-unit-files 2>/dev/null | grep -qF 'hwmon.timer'; then
        log_message warn "hwmon timer is installed but not active"
        echo -e "Enable with: systemctl enable --now hwmon.timer"
    else
        log_message info "hwmon daemon not installed"
    fi
}

# Short assessment: Proxmox services, temperatures, SMART summary,
# Node Exporter and a one-line Ceph status.
quick_health_check() {
    echo -e "\n${GREEN}=== Quick Health Check ===${NC}"
    echo -e "Running quick health assessment...\n"

    # Services
    check_services

    # Temperatures
    get_temp_info

    # Disk health (quick)
    echo -e "\n${GREEN}=== Disk Health Summary ===${NC}"
    if command -v smartctl >/dev/null 2>&1; then
        local disk health
        while IFS= read -r disk; do
            [[ -z "$disk" ]] && continue
            health=$(smartctl -H "/dev/$disk" 2>/dev/null | grep -i "health" | awk -F: '{print $2}' | xargs)
            if [[ -n "$health" ]]; then
                echo -e "/dev/$disk: $health"
            fi
        done < <(list_disks)
    fi

    # Node Exporter
    get_node_exporter_status

    # Ceph quick status
    if command -v ceph >/dev/null 2>&1; then
        echo -e "\n${GREEN}=== Ceph Quick Status ===${NC}"
        ceph health 2>/dev/null || true
    fi
}

###################
# Proxmox Specific Functions
###################

# Print the systemd active-state of each critical Proxmox service.
check_services() {
    if ! checkIfOnHypervisor; then
        log_message warn "Not on Proxmox - skipping Proxmox service checks"
        return 0
    fi

    echo -e "${GREEN}Checking critical services:${NC}"
    local services=("pvedaemon" "pveproxy" "pvecluster" "pve-cluster" "corosync")
    local service status
    for service in "${services[@]}"; do
        # `is-active` prints the state AND returns non-zero for anything but
        # "active"; appending a fallback with `||` would print two values.
        status=$(systemctl is-active "$service" 2>/dev/null)
        echo -e "${GREEN}$service:${NC} ${status:-not-found}"
    done
}

# Warn when the installed pve-manager version is older than 6.0.
check_pve_version() {
    local min_version="6.0"
    local current_version
    current_version=$(pveversion 2>/dev/null | grep -oP 'pve-manager/\K[0-9]+\.[0-9]+' | head -n 1)
    current_version=${current_version:-0.0}

    # Compare major/minor as integers: a decimal comparison would rank
    # 6.10 below 6.2, and bc may not be installed.
    local cur_major cur_minor min_major min_minor
    IFS=. read -r cur_major cur_minor <<<"$current_version"
    IFS=. read -r min_major min_minor <<<"$min_version"
    cur_major=${cur_major:-0}; cur_minor=${cur_minor:-0}

    if (( 10#$cur_major < 10#$min_major \
        || (10#$cur_major == 10#$min_major && 10#$cur_minor < 10#$min_minor) )); then
        log_message warn "Proxmox VE version $current_version may not support all features"
    fi
}

# Print `qm list` when running on Proxmox.
list_vms() {
    if ! checkIfOnHypervisor; then
        log_message info "Not on Proxmox - skipping VM list"
        return 0
    fi

    if command -v qm >/dev/null 2>&1; then
        echo -e "${GREEN}Virtual Machine Status:${NC}"
        qm list
    else
        log_message warn "qm command not found"
    fi
}

# Print `pct list` when running on Proxmox.
list_containers() {
    if ! checkIfOnHypervisor; then
        log_message info "Not on Proxmox - skipping container list"
        return 0
    fi

    if command -v pct >/dev/null 2>&1; then
        echo -e "\n${GREEN}=== LXC Container Status ===${NC}"
        pct list
    else
        log_message warn "pct command not found"
    fi
}

###################
# Command Line Interface Functions
###################

# Print usage and exit.
# Arguments: $1 - optional exit status (default 0)
# Note: this intentionally shadows bash's `help` builtin within the script.
help() {
    local exit_status=${1:-0}

    echo "ProxDoc - The Proxmox System Doctor v${VERSION}"
    echo ""
    echo "Usage: curl -sL \"http://10.10.10.63:3000/LotusGuild/proxDoc/raw/branch/main/proxDoc.sh\" | bash -s -- [OPTION]"
    echo ""
    echo "A comprehensive diagnostic tool for Proxmox server health checks."
    echo ""
    echo "Treatment Options:"
    echo " --help Show this prescription guide"
    echo " --diags Perform full system examination"
    echo " --quick Quick health check (services, temps, disks)"
    echo " --drives Show physical drive bay mapping (DriveAtlas)"
    echo " --ceph Check Ceph cluster health"
    echo " --node-exporter Check Node Exporter status"
    echo " --hwmon Check hwmon daemon status"
    echo " --services Check vital Proxmox services"
    echo " --vm-list Check VM vitals"
    echo " --ct-list Check container vitals"
    echo " --backup Review backup health"
    exit "$exit_status"
}

###################
# Main Functions
###################

# Run the full examination; Proxmox-only sections run only on a hypervisor.
runDiags() {
    log_message info "Beginning system examination..."

    # Check if running on Proxmox
    local is_proxmox=false
    if checkIfOnHypervisor; then
        is_proxmox=true
        log_message info "Detected Proxmox VE hypervisor"
    else
        log_message warn "Not running on Proxmox VE - some checks will be skipped"
    fi
    echo ""

    log_message info "Checking system information..."
    get_system_info

    log_message info "Checking CPU..."
    get_cpu_info

    log_message info "Checking RAM..."
    get_ram_info

    log_message info "Checking memory details..."
    get_memory_details

    log_message info "Checking storage..."
    get_storage_info

    log_message info "Checking drive atlas..."
    get_drive_atlas

    log_message info "Checking network..."
    get_network_info
    get_detailed_network
    get_nic_details

    log_message info "Checking hardware..."
    get_hardware_info
    get_motherboard_info
    get_hba_info

    log_message info "Checking temperatures..."
    get_temp_info

    log_message info "Checking system status..."
    get_system_status

    log_message info "Checking monitoring services..."
    get_node_exporter_status
    get_hwmon_status

    # Only run Proxmox-specific checks if on Proxmox
    if [[ "$is_proxmox" == true ]]; then
        log_message info "Checking Ceph cluster..."
        get_ceph_health

        log_message info "Checking VMs..."
        list_vms

        log_message info "Checking containers..."
        list_containers
    fi

    echo ""
    log_message info "Examination complete"
}

# Dispatch a single command-line option to its handler.
# Arguments: $1 - option string (e.g. --diags)
checkForInput() {
    case "$1" in
        --help)
            help
            ;;
        --diags)
            check_requirements
            runDiags
            ;;
        --quick)
            quick_health_check
            ;;
        --drives)
            get_drive_atlas
            ;;
        --ceph)
            get_ceph_health
            ;;
        --node-exporter)
            get_node_exporter_status
            ;;
        --hwmon)
            get_hwmon_status
            ;;
        --services)
            check_services
            ;;
        --vm-list)
            list_vms
            ;;
        --ct-list)
            list_containers
            ;;
        --backup)
            echo -e "${GREEN}Backup Status:${NC}"
            pvesm status 2>/dev/null || log_message warn "pvesm not available"
            ;;
        *)
            echo -e "${RED}Invalid option: $1${NC}"
            # An unrecognised option is an error: show usage, exit non-zero.
            help 1
            ;;
    esac
}

###################
# Script Execution
###################

# Entry point: banner, root check, signal trap, then option dispatch.
main() {
    local argOne=${1:-}

    # Show header
    print_header

    # Check root
    if [[ $EUID -ne 0 ]]; then
        handle_error "This script must be run as root"
    fi

    # Set trap for interrupts
    trap 'echo -e "${RED}Script interrupted.${NC}"; exit 1' INT TERM

    if [[ -n "$argOne" ]]; then
        checkForInput "$argOne"
    else
        help
    fi
}

main "$@"