#!/bin/bash
#
# ProxDoc - The Proxmox System Doctor
#
# Read-only diagnostic report for a Proxmox VE (or plain Linux) host:
# CPU, RAM, DIMMs, storage, network, PCI/HBA hardware, temperatures,
# systemd state, Ceph, Node Exporter, hwmon, VMs and containers.
#
# Usage:
#   curl -sL "http://10.10.10.63:3000/LotusGuild/proxDoc/raw/branch/main/proxDoc.sh" | bash -s -- [OPTION]
#
# Every option except --help must be run as root.

VERSION="1.1.0"

###################
# Color Definitions
###################
NC="\033[00m"
GREEN="\033[01;32m"
RED="\033[01;31m"
YELLOW="\033[01;33m"

###################
# Utility Functions
###################

# NOTE(review): in the reviewed copy of this file the tail of the banner and
# the complete bodies of log_message, handle_error and the head of
# check_requirements were unreadable. They are reconstructed here from how the
# rest of the script calls them - compare against version control.

# Print the ProxDoc ASCII-art banner.
print_header() {
    cat <<'BANNER'
    ____                  ____
   / __ \_________  _  __/ __ \____  _____
  / /_/ / ___/ __ \| |/_/ / / / __ \/ ___/
 / ____/ /  / /_/ />  </ /_/ / /_/ / /__
/_/   /_/   \____/_/|_/_____/\____/\___/
BANNER
}

# Print a tagged, colored log line.
# Arguments: $1 - level (info|warn|error), $2 - message (echo -e escapes allowed)
# Outputs:   info/warn go to stdout so they stay inside the report;
#            error goes to stderr.
log_message() {
    local level=$1
    local message=$2
    case "$level" in
        info)  echo -e "${GREEN}[INFO]${NC} $message" ;;
        warn)  echo -e "${YELLOW}[WARN]${NC} $message" ;;
        error) echo -e "${RED}[ERROR]${NC} $message" >&2 ;;
        *)     echo -e "[$level] $message" ;;
    esac
}

# Report a fatal error and terminate the script with status 1.
# Arguments: $1 - message
handle_error() {
    log_message error "$1"
    exit 1
}

# Abort with an install hint when a tool needed by --diags is missing.
check_requirements() {
    # NOTE(review): the original tool list was lost; this one contains the
    # commands runDiags invokes without its own availability guard - confirm.
    local tools=(lscpu free df ip dmidecode lspci systemctl journalctl)
    local missing=()
    local tool

    for tool in "${tools[@]}"; do
        if ! command -v "$tool" >/dev/null 2>&1; then
            missing+=("$tool")
        fi
    done

    if [[ ${#missing[@]} -gt 0 ]]; then
        handle_error "Missing tools: ${missing[*]}\n Please install with 'curl -s http://10.10.10.63:3000/LotusGuild/freshStartScript/raw/branch/main/freshStart.sh | bash'"
    fi
}

# Succeed when the pveversion command is available (used as the
# "are we on a Proxmox hypervisor" test throughout the script).
checkIfOnHypervisor() {
    command -v pveversion >/dev/null 2>&1
}

# Print the names (without /dev/) of whole disks called sd* or nvme*,
# one per line.
list_disks() {
    lsblk -dn -o NAME 2>/dev/null | grep -E '^(sd|nvme)'
}

###################
# System Information Functions
###################

# Print timestamp, hostname, kernel and either the Proxmox version or the
# distribution name.
get_system_info() {
    echo -e "\n${GREEN}=== System Information ===${NC}"
    echo -e "\n${GREEN}=== Diagnostic Run: $(date '+%Y-%m-%d %H:%M:%S') ===${NC}"
    echo -e "${GREEN}Hostname:${NC} $(uname -n)"
    echo -e "${GREEN}Kernel:${NC} $(uname -r)"

    if checkIfOnHypervisor; then
        echo -e "\n${GREEN}=== Proxmox Version ===${NC}"
        pveversion
    else
        echo -e "\n${GREEN}=== OS Information ===${NC}"
        if [[ -f /etc/os-release ]]; then
            # os-release defines VERSION, NAME, ID, ...; source it inside the
            # command-substitution subshell so it cannot overwrite this
            # script's own VERSION.
            local pretty_name
            pretty_name=$(. /etc/os-release 2>/dev/null && printf '%s' "$PRETTY_NAME")
            echo -e "${GREEN}Distribution:${NC} $pretty_name"
        else
            echo "OS information not available"
        fi
    fi
}

# Print the output of `sensors`, or a warning when lm-sensors is absent.
get_temp_info() {
    echo -e "\n${GREEN}=== Temperature Information ===${NC}"
    if command -v sensors >/dev/null 2>&1; then
        sensors
    else
        log_message warn "sensors command not found. Install lm-sensors package for temperature monitoring"
    fi
}

# Run `smartctl -H` against every sd*/nvme* disk.
get_disk_health() {
    echo -e "\n${GREEN}=== Disk Health Status ===${NC}"
    if ! command -v smartctl >/dev/null 2>&1; then
        log_message warn "smartctl not found. Install smartmontools for disk health monitoring"
        return 0
    fi

    local disk
    while IFS= read -r disk; do
        [[ -z "$disk" ]] && continue
        echo -e "\nChecking /dev/$disk:"
        smartctl -H "/dev/$disk"
    done < <(list_disks)
}

# Print CPU model, logical CPU count and clock speed.
get_cpu_info() {
    local cpu_model cpu_cores cpu_mhz lscpu_out

    # Field 2 of the first "model name" line; xargs trims/squeezes whitespace.
    cpu_model=$(awk -F: '/^model name/ { print $2; exit }' /proc/cpuinfo 2>/dev/null | xargs)
    if [[ -z "$cpu_model" ]]; then
        echo -e "${RED}Failed to retrieve CPU model information.${NC}"
    fi

    # LC_ALL=C keeps the lscpu labels in English so the matches below work.
    lscpu_out=$(LC_ALL=C lscpu 2>/dev/null)
    cpu_cores=$(awk -F: '/^CPU\(s\):/ { gsub(/[ \t]/, "", $2); print $2; exit }' <<<"$lscpu_out")
    # Older lscpu prints "CPU MHz"; newer releases only print "CPU max MHz".
    cpu_mhz=$(awk -F: '
        /^CPU MHz:/     { cur = $2 }
        /^CPU max MHz:/ { max = $2 }
        END { v = (cur != "") ? cur : max; gsub(/[ \t]/, "", v); print v }
    ' <<<"$lscpu_out")

    echo -e "${GREEN}CPU Model:${NC} ${cpu_model:-Unknown}"
    echo -e "${GREEN}CPU Cores:${NC} ${cpu_cores:-Unknown}"
    echo -e "${GREEN}CPU MHz:${NC} ${cpu_mhz:-Unknown}"
}

# Print total/used/free RAM as reported by `free -h`.
get_ram_info() {
    local ram_total ram_used ram_free
    # One `free` call so the three figures come from the same snapshot.
    read -r ram_total ram_used ram_free < <(free -h | awk '/^Mem:/ { print $2, $3, $4 }')

    echo -e "${GREEN}Total RAM:${NC} $ram_total"
    echo -e "${GREEN}Used RAM:${NC} $ram_used"
    echo -e "${GREEN}Free RAM:${NC} $ram_free"
}

# Print usage of /dev-backed filesystems and, if zpool exists, pool status.
get_storage_info() {
    echo -e "${GREEN}Storage Information:${NC}"
    df -h --output=source,size,used,avail,pcent | grep '^/dev'

    if command -v zpool >/dev/null 2>&1; then
        echo -e "\n${GREEN}=== ZFS Pool Status ===${NC}"
        zpool status
    fi
}

# Print the default gateway(s) and the host's IP addresses.
get_network_info() {
    local default_gateway ip_addresses
    # Several default routes are joined onto one line by xargs.
    default_gateway=$(ip route show default 2>/dev/null \
        | awk '$1 == "default" && $2 == "via" { print $3 }' | xargs)
    ip_addresses=$(hostname -I | xargs)

    echo -e "${GREEN}Default Gateway:${NC} ${default_gateway:-None}"
    echo -e "${GREEN}IP Addresses:${NC} $ip_addresses"
}

# Print per-interface counters and a socket/interface summary.
get_detailed_network() {
    echo -e "\n${GREEN}=== Network Interface Statistics ===${NC}"
    local iface
    while IFS= read -r iface; do
        [[ -z "$iface" ]] && continue
        ip -s link show "$iface" 2>/dev/null
    done < <(get_physical_interfaces)

    echo -e "\n${GREEN}=== Network Statistics ===${NC}"
    if command -v ss >/dev/null 2>&1; then
        ss -s
    elif command -v netstat >/dev/null 2>&1; then
        netstat -i
    else
        log_message warn "netstat/ss not found for network statistics"
    fi
}

# Print the BIOS version and the PCI devices that are not plain
# infrastructure.
get_hardware_info() {
    echo -e "${GREEN}BIOS Version:${NC} $(dmidecode -s bios-version)"
    echo -e "\n${GREEN}=== PCI Devices ===${NC}"
    # Show interesting devices, exclude bridges, infrastructure, and integrated motherboard devices
    lspci | grep -v -E "Host bridge|PCI bridge|ISA bridge|SMBus|IOMMU|Dummy|USB controller|Audio device|Encryption controller|Multimedia controller"
}

# Print baseboard and system identification strings from dmidecode.
get_motherboard_info() {
    echo -e "\n${GREEN}=== Motherboard Information ===${NC}"
    echo -e "${GREEN}Manufacturer:${NC} $(dmidecode -s baseboard-manufacturer)"
    echo -e "${GREEN}Product Name:${NC} $(dmidecode -s baseboard-product-name)"
    echo -e "${GREEN}Version:${NC} $(dmidecode -s baseboard-version)"
    echo -e "${GREEN}Serial Number:${NC} $(dmidecode -s baseboard-serial-number)"
    echo -e "${GREEN}System Manufacturer:${NC} $(dmidecode -s system-manufacturer)"
    echo -e "${GREEN}System Product:${NC} $(dmidecode -s system-product-name)"
    echo -e "${GREEN}System Serial:${NC} $(dmidecode -s system-serial-number)"
}

# Print one row of the DIMM table; empty cells are shown as N/A.
# Arguments: $1 slot, $2 size, $3 type, $4 speed, $5 manufacturer
_print_dimm_row() {
    printf "%-12s %-12s %-10s %-12s %-20s\n" \
        "${1:-N/A}" "${2:-N/A}" "${3:-N/A}" "${4:-N/A}" "${5:-N/A}"
}

# Print a DIMM row only when the slot is named and actually populated.
# Arguments: same as _print_dimm_row
_print_dimm_if_populated() {
    local empty_re='(No|Not|Installed)'
    if [[ -n "$1" && -n "$2" && ! "$2" =~ $empty_re ]]; then
        _print_dimm_row "$@"
    fi
}

# Print a table of populated DIMMs plus slot counts and maximum capacity,
# all parsed from a single `dmidecode -t memory` run.
get_memory_details() {
    echo -e "\n${GREEN}=== Memory DIMM Information ===${NC}"

    local dmi_out
    dmi_out=$(dmidecode -t memory 2>/dev/null)
    if [[ -z "$dmi_out" ]]; then
        log_message warn "dmidecode returned no memory information"
        return 0
    fi

    local line value
    local locator="" size="" type="" speed="" manufacturer=""
    local in_device=false
    local total_slots=0 populated=0
    # Regexes live in variables so embedded spaces and alternations are safe
    # on the right-hand side of =~.
    local empty_re='(No|Not|Installed)'
    local bad_type_re='(Unknown|Error|Correction)'
    local bad_mfr_re='(Unknown|NO DIMM)'

    _print_dimm_row "Slot" "Size" "Type" "Speed" "Manufacturer"
    _print_dimm_row "----" "----" "----" "-----" "------------"

    while IFS= read -r line; do
        # Slot statistics are taken over the whole output, independent of
        # which section the line belongs to.
        if [[ "$line" =~ ^[[:space:]]*Locator: && ! "$line" =~ Bank ]]; then
            total_slots=$((total_slots + 1))
        elif [[ "$line" =~ ^[[:space:]]*Size:[[:space:]]*(.+)$ ]]; then
            value="${BASH_REMATCH[1]}"
            [[ "$value" =~ $empty_re ]] || populated=$((populated + 1))
        fi

        # A new "Memory Device" header flushes the previous device's row.
        if [[ "$line" =~ ^Memory[[:space:]]Device ]]; then
            _print_dimm_if_populated "$locator" "$size" "$type" "$speed" "$manufacturer"
            locator="" size="" type="" speed="" manufacturer=""
            in_device=true
            continue
        fi

        # Ignore everything outside a Memory Device section.
        [[ "$in_device" == true ]] || continue

        if [[ -z "$line" ]]; then
            # Blank line ends the section; the row is flushed at the next
            # header or after the loop.
            in_device=false
        elif [[ "$line" =~ ^[[:space:]]*Locator:[[:space:]]*(.+)$ && ! "$line" =~ Bank ]]; then
            locator="${BASH_REMATCH[1]}"
            locator="${locator// /_}" # keep the slot name in one table column
        elif [[ "$line" =~ ^[[:space:]]*Size:[[:space:]]*(.+)$ ]]; then
            size="${BASH_REMATCH[1]}"
        elif [[ "$line" =~ ^[[:space:]]*Type:[[:space:]]*(.+)$ ]]; then
            type="${BASH_REMATCH[1]}"
            [[ "$type" =~ $bad_type_re ]] && type=""
        elif [[ "$line" =~ ^[[:space:]]*Speed:[[:space:]]*(.+)$ ]]; then
            speed="${BASH_REMATCH[1]}"
            [[ "$speed" =~ Unknown ]] && speed=""
        elif [[ "$line" =~ ^[[:space:]]*Manufacturer:[[:space:]]*(.+)$ ]]; then
            manufacturer="${BASH_REMATCH[1]}"
            [[ "$manufacturer" =~ $bad_mfr_re ]] && manufacturer=""
        fi
    done <<<"$dmi_out"

    # The last device has no following header to flush it.
    _print_dimm_if_populated "$locator" "$size" "$type" "$speed" "$manufacturer"

    echo -e "\n${GREEN}Memory Summary:${NC}"
    echo -e "  Total Slots: $total_slots"
    echo -e "  Populated: $populated"

    local max_capacity
    max_capacity=$(grep -i -m 1 "Maximum Capacity" <<<"$dmi_out" | sed 's/.*: //')
    echo -e "  Max Capacity: ${max_capacity:-Unknown}"
}

# Print driver, MAC, link state and (with ethtool) speed/duplex/firmware for
# every interface returned by get_physical_interfaces.
get_nic_details() {
    echo -e "\n${GREEN}=== Network Interface Details ===${NC}"

    local iface sys_path driver link_info fw_ver line
    while IFS= read -r iface; do
        [[ -z "$iface" ]] && continue
        sys_path="/sys/class/net/$iface"

        echo -e "\n${GREEN}Interface: $iface${NC}"

        # The driver symlink only exists for interfaces backed by a device.
        if [[ -L "$sys_path/device/driver" ]]; then
            driver=$(basename "$(readlink "$sys_path/device/driver")")
            echo -e "  Driver: $driver"
        fi

        if [[ -f "$sys_path/address" ]]; then
            echo -e "  MAC: $(cat "$sys_path/address")"
        fi

        if [[ -f "$sys_path/operstate" ]]; then
            echo -e "  State: $(cat "$sys_path/operstate")"
        fi

        command -v ethtool >/dev/null 2>&1 || continue

        link_info=$(ethtool "$iface" 2>/dev/null | grep -E "Speed:|Duplex:|Link detected:")
        if [[ -n "$link_info" ]]; then
            while IFS= read -r line; do
                echo -e "  $line"
            done <<<"$link_info"
        fi

        # Keep everything after the label: firmware strings may contain spaces.
        fw_ver=$(ethtool -i "$iface" 2>/dev/null \
            | sed -n 's/^firmware-version:[[:space:]]*//p')
        if [[ -n "$fw_ver" ]]; then
            echo -e "  Firmware: $fw_ver"
        fi
    done < <(get_physical_interfaces)
}

# Print, one per line, every network interface name under /sys/class/net
# except loopback and veth*/fwbr*/fwln*/fwpr*/tap* interfaces.
# Bridges and bonds are NOT filtered out.
get_physical_interfaces() {
    local path name
    for path in /sys/class/net/*; do
        # Interfaces are directories (via symlink). This also skips the
        # literal pattern when the glob matches nothing and plain files such
        # as bonding_masters.
        [[ -d "$path" ]] || continue

        name=${path##*/}

        [[ "$name" == "lo" ]] && continue
        [[ "$name" =~ ^(veth|fwbr|fwln|fwpr|tap) ]] && continue

        echo "$name"
    done
}

# Print a summary and driver/link details for storage controllers found by
# lspci.
get_hba_info() {
    echo -e "\n${GREEN}=== HBA/Storage Controller Information ===${NC}"

    # One pattern for both passes so summary and details list the same kinds
    # of controller.
    local ctrl_pattern='RAID|SAS|SATA|SCSI|Mass storage|Serial Attached|Fibre Channel|NVMe'

    # lspci -vmm emits one blank-line-separated record per device.
    lspci -vmm 2>/dev/null | awk -v pat="$ctrl_pattern" '
        BEGIN { RS=""; FS="\n" }
        $0 ~ pat {
            slot=""; class=""; vendor=""; device=""; rev=""
            for (i=1; i<=NF; i++) {
                if ($i ~ /^Slot:/)   slot   = substr($i, 7)
                if ($i ~ /^Class:/)  class  = substr($i, 8)
                if ($i ~ /^Vendor:/) vendor = substr($i, 9)
                if ($i ~ /^Device:/) device = substr($i, 9)
                if ($i ~ /^Rev:/)    rev    = substr($i, 6)
            }
            printf "\n%s\n", slot
            printf "  Class:  %s\n", class
            printf "  Vendor: %s\n", vendor
            printf "  Device: %s\n", device
            if (rev) printf "  Rev:    %s\n", rev
        }
    '

    echo -e "\n${GREEN}=== Storage Controller Details ===${NC}"
    local ctrl
    while IFS= read -r ctrl; do
        [[ -z "$ctrl" ]] && continue
        echo -e "\n${GREEN}Controller $ctrl:${NC}"
        lspci -vvs "$ctrl" 2>/dev/null \
            | grep -E "^[[:space:]]+(Subsystem|LnkSta|Kernel driver)" \
            | head -5
    done < <(lspci 2>/dev/null | grep -iE "$ctrl_pattern" | awk '{print $1}')
}

# Print load average, number of running services and the latest journal
# errors.
get_system_status() {
    echo -e "\n${GREEN}=== System Load ===${NC}"
    uptime

    echo -e "\n${GREEN}=== Service Status ===${NC}"
    # --no-legend drops the header/footer lines that would inflate the count.
    local running
    running=$(systemctl list-units --type=service --state=running \
        --no-legend --no-pager 2>/dev/null | wc -l)
    echo -e "Running services: $running"

    echo -e "\n${GREEN}=== Recent System Errors ===${NC}"
    journalctl -p err -n 5 --no-pager
}

###################
# DriveAtlas & Monitoring Functions
###################

# Download and run the DriveAtlas bay-mapping script.
#
# SECURITY NOTE(review): this executes code fetched over plain HTTP, as root.
# Anyone able to tamper with that connection controls this host. Prefer HTTPS
# plus a pinned checksum, or ship driveAtlas.sh alongside this script.
get_drive_atlas() {
    echo -e "\n${GREEN}=== Drive Atlas - Physical Bay Mapping ===${NC}"

    if ! command -v curl >/dev/null 2>&1; then
        log_message warn "curl not installed - cannot fetch DriveAtlas"
        return 0
    fi

    local url="http://10.10.10.63:3000/LotusGuild/driveAtlas/raw/branch/main/driveAtlas.sh"
    local script
    if ! script=$(mktemp); then
        log_message warn "Cannot create a temporary file for DriveAtlas"
        return 0
    fi

    # Download first, then execute: with `bash <(curl ...)` a failed download
    # runs an empty script and looks like success. -f turns HTTP errors into
    # a non-zero exit; the timeout stops an unreachable server from hanging
    # the whole report.
    if ! { curl -fsSL --connect-timeout 10 -o "$script" "$url" 2>/dev/null \
            && [[ -s "$script" ]] \
            && bash "$script" 2>/dev/null; }; then
        log_message warn "DriveAtlas failed to execute or server unavailable"
    fi

    rm -f -- "$script"
}

# Print Ceph health, OSD tree and usage figures when the ceph CLI exists.
get_ceph_health() {
    echo -e "\n${GREEN}=== Ceph Cluster Health ===${NC}"

    if ! command -v ceph >/dev/null 2>&1; then
        log_message info "Ceph tools not installed on this node"
        return 0
    fi

    echo -e "${GREEN}Health Status:${NC}"
    ceph health detail 2>/dev/null || log_message warn "Cannot connect to Ceph cluster"

    # The remaining sections are best-effort: a node may have the CLI
    # installed without being able to reach a cluster.
    echo -e "\n${GREEN}=== Ceph OSD Tree ===${NC}"
    ceph osd tree 2>/dev/null || true

    echo -e "\n${GREEN}=== Ceph Pool Usage ===${NC}"
    ceph df 2>/dev/null || true

    echo -e "\n${GREEN}=== Ceph OSD Usage ===${NC}"
    ceph osd df 2>/dev/null || true
}

# Report whether the node_exporter unit is running and port 9100 listening.
get_node_exporter_status() {
    echo -e "\n${GREEN}=== Node Exporter Status ===${NC}"

    if systemctl is-active --quiet node_exporter 2>/dev/null; then
        echo -e "${GREEN}Service:${NC} Running"
        local ip
        ip=$(hostname -I | awk '{print $1}')
        echo -e "${GREEN}Metrics URL:${NC} http://${ip}:9100/metrics"

        if ss -tlnp 2>/dev/null | grep -q ':9100'; then
            echo -e "${GREEN}Port 9100:${NC} Listening"
        else
            log_message warn "Port 9100 not listening"
        fi
    elif systemctl list-unit-files 2>/dev/null | grep -q node_exporter; then
        log_message warn "Node Exporter is installed but not running"
        echo -e "Start with: systemctl start node_exporter"
    else
        log_message info "Node Exporter not installed"
    fi
}

# Report whether hwmon.timer is active and show the last hwmon.service runs.
get_hwmon_status() {
    echo -e "\n${GREEN}=== hwmon Daemon Status ===${NC}"

    if systemctl is-active --quiet hwmon.timer 2>/dev/null; then
        echo -e "${GREEN}Timer:${NC} Active"
        systemctl list-timers hwmon.timer --no-pager 2>/dev/null
        echo -e "\n${GREEN}Last Run:${NC}"
        journalctl -u hwmon.service -n 3 --no-pager 2>/dev/null || true
    elif systemctl list-unit-files 2>/dev/null | grep -qF 'hwmon.timer'; then
        log_message warn "hwmon timer is installed but not active"
        echo -e "Enable with: systemctl enable --now hwmon.timer"
    else
        log_message info "hwmon daemon not installed"
    fi
}

# Short assessment: services, temperatures, one-line SMART verdict per disk,
# Node Exporter and Ceph health.
quick_health_check() {
    echo -e "\n${GREEN}=== Quick Health Check ===${NC}"
    echo -e "Running quick health assessment...\n"

    check_services
    get_temp_info

    echo -e "\n${GREEN}=== Disk Health Summary ===${NC}"
    if command -v smartctl >/dev/null 2>&1; then
        local disk health
        while IFS= read -r disk; do
            [[ -z "$disk" ]] && continue
            # Text after the colon on smartctl's "...health..." line.
            health=$(smartctl -H "/dev/$disk" 2>/dev/null \
                | grep -i "health" | awk -F: '{print $2}' | xargs)
            if [[ -n "$health" ]]; then
                echo -e "/dev/$disk: $health"
            fi
        done < <(list_disks)
    fi

    get_node_exporter_status

    if command -v ceph >/dev/null 2>&1; then
        echo -e "\n${GREEN}=== Ceph Quick Status ===${NC}"
        ceph health 2>/dev/null || true
    fi
}

###################
# Proxmox Specific Functions
###################

# Print the systemd state of the core Proxmox services.
check_services() {
    if ! checkIfOnHypervisor; then
        log_message warn "Not on Proxmox - skipping Proxmox service checks"
        return 0
    fi

    echo -e "${GREEN}Checking critical services:${NC}"
    # The cluster filesystem unit is "pve-cluster"; the former "pvecluster"
    # entry is not a real unit and always showed up as inactive.
    local services=("pvedaemon" "pveproxy" "pve-cluster" "corosync")
    local service status
    for service in "${services[@]}"; do
        # is-active prints the state itself even when it exits non-zero, so
        # fall back to "not-found" only when it printed nothing.
        status=$(systemctl is-active "$service" 2>/dev/null)
        echo -e "${GREEN}$service:${NC} ${status:-not-found}"
    done
}

# Warn when the installed pve-manager is older than the minimum version.
check_pve_version() {
    local min_major=6 min_minor=0
    local current_version major minor

    current_version=$(pveversion 2>/dev/null \
        | grep -oP 'pve-manager/\K[0-9]+\.[0-9]+' | head -n 1)
    current_version=${current_version:-0.0}

    # Compare component-wise as integers: as decimals 6.10 would sort below
    # 6.2. The 10# prefix forces base 10 so "08" is not read as octal.
    major=$((10#${current_version%%.*}))
    minor=$((10#${current_version#*.}))

    if ((major < min_major || (major == min_major && minor < min_minor))); then
        log_message warn "Proxmox VE version $current_version may not support all features"
    fi
}

# Print `qm list` when running on Proxmox.
list_vms() {
    if ! checkIfOnHypervisor; then
        log_message info "Not on Proxmox - skipping VM list"
        return 0
    fi

    if command -v qm >/dev/null 2>&1; then
        echo -e "${GREEN}Virtual Machine Status:${NC}"
        qm list
    else
        log_message warn "qm command not found"
    fi
}

# Print `pct list` when running on Proxmox.
list_containers() {
    if ! checkIfOnHypervisor; then
        log_message info "Not on Proxmox - skipping container list"
        return 0
    fi

    if command -v pct >/dev/null 2>&1; then
        echo -e "\n${GREEN}=== LXC Container Status ===${NC}"
        pct list
    else
        log_message warn "pct command not found"
    fi
}

###################
# Command Line Interface Functions
###################

# Print usage and exit.
# Arguments: $1 - exit status (optional, default 0)
help() {
    cat <<EOF
ProxDoc - The Proxmox System Doctor v${VERSION}

Usage: curl -sL "http://10.10.10.63:3000/LotusGuild/proxDoc/raw/branch/main/proxDoc.sh" | bash -s -- [OPTION]

A comprehensive diagnostic tool for Proxmox server health checks.

Treatment Options:
  --help           Show this prescription guide
  --diags          Perform full system examination
  --quick          Quick health check (services, temps, disks)
  --drives         Show physical drive bay mapping (DriveAtlas)
  --ceph           Check Ceph cluster health
  --node-exporter  Check Node Exporter status
  --hwmon          Check hwmon daemon status
  --services       Check vital Proxmox services
  --vm-list        Check VM vitals
  --ct-list        Check container vitals
  --backup         Review backup health
EOF
    exit "${1:-0}"
}

###################
# Main Functions
###################

# Run the full examination; Proxmox-only sections are skipped elsewhere.
runDiags() {
    log_message info "Beginning system examination..."

    local is_proxmox=false
    if checkIfOnHypervisor; then
        is_proxmox=true
        log_message info "Detected Proxmox VE hypervisor"
    else
        log_message warn "Not running on Proxmox VE - some checks will be skipped"
    fi
    echo ""

    log_message info "Checking system information..."
    get_system_info

    log_message info "Checking CPU..."
    get_cpu_info

    log_message info "Checking RAM..."
    get_ram_info

    log_message info "Checking memory details..."
    get_memory_details

    log_message info "Checking storage..."
    get_storage_info

    log_message info "Checking drive atlas..."
    get_drive_atlas

    log_message info "Checking network..."
    get_network_info
    get_detailed_network
    get_nic_details

    log_message info "Checking hardware..."
    get_hardware_info
    get_motherboard_info
    get_hba_info

    log_message info "Checking temperatures..."
    get_temp_info

    log_message info "Checking system status..."
    get_system_status

    log_message info "Checking monitoring services..."
    get_node_exporter_status
    get_hwmon_status

    if [[ "$is_proxmox" == true ]]; then
        log_message info "Checking Ceph cluster..."
        get_ceph_health

        log_message info "Checking VMs..."
        list_vms

        log_message info "Checking containers..."
        list_containers
    fi

    echo ""
    log_message info "Examination complete"
}

# Dispatch a single command-line option.
# Arguments: $1 - option string
checkForInput() {
    case "$1" in
        --help)          help ;;
        --diags)         check_requirements; runDiags ;;
        --quick)         quick_health_check ;;
        --drives)        get_drive_atlas ;;
        --ceph)          get_ceph_health ;;
        --node-exporter) get_node_exporter_status ;;
        --hwmon)         get_hwmon_status ;;
        --services)      check_services ;;
        --vm-list)       list_vms ;;
        --ct-list)       list_containers ;;
        --backup)
            echo -e "${GREEN}Backup Status:${NC}"
            pvesm status 2>/dev/null || log_message warn "pvesm not available"
            ;;
        *)
            echo -e "${RED}Invalid option: $1${NC}"
            # An unknown option is a usage error, so exit non-zero.
            help 1
            ;;
    esac
}

###################
# Script Execution
###################

# Entry point.
# Arguments: $1 - option (optional; no option prints the help)
main() {
    local arg_one=${1:-}

    print_header

    # Help needs no privileges, so answer it before the root check.
    if [[ -z "$arg_one" || "$arg_one" == "--help" ]]; then
        help
    fi

    if [[ $EUID -ne 0 ]]; then
        handle_error "This script must be run as root"
    fi

    trap 'echo -e "${RED}Script interrupted.${NC}"; exit 1' INT TERM

    checkForInput "$arg_one"
}

main "$@"