Compare commits
15 Commits
e1dac4c08c
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| daafb6c4fb | |||
| 76f7aaa64c | |||
| ef004c621e | |||
| 6125fb9d6b | |||
| 86be5fd1c1 | |||
| a491ae4592 | |||
| 7514e2ba7c | |||
| f7ed682bdb | |||
| 148a7ac644 | |||
| 67d4b76324 | |||
| 6633a0a9a1 | |||
| eff8eb3a3c | |||
| 07989c8788 | |||
| c8fadf924b | |||
| c25e3ccc76 |
21
README.md
21
README.md
@@ -32,6 +32,7 @@ The script requires the following tools to be installed:
|
||||
- smartctl
|
||||
- sensors
|
||||
- lspci
|
||||
- bc
|
||||
|
||||
Optional tools for enhanced diagnostics:
|
||||
- ethtool (for detailed NIC information including link speed and firmware)
|
||||
@@ -73,6 +74,8 @@ curl -sL "http://10.10.10.63:3000/LotusGuild/proxDoc/raw/branch/main/proxDoc.sh"
|
||||
- `--vm-list`: Check VM vitals
|
||||
- `--ct-list`: Check container vitals
|
||||
- `--backup`: Review backup health
|
||||
- `--checks=LIST`: Run only specific checks (comma-separated)
|
||||
- Valid checks: cpu, ram, memory, storage, disk, network, hardware, temps, services, ceph, vms, containers
|
||||
|
||||
## Output Information
|
||||
|
||||
@@ -101,10 +104,26 @@ The script provides detailed information about:
|
||||
|
||||
## Version
|
||||
|
||||
Current Version: 1.1.0
|
||||
Current Version: 1.2.0
|
||||
|
||||
### Changelog
|
||||
|
||||
#### v1.2.0
|
||||
- Fixed variable quoting in disk iteration loops (security)
|
||||
- Added input validation with whitelist of valid options
|
||||
- Added examples to help documentation
|
||||
- Added timeout protection to smartctl and ceph commands
|
||||
- Added `--checks=` option for selective diagnostics
|
||||
- Extracted magic strings into named constants
|
||||
- Added validation for potentially empty variables
|
||||
- Standardized error handling with cleanup trap
|
||||
- Added optional logging infrastructure (PROXDOC_LOGFILE)
|
||||
- Cached disk list and unit files to reduce command overhead
|
||||
- Added efficient process wait utility function
|
||||
- Fixed CPU MHz parsing showing multiple values
|
||||
- Fixed memory DIMM table not displaying data
|
||||
- Fixed bonding_masters showing as network interface
|
||||
|
||||
#### v1.1.0
|
||||
- Added DriveAtlas integration (`--drives`) for physical drive bay mapping
|
||||
- Added Ceph cluster health monitoring (`--ceph`)
|
||||
|
||||
426
proxDoc.sh
426
proxDoc.sh
@@ -1,6 +1,37 @@
|
||||
#!/bin/bash
|
||||
|
||||
VERSION="1.1.0"
|
||||
VERSION="1.2.0"
|
||||
|
||||
###################
|
||||
# Timeout Configuration
|
||||
###################
|
||||
readonly CMD_TIMEOUT=30 # Default timeout in seconds for external commands
|
||||
|
||||
###################
|
||||
# Logging Configuration
|
||||
###################
|
||||
# Optional log file - set via environment variable PROXDOC_LOGFILE
|
||||
LOGFILE="${PROXDOC_LOGFILE:-}"
|
||||
|
||||
###################
|
||||
# Cached Data
|
||||
###################
|
||||
# Disk list cache - populated on first use
|
||||
DISK_LIST=""
|
||||
# Unit files cache - populated on first use
|
||||
UNIT_FILES=""
|
||||
|
||||
###################
|
||||
# Pattern Constants
|
||||
###################
|
||||
# Virtual/firewall interface patterns to skip
|
||||
readonly VIRTUAL_IFACE_PATTERN="^(veth|fwbr|fwln|fwpr|tap)"
|
||||
# Storage controller patterns for HBA detection
|
||||
readonly STORAGE_CONTROLLER_PATTERN="RAID|SAS|SATA|SCSI|Mass storage|Serial Attached|Fibre Channel|NVMe"
|
||||
# Disk device patterns
|
||||
readonly DISK_DEVICE_PATTERN="^sd|^nvme"
|
||||
# PCI devices to exclude from hardware info
|
||||
readonly EXCLUDED_PCI_PATTERN="Host bridge|PCI bridge|ISA bridge|SMBus|IOMMU|Dummy|USB controller|Audio device|Encryption controller|Multimedia controller"
|
||||
|
||||
###################
|
||||
# Color Definitions
|
||||
@@ -31,18 +62,42 @@ print_header() {
|
||||
}
|
||||
|
||||
|
||||
# Error handling flags
|
||||
ERRORS_OCCURRED=0
|
||||
WARNINGS_OCCURRED=0
|
||||
|
||||
cleanup() {
|
||||
# Cleanup function called on exit
|
||||
local exit_code=$?
|
||||
if [[ $exit_code -ne 0 ]]; then
|
||||
echo -e "\n${RED}Script terminated with exit code: $exit_code${NC}"
|
||||
fi
|
||||
# Add any cleanup tasks here (temp files, etc.)
|
||||
}
|
||||
|
||||
handle_error() {
|
||||
echo -e "${RED}Error: $1${NC}"
|
||||
local message="$1"
|
||||
local fatal="${2:-true}" # Default to fatal error
|
||||
echo -e "${RED}Error: $message${NC}" >&2
|
||||
ERRORS_OCCURRED=$((ERRORS_OCCURRED + 1))
|
||||
if [[ "$fatal" == "true" ]]; then
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
|
||||
log_message() {
|
||||
local level=$1
|
||||
local message=$2
|
||||
case $level in
|
||||
local level="$1"
|
||||
local message="$2"
|
||||
case "$level" in
|
||||
info) echo -e "${GREEN}[INFO]${NC} $message" ;;
|
||||
warn) echo -e "${YELLOW}[WARN]${NC} $message" ;;
|
||||
error) echo -e "${RED}[ERROR]${NC} $message" ;;
|
||||
warn)
|
||||
echo -e "${YELLOW}[WARN]${NC} $message"
|
||||
WARNINGS_OCCURRED=$((WARNINGS_OCCURRED + 1))
|
||||
;;
|
||||
error)
|
||||
echo -e "${RED}[ERROR]${NC} $message" >&2
|
||||
ERRORS_OCCURRED=$((ERRORS_OCCURRED + 1))
|
||||
;;
|
||||
esac
|
||||
}
|
||||
|
||||
@@ -67,6 +122,46 @@ checkIfOnHypervisor() {
|
||||
command -v pveversion >/dev/null 2>&1
|
||||
}
|
||||
|
||||
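# Get disk list with caching to avoid multiple lsblk calls (the cache only persists when this runs in the current shell; inside $(...) or a pipeline it runs in a subshell and DISK_LIST is repopulated on every call)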
|
||||
get_disk_list() {
|
||||
if [[ -z "$DISK_LIST" ]]; then
|
||||
DISK_LIST=$(lsblk -d -o name 2>/dev/null | grep -E "$DISK_DEVICE_PATTERN")
|
||||
fi
|
||||
echo "$DISK_LIST"
|
||||
}
|
||||
|
||||
# Get systemctl unit files with caching (UNIT_FILES is only retained when this runs in the current shell, not inside $(...) or a pipeline)
|
||||
get_unit_files() {
|
||||
if [[ -z "$UNIT_FILES" ]]; then
|
||||
UNIT_FILES=$(systemctl list-unit-files 2>/dev/null)
|
||||
fi
|
||||
echo "$UNIT_FILES"
|
||||
}
|
||||
|
||||
# Check whether a systemd unit file is present in the systemctl listing.
# Globals:   UNIT_FILES (read; filled in through get_unit_files when empty)
# Arguments: $1 - unit name (or fragment of one), matched as a literal string
# Returns:   0 if some line of the unit-file listing contains $1, non-zero otherwise
unit_file_exists() {
    local unit_name="$1"

    # An empty needle would match every line; treat it as "not found".
    [[ -n "$unit_name" ]] || return 1

    # Call get_unit_files with a plain redirect (no pipeline, no $(...)) so it
    # runs in the current shell and its assignment to UNIT_FILES is kept for
    # later lookups instead of being lost in a subshell.
    if [[ -z "${UNIT_FILES:-}" ]]; then
        get_unit_files >/dev/null
    fi

    # -F: literal match, so the "." in names like "hwmon.timer" is not a wildcard.
    # --: protects against a name that starts with "-".
    grep -qF -- "$unit_name" <<< "${UNIT_FILES:-}"
}
|
||||
|
||||
# Wait for a background process to finish, drawing a spinner while it runs.
# Usage: wait_for_process $pid [delay]
# Arguments: $1 - PID of a background child of this shell
#            $2 - seconds to sleep between polls (default 0.1)
# Outputs:   spinner frames on stdout, only when stdout is a terminal
# Returns:   the status `wait` reports for the PID; 1 if $1 is not a number
# Polls with kill -0 rather than ps -p so no extra process is spawned per tick.
wait_for_process() {
    local pid="$1"
    local delay="${2:-0.1}"
    local spinner='|/-\'
    local i=0
    local show_spinner=false

    # Reject a missing or malformed PID up front instead of passing it to
    # kill/wait.
    if [[ ! "$pid" =~ ^[0-9]+$ ]]; then
        return 1
    fi

    # Carriage-return animation only makes sense on a terminal; when stdout is
    # a pipe or the tee'd log file it would just fill the output with noise.
    if [[ -t 1 ]]; then
        show_spinner=true
    fi

    while kill -0 "$pid" 2>/dev/null; do
        if [[ "$show_spinner" == true ]]; then
            printf '\r%c ' "${spinner:i % ${#spinner}:1}"
            i=$((i + 1))
        fi
        sleep "$delay"
    done

    if [[ "$show_spinner" == true ]]; then
        printf '\r  \r'  # Clear spinner
    fi

    # wait's status becomes the function's return value.
    wait "$pid"
}
|
||||
|
||||
###################
|
||||
# System Information Functions
|
||||
###################
|
||||
@@ -102,35 +197,46 @@ get_temp_info() {
|
||||
get_disk_health() {
|
||||
echo -e "\n${GREEN}=== Disk Health Status ===${NC}"
|
||||
if command -v smartctl >/dev/null 2>&1; then
|
||||
for disk in $(lsblk -d -o name | grep -E '^sd|^nvme'); do
|
||||
while IFS= read -r disk; do
|
||||
[[ -z "$disk" ]] && continue
|
||||
echo -e "\nChecking /dev/$disk:"
|
||||
smartctl -H /dev/$disk
|
||||
done
|
||||
if ! timeout $CMD_TIMEOUT smartctl -H "/dev/$disk"; then
|
||||
log_message warn "smartctl timed out or failed for /dev/$disk"
|
||||
fi
|
||||
done <<< "$(get_disk_list)"
|
||||
else
|
||||
log_message warn "smartctl not found. Install smartmontools for disk health monitoring"
|
||||
fi
|
||||
}
|
||||
|
||||
get_cpu_info() {
|
||||
cpu_info=$(grep -m 1 -w 'model name' /proc/cpuinfo | awk -F: '{print $2}' | xargs) || {
|
||||
echo -e "${RED}Failed to retrieve CPU model information.${NC}"
|
||||
}
|
||||
cpu_cores=$(lscpu | grep '^CPU(s):' | awk '{print $2}')
|
||||
cpu_mhz=$(lscpu | grep 'MHz' | awk '{print $4}')
|
||||
local cpu_info cpu_cores cpu_mhz cpu_max_mhz
|
||||
|
||||
echo -e "${GREEN}CPU Model:${NC} $cpu_info"
|
||||
echo -e "${GREEN}CPU Cores:${NC} $cpu_cores"
|
||||
echo -e "${GREEN}CPU MHz:${NC} $cpu_mhz"
|
||||
cpu_info=$(grep -m 1 -w 'model name' /proc/cpuinfo 2>/dev/null | awk -F: '{print $2}' | xargs)
|
||||
cpu_cores=$(lscpu 2>/dev/null | grep -E '^CPU\(s\):' | awk '{print $2}')
|
||||
# Get CPU MHz from the first lscpu line matching "CPU MHz:" or "CPU max MHz:" (the pattern does not match "CPU(s) ... MHz:" labels)
|
||||
cpu_mhz=$(lscpu 2>/dev/null | grep -E '^CPU( )?(max )?MHz:' | head -1 | awk -F: '{print $2}' | xargs)
|
||||
cpu_max_mhz=$(lscpu 2>/dev/null | grep -E '^CPU max MHz:' | awk -F: '{print $2}' | xargs)
|
||||
|
||||
echo -e "${GREEN}CPU Model:${NC} ${cpu_info:-Unknown}"
|
||||
echo -e "${GREEN}CPU Cores:${NC} ${cpu_cores:-Unknown}"
|
||||
if [[ -n "$cpu_max_mhz" ]]; then
|
||||
echo -e "${GREEN}CPU MHz:${NC} ${cpu_mhz:-Unknown} (max: ${cpu_max_mhz})"
|
||||
else
|
||||
echo -e "${GREEN}CPU MHz:${NC} ${cpu_mhz:-Unknown}"
|
||||
fi
|
||||
}
|
||||
|
||||
get_ram_info() {
|
||||
ram_total=$(free -h | grep 'Mem:' | awk '{print $2}')
|
||||
ram_used=$(free -h | grep 'Mem:' | awk '{print $3}')
|
||||
ram_free=$(free -h | grep 'Mem:' | awk '{print $4}')
|
||||
local ram_total ram_used ram_free
|
||||
|
||||
echo -e "${GREEN}Total RAM:${NC} $ram_total"
|
||||
echo -e "${GREEN}Used RAM:${NC} $ram_used"
|
||||
echo -e "${GREEN}Free RAM:${NC} $ram_free"
|
||||
ram_total=$(free -h 2>/dev/null | grep 'Mem:' | awk '{print $2}')
|
||||
ram_used=$(free -h 2>/dev/null | grep 'Mem:' | awk '{print $3}')
|
||||
ram_free=$(free -h 2>/dev/null | grep 'Mem:' | awk '{print $4}')
|
||||
|
||||
echo -e "${GREEN}Total RAM:${NC} ${ram_total:-Unknown}"
|
||||
echo -e "${GREEN}Used RAM:${NC} ${ram_used:-Unknown}"
|
||||
echo -e "${GREEN}Free RAM:${NC} ${ram_free:-Unknown}"
|
||||
}
|
||||
|
||||
get_storage_info() {
|
||||
@@ -144,10 +250,13 @@ get_storage_info() {
|
||||
}
|
||||
|
||||
get_network_info() {
|
||||
default_gateway=$(ip route | grep default | awk '{print $3}')
|
||||
ip_addresses=$(hostname -I | xargs)
|
||||
echo -e "${GREEN}Default Gateway:${NC} $default_gateway"
|
||||
echo -e "${GREEN}IP Addresses:${NC} $ip_addresses"
|
||||
local default_gateway ip_addresses
|
||||
|
||||
default_gateway=$(ip route 2>/dev/null | grep default | awk '{print $3}')
|
||||
ip_addresses=$(hostname -I 2>/dev/null | xargs)
|
||||
|
||||
echo -e "${GREEN}Default Gateway:${NC} ${default_gateway:-Not configured}"
|
||||
echo -e "${GREEN}IP Addresses:${NC} ${ip_addresses:-None detected}"
|
||||
}
|
||||
|
||||
get_detailed_network() {
|
||||
@@ -173,7 +282,7 @@ get_hardware_info() {
|
||||
echo -e "${GREEN}BIOS Version:${NC} $(dmidecode -s bios-version)"
|
||||
echo -e "\n${GREEN}=== PCI Devices ===${NC}"
|
||||
# Show interesting devices, exclude bridges, infrastructure, and integrated motherboard devices
|
||||
lspci | grep -v -E "Host bridge|PCI bridge|ISA bridge|SMBus|IOMMU|Dummy|USB controller|Audio device|Encryption controller|Multimedia controller"
|
||||
lspci | grep -v -E "$EXCLUDED_PCI_PATTERN"
|
||||
}
|
||||
|
||||
get_motherboard_info() {
|
||||
@@ -190,33 +299,29 @@ get_motherboard_info() {
|
||||
get_memory_details() {
|
||||
echo -e "\n${GREEN}=== Memory DIMM Information ===${NC}"
|
||||
|
||||
# Use a more robust parsing approach
|
||||
local locator size type speed manufacturer
|
||||
local in_device=false
|
||||
|
||||
# Print header
|
||||
printf "%-12s %-12s %-10s %-12s %-20s\n" "Slot" "Size" "Type" "Speed" "Manufacturer"
|
||||
printf "%-12s %-12s %-10s %-12s %-20s\n" "----" "----" "----" "-----" "------------"
|
||||
|
||||
local locator="" size="" type="" speed="" manufacturer=""
|
||||
local in_device=false
|
||||
local dimm_count=0
|
||||
|
||||
while IFS= read -r line; do
|
||||
# Detect start of a memory device section
|
||||
if [[ "$line" =~ ^Memory[[:space:]]Device ]]; then
|
||||
# If we have data from previous device, print it
|
||||
if [[ -n "$locator" && -n "$size" && ! "$size" =~ (No|Not|Installed) ]]; then
|
||||
# Detect start of a memory device section (may have leading whitespace or not)
|
||||
if [[ "$line" =~ Memory[[:space:]]+Device$ ]] || [[ "$line" == "Memory Device" ]]; then
|
||||
# Print previous device if it had valid data
|
||||
if [[ -n "$locator" && -n "$size" && ! "$size" =~ ^(No|Not)[[:space:]] ]]; then
|
||||
printf "%-12s %-12s %-10s %-12s %-20s\n" \
|
||||
"${locator:-N/A}" \
|
||||
"${size:-N/A}" \
|
||||
"${type:-N/A}" \
|
||||
"${speed:-N/A}" \
|
||||
"${manufacturer:-N/A}"
|
||||
((dimm_count++))
|
||||
fi
|
||||
|
||||
# Reset variables for new device
|
||||
locator=""
|
||||
size=""
|
||||
type=""
|
||||
speed=""
|
||||
manufacturer=""
|
||||
# Reset for new device
|
||||
locator="" size="" type="" speed="" manufacturer=""
|
||||
in_device=true
|
||||
continue
|
||||
fi
|
||||
@@ -224,62 +329,62 @@ get_memory_details() {
|
||||
# Skip if not in a device section
|
||||
[[ "$in_device" != true ]] && continue
|
||||
|
||||
# Parse fields (case-insensitive, flexible whitespace)
|
||||
if [[ "$line" =~ ^[[:space:]]*Locator:[[:space:]]*(.+)$ ]] && [[ ! "$line" =~ Bank ]]; then
|
||||
# Parse fields - be very specific to avoid matching wrong lines
|
||||
# Locator (but not Bank Locator)
|
||||
if [[ "$line" =~ ^[[:space:]]*Locator:[[:space:]]*(.+)$ ]] && [[ ! "$line" == *Bank*Locator* ]]; then
|
||||
locator="${BASH_REMATCH[1]}"
|
||||
locator="${locator// /_}" # Replace spaces with underscores
|
||||
locator="${locator// /_}"
|
||||
# Size
|
||||
elif [[ "$line" =~ ^[[:space:]]*Size:[[:space:]]*(.+)$ ]]; then
|
||||
size="${BASH_REMATCH[1]}"
|
||||
elif [[ "$line" =~ ^[[:space:]]*Type:[[:space:]]*(.+)$ ]]; then
|
||||
# Type (exact match, not Type Detail or other Type fields)
|
||||
elif [[ "$line" =~ ^[[:space:]]*Type:[[:space:]]*([A-Za-z0-9]+)$ ]]; then
|
||||
type="${BASH_REMATCH[1]}"
|
||||
# Skip if it's an error or unknown type
|
||||
[[ "$type" =~ (Unknown|Error|Correction) ]] && type=""
|
||||
# Only clear if it's truly unknown
|
||||
[[ "$type" == "Unknown" ]] && type=""
|
||||
# Speed (exact field)
|
||||
elif [[ "$line" =~ ^[[:space:]]*Speed:[[:space:]]*(.+)$ ]]; then
|
||||
speed="${BASH_REMATCH[1]}"
|
||||
[[ "$speed" =~ Unknown ]] && speed=""
|
||||
[[ "$speed" == "Unknown" ]] && speed=""
|
||||
# Manufacturer
|
||||
elif [[ "$line" =~ ^[[:space:]]*Manufacturer:[[:space:]]*(.+)$ ]]; then
|
||||
manufacturer="${BASH_REMATCH[1]}"
|
||||
[[ "$manufacturer" =~ (Unknown|NO DIMM) ]] && manufacturer=""
|
||||
# Clear common placeholder values
|
||||
[[ "$manufacturer" =~ ^(Unknown|NO[[:space:]]DIMM|Not[[:space:]]Specified)$ ]] && manufacturer=""
|
||||
fi
|
||||
|
||||
# Empty line marks end of device section
|
||||
if [[ -z "$line" ]]; then
|
||||
# Empty line or new section marks end
|
||||
if [[ -z "$line" ]] || [[ "$line" =~ ^Handle ]]; then
|
||||
in_device=false
|
||||
fi
|
||||
|
||||
done < <(dmidecode -t memory 2>/dev/null)
|
||||
|
||||
# Print last device if it has data
|
||||
if [[ -n "$locator" && -n "$size" && ! "$size" =~ (No|Not|Installed) ]]; then
|
||||
# Print last device if valid
|
||||
if [[ -n "$locator" && -n "$size" && ! "$size" =~ ^(No|Not)[[:space:]] ]]; then
|
||||
printf "%-12s %-12s %-10s %-12s %-20s\n" \
|
||||
"${locator:-N/A}" \
|
||||
"${size:-N/A}" \
|
||||
"${type:-N/A}" \
|
||||
"${speed:-N/A}" \
|
||||
"${manufacturer:-N/A}"
|
||||
((dimm_count++))
|
||||
fi
|
||||
|
||||
# If no DIMMs were printed, show a message
|
||||
if [[ $dimm_count -eq 0 ]]; then
|
||||
echo " (Unable to parse DIMM details from dmidecode)"
|
||||
fi
|
||||
|
||||
# Memory summary
|
||||
echo -e "\n${GREEN}Memory Summary:${NC}"
|
||||
|
||||
# Count slots more reliably
|
||||
local total_slots=0
|
||||
local populated=0
|
||||
# Count slots and populated using simpler grep approach
|
||||
# Pattern ^[[:space:]]*Locator: already excludes "Bank Locator:" lines
|
||||
local total_slots populated
|
||||
total_slots=$(dmidecode -t memory 2>/dev/null | grep -c "^[[:space:]]*Locator:")
|
||||
|
||||
while IFS= read -r line; do
|
||||
if [[ "$line" =~ ^[[:space:]]*Locator: ]] && [[ ! "$line" =~ Bank ]]; then
|
||||
((total_slots++))
|
||||
fi
|
||||
done < <(dmidecode -t memory 2>/dev/null)
|
||||
|
||||
while IFS= read -r line; do
|
||||
if [[ "$line" =~ ^[[:space:]]*Size:[[:space:]]*(.+)$ ]]; then
|
||||
local size_val="${BASH_REMATCH[1]}"
|
||||
if [[ ! "$size_val" =~ (No|Not|Installed) ]]; then
|
||||
((populated++))
|
||||
fi
|
||||
fi
|
||||
done < <(dmidecode -t memory 2>/dev/null)
|
||||
populated=$(dmidecode -t memory 2>/dev/null | grep "^[[:space:]]*Size:" | grep -cv "No Module\|Not Installed")
|
||||
|
||||
echo -e " Total Slots: $total_slots"
|
||||
echo -e " Populated: $populated"
|
||||
@@ -349,8 +454,11 @@ get_physical_interfaces() {
|
||||
# Skip loopback
|
||||
[[ "$iface" == "lo" ]] && continue
|
||||
|
||||
# Skip bonding_masters (virtual file, not an interface)
|
||||
[[ "$iface" == "bonding_masters" ]] && continue
|
||||
|
||||
# Skip virtual/firewall interfaces
|
||||
[[ "$iface" =~ ^(veth|fwbr|fwln|fwpr|tap) ]] && continue
|
||||
[[ "$iface" =~ $VIRTUAL_IFACE_PATTERN ]] && continue
|
||||
|
||||
# This is a physical interface
|
||||
echo "$iface"
|
||||
@@ -361,9 +469,9 @@ get_hba_info() {
|
||||
echo -e "\n${GREEN}=== HBA/Storage Controller Information ===${NC}"
|
||||
|
||||
# Find RAID, SAS, SATA, SCSI, and storage controllers
|
||||
lspci -vmm 2>/dev/null | awk '
|
||||
lspci -vmm 2>/dev/null | awk -v pattern="$STORAGE_CONTROLLER_PATTERN" '
|
||||
BEGIN { RS=""; FS="\n" }
|
||||
/RAID|SAS|SATA|SCSI|Mass storage|Serial Attached|Fibre Channel|NVMe/ {
|
||||
$0 ~ pattern {
|
||||
for (i=1; i<=NF; i++) {
|
||||
if ($i ~ /^Slot:/) slot = substr($i, 7)
|
||||
if ($i ~ /^Class:/) class = substr($i, 8)
|
||||
@@ -382,7 +490,7 @@ get_hba_info() {
|
||||
|
||||
# Show detailed info for storage controllers
|
||||
echo -e "\n${GREEN}=== Storage Controller Details ===${NC}"
|
||||
for ctrl in $(lspci | grep -iE "RAID|SAS|SATA|SCSI|Mass storage|NVMe" | awk '{print $1}'); do
|
||||
for ctrl in $(lspci | grep -iE "$STORAGE_CONTROLLER_PATTERN" | awk '{print $1}'); do
|
||||
echo -e "\n${GREEN}Controller $ctrl:${NC}"
|
||||
lspci -vvs "$ctrl" 2>/dev/null | grep -E "^\s+(Subsystem|LnkSta|Kernel driver)" | head -5
|
||||
done
|
||||
@@ -418,16 +526,16 @@ get_ceph_health() {
|
||||
echo -e "\n${GREEN}=== Ceph Cluster Health ===${NC}"
|
||||
if command -v ceph >/dev/null 2>&1; then
|
||||
echo -e "${GREEN}Health Status:${NC}"
|
||||
ceph health detail 2>/dev/null || log_message warn "Cannot connect to Ceph cluster"
|
||||
timeout $CMD_TIMEOUT ceph health detail 2>/dev/null || log_message warn "Cannot connect to Ceph cluster or timed out"
|
||||
|
||||
echo -e "\n${GREEN}=== Ceph OSD Tree ===${NC}"
|
||||
ceph osd tree 2>/dev/null || true
|
||||
timeout $CMD_TIMEOUT ceph osd tree 2>/dev/null || log_message warn "Ceph OSD tree timed out"
|
||||
|
||||
echo -e "\n${GREEN}=== Ceph Pool Usage ===${NC}"
|
||||
ceph df 2>/dev/null || true
|
||||
timeout $CMD_TIMEOUT ceph df 2>/dev/null || log_message warn "Ceph df timed out"
|
||||
|
||||
echo -e "\n${GREEN}=== Ceph OSD Usage ===${NC}"
|
||||
ceph osd df 2>/dev/null || true
|
||||
timeout $CMD_TIMEOUT ceph osd df 2>/dev/null || log_message warn "Ceph OSD df timed out"
|
||||
else
|
||||
log_message info "Ceph tools not installed on this node"
|
||||
fi
|
||||
@@ -444,7 +552,7 @@ get_node_exporter_status() {
|
||||
else
|
||||
log_message warn "Port 9100 not listening"
|
||||
fi
|
||||
elif systemctl list-unit-files 2>/dev/null | grep -q node_exporter; then
|
||||
elif unit_file_exists node_exporter; then
|
||||
log_message warn "Node Exporter is installed but not running"
|
||||
echo -e "Start with: systemctl start node_exporter"
|
||||
else
|
||||
@@ -459,7 +567,7 @@ get_hwmon_status() {
|
||||
systemctl list-timers hwmon.timer --no-pager 2>/dev/null
|
||||
echo -e "\n${GREEN}Last Run:${NC}"
|
||||
journalctl -u hwmon.service -n 3 --no-pager 2>/dev/null || true
|
||||
elif systemctl list-unit-files 2>/dev/null | grep -q hwmon.timer; then
|
||||
elif unit_file_exists hwmon.timer; then
|
||||
log_message warn "hwmon timer is installed but not active"
|
||||
echo -e "Enable with: systemctl enable --now hwmon.timer"
|
||||
else
|
||||
@@ -467,6 +575,51 @@ get_hwmon_status() {
|
||||
fi
|
||||
}
|
||||
|
||||
# Valid check names for selective mode
|
||||
readonly VALID_CHECKS="cpu ram memory storage disk network hardware temps services ceph vms containers"
|
||||
|
||||
# Run only the diagnostics named in a comma-separated list.
# Globals:   VALID_CHECKS (read) - space-separated list of accepted check names
# Arguments: $1 - comma-separated check names, e.g. "cpu,ram,disk"
# Outputs:   progress through log_message plus each selected check's own output
# Exits:     1 (terminating the script) when the list is empty or contains an
#            unknown name; no check is run in that case
run_selective_checks() {
    local checks="$1"
    local -a check_array=()
    local check known is_valid

    # IFS is changed for this read only; an empty or separator-only list
    # yields an empty array.
    IFS=',' read -ra check_array <<< "$checks"

    if [[ ${#check_array[@]} -eq 0 ]]; then
        log_message error "No checks specified. Use --checks=cpu,ram,disk"
        echo "Valid checks: $VALID_CHECKS"
        exit 1
    fi

    # Validate every name before running anything, comparing whole words so an
    # item such as "cpu ram" cannot pass by matching a substring of the list.
    for check in "${check_array[@]}"; do
        is_valid=false
        # shellcheck disable=SC2086 # VALID_CHECKS is intentionally word-split
        for known in $VALID_CHECKS; do
            if [[ "$check" == "$known" ]]; then
                is_valid=true
                break
            fi
        done
        if [[ "$is_valid" != true ]]; then
            log_message error "Unknown check: $check"
            echo "Valid checks: $VALID_CHECKS"
            exit 1
        fi
    done

    log_message info "Running selective checks: $checks"
    echo ""

    for check in "${check_array[@]}"; do
        case "$check" in
            cpu)
                log_message info "Checking CPU..."
                get_cpu_info
                ;;
            ram)
                log_message info "Checking RAM..."
                get_ram_info
                ;;
            memory)
                log_message info "Checking memory details..."
                get_memory_details
                ;;
            storage)
                log_message info "Checking storage..."
                get_storage_info
                ;;
            disk)
                log_message info "Checking disk health..."
                get_disk_health
                ;;
            network)
                log_message info "Checking network..."
                get_network_info
                get_detailed_network
                get_nic_details
                ;;
            hardware)
                log_message info "Checking hardware..."
                get_hardware_info
                get_motherboard_info
                get_hba_info
                ;;
            temps)
                log_message info "Checking temperatures..."
                get_temp_info
                ;;
            services)
                log_message info "Checking services..."
                check_services
                ;;
            ceph)
                log_message info "Checking Ceph..."
                get_ceph_health
                ;;
            vms)
                log_message info "Checking VMs..."
                list_vms
                ;;
            containers)
                log_message info "Checking containers..."
                list_containers
                ;;
        esac
    done

    echo ""
    log_message info "Selective checks complete"
}
|
||||
|
||||
quick_health_check() {
|
||||
echo -e "\n${GREEN}=== Quick Health Check ===${NC}"
|
||||
echo -e "Running quick health assessment...\n"
|
||||
@@ -480,12 +633,15 @@ quick_health_check() {
|
||||
# Disk health (quick)
|
||||
echo -e "\n${GREEN}=== Disk Health Summary ===${NC}"
|
||||
if command -v smartctl >/dev/null 2>&1; then
|
||||
for disk in $(lsblk -d -o name | grep -E '^sd|^nvme'); do
|
||||
health=$(smartctl -H /dev/$disk 2>/dev/null | grep -i "health" | awk -F: '{print $2}' | xargs)
|
||||
while IFS= read -r disk; do
|
||||
[[ -z "$disk" ]] && continue
|
||||
health=$(timeout $CMD_TIMEOUT smartctl -H "/dev/$disk" 2>/dev/null | grep -i "health" | awk -F: '{print $2}' | xargs)
|
||||
if [[ -n "$health" ]]; then
|
||||
echo -e "/dev/$disk: $health"
|
||||
else
|
||||
echo -e "/dev/$disk: ${YELLOW}check timed out or unavailable${NC}"
|
||||
fi
|
||||
done
|
||||
done <<< "$(get_disk_list)"
|
||||
fi
|
||||
|
||||
# Node Exporter
|
||||
@@ -511,7 +667,8 @@ check_services() {
|
||||
local services=("pvedaemon" "pveproxy" "pvecluster" "pve-cluster" "corosync")
|
||||
for service in "${services[@]}"; do
|
||||
local status
|
||||
status=$(systemctl is-active "$service" 2>/dev/null || echo "not-found")
|
||||
status=$(systemctl is-active "$service" 2>/dev/null)
|
||||
[[ -z "$status" ]] && status="not-found"
|
||||
echo -e "${GREEN}$service:${NC} $status"
|
||||
done
|
||||
}
|
||||
@@ -574,6 +731,31 @@ help() {
|
||||
echo " --vm-list Check VM vitals"
|
||||
echo " --ct-list Check container vitals"
|
||||
echo " --backup Review backup health"
|
||||
echo " --checks=LIST Run only specific checks (comma-separated)"
|
||||
echo ""
|
||||
echo "Valid checks for --checks option:"
|
||||
echo " cpu, ram, memory, storage, disk, network, hardware, temps,"
|
||||
echo " services, ceph, vms, containers"
|
||||
echo ""
|
||||
echo "Examples:"
|
||||
echo " Run full diagnostics:"
|
||||
echo " curl -sL \"http://10.10.10.63:3000/LotusGuild/proxDoc/raw/branch/main/proxDoc.sh\" | bash -s -- --diags"
|
||||
echo ""
|
||||
echo " Quick health check:"
|
||||
echo " curl -sL \"http://10.10.10.63:3000/LotusGuild/proxDoc/raw/branch/main/proxDoc.sh\" | bash -s -- --quick"
|
||||
echo ""
|
||||
echo " Check only services and VMs:"
|
||||
echo " curl -sL \"http://10.10.10.63:3000/LotusGuild/proxDoc/raw/branch/main/proxDoc.sh\" | bash -s -- --services"
|
||||
echo " curl -sL \"http://10.10.10.63:3000/LotusGuild/proxDoc/raw/branch/main/proxDoc.sh\" | bash -s -- --vm-list"
|
||||
echo ""
|
||||
echo " View drive bay mapping:"
|
||||
echo " curl -sL \"http://10.10.10.63:3000/LotusGuild/proxDoc/raw/branch/main/proxDoc.sh\" | bash -s -- --drives"
|
||||
echo ""
|
||||
echo " Check Ceph cluster health:"
|
||||
echo " curl -sL \"http://10.10.10.63:3000/LotusGuild/proxDoc/raw/branch/main/proxDoc.sh\" | bash -s -- --ceph"
|
||||
echo ""
|
||||
echo " Run only CPU and RAM checks:"
|
||||
echo " curl -sL \"http://10.10.10.63:3000/LotusGuild/proxDoc/raw/branch/main/proxDoc.sh\" | bash -s -- --checks=cpu,ram"
|
||||
exit 0
|
||||
}
|
||||
|
||||
@@ -583,7 +765,28 @@ help() {
|
||||
###################
|
||||
# Main Functions
|
||||
###################
|
||||
|
||||
# Mirror all subsequent script output into $LOGFILE when one is configured.
# Globals:   LOGFILE (read; cleared when the log target turns out to be unusable)
# Outputs:   after a successful call, stdout and stderr of the current shell
#            both flow through `tee -a "$LOGFILE"`
# Returns:   0 always; an unusable log target only disables logging
setup_logging() {
    local log_dir

    # Logging is opt-in: nothing to do without a target file.
    [[ -n "$LOGFILE" ]] || return 0

    # Create log directory if needed
    log_dir=$(dirname -- "$LOGFILE")
    if [[ ! -d "$log_dir" ]] && ! mkdir -p -- "$log_dir" 2>/dev/null; then
        log_message warn "Cannot create log directory: $log_dir"
        LOGFILE=""
        return 0
    fi

    # Probe with an append-open before redirecting. Otherwise tee would only
    # complain on its own stderr while the script announces that it is logging.
    if ! { : >> "$LOGFILE"; } 2>/dev/null; then
        log_message warn "Cannot write to log file: $LOGFILE"
        LOGFILE=""
        return 0
    fi

    log_message info "Logging output to: $LOGFILE"

    # tee runs as a separate process attached through process substitution;
    # stderr is merged into the same stream so both end up in the log.
    exec > >(tee -a "$LOGFILE") 2>&1
}
|
||||
|
||||
runDiags() {
|
||||
setup_logging
|
||||
log_message info "Beginning system examination..."
|
||||
|
||||
# Check if running on Proxmox
|
||||
@@ -649,10 +852,49 @@ runDiags() {
|
||||
|
||||
echo ""
|
||||
log_message info "Examination complete"
|
||||
|
||||
# Print summary if there were issues
|
||||
if [[ $WARNINGS_OCCURRED -gt 0 || $ERRORS_OCCURRED -gt 0 ]]; then
|
||||
echo -e "\n${YELLOW}=== Summary ===${NC}"
|
||||
[[ $WARNINGS_OCCURRED -gt 0 ]] && echo -e "Warnings: $WARNINGS_OCCURRED"
|
||||
[[ $ERRORS_OCCURRED -gt 0 ]] && echo -e "Errors: $ERRORS_OCCURRED"
|
||||
fi
|
||||
}
|
||||
|
||||
# Whitelist of valid command options
|
||||
readonly VALID_OPTIONS="--help --diags --quick --drives --ceph --node-exporter --hwmon --services --vm-list --ct-list --backup --checks"
|
||||
|
||||
validate_input() {
|
||||
local input="$1"
|
||||
# Check if input matches valid option pattern (starts with -- and a lowercase letter, followed only by lowercase letters, hyphens, equals signs, and commas)
|
||||
if [[ ! "$input" =~ ^--[a-z][-a-z=,]*$ ]]; then
|
||||
return 1
|
||||
fi
|
||||
# Extract the option name (before any = sign)
|
||||
local opt_name="${input%%=*}"
|
||||
# Check against whitelist
|
||||
if [[ ! " $VALID_OPTIONS " =~ " $opt_name " ]]; then
|
||||
return 1
|
||||
fi
|
||||
return 0
|
||||
}
|
||||
|
||||
checkForInput() {
|
||||
case $1 in
|
||||
local input="$1"
|
||||
|
||||
# Validate input against whitelist
|
||||
if ! validate_input "$input"; then
|
||||
echo -e "${RED}Invalid option: $input${NC}"
|
||||
echo -e "Use --help to see available options."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Extract option name and value for --checks=X pattern
|
||||
local opt_name="${input%%=*}"
|
||||
local opt_value="${input#*=}"
|
||||
[[ "$opt_name" == "$opt_value" ]] && opt_value=""
|
||||
|
||||
case "$opt_name" in
|
||||
--help) help ;;
|
||||
--diags) check_requirements; runDiags ;;
|
||||
--quick) quick_health_check ;;
|
||||
@@ -664,7 +906,7 @@ checkForInput() {
|
||||
--vm-list) list_vms ;;
|
||||
--ct-list) list_containers ;;
|
||||
--backup) echo -e "${GREEN}Backup Status:${NC}"; pvesm status 2>/dev/null || log_message warn "pvesm not available" ;;
|
||||
*) echo -e "${RED}Invalid option: $1${NC}"; help ;;
|
||||
--checks) run_selective_checks "$opt_value" ;;
|
||||
esac
|
||||
}
|
||||
|
||||
@@ -682,8 +924,10 @@ if [[ $EUID -ne 0 ]]; then
|
||||
handle_error "This script must be run as root"
|
||||
fi
|
||||
|
||||
# Set trap for interrupts
|
||||
trap 'echo -e "${RED}Script interrupted.${NC}"; exit 1' INT TERM
|
||||
# Set trap for cleanup and interrupts
|
||||
trap cleanup EXIT
|
||||
trap 'echo -e "\n${RED}Script interrupted by user.${NC}"; exit 130' INT
|
||||
trap 'echo -e "\n${RED}Script terminated.${NC}"; exit 143' TERM
|
||||
|
||||
if [[ -n $argOne ]]; then
|
||||
checkForInput "$argOne"
|
||||
|
||||
Reference in New Issue
Block a user