#!/bin/bash
#
# Fresh-install bootstrap for a Proxmox host (PVE or PBS).
#
# Installs a common package set, Prometheus Node Exporter, the Promtail log
# agent (shipping to Loki), and the hwmon systemd service/timer units.
# Must be run with enough privilege to write to /etc, /usr/local/bin and to
# manage systemd units.
set -e

echo "Starting Proxmox fresh installation script..."

# =============================================================================
# Platform Detection
# =============================================================================
# PLATFORM is one of: pve | pbs | unknown. Later sections branch on it.
PLATFORM="unknown"
if command -v pveversion &>/dev/null; then
  PLATFORM="pve"
  echo "Detected platform: Proxmox VE ($(pveversion 2>/dev/null || echo 'version unknown'))"
elif command -v proxmox-backup-manager &>/dev/null; then
  PLATFORM="pbs"
  echo "Detected platform: Proxmox Backup Server"
else
  echo "WARNING: Could not detect Proxmox platform (PVE or PBS)"
  echo "Proceeding with common package installation..."
fi

#######################################
# Roll back everything this script installs: stop/disable the services,
# remove unit files, binaries, downloaded archives (relative to the current
# directory), Promtail state, and the node_exporter user.
# Globals:   PROMTAIL_TMP (read, optional)
# Arguments: none
#######################################
cleanup() {
  echo "Cleaning up on error..."
  # Each step is best-effort: the unit/user may not exist yet.
  systemctl stop node_exporter 2>/dev/null || true
  systemctl stop promtail 2>/dev/null || true
  systemctl stop hwmon.timer 2>/dev/null || true
  systemctl disable node_exporter 2>/dev/null || true
  systemctl disable promtail 2>/dev/null || true
  systemctl disable hwmon.timer 2>/dev/null || true
  rm -f /etc/systemd/system/node_exporter.service
  rm -f /etc/systemd/system/promtail.service
  rm -f /etc/systemd/system/hwmon.service
  rm -f /etc/systemd/system/hwmon.timer
  rm -f /usr/local/bin/node_exporter
  rm -f /usr/local/bin/promtail
  rm -rf node_exporter-*.linux-amd64.tar.gz node_exporter-*.linux-amd64
  rm -rf /etc/promtail /var/lib/promtail
  if [[ -n "${PROMTAIL_TMP:-}" ]]; then
    rm -rf -- "$PROMTAIL_TMP"
  fi
  userdel node_exporter 2>/dev/null || true
  systemctl daemon-reload
  echo "Cleanup completed."
}

#######################################
# EXIT handler: run cleanup whenever the script terminates with a non-zero
# status, then re-exit with that same status.
#
# An ERR trap alone is not enough here: ERR does not fire for an explicit
# `exit 1`, nor for a command that fails inside an `if !` condition, and
# those are exactly the paths this script uses to report errors. EXIT fires
# on every termination path, so the status check decides whether to roll back.
#######################################
on_exit() {
  local status=$?
  trap - EXIT ERR
  if ((status != 0)); then
    cleanup
  fi
  exit "$status"
}
trap on_exit EXIT

# -----------------------------------------------------------------------------
# Install dependencies
# -----------------------------------------------------------------------------
echo "Installing required packages..."
apt-get update

# Common packages for all platforms (array so each name is passed as one word).
COMMON_PKGS=(
  python3-pip smartmontools python3-psutil python3-requests
  lm-sensors fastfetch rsync jq sysstat unzip
)

if [[ "$PLATFORM" == "pve" ]]; then
  echo "Installing PVE-specific packages..."
  apt-get install -y "${COMMON_PKGS[@]}" iperf3 fio nvme-cli
elif [[ "$PLATFORM" == "pbs" ]]; then
  echo "Installing PBS-specific packages..."
  apt-get install -y "${COMMON_PKGS[@]}" zfsutils-linux nvme-cli
else
  echo "Installing common packages..."
  apt-get install -y "${COMMON_PKGS[@]}" nvme-cli
fi

# -----------------------------------------------------------------------------
# Install Node Exporter
# -----------------------------------------------------------------------------
echo "Installing Prometheus Node Exporter..."
NODE_EXPORTER_VERSION="1.8.2"
NODE_EXPORTER_DIR="node_exporter-${NODE_EXPORTER_VERSION}.linux-amd64"
NODE_EXPORTER_TARBALL="${NODE_EXPORTER_DIR}.tar.gz"

# Download Node Exporter with error handling.
# -O pins the output name: without it wget writes "<name>.1" when a stale
# tarball is already present, and the stale file would be the one extracted.
echo "Downloading Node Exporter..."
if ! wget --timeout=30 --tries=3 -O "$NODE_EXPORTER_TARBALL" \
  "https://github.com/prometheus/node_exporter/releases/download/v${NODE_EXPORTER_VERSION}/${NODE_EXPORTER_TARBALL}"; then
  echo "ERROR: Failed to download Node Exporter"
  exit 1
fi

# Extract with error checking (exact filename, not a glob, so a second
# matching archive cannot be misread by tar as a member name).
if ! tar xvfz "$NODE_EXPORTER_TARBALL"; then
  echo "ERROR: Failed to extract Node Exporter archive"
  exit 1
fi

# Check if user already exists
if ! id "node_exporter" &>/dev/null; then
  echo "Creating node_exporter user..."
  useradd -rs /bin/false node_exporter
else
  echo "node_exporter user already exists, skipping creation..."
fi

# Move binary to proper location
if ! mv "${NODE_EXPORTER_DIR}/node_exporter" /usr/local/bin/; then
  echo "ERROR: Failed to move Node Exporter binary"
  exit 1
fi

# Cleanup downloaded files
rm -rf node_exporter-*.linux-amd64.tar.gz node_exporter-*.linux-amd64

# Set proper permissions
chown node_exporter:node_exporter /usr/local/bin/node_exporter
chmod +x /usr/local/bin/node_exporter

# Create node_exporter service file (quoted delimiter: written literally).
cat > /etc/systemd/system/node_exporter.service << 'EOL'
[Unit]
Description=Node Exporter
After=network.target

[Service]
User=node_exporter
Group=node_exporter
Type=simple
ExecStart=/usr/local/bin/node_exporter

[Install]
WantedBy=multi-user.target
EOL

# Enable and start node_exporter
systemctl daemon-reload
systemctl enable node_exporter
systemctl start node_exporter

# Check if Node Exporter started successfully
if ! systemctl is-active --quiet node_exporter; then
  echo "ERROR: Node Exporter failed to start"
  systemctl status node_exporter --no-pager || true
  exit 1
fi

# -----------------------------------------------------------------------------
# Install Promtail (Loki log agent)
# -----------------------------------------------------------------------------
echo "Installing Promtail log agent..."
PROMTAIL_VERSION="3.4.2"
LOKI_URL="http://10.10.10.69:3100"

# Private scratch directory instead of fixed names in the shared /tmp.
PROMTAIL_TMP=$(mktemp -d)

echo "Downloading Promtail..."
if ! wget --timeout=30 --tries=3 \
  "https://github.com/grafana/loki/releases/download/v${PROMTAIL_VERSION}/promtail-linux-amd64.zip" \
  -O "${PROMTAIL_TMP}/promtail.zip"; then
  echo "ERROR: Failed to download Promtail"
  exit 1
fi

if ! command -v unzip &>/dev/null; then
  apt-get install -y unzip
fi

if ! unzip -o "${PROMTAIL_TMP}/promtail.zip" -d "$PROMTAIL_TMP"; then
  echo "ERROR: Failed to extract Promtail"
  exit 1
fi

mv "${PROMTAIL_TMP}/promtail-linux-amd64" /usr/local/bin/promtail
chmod +x /usr/local/bin/promtail
rm -rf -- "$PROMTAIL_TMP"

# Create Promtail directories
mkdir -p /etc/promtail /var/lib/promtail

# Get hostname for labeling
PROMTAIL_HOST=$(hostname)

# Create Promtail config. Unquoted delimiter: ${LOKI_URL} and
# ${PROMTAIL_HOST} are expanded now, at install time.
cat > /etc/promtail/config.yml << PROMTAILEOF
server:
  http_listen_port: 9080
  grpc_listen_port: 0

positions:
  filename: /var/lib/promtail/positions.yaml

clients:
  - url: ${LOKI_URL}/loki/api/v1/push

scrape_configs:
  - job_name: syslog
    static_configs:
      - targets:
          - localhost
        labels:
          job: syslog
          host: ${PROMTAIL_HOST}
          __path__: /var/log/syslog

  - job_name: auth
    static_configs:
      - targets:
          - localhost
        labels:
          job: auth
          host: ${PROMTAIL_HOST}
          __path__: /var/log/auth.log

  - job_name: kern
    static_configs:
      - targets:
          - localhost
        labels:
          job: kernel
          host: ${PROMTAIL_HOST}
          __path__: /var/log/kern.log

  - job_name: daemon
    static_configs:
      - targets:
          - localhost
        labels:
          job: daemon
          host: ${PROMTAIL_HOST}
          __path__: /var/log/daemon.log

  - job_name: pveproxy
    static_configs:
      - targets:
          - localhost
        labels:
          job: pveproxy
          host: ${PROMTAIL_HOST}
          __path__: /var/log/pveproxy/access.log

  - job_name: pvedaemon
    static_configs:
      - targets:
          - localhost
        labels:
          job: pvedaemon
          host: ${PROMTAIL_HOST}
          __path__: /var/log/pvedaemon.log

  - job_name: pve-tasks
    static_configs:
      - targets:
          - localhost
        labels:
          job: pve-tasks
          host: ${PROMTAIL_HOST}
          __path__: /var/log/pve/tasks/active

  - job_name: ceph
    static_configs:
      - targets:
          - localhost
        labels:
          job: ceph
          host: ${PROMTAIL_HOST}
          __path__: /var/log/ceph/*.log

  - job_name: journal
    journal:
      max_age: 12h
      labels:
        job: journal
        host: ${PROMTAIL_HOST}
    relabel_configs:
      - source_labels: ['__journal__systemd_unit']
        target_label: 'unit'
      - source_labels: ['__journal_priority_keyword']
        target_label: 'priority'
PROMTAILEOF

# Create Promtail systemd service
cat > /etc/systemd/system/promtail.service << 'EOL'
[Unit]
Description=Promtail Log Agent
After=network.target

[Service]
Type=simple
ExecStart=/usr/local/bin/promtail -config.file=/etc/promtail/config.yml
Restart=always
RestartSec=5

[Install]
WantedBy=multi-user.target
EOL

systemctl daemon-reload
systemctl enable promtail
systemctl start promtail

# A Promtail failure is reported but deliberately does not abort the install.
if ! systemctl is-active --quiet promtail; then
  echo "WARNING: Promtail failed to start"
  systemctl status promtail --no-pager || true
else
  echo "✓ Promtail is running and shipping logs to Loki"
fi

# -----------------------------------------------------------------------------
# Install hwmon daemon
# -----------------------------------------------------------------------------
echo "Installing hwmon daemon..."
HWMON_RAW_URL="http://10.10.10.63:3000/LotusGuild/hwmonDaemon/raw/branch/main"

# Download hwmon service files with error handling.
# --fail makes curl exit non-zero on HTTP errors (404/500); without it the
# server's error page would be saved as the unit file and pass the size check.
echo "Downloading hwmon service files..."
if ! curl --fail --max-time 30 --retry 3 \
  -o /etc/systemd/system/hwmon.service "${HWMON_RAW_URL}/hwmon.service"; then
  echo "ERROR: Failed to download hwmon.service"
  exit 1
fi

if ! curl --fail --max-time 30 --retry 3 \
  -o /etc/systemd/system/hwmon.timer "${HWMON_RAW_URL}/hwmon.timer"; then
  echo "ERROR: Failed to download hwmon.timer"
  exit 1
fi

# Verify downloaded files exist and are not empty
if [[ ! -s /etc/systemd/system/hwmon.service ]]; then
  echo "ERROR: hwmon.service file is empty or missing"
  exit 1
fi

if [[ ! -s /etc/systemd/system/hwmon.timer ]]; then
  echo "ERROR: hwmon.timer file is empty or missing"
  exit 1
fi

# Create configuration directory for hwmon
echo "Setting up hwmon configuration..."
mkdir -p /etc/hwmonDaemon
mkdir -p /var/log/hwmonDaemon

# Prompt for API key or use default
echo ""
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo " hwmonDaemon API Key Configuration"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo ""
echo "The hwmonDaemon requires an API key to create tickets."
echo "You can enter it now or configure it later by editing:"
echo " /etc/hwmonDaemon/.env"
echo ""
# -r keeps backslashes in the key literal. `read` returns non-zero on EOF
# (e.g. stdin is not a terminal); treat that as "skip" rather than letting
# `set -e` abort the whole installation at the prompt.
API_KEY=""
read -r -p "Enter API key (or press Enter to skip): " API_KEY || API_KEY=""

# Determine platform-specific defaults
if [[ "$PLATFORM" == "pbs" ]]; then
  PBS_DEFAULT="true"
  CEPH_DEFAULT="false"
else
  PBS_DEFAULT="false"
  CEPH_DEFAULT="true"
fi

#######################################
# Write /etc/hwmonDaemon/.env with all config options.
# Globals:   PLATFORM, CEPH_DEFAULT, PBS_DEFAULT (read)
# Arguments: $1 - value for TICKET_API_KEY
#            $2 - text of the second header comment line
# Outputs:   none (writes the file, mode 600)
#######################################
write_hwmon_env() {
  local key=$1
  local header=$2
  local env_file=/etc/hwmonDaemon/.env

  # Create/truncate and restrict the file BEFORE the key is written, so the
  # secret is never on disk with default-umask permissions.
  (
    umask 077
    : > "$env_file"
  )
  chmod 600 "$env_file"

  cat > "$env_file" << EOF
# hwmonDaemon Configuration
# ${header}
# Platform: ${PLATFORM}

# Ticket API Configuration
TICKET_API_KEY=${key}
TICKET_API_URL=http://10.10.10.45/create_ticket_api.php

# Cluster Identification
CLUSTER_NAME=proxmox-cluster

# Ceph Monitoring (PVE nodes)
CEPH_ENABLED=${CEPH_DEFAULT}
#CEPH_TICKET_NODE=
#CEPH_USAGE_WARNING=70
#CEPH_USAGE_CRITICAL=85

# PBS Monitoring (Proxmox Backup Server)
PBS_ENABLED=${PBS_DEFAULT}
#PBS_ZFS_WARNING=80
#PBS_ZFS_CRITICAL=90

# Prometheus Metrics
PROMETHEUS_ENABLED=false
#PROMETHEUS_PORT=9101

# Health Check Endpoint
HEALTH_SERVER_ENABLED=false
#HEALTH_SERVER_PORT=9102
EOF
}

if [[ -n "$API_KEY" ]]; then
  # Create .env file with API key and all config options
  write_hwmon_env "$API_KEY" "Auto-generated by freshStart.sh on $(date)"
  echo "✓ API key configured in /etc/hwmonDaemon/.env"
else
  # Create template .env file; the placeholder value is what the final
  # reminder check greps for.
  write_hwmon_env "your_api_key_here" "Edit this file to add your API key"
  echo "⚠️ WARNING: API key not configured!"
  echo " Edit /etc/hwmonDaemon/.env to add your API key before tickets can be created"
fi
echo ""

# Start the hwmon daemon
systemctl daemon-reload
systemctl enable hwmon.timer
systemctl start hwmon.timer

# Check if hwmon timer started successfully
if ! systemctl is-active --quiet hwmon.timer; then
  echo "ERROR: hwmon timer failed to start"
  systemctl status hwmon.timer --no-pager || true
  exit 1
fi

# Final verification
echo "Verifying installation..."
echo "Node Exporter status: $(systemctl is-active node_exporter)"
echo "Promtail status: $(systemctl is-active promtail)"
echo "hwmon timer status: $(systemctl is-active hwmon.timer)"

echo "Node Exporter port check:"
if ss -tlnp | grep :9100; then
  echo "✓ Node Exporter is listening on port 9100"
else
  echo "WARNING: Node Exporter not listening on port 9100"
fi

# Test hwmon with error handling.
# NOTE(review): this fetches Python source over plain HTTP and exec()s it as
# the current user with no integrity check. Left as-is because it is the
# existing contract with the hwmonDaemon repo; consider pinning a checksum or
# running the installed copy instead.
echo "Testing hwmon dry-run..."
if ! /usr/bin/env python3 -c "import urllib.request; exec(urllib.request.urlopen('http://10.10.10.63:3000/LotusGuild/hwmonDaemon/raw/branch/main/hwmonDaemon.py').read().decode('utf-8'))" --dry-run; then
  echo "WARNING: hwmon dry-run test failed, but services are installed"
fi

# Disable cleanup trap on successful completion
trap - ERR

echo "✓ Installation complete! All services are running correctly."
# -----------------------------------------------------------------------------
# Post-install summary: endpoints, config files, and where to find logs.
# -----------------------------------------------------------------------------
# Unquoted heredoc delimiter so the platform name, primary IP and Loki URL
# are expanded inline.
cat << SUMMARY

Platform: ${PLATFORM^^}

Services installed:
 - Node Exporter: http://$(hostname -I | awk '{print $1}'):9100/metrics
 - Promtail: Shipping logs to Loki at ${LOKI_URL:-http://10.10.10.69:3100}
 - hwmon daemon: Monitoring system health hourly

Configuration files:
 - Promtail config: /etc/promtail/config.yml
 - hwmon config: /etc/hwmonDaemon/.env

Log locations:
 - Node Exporter: journalctl -u node_exporter
 - Promtail: journalctl -u promtail
 - hwmon: journalctl -u hwmon.service
 - hwmon logs: /var/log/hwmonDaemon/

SUMMARY

# Platform-specific notes; nothing is printed for an undetected platform.
case "$PLATFORM" in
  pbs)
    printf '%s\n' \
      "PBS-specific monitoring enabled:" \
      " - ZFS pool health and usage monitoring" \
      " - Failed backup/GC/sync task detection" \
      ""
    ;;
  pve)
    printf '%s\n' \
      "PVE-specific monitoring enabled:" \
      " - Ceph cluster health monitoring" \
      " - LXC container storage monitoring" \
      ""
    ;;
esac

# Remind the operator when the env file is missing/empty or still carries the
# placeholder key.
hwmon_env_file=/etc/hwmonDaemon/.env
if ! [[ -s "$hwmon_env_file" ]] || grep -q "your_api_key_here" "$hwmon_env_file" 2>/dev/null; then
  echo "⚠️ REMINDER: Configure your API key in /etc/hwmonDaemon/.env"
fi