#!/bin/bash
#
# Fresh-install bootstrap for a Proxmox host. This script installs base
# packages, Prometheus Node Exporter, the Promtail log agent and the hwmon
# timer, then prints a summary of what was set up.
#
# Abort on the first unhandled command failure.
set -e

echo "Starting Proxmox fresh installation script..."

# =============================================================================
# Platform Detection
# =============================================================================
# PLATFORM is read by the later package-selection and configuration steps.
# Values:
#   pve     - the pveversion command is available
#   pbs     - the proxmox-backup-manager command is available
#   unknown - neither command was found; only common packages are installed
PLATFORM="unknown"
if command -v pveversion >/dev/null 2>&1; then
  PLATFORM="pve"
  echo " Detected platform: Proxmox VE ( $(pveversion 2>/dev/null || echo 'version unknown') ) "
elif command -v proxmox-backup-manager >/dev/null 2>&1; then
  PLATFORM="pbs"
  echo "Detected platform: Proxmox Backup Server"
else
  echo "WARNING: Could not detect Proxmox platform (PVE or PBS)"
  echo "Proceeding with common package installation..."
fi
#######################################
# Roll back a partially completed installation.
#
# Stops and disables the node_exporter, promtail and hwmon.timer units,
# deletes their unit files and binaries, removes Node Exporter download
# artifacts from the current directory, removes the Promtail config/state
# directories and deletes the node_exporter user.
#
# Every step is best-effort: a unit or user that does not exist yet must
# not stop the remaining cleanup from running.
#
# Arguments: none
# Outputs:   progress messages on stdout
# Returns:   0
#######################################
cleanup() {
  local unit

  echo "Cleaning up on error..."

  for unit in node_exporter promtail hwmon.timer; do
    systemctl stop "$unit" 2>/dev/null || true
  done
  for unit in node_exporter promtail hwmon.timer; do
    systemctl disable "$unit" 2>/dev/null || true
  done

  rm -f /etc/systemd/system/node_exporter.service
  rm -f /etc/systemd/system/promtail.service
  rm -f /etc/systemd/system/hwmon.service
  rm -f /etc/systemd/system/hwmon.timer
  rm -f /usr/local/bin/node_exporter
  rm -f /usr/local/bin/promtail
  # Download artifacts live in the directory the script was started from.
  rm -rf node_exporter-*.linux-amd64.tar.gz node_exporter-*.linux-amd64
  rm -rf /etc/promtail /var/lib/promtail

  userdel node_exporter 2>/dev/null || true
  # Make systemd forget the unit files removed above; failure is not fatal.
  systemctl daemon-reload || true

  echo "Cleanup completed."
}

# Run cleanup whenever the script terminates with a non-zero status.
# An ERR trap is not enough here: it does not fire for commands tested by
# "if ! cmd" nor for the explicit "exit 1" calls the install steps use, so
# those failure paths would leave a half-installed system behind. The EXIT
# trap sees every exit path; a successful run (status 0) is left untouched.
trap '_exit_status=$?; if [[ $_exit_status -ne 0 ]]; then cleanup || true; fi' EXIT
# Install dependencies
echo "Installing required packages..."
apt-get update

# Common packages for all platforms
COMMON_PKGS="python3-pip smartmontools python3-psutil python3-requests lm-sensors fastfetch rsync jq sysstat unzip"

# Split the space-separated list into an array once, so every package is
# passed to apt-get as its own quoted argument instead of relying on
# unquoted word-splitting at each call site.
read -r -a common_pkgs <<< "$COMMON_PKGS"

# PLATFORM is set by the platform detection step; anything other than
# pve/pbs (including unset) gets the common package set only.
case "${PLATFORM:-unknown}" in
  pve)
    echo "Installing PVE-specific packages..."
    apt-get install -y "${common_pkgs[@]}" iperf3 fio nvme-cli
    ;;
  pbs)
    echo "Installing PBS-specific packages..."
    apt-get install -y "${common_pkgs[@]}" zfsutils-linux nvme-cli
    ;;
  *)
    echo "Installing common packages..."
    apt-get install -y "${common_pkgs[@]}" nvme-cli
    ;;
esac
# Install Node Exporter
echo "Installing Prometheus Node Exporter..."
NODE_EXPORTER_VERSION="1.8.2"
# Base name of the release artifact. The tarball is "<name>.tar.gz" and the
# binary is taken from the "<name>/" directory it unpacks into.
node_exporter_release="node_exporter-${NODE_EXPORTER_VERSION}.linux-amd64"

# Download Node Exporter with error handling.
# -O pins the output file name so a leftover tarball from an earlier run is
# overwritten instead of wget saving a second copy under a ".1" suffix.
echo "Downloading Node Exporter..."
if ! wget --timeout=30 --tries=3 \
  -O "${node_exporter_release}.tar.gz" \
  "https://github.com/prometheus/node_exporter/releases/download/v${NODE_EXPORTER_VERSION}/${node_exporter_release}.tar.gz"; then
  echo "ERROR: Failed to download Node Exporter"
  exit 1
fi

# Extract with error checking
if ! tar -xzvf "${node_exporter_release}.tar.gz"; then
  echo "ERROR: Failed to extract Node Exporter archive"
  exit 1
fi

# Create the service account only if it does not exist yet
if ! id "node_exporter" >/dev/null 2>&1; then
  echo "Creating node_exporter user..."
  useradd -rs /bin/false node_exporter
else
  echo "node_exporter user already exists, skipping creation..."
fi

# Move binary to proper location
if ! mv "${node_exporter_release}/node_exporter" /usr/local/bin/; then
  echo "ERROR: Failed to move Node Exporter binary"
  exit 1
fi

# Cleanup downloaded files
rm -rf -- "${node_exporter_release}.tar.gz" "${node_exporter_release}"

# Set proper permissions
chown node_exporter:node_exporter /usr/local/bin/node_exporter
chmod +x /usr/local/bin/node_exporter

# Create node_exporter service file.
# The heredoc delimiter is quoted, so the unit text is written literally.
cat > /etc/systemd/system/node_exporter.service << 'EOL'
[Unit]
Description=Node Exporter
After=network.target

[Service]
User=node_exporter
Group=node_exporter
Type=simple
ExecStart=/usr/local/bin/node_exporter

[Install]
WantedBy=multi-user.target
EOL

# Enable and start node_exporter
systemctl daemon-reload
systemctl enable node_exporter
systemctl start node_exporter

# Check if Node Exporter started successfully
if ! systemctl is-active --quiet node_exporter; then
  echo "ERROR: Node Exporter failed to start"
  systemctl status node_exporter --no-pager || true
  exit 1
fi
# Install Promtail (Loki log agent)
echo "Installing Promtail log agent..."
PROMTAIL_VERSION="3.4.2"
# Loki endpoint the generated config pushes to; also shown in the final summary.
LOKI_URL="http://10.10.10.69:3100"

echo "Downloading Promtail..."
if ! wget --timeout=30 --tries=3 \
  "https://github.com/grafana/loki/releases/download/v${PROMTAIL_VERSION}/promtail-linux-amd64.zip" \
  -O /tmp/promtail.zip; then
  echo "ERROR: Failed to download Promtail"
  exit 1
fi

# unzip is normally installed with the common packages; install it here
# if it is still missing.
if ! command -v unzip >/dev/null 2>&1; then
  apt-get install -y unzip
fi

if ! unzip -o /tmp/promtail.zip -d /tmp/; then
  echo "ERROR: Failed to extract Promtail"
  exit 1
fi

mv /tmp/promtail-linux-amd64 /usr/local/bin/promtail
chmod +x /usr/local/bin/promtail
rm -f /tmp/promtail.zip

# Create Promtail directories
mkdir -p /etc/promtail /var/lib/promtail

# Get hostname for labeling
PROMTAIL_HOST=$(hostname)

# Create Promtail config.
# The heredoc delimiter is unquoted on purpose: ${LOKI_URL} and
# ${PROMTAIL_HOST} are expanded while the file is written. Indentation is
# significant because the output is YAML.
cat > /etc/promtail/config.yml << PROMTAILEOF
server:
  http_listen_port: 9080
  grpc_listen_port: 0

positions:
  filename: /var/lib/promtail/positions.yaml

clients:
  - url: ${LOKI_URL}/loki/api/v1/push

scrape_configs:
  - job_name: syslog
    static_configs:
      - targets:
          - localhost
        labels:
          job: syslog
          host: ${PROMTAIL_HOST}
          __path__: /var/log/syslog

  - job_name: auth
    static_configs:
      - targets:
          - localhost
        labels:
          job: auth
          host: ${PROMTAIL_HOST}
          __path__: /var/log/auth.log

  - job_name: kern
    static_configs:
      - targets:
          - localhost
        labels:
          job: kernel
          host: ${PROMTAIL_HOST}
          __path__: /var/log/kern.log

  - job_name: daemon
    static_configs:
      - targets:
          - localhost
        labels:
          job: daemon
          host: ${PROMTAIL_HOST}
          __path__: /var/log/daemon.log

  - job_name: pveproxy
    static_configs:
      - targets:
          - localhost
        labels:
          job: pveproxy
          host: ${PROMTAIL_HOST}
          __path__: /var/log/pveproxy/access.log

  - job_name: pvedaemon
    static_configs:
      - targets:
          - localhost
        labels:
          job: pvedaemon
          host: ${PROMTAIL_HOST}
          __path__: /var/log/pvedaemon.log

  - job_name: pve-tasks
    static_configs:
      - targets:
          - localhost
        labels:
          job: pve-tasks
          host: ${PROMTAIL_HOST}
          __path__: /var/log/pve/tasks/active

  - job_name: ceph
    static_configs:
      - targets:
          - localhost
        labels:
          job: ceph
          host: ${PROMTAIL_HOST}
          __path__: /var/log/ceph/*.log

  - job_name: journal
    journal:
      max_age: 12h
      labels:
        job: journal
        host: ${PROMTAIL_HOST}
    relabel_configs:
      - source_labels: ['__journal__systemd_unit']
        target_label: 'unit'
      - source_labels: ['__journal_priority_keyword']
        target_label: 'priority'
PROMTAILEOF

# Create Promtail systemd service.
# No User= is set, so the unit runs with systemd's default service user.
cat > /etc/systemd/system/promtail.service << 'EOL'
[Unit]
Description=Promtail Log Agent
After=network.target

[Service]
Type=simple
ExecStart=/usr/local/bin/promtail -config.file=/etc/promtail/config.yml
Restart=always
RestartSec=5

[Install]
WantedBy=multi-user.target
EOL

systemctl daemon-reload
systemctl enable promtail
systemctl start promtail

# A Promtail start failure is reported but does not abort the installation.
if ! systemctl is-active --quiet promtail; then
  echo "WARNING: Promtail failed to start"
  systemctl status promtail --no-pager || true
else
  echo "✓ Promtail is running and shipping logs to Loki"
fi
# Install hwmon daemon
echo "Installing hwmon daemon..."

# Download hwmon service files with error handling.
# --fail makes curl return non-zero on an HTTP error status; without it an
# error page from the server would be saved as the unit file and would
# still pass the "not empty" check below.
echo "Downloading hwmon service files..."
hwmon_raw_url="http://10.10.10.63:3000/LotusGuild/hwmonDaemon/raw/branch/main"
for hwmon_unit in hwmon.service hwmon.timer; do
  if ! curl --fail --max-time 30 --retry 3 \
    -o "/etc/systemd/system/${hwmon_unit}" \
    "${hwmon_raw_url}/${hwmon_unit}"; then
    echo "ERROR: Failed to download ${hwmon_unit}"
    exit 1
  fi
done

# Verify downloaded files exist and are not empty
for hwmon_unit in hwmon.service hwmon.timer; do
  if [[ ! -s "/etc/systemd/system/${hwmon_unit}" ]]; then
    echo "ERROR: ${hwmon_unit} file is empty or missing"
    exit 1
  fi
done
# Create configuration and log directories for hwmon
echo "Setting up hwmon configuration..."
mkdir -p /etc/hwmonDaemon
mkdir -p /var/log/hwmonDaemon

# Prompt for API key or fall back to a placeholder
echo ""
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo " hwmonDaemon API Key Configuration"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo ""
echo "The hwmonDaemon requires an API key to create tickets."
echo "You can enter it now or configure it later by editing:"
echo " /etc/hwmonDaemon/.env"
echo ""
# -r keeps backslashes in the key literal. "|| true" turns end-of-input
# (e.g. a non-interactive run) into "no key entered" instead of letting
# set -e abort the whole installation.
API_KEY=""
read -r -p "Enter API key (or press Enter to skip): " API_KEY || true

# Determine platform-specific defaults: PBS monitoring on PBS hosts,
# Ceph monitoring everywhere else.
if [[ "$PLATFORM" == "pbs" ]]; then
  PBS_DEFAULT="true"
  CEPH_DEFAULT="false"
else
  PBS_DEFAULT="false"
  CEPH_DEFAULT="true"
fi

# The generated file is identical in both cases except for its second
# header line and the TICKET_API_KEY value.
if [[ -n "$API_KEY" ]]; then
  hwmon_env_note="# Auto-generated by freshStart.sh on $(date)"
  hwmon_env_key="$API_KEY"
else
  hwmon_env_note="# Edit this file to add your API key"
  hwmon_env_key="your_api_key_here"
fi

# Write the file under a restrictive umask (in a subshell, so the umask
# change does not leak) so a newly created file is never readable by other
# users, not even for the moment before the chmod below.
(
  umask 077
  cat > /etc/hwmonDaemon/.env << EOF
# hwmonDaemon Configuration
${hwmon_env_note}
# Platform: ${PLATFORM}
# Ticket API Configuration
TICKET_API_KEY=${hwmon_env_key}
TICKET_API_URL=http://10.10.10.45/create_ticket_api.php
# Cluster Identification
CLUSTER_NAME=proxmox-cluster
# Ceph Monitoring (PVE nodes)
CEPH_ENABLED=${CEPH_DEFAULT}
#CEPH_TICKET_NODE=
#CEPH_USAGE_WARNING=70
#CEPH_USAGE_CRITICAL=85
# PBS Monitoring (Proxmox Backup Server)
PBS_ENABLED=${PBS_DEFAULT}
#PBS_ZFS_WARNING=80
#PBS_ZFS_CRITICAL=90
# Prometheus Metrics
PROMETHEUS_ENABLED=false
#PROMETHEUS_PORT=9101
# Health Check Endpoint
HEALTH_SERVER_ENABLED=false
#HEALTH_SERVER_PORT=9102
EOF
)
# Also tighten the mode of a file that already existed before this run.
chmod 600 /etc/hwmonDaemon/.env

if [[ -n "$API_KEY" ]]; then
  echo "✓ API key configured in /etc/hwmonDaemon/.env"
else
  echo "⚠️ WARNING: API key not configured!"
  echo " Edit /etc/hwmonDaemon/.env to add your API key before tickets can be created"
fi
echo ""
# Start the hwmon daemon
systemctl daemon-reload
systemctl enable hwmon.timer
systemctl start hwmon.timer

# Check if hwmon timer started successfully
if ! systemctl is-active --quiet hwmon.timer; then
  echo "ERROR: hwmon timer failed to start"
  systemctl status hwmon.timer --no-pager || true
  exit 1
fi

# Final verification: report the state of every installed unit.
# A non-zero status from "systemctl is-active" inside the command
# substitution does not abort the script; the echo still prints the state.
echo "Verifying installation..."
echo " Node Exporter status: $(systemctl is-active node_exporter) "
echo " Promtail status: $(systemctl is-active promtail) "
echo " hwmon timer status: $(systemctl is-active hwmon.timer) "

# grep is left un-silenced so the matching listener line is shown.
echo "Node Exporter port check:"
if ss -tlnp | grep ':9100'; then
  echo "✓ Node Exporter is listening on port 9100"
else
  echo "WARNING: Node Exporter not listening on port 9100"
fi

# Test hwmon with error handling.
# NOTE(security): this fetches hwmonDaemon.py over plain HTTP and exec()s it
# in the current (installer) process without any integrity check. Kept as-is
# because it is the existing dry-run mechanism; consider running a local,
# verified copy instead.
echo "Testing hwmon dry-run..."
if ! /usr/bin/env python3 -c "import urllib.request; exec(urllib.request.urlopen('http://10.10.10.63:3000/LotusGuild/hwmonDaemon/raw/branch/main/hwmonDaemon.py').read().decode('utf-8'))" --dry-run; then
  echo "WARNING: hwmon dry-run test failed, but services are installed"
fi
# Disable the cleanup trap on successful completion. Both signals are
# cleared so no rollback handler can run once the installation has
# succeeded, whichever of the two the trap was registered on.
trap - ERR EXIT

echo "✓ Installation complete! All services are running correctly."
echo ""
# ${PLATFORM^^} upper-cases the detected platform name for display.
echo " Platform: ${PLATFORM^^}"
echo ""
echo "Services installed:"
# "hostname -I" may list several addresses; only the first one is shown.
echo " - Node Exporter: http://$(hostname -I | awk '{print $1}'):9100/metrics"
echo " - Promtail: Shipping logs to Loki at ${LOKI_URL:-http://10.10.10.69:3100}"
echo " - hwmon daemon: Monitoring system health hourly"
echo ""
echo "Configuration files:"
echo " - Promtail config: /etc/promtail/config.yml"
echo " - hwmon config: /etc/hwmonDaemon/.env"
echo ""
echo "Log locations:"
echo " - Node Exporter: journalctl -u node_exporter"
echo " - Promtail: journalctl -u promtail"
echo " - hwmon: journalctl -u hwmon.service"
echo " - hwmon logs: /var/log/hwmonDaemon/"
echo ""

case "$PLATFORM" in
  pbs)
    echo "PBS-specific monitoring enabled:"
    echo " - ZFS pool health and usage monitoring"
    echo " - Failed backup/GC/sync task detection"
    echo ""
    ;;
  pve)
    echo "PVE-specific monitoring enabled:"
    echo " - Ceph cluster health monitoring"
    echo " - LXC container storage monitoring"
    echo ""
    ;;
esac

# Remind the operator when the .env file is missing/empty or still holds
# the placeholder key.
if [[ ! -s /etc/hwmonDaemon/.env ]] || grep -q "your_api_key_here" /etc/hwmonDaemon/.env 2>/dev/null; then
  echo "⚠️ REMINDER: Configure your API key in /etc/hwmonDaemon/.env"
fi