#!/bin/bash
#
# Proxmox fresh installation script (freshStart.sh).
#
# Installs Promtail (Grafana Loki agent) to ship host logs to the central Loki
# instance at 10.10.10.69:3100. Scrapes syslog, auth, kernel, daemon, pveproxy,
# pvedaemon, pve-tasks, ceph, and systemd journal logs with per-host labeling.
# The script also installs Prometheus Node Exporter and the hwmon daemon timer.
#
# Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

# Abort on the first unhandled command failure.
set -e

echo "Starting Proxmox fresh installation script..."

# =============================================================================
# Platform Detection
# =============================================================================
# PLATFORM ends up as "pve", "pbs" or "unknown", depending on which Proxmox
# management command is found on PATH. pveversion is probed first.
PLATFORM="unknown"

if command -v pveversion >/dev/null 2>&1; then
  PLATFORM="pve"
  # Fall back to a placeholder if pveversion itself fails.
  detected_pve_version=$(pveversion 2>/dev/null || echo 'version unknown')
  echo "Detected platform: Proxmox VE (${detected_pve_version})"
elif command -v proxmox-backup-manager >/dev/null 2>&1; then
  PLATFORM="pbs"
  echo "Detected platform: Proxmox Backup Server"
else
  echo "WARNING: Could not detect Proxmox platform (PVE or PBS)"
  echo "Proceeding with common package installation..."
fi

# Cleanup function for failed installations.
# Stops and disables the services this script installs, then removes their
# unit files, binaries, the Promtail directories and the node_exporter user.
# Every step is best-effort so that one failing step does not stop the rest.
cleanup() {
  local unit

  echo "Cleaning up on error..."

  for unit in node_exporter promtail hwmon.timer; do
    systemctl stop "$unit" 2>/dev/null || true
  done
  for unit in node_exporter promtail hwmon.timer; do
    systemctl disable "$unit" 2>/dev/null || true
  done

  rm -f /etc/systemd/system/node_exporter.service
  rm -f /etc/systemd/system/promtail.service
  rm -f /etc/systemd/system/hwmon.service
  rm -f /etc/systemd/system/hwmon.timer
  rm -f /usr/local/bin/node_exporter
  rm -f /usr/local/bin/promtail
  # Leftovers of a download into the current working directory, if any.
  rm -rf node_exporter-*.linux-amd64.tar.gz node_exporter-*.linux-amd64
  rm -rf /etc/promtail /var/lib/promtail

  userdel node_exporter 2>/dev/null || true

  # Must not abort the handler under `set -e` if systemd is unavailable.
  systemctl daemon-reload || true

  echo "Cleanup completed."
}

# Guard so the cleanup runs at most once per script run.
CLEANUP_DONE=0

# Run cleanup when the script is exiting with a failure status.
#   $1 - exit status the script is terminating with
run_cleanup_on_failure() {
  local status=$1

  if [[ "$status" -ne 0 && "$CLEANUP_DONE" -eq 0 ]]; then
    CLEANUP_DONE=1
    cleanup
  fi
}

# Set trap for cleanup on error.
# An ERR trap only fires when a command fails outside a condition, so it never
# sees the explicit `exit 1` branches used throughout this script. An EXIT trap
# that inspects the exit status covers both those branches and `set -e` aborts,
# and does nothing on a successful (status 0) exit.
trap 'run_cleanup_on_failure "$?"' EXIT

# Install dependencies
echo "Installing required packages..."
apt-get update

# Common packages for all platforms
COMMON_PKGS="python3-pip smartmontools python3-psutil python3-requests lm-sensors fastfetch rsync jq sysstat unzip"

# Choose the progress message and the additional packages per platform.
case "$PLATFORM" in
  pve)
    pkg_message="Installing PVE-specific packages..."
    platform_pkgs="iperf3 fio nvme-cli"
    ;;
  pbs)
    pkg_message="Installing PBS-specific packages..."
    platform_pkgs="zfsutils-linux nvme-cli"
    ;;
  *)
    pkg_message="Installing common packages..."
    platform_pkgs="nvme-cli"
    ;;
esac

echo "$pkg_message"
# Both lists are deliberately unquoted: each word is a separate package name.
# shellcheck disable=SC2086
apt-get install -y $COMMON_PKGS $platform_pkgs

# Install Node Exporter
echo "Installing Prometheus Node Exporter..."
NODE_EXPORTER_VERSION="1.8.2"
NODE_EXPORTER_RELEASE="node_exporter-${NODE_EXPORTER_VERSION}.linux-amd64"

# Download and unpack inside a private scratch directory, using exact file
# names. A glob in the current directory can match stale archives from earlier
# runs (or wget's ".1" duplicates), which makes the extraction fail.
if ! NODE_EXPORTER_TMP=$(mktemp -d); then
  echo "ERROR: Failed to create temporary directory for Node Exporter"
  exit 1
fi

# Download Node Exporter with error handling
echo "Downloading Node Exporter..."
if ! wget --timeout=30 --tries=3 \
  -O "${NODE_EXPORTER_TMP}/${NODE_EXPORTER_RELEASE}.tar.gz" \
  "https://github.com/prometheus/node_exporter/releases/download/v${NODE_EXPORTER_VERSION}/${NODE_EXPORTER_RELEASE}.tar.gz"; then
  echo "ERROR: Failed to download Node Exporter"
  rm -rf -- "${NODE_EXPORTER_TMP:?}"
  exit 1
fi

# Extract with error checking
if ! tar -xvzf "${NODE_EXPORTER_TMP}/${NODE_EXPORTER_RELEASE}.tar.gz" -C "${NODE_EXPORTER_TMP}"; then
  echo "ERROR: Failed to extract Node Exporter archive"
  rm -rf -- "${NODE_EXPORTER_TMP:?}"
  exit 1
fi

# Check if user already exists
if ! id "node_exporter" &>/dev/null; then
  echo "Creating node_exporter user..."
  useradd -rs /bin/false node_exporter
else
  echo "node_exporter user already exists, skipping creation..."
fi

# Move binary to proper location
if ! mv "${NODE_EXPORTER_TMP}/${NODE_EXPORTER_RELEASE}/node_exporter" /usr/local/bin/; then
  echo "ERROR: Failed to move Node Exporter binary"
  rm -rf -- "${NODE_EXPORTER_TMP:?}"
  exit 1
fi

# Cleanup downloaded files
rm -rf -- "${NODE_EXPORTER_TMP:?}"

# Set proper permissions
chown node_exporter:node_exporter /usr/local/bin/node_exporter
chmod +x /usr/local/bin/node_exporter

# Create node_exporter service file.
# The delimiter is quoted, so the unit text is written verbatim.
cat << 'NODE_EXPORTER_UNIT' > /etc/systemd/system/node_exporter.service
[Unit]
Description=Node Exporter
After=network.target

[Service]
User=node_exporter
Group=node_exporter
Type=simple
ExecStart=/usr/local/bin/node_exporter

[Install]
WantedBy=multi-user.target
NODE_EXPORTER_UNIT

# Enable and start node_exporter
systemctl daemon-reload
for node_exporter_action in enable start; do
  systemctl "$node_exporter_action" node_exporter
done

# Check if Node Exporter started successfully; abort the install otherwise.
systemctl is-active --quiet node_exporter || {
  echo "ERROR: Node Exporter failed to start"
  systemctl status node_exporter --no-pager || true
  exit 1
}

# Install Promtail (Loki log agent)
echo "Installing Promtail log agent..."
PROMTAIL_VERSION="3.4.2"
# A LOKI_URL already present in the environment is honoured. The default is
# the central Loki instance.
LOKI_URL="${LOKI_URL:-http://10.10.10.69:3100}"

# unzip is needed to unpack the release archive.
if ! command -v unzip &>/dev/null; then
  apt-get install -y unzip
fi

# Use a private scratch directory instead of fixed names in the shared /tmp.
# This script runs as root, and predictable /tmp paths can be pre-created or
# symlinked by other local users.
if ! PROMTAIL_TMP=$(mktemp -d); then
  echo "ERROR: Failed to create temporary directory for Promtail"
  exit 1
fi

echo "Downloading Promtail..."
if ! wget --timeout=30 --tries=3 \
  "https://github.com/grafana/loki/releases/download/v${PROMTAIL_VERSION}/promtail-linux-amd64.zip" \
  -O "${PROMTAIL_TMP}/promtail.zip"; then
  echo "ERROR: Failed to download Promtail"
  rm -rf -- "${PROMTAIL_TMP:?}"
  exit 1
fi

if ! unzip -o "${PROMTAIL_TMP}/promtail.zip" -d "${PROMTAIL_TMP}"; then
  echo "ERROR: Failed to extract Promtail"
  rm -rf -- "${PROMTAIL_TMP:?}"
  exit 1
fi

if ! mv "${PROMTAIL_TMP}/promtail-linux-amd64" /usr/local/bin/promtail; then
  echo "ERROR: Failed to install Promtail binary"
  rm -rf -- "${PROMTAIL_TMP:?}"
  exit 1
fi
chmod +x /usr/local/bin/promtail
rm -rf -- "${PROMTAIL_TMP:?}"

# Create Promtail directories
mkdir -p /etc/promtail /var/lib/promtail

# Get hostname for labeling
PROMTAIL_HOST=$(hostname)

# Print one file-tailing scrape job in Promtail's YAML format.
#   $1 - job_name of the scrape config
#   $2 - value of the "job" label
#   $3 - file path or glob assigned to __path__
promtail_file_job() {
  cat << JOB_STANZA
  - job_name: $1
    static_configs:
      - targets:
          - localhost
        labels:
          job: $2
          host: ${PROMTAIL_HOST}
          __path__: $3
JOB_STANZA
}

# Create Promtail config.
# LOKI_URL and PROMTAIL_HOST are expanded now, so the file holds fixed values.
{
  cat << CONFIG_HEAD
server:
  http_listen_port: 9080
  grpc_listen_port: 0

positions:
  filename: /var/lib/promtail/positions.yaml

clients:
  - url: ${LOKI_URL}/loki/api/v1/push

scrape_configs:
CONFIG_HEAD

  promtail_file_job syslog syslog /var/log/syslog
  promtail_file_job auth auth /var/log/auth.log
  # The job is named "kern" but its job label is "kernel".
  promtail_file_job kern kernel /var/log/kern.log
  promtail_file_job daemon daemon /var/log/daemon.log
  promtail_file_job pveproxy pveproxy /var/log/pveproxy/access.log
  promtail_file_job pvedaemon pvedaemon /var/log/pvedaemon.log
  promtail_file_job pve-tasks pve-tasks /var/log/pve/tasks/active
  # Quoted so the shell passes the glob through for Promtail to expand.
  promtail_file_job ceph ceph '/var/log/ceph/*.log'

  cat << CONFIG_JOURNAL
  - job_name: journal
    journal:
      max_age: 12h
      labels:
        job: journal
        host: ${PROMTAIL_HOST}
    relabel_configs:
      - source_labels: ['__journal__systemd_unit']
        target_label: 'unit'
      - source_labels: ['__journal_priority_keyword']
        target_label: 'priority'
CONFIG_JOURNAL
} > /etc/promtail/config.yml

# Create Promtail systemd service.
# The delimiter is quoted, so the unit text is written verbatim.
cat << 'PROMTAIL_UNIT' > /etc/systemd/system/promtail.service
[Unit]
Description=Promtail Log Agent
After=network.target

[Service]
Type=simple
ExecStart=/usr/local/bin/promtail -config.file=/etc/promtail/config.yml
Restart=always
RestartSec=5

[Install]
WantedBy=multi-user.target
PROMTAIL_UNIT

# Register, enable and start the Promtail unit.
systemctl daemon-reload
for promtail_action in enable start; do
  systemctl "$promtail_action" promtail
done

# A Promtail that is not active only produces a warning; the install goes on.
if systemctl is-active --quiet promtail; then
  echo "✓ Promtail is running and shipping logs to Loki"
else
  echo "WARNING: Promtail failed to start"
  systemctl status promtail --no-pager || true
fi

# Install hwmon daemon
echo "Installing hwmon daemon..."

# Download hwmon service files with error handling.
# --fail makes curl return an error on HTTP 4xx/5xx responses. Without it the
# server's error page is saved as the unit file and passes the non-empty check.
# Each file goes to a temporary location first and is installed only after
# validation, so a failed download never clobbers an existing unit.
echo "Downloading hwmon service files..."
HWMON_BASE_URL="http://10.10.10.63:3000/LotusGuild/hwmonDaemon/raw/branch/main"

for hwmon_unit in hwmon.service hwmon.timer; do
  if ! hwmon_tmp=$(mktemp); then
    echo "ERROR: Failed to create temporary file for ${hwmon_unit}"
    exit 1
  fi

  if ! curl --fail --max-time 30 --retry 3 -o "$hwmon_tmp" "${HWMON_BASE_URL}/${hwmon_unit}"; then
    echo "ERROR: Failed to download ${hwmon_unit}"
    rm -f -- "$hwmon_tmp"
    exit 1
  fi

  # Verify downloaded file exists and is not empty
  if [[ ! -s "$hwmon_tmp" ]]; then
    echo "ERROR: ${hwmon_unit} file is empty or missing"
    rm -f -- "$hwmon_tmp"
    exit 1
  fi

  if ! install -m 0644 "$hwmon_tmp" "/etc/systemd/system/${hwmon_unit}"; then
    echo "ERROR: Failed to install ${hwmon_unit}"
    rm -f -- "$hwmon_tmp"
    exit 1
  fi
  rm -f -- "$hwmon_tmp"
done

# Create configuration directory for hwmon
echo "Setting up hwmon configuration..."
mkdir -p /etc/hwmonDaemon
mkdir -p /var/log/hwmonDaemon

# Prompt for API key or use default
echo ""
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo " hwmonDaemon API Key Configuration"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo ""
echo "The hwmonDaemon requires an API key to create tickets."
echo "You can enter it now or configure it later by editing:"
echo " /etc/hwmonDaemon/.env"
echo ""
# -r keeps backslashes in the key literal.
# `|| true`: read returns non-zero at end of input (for example when stdin is
# not a terminal). Under `set -e` that would abort the whole installation, so
# it is treated like pressing Enter to skip.
read -r -p "Enter API key (or press Enter to skip): " API_KEY || true

# Determine platform-specific defaults.
# On "pbs" the PBS monitoring default is on and Ceph is off. Every other
# PLATFORM value gets the opposite pair.
case "$PLATFORM" in
  pbs)
    PBS_DEFAULT="true"
    CEPH_DEFAULT="false"
    ;;
  *)
    PBS_DEFAULT="false"
    CEPH_DEFAULT="true"
    ;;
esac

HWMON_ENV_FILE="/etc/hwmonDaemon/.env"

# Print the hwmonDaemon .env content to stdout.
#   $1 - value written to TICKET_API_KEY
#   $2 - comment line placed directly under the title line
# Reads the globals PLATFORM, CEPH_DEFAULT and PBS_DEFAULT.
render_hwmon_env() {
  local api_key=$1
  local note=$2

  cat << EOF
# hwmonDaemon Configuration
${note}
# Platform: ${PLATFORM}

# Ticket API Configuration
TICKET_API_KEY=${api_key}
TICKET_API_URL=http://10.10.10.45/create_ticket_api.php

# Cluster Identification
CLUSTER_NAME=proxmox-cluster

# Ceph Monitoring (PVE nodes)
CEPH_ENABLED=${CEPH_DEFAULT}
#CEPH_TICKET_NODE=
#CEPH_USAGE_WARNING=70
#CEPH_USAGE_CRITICAL=85

# PBS Monitoring (Proxmox Backup Server)
PBS_ENABLED=${PBS_DEFAULT}
#PBS_ZFS_WARNING=80
#PBS_ZFS_CRITICAL=90

# Prometheus Metrics
PROMETHEUS_ENABLED=false
#PROMETHEUS_PORT=9101

# Health Check Endpoint
HEALTH_SERVER_ENABLED=false
#HEALTH_SERVER_PORT=9102
EOF
}

# Write the .env file without ever exposing it with permissive modes.
# The content goes to a mktemp file (created owner-only) next to the target
# and is then renamed over it.
#   $1, $2 - passed through to render_hwmon_env
# Returns non-zero if the file could not be written.
write_hwmon_env() {
  local tmp_env

  tmp_env=$(mktemp "${HWMON_ENV_FILE}.XXXXXX") || return 1
  if render_hwmon_env "$1" "$2" > "$tmp_env" \
    && chmod 600 "$tmp_env" \
    && mv -f "$tmp_env" "$HWMON_ENV_FILE"; then
    return 0
  fi
  rm -f -- "$tmp_env"
  return 1
}

if [[ -n "$API_KEY" ]]; then
  # Create .env file with API key and all config options
  write_hwmon_env "$API_KEY" "# Auto-generated by freshStart.sh on $(date)"
  echo "✓ API key configured in /etc/hwmonDaemon/.env"
elif [[ -s "$HWMON_ENV_FILE" ]] && ! grep -q "your_api_key_here" "$HWMON_ENV_FILE"; then
  # Skipping the prompt on a re-run must not replace a configured key with
  # the placeholder template.
  echo "✓ Keeping existing configuration in /etc/hwmonDaemon/.env"
else
  # Create template .env file with all config options
  write_hwmon_env "your_api_key_here" "# Edit this file to add your API key"
  echo "⚠️ WARNING: API key not configured!"
  echo " Edit /etc/hwmonDaemon/.env to add your API key before tickets can be created"
fi
echo ""

# Start the hwmon daemon (it is driven by its systemd timer)
systemctl daemon-reload
for hwmon_timer_action in enable start; do
  systemctl "$hwmon_timer_action" hwmon.timer
done

# Check if hwmon timer started successfully; abort the install otherwise.
systemctl is-active --quiet hwmon.timer || {
  echo "ERROR: hwmon timer failed to start"
  systemctl status hwmon.timer --no-pager || true
  exit 1
}

# Final verification
echo "Verifying installation..."
# The status commands stay inside the argument list on purpose: an inactive
# unit makes `systemctl is-active` return non-zero, and that must not abort
# the script under `set -e`.
printf 'Node Exporter status: %s\n' "$(systemctl is-active node_exporter)"
printf 'Promtail status: %s\n' "$(systemctl is-active promtail)"
printf 'hwmon timer status: %s\n' "$(systemctl is-active hwmon.timer)"

echo "Node Exporter port check:"
# grep also prints the matching listener line(s) as part of the report.
if ! ss -tlnp | grep :9100; then
  echo "WARNING: Node Exporter not listening on port 9100"
else
  echo "✓ Node Exporter is listening on port 9100"
fi

# Test hwmon with error handling.
# Fetches hwmonDaemon.py from the internal Git server and executes it in-process
# with the --dry-run argument. A failure is reported as a warning only.
# NOTE(review): this runs code downloaded over plain HTTP with this script's
# privileges, without any integrity check. Consider pinning a commit and
# verifying a checksum, or serving it over HTTPS.
echo "Testing hwmon dry-run..."
/usr/bin/env python3 -c "import urllib.request; exec(urllib.request.urlopen('http://10.10.10.63:3000/LotusGuild/hwmonDaemon/raw/branch/main/hwmonDaemon.py').read().decode('utf-8'))" --dry-run \
  || echo "WARNING: hwmon dry-run test failed, but services are installed"

# Disable cleanup trap on successful completion
trap - ERR

# First address reported by `hostname -I`, used in the metrics URL below.
summary_ip=$(hostname -I | awk '{print $1}')

# Installation summary. The heredoc is unquoted so the variables expand.
cat << SUMMARY
✓ Installation complete! All services are running correctly.

Platform: ${PLATFORM^^}

Services installed:
 - Node Exporter: http://${summary_ip}:9100/metrics
 - Promtail: Shipping logs to Loki at ${LOKI_URL:-http://10.10.10.69:3100}
 - hwmon daemon: Monitoring system health hourly

Configuration files:
 - Promtail config: /etc/promtail/config.yml
 - hwmon config: /etc/hwmonDaemon/.env

Log locations:
 - Node Exporter: journalctl -u node_exporter
 - Promtail: journalctl -u promtail
 - hwmon: journalctl -u hwmon.service
 - hwmon logs: /var/log/hwmonDaemon/

SUMMARY

# Platform-specific notes; nothing is printed for an unknown platform.
case "$PLATFORM" in
  pbs)
    echo "PBS-specific monitoring enabled:"
    echo " - ZFS pool health and usage monitoring"
    echo " - Failed backup/GC/sync task detection"
    echo ""
    ;;
  pve)
    echo "PVE-specific monitoring enabled:"
    echo " - Ceph cluster health monitoring"
    echo " - LXC container storage monitoring"
    echo ""
    ;;
esac

# Remind the operator when the config is missing, empty or still holds the
# placeholder key.
if [[ ! -s /etc/hwmonDaemon/.env ]] || grep -q "your_api_key_here" /etc/hwmonDaemon/.env 2>/dev/null; then
  echo "⚠️ REMINDER: Configure your API key in /etc/hwmonDaemon/.env"
fi