From c13549f3da03341f46eb7164ac8aab4538ebc445 Mon Sep 17 00:00:00 2001
From: Jared Vititoe
Date: Tue, 30 Jun 2026 18:10:10 -0400
Subject: [PATCH] cinny: harden + version-control the webhook web-deploy
 (lotus_deploy.sh)

The live /usr/local/bin/lotus_deploy.sh (the `lotus-deploy` webhook target)
was never under version control and had rotted into two deploy-killing bugs
that froze chat.lotusguild.org on an old build:

1. CI gate: it waited on the WHOLE workflow run with a 15-min cap. Web CI
   shares the single act_runner with the slow Tauri desktop builds, so a web
   run could sit queued >15 min -> "result: timeout" -> deploy aborted. Now it
   gates only on the "Build & Quality Checks" commit-status context (build +
   unit tests), decoupled from "Trigger Desktop Build", and waits up to 45 min.

2. Dead element-call copy: `cp node_modules/@element-hq/element-call-embedded/...`
   under `set -e` aborted every deploy after the widget was forked to
   @lotusguild/element-call-embedded. The build already emits
   dist/public/element-call; replaced the copy with a presence check.

Also: rsync now excludes config.json so the app deploy stops clobbering the
production runtime config (homeserver list / allowCustomHomeservers) that the
matrix repo owns. lxc106-cinny.sh now installs this script (syntax-checked).

Review follow-ups folded in:
- The rsync exclude is anchored (/config.json) so only the top-level runtime
  config is skipped, not every config.json inside the bundle.
- The tree is reset to the CI-verified $COMMIT_SHA instead of whatever
  origin/lotus points at once the (up to 45 min) wait is over.
- Each commit-status poll has a curl --max-time.
- lxc106-cinny.sh installs the script via install + rename instead of an
  in-place cp, and its fresh-clone CHANGED list is one path per line so the
  `^`-anchored greps match every entry, not just the first.

Co-Authored-By: Claude Opus 4.8
---
NOTE(review): re-flowed from a whitespace-collapsed copy of the patch; line
breaks and indentation are reconstructed. Context lines in
deploy/lxc106-cinny.sh may need `git apply --ignore-whitespace`, and the
`index` blob ids below no longer match the edited content.
NOTE(review): the cinny/lotus-deploy.sh (hyphen) -> /usr/local/bin/cinny-deploy.sh
install block is not listed in the "Handles:" header or in the fresh-clone
CHANGED list; confirm it is intentional.

 cinny/lotus_deploy.sh  | 127 +++++++++++++++++++++++++++++++++++++++++
 deploy/lxc106-cinny.sh |  26 ++++++++-
 2 files changed, 150 insertions(+), 3 deletions(-)
 create mode 100755 cinny/lotus_deploy.sh

diff --git a/cinny/lotus_deploy.sh b/cinny/lotus_deploy.sh
new file mode 100755
index 0000000..b3a1436
--- /dev/null
+++ b/cinny/lotus_deploy.sh
@@ -0,0 +1,127 @@
+#!/bin/bash
+# lotus_deploy.sh — target of the `lotus-deploy` webhook. Waits for the web CI
+# gate on origin/lotus, builds the app, and rsyncs dist/ into $WEBROOT.
+# Optional env (loaded from /etc/lotus-deploy.env): GITEA_API_TOKEN enables the
+# CI gate; GIPHY_API_KEY fills the "gifApiKey" placeholder in config.json.
+set -e
+
+REPO="/opt/lotus-cinny"
+WEBROOT="/var/www/html"
+LOCKFILE="/tmp/lotus-deploy.lock"
+LOGFILE="/var/log/lotus-deploy.log"
+
+# Prevent concurrent deploys
+exec 200>"$LOCKFILE"
+flock -n 200 || { echo "[$(date '+%Y-%m-%d %H:%M:%S')] Deploy already in progress, skipping." >> "$LOGFILE"; exit 0; }
+
+exec >> "$LOGFILE" 2>&1
+echo "[$(date '+%Y-%m-%d %H:%M:%S')] ===== Deploy triggered ====="
+
+# Load secrets (auth tokens etc — not in git)
+if [ -f /etc/lotus-deploy.env ]; then
+  # shellcheck disable=SC1091
+  set -a; source /etc/lotus-deploy.env; set +a
+fi
+
+cd "$REPO"
+
+echo "[$(date '+%Y-%m-%d %H:%M:%S')] Fetching origin/lotus..."
+git fetch --all
+COMMIT_SHA=$(git rev-parse origin/lotus)
+echo "[$(date '+%Y-%m-%d %H:%M:%S')] Commit: $COMMIT_SHA"
+
+# ── CI gate ─────────────────────────────────────────────────────────────────
+# Wait for the web build+test CI to pass before deploying. We gate ONLY on the
+# "Build & Quality Checks" commit-status context (npm build + unit tests) — NOT
+# the whole workflow run. This decouples the web deploy from the unrelated
+# "Trigger Desktop Build" job and the slow downstream Tauri desktop builds that
+# share the act_runner: web CI can sit queued behind a 30-min desktop build, so
+# we keep waiting while the context is pending/absent, and only abort on an
+# explicit failure or the (generous) cap. Each poll is bounded by curl
+# --max-time so a hung API connection cannot stall the loop indefinitely. (The
+# previous version gated on the whole workflow run with a 15-min cap, timed out
+# behind desktop builds, and left the site frozen on an old build for days.)
+if [ -n "${GITEA_API_TOKEN:-}" ]; then
+  GITEA_API="https://code.lotusguild.org/api/v1"
+  REPO_PATH="LotusGuild/cinny"
+  GATE_CONTEXT="Build & Quality Checks"
+  MAX_WAIT=2700 # 45 min — web CI can queue behind long Tauri desktop builds
+  POLL_INTERVAL=15
+  elapsed=0
+  ci_result=""
+  echo "[$(date '+%Y-%m-%d %H:%M:%S')] Waiting for CI '$GATE_CONTEXT' on $COMMIT_SHA..."
+
+  while [ "$elapsed" -lt "$MAX_WAIT" ]; do
+    state=$(curl -s --max-time 30 -H "Authorization: token $GITEA_API_TOKEN" \
+      "$GITEA_API/repos/$REPO_PATH/commits/$COMMIT_SHA/status" \
+      | GATE="$GATE_CONTEXT" python3 -c "
+import json, os, sys
+try:
+    d = json.load(sys.stdin)
+except Exception:
+    print('pending'); sys.exit(0)
+gate = os.environ.get('GATE', '')
+for s in d.get('statuses', []):
+    if gate in (s.get('context') or ''):
+        print(s.get('status') or 'pending'); break
+else:
+    print('pending')
+" 2>/dev/null || echo pending)
+
+    case "$state" in
+      success) ci_result=success; break ;;
+      failure|error) ci_result="$state"; break ;;
+    esac
+    echo "[$(date '+%Y-%m-%d %H:%M:%S')] CI not yet passed (${elapsed}s elapsed, '$GATE_CONTEXT': ${state}), waiting..."
+    sleep "$POLL_INTERVAL"
+    elapsed=$((elapsed + POLL_INTERVAL))
+  done
+
+  if [ "$ci_result" != "success" ]; then
+    echo "[$(date '+%Y-%m-%d %H:%M:%S')] CI did not pass (result: ${ci_result:-timeout}). Aborting deploy."
+    exit 1
+  fi
+  echo "[$(date '+%Y-%m-%d %H:%M:%S')] CI '$GATE_CONTEXT' passed. Proceeding with deploy."
+else
+  echo "[$(date '+%Y-%m-%d %H:%M:%S')] WARNING: GITEA_API_TOKEN not set, deploying without CI gate."
+fi
+
+git reset --hard "$COMMIT_SHA" # the CI-verified commit, even if origin/lotus moved during the wait
+
+# Tag this build with the exact commit so Sentry can link errors to source
+export VITE_APP_VERSION=$COMMIT_SHA
+echo "[$(date '+%Y-%m-%d %H:%M:%S')] Building commit $VITE_APP_VERSION..."
+
+echo "[$(date '+%Y-%m-%d %H:%M:%S')] Installing dependencies..."
+npm ci --ignore-scripts
+
+echo "[$(date '+%Y-%m-%d %H:%M:%S')] Building..."
+NODE_OPTIONS=--max_old_space_size=4096 npm run build
+
+# The Element Call widget (the @lotusguild/element-call-embedded fork) is emitted
+# into dist/public/element-call by the build itself — no manual copy is needed.
+# (The old `cp node_modules/@element-hq/element-call-embedded/dist/.` step was a
+# deploy-killer: the package was forked to @lotusguild, so under `set -e` that
+# now-missing path aborted every deploy.) Verify the bundle actually landed
+# before publishing rather than blindly copying.
+if [ ! -f "$REPO/dist/public/element-call/index.html" ]; then
+  echo "[$(date '+%Y-%m-%d %H:%M:%S')] ERROR: dist/public/element-call/ missing after build (check @lotusguild/element-call-embedded pin). Aborting."
+  exit 1
+fi
+
+echo "[$(date '+%Y-%m-%d %H:%M:%S')] Deploying to $WEBROOT..."
+# Exclude ONLY the top-level config.json ("/" anchors the pattern to the transfer
+# root, so config.json files nested inside the bundle still ship). The production
+# runtime config (homeserver list, allowCustomHomeservers, etc.) is owned by the
+# matrix repo and deployed to /var/www/html/config.json by lxc106-cinny.sh; the
+# build's DEV default (allowCustomHomeservers:true) would clobber it every deploy.
+rsync -a --delete --exclude=/config.json dist/ "$WEBROOT/"
+
+echo "[$(date '+%Y-%m-%d %H:%M:%S')] ===== Deploy complete ($VITE_APP_VERSION) ====="
+
+# Inject runtime secrets that are never stored in git. If the production
+# config.json carries the "gifApiKey": "" placeholder, fill it from the env.
+if [ -n "${GIPHY_API_KEY:-}" ]; then
+  sed -i "s|\"gifApiKey\": \"\"|\"gifApiKey\": \"$GIPHY_API_KEY\"|" "$WEBROOT/config.json"
+  echo "[$(date '+%Y-%m-%d %H:%M:%S')] Injected GIPHY_API_KEY into config.json"
+fi
diff --git a/deploy/lxc106-cinny.sh b/deploy/lxc106-cinny.sh
index ab403a7..29e874c 100644
--- a/deploy/lxc106-cinny.sh
+++ b/deploy/lxc106-cinny.sh
@@ -1,8 +1,8 @@
 #!/bin/bash
 # Auto-deploy script for LXC 106 (cinny)
 # Handles: cinny/config.json, cinny/nginx.conf, cinny/upstream-check.sh,
-#          cinny/lotus-build.sh, deploy/hooks-lxc106.json,
-#          systemd/cinny-upstream-check.cron
+#          cinny/lotus-build.sh, cinny/lotus_deploy.sh,
+#          deploy/hooks-lxc106.json, systemd/cinny-upstream-check.cron
 # Triggered by: Gitea webhook on push to main
 
 set -euo pipefail
@@ -15,7 +15,7 @@ echo "=== $(date) === LXC106 deploy triggered ==="
 
 if [ ! -d "$REPO_DIR/.git" ]; then
   git clone "$CLONE_URL" "$REPO_DIR"
-  CHANGED="cinny/config.json cinny/nginx.conf cinny/upstream-check.sh cinny/lotus-build.sh deploy/hooks-lxc106.json systemd/cinny-upstream-check.cron"
+  CHANGED=$(printf '%s\n' cinny/config.json cinny/nginx.conf cinny/upstream-check.sh cinny/lotus-build.sh cinny/lotus_deploy.sh deploy/hooks-lxc106.json systemd/cinny-upstream-check.cron) # one path per line: the ^-anchored greps below match per line
 else
   cd "$REPO_DIR"
   git fetch --all
@@ -63,6 +63,26 @@ if echo "$CHANGED" | grep -q '^cinny/lotus-build.sh'; then
   echo "✓ lotus-build.sh deployed"
 fi
 
+if echo "$CHANGED" | grep -q '^cinny/lotus-deploy.sh'; then
+  echo "Deploying lotus-deploy.sh..."
+  cp "$REPO_DIR/cinny/lotus-deploy.sh" /usr/local/bin/cinny-deploy.sh
+  chmod +x /usr/local/bin/cinny-deploy.sh
+  echo "✓ lotus-deploy.sh deployed"
+fi
+
+if echo "$CHANGED" | grep -q '^cinny/lotus_deploy.sh'; then
+  echo "Deploying lotus_deploy.sh (webhook CI-gated web deploy)..."
+  # Target of the `lotus-deploy` webhook. Syntax-check first (a broken script must
+  # never wedge deploys), then rename into place so a running deploy is unaffected.
+  if bash -n "$REPO_DIR/cinny/lotus_deploy.sh"; then
+    install -m 0755 "$REPO_DIR/cinny/lotus_deploy.sh" /usr/local/bin/lotus_deploy.sh.new
+    mv -f /usr/local/bin/lotus_deploy.sh.new /usr/local/bin/lotus_deploy.sh
+    echo "✓ lotus_deploy.sh deployed"
+  else
+    echo "✗ bash -n FAILED on lotus_deploy.sh — skipping install"
+  fi
+fi
+
 if echo "$CHANGED" | grep -q '^deploy/hooks-lxc106.json'; then
   echo "Deploying hooks-lxc106.json..."
   cp "$REPO_DIR/deploy/hooks-lxc106.json" /etc/webhook/hooks.json