LibreChat/.github/workflows/gitnexus-deploy.yml
Danny Avila b4fa200e5f
Some checks are pending
Docker Dev Branch Images Build / build (Dockerfile, lc-dev, node) (push) Waiting to run
Docker Dev Branch Images Build / build (Dockerfile.multi, lc-dev-api, api-build) (push) Waiting to run
GitNexus Index / index (push) Waiting to run
GitNexus Index / post-index (push) Blocked by required conditions
ci: Bump GitNexus to 1.6.7 to Fix Embeddings Index Timeout (#13658)
*  ci: Bump GitNexus to 1.6.7 to Fix Embeddings Index Timeout

* ⏲️ ci: Raise GitNexus Index Timeout for 1.6.x Embedding Volume
2026-06-10 14:05:54 -04:00

600 lines
26 KiB
YAML

# Deploys GitNexus indexes to a droplet via SSH + rsync.
#
# Architecture:
# GitHub Actions (deploy)
# 1. Resolves latest successful index runs for main, dev, and every
# open PR that already has an index artifact (contributor-gated
# upstream by the index workflow's author_association check)
# 2. Downloads each matching .gitnexus/ artifact
# 3. Rsyncs them into /opt/gitnexus/indexes/<name>/ on the droplet
# 4. Removes any stale folders on the droplet for PRs that closed
# (even though gitnexus-cleanup-pr.yml also handles that path,
# this is a safety net in case the close event was missed)
# 5. Pulls latest image, force-recreates gitnexus, reloads Caddy,
# and polls docker health until the container reports healthy
# The caddy container is reloaded in place and only recreated if that
# reload fails (or started if it is not running) — no TLS churn.
#
# First-time droplet bootstrap (run once, manually):
# 1. Create 2GB+ Ubuntu 24.04 droplet, add SSH key
# 2. Point DNS A record for your subdomain at the droplet IP
# 3. SSH in and run:
# curl -fsSL https://get.docker.com | sh
# systemctl enable --now docker
# mkdir -p /opt/gitnexus/indexes
# useradd -m -s /bin/bash deploy
# usermod -aG docker deploy
# mkdir -p /home/deploy/.ssh
# # Add deploy pubkey to /home/deploy/.ssh/authorized_keys
# chown -R deploy:deploy /home/deploy/.ssh /opt/gitnexus
# chmod 700 /home/deploy/.ssh
# ufw allow 22,80,443/tcp
# ufw --force enable
# 4. Copy .do/gitnexus/docker-compose.yml and Caddyfile into /opt/gitnexus/
# 5. Create /opt/gitnexus/.env with: GITNEXUS_DOMAIN=... and API_TOKEN=...
# 6. cd /opt/gitnexus && docker compose up -d
#
# Then capture the droplet's SSH host key from your workstation and
# save it as the GITNEXUS_DO_KNOWN_HOST secret (below) so CI can pin it:
# ssh-keyscan -H gitnexus.yourdomain.com
#
# GHCR image: the workflow runs `docker login ghcr.io` on the droplet
# on every deploy using GITHUB_TOKEN, so the package can stay private.
# If you'd rather not have CI manage droplet auth, make the package
# public under repo Settings -> Packages.
#
# Required GitHub secrets:
# GITNEXUS_DO_HOST — droplet IP or hostname
# GITNEXUS_DO_USER — SSH user (e.g. "deploy")
# GITNEXUS_DO_SSH_KEY — private key matching the authorized pubkey
# GITNEXUS_DO_KNOWN_HOST — output of `ssh-keyscan -H <host>` pinning the
# droplet's host keys (prevents MITM/TOFU risk)
name: GitNexus Deploy

on:
  # Deploy whenever a run of the index workflow completes (the jobs
  # themselves check that it concluded successfully).
  workflow_run:
    workflows:
      - 'GitNexus Index'
    types:
      - completed
  # Manual / bot-triggered deploys.
  workflow_dispatch:
    inputs:
      pr_number:
        description: 'Optional PR number to post completion comment on (set by bot-triggered dispatches from gitnexus-index.yml)'
        type: string
        default: ''

# Workflow-level token scopes. Both jobs below declare their own
# `permissions:` block as well.
permissions:
  actions: read
  contents: read
  pull-requests: write  # post completion comments on served PR indexes

# Global serialization: every deploy shares one concurrency group.
#
# Earlier versions used per-ref concurrency with cancel-in-progress, so
# rapid pushes to one ref coalesced while deploys for different refs ran
# in parallel. That raced: the prune-stale-indexes step computes its
# active_names up front, so while deploy A was still rsyncing into
# /opt/gitnexus/indexes/LibreChat-pr-12580, a slightly later deploy B
# (for a different ref) could rm -rf the folder A was uploading into.
#
# cancel-in-progress is false so a running rsync / docker compose restart
# is never killed mid-operation (which would leave the droplet in a
# partial state). Each job carries a 20-minute timeout, which bounds how
# long a queued deploy can be kept waiting.
# NOTE(review): per GitHub's concurrency docs only one run can be pending
# per group (a newer queued run replaces an older pending one) — confirm
# that coalescing is acceptable here.
concurrency:
  group: gitnexus-deploy
  cancel-in-progress: false

env:
  GITNEXUS_VERSION: '1.6.7'
  IMAGE_NAME: ghcr.io/${{ github.repository_owner }}/librechat-gitnexus
jobs:
  # Rebuilds the long-lived image only when one of its inputs changed:
  # the Dockerfile/entrypoint/extensions under .do/gitnexus, or this
  # workflow file, which pins GITNEXUS_VERSION (the build arg and the
  # image tag). Skipped on every other run, so index-only deploys are fast.
  build-image:
    # Manual dispatches always run; workflow_run triggers only when the
    # index run that fired them succeeded.
    if: |
      github.event_name == 'workflow_dispatch' ||
      github.event.workflow_run.conclusion == 'success'
    runs-on: ubuntu-latest
    timeout-minutes: 20
    permissions:
      contents: read
      packages: write  # push image to GHCR
    outputs:
      image_tag: ${{ steps.tag.outputs.value }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          # Two commits so the change detection below can diff HEAD~1..HEAD.
          fetch-depth: 2

      - name: Detect image changes
        id: changes
        run: |
          # Default to rebuild when we can't cleanly diff (first commit,
          # workflow_run from a PR branch where HEAD isn't the trigger, etc).
          # Rebuild on miss > skip when we should have rebuilt.
          #
          # The workflow file itself is part of the watched set: the
          # GITNEXUS_VERSION env defined in it feeds both the build arg and
          # the pushed tag, so a version bump made only there must still
          # produce a new image. Unrelated edits to this file cause an
          # extra (cache-backed) rebuild, which is the safe direction.
          if git rev-parse --verify HEAD~1 >/dev/null 2>&1 && \
             git diff --quiet HEAD~1 HEAD -- \
               .do/gitnexus/Dockerfile \
               .do/gitnexus/entrypoint.sh \
               .do/gitnexus/install-extensions.js \
               .github/workflows/gitnexus-deploy.yml; then
            echo "changed=false" >> "$GITHUB_OUTPUT"
          else
            echo "changed=true" >> "$GITHUB_OUTPUT"
          fi

      - name: Compute image tag
        id: tag
        run: echo "value=v${{ env.GITNEXUS_VERSION }}" >> "$GITHUB_OUTPUT"

      # The three steps below share one condition: build when the image
      # inputs changed, or unconditionally on a manual dispatch.
      - name: Set up Docker Buildx
        if: steps.changes.outputs.changed == 'true' || github.event_name == 'workflow_dispatch'
        uses: docker/setup-buildx-action@v3

      - name: Log in to GHCR
        if: steps.changes.outputs.changed == 'true' || github.event_name == 'workflow_dispatch'
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Build and push image
        if: steps.changes.outputs.changed == 'true' || github.event_name == 'workflow_dispatch'
        uses: docker/build-push-action@v5
        with:
          context: .do/gitnexus
          file: .do/gitnexus/Dockerfile
          push: true
          # Publish both the moving tag and the version-pinned tag.
          tags: |
            ${{ env.IMAGE_NAME }}:latest
            ${{ env.IMAGE_NAME }}:${{ steps.tag.outputs.value }}
          build-args: |
            GITNEXUS_VERSION=${{ env.GITNEXUS_VERSION }}
          cache-from: type=gha
          cache-to: type=gha,mode=max
deploy:
  # Waits for the image job so the droplet pulls whatever it just pushed.
  needs: build-image
  runs-on: ubuntu-latest
  timeout-minutes: 20
  permissions:
    actions: read
    contents: read
    pull-requests: write  # post deploy-complete comments on served PR indexes
  steps:
    # Only the droplet config directory is needed by this job, and only
    # at its latest commit.
    - name: Checkout deploy config
      uses: actions/checkout@v4
      with:
        sparse-checkout: .do/gitnexus
        fetch-depth: 1
# Resolve every index to serve. All resolutions go through
# listArtifactsForRepo keyed by the expected artifact name, so a
# run's branch or event type doesn't matter — we always pick the
# freshest artifact that actually exists.
#
# Why this matters: a /gitnexus index command dispatches
# gitnexus-index.yml with ref=main and an input pr_number, which
# produces a run whose head_branch is "main" but whose artifact
# is gitnexus-index-pr-<N>. listWorkflowRuns(branch='main') would
# happily return that run, and we'd then try to download a
# nonexistent gitnexus-index-main artifact from it. Querying by
# artifact name directly avoids the whole mess.
#
# Outputs:
#   matrix       — JSON array of { name, artifactName, runId } to download
#   active_names — comma-separated keep-list consumed by the prune step
- name: Resolve indexes to serve
  id: resolve
  uses: actions/github-script@v7
  with:
    script: |
      // Long-lived branch indexes: [branch, served index name].
      const BRANCH_INDEXES = [
        ['main', 'LibreChat'],
        ['dev', 'LibreChat-dev'],
      ];

      const serve = []; // [{ name, artifactName, runId }]

      // Helper — pick the newest non-expired artifact matching a name.
      // Resolves to undefined when no such artifact exists.
      const latestArtifact = async (artifactName) => {
        const { data } = await github.rest.actions.listArtifactsForRepo({
          owner: context.repo.owner,
          repo: context.repo.repo,
          name: artifactName,
          per_page: 10,
        });
        return data.artifacts
          .filter((a) => !a.expired)
          .sort((a, b) => new Date(b.created_at) - new Date(a.created_at))[0];
      };

      // --- main and dev branches ---
      for (const [branch, name] of BRANCH_INDEXES) {
        const artifactName = `gitnexus-index-${branch}`;
        const fresh = await latestArtifact(artifactName);
        if (!fresh) {
          core.warning(`No artifact found for ${branch} (expected ${artifactName})`);
          continue;
        }
        serve.push({
          name,
          artifactName,
          runId: fresh.workflow_run.id,
        });
        core.info(`${branch}: run ${fresh.workflow_run.id} -> ${name}`);
      }

      // --- open PRs with at least one successful index run ---
      // github.paginate handles the 100-per-page ceiling automatically
      // so the resolution works on repos with 200+ concurrent open PRs.
      const openPrs = await github.paginate(github.rest.pulls.list, {
        owner: context.repo.owner,
        repo: context.repo.repo,
        state: 'open',
        per_page: 100,
      });
      core.info(`Found ${openPrs.length} open PRs`);

      // Parallelize artifact lookups in fixed-size batches so the
      // resolve step runs in seconds instead of minutes on big repos,
      // without burning the GitHub API rate limit all at once.
      const BATCH_SIZE = 10;
      const prMatches = [];
      for (let i = 0; i < openPrs.length; i += BATCH_SIZE) {
        const batch = openPrs.slice(i, i + BATCH_SIZE);
        const results = await Promise.all(
          batch.map(async (pr) => {
            const artifactName = `gitnexus-index-pr-${pr.number}`;
            const fresh = await latestArtifact(artifactName);
            return fresh ? { pr, artifactName, fresh } : null;
          }),
        );
        for (const hit of results) {
          if (hit) prMatches.push(hit);
        }
      }

      // Cap to the N most recent PR indexes by artifact creation time.
      // On a 10GB droplet each index is ~130MB; 3 PRs + main + dev ≈
      // 650MB of index data, leaving headroom for the ~700MB Docker image
      // and OS. Older PR indexes are evicted by the prune step.
      const MAX_PR_INDEXES = 3;
      prMatches.sort(
        (a, b) => new Date(b.fresh.created_at) - new Date(a.fresh.created_at),
      );
      const keptPrs = prMatches.slice(0, MAX_PR_INDEXES);
      const evictedPrs = prMatches.slice(MAX_PR_INDEXES);

      for (const { pr, artifactName, fresh } of keptPrs) {
        serve.push({
          name: `LibreChat-pr-${pr.number}`,
          artifactName,
          runId: fresh.workflow_run.id,
        });
        core.info(`PR #${pr.number}: run ${fresh.workflow_run.id} -> LibreChat-pr-${pr.number}`);
      }

      if (evictedPrs.length) {
        core.info(
          `Evicted ${evictedPrs.length} older PR indexes (cap=${MAX_PR_INDEXES}): ` +
            evictedPrs.map((e) => `#${e.pr.number}`).join(', '),
        );
      }
      core.info(`Serving ${keptPrs.length} PR indexes out of ${prMatches.length} with artifacts (${openPrs.length} open PRs total)`);

      if (!serve.length) {
        core.setFailed('No indexes to serve');
        return;
      }

      core.setOutput('matrix', JSON.stringify(serve));

      // active_names is the prune step's keep-list: any folder on the
      // droplet that is not named here gets rm -rf'ed. The main/dev names
      // stay on the list even when no artifact was resolved for them
      // above (e.g. the artifact expired), so a lookup miss leaves the
      // currently served main/dev index in place instead of deleting it —
      // stale beats empty. The Set also de-duplicates while preserving
      // order (branch indexes first, then PRs).
      const activeNames = new Set([
        ...BRANCH_INDEXES.map(([, name]) => name),
        ...serve.map((s) => s.name),
      ]);
      core.setOutput('active_names', [...activeNames].join(','));
- name: Download each index artifact
  env:
    MATRIX: ${{ steps.resolve.outputs.matrix }}
    GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  run: |
    set -e
    mkdir -p staging

    # Each matrix entry becomes staging/<name>/.gitnexus.
    #
    # main/dev artifact download failures are fatal — a missing
    # main/dev index is a real deploy failure. PR artifact failures
    # are soft — a PR artifact deleted mid-deploy shouldn't abort
    # the whole deploy and take main/dev down with it.
    while IFS=$'\t' read -r name artifact runId; do
      target="staging/${name}/.gitnexus"
      echo "Downloading $artifact from run $runId -> $target"
      mkdir -p "$target"

      if gh run download "$runId" \
        --repo "$GITHUB_REPOSITORY" \
        --name "$artifact" \
        --dir "$target"; then
        continue
      fi

      if [ "$name" = "LibreChat" ] || [ "$name" = "LibreChat-dev" ]; then
        echo "::error::Failed to download critical artifact $artifact"
        exit 1
      fi

      # Soft failure for a PR index. The name stays in active_names so
      # the prune step won't remove the droplet's existing copy. The old
      # index keeps being served instead of being wiped to nothing —
      # stale beats empty — but observability requires an explicit
      # notice since this path is invisible in the happy-path deploy log.
      echo "::warning::Failed to download PR artifact $artifact — skipping fresh sync; previous index (if any) will continue being served from the droplet"
      # Drop the empty staging folder so the sync step skips this index.
      rm -rf "staging/${name}"
    done < <(jq -r '.[] | [.name, .artifactName, .runId] | @tsv' <<< "$MATRIX")

    echo ""
    echo "Staged for rsync:"
    du -sh staging/*/.gitnexus/ 2>/dev/null || echo "(none)"
- name: Setup SSH
  env:
    SSH_KEY: ${{ secrets.GITNEXUS_DO_SSH_KEY }}
    KNOWN_HOST: ${{ secrets.GITNEXUS_DO_KNOWN_HOST }}
  run: |
    set -e

    # Pin the droplet's SSH host key from a repository secret instead
    # of trusting whatever ssh-keyscan returns at deploy time. The
    # secret is populated from `ssh-keyscan -H <host>` at bootstrap.
    # Checked first so a misconfigured pin fails the step before the
    # private key is ever written to disk.
    if [ -z "$KNOWN_HOST" ]; then
      echo "::error::GITNEXUS_DO_KNOWN_HOST secret is empty. Run ssh-keyscan -H <host> and paste the output as this secret."
      exit 1
    fi

    # Owner-only permissions from the moment each file is created, so the
    # key is never readable by others between the write and the chmod.
    umask 077

    mkdir -p ~/.ssh
    chmod 700 ~/.ssh

    # The explicit chmods stay as a backstop for paths that already
    # existed with looser modes (redirection does not reset them).
    printf '%s\n' "$SSH_KEY" > ~/.ssh/deploy_key
    chmod 600 ~/.ssh/deploy_key

    printf '%s\n' "$KNOWN_HOST" > ~/.ssh/known_hosts
    chmod 600 ~/.ssh/known_hosts
- name: Authenticate droplet with GHCR
  # GHCR packages pushed by GITHUB_TOKEN start private. The droplet
  # pulls the image on every deploy, so it is re-authenticated here
  # with the same short-lived token. If the package is public, this
  # step is redundant but harmless.
  #
  # The token MUST travel through SSH stdin (not as a command arg)
  # so it never shows up in the droplet's process table via
  # /proc/<pid>/cmdline. `printf '%s'` is preferred over `echo` so the
  # exact byte sequence sent is explicit — docker login tolerates a
  # trailing newline, but `printf` makes the intent obvious and is
  # portable across shells.
  env:
    SSH_USER: ${{ secrets.GITNEXUS_DO_USER }}
    SSH_HOST: ${{ secrets.GITNEXUS_DO_HOST }}
    GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    GH_ACTOR: ${{ github.actor }}
  run: |
    # Only the user name is part of the remote command line; the token
    # is piped in and read by --password-stdin.
    remote_login="docker login ghcr.io -u '$GH_ACTOR' --password-stdin"
    printf '%s' "$GH_TOKEN" \
      | ssh -i ~/.ssh/deploy_key "$SSH_USER@$SSH_HOST" "$remote_login"
- name: Upload config files
  env:
    SSH_USER: ${{ secrets.GITNEXUS_DO_USER }}
    SSH_HOST: ${{ secrets.GITNEXUS_DO_HOST }}
  run: |
    # Files from the sparse checkout that are copied to the droplet's
    # /opt/gitnexus/ directory.
    config_files=(
      .do/gitnexus/docker-compose.yml
      .do/gitnexus/Caddyfile
    )
    rsync -az -e "ssh -i ~/.ssh/deploy_key" \
      "${config_files[@]}" \
      "$SSH_USER@$SSH_HOST:/opt/gitnexus/"
- name: Prune stale indexes then sync fresh ones
  env:
    SSH_USER: ${{ secrets.GITNEXUS_DO_USER }}
    SSH_HOST: ${{ secrets.GITNEXUS_DO_HOST }}
    ACTIVE_NAMES: ${{ steps.resolve.outputs.active_names }}
  run: |
    set -e

    # Run a command on the droplet with the deploy key.
    remote() {
      ssh -i ~/.ssh/deploy_key "$SSH_USER@$SSH_HOST" "$@"
    }
    index_root=/opt/gitnexus/indexes

    # ── Step 1: prune FIRST ────────────────────────────────
    # Delete every folder on the droplet that is not in the active set.
    # Doing this BEFORE the rsync frees disk for the new data, which
    # matters on a 10GB disk where each index is ~130MB.
    echo "Pruning stale indexes (keeping: $ACTIVE_NAMES)"
    remote ACTIVE_NAMES="$ACTIVE_NAMES" bash <<'REMOTE'
    set -e
    cd /opt/gitnexus/indexes || exit 0
    shopt -s nullglob
    IFS=',' read -ra ACTIVE <<< "$ACTIVE_NAMES"

    # Succeeds when $1 is one of the comma-separated active names.
    is_active() {
      local candidate
      for candidate in "${ACTIVE[@]}"; do
        [ "$1" = "$candidate" ] && return 0
      done
      return 1
    }

    for entry in */; do
      entry="${entry%/}"
      if ! is_active "$entry"; then
        echo "Removing stale index: $entry"
        rm -rf "$entry"
      fi
    done
    echo "Disk after prune:"
    df -h / | tail -1
    REMOTE

    # ── Step 2: rsync-then-swap ─────────────────────────────
    # Each index is uploaded into a sibling "<name>.new" directory and
    # only swapped into place once the upload succeeded. If rsync fails,
    # the old index survives intact and the partial temp dir is removed —
    # no production data is lost. While both copies exist they cost
    # ~130MB of extra disk, but the prune above already freed space from
    # evicted PR indexes so this fits on a 10GB disk.
    for staged in staging/*/; do
      [ -d "$staged" ] || continue
      name=$(basename "$staged")
      live="$index_root/$name"
      incoming="$live.new"

      echo "Syncing $name (rsync-then-swap)"
      remote "mkdir -p $incoming"

      if ! rsync -az -e "ssh -i ~/.ssh/deploy_key" \
        "$staged" \
        "$SSH_USER@$SSH_HOST:$incoming/"; then
        # Clean up the partial temp dir.
        remote "rm -rf $incoming"
        # main/dev are critical — abort the deploy so the failure is
        # visible and the container isn't restarted with stale or
        # missing data. PR indexes are best-effort.
        if [ "$name" = "LibreChat" ] || [ "$name" = "LibreChat-dev" ]; then
          echo "::error::rsync failed for critical index $name — aborting deploy"
          exit 1
        fi
        echo "::warning::rsync failed for PR index $name — keeping previous index"
        continue
      fi

      # Swap: remove old, rename new into place.
      remote "rm -rf $live && mv $incoming $live"
      echo " $name swapped successfully"
    done
- name: Pull image, restart gitnexus, reload Caddy, wait for healthy
  env:
    SSH_USER: ${{ secrets.GITNEXUS_DO_USER }}
    SSH_HOST: ${{ secrets.GITNEXUS_DO_HOST }}
  run: |
    # The whole remote script is streamed to `bash` over SSH stdin. Any
    # remote command that reads stdin would therefore consume the rest of
    # the script — see the Caddy reload below.
    ssh -i ~/.ssh/deploy_key "$SSH_USER@$SSH_HOST" bash <<'REMOTE'
    set -e
    cd /opt/gitnexus

    # ── Disk cleanup ──────────────────────────────────────
    # Docker accumulates old image layers, dangling images, and
    # build cache across deploys. On a 60GB droplet with a 700MB+
    # gitnexus image, this fills the disk after ~40 deploys.
    # Prune everything not used by currently-running containers
    # BEFORE pulling the new image so the extract has room.
    # NOTE(review): other comments in this workflow describe a 10GB
    # disk — confirm the actual droplet size.
    echo "Disk before cleanup:"
    df -h / | tail -1
    # Omit --volumes: Caddy's caddy-data and caddy-config volumes
    # hold TLS certificates and ACME state. If Caddy happens to be
    # stopped when this runs (the workflow handles that case later),
    # --volumes would wipe them, forcing Let's Encrypt re-issuance
    # and risking rate-limit lockout (5 certs/domain/week).
    docker system prune -af 2>/dev/null || true
    echo "Disk after cleanup:"
    df -h / | tail -1

    # Fail fast if disk is critically low even after prune
    AVAIL_MB=$(df --output=avail -m / | tail -1 | tr -d ' ')
    if [ "$AVAIL_MB" -lt 2048 ]; then
      echo "::error::Disk critically low (${AVAIL_MB}MB free). Aborting deploy."
      exit 1
    fi

    docker compose pull gitnexus
    docker compose up -d --force-recreate gitnexus

    # Reload Caddy in-place so a changed Caddyfile takes effect
    # without losing TLS certs or restarting connections. If caddy
    # isn't running yet (first-time bootstrap), bring it up.
    if docker compose ps --status running caddy 2>/dev/null | grep -q caddy; then
      echo "Reloading Caddy config"
      # stdin is redirected from /dev/null on purpose: `docker compose
      # exec` keeps STDIN attached by default (-T only disables the TTY),
      # and this script itself arrives on stdin. Without the redirect the
      # exec can swallow the remaining lines, so the health poll below
      # would silently never run and the step would still exit 0.
      docker compose exec -T caddy caddy reload --config /etc/caddy/Caddyfile </dev/null || {
        echo "Caddy reload failed — forcing restart"
        docker compose up -d --force-recreate caddy
      }
    else
      echo "Caddy not running — starting"
      docker compose up -d caddy
    fi

    # Poll gitnexus health until ready or timeout. Docker's own
    # unhealthy detection takes up to 150s (start_period 60s +
    # retries 3 * interval 30s), so the poll ceiling must clear
    # that to avoid false negatives when gitnexus legitimately
    # takes ~2.5 min to warm up.
    # Max wait = 36 sleeps * 5s = 180s (final iteration exits
    # before its sleep on failure, so 37 iterations is the
    # correct upper bound for a true 180s ceiling).
    echo "Waiting for gitnexus to report healthy..."
    for i in $(seq 1 37); do
      # A missing container or missing health state reads as "unknown".
      STATUS=$(docker inspect --format='{{.State.Health.Status}}' gitnexus 2>/dev/null || echo unknown)
      echo "[$i/37] gitnexus health: $STATUS"
      if [ "$STATUS" = "healthy" ]; then
        echo "gitnexus is healthy"
        break
      fi
      if [ "$i" -eq 37 ]; then
        echo "ERROR: gitnexus failed to become healthy after 180s"
        docker compose ps
        docker compose logs --tail 80 gitnexus
        exit 1
      fi
      sleep 5
    done

    # Final state and recent logs for the deploy record; log failures
    # here must not fail an otherwise healthy deploy.
    docker compose ps
    echo "--- Caddy logs (last 20 lines) ---"
    docker compose logs --tail 20 caddy || true
    echo "--- GitNexus logs (last 30 lines) ---"
    docker compose logs --tail 30 gitnexus || true
    REMOTE
# When the deploy was triggered by a PR command path, post a
# terminal status comment on that one PR only. Two sub-cases:
#
# 1. workflow_run trigger: the PR's native auto-index run fired
#    workflow_run, so github.event.workflow_run.id is the trigger.
#    The PR is found via the matrix entry whose runId matches.
#
# 2. workflow_dispatch trigger with inputs.pr_number set: the
#    index workflow's bot-fallback path dispatched us directly
#    because workflow_run is suppressed for GITHUB_TOKEN triggers.
#    inputs.pr_number is the comment target.
#
# Broadcast-commenting on every active PR would be noise — only the
# PR that asked for a fresh index gets a reply.
- name: Comment on PR — deploy complete
  # Runs after failures too, so the PR also hears about a failed deploy.
  if: always()
  uses: actions/github-script@v7
  env:
    MATRIX: ${{ steps.resolve.outputs.matrix }}
    TRIGGER_RUN_ID: ${{ github.event.workflow_run.id }}
    DISPATCH_PR_NUMBER: ${{ github.event.inputs.pr_number }}
    DEPLOY_STATUS: ${{ job.status }}
  with:
    script: |
      const PR_INDEX_PREFIX = 'LibreChat-pr-';

      // Work out which PR (if any) this deploy should report back to.
      const resolvePrNumber = () => {
        // Dispatched directly with pr_number (bot-fallback path). This
        // takes precedence and never falls through to the matrix lookup.
        const dispatched = process.env.DISPATCH_PR_NUMBER;
        if (dispatched && dispatched !== '') {
          return parseInt(dispatched, 10);
        }
        // workflow_run trigger from a PR index run: match the triggering
        // run id against the PR entries that were resolved for serving.
        if (context.eventName !== 'workflow_run') {
          return null;
        }
        const triggerRunId = Number(process.env.TRIGGER_RUN_ID);
        const entry = JSON.parse(process.env.MATRIX || '[]').find(
          (m) => m.runId === triggerRunId && m.name.startsWith(PR_INDEX_PREFIX),
        );
        return entry ? parseInt(entry.name.replace(PR_INDEX_PREFIX, ''), 10) : null;
      };

      const prNum = resolvePrNumber();
      // Also covers an unparsable pr_number (NaN is falsy).
      if (!prNum) {
        core.info('No PR to comment on (trigger was not a PR-scoped index); skipping.');
        return;
      }

      const { owner, repo } = context.repo;
      const deployUrl = `${context.serverUrl}/${owner}/${repo}/actions/runs/${context.runId}`;
      const succeeded = process.env.DEPLOY_STATUS === 'success';
      const heading = succeeded ? '🚀 deployed' : '❌ deploy failed';
      const detail = succeeded
        ? `The \`LibreChat-pr-${prNum}\` index is now live on the MCP server.`
        : `The deploy failed — the previous index (if any) continues to be served.`;
      const body = `### GitNexus: ${heading}\n\n${detail}\n[Deploy run](${deployUrl})`;

      await github.rest.issues.createComment({
        owner,
        repo,
        issue_number: prNum,
        body,
      });
- name: Cleanup SSH key
  # always(): remove the private key from the runner even when an
  # earlier step failed. -f keeps this a no-op if it was never written.
  if: always()
  run: rm -f "$HOME/.ssh/deploy_key"