
# =============================================================================
# Production CI/CD Pipeline: Quality Gates + Blue-Green Deployment
# =============================================================================
#
# Pipeline:
# 1. Quality Gates (parallel): lint, typecheck, security
# 2. Build Verification: extension + Next.js builds (smoke test)
# 3. Deploy Gate: parses commit message with a strict regex (anti-footgun)
# 4. Deploy: docker compose build on the prod host β†’ blue-green rollout
#
# Triggers:
# - push to main: full pipeline. Deploy gated by commit-message convention
# `(build): front`, `(build): back`, or `(build): front back` parsed by a
# real regex in the `deploy-gate` job (see A1 in CI/CD Roadmap v1.12).
# - pull_request to main: quality gates + build verification only.
# - schedule (Sunday 00:00 UTC): security audit only.
# - workflow_dispatch: manual run with `force_deploy` / `skip_quality_gates`.
#
# Architectural notes (decided 2026-05, see commit history):
# - Images are built on the prod self-hosted runner (`prod.docs.plus`).
# Disk pressure is mitigated by a pre-build disk guard, not by pushing
# to a registry. If pressure resurfaces, revisit M3 (ghcr.io push).
# - Rollback uses an on-disk tag stash on the prod host:
# /opt/projects/prod.docs.plus/.deploy/last-good-tag.
# - All third-party actions (workflow + composite) are pinned to commit
# SHA. Renovate/Dependabot should bump them; never use floating tags.
# =============================================================================
name: CI/CD Production
on:
push:
branches: [main]
pull_request:
branches: [main]
schedule:
# Weekly security scan (Sunday 00:00 UTC)
- cron: '0 0 * * 0'
workflow_dispatch:
inputs:
skip_quality_gates:
description: 'Skip quality gates (emergency deploy)'
required: false
default: false
type: boolean
force_deploy:
description: 'Force deployment (bypass commit-message gate)'
required: false
default: false
type: boolean
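# Example manual dispatch (gh CLI sketch; assumes the workflow is addressed
# by the `name:` below, which `gh workflow run` accepts, or by its file name):
#   gh workflow run 'CI/CD Production' -f force_deploy=true
#   gh workflow run 'CI/CD Production' -f skip_quality_gates=true -f force_deploy=true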
# Two concurrency groups:
# - quality gates (the workflow-level group below) can be cancelled freely (cheap to redo)
# - deploy (a job-level group on the deploy job) MUST finish or roll back
#   (a mid-deploy SIGTERM corrupts blue-green state)
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-quality
cancel-in-progress: true
# Default to bash so set -e/-o pipefail behavior is consistent across runners.
defaults:
run:
shell: bash
# Workflow-level least privilege; per-job overrides where needed.
permissions:
contents: read
env:
ENV_SOURCE: /opt/projects/prod.docs.plus/.env
ENV_FILE: .env.production
COMPOSE_FILE: docker-compose.prod.yml
DEPLOY_TAG: ${{ github.sha }}
# Where the prod host stashes the last successfully deployed SHA for rollback.
DEPLOY_STATE_DIR: /opt/projects/prod.docs.plus/.deploy
LAST_GOOD_TAG_FILE: /opt/projects/prod.docs.plus/.deploy/last-good-tag
jobs:
# ===========================================================================
# STAGE 1 - QUALITY GATES (parallel, fast feedback)
# ===========================================================================
lint:
name: πŸ” Lint & Format
runs-on: ubuntu-latest
timeout-minutes: 10
if: github.event_name != 'schedule'
permissions:
contents: read
steps:
- name: 📦 Checkout
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: 🥟 Setup Environment
uses: ./.github/actions/setup-bun
- name: 🔍 Lint, format & styles
run: |
bun run lint
bun run format
bun run styles
typecheck:
name: πŸ“ Type Check
runs-on: ubuntu-latest
timeout-minutes: 15
if: github.event_name != 'schedule'
permissions:
contents: read
steps:
- name: 📦 Checkout
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: 🥟 Setup Environment
uses: ./.github/actions/setup-bun
- name: 📦 Restore extension build cache
id: cache-ext
uses: actions/cache@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0
with:
path: |
packages/extension-hyperlink/dist
packages/extension-hypermultimedia/dist
packages/extension-indent/dist
packages/extension-inline-code/dist
packages/extension-placeholder/dist
key: ext-${{ runner.os }}-${{ hashFiles('bun.lock',
'packages/extension-hyperlink/package.json',
'packages/extension-hypermultimedia/package.json',
'packages/extension-indent/package.json',
'packages/extension-inline-code/package.json',
'packages/extension-placeholder/package.json',
'packages/extension-hyperlink/src/**',
'packages/extension-hypermultimedia/src/**',
'packages/extension-indent/src/**',
'packages/extension-inline-code/src/**',
'packages/extension-placeholder/src/**') }}
restore-keys: ext-${{ runner.os }}-
- name: 🔧 Build Extensions (required for types)
if: steps.cache-ext.outputs.cache-hit != 'true'
run: |
bun run --filter @docs.plus/extension-hyperlink build
bun run --filter @docs.plus/extension-hypermultimedia build
bun run --filter @docs.plus/extension-indent build
bun run --filter @docs.plus/extension-inline-code build
bun run --filter @docs.plus/extension-placeholder build
- name: πŸ“ Type Check All
run: bun run typecheck
security:
name: πŸ”’ Security Audit
runs-on: ubuntu-latest
timeout-minutes: 10
permissions:
contents: read
steps:
- name: 📦 Checkout
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: 🥟 Setup Environment
uses: ./.github/actions/setup-bun
with:
ignore-scripts: 'true'
- name: 🔍 Bun Audit
run: |
set -o pipefail
echo "🔍 Checking for known vulnerabilities..."
# Capture both human-readable and machine-readable output.
# `bun pm audit` exits non-zero when vulns exist; we always want both files.
bun pm audit 2>&1 | tee audit-results.txt || true
bun pm audit --json > audit-results.json 2>/dev/null || echo '{}' > audit-results.json
# Structured parsing; replaces brittle `grep -ci "critical"` which
# matched the summary header line and produced false positives.
# Expected shape: { "vulnerabilities": { "critical": N, "high": N, ... } }
CRITICAL=$(bun -e 'const a=JSON.parse(require("fs").readFileSync("audit-results.json","utf8"));process.stdout.write(String(a?.vulnerabilities?.critical||0))')
HIGH=$(bun -e 'const a=JSON.parse(require("fs").readFileSync("audit-results.json","utf8"));process.stdout.write(String(a?.vulnerabilities?.high||0))')
echo ""
echo "πŸ“Š Summary: critical=${CRITICAL}, high=${HIGH}"
# Defensive: if both are 0 AND the JSON looks empty, the audit
# shape may have changed (Bun has changed it before). Print the
# raw JSON head so a future regression doesn't silently neutralize
# this gate. Caps at 4 KB to keep logs tidy.
if [ "${CRITICAL}" -eq 0 ] && [ "${HIGH}" -eq 0 ]; then
BYTES=$(wc -c < audit-results.json | tr -d ' ')
if [ "${BYTES}" -lt 32 ]; then
echo "::warning::audit-results.json is suspiciously small (${BYTES} bytes). Bun audit JSON shape may have changed."
echo "--- audit-results.json (head 4KB) ---"
head -c 4096 audit-results.json || true
echo ""
echo "--- end ---"
fi
fi
if [ "${CRITICAL}" -gt 0 ] || [ "${HIGH}" -gt 0 ]; then
echo "::error::Critical/High vulnerabilities detected (critical=${CRITICAL}, high=${HIGH})"
exit 1
fi
echo "βœ… No critical/high vulnerabilities"
- name: πŸ“€ Upload Audit Results
if: always()
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3
with:
name: security-audit-${{ github.sha }}
path: |
audit-results.txt
audit-results.json
retention-days: 30
# ===========================================================================
# STAGE 2 - BUILD VERIFICATION (smoke test, no artifacts produced)
# ===========================================================================
# Note: this job intentionally does NOT build Docker images. The deploy job
# rebuilds them on the prod host anyway (decided 2026-05); duplicating the
# docker build here would just slow the pipeline without sharing cache. The
# webapp/admin Next.js compile here catches type/build regressions early.
# ===========================================================================
build:
name: πŸ—οΈ Build Verification
runs-on: ubuntu-latest
timeout-minutes: 35
needs: [lint, typecheck, security]
if: |
always() &&
github.event_name != 'schedule' &&
(needs.lint.result == 'success' || (github.event_name == 'workflow_dispatch' && inputs.skip_quality_gates)) &&
(needs.typecheck.result == 'success' || (github.event_name == 'workflow_dispatch' && inputs.skip_quality_gates)) &&
(needs.security.result == 'success' || (github.event_name == 'workflow_dispatch' && inputs.skip_quality_gates))
permissions:
contents: read
steps:
- name: 📦 Checkout
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: 🥟 Setup Environment
uses: ./.github/actions/setup-bun
- name: 📦 Restore extension build cache
id: cache-ext
uses: actions/cache@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0
with:
path: |
packages/extension-hyperlink/dist
packages/extension-hypermultimedia/dist
packages/extension-indent/dist
packages/extension-inline-code/dist
packages/extension-placeholder/dist
key: ext-${{ runner.os }}-${{ hashFiles('bun.lock',
'packages/extension-hyperlink/package.json',
'packages/extension-hypermultimedia/package.json',
'packages/extension-indent/package.json',
'packages/extension-inline-code/package.json',
'packages/extension-placeholder/package.json',
'packages/extension-hyperlink/src/**',
'packages/extension-hypermultimedia/src/**',
'packages/extension-indent/src/**',
'packages/extension-inline-code/src/**',
'packages/extension-placeholder/src/**') }}
restore-keys: ext-${{ runner.os }}-
- name: 🔧 Build TipTap Extensions
if: steps.cache-ext.outputs.cache-hit != 'true'
run: |
bun run --filter @docs.plus/extension-hyperlink build
bun run --filter @docs.plus/extension-hypermultimedia build
bun run --filter @docs.plus/extension-indent build
bun run --filter @docs.plus/extension-inline-code build
bun run --filter @docs.plus/extension-placeholder build
- name: πŸ—οΈ Build Webapp
run: bun run --filter @docs.plus/webapp build:ci
env:
NEXT_PUBLIC_SUPABASE_URL: ${{ secrets.NEXT_PUBLIC_SUPABASE_URL || 'http://localhost:54321' }}
NEXT_PUBLIC_SUPABASE_ANON_KEY: ${{ secrets.NEXT_PUBLIC_SUPABASE_ANON_KEY || 'dummy-key' }}
- name: πŸ—οΈ Build Admin Dashboard
run: bun run --filter @docs.plus/admin-dashboard build:ci
env:
NEXT_PUBLIC_SUPABASE_URL: ${{ secrets.NEXT_PUBLIC_SUPABASE_URL || 'http://localhost:54321' }}
NEXT_PUBLIC_SUPABASE_ANON_KEY: ${{ secrets.NEXT_PUBLIC_SUPABASE_ANON_KEY || 'dummy-key' }}
NEXT_PUBLIC_API_URL: ${{ secrets.NEXT_PUBLIC_API_URL || 'http://localhost:3003' }}
NEXT_PUBLIC_APP_URL: ${{ secrets.NEXT_PUBLIC_APP_URL || 'http://localhost:3000' }}
# ===========================================================================
# STAGE 2.5 - DEPLOY GATE (precise commit-message parsing)
# ===========================================================================
# Replaces the previous loose `contains(...)` chain in the deploy job's `if:`.
# The old check matched any commit whose body contained both "build" and
# "front" or "back" anywhere (e.g. "fix iOS back gesture build crash" would
# have triggered a production deploy). This job parses the convention
# documented in AGENTS.md (`(build): front|back|front back`) with a real
# regex and surfaces the result as a job output.
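# For contrast, the removed gate looked roughly like this (paraphrased
# shape, not the verbatim original expression):
#   if: contains(github.event.head_commit.message, 'build') &&
#       (contains(github.event.head_commit.message, 'front') ||
#        contains(github.event.head_commit.message, 'back'))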
# ===========================================================================
deploy-gate:
name: 🚦 Deploy Gate
runs-on: ubuntu-latest
timeout-minutes: 2
needs: [build]
if: |
always() &&
needs.build.result == 'success' &&
(github.event_name == 'push' || (github.event_name == 'workflow_dispatch' && inputs.force_deploy))
permissions:
contents: read
outputs:
deploy: ${{ steps.gate.outputs.deploy }}
reason: ${{ steps.gate.outputs.reason }}
steps:
- name: 🚦 Evaluate trigger
id: gate
env:
EVENT_NAME: ${{ github.event_name }}
REF: ${{ github.ref }}
FORCE_DEPLOY: ${{ inputs.force_deploy }}
# Use the head_commit message verbatim. Avoid `git log` because the
# runner's checkout depth might not include the commit.
COMMIT_MSG: ${{ github.event.head_commit.message }}
run: |
set -euo pipefail
# workflow_dispatch + force_deploy=true β†’ unconditional deploy
if [ "${EVENT_NAME}" = "workflow_dispatch" ] && [ "${FORCE_DEPLOY}" = "true" ]; then
echo "deploy=true" >> "$GITHUB_OUTPUT"
echo "reason=workflow_dispatch+force_deploy" >> "$GITHUB_OUTPUT"
echo "βœ… Deploying: workflow_dispatch with force_deploy=true"
exit 0
fi
# push + main only beyond this point
if [ "${EVENT_NAME}" != "push" ] || [ "${REF}" != "refs/heads/main" ]; then
echo "deploy=false" >> "$GITHUB_OUTPUT"
echo "reason=non-main push" >> "$GITHUB_OUTPUT"
echo "ℹ️ Skipping deploy: not a push to main"
exit 0
fi
# Strict regex per the AGENTS.md convention, matched with grep -E
# (extended regex). The token must stand alone: at the start of a line
# or preceded by whitespace, as in `(build): front`, `(build): back`,
# or `(build): front back` (either order). grep scans line by line, so
# the trigger may appear in the subject or any body line.
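# Illustrative subjects (assumed examples, not taken from AGENTS.md):
#   "(build): front"                   -> deploy
#   "fix header (build): front back"   -> deploy
#   "fix iOS back gesture build crash" -> no deploy (no `(build):` token)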
if printf '%s' "${COMMIT_MSG}" | grep -Eq '(^|[[:space:]])\(build\):[[:space:]]+(front([[:space:]]+back)?|back([[:space:]]+front)?)([[:space:]]|$)'; then
echo "deploy=true" >> "$GITHUB_OUTPUT"
echo "reason=(build): trigger matched" >> "$GITHUB_OUTPUT"
echo "βœ… Deploying: matched (build): front|back convention"
else
echo "deploy=false" >> "$GITHUB_OUTPUT"
echo "reason=no (build): trigger" >> "$GITHUB_OUTPUT"
echo "ℹ️ Skipping deploy: commit subject does not match '(build): front|back|front back'"
echo "ℹ️ Subject line was:"
printf '%s\n' "${COMMIT_MSG}" | head -1
fi
# ===========================================================================
# STAGE 3 - PRODUCTION DEPLOYMENT
# ===========================================================================
# IMPORTANT: this job intentionally opts OUT of cancel-in-progress at the job
# level. Mid-deploy SIGTERM during `docker compose up --scale` can leave the
# cluster with a mix of old+new containers and corrupt blue-green state.
# ===========================================================================
deploy:
name: 🚀 Deploy Production
runs-on: prod.docs.plus
timeout-minutes: 30
needs: [deploy-gate]
# Separate concurrency group with cancel-in-progress: false. If two pushes
# arrive close together, the second waits for the first to finish.
concurrency:
group: ${{ github.workflow }}-deploy
cancel-in-progress: false
if: needs.deploy-gate.outputs.deploy == 'true'
environment:
name: production
url: https://docs.plus
permissions:
contents: read
steps:
- name: 📦 Checkout Code
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 1
- name: πŸ” Prepare Environment
run: |
# Compose `--env-file` is the single source of truth. We do NOT also
# `set -a; source` it elsewhere β€” that path was double-loading and
# leaking vars to subshells unintentionally.
cp "${ENV_SOURCE}" "${ENV_FILE}"
echo "DEPLOY_TAG=${DEPLOY_TAG}" >> "${ENV_FILE}"
# Stash the previous successful tag (if any) for the rollback step.
mkdir -p "${DEPLOY_STATE_DIR}"
if [ -f "${LAST_GOOD_TAG_FILE}" ]; then
PREVIOUS_TAG=$(cat "${LAST_GOOD_TAG_FILE}")
echo "PREVIOUS_TAG=${PREVIOUS_TAG}" >> "$GITHUB_ENV"
echo "ℹ️ Previous good tag: ${PREVIOUS_TAG}"
else
echo "PREVIOUS_TAG=" >> "$GITHUB_ENV"
echo "ℹ️ No previous good tag stashed (first deploy or fresh state dir)"
fi
echo "βœ… Environment ready"
- name: πŸ’Ύ Pre-deploy disk guard
run: |
echo "πŸ“Š Disk before prune:"
df -h / | tail -1
# Free space proactively. Without this, --no-cache builds can fill
# the root volume between deploys and silently OOM/ENOSPC the build
# step (job ends in <2min with no error). Runs before build, not after.
docker image prune -af --filter "until=24h" 2>/dev/null || true
docker builder prune -af --filter "until=24h" 2>/dev/null || true
# Hard guard: refuse to build when <10 GB free. Fail loud here
# rather than fail silently mid-build.
AVAIL_KB=$(df --output=avail / | tail -1)
AVAIL_GB=$((AVAIL_KB / 1024 / 1024))
echo "πŸ“Š Disk after prune: ${AVAIL_GB} GB free"
if [ "${AVAIL_GB}" -lt 10 ]; then
echo "::error::Less than 10 GB free on /. Aborting deploy. SSH to host and run 'docker system prune -af --volumes'."
df -h /
docker system df
exit 1
fi
- name: 📂 Verify build context (monorepo root)
run: |
if [ ! -d packages/email-templates ]; then
echo "::error::packages/email-templates missing. Build context must be repo root (context: .). Check checkout includes the workspace."
exit 1
fi
if ! grep -q 'email-templates' packages/hocuspocus.server/docker/Dockerfile.bun; then
echo "::error::packages/hocuspocus.server/docker/Dockerfile.bun must COPY packages/email-templates."
exit 1
fi
if ! grep -q 'email-templates' packages/webapp/docker/Dockerfile.bun; then
echo "::error::packages/webapp/docker/Dockerfile.bun must COPY packages/email-templates."
exit 1
fi
echo "βœ… Build context OK (repo root, email-templates present)"
- name: πŸ—οΈ Build Docker Images
env:
DOCKER_BUILDKIT: '1'
COMPOSE_DOCKER_CLI_BUILD: '1'
run: |
echo "πŸ”¨ Building images with tag: ${DEPLOY_TAG}"
# hocuspocus-server and hocuspocus-worker share `docsplus-hocuspocus`;
# building both via compose with --no-cache duplicates context transfer
# and ties up the bake plan. Build via hocuspocus-server only; the
# worker reuses the resulting tag at `up` time.
#
# --no-cache: required as long as the prod entrypoint script changes
# are layered late in the Dockerfile and we don't yet have stable
# layer ordering. If/when entrypoint COPY moves to the last layer,
# we can drop --no-cache and gain ~5 min per deploy.
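# For the tag reuse to work, docker-compose.prod.yml is assumed to point
# both services at the same image (sketch, not the verbatim compose file):
#   hocuspocus-server: { build: ..., image: "docsplus-hocuspocus:${DEPLOY_TAG}" }
#   hocuspocus-worker: { image: "docsplus-hocuspocus:${DEPLOY_TAG}" }  # no build: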
docker compose -f "${COMPOSE_FILE}" --env-file "${ENV_FILE}" \
build --no-cache rest-api hocuspocus-server
docker compose -f "${COMPOSE_FILE}" --env-file "${ENV_FILE}" \
build --parallel webapp admin-dashboard
echo "βœ… Images built"
- name: πŸ”§ Ensure Infrastructure
run: |
echo "πŸ”§ Ensuring infrastructure..."
docker network create docsplus-network 2>/dev/null || true
# Start Traefik and Redis (--no-recreate keeps existing if running)
docker compose -f "${COMPOSE_FILE}" --env-file "${ENV_FILE}" \
up -d --no-recreate traefik redis
# Force-start Traefik if somehow not running
if ! docker ps --filter "name=traefik" --filter "status=running" --format '{{.Names}}' | grep -q traefik; then
echo "⚠️ Traefik not running, starting..."
docker compose -f "${COMPOSE_FILE}" --env-file "${ENV_FILE}" up -d traefik
sleep 15
fi
# Wait for healthy
echo "⏳ Waiting for Traefik..."
for i in {1..30}; do
if docker ps --filter "name=traefik" --filter "health=healthy" --format '{{.Names}}' | grep -q traefik; then
echo "βœ… Traefik healthy"
break
fi
[ "${i}" -eq 30 ] && echo "⚠️ Traefik health timeout, continuing..."
sleep 2
done
- name: 🚀 Deploy Services (Blue-Green)
run: |
echo "🚀 Starting zero-downtime deployment..."
deploy_service() {
local SERVICE="$1"
local TARGET="$2"
local CURRENT
CURRENT=$(docker ps --filter "label=com.docker.compose.service=${SERVICE}" -q | wc -l | tr -d ' ')
local SCALE_UP=$((CURRENT + TARGET))
echo ""
echo "πŸ“¦ Deploying ${SERVICE} (current: ${CURRENT}, target: ${TARGET})..."
# Scale UP first (keeps old containers serving traffic)
if [ "${SCALE_UP}" -gt "${CURRENT}" ]; then
echo "⬆️ Scaling up to ${SCALE_UP}..."
docker compose -f "${COMPOSE_FILE}" --env-file "${ENV_FILE}" \
up -d --no-deps --scale "${SERVICE}=${SCALE_UP}" "${SERVICE}"
# Wait for healthy. 60×2s = 120s; Next.js cold start can hit
# 60-90s right after a --no-cache build. The previous 30×2s = 60s
# produced false-fail rollbacks.
echo "⏳ Waiting for healthy containers..."
for i in {1..60}; do
local HEALTHY
HEALTHY=$(docker ps --filter "label=com.docker.compose.service=${SERVICE}" --filter "health=healthy" -q | wc -l)
if [ "${HEALTHY}" -ge "${TARGET}" ]; then
echo "βœ… ${HEALTHY} healthy containers"
break
fi
[ $((i % 10)) -eq 0 ] && echo " ... ${HEALTHY}/${TARGET} healthy (attempt ${i}/60)"
sleep 2
done
fi
# Scale to target (compose removes old containers)
echo "πŸ“ Scaling to target: ${TARGET}..."
docker compose -f "${COMPOSE_FILE}" --env-file "${ENV_FILE}" \
up -d --no-deps --scale "${SERVICE}=${TARGET}" "${SERVICE}"
sleep 2
}
deploy_service "webapp" 2
deploy_service "rest-api" 2
deploy_service "hocuspocus-server" 2
deploy_service "hocuspocus-worker" 1
deploy_service "admin-dashboard" 1
echo ""
echo "βœ… All services deployed"
- name: 🩺 Verify Deployment
run: |
echo "🩺 Verifying deployment..."
sleep 10
# Infrastructure check
echo "πŸ“Š Infrastructure:"
for svc in traefik docsplus-redis; do
if docker ps --filter "name=${svc}" --filter "status=running" --format '{{.Names}}' | grep -q "${svc}"; then
echo " βœ… ${svc}: running"
else
echo " ❌ ${svc}: NOT running"
docker logs "${svc}" --tail 30 2>/dev/null || true
exit 1
fi
done
# Service running + healthy check
echo "πŸ“Š Services:"
for svc in webapp rest-api hocuspocus-server hocuspocus-worker admin-dashboard; do
RUNNING=$(docker ps --filter "label=com.docker.compose.service=${svc}" --filter "status=running" --format "{{.Names}}" | wc -l)
HEALTHY=$(docker ps --filter "label=com.docker.compose.service=${svc}" --filter "health=healthy" --format "{{.Names}}" | wc -l)
if [ "${RUNNING}" -gt 0 ]; then
echo " βœ… ${svc}: ${RUNNING} running, ${HEALTHY} healthy"
else
echo " ❌ ${svc}: NOT running"
exit 1
fi
done
# Internal smoke test: hit container health endpoints via the
# docker network, NOT via the public DNS+TLS stack. A transient
# ACME / Let's Encrypt hiccup must not trigger a false-fail rollback.
echo ""
echo "🔍 Internal smoke tests..."
smoke() {
local SVC="$1" PORT="$2" PATH_="$3"
if docker compose -f "${COMPOSE_FILE}" --env-file "${ENV_FILE}" exec -T "${SVC}" \
bun -e "fetch('http://localhost:${PORT}${PATH_}').then(r => r.ok ? process.exit(0) : process.exit(1)).catch(() => process.exit(1))"; then
echo " βœ… ${SVC} internal health"
else
echo " ❌ ${SVC} internal health"
return 1
fi
}
smoke webapp 3000 /api/health
smoke rest-api 4000 /health
smoke hocuspocus-server 4001 /health
smoke hocuspocus-worker 4002 /health
smoke admin-dashboard 3100 /api/health
# Public-URL probe is now informational only; it does NOT fail the deploy.
# Real public-availability monitoring belongs in uptime-kuma, not here.
echo ""
echo "🌐 Public URL probe (informational):"
# Note: no -f here. curl -w prints the status even on failure (000 when
# no response), and || true keeps set -e from killing the step without
# appending a duplicate fallback code.
PUBLIC_CODE=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 https://docs.plus/ 2>/dev/null || true)
echo " https://docs.plus/ → ${PUBLIC_CODE}"
API_CODE=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 https://prodback.docs.plus/api/health 2>/dev/null || true)
echo " https://prodback.docs.plus/api/health → ${API_CODE}"
echo ""
echo "βœ… Deployment verified"
- name: πŸ“ Sync compose files for break-glass
if: success()
run: |
# Replaces the previous "Sync Production Directory" step which re-`up`'d
# services from a different cwd; that re-up broke the blue-green
# guarantee. Now we only COPY the active compose + env files to a
# stable path so a human SSH'd in can run, e.g.:
# cd /opt/projects/prod.docs.plus/.deploy/current
# docker compose -f docker-compose.prod.yml --env-file .env.production ps
# without having to know the runner's _work directory.
mkdir -p "${DEPLOY_STATE_DIR}/current"
cp "${COMPOSE_FILE}" "${DEPLOY_STATE_DIR}/current/${COMPOSE_FILE}"
cp "${ENV_FILE}" "${DEPLOY_STATE_DIR}/current/${ENV_FILE}"
echo "βœ… Synced compose+env to ${DEPLOY_STATE_DIR}/current/"
- name: πŸ’Ύ Stash this tag as last-good
# Only on success β€” failure path is handled by the rollback step.
if: success()
run: |
# Persist current tag for the next deploy's rollback target.
mkdir -p "${DEPLOY_STATE_DIR}"
# Keep the previous one as last-good-tag.previous for one-step-back debugging.
if [ -f "${LAST_GOOD_TAG_FILE}" ]; then
cp "${LAST_GOOD_TAG_FILE}" "${LAST_GOOD_TAG_FILE}.previous"
fi
echo "${DEPLOY_TAG}" > "${LAST_GOOD_TAG_FILE}"
echo "βœ… Stashed last-good-tag = ${DEPLOY_TAG}"
- name: 🧹 Cleanup
if: success()
continue-on-error: true # cleanup failure shouldn't fail an otherwise green deploy
run: |
docker image prune -f
docker image prune -f --filter "until=24h" 2>/dev/null || true
echo "βœ… Cleanup complete"
- name: πŸ“Š Summary
if: success()
run: |
echo "======================================"
echo "βœ… DEPLOYMENT SUCCESSFUL"
echo "======================================"
echo "Tag: ${DEPLOY_TAG}"
echo "Previous tag: ${PREVIOUS_TAG:-<none>}"
echo ""
echo "Services:"
docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}" | grep -E "(traefik|docsplus|webapp|rest-api|hocuspocus)" | head -15
echo ""
echo "URLs:"
echo " - https://docs.plus"
echo " - https://prodback.docs.plus"
echo "======================================"
- name: 🚨 Rollback on Failure
if: failure()
run: |
echo "⚠️ Deployment failed β€” attempting rollback..."
if [ -z "${PREVIOUS_TAG:-}" ]; then
echo "::warning::No PREVIOUS_TAG stashed β€” cannot auto-rollback."
echo "πŸ“Š Current state:"
docker ps --format "table {{.Names}}\t{{.Status}}" | head -15
exit 0
fi
echo "↩️ Rolling back to: ${PREVIOUS_TAG}"
# Multi-image precondition (A2): ALL service images for the previous
# tag must still exist locally. The cleanup step honors --filter
# until=24h, so within a 24h window this works reliably; outside
# that window we fail loudly rather than partially-rollback into a
# mixed-version cluster.
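# Outside that window a manual rebuild is the fallback (sketch only;
# assumes the break-glass copy from the sync step and the stashed tag file):
#   git checkout "$(cat /opt/projects/prod.docs.plus/.deploy/last-good-tag)"
#   docker compose -f docker-compose.prod.yml --env-file .env.production build
#   docker compose -f docker-compose.prod.yml --env-file .env.production up -d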
MISSING=()
for img in docsplus-webapp docsplus-rest-api docsplus-hocuspocus docsplus-admin; do
if ! docker image inspect "${img}:${PREVIOUS_TAG}" >/dev/null 2>&1; then
MISSING+=("${img}:${PREVIOUS_TAG}")
fi
done
if [ "${#MISSING[@]}" -gt 0 ]; then
echo "::error::Cannot auto-rollback. Missing images for previous tag:"
for img in "${MISSING[@]}"; do
echo " - ${img}"
done
echo "Manual recovery: bring traffic back via the existing healthy containers."
docker compose -f "${COMPOSE_FILE}" --env-file "${ENV_FILE}" up -d --no-recreate 2>/dev/null || true
exit 1
fi
# Override DEPLOY_TAG in the env file and re-deploy with previous images.
sed -i.bak "s|^DEPLOY_TAG=.*|DEPLOY_TAG=${PREVIOUS_TAG}|" "${ENV_FILE}"
rm -f "${ENV_FILE}.bak"
docker compose -f "${COMPOSE_FILE}" --env-file "${ENV_FILE}" \
up -d --force-recreate \
webapp rest-api hocuspocus-server hocuspocus-worker admin-dashboard
# Post-rollback verification (A3). Give containers a moment to bind
# ports + pass first healthcheck, then run the same internal smoke
# set the forward path runs. If rollback itself can't come healthy,
# we want the workflow to fail RED so the on-call sees it instead
# of a misleading "rollback complete" green check.
echo ""
echo "⏳ Waiting 30s for rolled-back containers to settle..."
sleep 30
smoke() {
local SVC="$1" PORT="$2" PATH_="$3"
if docker compose -f "${COMPOSE_FILE}" --env-file "${ENV_FILE}" exec -T "${SVC}" \
bun -e "fetch('http://localhost:${PORT}${PATH_}').then(r => r.ok ? process.exit(0) : process.exit(1)).catch(() => process.exit(1))"; then
echo " βœ… ${SVC} internal health (post-rollback)"
return 0
else
echo " ❌ ${SVC} internal health (post-rollback)"
return 1
fi
}
ROLLBACK_OK=1
smoke webapp 3000 /api/health || ROLLBACK_OK=0
smoke rest-api 4000 /health || ROLLBACK_OK=0
smoke hocuspocus-server 4001 /health || ROLLBACK_OK=0
smoke hocuspocus-worker 4002 /health || ROLLBACK_OK=0
smoke admin-dashboard 3100 /api/health || ROLLBACK_OK=0
echo ""
echo "πŸ“Š Post-rollback state:"
docker ps --format "table {{.Names}}\t{{.Status}}" | head -15
if [ "${ROLLBACK_OK}" -ne 1 ]; then
echo "::error::Rollback to ${PREVIOUS_TAG} did not pass smoke tests. Manual intervention required."
exit 1
fi
echo "βœ… Rollback to ${PREVIOUS_TAG} verified healthy"
# ===========================================================================
# UPTIME KUMA (optional monitoring service)
# ===========================================================================
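# Note: this job is still gated by the loose contains() checks that the main
# deploy-gate replaced; any push whose message merely contains both "build"
# and "uptime-kuma" will trigger it. Low stakes for an optional monitor, but
# worth tightening if it ever misfires.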
deploy-uptime-kuma:
name: 🔔 Deploy Uptime Kuma
runs-on: prod.docs.plus
timeout-minutes: 10
if: |
github.ref == 'refs/heads/main' &&
github.event_name == 'push' &&
contains(github.event.head_commit.message, 'build') &&
contains(github.event.head_commit.message, 'uptime-kuma')
permissions:
contents: read
steps:
- name: 🚀 Deploy
run: |
# Pinned by digest (not :latest) so the same uptime-kuma version
# is reproducible across redeploys. To bump, look up new digest:
# docker pull louislam/uptime-kuma:1 && docker inspect ...
UPTIME_KUMA_IMAGE='louislam/uptime-kuma:1@sha256:bb1bcecbc3e3ffb1cb0f8fc5f9c3cdaa78c1dfb56d98d64e06da13ebfc6dba0d'
docker network create docsplus-network 2>/dev/null || true
docker stop uptime-kuma 2>/dev/null || true
docker rm uptime-kuma 2>/dev/null || true
docker run -d \
--name uptime-kuma \
--network docsplus-network \
--restart unless-stopped \
-v uptime-kuma-data:/app/data \
--label "traefik.enable=true" \
--label "traefik.http.routers.uptime.rule=Host(\`status.docs.plus\`)" \
--label "traefik.http.routers.uptime.entrypoints=websecure" \
--label "traefik.http.routers.uptime.tls.certresolver=letsencrypt" \
--label "traefik.http.services.uptime.loadbalancer.server.port=3001" \
"${UPTIME_KUMA_IMAGE}"
sleep 15
echo "βœ… Uptime Kuma deployed at https://status.docs.plus"