Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 47 additions & 2 deletions .github/scripts/preview/bootstrap-preview-auth.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,45 @@ require_env_vars \

mask_env_vars RUN_DB_URL SPICEDB_ENDPOINT SPICEDB_PRESHARED_KEY INKEEP_AGENTS_MANAGE_UI_PASSWORD BETTER_AUTH_SECRET

# Extended regex, matched case-insensitively with `grep -Eqi` against the
# captured output of a failed command. A match marks the failure as a transient
# SpiceDB transport error that is worth retrying.
SPICEDB_TRANSIENT_RETRY_PATTERN='(No connection established|UNAVAILABLE|ECONNRESET|ECONNREFUSED|EPIPE|ETIMEDOUT|deadline exceeded|Protocol error|transport is closing)'

#######################################
# Runs a command and retries it when it fails with a transient SpiceDB
# transport error.
#
# The command's stdout and stderr are merged, streamed to stdout, and captured
# in a temp file so a failure can be matched against
# SPICEDB_TRANSIENT_RETRY_PATTERN.
#
# Globals:
#   SPICEDB_TRANSIENT_RETRY_PATTERN (read)
# Arguments:
#   $1   - label used in log lines
#   $2   - maximum number of attempts (positive integer)
#   $3.. - command to run, with its arguments
# Returns:
#   0 as soon as one attempt succeeds;
#   the command's exit status when it fails with a non-transient error or when
#   the last attempt fails;
#   1 when the attempt count is not a positive integer (command is not run).
#######################################
run_with_transient_spicedb_retry() {
  local label="$1"
  local attempts="$2"
  shift 2

  local attempt=1
  local log_file=""
  local status="0"
  local errexit_was_on="false"

  # Reject a missing/zero/non-numeric attempt count up front instead of
  # silently skipping the command.
  case "${attempts}" in
    '' | *[!0-9]*)
      echo "run_with_transient_spicedb_retry: attempts must be a positive integer (got '${attempts}')." >&2
      return 1
      ;;
  esac
  if [ "${attempts}" -lt 1 ]; then
    echo "run_with_transient_spicedb_retry: attempts must be a positive integer (got '${attempts}')." >&2
    return 1
  fi

  # Remember whether errexit is active so it is restored, not force-enabled.
  case "$-" in
    *e*) errexit_was_on="true" ;;
  esac

  while [ "${attempt}" -le "${attempts}" ]; do
    preview_log "${label} (attempt ${attempt}/${attempts})."
    log_file="$(mktemp)"

    # errexit must be off here so a failing command can be inspected; the
    # command's own status is taken from PIPESTATUS, not from tee.
    set +e
    "$@" 2>&1 | tee "${log_file}"
    status="${PIPESTATUS[0]}"
    if [ "${errexit_was_on}" = "true" ]; then
      set -e
    fi

    if [ "${status}" = "0" ]; then
      rm -f "${log_file}"
      return 0
    fi

    # Give up on the final attempt or when the output does not look transient.
    if [ "${attempt}" -ge "${attempts}" ] || ! grep -Eqi "${SPICEDB_TRANSIENT_RETRY_PATTERN}" "${log_file}"; then
      rm -f "${log_file}"
      return "${status}"
    fi

    preview_log "${label} failed with a transient SpiceDB transport error; retrying."
    rm -f "${log_file}"
    sleep_with_backoff_and_jitter 2 "${attempt}" 10
    attempt=$((attempt + 1))
  done
}

if [ -z "${RUN_DB_URL:-}" ] || [ -z "${SPICEDB_ENDPOINT:-}" ]; then
require_env_vars \
RAILWAY_API_TOKEN \
Expand All @@ -24,6 +63,7 @@ if [ -z "${RUN_DB_URL:-}" ] || [ -z "${SPICEDB_ENDPOINT:-}" ]; then
PR_NUMBER

RAILWAY_ENV_NAME="$(pr_env_name "${PR_NUMBER}")"
preview_log "Resolving runtime bootstrap values from Railway environment ${RAILWAY_ENV_NAME}."
RAILWAY_ENV_ID="$(railway_wait_for_environment_id "${RAILWAY_PROJECT_ID}" "${RAILWAY_ENV_NAME}" 10 2)"
OUTPUT_SERVICE_ID="$(railway_project_service_id "${RAILWAY_PROJECT_ID}" "${RAILWAY_OUTPUT_SERVICE}")"
OUTPUT_SERVICE_ENV_JSON="$(
Expand All @@ -42,22 +82,27 @@ if [ -z "${RUN_DB_URL:-}" ] || [ -z "${SPICEDB_ENDPOINT:-}" ]; then
fi

# Both values must be known by now, whether supplied directly or resolved above.
require_env_vars RUN_DB_URL SPICEDB_ENDPOINT
preview_log "Bootstrapping preview auth for tenant ${TENANT_ID:-default} via ${API_URL}."

# Export the settings the pnpm commands below read from the environment.
export INKEEP_AGENTS_API_URL="${API_URL}"
export INKEEP_AGENTS_RUN_DATABASE_URL="${RUN_DB_URL}"
export SPICEDB_ENDPOINT
export TENANT_ID="${TENANT_ID:-default}"

echo "::group::Run preview runtime migrations"
preview_log "Running preview runtime migrations."
pnpm db:run:migrate
echo "::endgroup::"

# The SpiceDB-dependent steps run only through the retry wrapper. Invoking the
# same command directly first would run each step twice and, under errexit,
# abort on a transient failure before the wrapper could retry it.
echo "::group::Wait for SpiceDB readiness"
run_with_transient_spicedb_retry \
  "Wait for SpiceDB readiness" \
  2 \
  pnpm --filter @inkeep/agents-core exec tsx src/auth/wait-for-spicedb.ts
echo "::endgroup::"

echo "::group::Initialize preview auth"
run_with_transient_spicedb_retry "Initialize preview auth" 2 pnpm db:auth:init
echo "::endgroup::"

if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then
Expand Down
154 changes: 154 additions & 0 deletions .github/scripts/preview/cleanup-stale-railway-envs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
#!/usr/bin/env bash
#
# Railway preview janitor: evaluates Railway environments named `pr-<number>`
# against the state of the matching GitHub pull request and deletes the ones
# whose PR is closed.
#
# Required env: RAILWAY_API_TOKEN, RAILWAY_PROJECT_ID, GH_TOKEN, GITHUB_REPOSITORY
# Optional env:
#   DRY_RUN                      "true" only logs what would be deleted
#                                (default: false)
#   DELETE_UNKNOWN_PREVIEW_ENVS  "true" also deletes environments whose PR
#                                lookup returns 404 (default: false)
set -euo pipefail

# Resolve this script's directory so common.sh is found regardless of the
# caller's working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=.github/scripts/preview/common.sh
source "${SCRIPT_DIR}/common.sh"

# Fail early when credentials or repository coordinates are missing.
require_env_vars \
RAILWAY_API_TOKEN \
RAILWAY_PROJECT_ID \
GH_TOKEN \
GITHUB_REPOSITORY

# Both switches default to the non-destructive setting.
DRY_RUN="${DRY_RUN:-false}"
DELETE_UNKNOWN_PREVIEW_ENVS="${DELETE_UNKNOWN_PREVIEW_ENVS:-false}"

#######################################
# Looks up the state of a pull request through the GitHub REST API.
# Globals:
#   GH_TOKEN           (read) bearer token for the request
#   GITHUB_REPOSITORY  (read) "owner/repo" of the pull request
#   GITHUB_API_URL     (read, optional) API base URL; defaults to
#                      https://api.github.com
# Arguments:
#   $1 - pull request number
# Outputs:
#   stdout: the PR's `.state` value on HTTP 200, or "missing" on HTTP 404
#   stderr: a diagnostic plus the response body on any other HTTP status
# Returns:
#   0 when a state was printed; 1 on any other HTTP status or when the
#   response body cannot be parsed.
#######################################
github_pr_state() {
  local pr_number="$1"
  # Build the endpoint from GITHUB_API_URL (set by GitHub Actions, and required
  # for GitHub Enterprise Server) instead of hard-coding the public API host.
  local api_base="${GITHUB_API_URL:-https://api.github.com}"
  local response_file=""
  local status=""
  local result="0"

  response_file="$(mktemp)"
  # A transport failure leaves curl reporting HTTP 000, which the fallback case
  # below handles; `|| true` keeps errexit from skipping the temp-file cleanup.
  status="$(
    curl -sS \
      --connect-timeout 10 \
      --max-time 30 \
      -o "${response_file}" \
      -w '%{http_code}' \
      -H "Authorization: Bearer ${GH_TOKEN}" \
      -H "Accept: application/vnd.github+json" \
      "${api_base%/}/repos/${GITHUB_REPOSITORY}/pulls/${pr_number}"
  )" || true

  case "${status}" in
    200)
      # An unparsable body is a lookup failure, not an empty state.
      jq -r '.state' "${response_file}" || result="1"
      ;;
    404)
      printf 'missing'
      ;;
    *)
      echo "Failed to query PR #${pr_number} from GitHub API (HTTP ${status})." >&2
      cat "${response_file}" >&2
      result="1"
      ;;
  esac

  # Single cleanup point shared by every branch.
  rm -f "${response_file}"
  return "${result}"
}

#######################################
# Deletes a Railway environment by id, then calls
# railway_wait_for_environment_absent for the environment's name.
# Globals:
#   RAILWAY_PROJECT_ID (read)
# Arguments:
#   $1 - Railway environment id
#   $2 - Railway environment name (used for logging and the absence check)
# Returns:
#   1 when the delete request fails (the absence check is then skipped);
#   otherwise the status of railway_wait_for_environment_absent.
#######################################
delete_env_and_verify() {
  local env_id="$1"
  local env_name="$2"

  preview_log "Deleting stale Railway preview environment ${env_name}."
  # Check the delete explicitly: errexit is suppressed whenever this function
  # is called from a conditional (`if`, `||`, `&&`), and without the check a
  # failed delete would fall through to the wait below.
  if ! railway_environment_delete_by_id "${env_id}" >/dev/null; then
    echo "Failed to delete Railway environment ${env_name} (${env_id})." >&2
    return 1
  fi
  railway_wait_for_environment_absent "${RAILWAY_PROJECT_ID}" "${env_name}" 10 2
}
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟠 MAJOR: Deletion errors are silently ignored

Issue: railway_environment_delete_by_id return status is not checked. If deletion fails, the function proceeds to railway_wait_for_environment_absent which will eventually fail with a generic timeout message.

Why: When debugging deletion failures, operators see "Railway environment still exists after waiting for deletion" but won't know why (permission denied, API error, rate limit). The actual Railway API error is discarded.

Fix: Check the return status before proceeding:

delete_env_and_verify() {
  local env_id="$1"
  local env_name="$2"

  preview_log "Deleting stale Railway preview environment ${env_name}."
  if ! railway_environment_delete_by_id "${env_id}"; then
    echo "Failed to delete Railway environment ${env_name}." >&2
    return 1
  fi
  railway_wait_for_environment_absent "${RAILWAY_PROJECT_ID}" "${env_name}" 10 2
}

Refs:


# ---------------------------------------------------------------------------
# Main: compare every Railway `pr-<number>` environment with the state of its
# GitHub pull request, delete the stale ones, then report a summary.
# ---------------------------------------------------------------------------
ENVIRONMENTS_JSON="$(railway_project_environments_json "${RAILWAY_PROJECT_ID}")"
PR_ENVIRONMENTS_JSON="$(jq -c '[.[] | select(.name | test("^pr-[0-9]+$"))]' <<< "${ENVIRONMENTS_JSON}")"
PR_ENVIRONMENT_COUNT="$(jq 'length' <<< "${PR_ENVIRONMENTS_JSON}")"

if [ "${PR_ENVIRONMENT_COUNT}" = "0" ]; then
  preview_log "No Railway PR environments found in project ${RAILWAY_PROJECT_ID}."
  exit 0
fi

preview_log "Evaluating ${PR_ENVIRONMENT_COUNT} Railway PR environment(s) for stale state."

deleted=0
kept=0
unknown_seen=0
unknown_skipped=0
stale_targets=0
errors=0
deletion_failures=0
deleted_names=()
unknown_names=()

#######################################
# Deletes one targeted environment, or only reports it in dry-run mode, and
# updates the run counters.
# Globals:
#   DRY_RUN (read); stale_targets, deleted, deleted_names,
#   deletion_failures (written)
# Arguments:
#   $1 - Railway environment id
#   $2 - Railway environment name
#   $3 - adjective used in log lines ("stale" or "orphaned")
# Returns:
#   0 always; a failed deletion is counted instead of propagated so the
#   remaining environments are still processed.
#######################################
remove_or_report_env() {
  local env_id="$1"
  local env_name="$2"
  local description="$3"

  stale_targets=$((stale_targets + 1))
  if [ "${DRY_RUN}" = "true" ]; then
    preview_log "[dry-run] Would delete ${description} Railway preview environment ${env_name}."
    return 0
  fi

  # Under `set -e` an unguarded failure here would abort the whole run at the
  # first bad environment, skipping the rest of the loop and the summary.
  if delete_env_and_verify "${env_id}" "${env_name}"; then
    deleted=$((deleted + 1))
    deleted_names+=("${env_name}")
  else
    echo "Failed to delete ${description} Railway preview environment ${env_name}." >&2
    deletion_failures=$((deletion_failures + 1))
  fi
}

while IFS= read -r row; do
  if [ -z "${row}" ]; then
    continue
  fi

  env_id="$(jq -r '.id' <<< "${row}")"
  env_name="$(jq -r '.name' <<< "${row}")"
  # Names were filtered to `pr-<digits>`, so stripping the prefix leaves the
  # PR number.
  pr_number="${env_name#pr-}"

  pr_state="$(github_pr_state "${pr_number}")" || {
    errors=$((errors + 1))
    continue
  }

  case "${pr_state}" in
    open)
      kept=$((kept + 1))
      ;;
    closed)
      remove_or_report_env "${env_id}" "${env_name}" "stale"
      ;;
    missing)
      unknown_seen=$((unknown_seen + 1))
      unknown_names+=("${env_name}")
      if [ "${DELETE_UNKNOWN_PREVIEW_ENVS}" = "true" ]; then
        remove_or_report_env "${env_id}" "${env_name}" "orphaned"
      else
        unknown_skipped=$((unknown_skipped + 1))
        preview_log "Leaving Railway preview environment ${env_name} in place because GitHub PR lookup returned 404."
      fi
      ;;
    *)
      echo "Unexpected GitHub PR state for #${pr_number}: ${pr_state}" >&2
      errors=$((errors + 1))
      ;;
  esac
done < <(jq -rc '.[]' <<< "${PR_ENVIRONMENTS_JSON}")

if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then
  {
    echo "## Railway Preview Janitor"
    echo "- Dry run: \`${DRY_RUN}\`"
    echo "- Evaluated PR envs: \`${PR_ENVIRONMENT_COUNT}\`"
    echo "- Open PR envs kept: \`${kept}\`"
    echo "- Closed/orphaned PR envs targeted: \`${stale_targets}\`"
    echo "- Railway envs deleted: \`${deleted}\`"
    echo "- Railway deletion failures: \`${deletion_failures}\`"
    echo "- Unknown PR envs seen: \`${unknown_seen}\`"
    echo "- Unknown PR envs left in place: \`${unknown_skipped}\`"
    echo "- GitHub lookup errors: \`${errors}\`"
    # The arrays are only expanded when non-empty (guards `set -u` on older bash).
    if [ "${deleted}" -gt 0 ]; then
      echo "- Deleted envs: \`${deleted_names[*]}\`"
    fi
    if [ "${unknown_seen}" -gt 0 ]; then
      echo "- Unknown envs encountered: \`${unknown_names[*]}\`"
    fi
  } >> "${GITHUB_STEP_SUMMARY}"
fi

# Fail the job when any lookup or deletion did not succeed, so a green run
# means every targeted environment was handled.
if [ "${errors}" -gt 0 ] || [ "${deletion_failures}" -gt 0 ]; then
  echo "Encountered ${errors} GitHub lookup error(s) and ${deletion_failures} Railway deletion failure(s) during Railway preview janitor." >&2
  exit 1
fi
21 changes: 10 additions & 11 deletions .github/scripts/preview/cleanup-vercel-stale-env.sh
Original file line number Diff line number Diff line change
@@ -1,16 +1,11 @@
#!/usr/bin/env bash
#
# ONE-TIME CLEANUP SCRIPT
# Stale Vercel preview env cleanup.
#
# Created to resolve the Vercel envs_size_too_large error that blocked all
# preview deployments (March 2026). Branch-scoped preview env vars were never
# cleaned up on PR close, accumulating ~1,500+ stale vars across two Vercel
# projects until hitting Vercel's 64KB storage limit.
#
# Going forward, the teardown-vercel-preview-env.sh script (triggered by the
# teardown-vercel job in preview-environments.yml on PR close) prevents
# re-accumulation. This script only needs to be run once to clear the backlog,
# or again if stale vars accumulate due to teardown failures.
# This script is safe to run manually or from a scheduled janitor workflow.
# It removes branch-scoped preview-only env vars for branches that no longer
# have an open PR, which repairs missed close events and prevents preview env
# storage from filling up again.
#
# Usage:
# # Dry run (preview what would be deleted)
Expand Down Expand Up @@ -60,7 +55,11 @@ fi
#######################################
# Reports whether a branch currently has at least one open pull request.
# Globals:
#   GITHUB_REPOSITORY (read, optional) - passed to `gh` as --repo when set;
#                     otherwise `gh` picks the repository itself
# Arguments:
#   $1 - head branch name
# Returns:
#   0 when `gh pr list` reports one or more open PRs for the branch;
#   non-zero otherwise. A failed `gh` call falls back to a count of 0.
#######################################
branch_has_open_pr() {
  local branch="$1"
  local count=""
  local -a gh_args=(pr list)

  if [ -n "${GITHUB_REPOSITORY:-}" ]; then
    gh_args+=(--repo "${GITHUB_REPOSITORY}")
  fi
  gh_args+=(--head "${branch}" --state open --json number --jq 'length')

  # Query GitHub exactly once; the argument array keeps the two variants of
  # the command line from drifting apart.
  count="$(gh "${gh_args[@]}" 2>/dev/null || echo "0")"
  [ "${count:-0}" -gt 0 ]
}

Expand Down
Loading
Loading