Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 64 additions & 0 deletions .github/scripts/preview/bootstrap-preview-auth.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#!/usr/bin/env bash
# Bootstraps auth for a PR preview environment.
#
# Flow: validate required inputs, resolve RUN_DB_URL / SPICEDB_ENDPOINT (taken
# from the environment when provided, otherwise read from the Railway service
# named by RAILWAY_OUTPUT_SERVICE), export the values under the names the pnpm
# scripts read, then run the runtime migrations followed by the auth init.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=.github/scripts/preview/common.sh
source "${SCRIPT_DIR}/common.sh"

# Inputs that are needed regardless of where the DB URL / endpoint come from.
require_env_vars \
  API_URL \
  SPICEDB_PRESHARED_KEY \
  INKEEP_AGENTS_MANAGE_UI_USERNAME \
  INKEEP_AGENTS_MANAGE_UI_PASSWORD \
  BETTER_AUTH_SECRET

# NOTE(review): mask_env_vars is defined in common.sh; presumably it registers
# the values for log masking. RUN_DB_URL / SPICEDB_ENDPOINT may still be unset here.
mask_env_vars RUN_DB_URL SPICEDB_ENDPOINT SPICEDB_PRESHARED_KEY INKEEP_AGENTS_MANAGE_UI_PASSWORD BETTER_AUTH_SECRET

# Fall back to Railway only for whichever of the two values was not supplied.
if [[ -z "${RUN_DB_URL:-}" || -z "${SPICEDB_ENDPOINT:-}" ]]; then
  require_env_vars \
    RAILWAY_API_TOKEN \
    RAILWAY_PROJECT_ID \
    RAILWAY_OUTPUT_SERVICE \
    RAILWAY_RUN_DB_URL_KEY \
    RAILWAY_SPICEDB_ENDPOINT_KEY \
    PR_NUMBER

  RAILWAY_ENV_NAME="$(pr_env_name "${PR_NUMBER}")"

  railway_link_service "${RAILWAY_PROJECT_ID}" "${RAILWAY_OUTPUT_SERVICE}" "${RAILWAY_ENV_NAME}"

  [[ -n "${RUN_DB_URL:-}" ]] ||
    RUN_DB_URL="$(railway_extract_runtime_var "${RAILWAY_OUTPUT_SERVICE}" "${RAILWAY_ENV_NAME}" "${RAILWAY_RUN_DB_URL_KEY}")"

  [[ -n "${SPICEDB_ENDPOINT:-}" ]] ||
    SPICEDB_ENDPOINT="$(railway_extract_runtime_var "${RAILWAY_OUTPUT_SERVICE}" "${RAILWAY_ENV_NAME}" "${RAILWAY_SPICEDB_ENDPOINT_KEY}")"

  # Mask again now that the values fetched from Railway are populated.
  mask_env_vars RUN_DB_URL SPICEDB_ENDPOINT
fi

require_env_vars RUN_DB_URL SPICEDB_ENDPOINT

# Hand the resolved values to the pnpm scripts through the environment.
INKEEP_AGENTS_API_URL="${API_URL}"
INKEEP_AGENTS_RUN_DATABASE_URL="${RUN_DB_URL}"
TENANT_ID="${TENANT_ID:-default}"
export INKEEP_AGENTS_API_URL INKEEP_AGENTS_RUN_DATABASE_URL SPICEDB_ENDPOINT TENANT_ID

printf '%s\n' "::group::Run preview runtime migrations"
pnpm db:run:migrate
printf '%s\n' "::endgroup::"

printf '%s\n' "::group::Initialize preview auth"
pnpm db:auth:init
printf '%s\n' "::endgroup::"
Comment on lines +48 to +54
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💭 Consider: Add explicit timeouts to migration and auth init commands

Issue: The pnpm db:run:migrate and pnpm db:auth:init commands run without explicit timeouts. If a database connection hangs, the job will wait until the 20-minute job timeout.

Why: Database migrations during preview bootstrapping may encounter transient connection issues to Railway-hosted Postgres. Explicit timeouts provide faster feedback and clearer failure attribution.

Fix: Consider wrapping with explicit timeouts:

echo "::group::Run preview runtime migrations"
timeout 300 pnpm db:run:migrate
echo "::endgroup::"

echo "::group::Initialize preview auth"
timeout 180 pnpm db:auth:init
echo "::endgroup::"

This is optional — the job timeout is a backstop, but explicit command timeouts give faster feedback.


# Append a short bootstrap report to the step summary file, when
# GITHUB_STEP_SUMMARY is set to a non-empty path.
if [[ -n "${GITHUB_STEP_SUMMARY:-}" ]]; then
  printf '%s\n' \
    "## Preview Auth Bootstrap" \
    "- Tenant: \`${TENANT_ID}\`" \
    "- Admin email: \`${INKEEP_AGENTS_MANAGE_UI_USERNAME}\`" \
    "- Runtime migrations: \`pnpm db:run:migrate\`" \
    "- Auth seed: \`pnpm db:auth:init\`" \
    >> "${GITHUB_STEP_SUMMARY}"
fi
46 changes: 45 additions & 1 deletion .github/scripts/preview/capture-preview-failure-diagnostics.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,48 @@ echo "## Smoke Failure Diagnostics" >> "${GITHUB_STEP_SUMMARY}"
echo
echo "### UI response"
curl --connect-timeout 5 --max-time 15 -I -sS "${UI_URL}" || true
} | tee /tmp/preview-smoke-diagnostics.txt
echo
if [ -n "${INKEEP_AGENTS_MANAGE_UI_USERNAME:-}" ] && [ -n "${INKEEP_AGENTS_MANAGE_UI_PASSWORD:-}" ]; then
tenant_id="${TENANT_ID:-default}"
tmpdir="$(mktemp -d)"
trap 'rm -rf "${tmpdir}"' EXIT

cookie_jar="${tmpdir}/cookies.txt"
sign_in_headers="${tmpdir}/sign-in-headers.txt"
sign_in_body="${tmpdir}/sign-in-body.txt"
manage_body="${tmpdir}/manage-projects-body.txt"

sign_in_status="$(
curl --connect-timeout 5 --max-time 20 -sS \
-c "${cookie_jar}" \
-D "${sign_in_headers}" \
-o "${sign_in_body}" \
-w '%{http_code}' \
-H 'Content-Type: application/json' \
-H "Origin: ${UI_URL}" \
-d "$(jq -cn \
--arg email "${INKEEP_AGENTS_MANAGE_UI_USERNAME}" \
--arg password "${INKEEP_AGENTS_MANAGE_UI_PASSWORD}" \
'{email:$email, password:$password}')" \
"${API_URL}/api/auth/sign-in/email" || true
)"

echo "### API sign-in response (${sign_in_status})"
cat "${sign_in_headers}"
cat "${sign_in_body}"
echo

manage_status="$(
curl --connect-timeout 5 --max-time 20 -sS \
-b "${cookie_jar}" \
-o "${manage_body}" \
-w '%{http_code}' \
-H 'Accept: application/json' \
"${API_URL}/manage/tenants/${tenant_id}/projects" || true
)"

echo "### Authenticated manage/projects response (${manage_status})"
cat "${manage_body}"
echo
fi
} | redact_preview_logs | tee /tmp/preview-smoke-diagnostics.txt
203 changes: 203 additions & 0 deletions .github/scripts/preview/common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,57 @@ railway_env_exists_count() {
"${output_path}"
}

#######################################
# Link the Railway CLI to one project / service / environment.
# Arguments: $1 - Railway project ID
#            $2 - service name
#            $3 - environment name
# Outputs:   the CLI's stdout is discarded; on failure a one-line error is
#            written to stderr
# Returns:   0 when `railway link` succeeds, 1 otherwise
#######################################
railway_link_service() {
  local target_project="$1"
  local target_service="$2"
  local target_env="$3"
  local -a link_args=(
    --project "${target_project}"
    --service "${target_service}"
    --environment "${target_env}"
  )

  # Happy path first: nothing more to do once the link succeeds.
  railway link "${link_args[@]}" >/dev/null && return 0

  echo "Failed to link Railway CLI to project ${target_project} service ${target_service} env ${target_env}." >&2
  return 1
}

#######################################
# Read one variable from a Railway service, polling until it holds a usable value.
# Arguments: $1 - service name
#            $2 - environment name
#            $3 - variable key to look up in the `railway variable list --json` output
#            $4 - maximum number of attempts (default 20)
#            $5 - seconds to sleep between attempts (default 2)
# Outputs:   the value on stdout (no trailing newline) on success;
#            an explanatory message on stderr on failure
# Returns:   0 once a non-empty value without a literal `${{` is seen, 1 otherwise
#######################################
railway_extract_runtime_var() {
local service="$1"
local env_name="$2"
local key="$3"
local max_attempts="${4:-20}"
local sleep_seconds="${5:-2}"
local attempt=""
local value=""

# Each attempt re-lists the variables; `// empty` makes a missing key yield "".
for attempt in $(seq 1 "${max_attempts}"); do
value="$(
railway variable list \
--service "${service}" \
--environment "${env_name}" \
--json |
jq -r --arg key "${key}" '.[$key] // empty'
)"

# Accept only a non-empty value that contains no literal `${{` sequence
# (reported below as still awaiting Railway interpolation).
if [ -n "${value}" ] && ! printf '%s' "${value}" | grep -q '\$[{][{]'; then
printf '%s' "${value}"
return 0
fi

# Skip the sleep after the final attempt.
if [ "${attempt}" -lt "${max_attempts}" ]; then
sleep "${sleep_seconds}"
fi
done
Comment on lines +100 to +103
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟠 MAJOR: Retry without jitter causes thundering herd

Issue: The retry loops in railway_extract_runtime_var and railway_ensure_tcp_proxy use fixed sleep intervals without jitter.

Why: When multiple concurrent PRs experience Railway variable resolution delays, they'll all retry at synchronized intervals, creating a thundering herd pattern that can overwhelm Railway's API and cause cascading timeouts.

Fix: Add randomized jitter to the sleep duration:

    if [ "${attempt}" -lt "${max_attempts}" ]; then
      # Add jitter: sleep_seconds * (0.5 to 1.5)
      jittered_sleep=$(awk "BEGIN {srand(); print ${sleep_seconds} * (0.5 + rand())}")
      sleep "${jittered_sleep}"
    fi

Refs:


# Attempts exhausted: report whether the key was absent or still unresolved.
if [ -z "${value:-}" ]; then
echo "Missing runtime variable ${key} in Railway service ${service} for env ${env_name}." >&2
else
echo "Runtime variable ${key} is unresolved (${value}) after waiting for Railway interpolation." >&2
fi
return 1
}

mask_env_vars() {
local var_name
for var_name in "$@"; do
Expand All @@ -53,9 +104,161 @@ mask_env_vars() {
done
}

#######################################
# POST a GraphQL document to the Railway backboard API.
# Globals:   RAILWAY_API_TOKEN (read) - sent as a Bearer token
# Arguments: $1 - GraphQL query or mutation text
# Outputs:   the raw response body on stdout
# Returns:   curl's exit status (-f makes HTTP error statuses fail)
#######################################
railway_graphql() {
  local query="$1"
  local body=""
  local -a request=()

  # jq does the JSON string escaping of the query text.
  body="$(jq -nc --arg query "${query}" '{query: $query}')"

  # NOTE(review): browser-like User-Agent/Origin/Referer headers are sent; the
  # reason is not visible in this file.
  request=(
    --connect-timeout 10 --max-time 30 -fsS
    -H "Content-Type: application/json"
    -H "Authorization: Bearer ${RAILWAY_API_TOKEN}"
    -H "User-Agent: Mozilla/5.0"
    -H "Origin: https://railway.com"
    -H "Referer: https://railway.com/"
    -d "${body}"
    https://backboard.railway.com/graphql/v2
  )

  curl "${request[@]}"
}

#######################################
# Look up the ID of a named environment within a Railway project.
# Arguments: $1 - Railway project ID
#            $2 - environment name to match exactly
# Outputs:   the ID of each environment whose name matches, one per line, on
#            stdout; nothing when no environment matches
# Returns:   exit status of the final jq invocation
#######################################
railway_environment_id() {
  local project_id="$1"
  local env_name="$2"
  local gql=""
  local reply=""

  # Build the query text first, then send it.
  gql="$(cat <<EOF
query {
environments(projectId: "${project_id}") {
edges {
node {
id
name
}
}
}
}
EOF
)"
  reply="$(railway_graphql "${gql}")"

  jq -r --arg env_name "${env_name}" '.data.environments.edges[] | select(.node.name == $env_name) | .node.id' <<< "${reply}"
}

#######################################
# Look up the service ID of a named service instance within an environment.
# Arguments: $1 - Railway environment ID
#            $2 - service name to match exactly
# Outputs:   the serviceId of each matching service instance, one per line, on
#            stdout; nothing when no instance matches
# Returns:   exit status of the final jq invocation
#######################################
railway_service_id_for_env() {
  local env_id="$1"
  local service_name="$2"
  local gql=""
  local reply=""

  # Build the query text first, then send it.
  gql="$(cat <<EOF
query {
environment(id: "${env_id}") {
serviceInstances {
edges {
node {
serviceId
serviceName
}
}
}
}
}
EOF
)"
  reply="$(railway_graphql "${gql}")"

  jq -r --arg service_name "${service_name}" '.data.environment.serviceInstances.edges[] | select(.node.serviceName == $service_name) | .node.serviceId' <<< "${reply}"
}

#######################################
# Make sure a TCP proxy exists for a service's application port and wait for it
# to report syncStatus ACTIVE.
# Arguments: $1 - Railway project ID
#            $2 - environment name
#            $3 - service name
#            $4 - application port (passed to jq via --argjson, so it must be
#                 valid JSON, e.g. a bare number)
#            $5 - maximum number of polling attempts (default 30)
#            $6 - seconds to sleep between attempts (default 2)
# Outputs:   error messages on stderr
# Returns:   0 once a proxy for the port is ACTIVE; 1 when the environment or
#            service ID cannot be resolved, or the proxy never becomes ACTIVE
#######################################
railway_ensure_tcp_proxy() {
local project_id="$1"
local env_name="$2"
local service_name="$3"
local application_port="$4"
local max_attempts="${5:-30}"
local sleep_seconds="${6:-2}"
local env_id=""
local service_id=""
local response=""
local count=""
local active=""
local attempt=""

# Resolve names to the IDs the GraphQL API takes.
env_id="$(railway_environment_id "${project_id}" "${env_name}")"
if [ -z "${env_id}" ]; then
echo "Unable to resolve Railway environment ID for ${env_name}." >&2
return 1
fi

service_id="$(railway_service_id_for_env "${env_id}" "${service_name}")"
if [ -z "${service_id}" ]; then
echo "Unable to resolve Railway service ID for ${service_name} in ${env_name}." >&2
return 1
fi

# List the existing proxies for this service in this environment.
response="$(
railway_graphql "$(cat <<EOF
query {
tcpProxies(environmentId: "${env_id}", serviceId: "${service_id}") {
id
domain
proxyPort
applicationPort
syncStatus
}
}
EOF
)"
)"

# Create a proxy only when none exists for the requested port; the mutation's
# response body is discarded.
count="$(jq -r --argjson application_port "${application_port}" '[.data.tcpProxies[] | select(.applicationPort == $application_port)] | length' <<< "${response}")"
if [ "${count}" = "0" ]; then
railway_graphql "$(cat <<EOF
mutation {
tcpProxyCreate(input: {
environmentId: "${env_id}"
serviceId: "${service_id}"
applicationPort: ${application_port}
}) {
id
}
}
EOF
)" >/dev/null
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 Minor: TCP proxy creation mutation discards error response

Issue: The tcpProxyCreate mutation response is redirected to /dev/null, discarding any error information.

Why: If the mutation fails (e.g., quota exceeded, invalid parameters), the error details are lost. The subsequent polling loop will timeout after ~60s, but without the creation error context, debugging becomes difficult.

Fix: Capture the mutation response and check for GraphQL errors:

  if [ "${count}" = "0" ]; then
    mutation_response="$(railway_graphql "$(cat <<EOF
mutation {
  tcpProxyCreate(input: {
    environmentId: "${env_id}"
    serviceId: "${service_id}"
    applicationPort: ${application_port}
  }) {
    id
  }
}
EOF
)")"
    if echo "${mutation_response}" | jq -e '.errors' >/dev/null 2>&1; then
      echo "Failed to create TCP proxy: $(echo "${mutation_response}" | jq -r '.errors[0].message')" >&2
      return 1
    fi
  fi

fi

# Poll until at least one proxy for the port reports syncStatus ACTIVE.
for attempt in $(seq 1 "${max_attempts}"); do
response="$(
railway_graphql "$(cat <<EOF
query {
tcpProxies(environmentId: "${env_id}", serviceId: "${service_id}") {
applicationPort
syncStatus
}
}
EOF
)"
)"

active="$(jq -r --argjson application_port "${application_port}" '[.data.tcpProxies[] | select(.applicationPort == $application_port and .syncStatus == "ACTIVE")] | length' <<< "${response}")"
if [ "${active}" != "0" ]; then
return 0
fi

# Skip the sleep after the final attempt.
if [ "${attempt}" -lt "${max_attempts}" ]; then
sleep "${sleep_seconds}"
fi
done

echo "TCP proxy for ${service_name} in ${env_name} did not become ACTIVE." >&2
return 1
}

#######################################
# Filter that redacts credentials from diagnostic output.
# Reads stdin, writes the redacted text to stdout. Replaces, in order:
#   1. everything after postgres:// or postgresql:// up to the next whitespace
#   2. the value following an upper-case NAME containing SECRET, KEY, TOKEN or
#      PASSWORD and a ':' or '=' separator
#   3. the value of a better-auth cookie on a Set-Cookie header line
#   4. the value of any better-auth.<name>= pair
#   5. the token following "Bearer "
# Arguments: none
# Returns:   sed's exit status
#######################################
redact_preview_logs() {
  # The value class in rule 2 is [^[:space:]] rather than [^\r\n[:space:]]:
  # outside GNU sed's escape extension, "\r\n" inside a bracket expression means
  # the literal characters '\', 'r' and 'n', which would end the match at the
  # first 'r' or 'n' of a secret and leave the rest unredacted. [:space:]
  # already covers CR and LF, so matching on GNU sed is unchanged.
  sed -E \
    -e 's#(postgres(ql)?://)[^[:space:]]+#\1[REDACTED]#g' \
    -e 's#([A-Z_]*(SECRET|KEY|TOKEN|PASSWORD)[A-Z_]*[:=])[^[:space:]]+#\1[REDACTED]#g' \
    -e 's#((s|S)et-(c|C)ookie:[[:space:]]*better-auth[^=]*=)[^;[:space:]]+#\1[REDACTED]#g' \
    -e 's#(better-auth\.[^=]+=)[^;[:space:]]+#\1[REDACTED]#g' \
    -e 's#(Bearer )[A-Za-z0-9._-]+#\1[REDACTED]#g'
}
Loading
Loading