Skip to content

Commit 0521245

Browse files
authored
hotfix: close frontend 500 incident with runtime and APIM safeguards (#198)
* hotfix: close frontend 500 incident with runtime and APIM safeguards
* hardening: prevent frontend 500 recurrence across infra and deploy
  - add deploy guardrails (ACR preflight, CRUD /ready gate, deterministic SWA selection)
  - align Helm render hooks and chart availability controls (strategy, spread, PDB)
  - stabilize CRUD Postgres auth env generation and defaults with explicit mode handling
  - harden UI API base-url fallback resolution and proxy error coverage
  - document incident hardening plan and runtime resilience updates
* fix: address PR review findings on readiness and proxy leakage
  - allow /ready postgres check to recover from stale startup init errors
  - add readiness recovery unit coverage for stale init-error path
  - sanitize proxy 502 payloads by removing raw upstream error detail
  - keep diagnostics server-side via structured error logging
* docs: finalize incident #198 architecture and guardrail documentation
  - add implementation-closure and operational guardrails summary
  - document readiness recovery follow-up and auth-mode contract
  - align UI proxy docs with sanitized error payload behavior
  - record roadmap closure status and residual hardening items
* hotfix: finalize truth-export compatibility and executive notebook updates
1 parent 99cd2a1 commit 0521245

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

54 files changed

+6655
-201
lines changed

.github/workflows/deploy-azd.yml

Lines changed: 451 additions & 19 deletions
Large diffs are not rendered by default.

.github/workflows/deploy-ui-swa.yml

Lines changed: 166 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -7,26 +7,114 @@ on:
77
description: Target GitHub environment
88
required: true
99
default: dev
10-
apiUrl:
11-
description: Public API base URL used by the UI build
10+
projectName:
11+
description: Project prefix used by naming convention
1212
required: true
13-
default: https://apim-holidaypeakhub405-dev.azure-api.net
13+
default: holidaypeakhub
14+
apiUrl:
15+
description: Optional API base URL override (must match APIM gateway URL for drift safety)
16+
required: false
17+
default: ''
1418

1519
permissions:
20+
id-token: write
1621
contents: read
1722

23+
concurrency:
24+
group: deploy-ui-swa-${{ inputs.environment }}
25+
cancel-in-progress: false
26+
1827
jobs:
1928
deploy-ui:
2029
runs-on: ubuntu-latest
2130
environment: ${{ inputs.environment }}
31+
env:
32+
AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }}
33+
AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }}
34+
AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
2235
steps:
2336
- uses: actions/checkout@v4
2437

38+
- name: Azure login (OIDC)
39+
uses: azure/login@v2
40+
with:
41+
client-id: ${{ env.AZURE_CLIENT_ID }}
42+
tenant-id: ${{ env.AZURE_TENANT_ID }}
43+
subscription-id: ${{ env.AZURE_SUBSCRIPTION_ID }}
44+
2545
- name: Setup Node.js
2646
uses: actions/setup-node@v4
2747
with:
2848
node-version: '20'
2949

50+
- name: Resolve and validate API URL
51+
id: api
52+
shell: bash
53+
run: |
54+
set -euo pipefail
55+
56+
RESOURCE_GROUP="${{ inputs.projectName }}-${{ inputs.environment }}-rg"
57+
58+
APIM_URL=$(az apim show \
59+
--name "${{ inputs.projectName }}-${{ inputs.environment }}-apim" \
60+
--resource-group "$RESOURCE_GROUP" \
61+
--query "gatewayUrl" -o tsv)
62+
63+
APIM_URL="$(echo "$APIM_URL" | xargs)"
64+
APIM_URL="${APIM_URL%/}"
65+
66+
if [ -z "$APIM_URL" ]; then
67+
echo "Failed to resolve APIM gateway URL for environment '${{ inputs.environment }}'." >&2
68+
exit 1
69+
fi
70+
71+
RESOLVED_API_URL="$APIM_URL"
72+
73+
if [ -n "${{ inputs.apiUrl }}" ]; then
74+
OVERRIDE_API_URL="$(echo "${{ inputs.apiUrl }}" | xargs)"
75+
OVERRIDE_API_URL="${OVERRIDE_API_URL%/}"
76+
77+
if [ "$OVERRIDE_API_URL" != "$APIM_URL" ]; then
78+
echo "apiUrl override drift detected. Provided URL does not match live APIM gateway URL." >&2
79+
echo "Provided: $OVERRIDE_API_URL" >&2
80+
echo "Expected: $APIM_URL" >&2
81+
exit 1
82+
fi
83+
84+
RESOLVED_API_URL="$OVERRIDE_API_URL"
85+
fi
86+
87+
if [ -z "$RESOLVED_API_URL" ]; then
88+
echo "Could not resolve API URL from APIM." >&2
89+
exit 1
90+
fi
91+
92+
echo "api_url=$RESOLVED_API_URL" >> "$GITHUB_OUTPUT"
93+
echo "crud_api_url=$RESOLVED_API_URL" >> "$GITHUB_OUTPUT"
94+
95+
- name: Smoke test APIM health before UI deploy
96+
shell: bash
97+
run: |
98+
set -euo pipefail
99+
100+
API_BASE_URL="${{ steps.api.outputs.api_url }}"
101+
API_BASE_URL="${API_BASE_URL%/}"
102+
103+
smoke_endpoint() {
104+
local url="$1"
105+
local label="$2"
106+
STATUS_CODE=$(curl -sS -o /tmp/ui-only-api-health.json -w "%{http_code}" "$url" || true)
107+
if [ "$STATUS_CODE" != "200" ]; then
108+
echo "$label check failed before UI deployment with HTTP $STATUS_CODE at $url" >&2
109+
cat /tmp/ui-only-api-health.json 2>/dev/null || true
110+
exit 1
111+
fi
112+
}
113+
114+
smoke_endpoint "${API_BASE_URL}/api/health" "APIM health"
115+
smoke_endpoint "${API_BASE_URL}/api/products?limit=1" "Products"
116+
smoke_endpoint "${API_BASE_URL}/api/categories" "Categories"
117+
30118
- name: Deploy UI to Azure Static Web Apps
31119
uses: Azure/static-web-apps-deploy@v1
32120
with:
@@ -37,5 +125,78 @@ jobs:
37125
skip_api_build: true
38126
app_build_command: yarn install --frozen-lockfile && yarn build
39127
env:
40-
NEXT_PUBLIC_API_URL: ${{ inputs.apiUrl }}
41-
NEXT_PUBLIC_CRUD_API_URL: ${{ inputs.apiUrl }}
128+
NEXT_PUBLIC_API_URL: ${{ steps.api.outputs.api_url }}
129+
NEXT_PUBLIC_CRUD_API_URL: ${{ steps.api.outputs.crud_api_url }}
130+
131+
- name: Smoke test UI host and API health after deploy
132+
shell: bash
133+
run: |
134+
set -euo pipefail
135+
136+
RESOURCE_GROUP="${{ inputs.projectName }}-${{ inputs.environment }}-rg"
137+
EXPECTED_SWA_NAME="${{ inputs.projectName }}-ui-${{ inputs.environment }}"
138+
SWA_NAME=""
139+
140+
if az staticwebapp show --name "$EXPECTED_SWA_NAME" --resource-group "$RESOURCE_GROUP" >/dev/null 2>&1; then
141+
SWA_NAME="$EXPECTED_SWA_NAME"
142+
else
143+
SWA_NAME=$(az staticwebapp list \
144+
--resource-group "$RESOURCE_GROUP" \
145+
--query "[?starts_with(defaultHostname, '${EXPECTED_SWA_NAME}.')].name | [0]" -o tsv)
146+
147+
if [ -z "$SWA_NAME" ]; then
148+
COUNT=$(az staticwebapp list --resource-group "$RESOURCE_GROUP" --query "length(@)" -o tsv)
149+
if [ "$COUNT" = "1" ]; then
150+
SWA_NAME=$(az staticwebapp list --resource-group "$RESOURCE_GROUP" --query "[0].name" -o tsv)
151+
fi
152+
fi
153+
fi
154+
155+
SWA_NAME="$(echo "$SWA_NAME" | xargs)"
156+
if [ -z "$SWA_NAME" ]; then
157+
echo "Could not deterministically resolve Static Web App in resource group $RESOURCE_GROUP." >&2
158+
echo "Expected name: $EXPECTED_SWA_NAME" >&2
159+
exit 1
160+
fi
161+
162+
SWA_HOSTNAME=$(az staticwebapp show \
163+
--name "$SWA_NAME" \
164+
--resource-group "$RESOURCE_GROUP" \
165+
--query "defaultHostname" -o tsv)
166+
167+
SWA_HOSTNAME="$(echo "$SWA_HOSTNAME" | xargs)"
168+
if [ -z "$SWA_HOSTNAME" ]; then
169+
echo "Could not resolve Static Web App hostname for $SWA_NAME." >&2
170+
exit 1
171+
fi
172+
173+
UI_URL="https://$SWA_HOSTNAME"
174+
STATUS_CODE=$(curl -sS -o /tmp/ui-only-homepage.html -w "%{http_code}" "$UI_URL" || true)
175+
if [ "$STATUS_CODE" != "200" ]; then
176+
echo "UI smoke test failed with HTTP $STATUS_CODE at $UI_URL" >&2
177+
exit 1
178+
fi
179+
180+
HEALTH_URL="${{ steps.api.outputs.api_url }}/api/health"
181+
STATUS_CODE=$(curl -sS -o /tmp/ui-only-api-health-post.json -w "%{http_code}" "$HEALTH_URL" || true)
182+
if [ "$STATUS_CODE" != "200" ]; then
183+
echo "API health check failed after UI deployment with HTTP $STATUS_CODE at $HEALTH_URL" >&2
184+
cat /tmp/ui-only-api-health-post.json 2>/dev/null || true
185+
exit 1
186+
fi
187+
188+
PRODUCTS_URL="${{ steps.api.outputs.api_url }}/api/products?limit=1"
189+
STATUS_CODE=$(curl -sS -o /tmp/ui-only-api-products-post.json -w "%{http_code}" "$PRODUCTS_URL" || true)
190+
if [ "$STATUS_CODE" != "200" ]; then
191+
echo "Products smoke check failed after UI deployment with HTTP $STATUS_CODE at $PRODUCTS_URL" >&2
192+
cat /tmp/ui-only-api-products-post.json 2>/dev/null || true
193+
exit 1
194+
fi
195+
196+
CATEGORIES_URL="${{ steps.api.outputs.api_url }}/api/categories"
197+
STATUS_CODE=$(curl -sS -o /tmp/ui-only-api-categories-post.json -w "%{http_code}" "$CATEGORIES_URL" || true)
198+
if [ "$STATUS_CODE" != "200" ]; then
199+
echo "Categories smoke check failed after UI deployment with HTTP $STATUS_CODE at $CATEGORIES_URL" >&2
200+
cat /tmp/ui-only-api-categories-post.json 2>/dev/null || true
201+
exit 1
202+
fi

.infra/DEPLOYMENT.md

Lines changed: 15 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -72,7 +72,14 @@ az account set --subscription <SUBSCRIPTION_ID>
7272

7373
Deploy a single shared resource stack, then deploy all services as AKS workloads.
7474

75-
**Deployment order**: Shared Infrastructure → Static Web App → CRUD Service → Agent Services
75+
**Deployment order**: Shared Infrastructure → CRUD Service + Agent Services → APIM Sync + APIM Smoke Gate → Static Web App
76+
77+
Release gate notes:
78+
79+
- UI deployment is blocked unless backend deployment jobs are `success` or `skipped`.
80+
- APIM gateway URL is propagated from `azd` outputs and checked against live APIM to catch config drift.
81+
- APIM smoke checks validate `GET /api/health`, `GET /api/products?limit=1`, and `GET /api/categories`, plus `GET /agents/<service>/health` for each changed agent, before UI publish.
82+
- UI deployment runs pre/post smoke checks to ensure API health and SWA hostname reachability.
7683

7784
### Step 1: Deploy Shared Infrastructure
7885

@@ -379,6 +386,13 @@ az deployment sub delete --name shared-infra-dev
379386

380387
## CI/CD Integration
381388

389+
Current workflow gate behavior:
390+
391+
- `.github/workflows/deploy-azd.yml` enforces backend and APIM readiness before `deploy-ui`.
392+
- `.github/workflows/deploy-azd.yml` fails on Foundry readiness check failures instead of warning-only.
393+
- `.github/workflows/deploy-ui-swa.yml` resolves APIM gateway URL from Azure and rejects mismatched manual `apiUrl` overrides.
394+
- `.github/workflows/deploy-ui-swa.yml` includes pre/post deployment smoke checks (`/api/health`, `/api/products?limit=1`, `/api/categories`, and SWA home page).
395+
382396
### GitHub Actions Workflow (recommended)
383397

384398
```yaml

.infra/azd/hooks/generate-crud-env.ps1

Lines changed: 28 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -90,7 +90,32 @@ $environment = First-Value -Map $values -Keys @('ENVIRONMENT', 'environment') -D
9090

9191
$postgresHost = First-Value -Map $values -Keys @('POSTGRES_HOST', 'postgresFqdn', 'POSTGRES_FQDN')
9292
$postgresDatabase = First-Value -Map $values -Keys @('POSTGRES_DATABASE', 'postgresDatabaseName') -DefaultValue 'holiday_peak_crud'
93-
$postgresUser = First-Value -Map $values -Keys @('POSTGRES_USER', 'postgresAdminUser') -DefaultValue 'crud_admin'
93+
$postgresAuthMode = First-Value -Map $values -Keys @('POSTGRES_AUTH_MODE', 'postgresAuthMode') -DefaultValue 'password'
94+
$postgresAdminUser = First-Value -Map $values -Keys @('POSTGRES_ADMIN_USER', 'postgresAdminUser') -DefaultValue 'crud_admin'
95+
$postgresUser = First-Value -Map $values -Keys @('POSTGRES_USER')
96+
if ($postgresAuthMode -eq 'password') {
97+
$postgresUser = $postgresAdminUser
98+
}
99+
elseif (-not $postgresUser) {
100+
$aksClusterName = First-Value -Map $values -Keys @('AZURE_AKS_CLUSTER_NAME', 'AKS_CLUSTER_NAME', 'aksClusterName')
101+
if ($aksClusterName) {
102+
$postgresUser = "$aksClusterName-agentpool"
103+
}
104+
else {
105+
$projectName = First-Value -Map $values -Keys @('projectName', 'PROJECT_NAME')
106+
if ($projectName) {
107+
if ($environment -eq 'prod') {
108+
$postgresUser = "$projectName-aks-agentpool"
109+
}
110+
else {
111+
$postgresUser = "$projectName-$environment-aks-agentpool"
112+
}
113+
}
114+
else {
115+
$postgresUser = "crud-$environment-aks-agentpool"
116+
}
117+
}
118+
}
94119

95120
$eventHubNamespace = Ensure-Suffix -Value (First-Value -Map $values -Keys @('EVENT_HUB_NAMESPACE', 'eventHubsNamespaceName')) -Suffix '.servicebus.windows.net'
96121
$keyVaultUri = First-Value -Map $values -Keys @('KEY_VAULT_URI', 'keyVaultUri')
@@ -119,9 +144,11 @@ LOG_LEVEL=INFO
119144
POSTGRES_HOST=$postgresHost
120145
POSTGRES_PORT=5432
121146
POSTGRES_DATABASE=$postgresDatabase
147+
POSTGRES_AUTH_MODE=$postgresAuthMode
122148
POSTGRES_USER=$postgresUser
123149
POSTGRES_PASSWORD=
124150
POSTGRES_PASSWORD_SECRET_NAME=postgres-admin-password
151+
POSTGRES_ENTRA_SCOPE=https://ossrdbms-aad.database.windows.net/.default
125152
POSTGRES_SSL=true
126153
127154
EVENT_HUB_NAMESPACE=$eventHubNamespace

.infra/azd/hooks/generate-crud-env.sh

Lines changed: 34 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -54,9 +54,39 @@ POSTGRES_DATABASE="$(get_val POSTGRES_DATABASE)"
5454
[ -z "$POSTGRES_DATABASE" ] && POSTGRES_DATABASE="$(get_val postgresDatabaseName)"
5555
[ -z "$POSTGRES_DATABASE" ] && POSTGRES_DATABASE="holiday_peak_crud"
5656

57+
POSTGRES_AUTH_MODE="$(get_val POSTGRES_AUTH_MODE)"
58+
[ -z "$POSTGRES_AUTH_MODE" ] && POSTGRES_AUTH_MODE="$(get_val postgresAuthMode)"
59+
[ -z "$POSTGRES_AUTH_MODE" ] && POSTGRES_AUTH_MODE="password"
60+
61+
POSTGRES_ADMIN_USER="$(get_val POSTGRES_ADMIN_USER)"
62+
[ -z "$POSTGRES_ADMIN_USER" ] && POSTGRES_ADMIN_USER="$(get_val postgresAdminUser)"
63+
[ -z "$POSTGRES_ADMIN_USER" ] && POSTGRES_ADMIN_USER="crud_admin"
64+
5765
POSTGRES_USER="$(get_val POSTGRES_USER)"
58-
[ -z "$POSTGRES_USER" ] && POSTGRES_USER="$(get_val postgresAdminUser)"
59-
[ -z "$POSTGRES_USER" ] && POSTGRES_USER="crud_admin"
66+
if [ "$POSTGRES_AUTH_MODE" = "password" ]; then
67+
POSTGRES_USER="$POSTGRES_ADMIN_USER"
68+
elif [ -z "$POSTGRES_USER" ]; then
69+
AKS_CLUSTER_NAME="$(get_val AZURE_AKS_CLUSTER_NAME)"
70+
[ -z "$AKS_CLUSTER_NAME" ] && AKS_CLUSTER_NAME="$(get_val AKS_CLUSTER_NAME)"
71+
[ -z "$AKS_CLUSTER_NAME" ] && AKS_CLUSTER_NAME="$(get_val aksClusterName)"
72+
73+
if [ -n "$AKS_CLUSTER_NAME" ]; then
74+
POSTGRES_USER="${AKS_CLUSTER_NAME}-agentpool"
75+
else
76+
PROJECT_NAME="$(get_val projectName)"
77+
[ -z "$PROJECT_NAME" ] && PROJECT_NAME="$(get_val PROJECT_NAME)"
78+
79+
if [ -n "$PROJECT_NAME" ]; then
80+
if [ "$ENVIRONMENT_VALUE" = "prod" ]; then
81+
POSTGRES_USER="${PROJECT_NAME}-aks-agentpool"
82+
else
83+
POSTGRES_USER="${PROJECT_NAME}-${ENVIRONMENT_VALUE}-aks-agentpool"
84+
fi
85+
else
86+
POSTGRES_USER="crud-${ENVIRONMENT_VALUE}-aks-agentpool"
87+
fi
88+
fi
89+
fi
6090

6191
EVENT_HUB_NAMESPACE="$(get_val EVENT_HUB_NAMESPACE)"
6292
[ -z "$EVENT_HUB_NAMESPACE" ] && EVENT_HUB_NAMESPACE="$(get_val eventHubsNamespaceName)"
@@ -103,9 +133,11 @@ LOG_LEVEL=INFO
103133
POSTGRES_HOST=$POSTGRES_HOST
104134
POSTGRES_PORT=5432
105135
POSTGRES_DATABASE=$POSTGRES_DATABASE
136+
POSTGRES_AUTH_MODE=$POSTGRES_AUTH_MODE
106137
POSTGRES_USER=$POSTGRES_USER
107138
POSTGRES_PASSWORD=
108139
POSTGRES_PASSWORD_SECRET_NAME=postgres-admin-password
140+
POSTGRES_ENTRA_SCOPE=https://ossrdbms-aad.database.windows.net/.default
109141
POSTGRES_SSL=true
110142
111143
EVENT_HUB_NAMESPACE=$EVENT_HUB_NAMESPACE

.infra/azd/hooks/render-helm.ps1

Lines changed: 40 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -12,8 +12,21 @@ $ingressClassName = if ($env:INGRESS_CLASS_NAME) { $env:INGRESS_CLASS_NAME } els
1212
$canaryEnabled = if ($env:CANARY_ENABLED) { $env:CANARY_ENABLED } else { "false" }
1313
$readinessPath = "/ready"
1414

15+
$nodePool = "agents"
16+
$workloadType = "agents"
17+
$pdbEnabled = "false"
18+
$pdbMinAvailable = ""
19+
$maxUnavailable = ""
20+
$maxSurge = ""
21+
1522
if ($ServiceName -eq "crud-service") {
16-
$readinessPath = "/health"
23+
$nodePool = "crud"
24+
$workloadType = "crud"
25+
# Safer defaults for CRUD without changing global chart defaults.
26+
$pdbEnabled = "true"
27+
$pdbMinAvailable = "1"
28+
$maxUnavailable = "0"
29+
$maxSurge = "1"
1730
}
1831

1932
$serviceImageVarName = "SERVICE_$($ServiceName.ToUpper().Replace('-', '_'))_IMAGE_NAME"
@@ -59,9 +72,34 @@ $helmArgs = @(
5972
'--set',
6073
"canary.enabled=$canaryEnabled",
6174
'--set',
62-
"probes.readiness.path=$readinessPath"
75+
"probes.readiness.path=$readinessPath",
76+
'--set',
77+
"nodeSelector.agentpool=$nodePool",
78+
'--set',
79+
"tolerations[0].key=workload",
80+
'--set',
81+
"tolerations[0].operator=Equal",
82+
'--set',
83+
"tolerations[0].value=$workloadType",
84+
'--set',
85+
"tolerations[0].effect=NoSchedule"
6386
)
6487

88+
if ($maxUnavailable) {
89+
$helmArgs += @('--set-string', "availability.rollingUpdate.maxUnavailable=$maxUnavailable")
90+
}
91+
92+
if ($maxSurge) {
93+
$helmArgs += @('--set-string', "availability.rollingUpdate.maxSurge=$maxSurge")
94+
}
95+
96+
if ($pdbEnabled -eq "true") {
97+
$helmArgs += @('--set', "pdb.enabled=true")
98+
if ($pdbMinAvailable) {
99+
$helmArgs += @('--set-string', "pdb.minAvailable=$pdbMinAvailable")
100+
}
101+
}
102+
65103
$envMappings = @{
66104
# Database
67105
POSTGRES_HOST = $env:POSTGRES_HOST

0 commit comments

Comments
 (0)