Skip to content

Commit 5c3e4bf

Browse files
authored
bug: harden deploy-azd ingress readiness and class detection (#196)
* bug: harden deploy-azd ingress readiness and class detection
* fix: restore valid detect-changes heredoc indentation
* fix: remediate non-prod infra drift before azd provision
* fix: recover soft-deleted key vault before purge fallback
* fix: avoid key vault location drift conflicts in non-prod preflight
* fix: ensure deploy principal has AKS RBAC for postprovision hooks
* fix: assign AKS RBAC at RG scope for first-run provisioning
1 parent 508cca9 commit 5c3e4bf

File tree

6 files changed

+240
-48
lines changed

6 files changed

+240
-48
lines changed

.github/workflows/deploy-azd.yml

Lines changed: 219 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -75,46 +75,46 @@ jobs:
7575
7676
CHANGED_FILES=$(git diff --name-only "origin/$DEFAULT_BRANCH...HEAD")
7777
78-
mapfile -t AGENT_SERVICES < <(python3 - <<'PY'
79-
import re
80-
81-
with open('azure.yaml', encoding='utf-8') as f:
82-
lines = f.readlines()
83-
84-
in_services = False
85-
current_service = None
86-
current_host = None
87-
services = []
88-
89-
for raw in lines:
90-
line = raw.rstrip('\n')
91-
if not in_services:
92-
if re.match(r'^services:\s*$', line):
93-
in_services = True
94-
continue
95-
96-
if re.match(r'^[^\s]', line):
97-
break
98-
99-
service_match = re.match(r'^ ([a-z0-9\-]+):\s*$', line)
100-
if service_match:
101-
if current_service and current_host == 'aks' and current_service != 'crud-service':
102-
services.append(current_service)
103-
current_service = service_match.group(1)
104-
current_host = None
105-
continue
106-
107-
host_match = re.match(r'^ host:\s*(\S+)\s*$', line)
108-
if host_match:
109-
current_host = host_match.group(1)
110-
111-
if current_service and current_host == 'aks' and current_service != 'crud-service':
112-
services.append(current_service)
113-
114-
for service in services:
115-
print(service)
116-
PY
117-
)
78+
mapfile -t AGENT_SERVICES < <(python3 - <<'PY'
79+
import re
80+
81+
with open('azure.yaml', encoding='utf-8') as f:
82+
lines = f.readlines()
83+
84+
in_services = False
85+
current_service = None
86+
current_host = None
87+
services = []
88+
89+
for raw in lines:
90+
line = raw.rstrip('\n')
91+
if not in_services:
92+
if re.match(r'^services:\s*$', line):
93+
in_services = True
94+
continue
95+
96+
if re.match(r'^[^\s]', line):
97+
break
98+
99+
service_match = re.match(r'^ ([a-z0-9\-]+):\s*$', line)
100+
if service_match:
101+
if current_service and current_host == 'aks' and current_service != 'crud-service':
102+
services.append(current_service)
103+
current_service = service_match.group(1)
104+
current_host = None
105+
continue
106+
107+
host_match = re.match(r'^ host:\s*(\S+)\s*$', line)
108+
if host_match:
109+
current_host = host_match.group(1)
110+
111+
if current_service and current_host == 'aks' and current_service != 'crud-service':
112+
services.append(current_service)
113+
114+
for service in services:
115+
print(service)
116+
PY
117+
)
118118
119119
CRUD_CHANGED=false
120120
if echo "$CHANGED_FILES" | grep -Eq '^apps/crud-service/'; then
@@ -291,6 +291,134 @@ jobs:
291291
azd env set K8S_NAMESPACE holiday-peak -e "${{ inputs.environment }}"
292292
azd env set KEDA_ENABLED false -e "${{ inputs.environment }}"
293293
294+
- name: Ensure deploy principal has AKS RBAC Cluster Admin
295+
shell: bash
296+
run: |
297+
set -euo pipefail

# Prints the number of "Azure Kubernetes Service RBAC Cluster Admin" role
# assignments the deploy principal holds at the resource-group scope.
# Reads RG_ID and DEPLOY_SP_OBJECT_ID, which are set below before first use.
count_cluster_admin_assignments() {
  az role assignment list \
    --scope "$RG_ID" \
    --assignee-object-id "$DEPLOY_SP_OBJECT_ID" \
    --query "[?roleDefinitionName=='Azure Kubernetes Service RBAC Cluster Admin'] | length(@)" -o tsv
}

AKS_RG="${{ inputs.projectName }}-${{ inputs.environment }}-rg"
DEPLOY_SP_OBJECT_ID=$(az ad sp show --id "${AZURE_CLIENT_ID}" --query id -o tsv)
RG_ID=$(az group show --name "$AKS_RG" --query id -o tsv)

# Create the assignment only when the principal does not already hold it.
EXISTING=$(count_cluster_admin_assignments)
if [ "$EXISTING" != "0" ]; then
  echo "Deploy principal already has Azure Kubernetes Service RBAC Cluster Admin at RG scope $AKS_RG."
else
  echo "Assigning Azure Kubernetes Service RBAC Cluster Admin to deploy principal at RG scope $AKS_RG."
  az role assignment create \
    --assignee-object-id "$DEPLOY_SP_OBJECT_ID" \
    --assignee-principal-type ServicePrincipal \
    --role "Azure Kubernetes Service RBAC Cluster Admin" \
    --scope "$RG_ID"
fi

# Re-query so the step fails here if the assignment is still not listed.
VERIFIED=$(count_cluster_admin_assignments)
if [ "$VERIFIED" = "0" ]; then
  echo "Failed to verify Azure Kubernetes Service RBAC Cluster Admin assignment for deploy principal at scope $AKS_RG."
  exit 1
fi
328+
329+
- name: Preflight drift remediation (non-prod)
330+
if: ${{ inputs.environment != 'prod' && inputs.environment != 'production' }}
331+
shell: bash
332+
run: |
333+
set -euo pipefail

# Non-prod preflight: remediates the drift states handled below before the
# provision step runs:
#   * Key Vault exists but in a different location  -> set a name override
#   * Key Vault is soft-deleted                     -> recover, else purge
#   * PostgreSQL Flexible Server is stopped         -> start and wait for Ready

RG_NAME="${{ inputs.projectName }}-${{ inputs.environment }}-rg"
KEY_VAULT_NAME="${{ inputs.projectName }}-${{ inputs.environment }}-kv"
POSTGRES_SERVER_NAME="${{ inputs.projectName }}-${{ inputs.environment }}-postgres"
DESIRED_LOCATION="$(echo "${{ inputs.location }}" | tr '[:upper:]' '[:lower:]')"

echo "Running non-prod preflight drift remediation for environment '${{ inputs.environment }}'."

# Remediate Key Vault location drift and soft-deleted name conflicts that can
# block azd provision. A single `show` call serves as both the existence check
# and the location lookup.
if KV_LOCATION_RAW=$(az keyvault show \
  --name "$KEY_VAULT_NAME" \
  --resource-group "$RG_NAME" \
  --query location -o tsv 2>/dev/null); then
  CURRENT_KV_LOCATION=$(echo "$KV_LOCATION_RAW" | tr '[:upper:]' '[:lower:]')

  if [ "$CURRENT_KV_LOCATION" != "$DESIRED_LOCATION" ]; then
    # Build "<project>-<env>-kv-<loc4>", lower-cased and truncated to 24 chars.
    LOCATION_SUFFIX=$(echo "$DESIRED_LOCATION" | tr -cd '[:alnum:]' | cut -c1-4)
    KEY_VAULT_OVERRIDE=$(echo "${{ inputs.projectName }}-${{ inputs.environment }}-kv-${LOCATION_SUFFIX}" | tr '[:upper:]' '[:lower:]' | cut -c1-24)
    # Truncation can leave a dangling hyphen; strip one trailing '-'.
    KEY_VAULT_OVERRIDE="${KEY_VAULT_OVERRIDE%-}"

    echo "Key Vault location mismatch detected for $KEY_VAULT_NAME ($CURRENT_KV_LOCATION vs $DESIRED_LOCATION)."
    echo "Setting keyVaultNameOverride to $KEY_VAULT_OVERRIDE for this deployment."
    azd env set keyVaultNameOverride "$KEY_VAULT_OVERRIDE" -e "${{ inputs.environment }}"
  else
    echo "Key Vault $KEY_VAULT_NAME already exists in $RG_NAME with matching location $CURRENT_KV_LOCATION."
  fi
else
  # No live vault in the resource group: look for a soft-deleted one holding the name.
  DELETED_KV_LOCATION=$(az keyvault list-deleted \
    --query "[?name=='$KEY_VAULT_NAME'] | [0].properties.location" \
    -o tsv)

  if [ -z "$DELETED_KV_LOCATION" ]; then
    echo "No soft-deleted Key Vault conflict found for $KEY_VAULT_NAME."
  else
    echo "Recovering soft-deleted Key Vault $KEY_VAULT_NAME."
    if az keyvault recover --name "$KEY_VAULT_NAME" >/dev/null 2>&1; then
      echo "Recovery initiated for $KEY_VAULT_NAME."

      # Wait (up to 20 x 10s) for the recovered vault to become visible.
      # Only the recovery path polls: a purged vault will never reappear.
      KV_AVAILABLE=false
      for _ in {1..20}; do
        if az keyvault show --name "$KEY_VAULT_NAME" --resource-group "$RG_NAME" >/dev/null 2>&1; then
          echo "Key Vault $KEY_VAULT_NAME is now available in $RG_NAME."
          KV_AVAILABLE=true
          break
        fi
        sleep 10
      done

      # Best-effort: keep going as before, but make the timeout visible in the log.
      if [ "$KV_AVAILABLE" != "true" ]; then
        echo "Warning: Key Vault $KEY_VAULT_NAME was not visible in $RG_NAME after recovery; continuing." >&2
      fi
    else
      echo "Recovery failed for $KEY_VAULT_NAME. Attempting purge fallback from $DELETED_KV_LOCATION."
      az keyvault purge --name "$KEY_VAULT_NAME" --location "$DELETED_KV_LOCATION"
      echo "Purged soft-deleted Key Vault $KEY_VAULT_NAME."
    fi
  fi
fi

# Start stopped PostgreSQL Flexible Server so provisioning can reconcile state.
# A failing `show` is treated as "server does not exist yet".
if POSTGRES_STATE=$(az postgres flexible-server show \
  --resource-group "$RG_NAME" \
  --name "$POSTGRES_SERVER_NAME" \
  --query state -o tsv 2>/dev/null); then
  if [ "$POSTGRES_STATE" = "Stopped" ]; then
    echo "PostgreSQL server $POSTGRES_SERVER_NAME is stopped. Starting..."
    az postgres flexible-server start --resource-group "$RG_NAME" --name "$POSTGRES_SERVER_NAME"

    # Poll (up to 40 x 15s) until the server reports Ready.
    POSTGRES_READY=false
    for _ in {1..40}; do
      CURRENT_STATE=$(az postgres flexible-server show \
        --resource-group "$RG_NAME" \
        --name "$POSTGRES_SERVER_NAME" \
        --query state -o tsv)
      if [ "$CURRENT_STATE" = "Ready" ]; then
        echo "PostgreSQL server is ready."
        POSTGRES_READY=true
        break
      fi
      sleep 15
    done

    if [ "$POSTGRES_READY" != "true" ]; then
      echo "PostgreSQL server $POSTGRES_SERVER_NAME did not reach Ready state in time." >&2
      exit 1
    fi
  else
    echo "PostgreSQL server $POSTGRES_SERVER_NAME state is $POSTGRES_STATE. No start needed."
  fi
else
  echo "PostgreSQL server $POSTGRES_SERVER_NAME does not exist yet. Continuing."
fi
421+
294422
- name: Provision infrastructure
295423
run: azd provision --no-prompt -e "${{ inputs.environment }}"
296424

@@ -453,14 +581,60 @@ jobs:
453581
--query "identityProfile.kubeletidentity.clientId" -o tsv)
454582
echo "WORKLOAD_AZURE_CLIENT_ID=${AKS_MI_CLIENT_ID}" >> "$GITHUB_ENV"
455583
456-
- name: Deploy CRUD service
584+
- name: Resolve ingress class
585+
shell: bash
457586
run: |
458-
if ! azd deploy --service crud-service --no-prompt -e "${{ inputs.environment }}"; then
459-
echo "Initial CRUD deploy failed; retrying once after short wait..."
460-
sleep 60
461-
azd deploy --service crud-service --no-prompt -e "${{ inputs.environment }}"
587+
set -euo pipefail

# Succeeds when the named IngressClass can be fetched from the cluster.
ingress_class_exists() {
  kubectl get ingressclass "$1" >/dev/null 2>&1
}

# Exports the chosen class to later workflow steps and ends this step successfully.
publish_ingress_class() {
  echo "INGRESS_CLASS_NAME=$1" >> "$GITHUB_ENV"
  exit 0
}

# Log what the cluster offers; a failure here must not abort the step.
kubectl get ingressclass -o wide || true

# A preconfigured class wins, but only if it can actually be fetched.
if [ -n "${INGRESS_CLASS_NAME:-}" ] && ingress_class_exists "${INGRESS_CLASS_NAME}"; then
  echo "Using preconfigured ingress class: ${INGRESS_CLASS_NAME}"
  publish_ingress_class "${INGRESS_CLASS_NAME}"
fi

# Otherwise take the first known class found, in this fixed order.
for candidate in webapprouting.kubernetes.azure.com nginx azure-application-gateway; do
  if ingress_class_exists "$candidate"; then
    echo "Using detected ingress class: $candidate"
    publish_ingress_class "$candidate"
  fi
done

echo "No supported IngressClass found. Enable AKS Web App Routing or provide INGRESS_CLASS_NAME." >&2
exit 1
606+
607+
- name: Deploy CRUD service
608+
timeout-minutes: 25
609+
shell: bash
610+
run: |
611+
set -euo pipefail
max_attempts=4

# Dumps ingress state for troubleshooting; every command is best-effort.
collect_ingress_diagnostics() {
  kubectl get ingressclass -o wide || true
  kubectl get ingress -n holiday-peak -o wide || true
  kubectl get pods -n app-routing-system -o wide || true
  kubectl get pods -n ingress-nginx -o wide || true
}

# Try the deploy up to max_attempts times, waiting attempt * 45s between tries.
for ((attempt = 1; attempt <= max_attempts; attempt++)); do
  echo "Deploy attempt ${attempt}/${max_attempts}"
  if azd deploy --service crud-service --no-prompt -e "${{ inputs.environment }}"; then
    echo "CRUD deploy succeeded."
    exit 0
  fi

  echo "Attempt ${attempt} failed; collecting ingress diagnostics..."
  collect_ingress_diagnostics

  # The final attempt fails the step instead of sleeping again.
  if ((attempt == max_attempts)); then
    echo "CRUD deploy failed after ${max_attempts} attempts." >&2
    exit 1
  fi

  backoff=$((attempt * 45))
  echo "Retrying after ${backoff}s..."
  sleep "$backoff"
done
463636
env:
637+
INGRESS_CLASS_NAME: ${{ env.INGRESS_CLASS_NAME }}
464638
AZURE_CLIENT_ID: ${{ env.WORKLOAD_AZURE_CLIENT_ID }}
465639
AZURE_TENANT_ID: ${{ env.AZURE_TENANT_ID }}
466640
PROJECT_ENDPOINT: ${{ needs.provision.outputs.PROJECT_ENDPOINT }}

.infra/azd/hooks/render-helm.ps1

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@ $namespace = if ($env:K8S_NAMESPACE) { $env:K8S_NAMESPACE } else { "holiday-peak
77
$imagePrefix = if ($env:IMAGE_PREFIX) { $env:IMAGE_PREFIX } else { "ghcr.io/azure-samples" }
88
$imageTag = if ($env:IMAGE_TAG) { $env:IMAGE_TAG } else { "latest" }
99
$kedaEnabled = if ($env:KEDA_ENABLED) { $env:KEDA_ENABLED } else { "false" }
10+
$ingressEnabled = if ($env:INGRESS_ENABLED) { $env:INGRESS_ENABLED } else { "true" }
11+
$ingressClassName = if ($env:INGRESS_CLASS_NAME) { $env:INGRESS_CLASS_NAME } else { "webapprouting.kubernetes.azure.com" }
12+
$canaryEnabled = if ($env:CANARY_ENABLED) { $env:CANARY_ENABLED } else { "false" }
1013
$readinessPath = "/ready"
1114

1215
if ($ServiceName -eq "crud-service") {
@@ -48,7 +51,13 @@ $helmArgs = @(
4851
'--set',
4952
"image.tag=$imageTag",
5053
'--set',
51-
"keda.enabled=$kedaEnabled"
54+
"keda.enabled=$kedaEnabled",
55+
'--set',
56+
"ingress.enabled=$ingressEnabled",
57+
'--set-string',
58+
"ingress.className=$ingressClassName",
59+
'--set',
60+
"canary.enabled=$canaryEnabled",
5261
'--set',
5362
"probes.readiness.path=$readinessPath"
5463
)

.infra/azd/hooks/render-helm.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ IMAGE_PREFIX="${IMAGE_PREFIX:-ghcr.io/azure-samples}"
88
IMAGE_TAG="${IMAGE_TAG:-latest}"
99
KEDA_ENABLED="${KEDA_ENABLED:-false}"
1010
INGRESS_ENABLED="${INGRESS_ENABLED:-true}"
11+
INGRESS_CLASS_NAME="${INGRESS_CLASS_NAME:-webapprouting.kubernetes.azure.com}"
1112
CANARY_ENABLED="${CANARY_ENABLED:-false}"
1213
READINESS_PATH="/ready"
1314

@@ -48,6 +49,7 @@ HELM_ARGS="$HELM_ARGS --set image.repository=$IMAGE_PREFIX"
4849
HELM_ARGS="$HELM_ARGS --set image.tag=$IMAGE_TAG"
4950
HELM_ARGS="$HELM_ARGS --set keda.enabled=$KEDA_ENABLED"
5051
HELM_ARGS="$HELM_ARGS --set ingress.enabled=$INGRESS_ENABLED"
52+
HELM_ARGS="$HELM_ARGS --set-string ingress.className=$INGRESS_CLASS_NAME"
5153
HELM_ARGS="$HELM_ARGS --set canary.enabled=$CANARY_ENABLED"
5254
HELM_ARGS="$HELM_ARGS --set probes.readiness.path=$READINESS_PATH"
5355

.kubernetes/chart/templates/ingress.yaml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,9 @@ metadata:
1010
{{- toYaml . | nindent 4 }}
1111
{{- end }}
1212
spec:
13-
ingressClassName: webapprouting.kubernetes.azure.com
13+
{{- if .Values.ingress.className }}
14+
ingressClassName: {{ .Values.ingress.className | quote }}
15+
{{- end }}
1416
{{- if .Values.ingress.tls }}
1517
tls:
1618
{{- range .Values.ingress.tls }}
@@ -53,7 +55,9 @@ metadata:
5355
{{- toYaml . | nindent 4 }}
5456
{{- end }}
5557
spec:
56-
ingressClassName: webapprouting.kubernetes.azure.com
58+
{{- if .Values.ingress.className }}
59+
ingressClassName: {{ .Values.ingress.className | quote }}
60+
{{- end }}
5761
rules:
5862
{{- if .Values.ingress.host }}
5963
- host: {{ .Values.ingress.host | quote }}

.kubernetes/chart/values.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ service:
1010
# Ingress configuration for AKS Web Application Routing (NGINX-based add-on)
1111
ingress:
1212
enabled: true
13+
className: "webapprouting.kubernetes.azure.com"
1314
host: "" # Leave empty for path-based routing, or set hostname for host-based
1415
path: "" # Defaults to /{serviceName}
1516
pathType: "Prefix"

docs/implementation/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,8 @@ Target: **100%**
155155
- App deployments in `deploy-azd` are now strictly changed-only (CRUD, UI, and agent matrix entries are deployed only when their app paths change).
156156
- Post-deploy hooks (`sync-apim-agents` and `ensure-foundry-agents`) consume these lists through `CHANGED_SERVICES` and run only for changed services.
157157
- Foundry readiness verification in deployment workflow is scoped to changed agent services under changed-only mode.
158+
- CRUD deployment now preflights `IngressClass` availability and passes `INGRESS_CLASS_NAME` into Helm rendering to avoid class/controller drift.
159+
- CRUD deployment retries are now bounded with diagnostics (`kubectl get ingressclass`, ingress, and controller pods) to improve root-cause visibility for endpoint readiness delays.
158160

159161
---
160162

0 commit comments

Comments
 (0)