@@ -75,46 +75,46 @@ jobs:
7575
7676 CHANGED_FILES=$(git diff --name-only "origin/$DEFAULT_BRANCH...HEAD")
7777
78- mapfile -t AGENT_SERVICES < <(python3 - <<'PY'
79- import re
80-
81- with open('azure.yaml', encoding='utf-8') as f:
82- lines = f.readlines()
83-
84- in_services = False
85- current_service = None
86- current_host = None
87- services = []
88-
89- for raw in lines:
90- line = raw.rstrip('\n')
91- if not in_services:
92- if re.match(r'^services:\s*$', line):
93- in_services = True
94- continue
95-
96- if re.match(r'^[^\s]', line):
97- break
98-
99- service_match = re.match(r'^ ([a-z0-9\-]+):\s*$', line)
100- if service_match:
101- if current_service and current_host == 'aks' and current_service != 'crud-service':
102- services.append(current_service)
103- current_service = service_match.group(1)
104- current_host = None
105- continue
106-
107- host_match = re.match(r'^ host:\s*(\S+)\s*$', line)
108- if host_match:
109- current_host = host_match.group(1)
110-
111- if current_service and current_host == 'aks' and current_service != 'crud-service':
112- services.append(current_service)
113-
114- for service in services:
115- print(service)
116- PY
117- )
78+ mapfile -t AGENT_SERVICES < <(python3 - <<'PY'
79+ import re
80+
81+ with open('azure.yaml', encoding='utf-8') as f:
82+ lines = f.readlines()
83+
84+ in_services = False
85+ current_service = None
86+ current_host = None
87+ services = []
88+
89+ for raw in lines:
90+ line = raw.rstrip('\n')
91+ if not in_services:
92+ if re.match(r'^services:\s*$', line):
93+ in_services = True
94+ continue
95+
96+ if re.match(r'^[^\s]', line):
97+ break
98+
99+ service_match = re.match(r'^ ([a-z0-9\-]+):\s*$', line)
100+ if service_match:
101+ if current_service and current_host == 'aks' and current_service != 'crud-service':
102+ services.append(current_service)
103+ current_service = service_match.group(1)
104+ current_host = None
105+ continue
106+
107+ host_match = re.match(r'^ host:\s*(\S+)\s*$', line)
108+ if host_match:
109+ current_host = host_match.group(1)
110+
111+ if current_service and current_host == 'aks' and current_service != 'crud-service':
112+ services.append(current_service)
113+
114+ for service in services:
115+ print(service)
116+ PY
117+ )
118118
119119 CRUD_CHANGED=false
120120 if echo "$CHANGED_FILES" | grep -Eq '^apps/crud-service/'; then
@@ -291,6 +291,134 @@ jobs:
291291 azd env set K8S_NAMESPACE holiday-peak -e "${{ inputs.environment }}"
292292 azd env set KEDA_ENABLED false -e "${{ inputs.environment }}"
293293
- name: Ensure deploy principal has AKS RBAC Cluster Admin
  shell: bash
  run: |
    set -euo pipefail

    # Makes sure the deploying service principal (looked up from
    # AZURE_CLIENT_ID) holds the AKS RBAC Cluster Admin role on the
    # environment's resource group: the role is assigned when it is missing,
    # then the assignment list is queried again and the step fails if the
    # role is still not reported.
    readonly ROLE_NAME="Azure Kubernetes Service RBAC Cluster Admin"

    AKS_RG="${{ inputs.projectName }}-${{ inputs.environment }}-rg"
    DEPLOY_SP_OBJECT_ID=$(az ad sp show --id "${AZURE_CLIENT_ID}" --query id -o tsv)
    RG_ID=$(az group show --name "$AKS_RG" --query id -o tsv)

    # Prints the number of ROLE_NAME assignments the deploy principal has at
    # the resource-group scope. The az call is the only command so that its
    # exit status propagates to the caller's assignment under `set -e`.
    count_role_assignments() {
      az role assignment list \
        --scope "$RG_ID" \
        --assignee-object-id "$DEPLOY_SP_OBJECT_ID" \
        --query "[?roleDefinitionName=='${ROLE_NAME}'] | length(@)" -o tsv
    }

    assignments_before=$(count_role_assignments)

    if [[ "$assignments_before" != "0" ]]; then
      echo "Deploy principal already has ${ROLE_NAME} at RG scope $AKS_RG."
    else
      echo "Assigning ${ROLE_NAME} to deploy principal at RG scope $AKS_RG."
      az role assignment create \
        --assignee-object-id "$DEPLOY_SP_OBJECT_ID" \
        --assignee-principal-type ServicePrincipal \
        --role "${ROLE_NAME}" \
        --scope "$RG_ID"
    fi

    # Re-query so a create call that did not take effect is caught here
    # instead of surfacing later as an authorization failure.
    assignments_after=$(count_role_assignments)

    if [[ "$assignments_after" == "0" ]]; then
      echo "Failed to verify ${ROLE_NAME} assignment for deploy principal at scope $AKS_RG."
      exit 1
    fi
328+
- name: Preflight drift remediation (non-prod)
  if: ${{ inputs.environment != 'prod' && inputs.environment != 'production' }}
  shell: bash
  run: |
    set -euo pipefail

    # Non-production only. Clears two kinds of drift before provisioning:
    #   1. Key Vault: when the vault exists in a different location than the
    #      requested one, a keyVaultNameOverride is set for this deployment;
    #      when the vault is soft-deleted, it is recovered (or purged if
    #      recovery fails).
    #   2. PostgreSQL Flexible Server: a stopped server is started and the
    #      step waits for it to report the Ready state.

    RG_NAME="${{ inputs.projectName }}-${{ inputs.environment }}-rg"
    KEY_VAULT_NAME="${{ inputs.projectName }}-${{ inputs.environment }}-kv"
    POSTGRES_SERVER_NAME="${{ inputs.projectName }}-${{ inputs.environment }}-postgres"
    DESIRED_LOCATION="$(echo "${{ inputs.location }}" | tr '[:upper:]' '[:lower:]')"

    echo "Running non-prod preflight drift remediation for environment '${{ inputs.environment }}'."

    # Remediate Key Vault location drift and soft-deleted name conflicts that can block azd provision.
    if az keyvault show --name "$KEY_VAULT_NAME" --resource-group "$RG_NAME" >/dev/null 2>&1; then
      CURRENT_KV_LOCATION=$(az keyvault show \
        --name "$KEY_VAULT_NAME" \
        --resource-group "$RG_NAME" \
        --query location -o tsv | tr '[:upper:]' '[:lower:]')

      if [ "$CURRENT_KV_LOCATION" != "$DESIRED_LOCATION" ]; then
        # Derive an alternate vault name: append up to 4 alphanumeric
        # characters of the desired location, lower-case it, truncate to 24
        # characters (presumably the Key Vault name length limit - confirm),
        # and drop a trailing '-' that truncation may leave behind.
        LOCATION_SUFFIX=$(echo "$DESIRED_LOCATION" | tr -cd '[:alnum:]' | cut -c1-4)
        KEY_VAULT_OVERRIDE=$(echo "${{ inputs.projectName }}-${{ inputs.environment }}-kv-${LOCATION_SUFFIX}" | tr '[:upper:]' '[:lower:]' | cut -c1-24)
        KEY_VAULT_OVERRIDE="${KEY_VAULT_OVERRIDE%-}"

        echo "Key Vault location mismatch detected for $KEY_VAULT_NAME ($CURRENT_KV_LOCATION vs $DESIRED_LOCATION)."
        echo "Setting keyVaultNameOverride to $KEY_VAULT_OVERRIDE for this deployment."
        azd env set keyVaultNameOverride "$KEY_VAULT_OVERRIDE" -e "${{ inputs.environment }}"
      else
        echo "Key Vault $KEY_VAULT_NAME already exists in $RG_NAME with matching location $CURRENT_KV_LOCATION."
      fi
    else
      # The vault is not live in this resource group; check whether a
      # soft-deleted vault is still holding the name.
      DELETED_KV_LOCATION=$(az keyvault list-deleted \
        --query "[?name=='$KEY_VAULT_NAME'] | [0].properties.location" \
        -o tsv)

      if [ -n "$DELETED_KV_LOCATION" ]; then
        echo "Recovering soft-deleted Key Vault $KEY_VAULT_NAME."
        if az keyvault recover --name "$KEY_VAULT_NAME" >/dev/null 2>&1; then
          echo "Recovery initiated for $KEY_VAULT_NAME."

          # Poll only after a successful recover: a purged vault never
          # becomes visible again, so waiting for it would just burn the
          # full 20 x 10s budget.
          KEY_VAULT_AVAILABLE=false
          for _ in $(seq 1 20); do
            if az keyvault show --name "$KEY_VAULT_NAME" --resource-group "$RG_NAME" >/dev/null 2>&1; then
              echo "Key Vault $KEY_VAULT_NAME is now available in $RG_NAME."
              KEY_VAULT_AVAILABLE=true
              break
            fi
            sleep 10
          done

          # Best effort: surface the timeout but let provisioning decide
          # whether the vault state is actually a problem.
          if [ "$KEY_VAULT_AVAILABLE" != "true" ]; then
            echo "Warning: Key Vault $KEY_VAULT_NAME did not become visible in $RG_NAME after recovery; continuing." >&2
          fi
        else
          echo "Recovery failed for $KEY_VAULT_NAME. Attempting purge fallback from $DELETED_KV_LOCATION."
          az keyvault purge --name "$KEY_VAULT_NAME" --location "$DELETED_KV_LOCATION"
          echo "Purged soft-deleted Key Vault $KEY_VAULT_NAME."
        fi
      else
        echo "No soft-deleted Key Vault conflict found for $KEY_VAULT_NAME."
      fi
    fi

    # Start stopped PostgreSQL Flexible Server so provisioning can reconcile state.
    if az postgres flexible-server show --resource-group "$RG_NAME" --name "$POSTGRES_SERVER_NAME" >/dev/null 2>&1; then
      POSTGRES_STATE=$(az postgres flexible-server show \
        --resource-group "$RG_NAME" \
        --name "$POSTGRES_SERVER_NAME" \
        --query state -o tsv)

      if [ "$POSTGRES_STATE" = "Stopped" ]; then
        echo "PostgreSQL server $POSTGRES_SERVER_NAME is stopped. Starting..."
        az postgres flexible-server start --resource-group "$RG_NAME" --name "$POSTGRES_SERVER_NAME"

        # Wait up to 40 x 15s for the server to report Ready.
        POSTGRES_READY=false
        for _ in $(seq 1 40); do
          CURRENT_STATE=$(az postgres flexible-server show \
            --resource-group "$RG_NAME" \
            --name "$POSTGRES_SERVER_NAME" \
            --query state -o tsv)
          if [ "$CURRENT_STATE" = "Ready" ]; then
            echo "PostgreSQL server is ready."
            POSTGRES_READY=true
            break
          fi
          sleep 15
        done

        if [ "$POSTGRES_READY" != "true" ]; then
          echo "PostgreSQL server $POSTGRES_SERVER_NAME did not reach Ready state in time."
          exit 1
        fi
      else
        echo "PostgreSQL server $POSTGRES_SERVER_NAME state is $POSTGRES_STATE. No start needed."
      fi
    else
      echo "PostgreSQL server $POSTGRES_SERVER_NAME does not exist yet. Continuing."
    fi
421+
# Runs `azd provision` without interactive prompts against the azd environment
# named by the workflow's `environment` input.
294422 - name : Provision infrastructure
295423 run : azd provision --no-prompt -e "${{ inputs.environment }}"
296424
@@ -453,14 +581,60 @@ jobs:
453581 --query "identityProfile.kubeletidentity.clientId" -o tsv)
454582 echo "WORKLOAD_AZURE_CLIENT_ID=${AKS_MI_CLIENT_ID}" >> "$GITHUB_ENV"
455583
- name: Resolve ingress class
  shell: bash
  run: |
    set -euo pipefail

    # Picks the IngressClass for later steps and exports it as
    # INGRESS_CLASS_NAME through GITHUB_ENV. A preconfigured
    # INGRESS_CLASS_NAME wins when that class exists in the cluster;
    # otherwise the first existing class from a fixed candidate list is
    # used. The step fails when none of them exists.

    # Diagnostic listing only; a failure here must not abort the step.
    kubectl get ingressclass -o wide || true

    # Succeeds when the named IngressClass exists in the cluster.
    ingress_class_exists() {
      kubectl get ingressclass "$1" >/dev/null 2>&1
    }

    preconfigured="${INGRESS_CLASS_NAME:-}"
    if [[ -n "$preconfigured" ]] && ingress_class_exists "$preconfigured"; then
      echo "Using preconfigured ingress class: ${preconfigured}"
      echo "INGRESS_CLASS_NAME=${preconfigured}" >> "$GITHUB_ENV"
      exit 0
    fi

    # Candidates are probed in order; the first match is selected.
    candidates=(
      webapprouting.kubernetes.azure.com
      nginx
      azure-application-gateway
    )
    for candidate in "${candidates[@]}"; do
      if ingress_class_exists "$candidate"; then
        echo "Using detected ingress class: $candidate"
        echo "INGRESS_CLASS_NAME=$candidate" >> "$GITHUB_ENV"
        exit 0
      fi
    done

    echo "No supported IngressClass found. Enable AKS Web App Routing or provide INGRESS_CLASS_NAME." >&2
    exit 1
606+
607+ - name : Deploy CRUD service
608+ timeout-minutes : 25
609+ shell : bash
610+ run : |
611+ set -euo pipefail
612+ max_attempts=4
613+
614+ for attempt in $(seq 1 "$max_attempts"); do
615+ echo "Deploy attempt ${attempt}/${max_attempts}"
616+ if azd deploy --service crud-service --no-prompt -e "${{ inputs.environment }}"; then
617+ echo "CRUD deploy succeeded."
618+ exit 0
619+ fi
620+
621+ echo "Attempt ${attempt} failed; collecting ingress diagnostics..."
622+ kubectl get ingressclass -o wide || true
623+ kubectl get ingress -n holiday-peak -o wide || true
624+ kubectl get pods -n app-routing-system -o wide || true
625+ kubectl get pods -n ingress-nginx -o wide || true
626+
627+ if [ "$attempt" -eq "$max_attempts" ]; then
628+ echo "CRUD deploy failed after ${max_attempts} attempts." >&2
629+ exit 1
630+ fi
631+
632+ backoff=$((attempt * 45))
633+ echo "Retrying after ${backoff}s..."
634+ sleep "$backoff"
635+ done
463636 env :
637+ INGRESS_CLASS_NAME : ${{ env.INGRESS_CLASS_NAME }}
464638 AZURE_CLIENT_ID : ${{ env.WORKLOAD_AZURE_CLIENT_ID }}
465639 AZURE_TENANT_ID : ${{ env.AZURE_TENANT_ID }}
466640 PROJECT_ENDPOINT : ${{ needs.provision.outputs.PROJECT_ENDPOINT }}
0 commit comments