From 87a10da247fe1493482030299128ab01547ae7ee Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 8 Jan 2026 09:48:43 +0000
Subject: [PATCH 1/2] Initial plan


From e9bd6d5fee01cd2af96fe2cd703f83e2cf991afd Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 8 Jan 2026 10:08:26 +0000
Subject: [PATCH 2/2] Add fixed AKS deployment workflows from PR #106

Co-authored-by: wolfgang-desalvador <118554802+wolfgang-desalvador@users.noreply.github.com>
---
 .github/workflows/deploy-aks-callable.yml | 136 ++++++++++++++++++++++
 .github/workflows/deploy-aks-matrix.yaml  |  30 +++++
 .github/workflows/deploy-aks.yml          |  72 ++++++++++++
 3 files changed, 238 insertions(+)
 create mode 100644 .github/workflows/deploy-aks-callable.yml
 create mode 100644 .github/workflows/deploy-aks-matrix.yaml
 create mode 100644 .github/workflows/deploy-aks.yml

diff --git a/.github/workflows/deploy-aks-callable.yml b/.github/workflows/deploy-aks-callable.yml
new file mode 100644
index 00000000..cd1a0543
--- /dev/null
+++ b/.github/workflows/deploy-aks-callable.yml
@@ -0,0 +1,136 @@
+# Reusable workflow: runs infrastructure_references/aks/scripts/deploy-aks.sh
+# with timestamp-based resource group and cluster names and, when `cleanup` is
+# true, deletes that resource group afterwards (even if an earlier step failed).
+name: Deploy AKS (Reusable)
+
+on:
+  workflow_call:
+    inputs:
+      location:
+        description: "Azure region"
+        required: false
+        type: string
+        default: "westus3"
+      command:
+        description: "deploy-aks.sh command to run"
+        required: false
+        type: string
+        default: "deploy-aks"
+      node_pool_vm_size:
+        description: "GPU node pool VM size"
+        required: false
+        type: string
+        default: "Standard_D2ads_v5"
+      node_pool_node_count:
+        description: "GPU node pool node count"
+        required: false
+        type: string
+        default: "2"
+      system_pool_vm_size:
+        description: "Optional system nodepool VM size"
+        required: false
+        type: string
+        default: "Standard_D2ads_v5"
+      install_network_operator:
+        description: "Install NVIDIA Network Operator for InfiniBand/RDMA support"
+        required: false
+        type: boolean
+        default: false
+      install_gpu_operator:
+        description: "Install NVIDIA GPU Operator for GPU workload management"
+        required: false
+        type: boolean
+        default: false
+      cleanup:
+        description: "Delete the resource group after the run"
+        required: false
+        type: boolean
+        default: true
+
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+    environment: testing
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Validate deploy script is syntactically valid
+        run: |
+          set -euo pipefail
+          bash -n infrastructure_references/aks/scripts/deploy-aks.sh
+
+      - name: Azure Login
+        uses: azure/login@v2
+        with:
+          client-id: ${{ secrets.AZURE_CLIENT_ID }}
+          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
+          subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
+
+      - name: Install jq and kustomize
+        run: |
+          set -euo pipefail
+          sudo apt-get update
+          sudo apt-get install -y jq
+          curl -s "https://raw.githubusercontent.com/kubernetes-sigs/kustomize/master/hack/install_kustomize.sh" | bash
+
+      - name: Setup kubectl
+        uses: azure/setup-kubectl@v3
+
+      - name: Setup helm
+        uses: azure/setup-helm@v3
+
+      - name: Verify CLI tools
+        run: |
+          set -euo pipefail
+          kubectl version --client=true
+          helm version
+          kustomize version
+          az version
+
+      - name: Set script permissions
+        run: |
+          set -euo pipefail
+          chmod +x infrastructure_references/aks/scripts/deploy-aks.sh
+
+      - name: Generate resource group and cluster names
+        id: names
+        run: |
+          set -euo pipefail
+          # YYMMDDHHMM in UTC; names only differ at minute granularity.
+          timestamp=$(date -u +"%y%m%d%H%M")
+          rg_name="ai-infra-aks-${timestamp}"
+          cluster_name="ai-infra-${timestamp}"
+          echo "resource_group=${rg_name}" >> "$GITHUB_OUTPUT"
+          echo "cluster_name=${cluster_name}" >> "$GITHUB_OUTPUT"
+
+      - name: Run deploy-aks.sh
+        env:
+          AZURE_REGION: ${{ inputs.location }}
+          AZURE_RESOURCE_GROUP: ${{ steps.names.outputs.resource_group }}
+          CLUSTER_NAME: ${{ steps.names.outputs.cluster_name }}
+          SYSTEM_POOL_VM_SIZE: ${{ inputs.system_pool_vm_size }}
+          NODE_POOL_VM_SIZE: ${{ inputs.node_pool_vm_size }}
+          NODE_POOL_NODE_COUNT: ${{ inputs.node_pool_node_count }}
+          INSTALL_NETWORK_OPERATOR: ${{ inputs.install_network_operator }}
+          INSTALL_GPU_OPERATOR: ${{ inputs.install_gpu_operator }}
+          # Passed through the environment instead of being expanded inside
+          # the script text, so the input value cannot inject shell syntax.
+          COMMAND_IN: ${{ inputs.command }}
+        run: |
+          set -euo pipefail
+          ./infrastructure_references/aks/scripts/deploy-aks.sh "${COMMAND_IN}"
+
+      - name: Cleanup resource group after run
+        # always() keeps cleanup running when an earlier step has failed.
+        if: always() && inputs.cleanup == true
+        run: |
+          set -euo pipefail
+          echo "Cleaning up resource group: ${{ steps.names.outputs.resource_group }}"
+          if az group exists --name "${{ steps.names.outputs.resource_group }}" --output tsv 2>/dev/null | grep -q "true"; then
+            az group delete --name "${{ steps.names.outputs.resource_group }}" --yes --no-wait
+            echo "✅ Resource group deletion initiated" >> "$GITHUB_STEP_SUMMARY"
+          else
+            echo "Resource group does not exist, nothing to clean up"
+          fi
diff --git a/.github/workflows/deploy-aks-matrix.yaml b/.github/workflows/deploy-aks-matrix.yaml
new file mode 100644
index 00000000..200cbaa2
--- /dev/null
+++ b/.github/workflows/deploy-aks-matrix.yaml
@@ -0,0 +1,30 @@
+# Scheduled / manual entry point: calls the reusable AKS deployment workflow
+# once per matrix entry below, with resource group cleanup enabled.
+name: Deploy AKS (Full Matrix)
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: "0 2 * * 1" # Every Monday at 02:00 UTC
+
+permissions:
+  id-token: write
+  contents: read
+
+jobs:
+  deploy:
+    strategy:
+      matrix:
+        include:
+          - name: smoke-westus3
+            location: westus3
+            command: deploy-aks
+    name: ${{ matrix.name }}
+    uses: ./.github/workflows/deploy-aks-callable.yml
+    with:
+      location: ${{ matrix.location }}
+      command: ${{ matrix.command }}
+      install_network_operator: false
+      install_gpu_operator: false
+      cleanup: true
+    secrets: inherit
diff --git a/.github/workflows/deploy-aks.yml b/.github/workflows/deploy-aks.yml
new file mode 100644
index 00000000..2297f0aa
--- /dev/null
+++ b/.github/workflows/deploy-aks.yml
@@ -0,0 +1,72 @@
+# Test entry point: calls the reusable AKS deployment workflow on pushes that
+# touch the workflows, the deploy script or its configs, and on manual dispatch.
+name: Deploy AKS (Test)
+
+on:
+  push:
+    paths:
+      - ".github/workflows/deploy-aks.yml"
+      - ".github/workflows/deploy-aks-callable.yml"
+      - "infrastructure_references/aks/scripts/deploy-aks.sh"
+      - "infrastructure_references/aks/configs/**"
+
+  workflow_dispatch:
+    inputs:
+      location:
+        description: "Azure region (e.g., westus3, westeurope, westus2)"
+        required: true
+        type: string
+        default: "westus3"
+      command:
+        description: "deploy-aks.sh command to run"
+        required: false
+        type: choice
+        default: "all"
+        options:
+          - "all"
+
+permissions:
+  id-token: write
+  contents: read
+
+jobs:
+  init:
+    runs-on: ubuntu-latest
+    outputs:
+      location: ${{ steps.params.outputs.location }}
+      command: ${{ steps.params.outputs.command }}
+
+    steps:
+      - name: Set deployment parameters
+        id: params
+        env:
+          EVENT_NAME: ${{ github.event_name }}
+          LOCATION_IN: ${{ inputs.location }}
+          COMMAND_IN: ${{ inputs.command }}
+        run: |
+          set -euo pipefail
+
+          if [ "${EVENT_NAME}" = "push" ]; then
+            {
+              echo "location=westus3"
+              echo "command=all"
+            } >> "$GITHUB_OUTPUT"
+          else
+            location="${LOCATION_IN:-westus3}"
+            # NOTE(review): this fallback differs from the "all" default used
+            # above and by the dispatch input - confirm which is intended.
+            command="${COMMAND_IN:-deploy-aks}"
+
+            {
+              echo "location=${location}"
+              echo "command=${command}"
+            } >> "$GITHUB_OUTPUT"
+          fi
+
+  call-deployment:
+    needs: init
+    uses: ./.github/workflows/deploy-aks-callable.yml
+    with:
+      location: ${{ needs.init.outputs.location }}
+      command: ${{ needs.init.outputs.command }}
+    secrets: inherit