Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 128 additions & 0 deletions .github/workflows/deploy-aks-callable.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
# Reusable workflow: runs infrastructure_references/aks/scripts/deploy-aks.sh
# with the requested command against a freshly named resource group, then
# optionally deletes that resource group.
#
# NOTE(review): the Azure login step passes only client/tenant/subscription
# IDs (no client secret), which looks like the OIDC flow of azure/login.
# Calling workflows presumably need `permissions: id-token: write` and must
# make the AZURE_* secrets available (e.g. `secrets: inherit`) - confirm.
name: Deploy AKS (Reusable)

on:
  workflow_call:
    inputs:
      location:
        description: "Azure region"
        required: false
        type: string
        default: "westus3"
      command:
        description: "deploy-aks.sh command to run"
        required: false
        type: string
        default: "deploy-aks"
      node_pool_vm_size:
        description: "GPU node pool VM size"
        required: false
        type: string
        default: "Standard_D2ads_v5"
      node_pool_node_count:
        description: "GPU node pool node count"
        required: false
        type: string
        default: "2"
      system_pool_vm_size:
        description: "Optional system nodepool VM size"
        required: false
        type: string
        default: "Standard_D2ads_v5"
      install_network_operator:
        description: "Install NVIDIA Network Operator for InfiniBand/RDMA support"
        required: false
        type: boolean
        default: false
      install_gpu_operator:
        description: "Install NVIDIA GPU Operator for GPU workload management"
        required: false
        type: boolean
        default: false
      cleanup:
        description: "Delete the resource group after the run"
        required: false
        type: boolean
        default: true

jobs:
  deploy:
    runs-on: ubuntu-latest
    environment: testing

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      # Cheap parse-only check (bash -n) so a syntax error fails the job
      # before any Azure resources are touched.
      - name: Validate deploy script is syntactically valid
        run: |
          set -euo pipefail
          bash -n infrastructure_references/aks/scripts/deploy-aks.sh

      - name: Azure Login
        uses: azure/login@v2
        with:
          client-id: ${{ secrets.AZURE_CLIENT_ID }}
          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
          subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}

      - name: Install jq and kustomize
        env:
          KUSTOMIZE_INSTALLER_URL: "https://raw.githubusercontent.com/kubernetes-sigs/kustomize/master/hack/install_kustomize.sh"
        run: |
          set -euo pipefail
          sudo apt-get update
          sudo apt-get install -y jq

          # install_kustomize.sh writes the binary into the directory passed
          # as its argument (the current directory when omitted). Install
          # into a dedicated directory and expose it through GITHUB_PATH so
          # later steps actually use the downloaded binary and the checkout
          # is not polluted.
          kustomize_dir="${RUNNER_TEMP}/kustomize-bin"
          mkdir -p "${kustomize_dir}"
          # -f makes curl fail on HTTP errors instead of piping an error
          # page into bash; pipefail then fails the step.
          curl -fsSL "${KUSTOMIZE_INSTALLER_URL}" | bash -s -- "${kustomize_dir}"
          echo "${kustomize_dir}" >> "$GITHUB_PATH"

      - name: Setup kubectl
        uses: azure/setup-kubectl@v3

      - name: Setup helm
        uses: azure/setup-helm@v3

      # Fails fast if any required CLI is missing from PATH.
      - name: Verify CLI tools
        run: |
          set -euo pipefail
          kubectl version --client=true
          helm version
          kustomize version
          az version

      - name: Set script permissions
        run: |
          set -euo pipefail
          chmod +x infrastructure_references/aks/scripts/deploy-aks.sh

      # Derives per-run names from a UTC timestamp (YYMMDDhhmm) so names
      # sort chronologically. Outputs: resource_group, cluster_name.
      - name: Generate resource group and cluster names
        id: names
        run: |
          set -euo pipefail
          timestamp=$(date -u +"%y%m%d%H%M")
          rg_name="ai-infra-aks-${timestamp}"
          cluster_name="ai-infra-${timestamp}"
          echo "resource_group=${rg_name}" >> "$GITHUB_OUTPUT"
          echo "cluster_name=${cluster_name}" >> "$GITHUB_OUTPUT"

      - name: Run deploy-aks.sh
        env:
          AZURE_REGION: ${{ inputs.location }}
          AZURE_RESOURCE_GROUP: ${{ steps.names.outputs.resource_group }}
          CLUSTER_NAME: ${{ steps.names.outputs.cluster_name }}
          SYSTEM_POOL_VM_SIZE: ${{ inputs.system_pool_vm_size }}
          NODE_POOL_VM_SIZE: ${{ inputs.node_pool_vm_size }}
          NODE_POOL_NODE_COUNT: ${{ inputs.node_pool_node_count }}
          INSTALL_NETWORK_OPERATOR: ${{ inputs.install_network_operator }}
          INSTALL_GPU_OPERATOR: ${{ inputs.install_gpu_operator }}
          # Passed through the environment rather than interpolated into the
          # script text, so a caller-supplied value cannot inject shell code.
          DEPLOY_AKS_COMMAND: ${{ inputs.command }}
        run: |
          set -euo pipefail
          ./infrastructure_references/aks/scripts/deploy-aks.sh "${DEPLOY_AKS_COMMAND}"

      # Runs even when earlier steps failed (always()) so a broken deployment
      # does not leave resources behind; skipped when cleanup is false.
      - name: Cleanup resource group after run
        if: always() && inputs.cleanup == true
        env:
          RESOURCE_GROUP: ${{ steps.names.outputs.resource_group }}
        run: |
          set -euo pipefail

          # The name is empty when the job failed before the names step ran.
          if [ -z "${RESOURCE_GROUP}" ]; then
            echo "No resource group name was generated, nothing to clean up"
            exit 0
          fi

          echo "Cleaning up resource group: ${RESOURCE_GROUP}"
          if az group exists --name "${RESOURCE_GROUP}" --output tsv 2>/dev/null | grep -q "true"; then
            # --no-wait: only start the deletion; do not block the job on it.
            az group delete --name "${RESOURCE_GROUP}" --yes --no-wait
            echo "✅ Resource group deletion initiated" >> "$GITHUB_STEP_SUMMARY"
          else
            echo "Resource group does not exist, nothing to clean up"
          fi
28 changes: 28 additions & 0 deletions .github/workflows/deploy-aks-matrix.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Scheduled / manually triggered run of the reusable AKS deployment workflow
# over a matrix of configurations (currently a single smoke entry).
name: Deploy AKS (Full Matrix)

on:
  schedule:
    # Every Monday at 02:00 UTC
    - cron: '0 2 * * 1'
  workflow_dispatch:

# Granted to the called workflow: id-token for the Azure login step,
# contents for the repository checkout.
permissions:
  contents: read
  id-token: write

jobs:
  deploy:
    # Each matrix entry shows up under its own `name` in the run UI.
    name: ${{ matrix.name }}
    strategy:
      matrix:
        include:
          - name: smoke-westus3
            location: westus3
            command: deploy-aks
    uses: ./.github/workflows/deploy-aks-callable.yml
    secrets: inherit
    with:
      location: ${{ matrix.location }}
      command: ${{ matrix.command }}
      # Operators are off for every entry; the resource group is always
      # deleted after the run.
      install_network_operator: false
      install_gpu_operator: false
      cleanup: true
68 changes: 68 additions & 0 deletions .github/workflows/deploy-aks.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# Test entry point for the reusable AKS deployment workflow. Runs on pushes
# that touch the workflow, the deploy script, or its configs, and on manual
# dispatch with an optional region/command override.
name: Deploy AKS (Test)

on:
  push:
    paths:
      - ".github/workflows/deploy-aks.yml"
      - ".github/workflows/deploy-aks-callable.yml"
      - "infrastructure_references/aks/scripts/deploy-aks.sh"
      - "infrastructure_references/aks/configs/**"

  workflow_dispatch:
    inputs:
      location:
        description: "Azure region (e.g., westus3, westeurope, westus2)"
        required: true
        type: string
        default: "westus3"
      command:
        description: "deploy-aks.sh command to run"
        required: false
        type: choice
        default: "all"
        options:
          - "all"

permissions:
  id-token: write
  contents: read

jobs:
  # Resolves the effective parameters once so the reusable-workflow call
  # below can consume them as plain job outputs.
  init:
    runs-on: ubuntu-latest
    outputs:
      location: ${{ steps.params.outputs.location }}
      command: ${{ steps.params.outputs.command }}

    steps:
      - name: Set deployment parameters
        id: params
        env:
          # Inputs are passed through the environment (not interpolated into
          # the script) so user-supplied text cannot inject shell code.
          LOCATION_IN: ${{ inputs.location }}
          COMMAND_IN: ${{ inputs.command }}
        run: |
          set -euo pipefail

          # On push events there are no dispatch inputs, so both variables
          # are empty and the fallbacks apply. The fallbacks match the
          # workflow_dispatch defaults declared above (westus3 / all).
          location="${LOCATION_IN:-westus3}"
          command="${COMMAND_IN:-all}"

          {
            echo "location=${location}"
            echo "command=${command}"
          } >> "$GITHUB_OUTPUT"

  call-deployment:
    needs: init
    uses: ./.github/workflows/deploy-aks-callable.yml
    with:
      location: ${{ needs.init.outputs.location }}
      command: ${{ needs.init.outputs.command }}
    secrets: inherit