ci: Migrate to using Nvidia Github Runners (#694)

chtruong814 · web-flow · commit 8426b661282c · 2025-05-09T15:25:54.000-05:00
* Test nv runner Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Fix testing Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Add Azure login Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Add Azure CLI Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Testing Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Ensure Azure CLI exists Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Update id-token permissions Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Test login Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Use environment for Azure login Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Login to Azure nemoci Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Debug runner docker Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Use nv-gh-runner for building Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Fix the build container step Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Fix passing secrets to build step Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Fix build Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Pass has-azure-credentials to build step Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Pass in environment to build container step Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Fix build environment Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Do not use inline cache Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Ensure we use PR number for build cache Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Test GPU runner Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Fix GPU test Signed-off-by: Charlie Truong <chtruong@nvidia.com> * debug test failures Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Debug test Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Test build cache Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Fix build template Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Use hash of 
Dockerfile and pyproject.toml for tag Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Echo image tag hash Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Mount repo code to the test container Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Fix the checkout step when running tests Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Truncate to 12 characters for the image tag hash Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Revert "debug test failures" This reverts commit ea68000. Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Update build-container template ref Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Force build for cache Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Skip build if possible Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Update comments in gpuci.yml Signed-off-by: Charlie Truong <chtruong@nvidia.com> * debug test Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Use run_id as image tag Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Remove generate-image-tag as needed step Signed-off-by: Charlie Truong <chtruong@nvidia.com> * debug test_classifier Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Revert test change Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Set auto_sync_ready to true Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Update build container template ref Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Remove unused skip-build-if-exists Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Update to only run gpuci if certain files are changed Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Fix changed files step Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Update build-container template ref Signed-off-by: Charlie Truong <chtruong@nvidia.com> * debug Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Update changed files ref Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Update build contianer ref Signed-off-by: 
Charlie Truong <chtruong@nvidia.com> * Increase gpu timeout to 40m Signed-off-by: Charlie Truong <chtruong@nvidia.com> * Add tests directory to track changed files for running gpu tests Signed-off-by: Charlie Truong <chtruong@nvidia.com> --------- Signed-off-by: Charlie Truong <chtruong@nvidia.com>
diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml
@@ -26,4 +26,4 @@ additional_vetters:
   - VibhuJawa
   - arhamm1
 auto_sync_draft: false
-auto_sync_ready: false
+auto_sync_ready: true  # NOTE(review): presumably makes copy-pr-bot push ready PRs to the pull-request/<N> branches that gpuci.yml triggers on — confirm
diff --git a/.github/workflows/gpuci.yml b/.github/workflows/gpuci.yml
@@ -4,41 +4,62 @@ on:
   push:
     branches:
       - main
-  pull_request:
-    branches:
-      # We can run gpuCI on any PR targeting these branches
-      - "main"
-      - "[rv][0-9].[0-9].[0-9]"
-      - "[rv][0-9].[0-9].[0-9]rc[0-9]"
-    # PR has to be labeled with "gpuCI" label
-    types: [labeled, synchronize]
+      - "pull-request/[0-9]+"  # NOTE(review): presumably the branches copy-pr-bot pushes for vetted PRs — confirm
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
   cancel-in-progress: true
 
+permissions:
+  id-token: write  # NOTE(review): presumably required by the secretless azure/login step in run-gpu-tests — confirm
+  contents: read
+
 jobs:
+  changed-files:  # reports whether any tracked path changed; build-container is gated on its any_changed output
+    runs-on: ubuntu-latest
+    outputs:
+      any_changed: ${{ steps.changed-files.outputs.any_changed }}
+      changed_files: ${{ steps.changed-files.outputs.all_changed_files }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0  # full history; presumably so the base reference chosen below can be resolved locally
+
+      - name: Get PR info
+        id: get-pr-info
+        if: startsWith(github.ref, 'refs/heads/pull-request/')  # skipped on every other ref, e.g. main
+        uses: nv-gha-runners/get-pr-info@main
+      # Comparison base: the PR's base branch on pull-request/* branches, otherwise the previous commit
+      - name: Determine base reference
+        id: base-ref
+        env:
+          # `&&` / `||` short-circuit, so fromJSON is never applied to the empty output
+          # of a skipped "Get PR info" step (e.g. on pushes to main)
+          BASE_REF: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && fromJSON(steps.get-pr-info.outputs.pr-info).base.ref || 'HEAD~1' }}
+        run: |
+          # The value arrives through the environment instead of being templated into the script text
+          echo "base=${BASE_REF}" >> "$GITHUB_OUTPUT"
+
+      - name: Get changed files
+        id: changed-files
+        uses: step-security/changed-files@v45.0.1  # its any_changed / all_changed_files outputs become the job outputs
+        with:
+          files: |
+            nemo_curator/**
+            config/**
+            .github/**
+            pyproject.toml
+            Dockerfile
+            tests/**
+          base_sha: ${{ steps.base-ref.outputs.base }}  # receives a branch name or HEAD~1, not a literal SHA
+
   # First, we build and push a NeMo Curator container
   build-container:
-    # This block covers 3 cases when gpuCI should be triggered:
-    # 1. The PR has the "gpuCI" label and is opened by a maintainer.
-    #    In this case, gpuCI will autorun on any subsequent pushes to the PR,
-    #    as long as the "gpuCI" label is not removed.
-    # 2. The "gpuCI" label is added to the PR. If a non-maintainer opened the PR,
-    #    then subsequent pushes to the PR will not autorun gpuCI
-    #    unless the "gpuCI" label is removed and re-added again.
-    # 3. PR is merged to main.
-    if: >-
-      (
-        contains(github.event.pull_request.labels.*.name, 'gpuci') &&
-        contains(
-          '["ayushdg", "ko3n1g", "praateekmahajan", "ryantwolf", "sarahyurick", "VibhuJawa"]',
-          github.event.pull_request.user.login
-        )
-      ) ||
-      (github.event.label.name == 'gpuci') ||
-      (github.ref == 'refs/heads/main')
-    uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_build_container.yml@v0.18.0
+    uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_build_container.yml@v0.29.0
+    needs: [changed-files]
+    if: ${{ needs.changed-files.outputs.any_changed == 'true' }}  # compared as the string 'true'; the image build is skipped otherwise
     with:
       image-name: nemo_curator_container
       dockerfile: Dockerfile
@@ -48,36 +69,44 @@ jobs:
         REPO_URL=https://github.com/${{ github.repository }}.git
         CURATOR_COMMIT=${{ github.sha }}
       prune-filter-timerange: 24h
+      runner: linux-amd64-cpu8
+      has-azure-credentials: true
+      use-inline-cache: false
+    secrets:
+      AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }}
+      AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }}
+      AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
 
   # Then, we run our PyTests in the container we just built
   run-gpu-tests:
-    needs: build-container
-    # This is the tag on our Azure runner found in Actions -> Runners -> Self-hosted runners
-    # It has 2 A100 GPUs
-    runs-on: self-hosted-azure
-    # Unit tests should not take longer than 30 minutes
-    timeout-minutes: 30
-    # This block covers 3 cases when gpuCI should be triggered:
-    # 1. The PR has the "gpuCI" label and is opened by a maintainer.
-    #    In this case, gpuCI will autorun on any subsequent pushes to the PR,
-    #    as long as the "gpuCI" label is not removed.
-    # 2. The "gpuCI" label is added to the PR. If a non-maintainer opened the PR,
-    #    then subsequent pushes to the PR will not autorun gpuCI
-    #    unless the "gpuCI" label is removed and re-added again.
-    # 3. PR is merged to main.
-    if: >-
-      (
-        contains(github.event.pull_request.labels.*.name, 'gpuci') &&
-        contains(
-          '["ayushdg", "ko3n1g", "praateekmahajan", "ryantwolf", "sarahyurick", "VibhuJawa"]',
-          github.event.pull_request.user.login
-        )
-      ) ||
-      (github.event.label.name == 'gpuci') ||
-      (github.ref == 'refs/heads/main')
+    needs: [changed-files, build-container]  # changed-files must be listed: the needs context only exposes direct dependencies
+    if: ${{ needs.changed-files.outputs.any_changed == 'true' }}
+    runs-on: linux-amd64-gpu-rtxa6000-latest-1
+    environment: nemo-ci
+    # Unit tests should not take longer than 40 minutes including docker pull and startup time
+    timeout-minutes: 40
     env:
       DIR: ${{ github.run_id }}
     steps:
+      - name: Install Azure CLI  # provides the `az` command used by the ACR login step below
+        run: |
+          curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
+
+      - name: Azure Login
+        uses: azure/login@v2
+        with:
+          client-id: ${{ secrets.AZURE_CLIENT_ID }}
+          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
+          subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
+
+      - name: Azure ACR Login  # logs in to the nemoci registry that the test image is pulled from
+        run: |
+          az acr login --name nemoci
+          which docker
+
+      - name: Checkout NeMo-Curator  # populates the workspace that is later volume-mounted into the test container
+        uses: actions/checkout@v4
+
       # If something went wrong during the last cleanup, this step ensures any existing container is removed
       - name: Remove existing container if it exists
         run: |
@@ -86,15 +115,19 @@ jobs:
           fi
 
         # This runs the container which was pushed by build-container, which we call "nemo-curator-container"
-        # `--gpus all` ensures that all of the GPUs from our self-hosted-azure runner are available in the container
-        # We use "github.run_id" to identify the PR with the commits we want to run the PyTests with
+        # `--gpus all` ensures that all of the GPUs from our runner are available in the container
         # `bash -c "sleep infinity"` keeps the container running indefinitely without exiting
       - name: Run Docker container
         run: |
-          docker run --gpus all --name nemo-curator-container -d nemoci.azurecr.io/nemo_curator_container:${{ github.run_id }} bash -c "sleep infinity"
+          docker run \
+            --gpus all \
+            --name nemo-curator-container \
+            -d \
+            --volume ${{ github.workspace }}:/opt/NeMo-Curator \
+            nemoci.azurecr.io/nemo_curator_container:${{ github.run_id }} \
+            bash -c "sleep infinity"
 
-        # Expect `whoami` to be "azureuser"
-        # Expect `nvidia-smi` to show our 2 A100 GPUs
+        # Expect `nvidia-smi` to show available GPUs
       - name: Check GPUs
         run: |
           whoami
@@ -127,6 +160,7 @@ jobs:
 
           docker exec nemo-curator-container coverage xml
 
+          mkdir -p $DIR
           docker cp nemo-curator-container:/opt/.coverage $DIR/.coverage
           docker cp nemo-curator-container:/opt/coverage.xml $DIR/coverage.xml
           coverage_report="codecov"
diff --git a/Dockerfile b/Dockerfile
@@ -7,7 +7,7 @@ ARG IMAGE_LABEL
 ARG REPO_URL
 ARG CURATOR_COMMIT
 
-FROM rapidsai/ci-conda:cuda${CUDA_VER}-${LINUX_VER}-py${PYTHON_VER} as curator-update
+FROM rapidsai/ci-conda:cuda${CUDA_VER}-${LINUX_VER}-py${PYTHON_VER} AS curator-update
 # Needed to navigate to and pull the forked repository's changes
 ARG REPO_URL
 ARG CURATOR_COMMIT
@@ -24,7 +24,7 @@ RUN bash -exu <<EOF
 EOF
 
 
-FROM rapidsai/ci-conda:cuda${CUDA_VER}-${LINUX_VER}-py${PYTHON_VER}
+FROM rapidsai/ci-conda:cuda${CUDA_VER}-${LINUX_VER}-py${PYTHON_VER} AS deps
 LABEL "nemo.library"=${IMAGE_LABEL}
 WORKDIR /opt
 
@@ -50,15 +50,14 @@ RUN \
   --mount=type=bind,source=/opt/NeMo-Curator/pyproject.toml,target=/opt/NeMo-Curator/pyproject.toml,from=curator-update \
   cd /opt/NeMo-Curator && \
   source activate curator && \
-  pip install ".[all]"
+  pip install --extra-index-url https://pypi.nvidia.com -e ".[all]"
 
-COPY --from=curator-update /opt/NeMo-Curator/ /opt/NeMo-Curator/
 
-# Clone the user's repository, find the relevant commit, and install everything we need
-RUN bash -exu <<EOF
-  source activate curator
-  cd /opt/NeMo-Curator/
-  pip install --extra-index-url https://pypi.nvidia.com ".[all]"
-EOF
+FROM rapidsai/ci-conda:cuda${CUDA_VER}-${LINUX_VER}-py${PYTHON_VER} AS final
+# ARGs declared before the first FROM are not visible inside a stage until redeclared; the LABEL below needs this one
+ARG IMAGE_LABEL
 
-ENV PATH /opt/conda/envs/curator/bin:$PATH
+ENV PATH=/opt/conda/envs/curator/bin:$PATH
+LABEL "nemo.library"=${IMAGE_LABEL}
+WORKDIR /opt
+COPY --from=deps /opt/conda/envs/curator /opt/conda/envs/curator