7 | 7 | pull_request: |
8 | 8 | branches: |
9 | 9 | # We can run gpuCI on any PR targeting these branches |
10 | | - - 'main' |
11 | | - - '[rv][0-9].[0-9].[0-9]' |
12 | | - - '[rv][0-9].[0-9].[0-9]rc[0-9]' |
| 10 | + - "main" |
| 11 | + - "[rv][0-9].[0-9].[0-9]" |
| 12 | + - "[rv][0-9].[0-9].[0-9]rc[0-9]" |
13 | 13 | # PR has to be labeled with "gpuCI" label |
14 | 14 | # If new commits are added, the "gpuCI" label has to be removed and re-added to rerun gpuCI |
15 | | - types: [ labeled ] |
| 15 | + types: [labeled] |
16 | 16 |
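The three branch filters above are character-class globs: besides `main`, gpuCI can run on PRs targeting release branches such as `v1.2.3` or `r0.4.0rc1`. As a rough sketch (GitHub's filter globs behave like shell patterns for these simple cases; the branch names below are hypothetical):

```bash
# Hypothetical branch names run through shell patterns equivalent to the filters above
for branch in main v1.2.3 r0.4.0rc1 feature/foo; do
  case "$branch" in
    main | [rv][0-9].[0-9].[0-9] | [rv][0-9].[0-9].[0-9]rc[0-9]) echo "$branch: gpuCI can run" ;;
    *) echo "$branch: no gpuCI" ;;
  esac
done
```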
17 | 17 | concurrency: |
18 | 18 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} |
@@ -40,50 +40,52 @@ jobs: |
40 | 40 | # This is the tag on our Azure runner found in Actions -> Runners -> Self-hosted runners |
41 | 41 | # It has 2 A100 GPUs |
42 | 42 | runs-on: self-hosted-azure |
| 43 | + # Unit tests shouldn't take longer than 30 minutes |
| 44 | + timeout-minutes: 30 |
43 | 45 | # "run-gpu-tests" job is run if the "gpuci" label is added to the PR |
44 | 46 | if: ${{ github.event.label.name == 'gpuci' || github.ref == 'refs/heads/main' }} |
45 | 47 |
46 | 48 | steps: |
47 | 49 | # If something went wrong during the last cleanup, this step ensures any existing container is removed |
48 | | - - name: Remove existing container if it exists |
49 | | - run: | |
50 | | - if [ "$(docker ps -aq -f name=nemo-curator-container)" ]; then |
51 | | - docker rm -f nemo-curator-container |
52 | | - fi |
| 50 | + - name: Remove existing container if it exists |
| 51 | + run: | |
| 52 | + if [ "$(docker ps -aq -f name=nemo-curator-container)" ]; then |
| 53 | + docker rm -f nemo-curator-container |
| 54 | + fi |
53 | 55 |
54 | | - # This runs the container which was pushed by build-container, which we call "nemo-curator-container" |
55 | | - # `--gpus all` ensures that all of the GPUs from our self-hosted-azure runner are available in the container |
56 | | - # We use "github.run_id" to identify the PR with the commits we want to run the PyTests with |
57 | | - # `bash -c "sleep infinity"` keeps the container running indefinitely without exiting |
58 | | - - name: Run Docker container |
59 | | - run: | |
60 | | - docker run --gpus all --name nemo-curator-container -d nemoci.azurecr.io/nemo_curator_container:${{ github.run_id }} bash -c "sleep infinity" |
| 56 | + # This runs the container which was pushed by build-container, which we call "nemo-curator-container" |
| 57 | + # `--gpus all` ensures that all of the GPUs from our self-hosted-azure runner are available in the container |
| 58 | + # We use "github.run_id" to identify the PR with the commits we want to run the PyTests with |
| 59 | + # `bash -c "sleep infinity"` keeps the container running indefinitely without exiting |
| 60 | + - name: Run Docker container |
| 61 | + run: | |
| 62 | + docker run --gpus all --name nemo-curator-container -d nemoci.azurecr.io/nemo_curator_container:${{ github.run_id }} bash -c "sleep infinity" |
61 | 63 |
62 | | - # Expect `whoami` to be "azureuser" |
63 | | - # Expect `nvidia-smi` to show our 2 A100 GPUs |
64 | | - - name: Check GPUs |
65 | | - run: | |
66 | | - whoami |
67 | | - docker exec nemo-curator-container nvidia-smi |
| 64 | + # Expect `whoami` to be "azureuser" |
| 65 | + # Expect `nvidia-smi` to show our 2 A100 GPUs |
| 66 | + - name: Check GPUs |
| 67 | + run: | |
| 68 | + whoami |
| 69 | + docker exec nemo-curator-container nvidia-smi |
68 | 70 |
69 | | - # In the virtual environment (called "curator") we created in the container, |
70 | | - # list all of our packages. Useful for debugging |
71 | | - - name: Verify installations |
72 | | - run: | |
73 | | - docker exec nemo-curator-container pip list |
| 71 | + # In the virtual environment (called "curator") we created in the container, |
| 72 | + # list all of our packages. Useful for debugging |
| 73 | + - name: Verify installations |
| 74 | + run: | |
| 75 | + docker exec nemo-curator-container pip list |
74 | 76 |
75 | | - # In the virtual environment (called "curator") we created in the container, |
76 | | - # run our PyTests marked with `@pytest.mark.gpu` |
77 | | - # We specify the `rootdir` to help locate the "pyproject.toml" file (which is in the root directory of the repository), |
78 | | - # and then the directory where the PyTests are located |
79 | | - - name: Run PyTests with GPU mark |
80 | | - run: | |
81 | | - docker exec nemo-curator-container pytest -m gpu --rootdir /opt/NeMo-Curator /opt/NeMo-Curator/tests |
| 77 | + # In the virtual environment (called "curator") we created in the container, |
| 78 | + # run our PyTests marked with `@pytest.mark.gpu` |
| 79 | + # We specify the `rootdir` to help locate the "pyproject.toml" file (which is in the root directory of the repository), |
| 80 | + # and then the directory where the PyTests are located |
| 81 | + - name: Run PyTests with GPU mark |
| 82 | + run: | |
| 83 | + docker exec nemo-curator-container pytest -m gpu --rootdir /opt/NeMo-Curator /opt/NeMo-Curator/tests |
82 | 84 |
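The `-m gpu` flag selects only tests decorated with `@pytest.mark.gpu`; a custom marker like this is presumably registered under `[tool.pytest.ini_options]` in the `pyproject.toml` that `--rootdir` points at (an assumption, not shown in this diff). One way to confirm the marker is registered, without running any tests:

```bash
# Assumes the container started above is still running; `pytest --markers`
# prints every registered marker, which should include the custom "gpu" mark
docker exec nemo-curator-container pytest --markers --rootdir /opt/NeMo-Curator
```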
83 | | - # After running `docker stop`, the container remains in an exited state |
84 | | - # It is still present on our system and could be restarted with `docker start` |
85 | | - # Thus, we use `docker rm` to permanently remove it from the system |
86 | | - - name: Cleanup |
87 | | - if: always() |
88 | | - run: | |
89 | | - docker stop nemo-curator-container && docker rm nemo-curator-container |
| 85 | + # After running `docker stop`, the container remains in an exited state |
| 86 | + # It is still present on our system and could be restarted with `docker start` |
| 87 | + # Thus, we use `docker rm` to permanently remove it from the system |
| 88 | + - name: Cleanup |
| 89 | + if: always() |
| 90 | + run: | |
| 91 | + docker stop nemo-curator-container && docker rm nemo-curator-container |
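Taken together, the steps above are easy to reproduce outside CI when debugging the runner. A minimal sketch, assuming local Docker with the NVIDIA runtime and pull access to the image (the tag below is a placeholder for the `github.run_id` value used in CI):

```bash
# Placeholder tag; in the workflow this is ${{ github.run_id }}
IMAGE="nemoci.azurecr.io/nemo_curator_container:<run_id>"

# Mirror the cleanup-then-run-then-test flow from the workflow above
docker rm -f nemo-curator-container 2>/dev/null || true
docker run --gpus all --name nemo-curator-container -d "$IMAGE" bash -c "sleep infinity"
docker exec nemo-curator-container nvidia-smi
docker exec nemo-curator-container pytest -m gpu --rootdir /opt/NeMo-Curator /opt/NeMo-Curator/tests
docker stop nemo-curator-container && docker rm nemo-curator-container
```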