7 | 7 | pull_request: |
8 | 8 | branches: |
9 | 9 | # We can run gpuCI on any PR targeting these branches |
10 | | - - 'main' |
11 | | - - '[rv][0-9].[0-9].[0-9]' |
12 | | - - '[rv][0-9].[0-9].[0-9]rc[0-9]' |
| 10 | + - "main" |
| 11 | + - "[rv][0-9].[0-9].[0-9]" |
| 12 | + - "[rv][0-9].[0-9].[0-9]rc[0-9]" |
13 | 13 | # PR has to be labeled with "gpuCI" label |
14 | 14 | # If new commits are added, the "gpuCI" label has to be removed and re-added to rerun gpuCI |
15 | | - types: [ labeled ] |
| 15 | + types: [labeled] |
16 | 16 |
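The three branch filters above are character-class globs: besides `main`, gpuCI can run on PRs targeting release branches such as `v1.2.3` or `r0.4.0rc1`. As a rough sketch (GitHub's filter globs behave like shell patterns for these simple cases; the branch names below are hypothetical):

```bash
# Hypothetical branch names run through shell patterns equivalent to the filters above
for branch in main v1.2.3 r0.4.0rc1 feature/foo; do
  case "$branch" in
    main | [rv][0-9].[0-9].[0-9] | [rv][0-9].[0-9].[0-9]rc[0-9]) echo "$branch: gpuCI can run" ;;
    *) echo "$branch: no gpuCI" ;;
  esac
done
```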
17 | 17 | concurrency: |
18 | 18 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} |
@@ -40,50 +40,52 @@ jobs: |
40 | 40 | # This is the tag on our Azure runner found in Actions -> Runners -> Self-hosted runners |
41 | 41 | # It has 2 A100 GPUs |
42 | 42 | runs-on: self-hosted-azure |
| 43 | + # Unit tests shouldn't take longer than 30 minutes |
| 44 | + timeout-minutes: 30 |
43 | 45 | # "run-gpu-tests" job is run if the "gpuci" label is added to the PR |
44 | 46 | if: ${{ github.event.label.name == 'gpuci' || github.ref == 'refs/heads/main' }} |
45 | 47 |
46 | 48 | steps: |
47 | 49 | # If something went wrong during the last cleanup, this step ensures any existing container is removed |
48 | | - - name: Remove existing container if it exists |
49 | | - run: | |
50 | | - if [ "$(docker ps -aq -f name=nemo-curator-container)" ]; then |
51 | | - docker rm -f nemo-curator-container |
52 | | - fi |
| 50 | + - name: Remove existing container if it exists |
| 51 | + run: | |
| 52 | + if [ "$(docker ps -aq -f name=nemo-curator-container)" ]; then |
| 53 | + docker rm -f nemo-curator-container |
| 54 | + fi |
53 | 55 |
54 | | - # This runs the container which was pushed by build-container, which we call "nemo-curator-container" |
55 | | - # `--gpus all` ensures that all of the GPUs from our self-hosted-azure runner are available in the container |
56 | | - # We use "github.run_id" to identify the PR with the commits we want to run the PyTests with |
57 | | - # `bash -c "sleep infinity"` keeps the container running indefinitely without exiting |
58 | | - - name: Run Docker container |
59 | | - run: | |
60 | | - docker run --gpus all --name nemo-curator-container -d nemoci.azurecr.io/nemo_curator_container:${{ github.run_id }} bash -c "sleep infinity" |
| 56 | + # This runs the container which was pushed by build-container, which we call "nemo-curator-container" |
| 57 | + # `--gpus all` ensures that all of the GPUs from our self-hosted-azure runner are available in the container |
| 58 | + # We use "github.run_id" to identify the PR with the commits we want to run the PyTests with |
| 59 | + # `bash -c "sleep infinity"` keeps the container running indefinitely without exiting |
| 60 | + - name: Run Docker container |
| 61 | + run: | |
| 62 | + docker run --gpus all --name nemo-curator-container -d nemoci.azurecr.io/nemo_curator_container:${{ github.run_id }} bash -c "sleep infinity" |
61 | 63 |
62 | | - # Expect `whoami` to be "azureuser" |
63 | | - # Expect `nvidia-smi` to show our 2 A100 GPUs |
64 | | - - name: Check GPUs |
65 | | - run: | |
66 | | - whoami |
67 | | - docker exec nemo-curator-container nvidia-smi |
| 64 | + # Expect `whoami` to be "azureuser" |
| 65 | + # Expect `nvidia-smi` to show our 2 A100 GPUs |
| 66 | + - name: Check GPUs |
| 67 | + run: | |
| 68 | + whoami |
| 69 | + docker exec nemo-curator-container nvidia-smi |
68 | 70 |
69 | | - # In the virtual environment (called "curator") we created in the container, |
70 | | - # list all of our packages. Useful for debugging |
71 | | - - name: Verify installations |
72 | | - run: | |
73 | | - docker exec nemo-curator-container pip list |
| 71 | + # In the virtual environment (called "curator") we created in the container, |
| 72 | + # list all of our packages. Useful for debugging |
| 73 | + - name: Verify installations |
| 74 | + run: | |
| 75 | + docker exec nemo-curator-container pip list |
74 | 76 |
75 | | - # In the virtual environment (called "curator") we created in the container, |
76 | | - # run our PyTests marked with `@pytest.mark.gpu` |
77 | | - # We specify the `rootdir` to help locate the "pyproject.toml" file (which is in the root directory of the repository), |
78 | | - # and then the directory where the PyTests are located |
79 | | - - name: Run PyTests with GPU mark |
80 | | - run: | |
81 | | - docker exec nemo-curator-container pytest -m gpu --rootdir /opt/NeMo-Curator /opt/NeMo-Curator/tests |
| 77 | + # In the virtual environment (called "curator") we created in the container, |
| 78 | + # run our PyTests marked with `@pytest.mark.gpu` |
| 79 | + # We specify the `rootdir` to help locate the "pyproject.toml" file (which is in the root directory of the repository), |
| 80 | + # and then the directory where the PyTests are located |
| 81 | + - name: Run PyTests with GPU mark |
| 82 | + run: | |
| 83 | + docker exec nemo-curator-container pytest -m gpu --rootdir /opt/NeMo-Curator /opt/NeMo-Curator/tests |
82 | 84 |
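The `-m gpu` flag selects only tests decorated with `@pytest.mark.gpu`; a custom marker like this is presumably registered under `[tool.pytest.ini_options]` in the `pyproject.toml` that `--rootdir` points at (an assumption, not shown in this diff). One way to confirm the marker is registered, without running any tests:

```bash
# Assumes the container started above is still running; `pytest --markers`
# prints every registered marker, which should include the custom "gpu" mark
docker exec nemo-curator-container pytest --markers --rootdir /opt/NeMo-Curator
```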
83 | | - # After running `docker stop`, the container remains in an exited state |
84 | | - # It is still present on our system and could be restarted with `docker start` |
85 | | - # Thus, we use `docker rm` to permanently remove it from the system |
86 | | - - name: Cleanup |
87 | | - if: always() |
88 | | - run: | |
89 | | - docker stop nemo-curator-container && docker rm nemo-curator-container |
| 85 | + # After running `docker stop`, the container remains in an exited state |
| 86 | + # It is still present on our system and could be restarted with `docker start` |
| 87 | + # Thus, we use `docker rm` to permanently remove it from the system |
| 88 | + - name: Cleanup |
| 89 | + if: always() |
| 90 | + run: | |
| 91 | + docker stop nemo-curator-container && docker rm nemo-curator-container |
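Taken together, the steps above are easy to reproduce outside CI when debugging the runner. A minimal sketch, assuming local Docker with the NVIDIA runtime and pull access to the image (the tag below is a placeholder for the `github.run_id` value used in CI):

```bash
# Placeholder tag; in the workflow this is ${{ github.run_id }}
IMAGE="nemoci.azurecr.io/nemo_curator_container:<run_id>"

# Mirror the cleanup-then-run-then-test flow from the workflow above
docker rm -f nemo-curator-container 2>/dev/null || true
docker run --gpus all --name nemo-curator-container -d "$IMAGE" bash -c "sleep infinity"
docker exec nemo-curator-container nvidia-smi
docker exec nemo-curator-container pytest -m gpu --rootdir /opt/NeMo-Curator /opt/NeMo-Curator/tests
docker stop nemo-curator-container && docker rm nemo-curator-container
```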