Skip to content

Commit 684c7cf

Browse files
authored
[CI][AMD] Add env-file for better GPU isolation (#7650)
The env-file sets the HIP/ROCR visible devices so that different CI jobs can target different GPUs. This isolates the GPUs used by different CI job runs and helps avoid out-of-memory issues.
1 parent 062e38e commit 684c7cf

File tree

1 file changed

+24
-8
lines changed

1 file changed

+24
-8
lines changed

.github/workflows/integration-tests-amd.yml

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,25 @@ jobs:
1616
runner: ${{ fromJson(inputs.matrix) }}
1717
include:
1818
- image: rocm/pytorch:rocm6.2.2_ubuntu22.04_py3.10_pytorch_2.5.1_asan
19+
runner: ["self-hosted", "gfx90a"]
20+
# The cache is saved/restored on the host machine in /home/runner/.triton, while the Docker
21+
# container expects it at /github/home/.triton, so map the host directory here to make it visible in the container.
22+
options: >-
23+
--device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
24+
--volume /home/runner/.triton:/github/home/.triton
25+
- image: rocm/pytorch:rocm6.2.2_ubuntu22.04_py3.10_pytorch_2.5.1_asan
26+
runner: ["amd-gfx942"]
27+
# We add --env-file to pull in the HIP_VISIBLE_DEVICES and ROCR_VISIBLE_DEVICES definitions for GPU isolation.
28+
options: >-
29+
--device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
30+
--env-file /etc/podinfo/gha-gpu-isolation-settings
31+
--volume /home/runner/.triton:/github/home/.triton
1932
- image: rocm/7.0-preview:rocm7.0_preview_ubuntu22.04_llama2_70b_training_mlperf_mi35X_prealpha
2033
runner: ["amd-gfx950"]
34+
options: >-
35+
--device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
36+
--env-file /etc/podinfo/gha-gpu-isolation-settings
37+
--volume /home/runner/.triton:/github/home/.triton
2138
env:
2239
RUNNER_TYPE: ${{ matrix.runner[1] }}
2340
TRITON_BUILD_WITH_CCACHE: "true"
@@ -29,11 +46,7 @@ jobs:
2946
CCACHE_COMPRESS: "true"
3047
container:
3148
image: ${{ matrix.image }}
32-
# Cache save/restore is on the host machine at directory /home/runner/.triton, while in the docker
33-
# container expect it at /github/home/.triton. So map here to make sure visible in docker.
34-
options: >-
35-
--device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
36-
--volume /home/runner/.triton:/github/home/.triton
49+
options: ${{ matrix.options }}
3750
steps:
3851
- name: Checkout
3952
uses: actions/checkout@v4
@@ -94,6 +107,8 @@ jobs:
94107
run: ccache --print-stats
95108
- name: Run lit tests
96109
run: make test-lit
110+
- name: Run C++ unittests
111+
run: make test-cpp
97112
- name: Run python tests on AMD
98113
run: |
99114
INSTRUMENTATION_LIB_DIR="${GITHUB_WORKSPACE}/python/triton/instrumentation"
@@ -145,13 +160,13 @@ jobs:
145160
python3 -m pytest -s -n 8 ./test_cast_matmul.py
146161
- name: Run Proton tests
147162
run: |
163+
unset HIP_VISIBLE_DEVICES
164+
unset ROCR_VISIBLE_DEVICES
148165
if [ "${{ matrix.runner[0] }}" = "amd-gfx950" ]; then
149166
python3 -m pytest -s -n 8 third_party/proton/test -k "not test_instrument_exec"
150167
else
151168
make test-proton
152169
fi
153-
- name: Run C++ unittests
154-
run: make test-cpp
155170
- name: Inspect cache directories
156171
run: |
157172
mkdir -p ~/.triton
@@ -160,7 +175,8 @@ jobs:
160175
mkdir -p ~/.ccache
161176
du -h -d 1 ~/.ccache
162177
- name: Clean up caches
163-
# Always cleanup the worker, even if builds or tests failed
178+
# Always clean up the worker, even if builds or tests failed, given that these directories are
179+
# mapped from the host and we write files as the root user in the Docker container.
164180
if: always()
165181
run: |
166182
rm -rf ~/.triton/cache

0 commit comments

Comments
 (0)