1818 runner : ${{ fromJson(inputs.matrix) }}
1919 include :
2020 - image : rocm/pytorch:rocm6.2.2_ubuntu22.04_py3.10_pytorch_2.5.1_asan
21+ runner : ["self-hosted", "gfx90a"]
22+ # Cache save/restore happens on the host machine in /home/runner/.triton, while inside the docker
23+ # container the cache is expected at /github/home/.triton. Map the directory here so it is visible in docker.
24+ options : >-
25+ --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
26+ --volume /home/runner/.triton:/github/home/.triton
27+ - image : rocm/pytorch:rocm6.2.2_ubuntu22.04_py3.10_pytorch_2.5.1_asan
28+ runner : ["amd-gfx942"]
29+ # We add --env-file to pull in the HIP_VISIBLE_DEVICES and ROCR_VISIBLE_DEVICES definitions for GPU isolation.
30+ options : >-
31+ --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
32+ --env-file /etc/podinfo/gha-gpu-isolation-settings
33+ --volume /home/runner/.triton:/github/home/.triton
2134 - image : rocm/7.0-preview:rocm7.0_preview_ubuntu22.04_llama2_70b_training_mlperf_mi35X_prealpha
2235 runner : ["amd-gfx950"]
36+ options : >-
37+ --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
38+ --env-file /etc/podinfo/gha-gpu-isolation-settings
39+ --volume /home/runner/.triton:/github/home/.triton
2340 env :
2441 RUNNER_TYPE : ${{ matrix.runner[1] }}
2542 TRITON_BUILD_WITH_CCACHE : " true"
3148 CCACHE_COMPRESS : " true"
3249 container :
3350 image : ${{ matrix.image }}
34- # Cache save/restore is on the host machine at directory /home/runner/.triton, while in the docker
35- # container expect it at /github/home/.triton. So map here to make sure visible in docker.
36- options : >-
37- --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
38- --volume /home/runner/.triton:/github/home/.triton
51+ options : ${{ matrix.options }}
3952 steps :
4053 - name : Checkout
4154 uses : actions/checkout@v4
96109 run : ccache --print-stats
97110 - name : Run lit tests
98111 run : make test-lit
112+ - name : Run C++ unittests
113+ run : make test-cpp
99114 - name : Run python tests on AMD
100115 run : |
101116 INSTRUMENTATION_LIB_DIR="${GITHUB_WORKSPACE}/python/triton/instrumentation"
@@ -147,13 +162,13 @@ jobs:
147162 python3 -m pytest -s -n 8 ./test_cast_matmul.py
148163 - name : Run Proton tests
149164 run : |
165+ unset HIP_VISIBLE_DEVICES
166+ unset ROCR_VISIBLE_DEVICES
150167 if [ "${{ matrix.runner[0] }}" = "amd-gfx950" ]; then
151168 python3 -m pytest -s -n 8 third_party/proton/test -k "not test_instrument_exec"
152169 else
153170 make test-proton
154171 fi
155- - name : Run C++ unittests
156- run : make test-cpp
157172 - name : Inspect cache directories
158173 run : |
159174 mkdir -p ~/.triton
@@ -162,7 +177,8 @@ jobs:
162177 mkdir -p ~/.ccache
163178 du -h -d 1 ~/.ccache
164179 - name : Clean up caches
165- # Always cleanup the worker, even if builds or tests failed
180+ # Always clean up the worker, even if builds or tests failed, given that these directories are
181+ # mapped from the host and we write files as the root user in the docker container.
166182 if : always()
167183 run : |
168184 rm -rf ~/.triton/cache
0 commit comments