@@ -13,31 +13,33 @@ jobs:
   integration-tests-amd:
     runs-on: ${{ matrix.runner }}
     timeout-minutes: 45
-    continue-on-error: ${{ matrix.runner[1] == 'gfx90a' }}
+    continue-on-error: ${{ matrix.runner[1] == 'gfx90a' || matrix.runner[0] == 'amd-gfx950' }}
     strategy:
       matrix:
         runner: ${{ fromJson(inputs.matrix) }}
         include:
-          - image: rocm/pytorch:rocm6.2.2_ubuntu22.04_py3.10_pytorch_2.5.1_asan
+          - image: rocm/pytorch:rocm7.0_ubuntu22.04_py3.10_pytorch_release_2.8.0
             runner: ["self-hosted", "gfx90a"]
             # Cache save/restore happens on the host machine at /home/runner/.triton, while the docker
             # container expects it at /github/home/.triton, so map the volume here to make it visible in docker.
             options: >-
               --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
               --volume /home/runner/.triton:/github/home/.triton
-          - image: rocm/pytorch:rocm6.2.2_ubuntu22.04_py3.10_pytorch_2.5.1_asan
+          - image: rocm/pytorch:rocm7.0_ubuntu22.04_py3.10_pytorch_release_2.8.0
             runner: ["amd-gfx942"]
             # We add --env-file to pull in HIP_VISIBLE_DEVICES and ROCR_VISIBLE_DEVICES definition for GPU isolation.
             options: >-
               --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
               --env-file /etc/podinfo/gha-gpu-isolation-settings
               --volume /home/runner/.triton:/github/home/.triton
-          - image: rocm/7.0-preview:rocm7.0_preview_ubuntu22.04_llama2_70b_training_mlperf_mi35X_prealpha
+          - image: rocm/pytorch:rocm7.0_ubuntu22.04_py3.10_pytorch_release_2.8.0
             runner: ["amd-gfx950"]
+            # We add --env-file to pull in HIP_VISIBLE_DEVICES and ROCR_VISIBLE_DEVICES definition for GPU isolation.
             options: >-
               --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
               --env-file /etc/podinfo/gha-gpu-isolation-settings
               --volume /home/runner/.triton:/github/home/.triton
+              --volume /triton-data:/triton-data
     env:
       RUNNER_TYPE: ${{ matrix.runner[1] }}
       TRITON_BUILD_WITH_CCACHE: "true"
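
The continue-on-error expression and the include entries above key off positional runner labels: fromJson(inputs.matrix) turns a JSON string supplied by the caller into a list of label arrays, so matrix.runner[0] is the first label of an entry (e.g. "amd-gfx950") and matrix.runner[1], when present, the second (e.g. "gfx90a"). Below is a minimal hedged sketch of such a caller; the workflow filename and job name are hypothetical, and it assumes this file is a reusable workflow invoked via workflow_call.

jobs:
  amd-tests:
    # Hypothetical caller, not part of this PR: passes the runner matrix as a JSON string
    # whose entries line up with the include blocks above.
    uses: ./.github/workflows/integration-tests-amd.yml   # assumed filename
    with:
      matrix: '[["self-hosted", "gfx90a"], ["amd-gfx942"], ["amd-gfx950"]]'
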
@@ -83,14 +85,16 @@ jobs:
             ~/.triton/nvidia
             ~/.triton/json
           key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ steps.cache-key.outputs.llvm }}-nvidia-${{ steps.cache-key.outputs.nvidia }}-json-${{ steps.cache-key.outputs.json }}
+      - name: Install dependencies
+        run: apt-get install -y clang lld ccache
       - name: Inspect cache directories
         run: |
           mkdir -p ~/.triton
           du -h -d 1 ~/.triton

           mkdir -p ~/.ccache
           du -h -d 1 ~/.ccache
-      - name: Update compiler to clang
+      - name: Update compiler to Clang
         run: |
           export CC=/usr/bin/clang
           export CXX=/usr/bin/clang++
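
Before the compiler switch runs, a quick sanity check can confirm that the clang, lld, and ccache binaries added by the new Install dependencies step are actually on PATH inside the container. The step below is only a hedged sketch; its name and placement are hypothetical and it is not part of this PR.

      - name: Verify toolchain   # hypothetical step, not in the PR
        run: |
          clang --version    # compiler installed by the Install dependencies step
          ld.lld --version   # linker provided by the lld package
          ccache --version   # cache used when TRITON_BUILD_WITH_CCACHE is "true"
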
@@ -100,19 +104,19 @@ jobs:
           echo "PATH is '$PATH'"
           pip uninstall -y triton pytorch-triton-rocm

-          if [ "${{ matrix.runner[0] }}" != "amd-gfx950" ]; then
-            ccache --zero-stats
+          ccache --zero-stats
+          if [ "${{ matrix.runner[0] }}" = "amd-gfx950" ]; then
+            pip install --cache-dir /triton-data/pip-cache -r python/requirements.txt
+            pip install --cache-dir /triton-data/pip-cache -r python/test-requirements.txt
           fi
-
           make dev-install
-      - name: CCache Stats
-        if: ${{ matrix.runner[0] != 'amd-gfx950' }}
+      - name: Print ccache stats
         run: ccache --print-stats
       - name: Run lit tests
         run: make test-lit
       - name: Run C++ unittests
         run: make test-cpp
-      - name: Run python tests on AMD
+      - name: Run Python tests on AMD
         run: |
           INSTRUMENTATION_LIB_DIR="${GITHUB_WORKSPACE}/python/triton/instrumentation"
           if [ ! -d "${INSTRUMENTATION_LIB_DIR}" ]; then