File tree Expand file tree Collapse file tree 6 files changed +37
-16
lines changed
test/distributed/_composable/fsdp
torch/csrc/distributed/c10d/symm_mem Expand file tree Collapse file tree 6 files changed +37
-16
lines changed Original file line number Diff line number Diff line change @@ -188,7 +188,7 @@ case "$tag" in
188188 fi
189189 GCC_VERSION=11
190190 VISION=yes
191- ROCM_VERSION=7.0
191+ ROCM_VERSION=7.1
192192 NINJA_VERSION=1.9.0
193193 TRITON=yes
194194 KATEX=yes
Original file line number Diff line number Diff line change 6060 DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated rocm-llvm-dev
6161 fi
6262
63- # precompiled miopen kernels added in ROCm 3.5, renamed in ROCm 5.5
64- # search for all unversioned packages
65- # if search fails it will abort this script; use true to avoid case where search fails
66- MIOPENHIPGFX=$( apt-cache search --names-only miopen-hip-gfx | awk ' {print $1}' | grep -F -v . || true)
67- if [[ " x${MIOPENHIPGFX} " = x ]]; then
68- echo " miopen-hip-gfx package not available" && exit 1
69- else
70- DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENHIPGFX}
63+ if [[ $( ver $ROCM_VERSION ) -lt $( ver 7.1) ]]; then
64+ # precompiled miopen kernels added in ROCm 3.5, renamed in ROCm 5.5, removed in ROCm 7.1
65+ # search for all unversioned packages
66+ # if search fails it will abort this script; use true to avoid case where search fails
67+ MIOPENHIPGFX=$( apt-cache search --names-only miopen-hip-gfx | awk ' {print $1}' | grep -F -v . || true)
68+ if [[ " x${MIOPENHIPGFX} " = x ]]; then
69+ echo " miopen-hip-gfx package not available" && exit 1
70+ else
71+ DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENHIPGFX}
72+ fi
7173 fi
7274
7375 # ROCm 6.0 had a regression where journal_mode was enabled on the kdb files resulting in permission errors at runtime
Original file line number Diff line number Diff line change @@ -12,8 +12,8 @@ function do_install() {
1212
1313 rocm_version_nodot=${rocm_version// ./ }
1414
15- # post merge of https://github.com/icl-utk-edu/magma/pull/65
16- MAGMA_VERSION=c0792ae825fb36872784892ea643dd6f3456bc5f
15+ # https://github.com/icl-utk-edu/magma/pull/65
16+ MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec
1717 magma_archive=" magma-rocm${rocm_version_nodot} -${MAGMA_VERSION} -1.tar.bz2"
1818
1919 rocm_dir=" /opt/rocm"
Original file line number Diff line number Diff line change @@ -15,19 +15,22 @@ if(NOT __AOTRITON_INCLUDED)
1515 "manylinux_2_28" # rocm6.3
1616 "manylinux_2_28" # rocm6.4
1717 "manylinux_2_28" # rocm7.0
18+ "manylinux_2_28" # rocm7.1
1819 )
1920 set (__AOTRITON_ROCM_LIST
2021 "rocm6.2"
2122 "rocm6.3"
2223 "rocm6.4"
2324 "rocm7.0"
25+ "rocm7.1"
2426 )
2527 set (__AOTRITON_CI_COMMIT "972223c501ffc22068bb035ac5d64cf54318d895" )
2628 set (__AOTRITON_SHA256_LIST
2729 "6cae3d5de75ee205d22e088f7dfaab1227056d02ea67f29ccdbc09f2be4e8c8f" # rocm6.2
2830 "72a153549ea20707331e8a1f1e3d1b8de2913f9d5af2b900c56235d578b57efe" # rocm6.3
2931 "c7f319dd7448cbbbab81889dd8a37d47dbc25ebcbd89760f09e6a0904e556393" # rocm6.4
3032 "a2a974e0ad929a5e5827c0f896c59bda4872459cbaf8dd8e0a00407f404491cf" # rocm7.0
33+ "d4eb24c9f1a0cfedb35f9292efb41d16589cf5a4b98c3c0940181bbefc49d722" # rocm7.1
3134 )
3235 set (__AOTRITON_IMAGE_LIST
3336 "amd-gfx90a"
Original file line number Diff line number Diff line change 5959 patch_reshard ,
6060 patch_unshard ,
6161)
62- from torch .testing ._internal .common_utils import run_tests , TEST_XPU , xfailIf
62+ from torch .testing ._internal .common_utils import (
63+ run_tests ,
64+ TEST_WITH_ROCM ,
65+ TEST_XPU ,
66+ xfailIf ,
67+ )
6368from torch .testing ._internal .distributed ._tensor .common_dtensor import (
6469 FeedForward ,
6570 ModelArgs ,
@@ -1658,10 +1663,17 @@ def test_exception_when_used_together_with_comm_hooks(self):
16581663class TestFullyShardForceSumReduction (FSDPTest ):
16591664 # The messages might change when we move to a different NCCL version.
16601665 # Please update this test if it starts failing.
1661- COLLECTIVE_RE = (
1662- "NCCL INFO {coll}: opCount [0-9a-f]+ sendbuff 0x[0-9a-f]+ recvbuff 0x[0-9a-f]+ "
1663- "count {count} datatype [0-9]+ op {reduce_op} root [0-9]+ comm 0x[0-9a-f]+"
1664- )
1666+
1667+ if TEST_WITH_ROCM and torch .cuda .nccl .version ()[:2 ] >= (2 , 27 ):
1668+ COLLECTIVE_RE = (
1669+ r"NCCL INFO {coll}: opCount [0-9a-f]+ sendbuff 0x[0-9a-f]+ recvbuff 0x[0-9a-f]+ acc \(nil\) "
1670+ "count {count} datatype [0-9]+ op {reduce_op} root [0-9]+ comm 0x[0-9a-f]+"
1671+ )
1672+ else :
1673+ COLLECTIVE_RE = (
1674+ "NCCL INFO {coll}: opCount [0-9a-f]+ sendbuff 0x[0-9a-f]+ recvbuff 0x[0-9a-f]+ "
1675+ "count {count} datatype [0-9]+ op {reduce_op} root [0-9]+ comm 0x[0-9a-f]+"
1676+ )
16651677 # See here for the numerical values for each reduction op:
16661678 # https://github.com/NVIDIA/nccl/blob/72d2432094d6ae36abd6e511c3a16a2d052dbf94/src/nccl.h.in#L260-L275
16671679 SUM_REDUCTION = 0
Original file line number Diff line number Diff line change @@ -718,7 +718,11 @@ c10::intrusive_ptr<CUDASymmetricMemory> make_symm_mem(
718718#elif defined(USE_ROCM)
719719 C10_HIP_CHECK (hipMemImportFromShareableHandle (
720720 &handles[r],
721+ #if ROCM_VERSION >= 70100
722+ reinterpret_cast <void *>(static_cast <uintptr_t >(imported_handles[r])),
723+ #else
721724 (void *)(uintptr_t ) & (imported_handles[r]),
725+ #endif
722726 hipMemHandleTypePosixFileDescriptor));
723727#else
724728 TORCH_CHECK (
You can’t perform that action at this time.
0 commit comments