Skip to content

Commit 57f36c9

Browse files
xinyazhang, jeffdaily, and pragupta
authored and committed
[ROCm][CI] Upgrade ROCm CI to 7.1 (pytorch#166743)
Upgrade all the ROCm docker images to the ROCm 7.1 release version.

Pull Request resolved: pytorch#166743
Approved by: https://github.com/atalman

Co-authored-by: Jeff Daily <[email protected]>
Co-authored-by: Prachi Gupta <[email protected]>
1 parent ee5610f commit 57f36c9

File tree

6 files changed

+37
-16
lines changed

6 files changed

+37
-16
lines changed

.ci/docker/build.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -188,7 +188,7 @@ case "$tag" in
188188
fi
189189
GCC_VERSION=11
190190
VISION=yes
191-
ROCM_VERSION=7.0
191+
ROCM_VERSION=7.1
192192
NINJA_VERSION=1.9.0
193193
TRITON=yes
194194
KATEX=yes

.ci/docker/common/install_rocm.sh

Lines changed: 10 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -60,14 +60,16 @@ EOF
6060
DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated rocm-llvm-dev
6161
fi
6262

63-
# precompiled miopen kernels added in ROCm 3.5, renamed in ROCm 5.5
64-
# search for all unversioned packages
65-
# if search fails it will abort this script; use true to avoid case where search fails
66-
MIOPENHIPGFX=$(apt-cache search --names-only miopen-hip-gfx | awk '{print $1}' | grep -F -v . || true)
67-
if [[ "x${MIOPENHIPGFX}" = x ]]; then
68-
echo "miopen-hip-gfx package not available" && exit 1
69-
else
70-
DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENHIPGFX}
63+
if [[ $(ver $ROCM_VERSION) -lt $(ver 7.1) ]]; then
64+
# precompiled miopen kernels added in ROCm 3.5, renamed in ROCm 5.5, removed in ROCm 7.1
65+
# search for all unversioned packages
66+
# if search fails it will abort this script; use true to avoid case where search fails
67+
MIOPENHIPGFX=$(apt-cache search --names-only miopen-hip-gfx | awk '{print $1}' | grep -F -v . || true)
68+
if [[ "x${MIOPENHIPGFX}" = x ]]; then
69+
echo "miopen-hip-gfx package not available" && exit 1
70+
else
71+
DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENHIPGFX}
72+
fi
7173
fi
7274

7375
# ROCm 6.0 had a regression where journal_mode was enabled on the kdb files resulting in permission errors at runtime

.ci/docker/common/install_rocm_magma.sh

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -12,8 +12,8 @@ function do_install() {
1212

1313
rocm_version_nodot=${rocm_version//./}
1414

15-
# post merge of https://github.com/icl-utk-edu/magma/pull/65
16-
MAGMA_VERSION=c0792ae825fb36872784892ea643dd6f3456bc5f
15+
# https://github.com/icl-utk-edu/magma/pull/65
16+
MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec
1717
magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"
1818

1919
rocm_dir="/opt/rocm"

cmake/External/aotriton.cmake

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -15,19 +15,22 @@ if(NOT __AOTRITON_INCLUDED)
1515
"manylinux_2_28" # rocm6.3
1616
"manylinux_2_28" # rocm6.4
1717
"manylinux_2_28" # rocm7.0
18+
"manylinux_2_28" # rocm7.1
1819
)
1920
set(__AOTRITON_ROCM_LIST
2021
"rocm6.2"
2122
"rocm6.3"
2223
"rocm6.4"
2324
"rocm7.0"
25+
"rocm7.1"
2426
)
2527
set(__AOTRITON_CI_COMMIT "972223c501ffc22068bb035ac5d64cf54318d895")
2628
set(__AOTRITON_SHA256_LIST
2729
"6cae3d5de75ee205d22e088f7dfaab1227056d02ea67f29ccdbc09f2be4e8c8f" # rocm6.2
2830
"72a153549ea20707331e8a1f1e3d1b8de2913f9d5af2b900c56235d578b57efe" # rocm6.3
2931
"c7f319dd7448cbbbab81889dd8a37d47dbc25ebcbd89760f09e6a0904e556393" # rocm6.4
3032
"a2a974e0ad929a5e5827c0f896c59bda4872459cbaf8dd8e0a00407f404491cf" # rocm7.0
33+
"d4eb24c9f1a0cfedb35f9292efb41d16589cf5a4b98c3c0940181bbefc49d722" # rocm7.1
3134
)
3235
set(__AOTRITON_IMAGE_LIST
3336
"amd-gfx90a"

test/distributed/_composable/fsdp/test_fully_shard_comm.py

Lines changed: 17 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -59,7 +59,12 @@
5959
patch_reshard,
6060
patch_unshard,
6161
)
62-
from torch.testing._internal.common_utils import run_tests, TEST_XPU, xfailIf
62+
from torch.testing._internal.common_utils import (
63+
run_tests,
64+
TEST_WITH_ROCM,
65+
TEST_XPU,
66+
xfailIf,
67+
)
6368
from torch.testing._internal.distributed._tensor.common_dtensor import (
6469
FeedForward,
6570
ModelArgs,
@@ -1658,10 +1663,17 @@ def test_exception_when_used_together_with_comm_hooks(self):
16581663
class TestFullyShardForceSumReduction(FSDPTest):
16591664
# The messages might change when we move to a different NCCL version.
16601665
# Please update this test if it starts failing.
1661-
COLLECTIVE_RE = (
1662-
"NCCL INFO {coll}: opCount [0-9a-f]+ sendbuff 0x[0-9a-f]+ recvbuff 0x[0-9a-f]+ "
1663-
"count {count} datatype [0-9]+ op {reduce_op} root [0-9]+ comm 0x[0-9a-f]+"
1664-
)
1666+
1667+
if TEST_WITH_ROCM and torch.cuda.nccl.version()[:2] >= (2, 27):
1668+
COLLECTIVE_RE = (
1669+
r"NCCL INFO {coll}: opCount [0-9a-f]+ sendbuff 0x[0-9a-f]+ recvbuff 0x[0-9a-f]+ acc \(nil\) "
1670+
"count {count} datatype [0-9]+ op {reduce_op} root [0-9]+ comm 0x[0-9a-f]+"
1671+
)
1672+
else:
1673+
COLLECTIVE_RE = (
1674+
"NCCL INFO {coll}: opCount [0-9a-f]+ sendbuff 0x[0-9a-f]+ recvbuff 0x[0-9a-f]+ "
1675+
"count {count} datatype [0-9]+ op {reduce_op} root [0-9]+ comm 0x[0-9a-f]+"
1676+
)
16651677
# See here for the numerical values for each reduction op:
16661678
# https://github.com/NVIDIA/nccl/blob/72d2432094d6ae36abd6e511c3a16a2d052dbf94/src/nccl.h.in#L260-L275
16671679
SUM_REDUCTION = 0

torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -718,7 +718,11 @@ c10::intrusive_ptr<CUDASymmetricMemory> make_symm_mem(
718718
#elif defined(USE_ROCM)
719719
C10_HIP_CHECK(hipMemImportFromShareableHandle(
720720
&handles[r],
721+
#if ROCM_VERSION >= 70100
722+
reinterpret_cast<void*>(static_cast<uintptr_t>(imported_handles[r])),
723+
#else
721724
(void*)(uintptr_t) & (imported_handles[r]),
725+
#endif
722726
hipMemHandleTypePosixFileDescriptor));
723727
#else
724728
TORCH_CHECK(

0 commit comments

Comments
 (0)