Skip to content

Commit 36e36c8

Browse files
[rocm7.1_internal_testing] Fix issues with merge conflicts (#2461)
- Remove building tensorpipe for ROCm by reverting 550bc77, as this support is going to be dropped upstream as well.
- External/aotriton.cmake: remove use of __AOTRITON_VER_WITH_COMMIT.
- macros/Export.h: remove TORCH_HIP_CPP_API/TORCH_HIP_API and other hipified instances, as the CUDA ones get hipified and converted correctly (need to upstream this).
- CUDALoops.cuh: fix a bad merge (removes the USE_ROCM-only elementwise_kernel_manual_unroll and elementwise_kernel_strided kernels).
- Blas.cpp: remove MX patch (blockwise support is not upstreamed).
- cuda_vectorized_test.cu: remove the ROCmloops-specific test; it was removed in the rocm7.0_internal_testing branch, and I had incorrectly resolved the merge conflicts when merging with upstream.
- Update requirements-ci.txt to reflect both upstream and rocm/release/2.8 changes.

I tested this with the following docker image: `registry-sc-harbor.amd.com/framework/compute-rocm-rel-7.0:24_ubuntu24.04_py3.12_pytorch_lw_release-2.7_faae1f39` and ran all the "core" UTs:

export TESTS_TO_INCLUDE="test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor"
export TESTS_TO_INCLUDE="distributed/test_c10d_common distributed/test_c10d_nccl distributed/test_distributed_spawn"

Only the following UTs failed with accuracy issues:
- test/test_nn.py::TestNN::test_Transformer_multilayer_coder_cuda_tf32
- test/test_cuda.py::TestCudaMallocAsync::test_memory_snapshot
- test/distributed/test_distributed_spawn.py::TestDistBackendWithSpawn::test_ddp_profiling_execution_trace
- test/distributed/test_c10d_nccl.py::CommTest::test_intra_node_comm_all_reduce

Fixes #ISSUE_NUMBER

---------

Co-authored-by: Jithun Nair <[email protected]>
1 parent 1897a54 commit 36e36c8

File tree

12 files changed

+27
-125
lines changed

12 files changed

+27
-125
lines changed
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
f7888497a1eb9e98d4c07537f0d0bcfe180d1363
1+
711e2a92522e0a9921ce58ae658571ca55c49b97

.ci/docker/requirements-ci.txt

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -110,10 +110,8 @@ ninja==1.11.1.3
110110
#Pinned versions: 1.11.1.3
111111
#test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py
112112

113-
numba==0.49.0 ; python_version < "3.9"
114-
numba==0.55.2 ; python_version == "3.9"
115-
numba==0.55.2 ; python_version == "3.10"
116-
numba==0.60.0 ; python_version == "3.12"
113+
numba==0.60.0 ; python_version == "3.9"
114+
numba==0.61.2 ; python_version > "3.9"
117115
#Description: Just-In-Time Compiler for Numerical Functions
118116
#Pinned versions: 0.54.1, 0.49.0, <=0.49.1
119117
#test that import: test_numba_integration.py
@@ -135,6 +133,7 @@ numpy==2.0.2 ; python_version == "3.9"
135133
numpy==2.1.2 ; python_version > "3.9"
136134

137135
pandas==2.2.3
136+
138137
#onnxruntime
139138
#Description: scoring engine for Open Neural Network Exchange (ONNX) models
140139
#Pinned versions: 1.9.0
@@ -168,6 +167,7 @@ protobuf==5.29.4
168167
#Pinned versions: 5.29.4
169168
#test that import: test_tensorboard.py, test/onnx/*
170169

170+
171171
psutil
172172
#Description: information on running processes and system utilization
173173
#Pinned versions:
@@ -263,11 +263,6 @@ tb-nightly==2.13.0a20230426
263263
#Pinned versions:
264264
#test that import:
265265

266-
tlparse==0.3.30
267-
#Description: parse logs produced by torch.compile
268-
#Pinned versions:
269-
#test that import: dynamo/test_structured_trace.py
270-
271266
# needed by torchgen utils
272267
typing-extensions>=4.10.0
273268
#Description: type hints for python
@@ -326,7 +321,8 @@ pywavelets==1.7.0 ; python_version >= "3.12"
326321
#Pinned versions: 1.4.1
327322
#test that import:
328323

329-
lxml==5.3.0
324+
lxml==5.3.0 ; python_version <= "3.12"
325+
lxml==6.0.0 ; python_version == "3.13"
330326
#Description: This is a requirement of unittest-xml-reporting
331327

332328
# Python-3.9 binaries
@@ -340,6 +336,7 @@ sympy==1.13.3
340336

341337
onnx==1.18.0
342338
#Description: Required by onnx tests, and mypy and test_public_bindings.py when checking torch.onnx._internal
339+
343340
#Pinned versions:
344341
#test that import:
345342

aten/src/ATen/native/cuda/Blas.cpp

Lines changed: 2 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1205,7 +1205,6 @@ std::pair<ScalingType, ScalingType> get_joint_scaling(
12051205

12061206
} // namespace
12071207

1208-
12091208
// Computes matrix multiply + bias while applying scaling to input and output matrices
12101209
// Scales are only applicable when matrices are of Float8 type and assumed to be equal to 1.0 by default.
12111210
// If output matrix type is 16 or 32-bit type, scale_result is not applied.
@@ -1362,25 +1361,9 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
13621361
else {
13631362
TORCH_CHECK(b.dtype() == at::kFloat8_e4m3fn);
13641363
}
1365-
// Until more than bf16 is supported
1364+
// Until more than bf16 is supported.
13661365
TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16,
1367-
"hipblaslt rowwise _scaled_mm only supports BFloat16 output");
1368-
}
1369-
else if (scaling_choice == ScalingType::BlockWise) {
1370-
#if ROCM_VERSION >= 70000
1371-
TORCH_CHECK(at::detail::getCUDAHooks().isGPUArch({"gfx950"}),
1372-
"Block-wise scaling for Float8_e8m0fnu is only supported on gfx950");
1373-
1374-
TORCH_CHECK(mat1.size(0) % 32 == 0 && mat1.size(1) % 32 == 0 &&
1375-
mat2.size(0) % 32 == 0 && mat2.size(1) % 32 == 0,
1376-
"Matrix dimensions must be multiples of 32 for block-wise scaling");
1377-
1378-
TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16 ||
1379-
out.scalar_type() == ScalarType::Half,
1380-
"Block-wise scaling only supports BFloat16 or Half output types");
1381-
#else
1382-
TORCH_CHECK(false, "Block-wise scaling for Float8_e8m0fnu requires ROCm 7.0 or later");
1383-
#endif
1366+
"hipblaslt rowwise _scaled_mm only supports BFloat16 output but got ", out.scalar_type());
13841367
}
13851368
#endif
13861369

aten/src/ATen/native/cuda/CUDALoops.cuh

Lines changed: 0 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -531,44 +531,6 @@ __global__ void elementwise_kernel(int N, func_t f) {
531531
}
532532
}
533533

534-
#ifdef USE_ROCM
535-
template <int nt, int vt, typename func_t>
536-
C10_LAUNCH_BOUNDS_2(nt, 4)
537-
__global__ void elementwise_kernel_manual_unroll(int N, func_t f) {
538-
int tid = threadIdx.x;
539-
int nv = nt * vt;
540-
int idx = nv * blockIdx.x + tid;
541-
if ((idx + nt*(vt-1)) < N) {
542-
f(idx, true);
543-
} else {
544-
#pragma unroll
545-
for (int i = 0; i < vt; i++) {
546-
if (idx < N) {
547-
f(idx, false);
548-
idx += nt;
549-
}
550-
}
551-
}
552-
}
553-
554-
template <int nt, int vt, typename func_t>
555-
C10_LAUNCH_BOUNDS_2(nt, 4)
556-
__global__ void elementwise_kernel_strided(int N, func_t f) {
557-
int tid = threadIdx.x;
558-
int idx = nt * vt * blockIdx.x + tid;
559-
int step = nt * vt * gridDim.x;
560-
while (idx < N) {
561-
#pragma unroll
562-
for (int i = 0; i < vt; i++) {
563-
if ((idx + nt * i) < N) {
564-
f(idx + nt * i);
565-
}
566-
}
567-
idx += step;
568-
}
569-
}
570-
#endif
571-
572534
template <int nt, int vt, typename func_t>
573535
static void launch_legacy_kernel(int64_t N, const func_t& f) {
574536
TORCH_INTERNAL_ASSERT(N >= 0 && N <= std::numeric_limits<int32_t>::max());

aten/src/ATen/test/cuda_vectorized_test.cu

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -27,23 +27,6 @@ void reset_buffers() {
2727
}
2828
}
2929

30-
#if defined(USE_ROCM) && !defined(_WIN32)
31-
TEST(TestLoops, HasSameArgTypes) {
32-
// This is a compile-time unit test. If this file compiles without error,
33-
// then the test passes and during runtime, we just need to return.
34-
using namespace at::native::modern::detail;
35-
using func1_t = int (*)(float, float);
36-
using func2_t = int (*)(bool, float, float);
37-
using func3_t = int (*)(float);
38-
using func4_t = int (*)();
39-
static_assert(has_same_arg_types<func1_t>::value, "func1_t has the same argument types");
40-
static_assert(!has_same_arg_types<func2_t>::value, "func2_t does not have the same argument types");
41-
static_assert(has_same_arg_types<func3_t>::value, "func3_t has the same argument types");
42-
static_assert(has_same_arg_types<func4_t>::value, "func4_t has the same argument types");
43-
return;
44-
}
45-
#endif
46-
4730
TEST(TestVectorizedMemoryAccess, CanVectorizeUpTo) {
4831
char *ptr = reinterpret_cast<char *>(buffer1);
4932

cmake/Dependencies.cmake

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1159,14 +1159,6 @@ if(USE_DISTRIBUTED AND USE_TENSORPIPE)
11591159
set(TP_USE_CUDA ON CACHE BOOL "" FORCE)
11601160
set(TP_ENABLE_CUDA_IPC ON CACHE BOOL "" FORCE)
11611161
endif()
1162-
if(USE_ROCM)
1163-
add_compile_options(-D__HIP_PLATFORM_AMD__=1)
1164-
set(TP_USE_ROCM ON CACHE BOOL "" FORCE)
1165-
set(TP_ENABLE_HIP_IPC OFF CACHE BOOL "" FORCE)
1166-
set(TP_ENABLE_HIP_XTH OFF CACHE BOOL "" FORCE)
1167-
set(TP_ENABLE_HIP_GDR OFF CACHE BOOL "" FORCE)
1168-
set(TP_ENABLE_IBV OFF CACHE BOOL "" FORCE)
1169-
endif()
11701162
set(TP_BUILD_LIBUV ON CACHE BOOL "" FORCE)
11711163
add_compile_options(-DTORCH_USE_LIBUV)
11721164
include_directories(BEFORE SYSTEM ${CMAKE_CURRENT_LIST_DIR}/../third_party/tensorpipe/third_party/libuv/include)
@@ -1192,9 +1184,9 @@ if(USE_DISTRIBUTED AND USE_TENSORPIPE)
11921184
if(USE_CUDA)
11931185
list(APPEND Caffe2_CUDA_DEPENDENCY_LIBS tensorpipe_cuda)
11941186
elseif(USE_ROCM)
1195-
message(WARNING "TensorPipe is supported on ROCm")
1187+
message(WARNING "TensorPipe doesn't yet support ROCm")
11961188
# Not yet...
1197-
list(APPEND Caffe2_HIP_DEPENDENCY_LIBS tensorpipe_hip)
1189+
# list(APPEND Caffe2_HIP_DEPENDENCY_LIBS tensorpipe_hip)
11981190
endif()
11991191
endif()
12001192
endif()

cmake/External/aotriton.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ if(NOT __AOTRITON_INCLUDED)
8181
list(GET __AOTRITON_MANYLINUX_LIST ${__AOTRITON_ROCM_INDEX} __AOTRITON_MANYLINUX)
8282
set(__AOTRITON_ARCH ${CMAKE_HOST_SYSTEM_PROCESSOR})
8383
string(CONCAT __AOTRITON_FILE "aotriton-"
84-
"${__AOTRITON_VER_WITH_COMMIT}-${__AOTRITON_MANYLINUX}"
84+
"${__AOTRITON_VER}-${__AOTRITON_MANYLINUX}"
8585
"_${__AOTRITON_ARCH}-rocm${__AOTRITON_ROCM}"
8686
"-shared.tar.${__AOTRITON_Z}")
8787
string(CONCAT __AOTRITON_URL "https://github.com/ROCm/aotriton/releases/download/" # @lint-ignore

related_commits

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,10 @@
1-
ubuntu|pytorch|apex|master|ca54c058f1094b3463788371325025be707a5982|https://github.com/ROCm/apex
2-
centos|pytorch|apex|master|ca54c058f1094b3463788371325025be707a5982|https://github.com/ROCm/apex
3-
ubuntu|pytorch|torchvision|main|f52c4f1afd7dec25cbe7b98bcf1cbc840298e8da|https://github.com/pytorch/vision
4-
centos|pytorch|torchvision|main|f52c4f1afd7dec25cbe7b98bcf1cbc840298e8da|https://github.com/pytorch/vision
5-
ubuntu|pytorch|torchtext|main|bde7ecdb6ba9179ccd30cde60a6550478d0a359f|https://github.com/pytorch/text
6-
centos|pytorch|torchtext|main|bde7ecdb6ba9179ccd30cde60a6550478d0a359f|https://github.com/pytorch/text
7-
ubuntu|pytorch|torchdata|main|922ac065407546b9cb4f629ab99f1fbf04d8fc12|https://github.com/pytorch/data
8-
centos|pytorch|torchdata|main|922ac065407546b9cb4f629ab99f1fbf04d8fc12|https://github.com/pytorch/data
9-
ubuntu|pytorch|torchaudio|main|bccaa454a54c3c648697cc2f46a4fb0500b1f01b|https://github.com/pytorch/audio
10-
centos|pytorch|torchaudio|main|bccaa454a54c3c648697cc2f46a4fb0500b1f01b|https://github.com/pytorch/audio
11-
ubuntu|pytorch|ao|main|a96eeb1c7d7ba24cf0ccfc105141729acfed22bf|https://github.com/pytorch/ao
12-
centos|pytorch|ao|main|a96eeb1c7d7ba24cf0ccfc105141729acfed22bf|https://github.com/pytorch/ao
1+
ubuntu|pytorch|apex|master|62c94ed1789bc177a83567985be6c1cb29b2d98c|https://github.com/ROCm/apex
2+
centos|pytorch|apex|master|62c94ed1789bc177a83567985be6c1cb29b2d98c|https://github.com/ROCm/apex
3+
ubuntu|pytorch|torchvision|main|98f8b3757c0648724064ca95434b18281c43c5f6|https://github.com/pytorch/vision
4+
centos|pytorch|torchvision|main|98f8b3757c0648724064ca95434b18281c43c5f6|https://github.com/pytorch/vision
5+
ubuntu|pytorch|torchdata|main|a05a54f797dd0f1a66610652a949fd47243ff952|https://github.com/pytorch/data
6+
centos|pytorch|torchdata|main|a05a54f797dd0f1a66610652a949fd47243ff952|https://github.com/pytorch/data
7+
ubuntu|pytorch|torchaudio|main|0c22347335f4c9a5b92a2f5bad65e05e2464c184|https://github.com/pytorch/audio
8+
centos|pytorch|torchaudio|main|0c22347335f4c9a5b92a2f5bad65e05e2464c184|https://github.com/pytorch/audio
9+
ubuntu|pytorch|ao|main|3b4bc9869d933927b2547d8231feab69789a80d4|https://github.com/pytorch/ao
10+
centos|pytorch|ao|main|3b4bc9869d933927b2547d8231feab69789a80d4|https://github.com/pytorch/ao

torch/csrc/distributed/rpc/tensorpipe_cuda.cpp

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#include <torch/csrc/distributed/rpc/tensorpipe_agent.h>
22
#include <torch/csrc/distributed/rpc/tensorpipe_utils.h>
33

4-
#if defined(USE_TENSORPIPE)
4+
#if defined(USE_TENSORPIPE) && !defined(USE_ROCM)
55

66
#include <c10/cuda/CUDACachingAllocator.h>
77
#include <c10/cuda/CUDAGuard.h>
@@ -50,8 +50,6 @@ C10_REGISTER_CREATOR(TensorPipeChannelRegistry, cuda_gdr, makeCudaGdrChannel)
5050

5151
#endif
5252

53-
#if TENSORPIPE_HAS_CUDA_XTH_CHANNEL
54-
5553
std::unique_ptr<ChannelRegistration> makeCudaXthChannel() {
5654
auto context = tensorpipe::channel::cuda_xth::create();
5755
return std::make_unique<ChannelRegistration>(
@@ -61,8 +59,6 @@ std::unique_ptr<ChannelRegistration> makeCudaXthChannel() {
6159
// The cuda_xth channel supports same-process GPU-to-GPU comm
6260
C10_REGISTER_CREATOR(TensorPipeChannelRegistry, cuda_xth, makeCudaXthChannel)
6361

64-
#endif
65-
6662
std::unique_ptr<ChannelRegistration> makeCudaBasicChannel() {
6763
auto context = tensorpipe::channel::cuda_basic::create(
6864
tensorpipe::channel::basic::create());

0 commit comments

Comments
 (0)