[rocm7.1_internal_testing] Fix issues with merge conflicts (#2461)

pragupta · jithunnair-amd · web-flow · commit 36e36c8ff19b · 2025-08-08T16:21:59.000-05:00
- Remove building tensorpipe for ROCm by reverting 550bc77, as this support is going to be dropped upstream as well.
- External/aotriton.cmake: remove use of `__AOTRITON_VER_WITH_COMMIT`.
- macros/Export.h: remove TORCH_HIP_CPP_API/TORCH_HIP_API and other hipified instances, as the CUDA ones get hipified and converted correctly (need to upstream this).
- CUDALoops.cuh: fix a bad merge.
- Blas.cpp: remove MX patch (Blockwise support is not upstreamed).
- cuda_vectorized_test.cu: remove the ROCmloops-specific test; this was removed in the rocm7.0_internal_testing branch. I had incorrectly addressed the merge conflicts when merging with upstream.
- Update requirements-ci.txt to reflect both upstream and rocm/release/2.8 changes.

I tested this with the following docker image: `registry-sc-harbor.amd.com/framework/compute-rocm-rel-7.0:24_ubuntu24.04_py3.12_pytorch_lw_release-2.7_faae1f39` and ran all the "core" UTs.

    export TESTS_TO_INCLUDE="test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor"
    export TESTS_TO_INCLUDE="distributed/test_c10d_common distributed/test_c10d_nccl distributed/test_distributed_spawn"

Only the following UTs failed with accuracy issues:
- test/test_nn.py::TestNN::test_Transformer_multilayer_coder_cuda_tf32
- test/test_cuda.py::TestCudaMallocAsync::test_memory_snapshot
- test/distributed/test_distributed_spawn.py::TestDistBackendWithSpawn::test_ddp_profiling_execution_trace
- test/distributed/test_c10d_nccl.py::CommTest::test_intra_node_comm_all_reduce

Fixes #ISSUE_NUMBER

---------

Co-authored-by: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com>
diff --git a/.ci/docker/ci_commit_pins/triton.txt b/.ci/docker/ci_commit_pins/triton.txt
@@ -1 +1 @@
-f7888497a1eb9e98d4c07537f0d0bcfe180d1363
+711e2a92522e0a9921ce58ae658571ca55c49b97
diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt
@@ -110,10 +110,8 @@ ninja==1.11.1.3
 #Pinned versions: 1.11.1.3
 #test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py
 
-numba==0.49.0 ; python_version < "3.9"
-numba==0.55.2 ; python_version == "3.9"
-numba==0.55.2 ; python_version == "3.10"
-numba==0.60.0 ; python_version == "3.12"
+numba==0.60.0 ; python_version == "3.9"
+numba==0.61.2 ; python_version > "3.9"
 #Description: Just-In-Time Compiler for Numerical Functions
 #Pinned versions: 0.54.1, 0.49.0, <=0.49.1
 #test that import: test_numba_integration.py
@@ -135,6 +133,7 @@ numpy==2.0.2 ; python_version == "3.9"
 numpy==2.1.2 ; python_version > "3.9"
 
 pandas==2.2.3
+
 #onnxruntime
 #Description: scoring engine for Open Neural Network Exchange (ONNX) models
 #Pinned versions: 1.9.0
@@ -168,6 +167,7 @@ protobuf==5.29.4
 #Pinned versions: 5.29.4
 #test that import: test_tensorboard.py, test/onnx/*
 
+
 psutil
 #Description: information on running processes and system utilization
 #Pinned versions:
@@ -263,11 +263,6 @@ tb-nightly==2.13.0a20230426
 #Pinned versions:
 #test that import:
 
-tlparse==0.3.30
-#Description: parse logs produced by torch.compile
-#Pinned versions:
-#test that import: dynamo/test_structured_trace.py
-
 # needed by torchgen utils
 typing-extensions>=4.10.0
 #Description: type hints for python
@@ -326,7 +321,8 @@ pywavelets==1.7.0 ; python_version >= "3.12"
 #Pinned versions: 1.4.1
 #test that import:
 
-lxml==5.3.0
+lxml==5.3.0 ; python_version <= "3.12"
+lxml==6.0.0 ; python_version == "3.13"
 #Description: This is a requirement of unittest-xml-reporting
 
 # Python-3.9 binaries
@@ -340,6 +336,7 @@ sympy==1.13.3
 
 onnx==1.18.0
 #Description: Required by onnx tests, and mypy and test_public_bindings.py when checking torch.onnx._internal
+
 #Pinned versions:
 #test that import:
 
diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp
@@ -1205,7 +1205,6 @@ std::pair<ScalingType, ScalingType> get_joint_scaling(
 
 } // namespace
 
-
 // Computes matrix multiply + bias while applying scaling to input and output matrices
 // Scales are only applicable when matrices are of Float8 type and assumed to be equal to 1.0 by default.
 // If output matrix type is 16 or 32-bit type, scale_result is not applied.
@@ -1362,25 +1361,9 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
     else {
       TORCH_CHECK(b.dtype() == at::kFloat8_e4m3fn);
     }
-    // Until more than bf16 is supported
+    // Until more than bf16 is supported.
     TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16,
-         "hipblaslt rowwise _scaled_mm only supports BFloat16 output");
-  }
-  else if (scaling_choice == ScalingType::BlockWise) {
-#if ROCM_VERSION >= 70000
-    TORCH_CHECK(at::detail::getCUDAHooks().isGPUArch({"gfx950"}),
-               "Block-wise scaling for Float8_e8m0fnu is only supported on gfx950");
-
-    TORCH_CHECK(mat1.size(0) % 32 == 0 && mat1.size(1) % 32 == 0 &&
-               mat2.size(0) % 32 == 0 && mat2.size(1) % 32 == 0,
-               "Matrix dimensions must be multiples of 32 for block-wise scaling");
-
-    TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16 ||
-                out.scalar_type() == ScalarType::Half,
-                "Block-wise scaling only supports BFloat16 or Half output types");
-#else
-    TORCH_CHECK(false, "Block-wise scaling for Float8_e8m0fnu requires ROCm 7.0 or later");
-#endif
+         "hipblaslt rowwise _scaled_mm only supports BFloat16 output but got ", out.scalar_type());
   }
 #endif
 
diff --git a/aten/src/ATen/native/cuda/CUDALoops.cuh b/aten/src/ATen/native/cuda/CUDALoops.cuh
@@ -531,44 +531,6 @@ __global__ void elementwise_kernel(int N, func_t f) {
   }
 }
 
-#ifdef USE_ROCM
-template <int nt, int vt, typename func_t>
-C10_LAUNCH_BOUNDS_2(nt, 4)
-__global__ void elementwise_kernel_manual_unroll(int N, func_t f) {
-  int tid = threadIdx.x;
-  int nv = nt * vt;
-  int idx = nv * blockIdx.x + tid;
-  if ((idx + nt*(vt-1)) < N) {
-    f(idx, true);
-  } else {
-#pragma unroll
-    for (int i = 0; i < vt; i++) {
-      if (idx < N) {
-        f(idx, false);
-        idx += nt;
-      }
-    }
-  }
-}
-
-template <int nt, int vt, typename func_t>
-C10_LAUNCH_BOUNDS_2(nt, 4)
-__global__ void elementwise_kernel_strided(int N, func_t f) {
-  int tid = threadIdx.x;
-  int idx = nt * vt * blockIdx.x + tid;
-  int step = nt * vt * gridDim.x;
-  while (idx < N) {
-#pragma unroll
-    for (int i = 0; i < vt; i++) {
-      if ((idx + nt * i) < N) {
-        f(idx + nt * i);
-      }
-    }
-    idx += step;
-  }
-}
-#endif
-
 template <int nt, int vt, typename func_t>
 static void launch_legacy_kernel(int64_t N, const func_t& f) {
   TORCH_INTERNAL_ASSERT(N >= 0 && N <= std::numeric_limits<int32_t>::max());
diff --git a/aten/src/ATen/test/cuda_vectorized_test.cu b/aten/src/ATen/test/cuda_vectorized_test.cu
@@ -27,23 +27,6 @@ void reset_buffers() {
   }
 }
 
-#if defined(USE_ROCM) && !defined(_WIN32)
-TEST(TestLoops, HasSameArgTypes) {
-  // This is a compile-time unit test. If this file compiles without error,
-  // then the test passes and during runtime, we just need to return.
-  using namespace at::native::modern::detail;
-  using func1_t = int (*)(float, float);
-  using func2_t = int (*)(bool, float, float);
-  using func3_t = int (*)(float);
-  using func4_t = int (*)();
-  static_assert(has_same_arg_types<func1_t>::value, "func1_t has the same argument types");
-  static_assert(!has_same_arg_types<func2_t>::value, "func2_t does not have the same argument types");
-  static_assert(has_same_arg_types<func3_t>::value, "func3_t has the same argument types");
-  static_assert(has_same_arg_types<func4_t>::value, "func4_t has the same argument types");
-  return;
-}
-#endif
-
 TEST(TestVectorizedMemoryAccess, CanVectorizeUpTo) {
   char *ptr = reinterpret_cast<char *>(buffer1);
 
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
@@ -1159,14 +1159,6 @@ if(USE_DISTRIBUTED AND USE_TENSORPIPE)
       set(TP_USE_CUDA ON CACHE BOOL "" FORCE)
       set(TP_ENABLE_CUDA_IPC ON CACHE BOOL "" FORCE)
     endif()
-    if(USE_ROCM)
-      add_compile_options(-D__HIP_PLATFORM_AMD__=1)
-      set(TP_USE_ROCM ON CACHE BOOL "" FORCE)
-      set(TP_ENABLE_HIP_IPC OFF CACHE BOOL "" FORCE)
-      set(TP_ENABLE_HIP_XTH OFF CACHE BOOL "" FORCE)
-      set(TP_ENABLE_HIP_GDR OFF CACHE BOOL "" FORCE)
-      set(TP_ENABLE_IBV OFF CACHE BOOL "" FORCE)
-    endif()
     set(TP_BUILD_LIBUV ON CACHE BOOL "" FORCE)
     add_compile_options(-DTORCH_USE_LIBUV)
     include_directories(BEFORE SYSTEM ${CMAKE_CURRENT_LIST_DIR}/../third_party/tensorpipe/third_party/libuv/include)
@@ -1192,9 +1184,9 @@ if(USE_DISTRIBUTED AND USE_TENSORPIPE)
     if(USE_CUDA)
       list(APPEND Caffe2_CUDA_DEPENDENCY_LIBS tensorpipe_cuda)
     elseif(USE_ROCM)
-      message(WARNING "TensorPipe is supported on ROCm")
+      message(WARNING "TensorPipe doesn't yet support ROCm")
       # Not yet...
-      list(APPEND Caffe2_HIP_DEPENDENCY_LIBS tensorpipe_hip)
+      # list(APPEND Caffe2_HIP_DEPENDENCY_LIBS tensorpipe_hip)
     endif()
   endif()
 endif()
diff --git a/cmake/External/aotriton.cmake b/cmake/External/aotriton.cmake
@@ -81,7 +81,7 @@ if(NOT __AOTRITON_INCLUDED)
     list(GET __AOTRITON_MANYLINUX_LIST ${__AOTRITON_ROCM_INDEX} __AOTRITON_MANYLINUX)
     set(__AOTRITON_ARCH ${CMAKE_HOST_SYSTEM_PROCESSOR})
     string(CONCAT __AOTRITON_FILE "aotriton-"
-                                  "${__AOTRITON_VER_WITH_COMMIT}-${__AOTRITON_MANYLINUX}"
+                                  "${__AOTRITON_VER}-${__AOTRITON_MANYLINUX}"
                                   "_${__AOTRITON_ARCH}-rocm${__AOTRITON_ROCM}"
                                   "-shared.tar.${__AOTRITON_Z}")
     string(CONCAT __AOTRITON_URL "https://github.com/ROCm/aotriton/releases/download/"  # @lint-ignore
diff --git a/related_commits b/related_commits
@@ -1,12 +1,10 @@
-ubuntu|pytorch|apex|master|ca54c058f1094b3463788371325025be707a5982|https://github.com/ROCm/apex
-centos|pytorch|apex|master|ca54c058f1094b3463788371325025be707a5982|https://github.com/ROCm/apex
-ubuntu|pytorch|torchvision|main|f52c4f1afd7dec25cbe7b98bcf1cbc840298e8da|https://github.com/pytorch/vision
-centos|pytorch|torchvision|main|f52c4f1afd7dec25cbe7b98bcf1cbc840298e8da|https://github.com/pytorch/vision
-ubuntu|pytorch|torchtext|main|bde7ecdb6ba9179ccd30cde60a6550478d0a359f|https://github.com/pytorch/text
-centos|pytorch|torchtext|main|bde7ecdb6ba9179ccd30cde60a6550478d0a359f|https://github.com/pytorch/text
-ubuntu|pytorch|torchdata|main|922ac065407546b9cb4f629ab99f1fbf04d8fc12|https://github.com/pytorch/data
-centos|pytorch|torchdata|main|922ac065407546b9cb4f629ab99f1fbf04d8fc12|https://github.com/pytorch/data
-ubuntu|pytorch|torchaudio|main|bccaa454a54c3c648697cc2f46a4fb0500b1f01b|https://github.com/pytorch/audio
-centos|pytorch|torchaudio|main|bccaa454a54c3c648697cc2f46a4fb0500b1f01b|https://github.com/pytorch/audio
-ubuntu|pytorch|ao|main|a96eeb1c7d7ba24cf0ccfc105141729acfed22bf|https://github.com/pytorch/ao
-centos|pytorch|ao|main|a96eeb1c7d7ba24cf0ccfc105141729acfed22bf|https://github.com/pytorch/ao
+ubuntu|pytorch|apex|master|62c94ed1789bc177a83567985be6c1cb29b2d98c|https://github.com/ROCm/apex
+centos|pytorch|apex|master|62c94ed1789bc177a83567985be6c1cb29b2d98c|https://github.com/ROCm/apex
+ubuntu|pytorch|torchvision|main|98f8b3757c0648724064ca95434b18281c43c5f6|https://github.com/pytorch/vision
+centos|pytorch|torchvision|main|98f8b3757c0648724064ca95434b18281c43c5f6|https://github.com/pytorch/vision
+ubuntu|pytorch|torchdata|main|a05a54f797dd0f1a66610652a949fd47243ff952|https://github.com/pytorch/data
+centos|pytorch|torchdata|main|a05a54f797dd0f1a66610652a949fd47243ff952|https://github.com/pytorch/data
+ubuntu|pytorch|torchaudio|main|0c22347335f4c9a5b92a2f5bad65e05e2464c184|https://github.com/pytorch/audio
+centos|pytorch|torchaudio|main|0c22347335f4c9a5b92a2f5bad65e05e2464c184|https://github.com/pytorch/audio
+ubuntu|pytorch|ao|main|3b4bc9869d933927b2547d8231feab69789a80d4|https://github.com/pytorch/ao
+centos|pytorch|ao|main|3b4bc9869d933927b2547d8231feab69789a80d4|https://github.com/pytorch/ao
diff --git a/third_party/tensorpipe b/third_party/tensorpipe
@@ -1 +1 @@
-Subproject commit 135ba25f6be9991ebfe83d41d268d9c3d4cc5c5b
+Subproject commit dacda0567d9f23d4bc503e1c4f84aa65f33ac38a
diff --git a/torch/csrc/distributed/rpc/tensorpipe_cuda.cpp b/torch/csrc/distributed/rpc/tensorpipe_cuda.cpp
@@ -1,7 +1,7 @@
 #include <torch/csrc/distributed/rpc/tensorpipe_agent.h>
 #include <torch/csrc/distributed/rpc/tensorpipe_utils.h>
 
-#if defined(USE_TENSORPIPE)
+#if defined(USE_TENSORPIPE) && !defined(USE_ROCM)
 
 #include <c10/cuda/CUDACachingAllocator.h>
 #include <c10/cuda/CUDAGuard.h>
@@ -50,8 +50,6 @@ C10_REGISTER_CREATOR(TensorPipeChannelRegistry, cuda_gdr, makeCudaGdrChannel)
 
 #endif
 
-#if TENSORPIPE_HAS_CUDA_XTH_CHANNEL
-
 std::unique_ptr<ChannelRegistration> makeCudaXthChannel() {
   auto context = tensorpipe::channel::cuda_xth::create();
   return std::make_unique<ChannelRegistration>(
@@ -61,8 +59,6 @@ std::unique_ptr<ChannelRegistration> makeCudaXthChannel() {
 // The cuda_xth channel supports same-process GPU-to-GPU comm
 C10_REGISTER_CREATOR(TensorPipeChannelRegistry, cuda_xth, makeCudaXthChannel)
 
-#endif
-
 std::unique_ptr<ChannelRegistration> makeCudaBasicChannel() {
   auto context = tensorpipe::channel::cuda_basic::create(
       tensorpipe::channel::basic::create());
diff --git a/torch/headeronly/macros/Export.h b/torch/headeronly/macros/Export.h
@@ -100,10 +100,10 @@
 #define TORCH_API C10_IMPORT
 #endif
 
-// You may be wondering why we have TORCH_CUDA_CPP_API and TORCH_CUDA_CU_API
+// You may be wondering why we have TORCH_CUDA_CPP_API and TORCH_CUDA_API
 // belonging to the same library instead of just one TORCH_CUDA_API. Well, it
 // can indeed just be one TORCH_CUDA_API (and used to be)! TORCH_CUDA_CPP_API
-// and TORCH_CUDA_CU_API are artifacts of when we needed a split build to
+// and TORCH_CUDA_API are artifacts of when we needed a split build to
 // avoid relocation marker linking errors. The context is as follows:
 //
 // Once upon a time, there _was_ only TORCH_CUDA_API. All was happy until we
@@ -130,14 +130,6 @@
 #define TORCH_CUDA_CU_API C10_IMPORT
 #endif
 
-#if defined(TORCH_HIP_BUILD_MAIN_LIB)
-#define TORCH_HIP_CPP_API C10_EXPORT
-#define TORCH_HIP_API C10_EXPORT
-#else
-#define TORCH_HIP_CPP_API C10_IMPORT
-#define TORCH_HIP_API C10_IMPORT
-#endif
-
 #if defined(TORCH_XPU_BUILD_MAIN_LIB)
 #define TORCH_XPU_API C10_EXPORT
 #else
diff --git a/torch/testing/_internal/distributed/rpc/rpc_test.py b/torch/testing/_internal/distributed/rpc/rpc_test.py
@@ -37,7 +37,6 @@
     captured_output,
     skip_if_lt_x_gpu,
     tp_transports,
-    skip_if_rocm,
 )
 from torch.testing._internal.common_utils import (
     get_cycles_per_ms,

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-f7888497a1eb9e98d4c07537f0d0bcfe180d1363`
	`1`	`+711e2a92522e0a9921ce58ae658571ca55c49b97`