Commit e4e40e8

fast forward to v2.9 + optimizations
1 parent 2c1717c

File tree: 9 files changed, +329 -49 lines changed

examples/pytorch/comm_gemm_overlap/te_layer_with_overlap.py

Lines changed: 9 additions & 3 deletions
@@ -68,7 +68,7 @@ def _parse_args(argv=None, namespace=None):
     )
     parser.add_argument("--seed", type=int, default=1234, help="RNG seed.")
     parser.add_argument(
-        "--fp8", action="store_true", default=False, help="Enables the te.fp8_autocast() context."
+        "--fp8", action="store_true", default=False, help="Enables the te.autocast() context."
     )
     parser.add_argument(
         "--no-comm-overlap",
@@ -263,7 +263,13 @@ def dist_print(msg, end="\n", group=nccl_world, src=0, debug=False, error=False)
     te.module.base.initialize_ub(
         [batched_size, hidden_size],
         tp_size,
-        use_fp8=opts.fp8,
+        quantization_modes=[
+            (
+                te.module.base.UserBufferQuantizationMode.FP8
+                if opts.fp8
+                else te.module.base.UserBufferQuantizationMode.NONE
+            )
+        ],
         dtype=torch.bfloat16,
         bootstrap_backend=opts.bootstrap_backend,
     )
@@ -293,7 +299,7 @@ def dist_print(msg, end="\n", group=nccl_world, src=0, debug=False, error=False)
 
     dist_print(" |-- Forward pass", group=tp_group, debug=True)
     with torch.amp.autocast("cuda", dtype=torch.bfloat16):
-        with te.fp8_autocast(enabled=opts.fp8, fp8_recipe=fp8_recipe, fp8_group=nccl_world):
+        with te.autocast(enabled=opts.fp8, recipe=fp8_recipe, amax_reduction_group=nccl_world):
             y = model(x)
         if isinstance(y, tuple):
             out, *_ = y
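
Note: this example migration is the user-facing half of the v2.9 fast-forward. te.fp8_autocast(enabled=..., fp8_recipe=..., fp8_group=...) becomes te.autocast(enabled=..., recipe=..., amax_reduction_group=...), and initialize_ub() now takes a list of UserBufferQuantizationMode values instead of a use_fp8 flag. Below is a minimal sketch of the updated call pattern; it assumes the distributed setup of the full example (an initialized NCCL process group nccl_world, plus tp_size, batched_size, hidden_size, use_fp8, model, and x) and is a fragment of that flow, not a standalone script.

    import torch
    import transformer_engine.pytorch as te
    from transformer_engine.common.recipe import DelayedScaling

    fp8_recipe = DelayedScaling()

    # Old: te.module.base.initialize_ub(..., use_fp8=use_fp8, ...)
    # New: one quantization mode per userbuffer, FP8 or NONE.
    te.module.base.initialize_ub(
        [batched_size, hidden_size],
        tp_size,
        quantization_modes=[
            te.module.base.UserBufferQuantizationMode.FP8
            if use_fp8
            else te.module.base.UserBufferQuantizationMode.NONE
        ],
        dtype=torch.bfloat16,
    )

    # Old: te.fp8_autocast(enabled=use_fp8, fp8_recipe=fp8_recipe, fp8_group=nccl_world)
    with te.autocast(enabled=use_fp8, recipe=fp8_recipe, amax_reduction_group=nccl_world):
        y = model(x)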

transformer_engine/common/CMakeLists.txt

Lines changed: 7 additions & 10 deletions
@@ -162,7 +162,11 @@ list(APPEND transformer_engine_SOURCES
     fused_router/fused_topk_with_score_function.cu
     recipe/current_scaling.cu
     recipe/delayed_scaling.cu
-    recipe/fp8_block_scaling.cu)
+    recipe/fp8_block_scaling.cu
+    comm_gemm_overlap/userbuffers/ipcsocket.cc
+    comm_gemm_overlap/userbuffers/userbuffers-host.cpp
+    comm_gemm_overlap/userbuffers/userbuffers.cu
+    comm_gemm_overlap/comm_gemm_overlap.cpp)
 if(USE_CUDA)
 # Removed indent to minimize code diff with NV upstream
 # Files unique in cuda building
@@ -175,11 +179,7 @@ list(APPEND transformer_engine_SOURCES
     fused_attn/fused_attn_fp8.cu
     fused_attn/fused_attn.cpp
     fused_attn/utils.cu
-    util/cuda_nvml.cpp
-    comm_gemm_overlap/userbuffers/ipcsocket.cc
-    comm_gemm_overlap/userbuffers/userbuffers-host.cpp
-    comm_gemm_overlap/userbuffers/userbuffers.cu
-    comm_gemm_overlap/comm_gemm_overlap.cpp)
+    util/cuda_nvml.cpp)
 add_library(transformer_engine SHARED ${transformer_engine_SOURCES})
 else()
   list(APPEND transformer_engine_SOURCES
@@ -189,10 +189,7 @@ else()
     fused_attn_rocm/utils.cpp
     gemm/rocm_gemm.cu
     amd_detail/system.cpp
-    comm_gemm_overlap/userbuffers/ipcsocket.cc
-    comm_gemm_overlap/userbuffers/userbuffers-host.cpp
-    comm_gemm_overlap/userbuffers/userbuffers.cu
-    comm_gemm_overlap/comm_gemm_overlap.cpp)
+    comm_gemm_overlap/rocm_comm_gemm_overlap.cpp)
 
 # process source code files
 set(TE ${CMAKE_CURRENT_SOURCE_DIR}/../..)

transformer_engine/common/comm_gemm_overlap/comm_gemm_overlap.cpp

Lines changed: 20 additions & 4 deletions
@@ -21,6 +21,12 @@
 #define HALF_BYTES 2
 #define UB_MAX_SM 32
 
+#ifdef __HIP_PLATFORM_AMD__
+#define half_dtype hip_bfloat16
+#define __nv_fp8_e5m2 te_hip_fp8_e5m2
+#define __nv_fp8_e4m3 te_hip_fp8_e4m3
+#endif
+
 using namespace std::placeholders;
 
 namespace transformer_engine {
@@ -328,6 +334,7 @@ void CommOverlapBase::bulk_overlap(const TensorWrapper &A, bool transa, const Te
                                    bool accumulate, bool use_split_accumulator,
                                    CommOverlapType comm_type, TensorWrapper &rs_output,
                                    cudaStream_t stream_main) {
+  printf("bulk_overlap\n");
   int ori_sms = _ub_comm->sms;
   _ub_comm->use_ce = _use_ce;
   _ub_comm->sms = _num_comm_sm;
@@ -353,7 +360,7 @@
     char *rs_output_ptr = reinterpret_cast<char *>(rs_output.dptr());
     reducescatter2_userbuff_fp8<__nv_fp8_e5m2>(rs_output_ptr, _ubuf.scale_inv(), _ub_reg, 0,
                                                comm_elements, _ub_comm, _stream_comm,
-                                               (cudaEvent_t)_comm_launch_event);
+                                               (cudaEvent_t)_comm_launch_event);
   } else {
     reducescatter2_userbuff_inplace(_ub_reg, 0, comm_elements, _ub_comm, _stream_comm,
                                     (cudaEvent_t)_comm_launch_event);
@@ -385,6 +392,7 @@ void CommOverlapBase::atomic_gemm_overlap_rs(const TensorWrapper &A, bool transa
                                              TensorWrapper &workspace, bool grad, bool accumulate,
                                              bool use_split_accumulator, TensorWrapper &rs_output,
                                              cudaStream_t stream_main) {
+  printf("atomic_gemm_overlap_rs\n");
   int ori_sms = _ub_comm->sms;
   _ub_comm->use_ce = _use_ce;
   _ub_comm->sms = _num_comm_sm;
@@ -481,6 +489,7 @@ void CommOverlapBase::split_overlap_rs(const TensorWrapper &A, bool transa, cons
                                        TensorWrapper &pre_gelu_out, TensorWrapper &workspace,
                                        bool grad, bool accumulate, bool use_split_accumulator,
                                        TensorWrapper &rs_output, cudaStream_t stream_main) {
+  printf("split_overlap_rs\n");
   // Get GEMM dimensions
   int ori_sms = _ub_comm->sms;
   _ub_comm->use_ce = _use_ce;
@@ -619,6 +628,8 @@ void CommOverlapBase::split_overlap_rs(const TensorWrapper &A, bool transa, cons
 
 void CommOverlapBase::bulk_overlap_external_ag(cudaStream_t send_stream, cudaStream_t recv_stream,
                                                cudaStream_t stream_main) {
+  printf("bulk_overlap_external_ag\n");
+
   int comm_bytes = _ubuf.bytes();
   int comm_bytes_per_rank = comm_bytes / _tp_size;
 
@@ -651,19 +662,20 @@ CommOverlapP2PBase::CommOverlapP2PBase(const std::vector<size_t> &buffer_shape,
                                        CommOverlapType comm_type, int num_max_streams,
                                        int comm_cga_size, int gemm_priority, int comm_priority,
                                        int num_comm_sm, bool set_sm_margin, bool use_ce,
-                                       bool atomic_gemm, bool aggregate)
+                                       bool atomic_gemm, bool aggregate, bool use_rd)
     : CommOverlapCore(myrank, numranks, mylocal, numlocal, mynode, numnodes, tp_size,
                       allgather_handle, barrier_handle, tp_size, num_max_streams, comm_cga_size,
                       gemm_priority, comm_priority, num_comm_sm, set_sm_margin, use_ce,
                       atomic_gemm) {
-  initialize(buffer_shape, buffer_dtype, comm_type, aggregate);
+  initialize(buffer_shape, buffer_dtype, comm_type, aggregate, use_rd);
 }
 
 void CommOverlapP2PBase::initialize(const std::vector<size_t> &buffer_shape, DType buffer_dtype,
-                                    CommOverlapType comm_type, bool aggregate) {
+                                    CommOverlapType comm_type, bool aggregate, bool use_rd) {
   _is_p2p = true;
   _is_reduce_scatter = comm_type == CommOverlapType::RS;
   _aggregate = aggregate;
+  _use_rd = use_rd;
 
   // Create workspace tensor with userbuffer
   NVTE_CHECK(buffer_shape.size() == 2, "Userbuffer shape must be 2-dimensional!");
@@ -788,6 +800,7 @@ void CommOverlapP2PBase::atomic_gemm_overlap_ag(
     const TensorWrapper &A, bool transa, const TensorWrapper &B, bool transb, TensorWrapper &D,
     TensorWrapper &bias, TensorWrapper &pre_gelu_out, TensorWrapper &workspace, bool grad,
     bool accumulate, bool use_split_accumulator, TensorWrapper &B_copy, cudaStream_t stream_main) {
+  printf("atomic_gemm_overlap_ag\n");
   int ori_sms = _ub_comm->sms;
   _ub_comm->use_ce = _use_ce;
   _ub_comm->sms = _num_comm_sm;
@@ -890,6 +903,7 @@ void CommOverlapP2PBase::split_overlap_ag(const TensorWrapper &A, bool transa,
                                           TensorWrapper &workspace, bool grad, bool accumulate,
                                           bool use_split_accumulator, TensorWrapper &B_copy,
                                           cudaStream_t stream_main) {
+  printf("split_overlap_ag\n");
   int ori_sms = _ub_comm->sms;
   _ub_comm->use_ce = _use_ce;
   _ub_comm->sms = _num_comm_sm;
@@ -1057,6 +1071,7 @@ void CommOverlapP2PBase::atomic_gemm_overlap_rs(
     TensorWrapper &bias, TensorWrapper &pre_gelu_out, TensorWrapper &workspace, bool grad,
     bool accumulate, bool use_split_accumulator, TensorWrapper &rs_output,
     cudaStream_t stream_main) {
+  printf("atomic_gemm_overlap_rs\n");
   int ori_sms = _ub_comm->sms;
   _ub_comm->use_ce = _use_ce;
   _ub_comm->sms = _num_comm_sm;
@@ -1121,6 +1136,7 @@ void CommOverlapP2PBase::split_overlap_rs(const TensorWrapper &A, bool transa,
                                           TensorWrapper &workspace, bool grad, bool accumulate,
                                           bool use_split_accumulator, TensorWrapper &rs_output,
                                           cudaStream_t stream_main) {
+  printf("split_overlap_rs\n");
   int ori_sms = _ub_comm->sms;
   _ub_comm->use_ce = _use_ce;
   _ub_comm->sms = _num_comm_sm;
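
The functional change in this file, alongside printf tracing in each overlap entry point, is the new use_rd flag threaded through the CommOverlapP2PBase constructor into initialize() and stored as _use_rd. The code that consumes _use_rd is not part of this excerpt, so the sketch below is a hypothetical Python model of the presumed selection between the existing ring pipeline and the recursive-doubling all-gather added in rocm_comm_gemm_overlap.cpp; the class and method names here are illustrative only.

    class CommOverlapP2P:
        """Hypothetical model of the C++ flow above; the real consumer of
        _use_rd is not shown in this commit excerpt."""

        def __init__(self, aggregate: bool = False, use_rd: bool = False):
            self._aggregate = aggregate
            self._use_rd = use_rd  # mirrors the new member set in initialize()

        def split_overlap_ag(self) -> str:
            # Presumed routing: recursive doubling (log2(tp) steps) when
            # use_rd is set, otherwise the ring pipeline (tp - 1 steps).
            return "rocm_split_overlap_ag_rd" if self._use_rd else "split_overlap_ag"

    assert CommOverlapP2P(use_rd=True).split_overlap_ag() == "rocm_split_overlap_ag_rd"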

transformer_engine/common/comm_gemm_overlap/rocm_comm_gemm_overlap.cpp

Lines changed: 142 additions & 0 deletions
@@ -0,0 +1,142 @@
+/*************************************************************************
+ * Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * License for AMD contributions = MIT. See LICENSE for more information
+ ************************************************************************/
+
+#include <transformer_engine/comm_gemm_overlap.h>
+#include <transformer_engine/gemm.h>
+#include <transformer_engine/transformer_engine.h>
+
+#include "common/common.h"
+#include "common/util/cuda_driver.h"
+#include "common/util/cuda_runtime.h"
+#include "common/util/logging.h"
+#include "common/util/system.h"
+#include "userbuffers/userbuffers.h"
+
+namespace transformer_engine {
+
+void CommOverlapP2PBase::rocm_split_overlap_ag_rd(const TensorWrapper &A, bool transa, const TensorWrapper &B,
+                                                  bool transb, TensorWrapper &D, TensorWrapper &bias,
+                                                  TensorWrapper &pre_gelu_out, TensorWrapper &workspace, bool grad,
+                                                  bool accumulate, bool use_split_accumulator, TensorWrapper &B_copy,
+                                                  cudaStream_t stream_main) {
+  printf("rocm_split_overlap_ag_rd\n");
+  int ori_sms = _ub_comm->sms;
+  _ub_comm->use_ce = _use_ce;
+  _ub_comm->sms = _num_comm_sm;
+  _ub_comm->cga_size = _cga_size;
+  // Get GEMM dimensions between TN and NN input layouts
+  const size_t m = (transa) ? A.size(0) : A.size(1);
+  const size_t k = (transa) ? A.size(1) : A.size(0);
+  const size_t n_chunk = _ubufs[0].size(0);
+  const int comm_bytes = _ubufs[0].bytes();
+  const bool do_gelu = pre_gelu_out.numel() > 0;
+  const size_t workspace_size_chunk = workspace.numel() / _stream_compute.size();
+
+  // Check B copy sizing
+  if (B_copy.numel() > 0) {
+    NVTE_CHECK(B_copy.numel() == _ubuf.numel(), "Expected all-gathered B copy buffer with ",
+               _ubuf.numel(), " elements but got ", B_copy.numel());
+    NVTE_CHECK(B_copy.element_size() == _ubuf.element_size(),
+               "Expected all-gathered B copy buffer with ", _ubuf.element_size() * 8,
+               "-bit data type but got ", B_copy.element_size() * 8, "-bit");
+  }
+
+  NVTE_CHECK_CUDA(cudaEventRecord(_start_compute, stream_main));
+  NVTE_CHECK_CUDA(cudaStreamWaitEvent(_stream_send[0], _start_compute, 0));
+  NVTE_CHECK_CUDA(cudaStreamWaitEvent(_stream_recv, _start_compute, 0));
+  for (size_t i = 0; i < _stream_compute.size(); i++) {
+    NVTE_CHECK_CUDA(cudaStreamWaitEvent(_stream_compute[i], _start_compute, 0));
+  }
+
+  int steps = 31 - __builtin_clz(_tp_size);
+
+  // Chunk dims
+  std::vector<size_t> input_b_chunk_shape =
+      (transb ? std::vector<size_t>{k, n_chunk} : std::vector<size_t>{n_chunk, k});
+  std::vector<size_t> output_chunk_shape = {n_chunk, m};
+  size_t input_b_chunk_size = n_chunk * k;
+  size_t output_chunk_size = n_chunk * m;
+
+  // GEMM
+  auto input_b_chunk =
+      get_buffer_chunk_like(B, input_b_chunk_size * _tp_id, input_b_chunk_shape);
+  auto output_chunk =
+      get_tensor_chunk(D, output_chunk_size * _tp_id, output_chunk_shape);
+  auto aux_chunk =
+      (do_gelu)
+          ? get_tensor_chunk(pre_gelu_out, output_chunk_size * _tp_id, {n_chunk, k})
+          : TensorWrapper(nullptr, std::vector<size_t>{0}, pre_gelu_out.dtype());
+  auto workspace_chunk = get_tensor_chunk(
+      workspace, (_tp_id % _stream_compute.size()) * workspace_size_chunk, {workspace_size_chunk});
+
+  nvte_cublas_gemm(A.data(), input_b_chunk.data(), output_chunk.data(), bias.data(),
+                   aux_chunk.data(), transa, transb, grad, workspace_chunk.data(), accumulate,
+                   use_split_accumulator, _math_sms,
+                   _stream_compute[_tp_id % _stream_compute.size()]);
+
+  std::vector<size_t> owned_chunks;
+  owned_chunks.reserve(_tp_size);
+  owned_chunks.push_back(_tp_id);
+  size_t offset = 1;
+
+  for (int step = 0; step < steps; step++) {
+    int send_rank = (_tp_id + offset) % _tp_size;
+    int recv_rank = (_tp_id - offset + _tp_size) % _tp_size;
+
+    for (int i = 0; i < owned_chunks.size(); i++) {
+      size_t send_offset = owned_chunks[i] * comm_bytes;
+      userbuffers_send(_ub_reg, send_offset, _ub_reg, send_offset,
+                       comm_bytes, _ub_comm, send_rank, _stream_send[i % _stream_send.size()]);
+    }
+
+    std::vector<size_t> new_chunks;
+    for (size_t i = 0; i < owned_chunks.size(); i++) {
+      size_t new_chunk_id = (recv_rank + i * offset) % _tp_size;
+      if (new_chunk_id >= _tp_size ||
+          std::find(owned_chunks.begin(), owned_chunks.end(), new_chunk_id) != owned_chunks.end()) continue;
+      size_t recv_offset = new_chunk_id * comm_bytes;
+      size_t stream_id = new_chunks.size() % _stream_compute.size();
+
+      userbuffers_recv(_ub_reg, recv_offset, _ub_reg, recv_offset,
+                       comm_bytes, _ub_comm, recv_rank, _stream_recv);
+
+      NVTE_CHECK_CUDA(cudaEventRecord(_stop_recv, _stream_recv));
+      NVTE_CHECK_CUDA(cudaStreamWaitEvent(_stream_compute[stream_id], _stop_recv, 0));
+
+      auto input_b_chunk = get_buffer_chunk_like(B, input_b_chunk_size * new_chunk_id, input_b_chunk_shape);
+      output_chunk = get_tensor_chunk(D, output_chunk_size * new_chunk_id, output_chunk_shape);
+      aux_chunk = (do_gelu) ? get_tensor_chunk(pre_gelu_out, output_chunk_size * new_chunk_id, {n_chunk, k})
+                            : TensorWrapper(nullptr, std::vector<size_t>{0}, pre_gelu_out.dtype());
+      workspace_chunk = get_tensor_chunk(workspace, stream_id * workspace_size_chunk, {workspace_size_chunk});
+
+      nvte_cublas_gemm(A.data(), input_b_chunk.data(), output_chunk.data(), bias.data(),
+                       aux_chunk.data(), transa, transb, grad, workspace_chunk.data(), accumulate,
+                       use_split_accumulator, _math_sms,
+                       _stream_compute[stream_id]);
+
+      new_chunks.push_back(new_chunk_id);
+    }
+    owned_chunks.insert(owned_chunks.end(), new_chunks.begin(), new_chunks.end());
+    offset <<= 1;
+  }
+
+  if (B_copy.numel() > 0) {
+    NVTE_CHECK_CUDA(cudaMemcpyAsync(B_copy.dptr(), _ubuf.dptr(), _ubuf.bytes(),
+                                    cudaMemcpyDeviceToDevice, _stream_send[0]));
+  }
+
+  _ub_comm->sms = ori_sms;
+  for (size_t i = 0; i < _stream_compute.size(); i++) {
+    NVTE_CHECK_CUDA(cudaEventRecord(_stop_compute, _stream_compute[i]));
+    NVTE_CHECK_CUDA(cudaStreamWaitEvent(stream_main, _stop_compute, 0));
+  }
+  NVTE_CHECK_CUDA(cudaEventRecord(_stop_send, _stream_send[0]));
+  NVTE_CHECK_CUDA(cudaStreamWaitEvent(stream_main, _stop_send, 0));
+  NVTE_CHECK_CUDA(cudaEventRecord(_stop_recv, _stream_recv));
+  NVTE_CHECK_CUDA(cudaStreamWaitEvent(stream_main, _stop_recv, 0));
+}  // rocm_split_overlap_ag_rd
+
+}  // namespace transformer_engine
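
The new path implements a recursive-doubling all-gather overlapped with GEMM: steps = 31 - __builtin_clz(_tp_size) is floor(log2(tp_size)), and in step s (offset = 2^s) each rank sends every chunk it owns to (rank + offset) % tp_size while receiving its peer's whole block from (rank - offset) % tp_size, doubling its owned set each step; each received chunk's GEMM is launched on a round-robin compute stream as soon as the corresponding recv event fires, so compute overlaps the remaining exchanges. The result is log2(tp_size) exchange steps instead of the ring pipeline's tp_size - 1. Below is a minimal Python model of that schedule (ownership bookkeeping only, not a line-for-line port of the kernel launches above); it assumes a power-of-two tp_size and uses the fact that a rank's owned set is always the contiguous block of chunk ids behind it, which is what the recursion produces.

    def ag_recursive_doubling(tp_size: int, tp_id: int) -> list[int]:
        # steps == floor(log2(tp_size)), matching 31 - __builtin_clz(_tp_size)
        steps = tp_size.bit_length() - 1
        owned = [tp_id]                  # chunk ids held by this rank
        offset = 1
        for _ in range(steps):
            # send everything we own to (tp_id + offset) % tp_size (not modeled)
            # and receive the peer's whole block from (tp_id - offset) % tp_size
            recv_rank = (tp_id - offset) % tp_size
            owned += [
                (recv_rank - i) % tp_size
                for i in range(offset)
                if (recv_rank - i) % tp_size not in owned
            ]
            offset <<= 1
        return owned

    for tp in (2, 4, 8, 16):
        for rank in range(tp):
            assert sorted(ag_recursive_doubling(tp, rank)) == list(range(tp))
    print("every rank holds all chunks after log2(tp_size) steps")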

transformer_engine/common/comm_gemm_overlap/userbuffers/userbuffers-host.cpp

Lines changed: 4 additions & 1 deletion
@@ -375,8 +375,11 @@ int create_communicator_grouped2(communicator **comm, int myrank, int numranks,
       cudaMalloc(reinterpret_cast<void **>(&(*comm)->flags_baseptr), 2 * GPU_PAGE_SIZE));
   NVTE_CHECK_CUDA(cudaMemset((*comm)->flags_baseptr, 0, 2 * GPU_PAGE_SIZE));
   (*comm)->flags = reinterpret_cast<int *>(
+#ifdef __HIP_PLATFORM_AMD__
+      (reinterpret_cast<uintptr_t>((*comm)->flags) + GPU_PAGE_SIZE - 1) & GPU_PAGE_MASK);
+#else
       ((CUdeviceptr)(*comm)->flags_baseptr + GPU_PAGE_SIZE - 1) & GPU_PAGE_MASK);
-
+#endif
   using namespace std;
 
   sched_param param;
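
The ROCm branch swaps the CUdeviceptr cast for plain uintptr_t arithmetic but keeps the same align-up-to-page idiom: (addr + GPU_PAGE_SIZE - 1) & GPU_PAGE_MASK rounds an address up to the next GPU page boundary. A quick sketch of that idiom, assuming GPU_PAGE_MASK is the bitwise complement of GPU_PAGE_SIZE - 1 (the constants here are illustrative, not taken from the userbuffers headers):

    GPU_PAGE_SIZE = 1 << 16
    GPU_PAGE_MASK = ~(GPU_PAGE_SIZE - 1)

    def align_up(addr: int) -> int:
        # Rounds addr up to the next GPU page boundary;
        # already-aligned addresses are unchanged.
        return (addr + GPU_PAGE_SIZE - 1) & GPU_PAGE_MASK

    assert align_up(0) == 0
    assert align_up(1) == GPU_PAGE_SIZE
    assert align_up(GPU_PAGE_SIZE) == GPU_PAGE_SIZE
    assert align_up(GPU_PAGE_SIZE + 1) == 2 * GPU_PAGE_SIZE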
