
Commit 1bfe95d

Liu Ke authored and meta-codesync[bot] committed
integrate DDA AllToAll to rcclx
Summary:
- Integrate DDA AllToAll into RCCLX
- Compare rccl-tests performance of DDA vs. RCCLX on MI300x and MI350x
- Tune DdaAllToAllMaxBytes based on MI300x perf

Reviewed By: cenzhaometa

Differential Revision: D86030871

fbshipit-source-id: 3ad5903e22b028eeb2dddf56ae8104e243d2a066
1 parent 6ed088b commit 1bfe95d

File tree

9 files changed: +379 −5 lines changed

comms/common/algorithms/AlgoFactory.cu

Lines changed: 15 additions & 3 deletions

@@ -14,11 +14,12 @@ AlgoFactory::AlgoFactory(
     int maxBlocks,
     const AllReduceOptions& allReduceOpts,
     const AllGatherOptions& allGatherOpts,
-    const ReduceScatterOptions& reduceScatterOpts) {
+    const ReduceScatterOptions& reduceScatterOpts,
+    const AllToAllOptions& allToAllOpts) {
   if (allReduceOpts.enableDda || allGatherOpts.enableDda ||
-      reduceScatterOpts.enableDda) {
+      reduceScatterOpts.enableDda || allToAllOpts.enableDda) {
     XLOG(DBG)
-        << "Initializing AllReduceAlgoManager / AllGatherAlgoManager / ReduceScatterAlgoManager";
+        << "Initializing AllReduceAlgoManager / AllGatherAlgoManager / ReduceScatterAlgoManager / AllToAllAlgoManager";
 
     for (int i = 0; i < nRanks; ++i) {
       if (i == selfRank) {
@@ -64,6 +65,17 @@ AlgoFactory::AlgoFactory(
         reduceScatterOpts.ddaMaxThresholdBytes);
     XLOG(DBG) << "Successfully initialized ReduceScatterAlgoManager";
   }
+
+  if (allToAllOpts.enableDda) {
+    allToAllMgr_ = std::make_unique<AllToAllAlgoManager>(
+        bootstrap,
+        nRanks,
+        selfRank,
+        maxBlocks,
+        allToAllOpts.ddaSendbufSizeBytes,
+        allToAllOpts.ddaMaxThresholdBytes);
+    XLOG(DBG) << "Successfully initialized AllToAllAlgoManager";
+  }
 }
 
 } // namespace meta::comms
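The new AllToAllOptions plumb through the factory the same way as the existing collectives. A minimal wiring sketch, assuming a caller that already holds the bootstrap and the other option structs; the byte values are illustrative only (the summary says DdaAllToAllMaxBytes was tuned on MI300x, but the tuned value is not part of this hunk):

  // Illustrative values; real ones come from the tuned configuration.
  AlgoFactory::AllToAllOptions allToAllOpts;
  allToAllOpts.enableDda = true;
  allToAllOpts.ddaSendbufSizeBytes = 1 << 20;   // example staging capacity
  allToAllOpts.ddaMaxThresholdBytes = 32 << 10; // example DDA cutoff

  auto factory = std::make_unique<AlgoFactory>(
      bootstrap, nRanks, selfRank, maxBlocks,
      allReduceOpts, allGatherOpts, reduceScatterOpts, allToAllOpts);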

comms/common/algorithms/AlgoFactory.cuh

Lines changed: 26 additions & 1 deletion

@@ -4,6 +4,7 @@
 
 #include "comms/common/algorithms/all_gather/AllGatherAlgoManager.h"
 #include "comms/common/algorithms/all_reduce/AllReduceAlgoManager.h"
+#include "comms/common/algorithms/all_to_all/AllToAllAlgoManager.h"
 #include "comms/common/algorithms/reduce_scatter/ReduceScatterAlgoManager.h"
 #include "comms/ctran/interfaces/IBootstrap.h" // @manual
 #include "comms/utils/commSpecs.h"
@@ -14,6 +15,7 @@ namespace meta::comms {
 class AlgoManagerAllReduce;
 class AlgoManagerAllGather;
 class AlgoManagerReduceScatter;
+class AlgoManagerAllToAll;
 
 /**
  * per communicator per rank Algorithm factory that
@@ -46,14 +48,22 @@ class AlgoFactory {
     // DDA will be used
     int ddaMaxThresholdBytes{0};
   };
+  struct AllToAllOptions {
+    bool enableDda{false};
+    int ddaSendbufSizeBytes{0};
+    // If msg size is not larger than the threshold,
+    // DDA will be used
+    int ddaMaxThresholdBytes{0};
+  };
   AlgoFactory(
       std::shared_ptr<ctran::bootstrap::IBootstrap> bootstrap,
       int nRanks,
       int selfRank,
       int maxBlocks,
       const AllReduceOptions& allReduceOpts,
       const AllGatherOptions& allGatherOpts,
-      const ReduceScatterOptions& reduceScatterOpts);
+      const ReduceScatterOptions& reduceScatterOpts,
+      const AllToAllOptions& allToAllOpts);
 
   std::unique_ptr<AlgoAllReduce> getAllReduceAlgo(
       const void* sendbuff,
@@ -97,9 +107,24 @@
         sendbuff, recvbuff, count, datatype, stream, acc);
   }
 
+  std::unique_ptr<AlgoAllToAll> getAllToAllAlgo(
+      const void* sendbuff,
+      void* recvbuff,
+      size_t count,
+      commDataType_t datatype,
+      cudaStream_t stream,
+      const void* acc = nullptr) {
+    if (allToAllMgr_ == nullptr) {
+      return nullptr;
+    }
+    return allToAllMgr_->getAllToAllAlgo(
+        sendbuff, recvbuff, count, datatype, stream, acc);
+  }
+
  private:
  std::unique_ptr<AllReduceAlgoManager> allReduceMgr_{nullptr};
  std::unique_ptr<AllGatherAlgoManager> allGatherMgr_{nullptr};
  std::unique_ptr<ReduceScatterAlgoManager> reduceScatterMgr_{nullptr};
+  std::unique_ptr<AllToAllAlgoManager> allToAllMgr_{nullptr};
 };
 } // namespace meta::comms
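getAllToAllAlgo() mirrors the other getters: it returns nullptr when the manager was never created (DDA disabled) or when the manager rejects the message, so callers keep a fallback path. A hedged usage sketch; fallbackAllToAll is a hypothetical stand-in for the existing RCCLX all-to-all path:

  if (auto algo = factory->getAllToAllAlgo(
          sendbuff, recvbuff, count, commBfloat16, stream)) {
    algo->allToAll(); // DDA fast path
  } else {
    // hypothetical baseline path, not part of this commit
    fallbackAllToAll(sendbuff, recvbuff, count, commBfloat16, stream);
  }
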
comms/common/algorithms/all_to_all/AlgoAllToAll.cu

Lines changed: 36 additions & 0 deletions

@@ -0,0 +1,36 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#include "comms/common/algorithms/all_to_all/AlgoAllToAll.cuh"
+#include "comms/utils/checks.h"
+
+namespace meta::comms {
+
+AlgoAllToAll::AlgoAllToAll(
+    const void* sendbuff,
+    void** allRankDdaSendbuffs,
+    void* recvbuff,
+    size_t count,
+    commDataType_t datatype,
+    cudaStream_t stream,
+    int nRanks,
+    int selfRank,
+    int maxBlocks,
+    IpcGpuBarrier* barrier,
+    const void* acc)
+    : sendbuff_(sendbuff),
+      allRankDdaSendbuffs_(allRankDdaSendbuffs),
+      recvbuff_(recvbuff),
+      count_(count),
+      datatype_(datatype),
+      stream_(stream),
+      nRanks_(nRanks),
+      selfRank_(selfRank),
+      maxBlocks_(maxBlocks),
+      barrier_(barrier),
+      acc_(acc) {}
+
+void AlgoAllToAllDdaIpc::allToAll() {
+  TYPED_CALL(datatype_, launchKernel);
+}
+
+} // namespace meta::comms
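TYPED_CALL comes from comms/utils/checks.h and its definition is not shown in this diff. Given that the manager only admits bf16 and half (see AllToAllAlgoManager.cu below), its expansion here is presumably equivalent to a datatype switch over the templated launchKernel; a sketch of that assumed dispatch, using the CUDA types from <cuda_bf16.h> and <cuda_fp16.h>:

  // Assumed expansion of TYPED_CALL(datatype_, launchKernel); not the
  // actual macro body.
  switch (datatype_) {
    case commBfloat16:
      launchKernel<__nv_bfloat16>();
      break;
    case commFloat16:
      launchKernel<half>();
      break;
    default:
      // unsupported dtypes never reach here; the manager filters them out
      break;
  }
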
comms/common/algorithms/all_to_all/AlgoAllToAll.cuh

Lines changed: 83 additions & 0 deletions

@@ -0,0 +1,83 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#pragma once
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "comms/common/IpcGpuBarrier.cuh"
+#include "comms/common/algorithms/AlgoUtils.h"
+#include "comms/common/algorithms/all_to_all/all_to_all_dda.cuh"
+#include "comms/utils/checks.h"
+#include "comms/utils/commSpecs.h"
+
+namespace meta::comms {
+
+/**
+ * This class defines the common interface for all AllToAll algorithms;
+ * subclasses are expected to provide the actual implementation.
+ */
+class AlgoAllToAll {
+ public:
+  // NOTE: acc is not used for all-to-all
+  AlgoAllToAll(
+      const void* sendbuff,
+      void** allRankDdaSendbuffs,
+      void* recvbuff,
+      size_t count,
+      commDataType_t datatype,
+      cudaStream_t stream,
+      int nRanks,
+      int selfRank,
+      int maxBlocks,
+      IpcGpuBarrier* barrier,
+      const void* acc);
+
+  virtual ~AlgoAllToAll() = default;
+
+  virtual void allToAll() = 0;
+
+ protected:
+  const void* sendbuff_{nullptr};
+  void** allRankDdaSendbuffs_{nullptr};
+  void* recvbuff_{nullptr};
+  size_t count_{0};
+  commDataType_t datatype_{commBfloat16};
+  cudaStream_t stream_{nullptr};
+  int nRanks_{0};
+  int selfRank_{0};
+  const size_t maxBlocks_{0};
+  IpcGpuBarrier* barrier_;
+  const void* acc_{nullptr};
+};
+
+class AlgoAllToAllDdaIpc : public AlgoAllToAll {
+ public:
+  using AlgoAllToAll::AlgoAllToAll;
+
+  void allToAll() override;
+
+ private:
+  template <typename T>
+  void launchKernel() {
+    const void* func = nullptr;
+
+    ASSIGN_FUNC_NRANKS(func, ddaAllToAllIpc, nRanks_, false /* hasAcc */);
+
+    auto gridBlock =
+        getGridAndBlockDims(nRanks_ * count_, datatype_, maxBlocks_);
+    const auto& grid = gridBlock.first;
+    const auto& block = gridBlock.second;
+
+    void* args[] = {
+        &allRankDdaSendbuffs_,
+        &recvbuff_,
+        &count_,
+        &sendbuff_,
+        &selfRank_,
+        barrier_,
+        &acc_};
+    CUDA_CHECK(cudaLaunchKernel(func, grid, block, args, 0, stream_));
+  }
+};
+
+} // namespace meta::comms
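The ddaAllToAllIpc kernel itself lives in all_to_all_dda.cuh and is not part of this diff. A schematic of the exchange it plausibly implements, with the staging of sendbuff and the IpcGpuBarrier synchronization elided: every rank writes its data into its own IPC-mapped staging buffer, all ranks barrier, then rank r gathers slice r from each peer's staging buffer directly over IPC. The kernel below is a hypothetical sketch only, not the shipped implementation:

  // Hypothetical gather phase of a DDA IPC all-to-all; the real kernel
  // also stages sendbuff and barriers between the stage and gather phases.
  template <typename T>
  __global__ void ddaAllToAllIpcSketch(
      T** allRankDdaSendbuffs, // every rank's IPC-mapped staging buffer
      T* recvbuff,
      size_t count, // elements exchanged per rank pair
      int nRanks,
      int selfRank) {
    for (int peer = 0; peer < nRanks; ++peer) {
      // slice selfRank of peer's staging buffer is addressed to this rank
      const T* src = allRankDdaSendbuffs[peer] + selfRank * count;
      T* dst = recvbuff + peer * count;
      for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < count;
           i += size_t(gridDim.x) * blockDim.x) {
        dst[i] = src[i];
      }
    }
  }
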
comms/common/algorithms/all_to_all/AllToAllAlgoManager.cu

Lines changed: 103 additions & 0 deletions

@@ -0,0 +1,103 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#include "comms/common/algorithms/all_to_all/AllToAllAlgoManager.h"
+
+namespace meta::comms {
+
+AllToAllAlgoManager::AllToAllAlgoManager(
+    std::shared_ptr<ctran::bootstrap::IBootstrap> bootstrap,
+    int nRanks,
+    int selfRank,
+    int maxBlocks,
+    int ddaSendbufSizeBytes,
+    int ddaMaxThresholdBytes)
+    : nRanks_(nRanks),
+      selfRank_(selfRank),
+      maxBlocks_(maxBlocks),
+      ddaSendbufSizeBytes_(ddaSendbufSizeBytes),
+      ddaMaxThresholdBytes_(ddaMaxThresholdBytes) {
+  auto [barrierResources, barrier] =
+      IpcGpuBarrier::mallocAndInit(nRanks_, maxBlocks_, selfRank_, bootstrap);
+  barrierResources_ = std::move(barrierResources);
+  barrier_ = barrier;
+
+  ddaSendbuf_ = std::make_unique<DeviceBuffer>(ddaSendbufSizeBytes_ * nRanks_);
+  memHandler_ = std::make_unique<IpcMemHandler>(bootstrap, selfRank_, nRanks_);
+  memHandler_->addSelfDeviceMemPtr(ddaSendbuf_->get());
+  memHandler_->exchangeMemPtrs();
+
+  std::vector<void*> ipcSendbufs(nRanks_);
+  for (int i = 0; i < nRanks_; ++i) {
+    ipcSendbufs[i] = memHandler_->getPeerDeviceMemPtr(i);
+  }
+
+  allRankDdaSendbuffs_ =
+      std::make_unique<DeviceBuffer>(sizeof(void*) * nRanks_);
+  CUDA_CHECK(cudaMemcpy(
+      allRankDdaSendbuffs_->get(),
+      ipcSendbufs.data(),
+      sizeof(void*) * nRanks_,
+      cudaMemcpyDefault));
+  XLOG(DBG) << "Successfully initialized AllToAllAlgoManager";
+}
+
+std::unique_ptr<AlgoAllToAll> AllToAllAlgoManager::getAllToAllAlgo(
+    const void* sendbuff,
+    void* recvbuff,
+    size_t count,
+    commDataType_t datatype,
+    cudaStream_t stream,
+    const void* acc) {
+  if ((count * commTypeSize(datatype)) > ddaSendbufSizeBytes_) {
+    // msg size must fit into the dda sendbuf
+    XLOG(DBG) << "Not using custom all-to-all algo because message size "
+              << count * commTypeSize(datatype)
+              << " is larger than ddaSendbufSizeBytes " << ddaSendbufSizeBytes_;
+    return nullptr;
+  }
+
+  if (((uintptr_t)sendbuff % 16) || ((uintptr_t)recvbuff % 16) ||
+      ((count * commTypeSize(datatype)) % 16)) {
+    // 16-byte alignment as we do 16-byte loads in the DDA kernel
+    XLOG(DBG) << "Not using custom all-to-all algo because send/recv buff "
+                 "or msg size is not 16-byte aligned";
+    return nullptr;
+  }
+
+  if (datatype != commBfloat16 && datatype != commFloat16) {
+    // we currently only support bf16 and half
+    XLOG(DBG)
+        << "Not using custom all-to-all algo because cudaDataType_t datatype "
+        << static_cast<int>(datatype) << " is not supported";
+    return nullptr;
+  }
+
+  std::unique_ptr<AlgoAllToAll> algo;
+  if ((count * commTypeSize(datatype)) > ddaMaxThresholdBytes_) {
+    XLOG(DBG) << "Not using custom all-to-all algo because msg size "
+              << count * commTypeSize(datatype)
+              << " is larger than DDA algo threshold " << ddaMaxThresholdBytes_;
+    return nullptr;
+  } else {
+    if ((count * commTypeSize(datatype)) % 16) {
+      XLOG(DBG) << "Not using DDA all-to-all algo because send/recv buff "
+                   "or msg size is not 16-byte aligned for each rank";
+      return nullptr;
+    }
+    algo = std::make_unique<AlgoAllToAllDdaIpc>(
+        sendbuff,
+        reinterpret_cast<void**>(allRankDdaSendbuffs_->get()),
+        recvbuff,
+        count,
+        datatype,
+        stream,
+        nRanks_,
+        selfRank_,
+        maxBlocks_,
+        &barrier_,
+        acc);
+  }
+  return algo;
+}
+
+} // namespace meta::comms
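The gating above reduces to a small predicate over the message size. Restating it as a standalone helper makes the DDA engagement condition easy to reason about offline; this is a sketch derived from the checks above, not code from this commit (the bf16/fp16 dtype check is folded into a comment):

  #include <cstddef>
  #include <cstdint>

  // true iff the DDA all-to-all path would be taken for a bf16/half message
  bool ddaAllToAllEligible(
      const void* sendbuff,
      const void* recvbuff,
      size_t count,
      size_t dtypeSize, // 2 for bf16/half, the only supported types
      size_t ddaSendbufSizeBytes,
      size_t ddaMaxThresholdBytes) {
    const size_t msgBytes = count * dtypeSize;
    return msgBytes <= ddaSendbufSizeBytes && // must fit the staging buffer
        msgBytes <= ddaMaxThresholdBytes &&   // small-message regime only
        msgBytes % 16 == 0 &&                 // kernel does 16-byte loads
        reinterpret_cast<uintptr_t>(sendbuff) % 16 == 0 &&
        reinterpret_cast<uintptr_t>(recvbuff) % 16 == 0;
  }

For example, with bf16 (2 bytes) and count = 8192, msgBytes is 16 KiB: the DDA path engages as long as both configured limits are at least 16 KiB and the buffers are 16-byte aligned.
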
comms/common/algorithms/all_to_all/AllToAllAlgoManager.h

Lines changed: 47 additions & 0 deletions

@@ -0,0 +1,47 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+#pragma once
+
+#include "comms/common/IpcGpuBarrier.cuh"
+#include "comms/common/algorithms/all_to_all/AlgoAllToAll.cuh"
+#include "comms/ctran/interfaces/IBootstrap.h" // @manual
+#include "comms/utils/CudaRAII.h"
+#include "comms/utils/commSpecs.h"
+
+namespace meta::comms {
+
+class AllToAllAlgoManager {
+ public:
+  AllToAllAlgoManager(
+      std::shared_ptr<ctran::bootstrap::IBootstrap> bootstrap,
+      int nRanks,
+      int selfRank,
+      int maxBlocks,
+      int ddaSendbufSizeBytes,
+      int ddaMaxThresholdBytes);
+  AllToAllAlgoManager(const AllToAllAlgoManager&) = delete;
+  AllToAllAlgoManager(AllToAllAlgoManager&&) = delete;
+
+  std::unique_ptr<AlgoAllToAll> getAllToAllAlgo(
+      const void* sendbuff,
+      void* recvbuff,
+      size_t count,
+      commDataType_t datatype,
+      cudaStream_t stream,
+      const void* acc);
+
+ private:
+  int nRanks_{0};
+  int selfRank_{-1};
+  int maxBlocks_{0};
+  int ddaSendbufSizeBytes_{0};
+  int ddaMaxThresholdBytes_{0};
+  std::unique_ptr<IpcGpuBarrierResources> barrierResources_;
+  IpcGpuBarrier barrier_;
+  std::unique_ptr<DeviceBuffer> ddaSendbuf_;
+  std::unique_ptr<IpcMemHandler> memHandler_;
+  // array of void* (all ranks' ipc-enabled sendbufs) in device memory
+  std::unique_ptr<DeviceBuffer> allRankDdaSendbuffs_;
+};
+
+} // namespace meta::comms
