Commit e569982

Liu Ke authored and meta-codesync[bot] committed
share internal-buffer and barrier across all collectives in AlgoFactory
Summary: In this diff:

- Refactor the duplicated **internal buffer and barrier** in AllReduceAlgoManager, AllGatherAlgoManager, ReduceScatterAlgoManager, and AllToAllAlgoManager so they are created once and shared via **AlgoFactory**.
- Remove the "acc" parameter from AllGather, ReduceScatter, and AllToAll; currently only the AllReduceWithBias API uses an accumulator.

There is no major performance regression between the previous duplicated implementation and this shared one; details are in the table linked here: https://docs.google.com/spreadsheets/d/1YcJTbc3Tjk8qmbXb4sOoCekB0hyMiwP_A0ptf1nCoVc/edit?usp=sharing

Reviewed By: cenzhaometa

Differential Revision: D86362186

fbshipit-source-id: 32d8ffa8f1e46cd1bb028ce3adae701f550c07e2
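For illustration, a minimal sketch of the resulting call-site change, assuming a hypothetical caller (the bootstrap object and every numeric size below are made-up examples, not values from this diff; struct and parameter names follow the updated AlgoFactory.cuh):

// Hypothetical call site; 'bootstrap' is an existing
// std::shared_ptr<ctran::bootstrap::IBootstrap>, and all sizes are
// illustrative assumptions.
AlgoFactory::AllReduceOptions arOpts;
arOpts.enableDda = true;
arOpts.ddaFlatMaxThresholdBytes = 64 * 1024;
arOpts.ddaTreeMaxThresholdBytes = 1024 * 1024;

AlgoFactory::AllGatherOptions agOpts;
agOpts.enableDda = true;
agOpts.ddaMaxThresholdBytes = 1024 * 1024;

// The per-collective ddaSendbufSizeBytes fields are gone; a single
// factory-wide size is passed once and backs every collective.
AlgoFactory factory(
    bootstrap,
    /*nRanks=*/8,
    /*selfRank=*/0,
    /*maxBlocks=*/16,
    /*ddaSendbufSizeBytes=*/4 * 1024 * 1024,
    arOpts,
    agOpts,
    AlgoFactory::ReduceScatterOptions{},
    AlgoFactory::AllToAllOptions{});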
1 parent 1bfe95d · commit e569982

24 files changed: +189, -285 lines

comms/common/algorithms/AlgoFactory.cu

Lines changed: 47 additions & 18 deletions
@@ -12,14 +12,19 @@ AlgoFactory::AlgoFactory(
     int nRanks,
     int selfRank,
     int maxBlocks,
+    int ddaSendbufSizeBytes,
     const AllReduceOptions& allReduceOpts,
     const AllGatherOptions& allGatherOpts,
     const ReduceScatterOptions& reduceScatterOpts,
-    const AllToAllOptions& allToAllOpts) {
+    const AllToAllOptions& allToAllOpts)
+    : nRanks_(nRanks),
+      selfRank_(selfRank),
+      maxBlocks_(maxBlocks),
+      ddaSendbufSizeBytes_(ddaSendbufSizeBytes) {
   if (allReduceOpts.enableDda || allGatherOpts.enableDda ||
       reduceScatterOpts.enableDda || allToAllOpts.enableDda) {
     XLOG(DBG)
-        << "Initializing AllReduceAlgoManager / AllGatherAlgoManager / ReduceScatterAlgoManager / AllToAllAlgoManager";
+        << "Initializing AllReduce / AllGather / ReduceScatter / AllToAll AlgoManager";
 
     for (int i = 0; i < nRanks; ++i) {
       if (i == selfRank) {
@@ -30,51 +35,75 @@ AlgoFactory::AlgoFactory(
         CUDA_CHECK(e);
       }
     }
+
+    auto [barrierResources, barrier] =
+        IpcGpuBarrier::mallocAndInit(nRanks_, maxBlocks_, selfRank_, bootstrap);
+    barrierResources_ = std::move(barrierResources);
+    barrier_ = barrier;
+
+    ddaSendbuf_ = std::make_unique<DeviceBuffer>(ddaSendbufSizeBytes_);
+    memHandler_ =
+        std::make_unique<IpcMemHandler>(bootstrap, selfRank_, nRanks_);
+    memHandler_->addSelfDeviceMemPtr(ddaSendbuf_->get());
+    memHandler_->exchangeMemPtrs();
+
+    std::vector<void*> ipcSendbufs(nRanks_);
+    for (int i = 0; i < nRanks_; ++i) {
+      ipcSendbufs[i] = memHandler_->getPeerDeviceMemPtr(i);
+    }
+
+    allRankDdaSendbuffs_ =
+        std::make_unique<DeviceBuffer>(sizeof(void*) * nRanks_);
+    CUDA_CHECK(cudaMemcpy(
+        allRankDdaSendbuffs_->get(),
+        ipcSendbufs.data(),
+        sizeof(void*) * nRanks_,
+        cudaMemcpyDefault));
   }
 
   if (allReduceOpts.enableDda) {
     allReduceMgr_ = std::make_unique<AllReduceAlgoManager>(
-        bootstrap,
         nRanks,
         selfRank,
         maxBlocks,
-        allReduceOpts.ddaSendbufSizeBytes,
+        ddaSendbufSizeBytes,
         allReduceOpts.ddaFlatMaxThresholdBytes,
-        allReduceOpts.ddaTreeMaxThresholdBytes);
-    XLOG(DBG) << "Successfully initialized AllReduceAlgoManager";
+        allReduceOpts.ddaTreeMaxThresholdBytes,
+        reinterpret_cast<void**>(allRankDdaSendbuffs_->get()),
+        &barrier_);
   }
 
   if (allGatherOpts.enableDda) {
     allGatherMgr_ = std::make_unique<AllGatherAlgoManager>(
-        bootstrap,
         nRanks,
         selfRank,
         maxBlocks,
-        allGatherOpts.ddaSendbufSizeBytes,
-        allGatherOpts.ddaMaxThresholdBytes);
-    XLOG(DBG) << "Successfully initialized AllGatherAlgoManager";
+        ddaSendbufSizeBytes,
+        allGatherOpts.ddaMaxThresholdBytes,
+        reinterpret_cast<void**>(allRankDdaSendbuffs_->get()),
+        &barrier_);
   }
 
   if (reduceScatterOpts.enableDda) {
     reduceScatterMgr_ = std::make_unique<ReduceScatterAlgoManager>(
-        bootstrap,
         nRanks,
         selfRank,
         maxBlocks,
-        reduceScatterOpts.ddaSendbufSizeBytes,
-        reduceScatterOpts.ddaMaxThresholdBytes);
-    XLOG(DBG) << "Successfully initialized ReduceScatterAlgoManager";
+        ddaSendbufSizeBytes,
+        reduceScatterOpts.ddaMaxThresholdBytes,
+        reinterpret_cast<void**>(allRankDdaSendbuffs_->get()),
+        &barrier_);
   }
 
   if (allToAllOpts.enableDda) {
     allToAllMgr_ = std::make_unique<AllToAllAlgoManager>(
-        bootstrap,
         nRanks,
         selfRank,
         maxBlocks,
-        allToAllOpts.ddaSendbufSizeBytes,
-        allToAllOpts.ddaMaxThresholdBytes);
-    XLOG(DBG) << "Successfully initialized AllToAllAlgoManager";
+        ddaSendbufSizeBytes,
+        allToAllOpts.ddaMaxThresholdBytes,
+        reinterpret_cast<void**>(allRankDdaSendbuffs_->get()),
+        &barrier_);
   }
 }
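
The pointer table built above is deliberately copied into device memory: DDA kernels dereference peers' IPC-mapped sendbufs directly by rank index. A minimal sketch of that access pattern, assuming a simplified toy kernel (peerCopyKernel is illustrative, not the real DDA kernel):

// Toy kernel showing why allRankDdaSendbuffs_ lives in device memory:
// a kernel can index any peer's IPC-mapped sendbuf by rank at runtime.
__global__ void peerCopyKernel(
    void** allRankSendbufs, int peerRank, char* dst, size_t nbytes) {
  const char* src = static_cast<const char*>(allRankSendbufs[peerRank]);
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < nbytes;
       i += static_cast<size_t>(gridDim.x) * blockDim.x) {
    dst[i] = src[i]; // reads straight from the peer's mapped buffer
  }
}

Note also that the managers now receive raw pointers (&barrier_ and the device-resident pointer table) into factory-owned storage, so the factory must outlive them; that holds here, since AlgoFactory owns all four manager unique_ptrs.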

comms/common/algorithms/AlgoFactory.cuh

Lines changed: 25 additions & 13 deletions
@@ -2,11 +2,13 @@
 
 #pragma once
 
+#include "comms/common/IpcGpuBarrier.cuh"
 #include "comms/common/algorithms/all_gather/AllGatherAlgoManager.h"
 #include "comms/common/algorithms/all_reduce/AllReduceAlgoManager.h"
 #include "comms/common/algorithms/all_to_all/AllToAllAlgoManager.h"
 #include "comms/common/algorithms/reduce_scatter/ReduceScatterAlgoManager.h"
 #include "comms/ctran/interfaces/IBootstrap.h" // @manual
+#include "comms/utils/CudaRAII.h"
 #include "comms/utils/commSpecs.h"
 
 namespace meta::comms {
@@ -26,40 +28,41 @@ class AlgoFactory {
  public:
   struct AllReduceOptions {
     bool enableDda{false};
-    int ddaSendbufSizeBytes{0};
     // If msg size is not larger than the threshold,
     // flat (one-shot) DDA will be used
     int ddaFlatMaxThresholdBytes{0};
     // If msg size is not larger than the threshold,
     // tree (two-shot) DDA will be used
     int ddaTreeMaxThresholdBytes{0};
   };
+
   struct AllGatherOptions {
     bool enableDda{false};
-    int ddaSendbufSizeBytes{0};
     // If msg size is not larger than the threshold,
     // DDA will be used
     int ddaMaxThresholdBytes{0};
   };
+
   struct ReduceScatterOptions {
     bool enableDda{false};
-    int ddaSendbufSizeBytes{0};
     // If msg size is not larger than the threshold,
     // DDA will be used
     int ddaMaxThresholdBytes{0};
   };
+
   struct AllToAllOptions {
     bool enableDda{false};
-    int ddaSendbufSizeBytes{0};
     // If msg size is not larger than the threshold,
     // DDA will be used
     int ddaMaxThresholdBytes{0};
   };
+
   AlgoFactory(
       std::shared_ptr<ctran::bootstrap::IBootstrap> bootstrap,
       int nRanks,
       int selfRank,
       int maxBlocks,
+      int ddaSendbufSizeBytes,
       const AllReduceOptions& allReduceOpts,
       const AllGatherOptions& allGatherOpts,
       const ReduceScatterOptions& reduceScatterOpts,
@@ -84,44 +87,53 @@ class AlgoFactory {
       void* recvbuff,
       size_t count,
       commDataType_t datatype,
-      cudaStream_t stream,
-      const void* acc = nullptr) {
+      cudaStream_t stream) {
     if (allGatherMgr_ == nullptr) {
       return nullptr;
     }
     return allGatherMgr_->getAllGatherAlgo(
-        sendbuff, recvbuff, count, datatype, stream, acc);
+        sendbuff, recvbuff, count, datatype, stream);
   }
 
   std::unique_ptr<AlgoReduceScatter> getReduceScatterAlgo(
       const void* sendbuff,
       void* recvbuff,
       size_t count,
       commDataType_t datatype,
-      cudaStream_t stream,
-      const void* acc = nullptr) {
+      cudaStream_t stream) {
     if (reduceScatterMgr_ == nullptr) {
       return nullptr;
     }
     return reduceScatterMgr_->getReduceScatterAlgo(
-        sendbuff, recvbuff, count, datatype, stream, acc);
+        sendbuff, recvbuff, count, datatype, stream);
   }
 
   std::unique_ptr<AlgoAllToAll> getAllToAllAlgo(
       const void* sendbuff,
       void* recvbuff,
       size_t count,
       commDataType_t datatype,
-      cudaStream_t stream,
-      const void* acc = nullptr) {
+      cudaStream_t stream) {
     if (allToAllMgr_ == nullptr) {
       return nullptr;
     }
     return allToAllMgr_->getAllToAllAlgo(
-        sendbuff, recvbuff, count, datatype, stream, acc);
+        sendbuff, recvbuff, count, datatype, stream);
   }
 
  private:
+  int nRanks_{0};
+  int selfRank_{-1};
+  int maxBlocks_{0};
+  int ddaSendbufSizeBytes_{0};
+
+  std::unique_ptr<IpcGpuBarrierResources> barrierResources_;
+  IpcGpuBarrier barrier_;
+  std::unique_ptr<DeviceBuffer> ddaSendbuf_;
+  std::unique_ptr<IpcMemHandler> memHandler_;
+  // array of void* (all ranks' IPC-enabled sendbufs) in device memory
+  std::unique_ptr<DeviceBuffer> allRankDdaSendbuffs_;
+
   std::unique_ptr<AllReduceAlgoManager> allReduceMgr_{nullptr};
   std::unique_ptr<AllGatherAlgoManager> allGatherMgr_{nullptr};
   std::unique_ptr<ReduceScatterAlgoManager> reduceScatterMgr_{nullptr};
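
One behavioral note that survives the refactor: each getXxxAlgo accessor returns nullptr when the corresponding manager is disabled (or, inside the manager, when the message does not qualify for DDA), so callers fall back to a baseline collective. A hedged usage sketch, where factory, the buffers, and fallbackAllGather() are placeholders rather than APIs from this codebase:

// Hypothetical caller; fallbackAllGather() is a made-up stand-in for
// whatever baseline path the caller normally uses.
auto algo = factory.getAllGatherAlgo(sendbuff, recvbuff, count, datatype, stream);
if (algo != nullptr) {
  algo->allGather(); // DDA fast path
} else {
  fallbackAllGather(sendbuff, recvbuff, count, datatype, stream);
}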

comms/common/algorithms/all_gather/AlgoAllGather.cu

Lines changed: 2 additions & 4 deletions
@@ -15,8 +15,7 @@ AlgoAllGather::AlgoAllGather(
     int nRanks,
     int selfRank,
     int maxBlocks,
-    IpcGpuBarrier* barrier,
-    const void* acc)
+    IpcGpuBarrier* barrier)
     : sendbuff_(sendbuff),
       allRankDdaSendbuffs_(allRankDdaSendbuffs),
       recvbuff_(recvbuff),
@@ -26,8 +25,7 @@ AlgoAllGather::AlgoAllGather(
       nRanks_(nRanks),
       selfRank_(selfRank),
       maxBlocks_(maxBlocks),
-      barrier_(barrier),
-      acc_(acc) {}
+      barrier_(barrier) {}
 
 void AlgoAllGatherDdaIpc::allGather() {
   TYPED_CALL(datatype_, launchKernel);

comms/common/algorithms/all_gather/AlgoAllGather.cuh

Lines changed: 2 additions & 5 deletions
@@ -29,8 +29,7 @@ class AlgoAllGather {
       int nRanks,
       int selfRank,
       int maxBlocks,
-      IpcGpuBarrier* barrier,
-      const void* acc);
+      IpcGpuBarrier* barrier);
 
   virtual ~AlgoAllGather() = default;
 
@@ -47,7 +46,6 @@ class AlgoAllGather {
   int selfRank_{0};
   const size_t maxBlocks_{0};
   IpcGpuBarrier* barrier_;
-  const void* acc_{nullptr};
 };
 
 class AlgoAllGatherDdaIpc : public AlgoAllGather {
@@ -74,8 +72,7 @@ class AlgoAllGatherDdaIpc : public AlgoAllGather {
         &count_,
         &sendbuff_,
         &selfRank_,
-        barrier_,
-        &acc_};
+        barrier_};
     CUDA_CHECK(cudaLaunchKernel(func, grid, block, args, 0, stream_));
   }
 };
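
The args array above follows the cudaLaunchKernel convention: each element is the address of one kernel argument, in signature order. (Passing barrier_ without a leading & suggests the kernel takes the IpcGpuBarrier by value, since barrier_ already points at that storage.) A self-contained sketch with an assumed toy kernel:

// Toy example of the cudaLaunchKernel argument-array convention;
// kernel name and signature are illustrative only.
#include <cuda_runtime.h>

__global__ void toyKernel(int n, float* out) {
  if (blockIdx.x == 0 && threadIdx.x == 0) {
    out[0] = static_cast<float>(n);
  }
}

cudaError_t launchToy(int n, float* out, cudaStream_t stream) {
  void* args[] = {&n, &out}; // addresses of the arguments, in order
  dim3 grid(1), block(32);
  return cudaLaunchKernel(
      reinterpret_cast<void*>(toyKernel), grid, block, args,
      /*sharedMem=*/0, stream);
}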

comms/common/algorithms/all_gather/AllGatherAlgoManager.cu

Lines changed: 18 additions & 36 deletions
@@ -5,39 +5,20 @@
 namespace meta::comms {
 
 AllGatherAlgoManager::AllGatherAlgoManager(
-    std::shared_ptr<ctran::bootstrap::IBootstrap> bootstrap,
     int nRanks,
     int selfRank,
     int maxBlocks,
     int ddaSendbufSizeBytes,
-    int ddaMaxThresholdBytes)
+    int ddaMaxThresholdBytes,
+    void** allRankDdaSendbuffs,
+    IpcGpuBarrier* barrier)
     : nRanks_(nRanks),
       selfRank_(selfRank),
       maxBlocks_(maxBlocks),
       ddaSendbufSizeBytes_(ddaSendbufSizeBytes),
-      ddaMaxThresholdBytes_(ddaMaxThresholdBytes) {
-  auto [barrierResources, barrier] =
-      IpcGpuBarrier::mallocAndInit(nRanks_, maxBlocks_, selfRank_, bootstrap);
-  barrierResources_ = std::move(barrierResources);
-  barrier_ = barrier;
-
-  ddaSendbuf_ = std::make_unique<DeviceBuffer>(ddaSendbufSizeBytes_);
-  memHandler_ = std::make_unique<IpcMemHandler>(bootstrap, selfRank_, nRanks_);
-  memHandler_->addSelfDeviceMemPtr(ddaSendbuf_->get());
-  memHandler_->exchangeMemPtrs();
-
-  std::vector<void*> ipcSendbufs(nRanks_);
-  for (int i = 0; i < nRanks_; ++i) {
-    ipcSendbufs[i] = memHandler_->getPeerDeviceMemPtr(i);
-  }
-
-  allRankDdaSendbuffs_ =
-      std::make_unique<DeviceBuffer>(sizeof(void*) * nRanks_);
-  CUDA_CHECK(cudaMemcpy(
-      allRankDdaSendbuffs_->get(),
-      ipcSendbufs.data(),
-      sizeof(void*) * nRanks_,
-      cudaMemcpyDefault));
+      ddaMaxThresholdBytes_(ddaMaxThresholdBytes),
+      allRankDdaSendbuffs_(allRankDdaSendbuffs),
+      barrier_(barrier) {
   XLOG(DBG) << "Successfully initialized AllGatherAlgoManager";
 }
 
@@ -46,15 +27,15 @@ std::unique_ptr<AlgoAllGather> AllGatherAlgoManager::getAllGatherAlgo(
     void* recvbuff,
     size_t count,
     commDataType_t datatype,
-    cudaStream_t stream,
-    const void* acc) {
-  if (count * commTypeSize(datatype) > ddaSendbufSizeBytes_) {
-    // msg size must fit into the dda sendbuf
+    cudaStream_t stream) {
+  if ((nRanks_ * count * commTypeSize(datatype)) > ddaSendbufSizeBytes_) {
+    // AG: msgSize = (nRanks_ x count x datatype) must fit into the dda sendbuf
     XLOG(DBG) << "Not using custom all gather algo because message size "
-              << count * commTypeSize(datatype)
+              << nRanks_ * count * commTypeSize(datatype)
               << " is larger than ddaSendbufSizeBytes " << ddaSendbufSizeBytes_;
     return nullptr;
   }
+
   if (((uintptr_t)sendbuff % 16) || ((uintptr_t)recvbuff % 16) ||
       ((count * commTypeSize(datatype)) % 16)) {
     // 16 byte alignment as we do 16-byte loads in DDA kernel
@@ -72,29 +53,30 @@ std::unique_ptr<AlgoAllGather> AllGatherAlgoManager::getAllGatherAlgo(
   }
 
   std::unique_ptr<AlgoAllGather> algo;
-  if (count * commTypeSize(datatype) > ddaMaxThresholdBytes_) {
+  if ((nRanks_ * count * commTypeSize(datatype)) > ddaMaxThresholdBytes_) {
+    // AG: msgSize = (nRanks_ x count x datatype) must not exceed the algo threshold
     XLOG(DBG) << "Not using custom all gather algo because msg size "
-              << count * commTypeSize(datatype)
+              << nRanks_ * count * commTypeSize(datatype)
               << " is larger than DDA algo threshold " << ddaMaxThresholdBytes_;
     return nullptr;
   } else {
-    if ((count * commTypeSize(datatype)) % 16) {
+    if (((count * commTypeSize(datatype)) % 16) ||
+        ((nRanks_ * count * commTypeSize(datatype)) % 16)) {
       XLOG(DBG) << "Not using DDA all gather algo because send/recv buff "
                    "or msg size is not 16-byte aligned for each rank";
       return nullptr;
     }
     algo = std::make_unique<AlgoAllGatherDdaIpc>(
         sendbuff,
-        reinterpret_cast<void**>(allRankDdaSendbuffs_->get()),
+        allRankDdaSendbuffs_,
        recvbuff,
        count,
        datatype,
        stream,
        nRanks_,
        selfRank_,
        maxBlocks_,
-        &barrier_,
-        acc);
+        barrier_);
   }
   return algo;
 }
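
Net effect of the updated checks: for all-gather, the size that must fit in the DDA sendbuf and stay under the threshold is the full gathered message, nRanks x count x typeSize, not just this rank's contribution. The same logic as a standalone predicate (an illustrative paraphrase, not code from this diff):

// Illustrative paraphrase of the gating logic above.
#include <cstddef>
#include <cstdint>

bool allGatherDdaEligible(
    int nRanks, size_t count, size_t typeSize,
    size_t ddaSendbufSizeBytes, size_t ddaMaxThresholdBytes,
    const void* sendbuff, const void* recvbuff) {
  const size_t perRankBytes = count * typeSize;
  const size_t totalBytes = static_cast<size_t>(nRanks) * perRankBytes;
  if (totalBytes > ddaSendbufSizeBytes) {
    return false; // gathered message must fit in the shared dda sendbuf
  }
  if (totalBytes > ddaMaxThresholdBytes) {
    return false; // above the DDA algo threshold
  }
  // 16-byte alignment, since the DDA kernel issues 16-byte loads/stores
  if (reinterpret_cast<uintptr_t>(sendbuff) % 16 != 0 ||
      reinterpret_cast<uintptr_t>(recvbuff) % 16 != 0 ||
      perRankBytes % 16 != 0 || totalBytes % 16 != 0) {
    return false;
  }
  return true;
}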
