
Commit ed71b87

Bugfixes & perf improvements to all_to_all (#15)

1 parent: d480977 · commit: ed71b87

17 files changed: +248 −123 lines changed

csrc/all_to_all/all_to_all.cpp
Lines changed: 5 additions & 5 deletions

@@ -29,14 +29,14 @@ AllToAll::AllToAll(
       dpSize(dpSize),
       numSMs(get_sm_count()) {

-  ROSE_ASSERT(hiddenDimBytes % 16 == 0, "invalid hidden dim bytes");
-  ROSE_ASSERT(hiddenDimScaleBytes % 16 == 0, "invalid hidden dim scale bytes");
+  PPLX_ASSERT(hiddenDimBytes % 16 == 0, "invalid hidden dim bytes");
+  PPLX_ASSERT(hiddenDimScaleBytes % 16 == 0, "invalid hidden dim scale bytes");
   const size_t perTokenBytes =
       round_up<size_t>(hiddenDimBytes + hiddenDimScaleBytes + sizeof(uint32_t), 16);

-  ROSE_ASSERT(numLocalExperts != 0, "numLocalExperts is 0");
-  ROSE_ASSERT(numDPGroups > 1, "at least 2 DP groups are required");
-  ROSE_ASSERT(hiddenDimScaleBytes <= hiddenDimBytes, "invalid hidden dim bytes");
+  PPLX_ASSERT(numLocalExperts != 0, "numLocalExperts is 0");
+  PPLX_ASSERT(numDPGroups > 1, "at least 2 DP groups are required");
+  PPLX_ASSERT(hiddenDimScaleBytes <= hiddenDimBytes, "invalid hidden dim bytes");
 }

 AllToAll::~AllToAll() {}
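
Both alignment asserts and the perTokenBytes computation round up to a 16-byte boundary, which matches sizeof(int4), the granularity at which the CUDA kernels in this commit copy token data. A minimal sketch of helpers of this shape (an assumption: the repository's actual definitions in core/utils.h may differ):

    template <typename T> constexpr T ceil_div(T a, T b) { return (a + b - 1) / b; }
    template <typename T> constexpr T round_up(T a, T b) { return ceil_div(a, b) * b; }

    // Worked example with a hypothetical hiddenDimBytes = 14336, hiddenDimScaleBytes = 0:
    // round_up<size_t>(14336 + 0 + sizeof(uint32_t), 16) == round_up(14340, 16) == 14352.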

csrc/all_to_all/bench_all_to_all.cpp
Lines changed: 1 addition & 0 deletions

@@ -16,6 +16,7 @@
 #include <nvshmemx.h>
 #include <nvtx3/nvToolsExt.h>

+#include <array>
 #include <iomanip>
 #include <iostream>
csrc/all_to_all/internode.cpp
Lines changed: 8 additions & 8 deletions

@@ -36,36 +36,36 @@ AllToAllInterNode::AllToAllInterNode(
   numTokensPerDP = mallocZeroBuffer<uint32_t>(numLocalExperts * numDPGroups);

   numTokensBuffer = (uint64_t *)nvshmem_malloc(sizeof(uint64_t) * numLocalExperts * numDPGroups);
-  ROSE_ASSERT(numTokensBuffer != nullptr, "failed to allocate numTokensBuffer");
+  PPLX_ASSERT(numTokensBuffer != nullptr, "failed to allocate numTokensBuffer");
   cudaMemset(numTokensBuffer, 0, sizeof(uint64_t) * numLocalExperts * numDPGroups);

   numDispatchRecvBuffer =
       (uint64_t *)nvshmem_malloc(sizeof(uint64_t) * numLocalExperts * numDPGroups);
-  ROSE_ASSERT(numDispatchRecvBuffer != nullptr, "failed to allocate numDispatchRecvBuffer");
+  PPLX_ASSERT(numDispatchRecvBuffer != nullptr, "failed to allocate numDispatchRecvBuffer");
   cudaMemset(numDispatchRecvBuffer, 0, sizeof(uint64_t) * numLocalExperts * numDPGroups);

   combineSignalBuffer = (uint64_t *)nvshmem_malloc(sizeof(uint64_t) * maxNumTokens);
-  ROSE_ASSERT(combineSignalBuffer != nullptr, "failed to allocate combineSignalBuffer");
+  PPLX_ASSERT(combineSignalBuffer != nullptr, "failed to allocate combineSignalBuffer");
   cudaMemset(combineSignalBuffer, 0, sizeof(uint64_t) * maxNumTokens);

   combineSyncBuffer = (uint64_t *)nvshmem_malloc(sizeof(uint64_t) * worldSize);
-  ROSE_ASSERT(combineSyncBuffer != nullptr, "failed to allocate combineSyncBuffer");
+  PPLX_ASSERT(combineSyncBuffer != nullptr, "failed to allocate combineSyncBuffer");
   cudaMemset(combineSyncBuffer, 0, sizeof(uint64_t) * worldSize);

   // Buffers for dispatch.
   const size_t perTokenBytes =
       round_up<size_t>(hiddenDimBytes + hiddenDimScaleBytes + sizeof(uint32_t), 16);
   xDispatchIn = (std::byte *)nvshmem_malloc(maxNumTokens * perTokenBytes);
-  ROSE_ASSERT(xDispatchIn != nullptr, "failed to allocate xDispatchIn");
+  PPLX_ASSERT(xDispatchIn != nullptr, "failed to allocate xDispatchIn");
   xDispatchOut = (std::byte *)nvshmem_malloc(maxBatchTokens * perTokenBytes);
-  ROSE_ASSERT(xDispatchOut != nullptr, "failed to allocate xDispatchOut");
+  PPLX_ASSERT(xDispatchOut != nullptr, "failed to allocate xDispatchOut");

   // Buffers for combine. The allocations are a bit wider to accommodate all
   // possible data types (primarily float for testing and bfloat16 for prod).
   xCombineIn = (std::byte *)nvshmem_malloc(maxBatchTokens * hiddenDim * sizeof(float));
-  ROSE_ASSERT(xCombineIn != nullptr, "failed to allocate xCombineIn");
+  PPLX_ASSERT(xCombineIn != nullptr, "failed to allocate xCombineIn");
   xCombineOut = (std::byte *)nvshmem_malloc(maxNumTokens * numExperts * hiddenDim * sizeof(float));
-  ROSE_ASSERT(xCombineOut != nullptr, "failed to allocate xCombineOut");
+  PPLX_ASSERT(xCombineOut != nullptr, "failed to allocate xCombineOut");

   // Buffers for token tracking.
   sourceIndex = mallocZeroBuffer<uint32_t>(maxBatchTokens);
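
Each allocation above follows the same three-step pattern: nvshmem_malloc is a collective call that reserves space on the symmetric heap of every PE and returns nullptr on failure, hence the host-side assert before the buffer is zeroed. The pattern in isolation (a sketch using the same calls as the diff, with a placeholder count):

    // Collective symmetric-heap allocation, failure check, zero-initialization.
    uint64_t *buf = (uint64_t *)nvshmem_malloc(sizeof(uint64_t) * count);
    PPLX_ASSERT(buf != nullptr, "failed to allocate buffer");
    cudaMemset(buf, 0, sizeof(uint64_t) * count);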

csrc/all_to_all/internode_combine.cu
Lines changed: 1 addition & 1 deletion

@@ -218,7 +218,7 @@ void AllToAllInterNode::combine(
     ));
     break;
   default:
-    ROSE_UNREACHABLE("invalid split mode");
+    PPLX_UNREACHABLE("invalid split mode");
   }
   nvtxRangePop();
 }

csrc/all_to_all/internode_dispatch.cu
Lines changed: 7 additions & 8 deletions

@@ -4,7 +4,7 @@
 #include <nvtx3/nvToolsExt.h>

 #include "all_to_all/internode.h"
-#include "core/device_utils.h"
+#include "core/device_utils.cuh"
 #include "core/utils.h"

 using namespace pplx;
@@ -58,16 +58,15 @@ __global__ __launch_bounds__(NUM_WARPS * 32, 1) void dispatchKernel(
   const unsigned dpGroup = rank / dpSize;
   const unsigned dpRank = rank % dpSize;
   const unsigned tokenDim = hiddenDim + hiddenDimScale;
-  const unsigned tokenStride =
-      device::round_up<unsigned>(tokenDim + sizeof(uint32_t), sizeof(int4));
+  const unsigned tokenStride = round_up<unsigned>(tokenDim + sizeof(uint32_t), sizeof(int4));
   const unsigned WARP_SIZE = 32;
   const unsigned warpId = threadIdx.x / WARP_SIZE;
   const unsigned laneId = threadIdx.x % WARP_SIZE;

   // Determine the number of tokens populated which are to be sent.
   const unsigned numSendTokens = boundM ? __ldg(boundM) : m;
-  ROSE_DEVICE_ASSERT(numSendTokens <= maxNumTokens);
-  ROSE_DEVICE_ASSERT(
+  PPLX_DEVICE_ASSERT(numSendTokens <= maxNumTokens);
+  PPLX_DEVICE_ASSERT(
       hiddenDimScale == 0 || numSendTokens == 0 || (expertXScale != nullptr && dpXScale != nullptr)
   );

@@ -170,14 +169,14 @@ __global__ __launch_bounds__(NUM_WARPS * 32, 1) void dispatchKernel(
     }

     if (DO_RECV) {
-      __syncthreads();
+      cooperative_groups::this_grid().sync();
     }
   }

   if constexpr (DO_RECV) {
     // Wait for the token counts to be sent.
     const size_t numExpertsAndGroups = numLocalExperts * numDPGroups;
-    const size_t expertsPerBlock = device::ceil_div<size_t>(numExpertsAndGroups, gridDim.x);
+    const size_t expertsPerBlock = ceil_div<size_t>(numExpertsAndGroups, gridDim.x);
     uint32_t *sharedExpert = reinterpret_cast<uint32_t *>(sharedMemory);
     uint32_t *sharedToken = sharedExpert + expertsPerBlock;

@@ -353,7 +352,7 @@ void AllToAllInterNode::dispatch(
     ));
     break;
   default:
-    ROSE_UNREACHABLE("invalid split mode");
+    PPLX_UNREACHABLE("invalid split mode");
   }
   nvtxRangePop();
 }
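
The functional fix in this file swaps a per-block __syncthreads() for cooperative_groups::this_grid().sync(), so every block in the grid, not just every thread of one block, reaches the barrier before the receive phase reads counters written by other blocks. Grid-wide sync only works for kernels launched cooperatively. A self-contained sketch (hypothetical kernel and launch, not the repository's code, assuming a device that supports cooperative launch):

    #include <cooperative_groups.h>
    #include <cstdio>
    namespace cg = cooperative_groups;

    __global__ void gridSyncKernel(int *flag) {
      // One block publishes a value...
      if (blockIdx.x == 0 && threadIdx.x == 0) *flag = 1;
      // ...and the grid-wide barrier guarantees every block observes it afterwards.
      cg::this_grid().sync();
      if (threadIdx.x == 0 && *flag != 1) printf("block %d missed the write\n", blockIdx.x);
    }

    int main() {
      int *flag;
      cudaMalloc(&flag, sizeof(int));
      cudaMemset(flag, 0, sizeof(int));
      void *args[] = {&flag};
      // this_grid().sync() is only valid under a cooperative launch.
      cudaLaunchCooperativeKernel((void *)gridSyncKernel, dim3(4), dim3(32), args, 0, nullptr);
      cudaDeviceSynchronize();
      cudaFree(flag);
      return 0;
    }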

csrc/all_to_all/intranode.cpp
Lines changed: 29 additions & 1 deletion

@@ -76,7 +76,6 @@ AllToAllIntraNode::AllToAllIntraNode(
   }

   auto dstHandlesHost = distributed->allToAll(srcHandlesHost);
-
   for (unsigned i = 0; i < worldSize; i++) {
     auto &ptr = recvBuffers.emplace_back();
     if (i == rank) {
@@ -97,6 +96,31 @@ AllToAllIntraNode::AllToAllIntraNode(
     ));
   }

+  // Allocate the local buffer for dispatch counts.
+  CUDACHECK(cudaMalloc(&localRecvCountPtr, sizeof(uint32_t) * maxNumTokens));
+  CUDACHECK(cudaMemset(localRecvCountPtr, 0, sizeof(uint32_t) * maxNumTokens));
+  CUDACHECK(cudaMalloc(&countBuffersPtr, sizeof(uint32_t *) * worldSize));
+  {
+    cudaIpcMemHandle_t countHandle;
+    CUDACHECK(cudaIpcGetMemHandle(&countHandle, localRecvCountPtr));
+    auto countHandlesHost = distributed->allGather(countHandle);
+
+    countBuffers.resize(worldSize);
+    for (unsigned i = 0; i < worldSize; i++) {
+      if (i == rank) {
+        countBuffers[i] = localRecvCountPtr;
+      } else {
+        CUDACHECK(cudaIpcOpenMemHandle(
+            (void **)&countBuffers[i], countHandlesHost[i], cudaIpcMemLazyEnablePeerAccess
+        ));
+      }
+    }
+
+    CUDACHECK(cudaMemcpy(
+        countBuffersPtr, countBuffers.data(), sizeof(uint32_t *) * worldSize, cudaMemcpyHostToDevice
+    ));
+  }
+
   // Allocate the local buffers.
   tokenCount = mallocZeroBuffer<uint32_t>(numExperts);
   numTokensPerRank = mallocZeroBuffer<uint32_t>(numLocalExperts * worldSize);
@@ -117,11 +141,15 @@ AllToAllIntraNode::~AllToAllIntraNode() {
     CUDACHECK(cudaFree(sendBuffers[i]));
     if (i != rank) {
       CUDACHECK(cudaIpcCloseMemHandle(recvBuffers[i]));
+      CUDACHECK(cudaIpcCloseMemHandle(countBuffers[i]));
     }
   }

   CUDACHECK(cudaFree(recvBuffersPtr));
   CUDACHECK(cudaFree(sendBuffersPtr));
+  CUDACHECK(cudaFree(countBuffersPtr));
+  CUDACHECK(cudaFree(localRecvCountPtr));
+
   CUDACHECK(cudaFree(tokenCount));
   CUDACHECK(cudaFree(numTokensPerRank));
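
The new count-buffer setup reuses the CUDA IPC recipe already applied to the send/receive buffers: export a cudaIpcMemHandle_t for the local device allocation, exchange handles across ranks, and open every peer's handle. The core of that recipe in isolation (a sketch; distributed->allGather and CUDACHECK are the repository's own helpers, used as in the diff):

    cudaIpcMemHandle_t handle;
    CUDACHECK(cudaIpcGetMemHandle(&handle, localRecvCountPtr));
    auto handles = distributed->allGather(handle);  // one handle per rank

    countBuffers.resize(worldSize);
    for (unsigned i = 0; i < worldSize; i++) {
      if (i == rank) {
        countBuffers[i] = localRecvCountPtr;  // our own allocation needs no IPC mapping
      } else {
        // Map the peer's allocation into this process's address space.
        CUDACHECK(cudaIpcOpenMemHandle(
            (void **)&countBuffers[i], handles[i], cudaIpcMemLazyEnablePeerAccess));
      }
    }

Each cudaIpcOpenMemHandle must later be paired with cudaIpcCloseMemHandle, which is exactly what the destructor hunk adds.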

csrc/all_to_all/intranode.cuh
Lines changed: 2 additions & 2 deletions

@@ -1,6 +1,6 @@
 #pragma once

-#include "core/device_utils.h"
+#include "core/device_utils.cuh"

 #include <cstdint>

@@ -52,7 +52,7 @@ private:

   __device__ __forceinline__ std::byte *getBaseTokenPtr(unsigned rank) {
     return getBaseCounterPtr(rank) +
-           device::round_up<size_t>(numLocalExperts * sizeof(uint32_t), sizeof(int4));
+           round_up<size_t>(numLocalExperts * sizeof(uint32_t), sizeof(int4));
   }

 private:
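
The include now points at core/device_utils.cuh and the device:: qualifier is dropped here as well, suggesting round_up is now a single helper shared by host and device code. A sketch of that shape (an assumption about the header's contents, not its actual definition):

    // Callable from both host and device code when compiled by nvcc.
    template <typename T>
    __host__ __device__ constexpr T round_up(T value, T alignment) {
      return ((value + alignment - 1) / alignment) * alignment;
    }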

csrc/all_to_all/intranode.h
Lines changed: 14 additions & 10 deletions

@@ -66,22 +66,26 @@ class AllToAllIntraNode final : public AllToAll {
   /// @section Peer-to-Peer shared buffers.
   std::vector<std::byte *> sendBuffers;
   std::byte **sendBuffersPtr;
-
   std::vector<std::byte *> recvBuffers;
   std::byte **recvBuffersPtr;

+  /// Buffer to synchronize multiple senders with a receiver in dispatch.
+  uint32_t *localRecvCountPtr;
+  std::vector<uint32_t *> countBuffers;
+  uint32_t **countBuffersPtr;
+
   /// @section Global buffers for use within kernels.
-  uint32_t *numTokensPerRank = nullptr;
-  uint32_t *tokenCount = nullptr;
+  uint32_t *numTokensPerRank;
+  uint32_t *tokenCount;

   /// @section Internal buffers communicating between dispatch and combine.
-  uint32_t *sourceIndex = nullptr;
-  uint32_t *sourceExpert = nullptr;
-  uint32_t *sourceOffset = nullptr;
-  uint32_t *sourceRank = nullptr;
-  uint32_t *sourceToken = nullptr;
-  uint32_t *sourceRoute = nullptr;
-  uint32_t *tokenIndex = nullptr;
+  uint32_t *sourceIndex;
+  uint32_t *sourceExpert;
+  uint32_t *sourceOffset;
+  uint32_t *sourceRank;
+  uint32_t *sourceToken;
+  uint32_t *sourceRoute;
+  uint32_t *tokenIndex;
 };

 } // namespace pplx
