perplexityai
diff --git a/‎.gitignore
Lines changed: 6 additions & 1 deletion b/‎.gitignore
Lines changed: 6 additions & 1 deletion
diff --git a/‎csrc/all_to_all/CMakeLists.txt
Lines changed: 2 additions & 2 deletions b/‎csrc/all_to_all/CMakeLists.txt
Lines changed: 2 additions & 2 deletions
diff --git a/‎csrc/all_to_all/all_to_all.h
Lines changed: 1 addition & 1 deletion b/‎csrc/all_to_all/all_to_all.h
Lines changed: 1 addition & 1 deletion
diff --git a/‎csrc/all_to_all/bench_all_to_all.cpp
Lines changed: 22 additions & 21 deletions b/‎csrc/all_to_all/bench_all_to_all.cpp
Lines changed: 22 additions & 21 deletions
diff --git a/‎csrc/all_to_all/internode.cpp
Lines changed: 26 additions & 26 deletions b/‎csrc/all_to_all/internode.cpp
Lines changed: 26 additions & 26 deletions
diff --git a/‎csrc/all_to_all/internode.h
Lines changed: 10 additions & 10 deletions b/‎csrc/all_to_all/internode.h
Lines changed: 10 additions & 10 deletions
diff --git a/‎csrc/all_to_all/internode_gather.cu renamed to ‎csrc/all_to_all/internode_combine.cu
Lines changed: 23 additions & 23 deletions b/‎csrc/all_to_all/internode_gather.cu renamed to ‎csrc/all_to_all/internode_combine.cu
Lines changed: 23 additions & 23 deletions
@@ -1,6 +1,11 @@
 build-cmake
 build
-pplx_kernels/*.so
+*.so
 *.egg-info
 *.pyc
 data
+dist
+.ruff_cache
+.mypy_cache
+.pytest_cache
+__pycache__
@@ -2,8 +2,8 @@
 
 add_library(all_to_all_lib STATIC
     all_to_all.cpp
-    internode_scatter.cu
-    internode_gather.cu
+    internode_dispatch.cu
+    internode_combine.cu
     internode.cpp
 )
 target_link_libraries(all_to_all_lib PUBLIC
 
@@ -66,7 +66,7 @@ class AllToAll {
   /// The maximum number of tokens in a batch.
   const size_t maxBatchTokens;
 
-  /// @section Internal buffers communicating between scatter and gather.
+  /// @section Internal buffers communicating between dispatch and combine.
   uint32_t *numTokensPerDP = nullptr;
   uint32_t *sourceIndex = nullptr;
   uint32_t *sourceExpert = nullptr;
 
@@ -1,4 +1,4 @@
-// All-to-all scatter benchmark
+// All-to-all benchmark
 
 #include "all_to_all/internode.h"
 #include "all_to_all/test_utils.h"
@@ -100,14 +100,14 @@ std::pair<Time, Time> benchmark(
   // Warmup
   auto run = [&]() -> std::pair<float, float> {
     nvshmemx_barrier_all_on_stream(stream);
-    // Scatter.
+    // Dispatch.
     for (size_t i = 0; i < numSamples; i++) {
       nvshmemx_barrier_all_on_stream(stream);
       CUDACHECK(cudaStreamSynchronize(stream));
 
       CUDACHECK(cudaEventRecord(std::get<0>(events[i]), stream));
 
-      allToAll.scatter(
+      allToAll.dispatch(
           Strided1D<int32_t>(outTokensPerExpertDevice, 1),
           Strided2D<std::byte>(
               outExpertDevice, hiddenDimBytes, hiddenDimBytes * config.numTokens * numPEs
@@ -128,7 +128,7 @@ std::pair<Time, Time> benchmark(
 
       CUDACHECK(cudaEventRecord(std::get<1>(events[i]), stream));
 
-      allToAll.gather<T>(
+      allToAll.combine<T>(
           Strided1D<nv_bfloat16>(outTokensDevice, config.hiddenDim),
           Strided2D<uint32_t>(indicesDevice, 1, config.expertsPerToken),
           Strided2D<float>(weightsDevice, 1, config.expertsPerToken),
@@ -145,15 +145,15 @@ std::pair<Time, Time> benchmark(
     }
 
     CUDACHECK(cudaStreamSynchronize(stream));
-    float totalScatterMs = 0.0f, totalGatherMs = 0.0f;
+    float totalDispatchMs = 0.0f, totalCombineMs = 0.0f;
     for (size_t i = 0; i < numSamples; i++) {
-      float scatterMs = 0.0f, gatherMs = 0.0f;
-      CUDACHECK(cudaEventElapsedTime(&scatterMs, std::get<0>(events[i]), std::get<1>(events[i])));
-      CUDACHECK(cudaEventElapsedTime(&gatherMs, std::get<1>(events[i]), std::get<2>(events[i])));
-      totalScatterMs += scatterMs;
-      totalGatherMs += gatherMs;
+      float dispatchMs = 0.0f, combineMs = 0.0f;
+      CUDACHECK(cudaEventElapsedTime(&dispatchMs, std::get<0>(events[i]), std::get<1>(events[i])));
+      CUDACHECK(cudaEventElapsedTime(&combineMs, std::get<1>(events[i]), std::get<2>(events[i])));
+      totalDispatchMs += dispatchMs;
+      totalCombineMs += combineMs;
     }
-    return {totalScatterMs / numSamples, totalGatherMs / numSamples};
+    return {totalDispatchMs / numSamples, totalCombineMs / numSamples};
   };
 
   MPI_Barrier(MPI_COMM_WORLD);
@@ -165,15 +165,15 @@ std::pair<Time, Time> benchmark(
 
   MPI_Barrier(MPI_COMM_WORLD);
   nvtxRangePush("benchmark");
-  std::vector<float> scatterTimeUs, gatherTimeUs;
+  std::vector<float> dispatchTimeUs, combineTimeUs;
   for (int i = 0; i < repeat; i++) {
-    auto [scatterTimeMs, gatherTimeMs] = run();
-    scatterTimeUs.push_back(scatterTimeMs * 1000);
-    gatherTimeUs.push_back(gatherTimeMs * 1000);
+    auto [dispatchTimeMs, combineTimeMs] = run();
+    dispatchTimeUs.push_back(dispatchTimeMs * 1000);
+    combineTimeUs.push_back(combineTimeMs * 1000);
   }
   nvtxRangePop();
 
-  return {average(scatterTimeUs), average(gatherTimeUs)};
+  return {average(dispatchTimeUs), average(combineTimeUs)};
 }
 
 } // namespace
@@ -240,15 +240,16 @@ int main(int argc, char **argv) {
   };
 
   for (const auto &config : configs) {
-    auto [scatter, gather] = benchmark<nv_bfloat16>(10, config, currentPE, numPEs, stream);
+    auto [dispatch, combine] = benchmark<nv_bfloat16>(10, config, currentPE, numPEs, stream);
     if (currentPE == 0) {
-      auto [scatterMean, scatterStddev] = scatter;
-      auto [gatherMean, gatherStddev] = gather;
+      auto [dispatchMean, dispatchStddev] = dispatch;
+      auto [combineMean, combineStddev] = combine;
       std::cout << std::setw(3) << config.numTokens << " " << std::setw(3) << config.numExperts
                 << " " << std::setw(3) << config.expertsPerToken << " " << std::setw(4)
                 << config.hiddenDim << " " << std::fixed << std::setprecision(3)
-                << "Scatter: " << std::setw(10) << scatterMean << "us ± " << scatterStddev << "us "
-                << "Gather: " << std::setw(10) << gatherMean << "us ± " << gatherStddev << "us"
+                << "Dispatch: " << std::setw(10) << dispatchMean << "us ± " << dispatchStddev
+                << "us "
+                << "Combine: " << std::setw(10) << combineMean << "us ± " << combineStddev << "us"
                 << std::endl;
     }
   }
 
@@ -36,42 +36,42 @@ AllToAllInterNode::AllToAllInterNode(
   ROSE_ASSERT(numTokensBuffer != nullptr, "failed to allocate numTokensBuffer");
   cudaMemset(numTokensBuffer, 0, sizeof(uint64_t) * numLocalExperts * numDPGroups);
 
-  numScatterRecvBuffer =
+  numDispatchRecvBuffer =
       (uint64_t *)nvshmem_malloc(sizeof(uint64_t) * numLocalExperts * numDPGroups);
-  ROSE_ASSERT(numScatterRecvBuffer != nullptr, "failed to allocate numScatterRecvBuffer");
-  cudaMemset(numScatterRecvBuffer, 0, sizeof(uint64_t) * numLocalExperts * numDPGroups);
+  ROSE_ASSERT(numDispatchRecvBuffer != nullptr, "failed to allocate numDispatchRecvBuffer");
+  cudaMemset(numDispatchRecvBuffer, 0, sizeof(uint64_t) * numLocalExperts * numDPGroups);
 
-  gatherSignalBuffer = (uint64_t *)nvshmem_malloc(sizeof(uint64_t) * maxNumTokens);
-  ROSE_ASSERT(gatherSignalBuffer != nullptr, "failed to allocate gatherSignalBuffer");
-  cudaMemset(gatherSignalBuffer, 0, sizeof(uint64_t) * maxNumTokens);
+  combineSignalBuffer = (uint64_t *)nvshmem_malloc(sizeof(uint64_t) * maxNumTokens);
+  ROSE_ASSERT(combineSignalBuffer != nullptr, "failed to allocate combineSignalBuffer");
+  cudaMemset(combineSignalBuffer, 0, sizeof(uint64_t) * maxNumTokens);
 
-  gatherSyncBuffer = (uint64_t *)nvshmem_malloc(sizeof(uint64_t) * worldSize);
-  ROSE_ASSERT(gatherSyncBuffer != nullptr, "failed to allocate gatherSyncBuffer");
-  cudaMemset(gatherSyncBuffer, 0, sizeof(uint64_t) * worldSize);
+  combineSyncBuffer = (uint64_t *)nvshmem_malloc(sizeof(uint64_t) * worldSize);
+  ROSE_ASSERT(combineSyncBuffer != nullptr, "failed to allocate combineSyncBuffer");
+  cudaMemset(combineSyncBuffer, 0, sizeof(uint64_t) * worldSize);
 
-  // Buffers for scatter.
+  // Buffers for dispatch.
   const size_t perTokenBytes =
       round_up<size_t>(hiddenDimBytes + hiddenDimScaleBytes + sizeof(uint32_t), 16);
-  xScatterIn = (std::byte *)nvshmem_malloc(maxNumTokens * perTokenBytes);
-  ROSE_ASSERT(xScatterIn != nullptr, "failed to allocate xScatterIn");
-  xScatterOut = (std::byte *)nvshmem_malloc(maxBatchTokens * perTokenBytes);
-  ROSE_ASSERT(xScatterOut != nullptr, "failed to allocate xScatterOut");
+  xDispatchIn = (std::byte *)nvshmem_malloc(maxNumTokens * perTokenBytes);
+  ROSE_ASSERT(xDispatchIn != nullptr, "failed to allocate xDispatchIn");
+  xDispatchOut = (std::byte *)nvshmem_malloc(maxBatchTokens * perTokenBytes);
+  ROSE_ASSERT(xDispatchOut != nullptr, "failed to allocate xDispatchOut");
 
-  // Buffers for gather. The allocations are a bit wider to accommodate all
+  // Buffers for combine. The allocations are a bit wider to accommodate all
   // possible data types (primarily float for testing and bfloat16 for prod).
-  xGatherIn = (std::byte *)nvshmem_malloc(maxBatchTokens * hiddenDim * sizeof(float));
-  ROSE_ASSERT(xGatherIn != nullptr, "failed to allocate xGatherIn");
-  xGatherOut = (std::byte *)nvshmem_malloc(maxNumTokens * numExperts * hiddenDim * sizeof(float));
-  ROSE_ASSERT(xGatherOut != nullptr, "failed to allocate xGatherOut");
+  xCombineIn = (std::byte *)nvshmem_malloc(maxBatchTokens * hiddenDim * sizeof(float));
+  ROSE_ASSERT(xCombineIn != nullptr, "failed to allocate xCombineIn");
+  xCombineOut = (std::byte *)nvshmem_malloc(maxNumTokens * numExperts * hiddenDim * sizeof(float));
+  ROSE_ASSERT(xCombineOut != nullptr, "failed to allocate xCombineOut");
 }
 
 AllToAllInterNode::~AllToAllInterNode() {
   nvshmem_free(numTokensBuffer);
-  nvshmem_free(numScatterRecvBuffer);
-  nvshmem_free(gatherSignalBuffer);
-  nvshmem_free(gatherSyncBuffer);
-  nvshmem_free(xScatterIn);
-  nvshmem_free(xScatterOut);
-  nvshmem_free(xGatherIn);
-  nvshmem_free(xGatherOut);
+  nvshmem_free(numDispatchRecvBuffer);
+  nvshmem_free(combineSignalBuffer);
+  nvshmem_free(combineSyncBuffer);
+  nvshmem_free(xDispatchIn);
+  nvshmem_free(xDispatchOut);
+  nvshmem_free(xCombineIn);
+  nvshmem_free(xCombineOut);
 }
@@ -60,7 +60,7 @@ class AllToAllInterNode final : public AllToAll {
   /// overlapping).
   ///
   /// @param stream The CUDA stream to launch the kernel on.
-  void scatter(
+  void dispatch(
       const Strided1D<int32_t> &outTokensPerExpert,
       const Strided2D<std::byte> &expertX,
       const Strided2D<std::byte> &expertXScale,
@@ -73,7 +73,7 @@ class AllToAllInterNode final : public AllToAll {
       cudaStream_t stream
   );
 
-  /// Launches the all-to-all gather kernel.
+  /// Launches the all-to-all combine kernel.
   ///
   /// @param outTokens The output tokens.
   /// Shape: [numExperts, maxNumTokens].
@@ -97,7 +97,7 @@ class AllToAllInterNode final : public AllToAll {
   ///
   /// @param stream The CUDA stream to launch the kernel on.
   template <typename T>
-  void gather(
+  void combine(
       const Strided1D<nv_bfloat16> &outTokens,
       const Strided2D<uint32_t> &indices,
       const Strided2D<float> &weights,
@@ -111,13 +111,13 @@ class AllToAllInterNode final : public AllToAll {
 private:
   /// @section Pre-allocated symmetric shared memory workspace.
   uint64_t *numTokensBuffer = nullptr;
-  uint64_t *numScatterRecvBuffer = nullptr;
-  uint64_t *gatherSignalBuffer = nullptr;
-  uint64_t *gatherSyncBuffer = nullptr;
-  std::byte *xScatterIn = nullptr;
-  std::byte *xScatterOut = nullptr;
-  std::byte *xGatherIn = nullptr;
-  std::byte *xGatherOut = nullptr;
+  uint64_t *numDispatchRecvBuffer = nullptr;
+  uint64_t *combineSignalBuffer = nullptr;
+  uint64_t *combineSyncBuffer = nullptr;
+  std::byte *xDispatchIn = nullptr;
+  std::byte *xDispatchOut = nullptr;
+  std::byte *xCombineIn = nullptr;
+  std::byte *xCombineOut = nullptr;
 };
 
 } // namespace pplx
@@ -9,7 +9,7 @@
 using namespace pplx;
 
 template <typename T, size_t NUM_WARPS, bool DO_SEND, bool DO_RECV>
-__global__ __launch_bounds__(NUM_WARPS * 32, 1) void gatherKernel(
+__global__ __launch_bounds__(NUM_WARPS * 32, 1) void combineKernel(
     nv_bfloat16 *outTokens,
     size_t outTokensStrideElem,
     uint32_t *indices,
@@ -34,8 +34,8 @@ __global__ __launch_bounds__(NUM_WARPS * 32, 1) void gatherKernel(
     const uint32_t *sourceIndex,
     const uint32_t *sourceOffset,
     const uint32_t *sourceGroup,
-    uint64_t *gatherSignalBuffer,
-    uint64_t *gatherSyncBuffer,
+    uint64_t *combineSignalBuffer,
+    uint64_t *combineSyncBuffer,
     std::byte *xBufferIn,
     std::byte *xBufferOut
 ) {
@@ -50,7 +50,7 @@ __global__ __launch_bounds__(NUM_WARPS * 32, 1) void gatherKernel(
   if (DO_SEND) {
     for (unsigned i = blockIdx.x * numWarps + warpId; i < worldSize; i += gridDim.x * numWarps) {
       if (laneId == 0) {
-        nvshmemx_signal_op(&gatherSyncBuffer[rank], 1, NVSHMEM_SIGNAL_SET, i);
+        nvshmemx_signal_op(&combineSyncBuffer[rank], 1, NVSHMEM_SIGNAL_SET, i);
       }
     }
 
@@ -84,7 +84,7 @@ __global__ __launch_bounds__(NUM_WARPS * 32, 1) void gatherKernel(
         const unsigned index = dstExpert * maxNumTokens + source;
         std::byte *dstPtr = xBufferOut + index * stride;
         nvshmemx_putmem_signal_nbi_warp(
-            dstPtr, xTokenPtr, stride, &gatherSignalBuffer[source], 1, NVSHMEM_SIGNAL_ADD, dstRank
+            dstPtr, xTokenPtr, stride, &combineSignalBuffer[source], 1, NVSHMEM_SIGNAL_ADD, dstRank
         );
       }
     }
@@ -100,9 +100,9 @@ __global__ __launch_bounds__(NUM_WARPS * 32, 1) void gatherKernel(
     // Compute the weighted sum of the input tokens.
     const size_t localNumTokens = boundM ? __ldg(boundM) : m;
     for (unsigned i = blockIdx.x; i < localNumTokens; i += gridDim.x) {
-      nvshmem_uint64_wait_until(&gatherSignalBuffer[i], NVSHMEM_CMP_EQ, expertsPerToken);
+      nvshmem_uint64_wait_until(&combineSignalBuffer[i], NVSHMEM_CMP_EQ, expertsPerToken);
       __syncthreads();
-      gatherSignalBuffer[i] = 0;
+      combineSignalBuffer[i] = 0;
 
       nv_bfloat16 *dstPtr = outTokens + i * outTokensStrideElem;
       constexpr unsigned VEC_SIZE = 8;
@@ -134,14 +134,14 @@ __global__ __launch_bounds__(NUM_WARPS * 32, 1) void gatherKernel(
 
     for (unsigned i = blockIdx.x * blockDim.x + threadIdx.x; i < worldSize;
          i += gridDim.x * blockDim.x) {
-      nvshmem_uint64_wait_until(&gatherSyncBuffer[i], NVSHMEM_CMP_EQ, 1);
-      gatherSyncBuffer[i] = 0;
+      nvshmem_uint64_wait_until(&combineSyncBuffer[i], NVSHMEM_CMP_EQ, 1);
+      combineSyncBuffer[i] = 0;
     }
   }
 }
 
 template <typename T>
-void AllToAllInterNode::gather(
+void AllToAllInterNode::combine(
     const Strided1D<nv_bfloat16> &outTokens,
     const Strided2D<uint32_t> &indices,
     const Strided2D<float> &weights,
@@ -189,26 +189,26 @@ void AllToAllInterNode::gather(
       &sourceIndex,
       &sourceOffset,
       &sourceGroup,
-      &gatherSignalBuffer,
-      &gatherSyncBuffer,
-      &xGatherIn,
-      &xGatherOut};
+      &combineSignalBuffer,
+      &combineSyncBuffer,
+      &xCombineIn,
+      &xCombineOut};
 
-  nvtxRangePush("gather");
+  nvtxRangePush("combine");
   switch (splitMode) {
   case SplitMode::SEND:
     CUDACHECK(cudaLaunchCooperativeKernel(
-        &gatherKernel<T, NUM_WARPS, true, false>, dimGrid, dimBlock, args, 0, stream
+        &combineKernel<T, NUM_WARPS, true, false>, dimGrid, dimBlock, args, 0, stream
     ));
     break;
   case SplitMode::RECV:
     CUDACHECK(cudaLaunchCooperativeKernel(
-        &gatherKernel<T, NUM_WARPS, false, true>, dimGrid, dimBlock, args, 0, stream
+        &combineKernel<T, NUM_WARPS, false, true>, dimGrid, dimBlock, args, 0, stream
     ));
     break;
   case SplitMode::NONE:
     CUDACHECK(cudaLaunchCooperativeKernel(
-        &gatherKernel<T, NUM_WARPS, true, true>, dimGrid, dimBlock, args, 0, stream
+        &combineKernel<T, NUM_WARPS, true, true>, dimGrid, dimBlock, args, 0, stream
     ));
     break;
   default:
@@ -217,8 +217,8 @@ void AllToAllInterNode::gather(
   nvtxRangePop();
 }
 
-#define INSTANTIATE_GATHER(T)                                                                      \
-  template void AllToAllInterNode::gather<T>(                                                      \
+#define INSTANTIATE_COMBINE(T)                                                                     \
+  template void AllToAllInterNode::combine<T>(                                                     \
       const Strided1D<nv_bfloat16> &outTokens,                                                     \
       const Strided2D<uint32_t> &indices,                                                          \
       const Strided2D<float> &weights,                                                             \
@@ -229,6 +229,6 @@ void AllToAllInterNode::gather(
       cudaStream_t stream                                                                          \
   );
 
-INSTANTIATE_GATHER(float)
-INSTANTIATE_GATHER(half)
-INSTANTIATE_GATHER(nv_bfloat16)
+INSTANTIATE_COMBINE(float)
+INSTANTIATE_COMBINE(half)
+INSTANTIATE_COMBINE(nv_bfloat16)