Commit ddc6916

Release updates to pplx_kernels (#11)
* Intra-node dispatch/combine with NVLink
* Performance improvements to inter-node dispatch/combine
* FP16 changes were pulled into the internal repo, then re-released here
1 parent be55fd7 commit ddc6916

31 files changed: +1981, -520 lines
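The commit splits the all-to-all path into an NVLink-only intra-node implementation and the existing NVSHMEM inter-node implementation (see the CMake and source diffs below). As a rough orientation, here is a minimal C++ sketch of that kind of split; the derived-class names and the chooseAllToAll() helper are illustrative assumptions, not code from this commit:

```cpp
#include <cstddef>
#include <memory>

namespace sketch {

// Shared state, mirroring a few of the fields kept in pplx::AllToAll.
class AllToAll {
public:
  AllToAll(unsigned rank, unsigned worldSize) : rank(rank), worldSize(worldSize) {}
  virtual ~AllToAll() = default;

protected:
  const unsigned rank;
  const unsigned worldSize;
};

// Intra-node variant: peers reached over NVLink (intranode_*.cu in this commit).
class AllToAllIntraNode final : public AllToAll {
public:
  using AllToAll::AllToAll;
};

// Inter-node variant: peers reached through NVSHMEM (internode_*.cu).
class AllToAllInterNode final : public AllToAll {
public:
  using AllToAll::AllToAll;
};

// Hypothetical selection helper: take the NVLink-only path when every rank
// lives on the same node, otherwise fall back to the inter-node path.
inline std::unique_ptr<AllToAll>
chooseAllToAll(unsigned rank, unsigned worldSize, unsigned ranksPerNode) {
  if (worldSize <= ranksPerNode) {
    return std::make_unique<AllToAllIntraNode>(rank, worldSize);
  }
  return std::make_unique<AllToAllInterNode>(rank, worldSize);
}

} // namespace sketch
```

The real entry points are in the intranode.cpp/intranode_*.cu and internode.cpp/internode_*.cu files added and reorganized in the diffs below.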

README.md

Lines changed: 3 additions & 3 deletions
@@ -90,7 +90,7 @@ cd pplx-kernels
 mkdir build-cmake
 cd build-cmake

-TORCH_PREFIX_PATH=$(python3 -c 'import torch; print(torch.utils.cmake_prefix_path)')
+export TORCH_PREFIX_PATH=$(python3 -c 'import torch; print(torch.utils.cmake_prefix_path)')

 cmake ../csrc \
   -GNinja \
@@ -105,12 +105,12 @@ ninja test_all_to_all bench_all_to_all
 To run the all-to-all tests on one node:

 ```bash
-NVSHMEM_REMOTE_TRANSPORT=none mpirun -np 4 ./test_all_to_all
+NVSHMEM_REMOTE_TRANSPORT=none mpirun -np 4 ./all_to_all/test_all_to_all
 ```


 To run the all-to-all benchmarks on one node:

 ```bash
-NVSHMEM_REMOTE_TRANSPORT=none mpirun -np 4 ./bench_all_to_all
+NVSHMEM_REMOTE_TRANSPORT=none mpirun -np 4 ./all_to_all/bench_all_to_all
 ```

csrc/CMakeLists.txt

Lines changed: 5 additions & 3 deletions
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.26)
+cmake_minimum_required(VERSION 3.22)
 project(PPLXKernels
   VERSION 0.0.1
   DESCRIPTION "PPLX Kernels"
@@ -52,6 +52,7 @@ endfunction()

 # === Library targets ===
 add_subdirectory(all_to_all)
+add_subdirectory(core)

 # Main shared library
 add_library(pplx_kernels SHARED
@@ -60,13 +61,14 @@ add_library(pplx_kernels SHARED
   bindings/nvshmem.cpp
 )
 target_link_libraries(pplx_kernels PUBLIC
-  all_to_all_lib
+  all_to_all_internode_lib
+  all_to_all_intranode_lib
+  core_lib
   torch::py_limited
   Python::Module
   CUDA::cuda_driver
   CUDA::cudart
   nvshmem::nvshmem_host
-  nvshmem::nvshmem_device
 )
 set_target_properties(pplx_kernels PROPERTIES
   LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/../src/pplx_kernels

csrc/all_to_all/CMakeLists.txt

Lines changed: 29 additions & 12 deletions
@@ -1,30 +1,46 @@
 # All-to-All library

-add_library(all_to_all_lib STATIC
+add_library(all_to_all_common STATIC
   all_to_all.cpp
-  internode_dispatch.cu
+)
+
+add_library(all_to_all_intranode_lib STATIC
+  intranode_combine.cu
+  intranode_dispatch.cu
+  intranode.cpp
+)
+target_link_libraries(all_to_all_intranode_lib PUBLIC
+  all_to_all_common
+  nvshmem::nvshmem
+  CUDA::cudart
+)
+set_cuda_compile_options(all_to_all_intranode_lib)
+
+add_library(all_to_all_internode_lib STATIC
   internode_combine.cu
+  internode_dispatch.cu
   internode.cpp
 )
-target_link_libraries(all_to_all_lib PUBLIC
-  nvshmem::nvshmem_host
-  nvshmem::nvshmem_device
+target_link_libraries(all_to_all_internode_lib PUBLIC
+  all_to_all_common
+  nvshmem::nvshmem
   CUDA::cudart
 )
-set_cuda_compile_options(all_to_all_lib)
+set_cuda_compile_options(all_to_all_internode_lib)

 if(WITH_TESTS)
   # All-to-All test
   add_executable(test_all_to_all
     test_all_to_all.cpp
   )
   target_link_libraries(test_all_to_all PUBLIC
-    all_to_all_lib
+    all_to_all_intranode_lib
+    all_to_all_internode_lib
+    core_lib
     CUDA::cudart
     CUDA::cuda_driver
     MPI::MPI_CXX
-    nvshmem::nvshmem_host
-    nvshmem::nvshmem_device
+    nvshmem::nvshmem
   )
   set_cuda_compile_options(test_all_to_all)
   add_test(NAME AllToAllTest
@@ -37,11 +53,12 @@ if (WITH_BENCHMARKS)
     bench_all_to_all.cpp
   )
   target_link_libraries(bench_all_to_all PUBLIC
-    all_to_all_lib
+    all_to_all_intranode_lib
+    all_to_all_internode_lib
+    core_lib
     CUDA::cudart
     CUDA::cuda_driver
     MPI::MPI_CXX
-    nvshmem::nvshmem_host
-    nvshmem::nvshmem_device
+    nvshmem::nvshmem
   )
 endif()

csrc/all_to_all/all_to_all.cpp

Lines changed: 2 additions & 29 deletions
@@ -1,21 +1,9 @@
 #include "all_to_all.h"

-#include "core/cuda_utils.h"
 #include "core/utils.h"

-#include <cuda_runtime.h>
-
 using namespace pplx;

-namespace {
-template <typename T> T *mallocZeroBuffer(size_t size) {
-  T *ptr;
-  CUDACHECK(cudaMalloc(&ptr, size * sizeof(T)));
-  cudaMemset(ptr, 0, size * sizeof(T));
-  return ptr;
-}
-} // namespace
-
 AllToAll::AllToAll(
     size_t maxNumTokens,
     size_t numExperts,
@@ -37,31 +25,16 @@ AllToAll::AllToAll(
       hiddenDimScaleBytes(hiddenDimScaleBytes),
       rank(rank),
       worldSize(worldSize),
-      dpSize(dpSize),
-      maxBatchTokens(numLocalExperts * numDPGroups * maxNumTokens) {
+      dpSize(dpSize) {

   ROSE_ASSERT(hiddenDimBytes % 16 == 0, "invalid hidden dim bytes");
   ROSE_ASSERT(hiddenDimScaleBytes % 16 == 0, "invalid hidden dim scale bytes");
   const size_t perTokenBytes =
       round_up<size_t>(hiddenDimBytes + hiddenDimScaleBytes + sizeof(uint32_t), 16);
-  const size_t maxBatchTokens = numLocalExperts * numDPGroups * maxNumTokens;

   ROSE_ASSERT(numLocalExperts != 0, "numLocalExperts is 0");
   ROSE_ASSERT(numDPGroups > 1, "at least 2 DP groups are required");
   ROSE_ASSERT(hiddenDimScaleBytes <= hiddenDimBytes, "invalid hidden dim bytes");
-
-  // Buffers for token tracking.
-  numTokensPerDP = mallocZeroBuffer<uint32_t>(numLocalExperts * numDPGroups);
-  sourceIndex = mallocZeroBuffer<uint32_t>(maxBatchTokens);
-  sourceExpert = mallocZeroBuffer<uint32_t>(maxBatchTokens);
-  sourceOffset = mallocZeroBuffer<uint32_t>(maxBatchTokens);
-  sourceGroup = mallocZeroBuffer<uint32_t>(maxBatchTokens);
 }

-AllToAll::~AllToAll() {
-  CUDACHECK(cudaFree(numTokensPerDP));
-  CUDACHECK(cudaFree(sourceIndex));
-  CUDACHECK(cudaFree(sourceExpert));
-  CUDACHECK(cudaFree(sourceOffset));
-  CUDACHECK(cudaFree(sourceGroup));
-}
+AllToAll::~AllToAll() {}

csrc/all_to_all/all_to_all.h

Lines changed: 0 additions & 9 deletions
@@ -63,15 +63,6 @@ class AllToAll {
   const unsigned worldSize;
   /// The size of a DP group.
   const unsigned dpSize;
-  /// The maximum number of tokens in a batch.
-  const size_t maxBatchTokens;
-
-  /// @section Internal buffers communicating between dispatch and combine.
-  uint32_t *numTokensPerDP = nullptr;
-  uint32_t *sourceIndex = nullptr;
-  uint32_t *sourceExpert = nullptr;
-  uint32_t *sourceOffset = nullptr;
-  uint32_t *sourceGroup = nullptr;
 };

 } // namespace pplx
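
For context on the all_to_all.cpp and all_to_all.h deletions above: the base class previously cudaMalloc'ed and zeroed five bookkeeping buffers in its constructor and freed them in its destructor; this commit removes that ownership from the shared base, presumably leaving the intra-node and inter-node implementations to manage their own buffers. Below is a minimal sketch of the removed pattern as an RAII wrapper; DeviceBuffer is an illustrative name, not a type from this repository:

```cpp
#include <cstddef>
#include <cstdint>
#include <cuda_runtime.h>

// Owns a zero-initialized device allocation and frees it on destruction.
// The removed code additionally wrapped the CUDA calls in CUDACHECK().
template <typename T> class DeviceBuffer {
public:
  explicit DeviceBuffer(size_t count) : count_(count) {
    // Same allocate-then-zero sequence as the removed mallocZeroBuffer<T>().
    cudaMalloc(&ptr_, count * sizeof(T));
    cudaMemset(ptr_, 0, count * sizeof(T));
  }
  ~DeviceBuffer() { cudaFree(ptr_); }

  DeviceBuffer(const DeviceBuffer &) = delete;
  DeviceBuffer &operator=(const DeviceBuffer &) = delete;

  T *get() const { return ptr_; }
  size_t size() const { return count_; }

private:
  T *ptr_ = nullptr;
  size_t count_ = 0;
};

// Usage, sized like the removed buffers
// (maxBatchTokens = numLocalExperts * numDPGroups * maxNumTokens):
//   DeviceBuffer<uint32_t> sourceIndex(maxBatchTokens);
```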
