
Commit 88bd737: Initial commit (0 parents)


42 files changed (+4218, -0 lines)

.clang-format

Lines changed: 11 additions & 0 deletions
```
---
BasedOnStyle: LLVM
IndentWidth: 2
ColumnLimit: 100
BinPackArguments: false
BinPackParameters: false
ExperimentalAutoDetectBinPacking: false
AllowAllParametersOfDeclarationOnNextLine: false
AlignAfterOpenBracket: BlockIndent
BreakConstructorInitializers: BeforeColon
PackConstructorInitializers: Never
```

.gitignore

Lines changed: 6 additions & 0 deletions
```
build-cmake
build
pplx_kernels/*.so
*.egg-info
*.pyc
data
```

LICENSE

Lines changed: 7 additions & 0 deletions
Copyright (C) 2025 Perplexity AI

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

README.md

Lines changed: 57 additions & 0 deletions
Perplexity MoE Kernels
==========

Installation
-----

```
cd pplx-kernels
pip install -e . -vvv
```

Testing
-----

To build the C++ tests and benchmarks:

```
cd pplx-kernels
mkdir build-cmake
cd build-cmake

TORCH_PREFIX_PATH=$(python3 -c 'import torch; print(torch.utils.cmake_prefix_path)')

cmake ../csrc \
  -GNinja \
  -DCMAKE_PREFIX_PATH=$TORCH_PREFIX_PATH \
  -DTORCH_CUDA_ARCH_LIST=9.0a+PTX \
  -DWITH_TESTS=ON \
  -DWITH_BENCHMARKS=ON

ninja test_all_to_all bench_all_to_all
```

To run the all-to-all tests on one node:

```
NVSHMEM_REMOTE_TRANSPORT=None mpirun -np 4 ./test_all_to_all
```

To run the all-to-all benchmarks on one node:

```
NVSHMEM_REMOTE_TRANSPORT=None mpirun -np 4 ./bench_all_to_all
```

Inter-Node Benchmarks
-----

To test on a 32-device cluster spread across 4 nodes (WORLD_SIZE=32 is 4 nodes with WORLD_LOCAL_SIZE=8 GPUs each), run the following command on every node, setting NODE_RANK to a distinct value from 0 to 3 on each node and pointing MASTER_ADDR at one of the nodes:

```
cd pplx-kernels
pip install -e . -vvv
NVSHMEM_IB_ENABLE_IBGDA=1 NODE_RANK=<rank> WORLD_SIZE=32 WORLD_LOCAL_SIZE=8 MASTER_ADDR=<master-address> MASTER_PORT=29500 python3 -m tests.bench_all_to_all
```

csrc/CMakeLists.txt

Lines changed: 73 additions & 0 deletions
```
cmake_minimum_required(VERSION 3.26)
project(PPLXKernels
  VERSION 0.0.1
  DESCRIPTION "PPLX Kernels"
  LANGUAGES CXX CUDA)

# === Configuration options ===
option(WITH_TESTS "Build tests" OFF)
option(WITH_BENCHMARKS "Build benchmarks" OFF)
set(CMAKE_CUDA_ARCHITECTURES 90a CACHE STRING "CUDA architecture to target")

# === CMake configuration ===
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_INCLUDE_CURRENT_DIR ON)

# === Dependencies ===
include(FetchContent)
find_package(CUDAToolkit REQUIRED) # Modern replacement for find_package(CUDA)
find_package(Python COMPONENTS Interpreter Development.Module REQUIRED)
find_package(Torch REQUIRED)
find_package(NVSHMEM REQUIRED)

if(WITH_TESTS)
  enable_testing()
  find_package(MPI REQUIRED)
  find_package(PkgConfig REQUIRED)
  pkg_check_modules(NCCL nccl)
endif()

# Create imported target for PyTorch
add_library(torch_imported INTERFACE)
add_library(torch::py_limited ALIAS torch_imported)
target_include_directories(torch_imported SYSTEM INTERFACE ${TORCH_INCLUDE_DIRS})
# NOTE(lequn): We don't link against all ${TORCH_LIBRARIES} because we use py_limited_api.
# See: https://github.com/pytorch/pytorch/blob/9017becf1d895999a1c819c9d35b8139c090e7a9/torch/utils/cpp_extension.py#L1256-L1270
target_link_libraries(torch_imported INTERFACE c10 torch torch_cpu c10_cuda torch_cuda CUDA::cudart)

# === Compiler and linker flags ===
add_compile_options(-Wno-deprecated-declarations)
add_compile_definitions(_GLIBCXX_USE_CXX11_ABI=1)
add_compile_definitions(Py_LIMITED_API=0x03090000)
include_directories(${CMAKE_CURRENT_SOURCE_DIR})

# CUDA-specific compile options function
function(set_cuda_compile_options target)
  target_compile_options(${target} PRIVATE
    $<$<COMPILE_LANGUAGE:CUDA>:--threads=32 -O3>)
endfunction()

# === Library targets ===
add_subdirectory(all_to_all)

# Main shared library
add_library(pplx_kernels SHARED
  bindings/all_to_all_ops.cpp
  bindings/bindings.cpp
  bindings/nvshmem.cpp
)
target_link_libraries(pplx_kernels PUBLIC
  all_to_all_lib
  torch::py_limited
  Python::Module
  CUDA::cuda_driver
  CUDA::cudart
  nvshmem::nvshmem
)
set_target_properties(pplx_kernels PROPERTIES
  LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/../pplx_kernels
  CUDA_SEPARABLE_COMPILATION ON
)
```
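
The bindings/*.cpp sources listed above are not part of this excerpt. As a hedged illustration of what the build flags imply: because the extension defines Py_LIMITED_API and deliberately avoids linking all of ${TORCH_LIBRARIES}, operator registration would plausibly go through PyTorch's TORCH_LIBRARY mechanism rather than pybind11 (which does not support the limited API). A minimal sketch with a hypothetical operator name, not the repository's actual bindings:

```
// Hypothetical sketch only; the real bindings/all_to_all_ops.cpp is not
// shown in this diff. TORCH_LIBRARY registers an operator schema in the
// "pplx_kernels" namespace without requiring the full Python C API.
#include <torch/library.h>

TORCH_LIBRARY(pplx_kernels, m) {
  // Placeholder schema for illustration; the actual operator names and
  // signatures live in the bindings sources.
  m.def("all_to_all_dispatch(Tensor x) -> Tensor");
}
```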

csrc/all_to_all/CMakeLists.txt

Lines changed: 44 additions & 0 deletions
```
# All-to-All library

add_library(all_to_all_lib STATIC
  all_to_all.cpp
  internode_scatter.cu
  internode_gather.cu
  internode.cpp
)
target_link_libraries(all_to_all_lib PUBLIC
  nvshmem::nvshmem
  CUDA::cudart
)
set_cuda_compile_options(all_to_all_lib)

if(WITH_TESTS)
  # All-to-All test
  add_executable(test_all_to_all
    test_all_to_all.cpp
  )
  target_link_libraries(test_all_to_all PUBLIC
    all_to_all_lib
    CUDA::cudart
    CUDA::cuda_driver
    MPI::MPI_CXX
    nvshmem::nvshmem
  )
  set_cuda_compile_options(test_all_to_all)
  add_test(NAME AllToAllTest
    COMMAND ${MPIEXEC_EXECUTABLE} -np 4 $<TARGET_FILE:test_all_to_all>)
  set_tests_properties(AllToAllTest PROPERTIES ENVIRONMENT "NVSHMEM_REMOTE_TRANSPORT=None")
endif()

if (WITH_BENCHMARKS)
  add_executable(bench_all_to_all
    bench_all_to_all.cpp
  )
  target_link_libraries(bench_all_to_all PUBLIC
    all_to_all_lib
    CUDA::cudart
    CUDA::cuda_driver
    MPI::MPI_CXX
    nvshmem::nvshmem
  )
endif()
```

csrc/all_to_all/all_to_all.cpp

Lines changed: 67 additions & 0 deletions
```
#include "all_to_all.h"

#include "core/cuda_utils.h"
#include "core/utils.h"

#include <cuda_runtime.h>

using namespace pplx;

namespace {
template <typename T> T *mallocZeroBuffer(size_t size) {
  T *ptr;
  CUDACHECK(cudaMalloc(&ptr, size * sizeof(T)));
  cudaMemset(ptr, 0, size * sizeof(T));
  return ptr;
}
} // namespace

AllToAll::AllToAll(
    size_t maxNumTokens,
    size_t numExperts,
    size_t expertsPerToken,
    unsigned rank,
    unsigned worldSize,
    unsigned dpSize,
    size_t hiddenDim,
    size_t hiddenDimBytes,
    size_t hiddenDimScaleBytes
)
    : maxNumTokens(maxNumTokens),
      numExperts(numExperts),
      numLocalExperts(ceil_div<uint32_t>(numExperts, worldSize)),
      numDPGroups(ceil_div<uint32_t>(worldSize, dpSize)),
      expertsPerToken(expertsPerToken),
      hiddenDim(hiddenDim),
      hiddenDimBytes(hiddenDimBytes),
      hiddenDimScaleBytes(hiddenDimScaleBytes),
      rank(rank),
      worldSize(worldSize),
      dpSize(dpSize),
      maxBatchTokens(numLocalExperts * numDPGroups * maxNumTokens) {

  ROSE_ASSERT(hiddenDimBytes % 16 == 0, "invalid hidden dim bytes");
  ROSE_ASSERT(hiddenDimScaleBytes % 16 == 0, "invalid hidden dim scale bytes");
  const size_t perTokenBytes =
      round_up<size_t>(hiddenDimBytes + hiddenDimScaleBytes + sizeof(uint32_t), 16);
  const size_t maxBatchTokens = numLocalExperts * numDPGroups * maxNumTokens;

  ROSE_ASSERT(numLocalExperts != 0, "numLocalExperts is 0");
  ROSE_ASSERT(numDPGroups > 1, "at least 2 DP groups are required");
  ROSE_ASSERT(hiddenDimScaleBytes <= hiddenDimBytes, "invalid hidden dim bytes");

  // Buffers for token tracking.
  numTokensPerDP = mallocZeroBuffer<uint32_t>(numLocalExperts * numDPGroups);
  sourceIndex = mallocZeroBuffer<uint32_t>(maxBatchTokens);
  sourceExpert = mallocZeroBuffer<uint32_t>(maxBatchTokens);
  sourceOffset = mallocZeroBuffer<uint32_t>(maxBatchTokens);
  sourceGroup = mallocZeroBuffer<uint32_t>(maxBatchTokens);
}

AllToAll::~AllToAll() {
  CUDACHECK(cudaFree(numTokensPerDP));
  CUDACHECK(cudaFree(sourceIndex));
  CUDACHECK(cudaFree(sourceExpert));
  CUDACHECK(cudaFree(sourceOffset));
  CUDACHECK(cudaFree(sourceGroup));
}
```
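
CUDACHECK, ceil_div, and round_up come from core/cuda_utils.h and core/utils.h, which are not included in this commit excerpt. A minimal sketch of the conventional definitions these call sites imply; the repository's actual implementations may differ:

```
// Assumed helpers, reconstructed from usage above; not the repo's code.
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Abort on any CUDA error, reporting the failing call site.
#define CUDACHECK(cmd)                                                         \
  do {                                                                         \
    cudaError_t err = (cmd);                                                   \
    if (err != cudaSuccess) {                                                  \
      fprintf(stderr, "CUDA error %s at %s:%d\n", cudaGetErrorString(err),     \
              __FILE__, __LINE__);                                             \
      exit(EXIT_FAILURE);                                                      \
    }                                                                          \
  } while (0)

// Integer division rounding up: ceil_div(7, 2) == 4.
template <typename T> constexpr T ceil_div(T a, T b) { return (a + b - 1) / b; }

// Round x up to the next multiple of n: round_up(37, 16) == 48.
template <typename T> constexpr T round_up(T x, T n) { return ceil_div(x, n) * n; }
```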

csrc/all_to_all/all_to_all.h

Lines changed: 77 additions & 0 deletions
```
#pragma once

#include <cstdint>
#include <cstdlib>

namespace pplx {

/// Specifies which part of a send-and-recv kernel to launch.
enum class SplitMode {
  NONE,
  SEND,
  RECV,
};

/// Base class for all-to-all broadcast kernels.
class AllToAll {
public:
  /// @brief Initializes the all-to-all broadcast kernel.
  ///
  /// @param maxNumTokens The maximum number of tokens per DP group.
  /// @param numExperts The total number of experts spread across all ranks.
  /// @param expertsPerToken The number of experts per token.
  /// @param rank The rank of the current process.
  /// @param worldSize The number of processes in the world.
  /// @param dpSize The size of a DP group.
  /// @param hiddenDim The hidden dimension of X, in elements.
  /// @param hiddenDimBytes The hidden dimension of X, in bytes.
  /// @param hiddenDimScaleBytes The hidden dimension of the scale of X, in
  /// bytes.
  AllToAll(
      size_t maxNumTokens,
      size_t numExperts,
      size_t expertsPerToken,
      unsigned rank,
      unsigned worldSize,
      unsigned dpSize,
      size_t hiddenDim,
      size_t hiddenDimBytes,
      size_t hiddenDimScaleBytes
  );

  virtual ~AllToAll();

protected:
  /// The maximum number of tokens per DP group.
  const size_t maxNumTokens;
  /// The total number of experts spread across all ranks.
  const size_t numExperts;
  /// The number of local experts.
  const size_t numLocalExperts;
  /// The number of DP groups.
  const size_t numDPGroups;
  /// The number of experts per token.
  const size_t expertsPerToken;
  /// The hidden dimension of X, in elements.
  const size_t hiddenDim;
  /// The hidden dimension of X, in bytes.
  const size_t hiddenDimBytes;
  /// The hidden dimension scale of X, in bytes.
  const size_t hiddenDimScaleBytes;
  /// The rank of the current process.
  const unsigned rank;
  /// The number of processes in the world.
  const unsigned worldSize;
  /// The size of a DP group.
  const unsigned dpSize;
  /// The maximum number of tokens in a batch.
  const size_t maxBatchTokens;

  /// @section Internal buffers communicating between scatter and gather.
  uint32_t *numTokensPerDP = nullptr;
  uint32_t *sourceIndex = nullptr;
  uint32_t *sourceExpert = nullptr;
  uint32_t *sourceOffset = nullptr;
  uint32_t *sourceGroup = nullptr;
};

} // namespace pplx
```
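
To make the derived members concrete, here is a small, self-contained example that reproduces the constructor's sharding arithmetic for one illustrative configuration (the numbers are made up for illustration, not taken from the source; ceil_div is as sketched earlier):

```
#include <cstddef>
#include <cstdio>

template <typename T> constexpr T ceil_div(T a, T b) { return (a + b - 1) / b; }

int main() {
  // Illustrative configuration: 256 experts sharded over 32 ranks,
  // 8-way DP groups, up to 128 tokens per DP group.
  const size_t numExperts = 256, maxNumTokens = 128;
  const unsigned worldSize = 32, dpSize = 8;

  const size_t numLocalExperts = ceil_div<size_t>(numExperts, worldSize);     // 8
  const size_t numDPGroups = ceil_div<size_t>(worldSize, dpSize);             // 4
  const size_t maxBatchTokens = numLocalExperts * numDPGroups * maxNumTokens; // 4096

  printf("local experts: %zu, DP groups: %zu, max batch tokens: %zu\n",
         numLocalExperts, numDPGroups, maxBatchTokens);
  return 0;
}
```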
