NVSHMEM4Py Integration #33
Status: Open

Willy-Chan wants to merge 2 commits into perplexityai:master from Willy-Chan:nvshmem4py_bindings_integration.
+210 −256
Changes from 1 commit
Torch extension registration (C++):

@@ -1,14 +1,10 @@
 #include <torch/library.h>
 
 #include "bindings/all_to_all_ops.h"
-#include "bindings/nvshmem.h"
 #include "core/registration.h"
 
 using namespace pplx;
 
-TORCH_LIBRARY(pplx_kernels, m) {
-  register_nvshmem_ops(m);
-  register_all_to_all_ops(m);
-}
+TORCH_LIBRARY(pplx_kernels, m) { register_all_to_all_ops(m); }
 
 REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
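With register_nvshmem_ops gone, the torch extension registers only the all-to-all kernels; NVSHMEM setup, queries, and teardown move to nvshmem4py on the Python side. A minimal sketch of the new call path, assuming nvshmem4py's documented host API (my_pe/n_pes are not shown in this diff):

# Sketch, not part of this PR: host-side NVSHMEM queries now come from
# nvshmem4py instead of the deleted torch.ops.pplx_kernels.nvshmem_* ops.
import nvshmem.core as nvshmem

# Valid only after NVSHMEM is initialized (see nvshmem_init later in this diff):
me = nvshmem.my_pe()    # this PE's (processing element's) index
npes = nvshmem.n_pes()  # total number of PEs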
This file was deleted.
This file was deleted.
Package __init__.py (Python re-exports):

@@ -1,16 +1,6 @@
 from . import ops as ops
-from .all_to_all import (
-    AllToAll as AllToAll,
-)
+from .all_to_all import AllToAll as AllToAll
 from .nvshmem import (
-    nvshmem_alloc_empty_unique_id as nvshmem_alloc_empty_unique_id,
-    nvshmem_alltoall as nvshmem_alltoall,
-    nvshmem_barrier_all as nvshmem_barrier_all,
-    nvshmem_barrier_all_on_current_stream as nvshmem_barrier_all_on_current_stream,
-    nvshmem_finalize as nvshmem_finalize,
-    nvshmem_get_unique_id as nvshmem_get_unique_id,
+    PyTorchStreamWrapper as PyTorchStreamWrapper,
     nvshmem_init as nvshmem_init,
-    nvshmem_my_pe as nvshmem_my_pe,
-    nvshmem_n_pes as nvshmem_n_pes,
-    nvshmem_unique_id_size as nvshmem_unique_id_size,
 )
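The package surface shrinks to AllToAll, nvshmem_init, and PyTorchStreamWrapper. A hedged import sketch; the top-level package name pplx_kernels is assumed, since the diff shows only relative imports:

# Assumed top-level package name; the exported names are taken from the diff.
from pplx_kernels import AllToAll, PyTorchStreamWrapper, nvshmem_init

# Helpers dropped from the package (my_pe, n_pes, barriers, finalize, ...)
# are now expected to come from nvshmem4py directly:
import nvshmem.core as nvshmem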
@@ -1,60 +1,45 @@ | ||
# pyright: reportCallIssue=false | ||
|
||
from collections.abc import Sequence | ||
from typing import Any, Optional | ||
|
||
import torch | ||
import nvshmem.core as nvshmem # type: ignore[import] | ||
import torch.distributed as dist | ||
|
||
from .ops import _ops | ||
|
||
###### NVSHMEM ###### | ||
|
||
|
||
def nvshmem_get_unique_id() -> torch.Tensor: | ||
return _ops.nvshmem_get_unique_id() | ||
|
||
|
||
def nvshmem_unique_id_size() -> int: | ||
return _ops.nvshmem_unique_id_size() | ||
|
||
|
||
def nvshmem_alloc_empty_unique_id() -> torch.Tensor: | ||
return torch.zeros(nvshmem_unique_id_size(), dtype=torch.uint8, device="cpu") | ||
|
||
|
||
def nvshmem_init(uid: torch.Tensor, rank: int, world_size: int) -> int: | ||
status = _ops.nvshmem_init(uid, rank, world_size) | ||
torch.cuda.synchronize() | ||
return status | ||
|
||
|
||
def nvshmem_alltoall(dest: torch.Tensor, source: torch.Tensor) -> None: | ||
return _ops.nvshmem_alltoall(dest, source) | ||
|
||
|
||
def nvshmem_finalize() -> None: | ||
torch.cuda.synchronize() | ||
_ops.nvshmem_finalize() | ||
|
||
|
||
def nvshmem_my_pe() -> int: | ||
return _ops.nvshmem_my_pe() | ||
|
||
|
||
def nvshmem_n_pes() -> int: | ||
return _ops.nvshmem_n_pes() | ||
|
||
|
||
def nvshmem_malloc( | ||
shape: Sequence[int], | ||
dtype: torch.dtype, | ||
device: torch.device, | ||
) -> torch.Tensor: | ||
return _ops.nvshmem_malloc(shape, dtype, device) | ||
|
||
|
||
def nvshmem_barrier_all() -> None: | ||
_ops.nvshmem_barrier_all() | ||
|
||
|
||
def nvshmem_barrier_all_on_current_stream() -> None: | ||
_ops.nvshmem_barrier_all_on_current_stream() | ||
def nvshmem_init( | ||
global_rank: int, | ||
local_rank: int, | ||
world_size: int, | ||
device: Any, | ||
uid: Optional[Any] = None, | ||
) -> None: | ||
uniqueid = nvshmem.get_unique_id(empty=True) | ||
if local_rank == 0: | ||
uniqueid = nvshmem.get_unique_id() | ||
broadcast_objects = [uniqueid] | ||
else: | ||
broadcast_objects = [None] | ||
|
||
dist.broadcast_object_list(broadcast_objects, src=0) | ||
dist.barrier() | ||
|
||
nvshmem.init( | ||
device=device, | ||
uid=broadcast_objects[0], | ||
rank=global_rank, | ||
nranks=world_size, | ||
initializer_method="uid", | ||
) | ||
|
||
|
||
# This stream wrapper returns the format required by CUDA Python. This workaround will be removed when nvshmem4py supports Torch stream interoperability. | ||
# For more information see: https://nvidia.github.io/cuda-python/cuda-core/latest/interoperability.html#cuda-stream-protocol | ||
class PyTorchStreamWrapper: | ||
def __init__(self, pt_stream: Any) -> None: | ||
self.pt_stream = pt_stream | ||
self.handle = pt_stream.cuda_stream | ||
|
||
def __cuda_stream__(self) -> tuple[int, int]: | ||
stream_id = self.pt_stream.cuda_stream | ||
return (0, stream_id) |
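Taken together, the new module boots NVSHMEM via a torch.distributed broadcast of the unique ID and adapts torch streams to the __cuda_stream__ protocol. A minimal launch sketch under stated assumptions: one GPU per process, torchrun-style environment variables, the pplx_kernels package name, and nvshmem4py accepting a cuda.core Device (the diff types device as Any, so the concrete device type is an assumption):

# Hedged usage sketch, not part of the PR.
import os

import torch
import torch.distributed as dist
from cuda.core.experimental import Device  # device type assumed for nvshmem4py

from pplx_kernels.nvshmem import PyTorchStreamWrapper, nvshmem_init  # path assumed

dist.init_process_group(backend="nccl")
rank = dist.get_rank()
world_size = dist.get_world_size()
local_rank = int(os.environ["LOCAL_RANK"])

torch.cuda.set_device(local_rank)
dev = Device(local_rank)
dev.set_current()

# Rank 0 creates the NVSHMEM unique ID; nvshmem_init broadcasts it over the
# torch.distributed process group and then calls nvshmem.core.init(...).
nvshmem_init(rank, local_rank, world_size, dev)

# nvshmem4py APIs that take a stream expect the __cuda_stream__ protocol,
# which torch.cuda.Stream does not implement yet; wrap the torch stream.
stream = PyTorchStreamWrapper(torch.cuda.current_stream())
assert stream.__cuda_stream__() == (0, torch.cuda.current_stream().cuda_stream)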