Commit 8ef1259

tanquermeta-codesync[bot] authored and committed
Move pybind out of ncclx (#68)
Summary: Pull Request resolved: #68

D85694262 added the transport pybind layer under ncclx, which is not appropriate: they are very different libs, and it pulled a third-party library into ncclx and broke the internal conda image. Move the bindings to the RdmaTransport lib folder.

Reviewed By: d4l3k

Differential Revision: D87873651

fbshipit-source-id: fe9885bd6247d3048cd2c0a25907e6b967237ac9
1 parent 5ef73f5 commit 8ef1259
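
For downstream Python code, the visible effect is a new import path: the RDMA transport bindings now live in a standalone torchcomms._transport extension rather than torchcomms._comms_ncclx, as the TransportTest change below shows. A minimal before/after sketch (module and class names taken from this diff; nothing else is implied):

# Before this commit, the transport types were bundled into the NCCLX extension:
# from torchcomms._comms_ncclx import RdmaMemory, RdmaTransport

# After this commit, they come from the standalone transport extension:
from torchcomms._transport import RdmaMemory, RdmaTransport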

File tree

8 files changed: +144 −129 lines


comms/torchcomms/ncclx/CMakeLists.txt

Lines changed: 0 additions & 2 deletions
@@ -2,7 +2,6 @@
 # Extension: torchcomms._comms_ncclx
 file(GLOB TORCHCOMMS_NCCLX_SOURCES
   "comms/torchcomms/ncclx/*.cpp"
-  "comms/torchcomms/transport/*.cpp"
 )
 file(GLOB TORCHCOMMS_CUDA_API_SOURCE "comms/torchcomms/device/CudaApi.cpp")
 
@@ -61,7 +60,6 @@ add_library(torchcomms_comms_ncclx MODULE
   ${TORCHCOMMS_NCCLX_SOURCES}
   ${TORCHCOMMS_CUDA_API_SOURCE}
 )
-target_compile_definitions(torchcomms_comms_ncclx PRIVATE MOCK_SCUBA_DATA CTRAN_DISABLE_TCPDM)
 set_target_properties(torchcomms_comms_ncclx PROPERTIES
   PREFIX ""
   OUTPUT_NAME "_comms_ncclx"
Lines changed: 0 additions & 95 deletions
@@ -1,114 +1,19 @@
 // Copyright (c) Meta Platforms, Inc. and affiliates.
 
-#include <folly/io/async/EventBase.h>
-#include <folly/io/async/ScopedEventBaseThread.h>
 #include <pybind11/chrono.h>
 #include <pybind11/numpy.h>
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 #include <torch/csrc/utils/pybind.h>
 
 #include "comms/torchcomms/ncclx/TorchCommNCCLX.hpp"
-#include "comms/torchcomms/transport/RdmaTransport.h"
 
 namespace py = pybind11;
 using namespace torch::comms;
 
-namespace {
-folly::ScopedEventBaseThread& getScopedEventBaseThread() {
-  // This intentionally creates and leaks a global event base thread to be used
-  // for all Transports on first use.
-  static folly::ScopedEventBaseThread scopedEventBaseThread{"torchcomms_evb"};
-  return scopedEventBaseThread;
-}
-} // namespace
-
 PYBIND11_MODULE(_comms_ncclx, m) {
   m.doc() = "NCCLX specific python bindings for TorchComm";
 
   py::class_<TorchCommNCCLX, std::shared_ptr<TorchCommNCCLX>>(
       m, "TorchCommNCCLX");
-
-  py::class_<RdmaRemoteBuffer, std::shared_ptr<RdmaRemoteBuffer>>(
-      m, "RdmaRemoteBuffer")
-      .def(
-          py::pickle(
-              [](const RdmaRemoteBuffer& buffer) { // __getstate__
-                return py::make_tuple(
-                    reinterpret_cast<uintptr_t>(buffer.ptr), buffer.accessKey);
-              },
-              [](const py::tuple& t) { // __setstate__
-                if (t.size() != 2) {
-                  throw std::runtime_error(
-                      "Invalid state for RdmaRemoteBuffer");
-                }
-                return RdmaRemoteBuffer{
-                    reinterpret_cast<void*>(t[0].cast<uintptr_t>()),
-                    t[1].cast<std::string>()};
-              }));
-
-  py::class_<RdmaTransport, std::shared_ptr<RdmaTransport>>(m, "RdmaTransport")
-      // initialize a new RDMATransport using a custom init fn
-      .def(py::init([](at::Device device) {
-        TORCH_INTERNAL_ASSERT(device.is_cuda());
-        int cuda_device = device.index();
-        return std::make_shared<RdmaTransport>(
-            cuda_device, getScopedEventBaseThread().getEventBase());
-      }))
-      .def_static("supported", &RdmaTransport::supported)
-      .def("bind", [](RdmaTransport& self) { return py::bytes(self.bind()); })
-      .def(
-          "connect",
-          [](RdmaTransport& self, const py::bytes& peerUrl) {
-            std::string peerUrlStr = peerUrl.cast<std::string>();
-            return static_cast<int>(self.connect(peerUrlStr));
-          })
-      .def("connected", &RdmaTransport::connected)
-      .def(
-          "write",
-          [](RdmaTransport& self,
-             const RdmaMemory::View& localBuffer,
-             const RdmaRemoteBuffer& remoteBuffer) {
-            return static_cast<int>(
-                self.write(localBuffer, remoteBuffer, false).get());
-          })
-      .def(
-          "read",
-          [](RdmaTransport& self,
-             RdmaMemory::MutableView& localBuffer,
-             const RdmaRemoteBuffer& remoteBuffer) {
-            return static_cast<int>(self.read(localBuffer, remoteBuffer).get());
-          });
-
-  py::class_<RdmaMemory::View, std::shared_ptr<RdmaMemory::View>>(
-      m, "RdmaMemoryView")
-      .def("size", &RdmaMemory::View::size);
-
-  py::class_<RdmaMemory::MutableView, std::shared_ptr<RdmaMemory::MutableView>>(
-      m, "RdmaMemoryMutableView");
-
-  py::class_<RdmaMemory, std::shared_ptr<RdmaMemory>>(m, "RdmaMemory")
-      .def(py::init([](const at::Tensor& tensor) {
-        TORCH_CHECK(
-            tensor.is_contiguous(),
-            "RdmaMemory currently requires a contiguous tensor");
-        // If CPU memory is passed, use device 0 for NIC discovery
-        const auto device = tensor.get_device() < 0 ? 0 : tensor.get_device();
-        return std::make_shared<RdmaMemory>(
-            tensor.data_ptr(), tensor.nbytes(), device);
-      }))
-      .def(
-          "to_view",
-          [](RdmaMemory& self) {
-            return self.createView(size_t(0), self.length());
-          })
-      .def(
-          "to_mutable_view",
-          [](RdmaMemory& self) {
-            return self.createMutableView(size_t(0), self.length());
-          })
-      .def("to_remote_buffer", [](RdmaMemory& self) {
-        return RdmaRemoteBuffer{
-            const_cast<void*>(self.data()), self.remoteKey()};
-      });
 }
Lines changed: 0 additions & 31 deletions
@@ -1,35 +1,4 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # pyre-strict
 
-import torch
-
 class TorchCommNCCLX: ...
-
-class RdmaMemoryView:
-    def size(self) -> int: ...
-
-class RdmaMemoryMutableView: ...
-
-class RdmaRemoteBuffer:
-    def __getstate__(self) -> tuple[int, str]: ...
-    def __setstate__(self, state: tuple[int, str]) -> None: ...
-
-class RdmaMemory:
-    def __init__(self, tensor: torch.Tensor) -> None: ...
-    def to_view(self) -> RdmaMemoryView: ...
-    def to_mutable_view(self) -> RdmaMemoryMutableView: ...
-    def to_remote_buffer(self) -> RdmaRemoteBuffer: ...
-
-class RdmaTransport:
-    def __init__(self, device: torch.device) -> None: ...
-    @staticmethod
-    def supported() -> bool: ...
-    def bind(self) -> bytes: ...
-    def connect(self, peer_url: bytes) -> int: ...
-    def connected(self) -> bool: ...
-    def write(
-        self, local_buffer: RdmaMemoryView, remote_buffer: RdmaRemoteBuffer
-    ) -> int: ...
-    def read(
-        self, local_buffer: RdmaMemoryMutableView, remote_buffer: RdmaRemoteBuffer
-    ) -> int: ...
Lines changed: 110 additions & 0 deletions
@@ -0,0 +1,110 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+
+#include <folly/io/async/EventBase.h>
+#include <folly/io/async/ScopedEventBaseThread.h>
+#include <pybind11/chrono.h>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <torch/csrc/utils/pybind.h>
+
+#include "comms/torchcomms/transport/RdmaTransport.h"
+
+using namespace torch::comms;
+
+namespace {
+folly::ScopedEventBaseThread& getScopedEventBaseThread() {
+  // This intentionally creates and leaks a global event base thread to be used
+  // for all Transports on first use.
+  static folly::ScopedEventBaseThread scopedEventBaseThread{"torchcomms_evb"};
+  return scopedEventBaseThread;
+}
+} // namespace
+
+PYBIND11_MODULE(_transport, m) {
+  m.doc() = "RdmaTransport python bindings for TorchComm";
+
+  py::class_<RdmaRemoteBuffer, std::shared_ptr<RdmaRemoteBuffer>>(
+      m, "RdmaRemoteBuffer")
+      .def(
+          py::pickle(
+              [](const RdmaRemoteBuffer& buffer) { // __getstate__
+                return py::make_tuple(
+                    reinterpret_cast<uintptr_t>(buffer.ptr), buffer.accessKey);
+              },
+              [](const py::tuple& t) { // __setstate__
+                if (t.size() != 2) {
+                  throw std::runtime_error(
+                      "Invalid state for RdmaRemoteBuffer");
+                }
+                return RdmaRemoteBuffer{
+                    // NOLINTNEXTLINE(performance-no-int-to-ptr)
+                    reinterpret_cast<void*>(t[0].cast<uintptr_t>()),
+                    t[1].cast<std::string>()};
+              }));
+
+  py::class_<RdmaTransport, std::shared_ptr<RdmaTransport>>(m, "RdmaTransport")
+      // initialize a new RDMATransport using a custom init fn
+      .def(py::init([](at::Device device) {
+        TORCH_INTERNAL_ASSERT(device.is_cuda());
+        int cuda_device = device.index();
+        return std::make_shared<RdmaTransport>(
+            cuda_device, getScopedEventBaseThread().getEventBase());
+      }))
+      .def_static("supported", &RdmaTransport::supported)
+      .def("bind", [](RdmaTransport& self) { return py::bytes(self.bind()); })
+      .def(
+          "connect",
+          [](RdmaTransport& self, const py::bytes& peerUrl) {
+            std::string peerUrlStr = peerUrl.cast<std::string>();
+            return static_cast<int>(self.connect(peerUrlStr));
+          })
+      .def("connected", &RdmaTransport::connected)
+      .def(
+          "write",
+          [](RdmaTransport& self,
+             const RdmaMemory::View& localBuffer,
+             const RdmaRemoteBuffer& remoteBuffer) {
+            return static_cast<int>(
+                self.write(localBuffer, remoteBuffer, false).get());
+          })
+      .def(
+          "read",
+          [](RdmaTransport& self,
+             RdmaMemory::MutableView& localBuffer,
+             const RdmaRemoteBuffer& remoteBuffer) {
+            return static_cast<int>(self.read(localBuffer, remoteBuffer).get());
+          });
+
+  py::class_<RdmaMemory::View, std::shared_ptr<RdmaMemory::View>>(
+      m, "RdmaMemoryView")
+      .def("size", &RdmaMemory::View::size);
+
+  py::class_<RdmaMemory::MutableView, std::shared_ptr<RdmaMemory::MutableView>>(
+      m, "RdmaMemoryMutableView");
+
+  py::class_<RdmaMemory, std::shared_ptr<RdmaMemory>>(m, "RdmaMemory")
+      .def(py::init([](const at::Tensor& tensor) {
+        TORCH_CHECK(
+            tensor.is_contiguous(),
+            "RdmaMemory currently requires a contiguous tensor");
+        // If CPU memory is passed, use device 0 for NIC discovery
+        const auto device = tensor.get_device() < 0 ? 0 : tensor.get_device();
+        return std::make_shared<RdmaMemory>(
+            tensor.data_ptr(), tensor.nbytes(), device);
+      }))
+      .def(
+          "to_view",
+          [](RdmaMemory& self) {
+            return self.createView(size_t(0), self.length());
+          })
+      .def(
+          "to_mutable_view",
+          [](RdmaMemory& self) {
+            return self.createMutableView(size_t(0), self.length());
+          })
+      .def("to_remote_buffer", [](RdmaMemory& self) {
+        return RdmaRemoteBuffer{
+            const_cast<void*>(self.data()), self.remoteKey()};
+      });
+}
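
As a rough illustration of how the relocated bindings are meant to be driven from Python, the sketch below does a one-sided write between two peers. It is not part of this commit: the out-of-band URL and remote-buffer exchange (the `store` object), the tensor shapes, and the assumption that a return code of 0 means success are all illustrative; only the module and method names come from the bindings above.

# Hypothetical usage sketch. `store` is a placeholder for whatever rendezvous
# mechanism the peers use to exchange bytes (e.g. a torch.distributed store).
import pickle
import torch
from torchcomms._transport import RdmaMemory, RdmaTransport

assert RdmaTransport.supported()

device = torch.device("cuda", 0)
transport = RdmaTransport(device)

# Register a contiguous tensor with the NIC and derive a read-only view of it.
tensor = torch.ones(1024, device=device)
memory = RdmaMemory(tensor)
local_view = memory.to_view()

# Exchange connection URLs out of band, then connect (0 assumed to mean success).
store.set("peer0_url", transport.bind())
peer_url = store.get("peer1_url")
assert transport.connect(peer_url) == 0

# Exchange the pickled RdmaRemoteBuffer the same way, then issue a one-sided write.
store.set("peer0_buf", pickle.dumps(memory.to_remote_buffer()))
remote_buffer = pickle.loads(store.get("peer1_buf"))
assert transport.write(local_view, remote_buffer) == 0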
Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# pyre-strict
+
+import torch
+
+class RdmaMemoryView:
+    def size(self) -> int: ...
+
+class RdmaMemoryMutableView: ...
+
+class RdmaRemoteBuffer:
+    def __getstate__(self) -> tuple[int, str]: ...
+    def __setstate__(self, state: tuple[int, str]) -> None: ...
+
+class RdmaMemory:
+    def __init__(self, tensor: torch.Tensor) -> None: ...
+    def to_view(self) -> RdmaMemoryView: ...
+    def to_mutable_view(self) -> RdmaMemoryMutableView: ...
+    def to_remote_buffer(self) -> RdmaRemoteBuffer: ...
+
+class RdmaTransport:
+    def __init__(self, device: torch.device) -> None: ...
+    @staticmethod
+    def supported() -> bool: ...
+    def bind(self) -> bytes: ...
+    def connect(self, peer_url: bytes) -> int: ...
+    def connected(self) -> bool: ...
+    def write(
+        self, local_buffer: RdmaMemoryView, remote_buffer: RdmaRemoteBuffer
+    ) -> int: ...
+    def read(
+        self, local_buffer: RdmaMemoryMutableView, remote_buffer: RdmaRemoteBuffer
+    ) -> int: ...
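
The read direction pairs a local RdmaMemoryMutableView with a peer's RdmaRemoteBuffer, per the stubs above. A hypothetical counterpart to the write sketch, reusing an already-connected `transport` and a previously exchanged `remote_buffer` (both placeholders from that sketch):

# Hypothetical read sketch: pull the peer's registered region into a local
# CPU tensor. A CPU tensor falls back to device 0 for NIC discovery per the
# bindings; a return code of 0 is assumed to mean success.
import torch
from torchcomms._transport import RdmaMemory

dst = torch.zeros(1024)
dst_memory = RdmaMemory(dst)
dst_view = dst_memory.to_mutable_view()

assert transport.read(dst_view, remote_buffer) == 0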

comms/torchcomms/tests/integration/py/TransportTest.py renamed to comms/torchcomms/transport/tests/py/TransportTest.py

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@
 import unittest
 
 import torch
-from torchcomms._comms_ncclx import RdmaMemory, RdmaTransport
+from torchcomms._transport import RdmaMemory, RdmaTransport
 
 
 class TransportTest(unittest.TestCase):
