
Commit 04ffbbe

dstaay-fb authored and facebook-github-bot committed
Robustness/Perf improvements - Python Bindings For Drop (#1355)
Summary:
Pull Request resolved: #1355

This diff implements critical RDMA performance optimizations in Monarch by addressing memory region (MR) management bottlenecks, resolving MR binding issues, and adding a strategy for managing the 32-MR hardware limit via custom mkey registration; we now approach hardware limits on large (hundreds of MB) transfers.

### **Key Changes in this Diff**

* Add a Drop function to the Python API

Reviewed By: zdevito

Differential Revision: D83436151

fbshipit-source-id: 7e2d8628416460ddcafbb8d50e1558556065ef03
1 parent b88f462 commit 04ffbbe
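
For callers, the commit boils down to a new `drop()` method on `RDMABuffer` that returns a `Future[None]`, releases the remote-side handles on the registered memory, and (per the test added below) can be called more than once. A minimal sketch of the call pattern, assuming `RDMABuffer` is imported from the module edited in this diff and that the code runs inside a Monarch actor/test context:

```python
# Hedged sketch of the caller-side API added in this commit. Assumes torch and
# monarch are installed; the import path below mirrors the file touched in this
# diff and may differ from the public re-export.
import torch

from monarch._src.tensor_engine.rdma import RDMABuffer


async def publish_then_release(data: torch.Tensor) -> None:
    # Register the tensor's bytes for RDMA access (as in the new test case).
    byte_view = data.view(torch.uint8).flatten()
    buffer = RDMABuffer(byte_view)

    # ... hand `buffer` to a remote actor, which reads via buffer.read_into(...) ...

    # New in this commit: explicitly release the remote handles on this memory.
    await buffer.drop()

    # drop() is intended to be idempotent; a second call is harmless.
    await buffer.drop()
```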

File tree

4 files changed: +118, -7 lines


monarch_rdma/extension/lib.rs

Lines changed: 29 additions & 6 deletions
```diff
@@ -156,14 +156,19 @@ impl PyRdmaBuffer {
                 .request_buffer_deprecated(cx_instance, addr, size)
                 .await?
         });
-        let _result_ = instance_dispatch!(client, |cx_instance| {
+        instance_dispatch!(client, |cx_instance| {
             local_buffer
                 .write_from(cx_instance, buffer, timeout)
                 .await
                 .map_err(|e| {
                     PyException::new_err(format!("failed to read into buffer: {}", e))
                 })?
         });
+        instance_dispatch!(client, |cx_instance| {
+            local_owner_ref
+                .release_buffer_deprecated(cx_instance, local_buffer)
+                .await?
+        });
         Ok(())
     })
 }
@@ -197,14 +202,19 @@ impl PyRdmaBuffer {
                 .request_buffer_deprecated(cx_instance, addr, size)
                 .await?
         });
-        let _result_ = instance_dispatch!(&client, |cx_instance| {
+        instance_dispatch!(&client, |cx_instance| {
             local_buffer
                 .read_into(cx_instance, buffer, timeout)
                 .await
                 .map_err(|e| {
                     PyException::new_err(format!("failed to write from buffer: {}", e))
                 })?
         });
+        instance_dispatch!(client, |cx_instance| {
+            local_owner_ref
+                .release_buffer_deprecated(cx_instance, local_buffer)
+                .await?
+        });
         Ok(())
     })
 }
@@ -232,10 +242,23 @@ impl PyRdmaBuffer {
         Ok(deserialized)
     }
 
-    fn drop<'py>(&self) -> PyResult<PyPythonTask> {
-        // no op with CPUs, currently a stub.
-        // TODO - replace with correct GPU behavior.
-        PyPythonTask::new(async move { Ok(()) })
+    fn drop<'py>(
+        &self,
+        _py: Python<'py>,
+        local_proc_id: String,
+        client: PyInstance,
+    ) -> PyResult<PyPythonTask> {
+        let (_local_owner_ref, buffer) = setup_rdma_context(self, local_proc_id);
+        PyPythonTask::new(async move {
+            // Call the drop method on the buffer to release remote handles
+            instance_dispatch!(client, |cx_instance| {
+                buffer
+                    .drop_buffer(cx_instance)
+                    .await
+                    .map_err(|e| PyException::new_err(format!("Failed to drop buffer: {}", e)))?
+            });
+            Ok(())
+        })
     }
 }
```
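
Because the binding above converts Rust-side failures into `PyException`s, Python callers that might touch a buffer after it has been dropped can treat the failure as an ordinary exception. An illustrative sketch (hypothetical helper, not part of this commit), mirroring the behavior exercised by the new test:

```python
# Illustrative only: defensively reading from a buffer whose remote handles may
# already have been released via drop(). `buffer` is any RDMABuffer-like object
# exposing read_into(); the helper name is hypothetical.
import torch


async def read_if_alive(buffer, out: torch.Tensor) -> bool:
    byte_view = out.view(torch.uint8).flatten()
    try:
        await buffer.read_into(byte_view)  # fails once the buffer has been dropped
        return True
    except Exception as exc:
        print(f"buffer no longer usable: {exc}")
        return False
```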

python/monarch/_rust_bindings/rdma.pyi

Lines changed: 1 addition & 1 deletion
```diff
@@ -35,7 +35,7 @@ class _RdmaBuffer:
     def create_rdma_buffer_nonblocking(
         cls, addr: int, size: int, proc_id: str, client: Any
     ) -> PythonTask[Any]: ...
-    def drop(self, client: Any) -> PythonTask[None]: ...
+    def drop(self, local_proc_id: str, client: Any) -> PythonTask[None]: ...
     def read_into(
         self,
         addr: int,
```

python/monarch/_src/tensor_engine/rdma.py

Lines changed: 24 additions & 0 deletions
```diff
@@ -342,3 +342,27 @@ async def write_from_nonblocking() -> None:
             return res
 
         return Future(coro=write_from_nonblocking())
+
+    def drop(self) -> Future[None]:
+        """
+        Release the handle on the memory that the remote holds to this memory.
+        """
+        local_proc_id = context().actor_instance.proc_id
+        client = context().actor_instance
+
+        async def drop_nonblocking() -> None:
+            await _ensure_init_rdma_manager()
+
+            await self._buffer.drop(
+                local_proc_id=local_proc_id,
+                client=client,
+            )
+
+        return Future(coro=drop_nonblocking())
+
+    @property
+    def owner(self) -> ProcMesh:
+        """
+        The proc that owns this buffer
+        """
+        return context().actor_instance.proc
```

python/tests/test_rdma.py

Lines changed: 64 additions & 0 deletions
```diff
@@ -117,6 +117,70 @@ async def test_proc_mesh_rdma():
     assert torch.allclose(buffer_gpu.cpu(), remote_grad.cpu())
 
 
+@needs_rdma
+async def test_rdma_buffer_drop():
+    """Test the new drop() and owner methods on RDMABuffer with two actors"""
+    proc = this_host().spawn_procs(per_host={"processes": 1})
+
+    class ProducerActor(Actor):
+        def __init__(self):
+            self.data = torch.ones(10, 10, dtype=torch.float32)  # 400 bytes
+            self.buffer = None
+
+        @endpoint
+        async def create_buffer(self) -> RDMABuffer:
+            """Create an RDMABuffer and return it"""
+            byte_tensor = self.data.view(torch.uint8).flatten()
+            self.buffer = RDMABuffer(byte_tensor)
+            return self.buffer
+
+    class ConsumerActor(Actor):
+        def __init__(self):
+            self.received_data = torch.zeros(10, 10, dtype=torch.float32)
+
+        @endpoint
+        async def receive_data(self, buffer: RDMABuffer):
+            """Receive data from the buffer into local storage"""
+            byte_tensor = self.received_data.view(torch.uint8).flatten()
+            await buffer.read_into(byte_tensor)  # Read FROM buffer INTO local tensor
+            return torch.sum(self.received_data).item()  # Should be 100 (10*10*1)
+
+        @endpoint
+        async def test_buffer_after_drop(self, buffer: RDMABuffer):
+            """Try to use buffer after it's been dropped - should fail"""
+            byte_tensor = self.received_data.view(torch.uint8).flatten()
+            try:
+                await buffer.read_into(byte_tensor)  # Try to read from dropped buffer
+                return "SUCCESS"  # This should not happen
+            except Exception as e:
+                return f"EXPECTED_ERROR: {e}"
+
+    # Create both actors
+    producer = proc.spawn("producer", ProducerActor)
+    consumer = proc.spawn("consumer", ConsumerActor)
+
+    # Create an RDMA buffer in the producer
+    buffer = await producer.create_buffer.call_one()
+
+    # Pass buffer to consumer and test write operation
+    result = await consumer.receive_data.call_one(buffer)
+    assert result == 100.0, f"Expected 100.0, got {result}"
+
+    # Now drop the buffer
+    await buffer.drop()
+
+    # Test that we can call drop multiple times (should be idempotent)
+    await buffer.drop()
+
+    # Try to use the buffer after dropping - this should fail
+    error_result = await consumer.test_buffer_after_drop.call_one(buffer)
+    assert error_result.startswith(
+        "EXPECTED_ERROR:"
+    ), f"Expected an error after drop, but got: {error_result}"
+
+    print(f"✓ Buffer operations failed after drop as expected: {error_result}")
+
+
 class TrainerActor(Actor):
     def __init__(self):
         super().__init__()
```
