
Commit 5a863c3

dstaay-fb authored and facebook-github-bot committed
Support > 1 GB put/send with chunking (#1367)
Summary: Pull Request resolved: #1367

Adds chunking logic at the Rust layer; unblocks > 1 GB single put/get operations.

To test: rdma_load_test -- --device cuda:0 --operation ping-pong --iterations 100 --size 1512

=== ACTOR 0 (Create Buffer) STATISTICS ===
[0] TIMING RESULTS:
[0] Average time per operation: 1.526 ms
[0] Minimum time per operation: 0.537 ms
[0] Maximum time per operation: 92.305 ms
[0] Standard deviation: 9.170 ms

=== ACTOR 1 (Create Buffer+Transmit) STATISTICS ===
[0] TIMING RESULTS:
[0] Average time per operation: 35.145 ms
[0] Minimum time per operation: 16.979 ms
[0] Maximum time per operation: 155.070 ms
[0] Standard deviation: 18.571 ms
[0]
[0] ============================================================
[0] RDMA PING-PONG LOAD TEST RESULTS (CUDA:0)
[0] ============================================================
[0] Total iterations completed: 100
[0] Average data per operation: 1519.6 MB
[0] Total data transferred: 151956.0 MB
[0]
[0] BANDWIDTH RESULTS:
[0] Average bandwidth: 362.70 Gbps
[0] Maximum bandwidth: 750.74 Gbps
[0] Minimum bandwidth: 82.20 Gbps
[0] ============================================================

Reviewed By: allenwang28

Differential Revision: D83499085

fbshipit-source-id: 0f655e1f678993a1c80b5058f11715c0b8124323
1 parent 9532ba1 commit 5a863c3
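
For orientation, here is a minimal self-contained sketch of the chunking idea described in the summary. It is illustrative only, not the committed code: `chunks` is a hypothetical helper, though `MAX_RDMA_MSG_SIZE` matches the constant added in this commit. A transfer larger than 1 GiB is split into consecutive work requests at offset-shifted addresses, each at most `MAX_RDMA_MSG_SIZE` bytes.

// Illustrative sketch of the chunking scheme (not the committed code).
const MAX_RDMA_MSG_SIZE: usize = 1024 * 1024 * 1024; // 1 GiB per work request

/// Split a transfer of `total` bytes into (offset, len) chunks of at most 1 GiB each.
fn chunks(total: usize) -> Vec<(usize, usize)> {
    let mut out = Vec::new();
    let mut offset = 0;
    let mut remaining = total;
    while remaining > 0 {
        let len = remaining.min(MAX_RDMA_MSG_SIZE);
        out.push((offset, len));
        offset += len;
        remaining -= len;
    }
    out
}

fn main() {
    // A 2 GB transfer (500M float32 elements * 4 bytes), as in the new test, needs two chunks.
    let total = 500_000_000usize * 4;
    for (offset, len) in chunks(total) {
        println!("post write at local offset {offset}: {len} bytes");
    }
}

The actual change applies the same loop inside `RdmaQueuePair::put` and `RdmaQueuePair::get`, posting one `post_op` per chunk, as shown in the diff below.
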

File tree

2 files changed: +179 -37 lines changed


monarch_rdma/src/rdma_components.rs

Lines changed: 82 additions & 37 deletions
@@ -40,6 +40,9 @@
 //! 6. Poll for completions
 //! 7. Resources are cleaned up when dropped
 
+/// Maximum size for a single RDMA operation in bytes (1 GiB)
+const MAX_RDMA_MSG_SIZE: usize = 1024 * 1024 * 1024;
+
 use std::ffi::CStr;
 use std::fs;
 use std::io::Error;
@@ -788,20 +791,37 @@ impl RdmaQueuePair {
     }
 
     pub fn put(&mut self, lhandle: RdmaBuffer, rhandle: RdmaBuffer) -> Result<(), anyhow::Error> {
-        let idx = self.send_wqe_idx;
-        self.send_wqe_idx += 1;
-        self.post_op(
-            lhandle.addr,
-            lhandle.lkey,
-            lhandle.size,
-            idx,
-            true,
-            RdmaOperation::Write,
-            rhandle.addr,
-            rhandle.rkey,
-        )
-        .unwrap();
-        self.send_db_idx += 1;
+        let total_size = lhandle.size;
+        if rhandle.size < total_size {
+            return Err(anyhow::anyhow!(
+                "Remote buffer size ({}) is smaller than local buffer size ({})",
+                rhandle.size,
+                total_size
+            ));
+        }
+
+        let mut remaining = total_size;
+        let mut offset = 0;
+        while remaining > 0 {
+            let chunk_size = std::cmp::min(remaining, MAX_RDMA_MSG_SIZE);
+            let idx = self.send_wqe_idx;
+            self.send_wqe_idx += 1;
+            self.post_op(
+                lhandle.addr + offset,
+                lhandle.lkey,
+                chunk_size,
+                idx,
+                true,
+                RdmaOperation::Write,
+                rhandle.addr + offset,
+                rhandle.rkey,
+            )?;
+            self.send_db_idx += 1;
+
+            remaining -= chunk_size;
+            offset += chunk_size;
+        }
+
         Ok(())
     }
 
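As a worked example of the loop above (assuming, purely for illustration, that the benchmark's `--size 1512` corresponds to 1512 MiB), a single `put` of that size becomes two work requests: one full 1 GiB chunk plus a 488 MiB remainder, with `send_db_idx` advancing by two.

// Hypothetical check of the chunk split for a 1512 MiB transfer (illustration only).
const MAX_RDMA_MSG_SIZE: usize = 1024 * 1024 * 1024;

fn main() {
    let total: usize = 1512 * 1024 * 1024;       // 1512 MiB
    let first = total.min(MAX_RDMA_MSG_SIZE);    // full 1 GiB chunk
    let second = total - first;                  // remainder
    assert_eq!(first, 1024 * 1024 * 1024);
    assert_eq!(second, 488 * 1024 * 1024);
    println!("chunks: {first} + {second} bytes");
}
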
@@ -932,20 +952,38 @@ impl RdmaQueuePair {
     }
 
     pub fn get(&mut self, lhandle: RdmaBuffer, rhandle: RdmaBuffer) -> Result<(), anyhow::Error> {
-        let idx = self.send_wqe_idx;
-        self.send_wqe_idx += 1;
-        self.post_op(
-            lhandle.addr,
-            lhandle.lkey,
-            lhandle.size,
-            idx,
-            true,
-            RdmaOperation::Read,
-            rhandle.addr,
-            rhandle.rkey,
-        )
-        .unwrap();
-        self.send_db_idx += 1;
+        let total_size = lhandle.size;
+        if rhandle.size < total_size {
+            return Err(anyhow::anyhow!(
+                "Remote buffer size ({}) is smaller than local buffer size ({})",
+                rhandle.size,
+                total_size
+            ));
+        }
+
+        let mut remaining = total_size;
+        let mut offset = 0;
+
+        while remaining > 0 {
+            let chunk_size = std::cmp::min(remaining, MAX_RDMA_MSG_SIZE);
+            let idx = self.send_wqe_idx;
+            self.send_wqe_idx += 1;
+            self.post_op(
+                lhandle.addr + offset,
+                lhandle.lkey,
+                chunk_size,
+                idx,
+                true,
+                RdmaOperation::Read,
+                rhandle.addr + offset,
+                rhandle.rkey,
+            )?;
+            self.send_db_idx += 1;
+
+            remaining -= chunk_size;
+            offset += chunk_size;
+        }
+
         Ok(())
     }
 
@@ -1122,7 +1160,7 @@ impl RdmaQueuePair {
     ///
     /// # Arguments
     ///
-    /// * `target` - Which completion queue(s) to poll (Send, Receive, or Both)
+    /// * `target` - Which completion queue(s) to poll (Send, Receive)
     ///
     /// # Returns
     ///
@@ -1168,9 +1206,10 @@ impl RdmaQueuePair {
                 // This should be a send completion - verify it's the one we're waiting for
                 if wc.wr_id() == self.send_cq_idx {
                     self.send_cq_idx += 1;
+                }
+                // finished polling, return the last completion
+                if self.send_cq_idx == self.send_db_idx {
                     return Ok(Some(IbvWc::from(wc)));
-                } else {
-                    // This completion is for a different operation - keep polling
                 }
             }
         }
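
The polling change above follows from chunking: a single `put` or `get` may now post several work requests, so the poller should only report the operation complete once every outstanding chunk's completion has been consumed, i.e. once `send_cq_idx` catches up to `send_db_idx`. Below is a minimal stand-alone model of that bookkeeping (simplified counters only, not the real ibverbs polling loop).

// Simplified model of the send-completion bookkeeping after chunking:
// `send_db_idx` counts posted work requests, `send_cq_idx` counts consumed completions.
struct SendState {
    send_cq_idx: u64, // completions consumed so far
    send_db_idx: u64, // work requests posted (doorbells rung)
}

impl SendState {
    /// Consume one completion with the given wr_id; return true once the
    /// last outstanding chunk has completed and the whole operation is done.
    fn on_completion(&mut self, wr_id: u64) -> bool {
        if wr_id == self.send_cq_idx {
            self.send_cq_idx += 1;
        }
        self.send_cq_idx == self.send_db_idx
    }
}

fn main() {
    // A 2 GB put split into two chunks posts two work requests.
    let mut st = SendState { send_cq_idx: 0, send_db_idx: 2 };
    assert!(!st.on_completion(0)); // first chunk done, keep polling
    assert!(st.on_completion(1));  // second chunk done, operation complete
}
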
@@ -1193,17 +1232,23 @@ impl RdmaQueuePair {
                 if !wc.is_valid() {
                     if let Some((status, vendor_err)) = wc.error() {
                         return Err(anyhow::anyhow!(
-                            "Receive work completion failed with status: {:?}, vendor error: {}",
+                            "Recv work completion failed with status: {:?}, vendor error: {}, wr_id: {}, recv_cq_idx: {}",
                             status,
-                            vendor_err
+                            vendor_err,
+                            wc.wr_id(),
+                            self.recv_cq_idx,
                         ));
                     }
                 }
 
-                // This should be a receive completion
-                self.recv_cq_idx += 1;
-
-                return Ok(Some(IbvWc::from(wc)));
+                // This should be a receive completion - verify it's the one we're waiting for
+                if wc.wr_id() == self.recv_cq_idx {
+                    self.recv_cq_idx += 1;
+                }
+                // finished polling, return the last completion
+                if self.recv_cq_idx == self.recv_db_idx {
+                    return Ok(Some(IbvWc::from(wc)));
+                }
             }
         }
 
python/tests/test_rdma.py

Lines changed: 97 additions & 0 deletions
@@ -260,3 +260,100 @@ def test_gpu_trainer_generator_sync() -> None:
     for _ in range(1):
         trainer.weights_ready.call().get()
         generator.update_weights.call().get()
+
+
+@needs_rdma
+async def test_rdma_concurrent_2gb_writes_in_order():
+    """Test concurrent 2GB RDMA buffer writes with reverse-order awaiting"""
+    proc = this_host().spawn_procs(per_host={"processes": 1})
+    num_elem = 500_000_000  # 500M elements
+
+    class BufferOwnerActor(Actor):
+        def __init__(self):
+            # Create a 2GB buffer (500M float32 elements * 4 bytes = 2GB)
+            self.data = torch.zeros(num_elem, dtype=torch.float32)
+            self.rdma_buffer = None
+
+        @endpoint
+        async def create_buffer(self) -> RDMABuffer:
+            """Create a 2GB RDMABuffer"""
+            byte_tensor = self.data.view(torch.uint8).flatten()
+            self.rdma_buffer = RDMABuffer(byte_tensor)
+            return self.rdma_buffer
+
+        @endpoint
+        async def get_buffer_data(self) -> torch.Tensor:
+            """Return the current buffer data for verification"""
+            return self.data
+
+    class WriterActor(Actor):
+        def __init__(self):
+            # Create a 2GB buffer (500M float32 elements * 4 bytes = 2GB)
+            self.tensor_a = torch.ones(
+                num_elem, dtype=torch.float32
+            )  # Will receive data
+            self.tensor_b = torch.full(
+                (num_elem,), 2.0, dtype=torch.float32
+            )  # Will send data
+
+        @endpoint
+        async def perform_concurrent_writes(self, buffer: RDMABuffer):
+            """Perform concurrent read/write operations and await in reverse order"""
+            # Convert tensors to byte views for RDMA
+            byte_tensor_a = self.tensor_a.view(torch.uint8).flatten()
+            byte_tensor_b = self.tensor_b.view(torch.uint8).flatten()
+
+            # Start both operations concurrently
+            future_a = buffer.read_into(
+                byte_tensor_a, timeout=10
+            )  # Read FROM buffer INTO tensor_a
+            future_b = buffer.write_from(
+                byte_tensor_b, timeout=10
+            )  # Write FROM tensor_b INTO buffer
+
+            # Await in reverse order - sets actual execution order
+            await future_b  # Await write operation first
+            await future_a  # Await read operation second
+
+            return "SUCCESS"
+
+        @endpoint
+        async def get_tensors(self) -> tuple[torch.Tensor, torch.Tensor]:
+            """Return both tensors for verification"""
+            return (self.tensor_a, self.tensor_b)
+
+    # Create actors
+    buffer_owner = proc.spawn("buffer_owner", BufferOwnerActor)
+    writer = proc.spawn("writer", WriterActor)
+
+    # Create the 2GB RDMA buffer
+    buffer = await buffer_owner.create_buffer.call_one()
+    print(f"✓ Created 2GB RDMA buffer (size: {buffer.size() / (1024**3):.2f} GB)")
+
+    # Perform concurrent writes with reverse-order awaiting
+    result = await writer.perform_concurrent_writes.call_one(buffer)
+    assert result == "SUCCESS", f"Concurrent writes failed: {result}"
+
+    # Verify the data flow worked correctly using torch.allclose
+    tensor_a_actual, tensor_b_actual = await writer.get_tensors.call_one()
+    buffer_data_actual = await buffer_owner.get_buffer_data.call_one()
+
+    expected_result = torch.full((num_elem,), 2.0, dtype=torch.float32)
+
+    # Verify using torch.allclose
+    assert torch.allclose(
+        tensor_a_actual, expected_result
+    ), "tensor_a does not match expected 2.0s"
+    assert torch.allclose(
+        tensor_b_actual, expected_result
+    ), "tensor_b does not match expected 2.0s"
+
+    assert torch.allclose(
+        buffer_data_actual, expected_result
+    ), "RDMABuffer does not contain expected 2.0s"
+
+    print("✓ Concurrent 2GB operations completed successfully")
+
+    # Drop the buffer
+    await buffer.drop()
+    print("✓ Buffer dropped successfully")
