Add dp_size to ReplayBuffer so sample() returns batches shaped (dp_size, bsz, ...)

DNXie · DNXie · commit 494654bcca92 · 2025-08-28T14:14:14.000-07:00
diff --git a/apps/grpo/main.py b/apps/grpo/main.py
@@ -377,10 +377,7 @@ async def main():
     )
 
     replay_buffer = await spawn_service(
-        default_service_cfg,
-        ReplayBuffer,
-        batch_size=4,
-        max_policy_age=1,
+        default_service_cfg, ReplayBuffer, batch_size=4, max_policy_age=1, dp_size=1
     )
 
     dataloader = await spawn_service(
@@ -469,6 +466,7 @@ async def continuous_training():
             if batch is None:
                 await asyncio.sleep(0.1)
             else:
+                batch = batch[0]  # Hard coded because we are not doing data parallel
                 training_result = await trainer.train_step.choose(batch)
                 training_step += 1
                 if training_step % 10 == 0:
diff --git a/src/forge/actors/replay_buffer.py b/src/forge/actors/replay_buffer.py
@@ -8,17 +8,18 @@
 from dataclasses import dataclass
 from typing import Any
 
-from monarch.actor import endpoint
-
 from forge.controller import ForgeActor
 
+from monarch.actor import endpoint
+
 
 @dataclass
 class ReplayBuffer(ForgeActor):
     """Simple in-memory replay buffer implementation."""
 
     batch_size: int
     max_policy_age: int
+    dp_size: int = 1
     seed: int | None = None
 
     @endpoint
@@ -43,23 +44,32 @@ async def sample(self, curr_policy_version: int, batch_size: int | None = None):
                 passed in at initialization.
 
         Returns:
-            A list of sampled episodes or None if there are not enough episodes in the buffer.
+            A list of sampled episodes with shape (dp_size, bsz, ...) or None if there are not enough episodes in the buffer.
         """
         bsz = batch_size if batch_size is not None else self.batch_size
+        total_samples = self.dp_size * bsz
 
         # Evict old episodes
         self._evict(curr_policy_version)
 
-        if bsz > len(self.buffer):
+        if total_samples > len(self.buffer):
             return None
 
         # TODO: Make this more efficient
-        idx_to_sample = self.sampler(range(len(self.buffer)), k=bsz)
+        idx_to_sample = self.sampler(range(len(self.buffer)), k=total_samples)
         sorted_idxs = sorted(
             idx_to_sample, reverse=True
         )  # Sort in desc order to avoid shifting idxs
         sampled_episodes = [self.buffer.pop(i) for i in sorted_idxs]
-        return sampled_episodes
+
+        # Reshape to (dp_size, bsz, ...)
+        reshaped_episodes = []
+        for dp_idx in range(self.dp_size):
+            start_idx = dp_idx * bsz
+            end_idx = start_idx + bsz
+            reshaped_episodes.append(sampled_episodes[start_idx:end_idx])
+
+        return reshaped_episodes
 
     @endpoint
     async def evict(self, curr_policy_version: int) -> None: