Commit fe0bcfe

vmoens and cursoragent committed
[Feature] AsyncBatchedCollector: async envs + auto-batching inference (#3498)
Add AsyncBatchedCollector that pairs AsyncEnvPool with InferenceServer for pipelined RL data collection. Users supply env factories and a policy; the collector handles all internal wiring (transport, server, env pool).

Co-authored-by: Cursor <cursoragent@cursor.com>
ghstack-source-id: 525120c
Pull-Request: #3498
1 parent 509448b commit fe0bcfe

File tree

7 files changed (+721, -1 lines)

docs/source/reference/collectors.rst

Lines changed: 1 addition & 0 deletions
@@ -12,6 +12,7 @@ making it easy to collect high-quality training data efficiently.
 TorchRL provides several collector implementations optimized for different scenarios:
 
 - :class:`Collector`: Single-process collection on the training worker
+- :class:`AsyncBatchedCollector`: Async environments + auto-batching inference server (see :class:`AsyncBatchedCollector`)
 - :class:`MultiCollector`: Parallel collection across multiple workers (see below)
 - **Distributed collectors**: For multi-node setups using Ray, RPC, or distributed backends (see :class:`DistributedCollector` / :class:`RPCCollector`)

docs/source/reference/collectors_single.rst

Lines changed: 44 additions & 0 deletions
@@ -15,6 +15,7 @@ Single node data collectors
    BaseCollector
    Collector
    AsyncCollector
+   AsyncBatchedCollector
    MultiCollector
    MultiSyncCollector
    MultiAsyncCollector
@@ -29,6 +30,49 @@ Single node data collectors
 - ``MultiSyncDataCollector`` → ``MultiSyncCollector``
 - ``MultiaSyncDataCollector`` → ``MultiAsyncCollector``
 
+Using AsyncBatchedCollector
+---------------------------
+
+The :class:`AsyncBatchedCollector` pairs an :class:`~torchrl.envs.AsyncEnvPool`
+with an :class:`~torchrl.modules.InferenceServer` to pipeline environment
+stepping and batched GPU inference. You only need to supply **env factories**
+and a **policy** -- all internal wiring is handled automatically:
+
+.. code-block:: python
+
+    from torchrl.collectors import AsyncBatchedCollector
+    from torchrl.envs import GymEnv
+    from tensordict.nn import TensorDictModule
+    import torch.nn as nn
+
+    policy = TensorDictModule(
+        nn.Sequential(nn.Linear(4, 64), nn.ReLU(), nn.Linear(64, 2)),
+        in_keys=["observation"],
+        out_keys=["action"],
+    )
+
+    collector = AsyncBatchedCollector(
+        create_env_fn=[lambda: GymEnv("CartPole-v1")] * 8,
+        policy=policy,
+        frames_per_batch=200,
+        total_frames=10000,
+        max_batch_size=8,
+    )
+
+    for data in collector:
+        # data is a lazy-stacked TensorDict of collected transitions
+        pass
+
+    collector.shutdown()
+
+**Key advantages over** :class:`Collector`:
+
+- The inference server automatically **batches policy forward passes** from
+  all environments, maximising GPU utilisation.
+- Environment stepping and inference run in **overlapping fashion**, reducing
+  idle time.
+- Supports ``yield_completed_trajectories=True`` for episode-level yields.
+
 Using MultiCollector
 --------------------
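The auto-batching idea in the bullets above can be sketched without TorchRL at all: worker threads submit observations to a queue, and a server loop drains up to ``max_batch_size`` pending requests and answers them with one model call. All names here (``BatchingServer``, ``submit``, ``serve_once``) are illustrative, not part of the TorchRL API -- this is a minimal sketch of the batching mechanism only:

```python
import queue
import threading

class BatchingServer:
    """Toy inference server: gathers pending requests, answers them in one call."""

    def __init__(self, model, max_batch_size):
        self.model = model                  # callable: list of obs -> list of actions
        self.max_batch_size = max_batch_size
        self.requests = queue.Queue()

    def submit(self, obs):
        """Called from env workers; blocks until the batched action is ready."""
        done = threading.Event()
        slot = {}
        self.requests.put((obs, slot, done))
        done.wait()
        return slot["action"]

    def serve_once(self):
        """Drain up to max_batch_size requests, run a single batched call."""
        batch = [self.requests.get()]       # block until at least one request
        while len(batch) < self.max_batch_size:
            try:
                batch.append(self.requests.get_nowait())
            except queue.Empty:
                break
        actions = self.model([obs for obs, _, _ in batch])  # one batched call
        for (_, slot, done), action in zip(batch, actions):
            slot["action"] = action
            done.set()
        return len(batch)

# Toy "policy" that doubles each observation.
server = BatchingServer(model=lambda obs_batch: [o * 2 for o in obs_batch],
                        max_batch_size=4)

results = {}

def worker(i):
    results[i] = server.submit(i)

threads = [threading.Thread(target=worker, args=(i,)) for i in range(4)]
for t in threads:
    t.start()

# Serve until all four requests are answered (possibly over several batches).
answered = 0
while answered < 4:
    answered += server.serve_once()
for t in threads:
    t.join()
```

The real ``InferenceServer`` adds transports, devices, and weight sync on top; the essential trick is the same single forward pass over whatever requests are pending.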

docs/source/reference/modules_inference_server.rst

Lines changed: 26 additions & 0 deletions
@@ -92,3 +92,29 @@ to receive updated model weights from a trainer between inference batches:
     loss.backward()
     optimizer.step()
     weight_sync.send(model=training_model)  # pushed to server
+
+Integration with Collectors
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The easiest way to use the inference server with RL data collection is
+through :class:`~torchrl.collectors.AsyncBatchedCollector`, which
+creates the server, transport, and env pool automatically:
+
+.. code-block:: python
+
+    from torchrl.collectors import AsyncBatchedCollector
+    from torchrl.envs import GymEnv
+
+    collector = AsyncBatchedCollector(
+        create_env_fn=[lambda: GymEnv("CartPole-v1")] * 8,
+        policy=my_policy,
+        frames_per_batch=200,
+        total_frames=10_000,
+        max_batch_size=8,
+    )
+
+    for data in collector:
+        # train on data ...
+        pass
+
+    collector.shutdown()
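The ``weight_sync.send(...)`` pattern in the context above (trainer pushes weights, server pulls them between inference batches) can be illustrated with a toy, TorchRL-free version. ``WeightStore``, ``send``, and ``receive`` are hypothetical names for this sketch, not the library's transport API:

```python
import threading

class WeightStore:
    """Toy stand-in for a weight-sync transport: trainer pushes, server pulls."""

    def __init__(self, weights):
        self._lock = threading.Lock()
        self._weights = weights
        self._version = 0

    def send(self, weights):
        # Trainer side: publish a new weight version.
        with self._lock:
            self._weights = weights
            self._version += 1

    def receive(self, known_version):
        # Server side: fetch fresh weights only if the version advanced.
        with self._lock:
            if self._version != known_version:
                return self._weights, self._version
            return None, known_version

store = WeightStore(weights={"w": 1.0})
version = 0

# Trainer publishes an update between inference batches...
store.send({"w": 2.0})

# ...and the server picks it up before its next forward pass.
fresh, version = store.receive(version)
assert fresh == {"w": 2.0} and version == 1

# Nothing changed since the last pull: receive returns None.
fresh, version = store.receive(version)
assert fresh is None
```

Versioning keeps the server's pull cheap when no update happened, which matters when it is checked between every inference batch.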
Lines changed: 64 additions & 0 deletions
@@ -0,0 +1,64 @@
"""AsyncBatchedCollector example.

Demonstrates how to use :class:`~torchrl.collectors.AsyncBatchedCollector` to
run many environments in parallel while automatically batching policy inference
through an :class:`~torchrl.modules.InferenceServer`.

Architecture:
- An :class:`~torchrl.envs.AsyncEnvPool` runs environments in parallel using
  the chosen backend (``"multiprocessing"`` by default for true parallelism,
  or ``"threading"``/``"asyncio"``).
- An :class:`~torchrl.modules.InferenceServer` batches incoming observations
  and runs a single forward pass.
- A lightweight coordinator thread bridges the two: when an env finishes
  stepping, its observation is submitted to the server, and when an action is
  ready the env is sent back for stepping -- all without synchronisation
  barriers.

The user only supplies:
- A list of environment factories
- A policy (or policy factory)
"""
import torch.nn as nn
from tensordict.nn import TensorDictModule

from torchrl.collectors import AsyncBatchedCollector
from torchrl.envs import GymEnv


def make_env():
    """Factory that returns a CartPole environment."""
    return GymEnv("CartPole-v1")


def main():
    num_envs = 4
    frames_per_batch = 200
    total_frames = 1_000

    # A simple linear policy (random weights -- just for demonstration)
    policy = TensorDictModule(
        nn.Linear(4, 2), in_keys=["observation"], out_keys=["action"]
    )

    collector = AsyncBatchedCollector(
        create_env_fn=[make_env] * num_envs,
        policy=policy,
        frames_per_batch=frames_per_batch,
        total_frames=total_frames,
        max_batch_size=num_envs,
        device="cpu",
    )

    total_collected = 0
    for i, batch in enumerate(collector):
        n = batch.numel()
        total_collected += n
        print(f"Batch {i}: {batch.shape} ({n} frames, total={total_collected})")

    collector.shutdown()
    print("Done!")


if __name__ == "__main__":
    main()
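The coordinator behaviour described in the docstring above (each env is stepped as soon as its action is ready, with no barrier forcing all envs into lockstep) can be simulated in a few lines of TorchRL-free Python. ``ToyEnv`` and the loop below are purely illustrative; real asynchrony is replaced here by randomly choosing which envs have "finished stepping" on each tick:

```python
import random

class ToyEnv:
    """Toy env: observation is the step counter, episode ends at a random horizon."""

    def __init__(self, seed):
        self.rng = random.Random(seed)
        self.reset()

    def reset(self):
        self.t = 0
        self.horizon = self.rng.randint(2, 6)
        return self.t  # observation

    def step(self, action):
        self.t += 1
        return self.t, self.t >= self.horizon  # (next_obs, done)

rng = random.Random(0)
envs = [ToyEnv(seed=i) for i in range(4)]
obs = {i: env.reset() for i, env in enumerate(envs)}
frames_per_batch = 20
frames = 0
batch = []

while frames < frames_per_batch:
    # Simulate asynchrony: only the envs that have finished stepping this
    # tick submit an observation; the rest are still "busy".
    ready = [i for i in sorted(obs) if rng.random() < 0.7] or list(obs)
    actions = {i: obs[i] % 2 for i in ready}   # one "batched" policy call
    for i in ready:
        next_obs, done = envs[i].step(actions[i])
        batch.append((i, obs[i], actions[i], next_obs, done))
        frames += 1
        # Finished envs reset immediately and rejoin the pool.
        obs[i] = envs[i].reset() if done else next_obs
```

Note that, as in the real collector, envs at different episode lengths keep producing frames independently, and the batch is yielded once at least ``frames_per_batch`` frames have accumulated.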

test/test_inference_server.py

Lines changed: 186 additions & 0 deletions
@@ -677,3 +677,189 @@ def test_no_weight_sync(self):
         td = TensorDict({"observation": torch.randn(4)})
         result = client(td)
         assert "action" in result.keys()
+
+
+# ---------------------------------------------------------------------------
+# AsyncBatchedCollector tests
+# ---------------------------------------------------------------------------
+
+from torchrl.collectors import AsyncBatchedCollector
+from torchrl.testing.mocking_classes import CountingEnv
+
+
+def _counting_env_factory(max_steps=5):
+    """Factory that returns a CountingEnv."""
+    return CountingEnv(max_steps=max_steps)
+
+
+class _BatchCountingPolicy(TensorDictModule):
+    """A batch-aware policy that always outputs action=1 for CountingEnv."""
+
+    def __init__(self):
+        super().__init__(
+            module=nn.Module(),  # placeholder
+            in_keys=["observation"],
+            out_keys=["action"],
+        )
+
+    def forward(self, td: TensorDictBase) -> TensorDictBase:
+        obs = td.get("observation")
+        action = torch.ones_like(obs)
+        return td.set("action", action)
+
+
+def _make_counting_policy():
+    return _BatchCountingPolicy()
+
+
+class TestAsyncBatchedCollector:
+    """Tests for :class:`AsyncBatchedCollector`."""
+
+    def test_basic_collection(self):
+        """Collector yields at least frames_per_batch frames."""
+        num_envs = 3
+        frames_per_batch = 20
+        total_frames = 60
+        policy = _make_counting_policy()
+
+        collector = AsyncBatchedCollector(
+            create_env_fn=[_counting_env_factory] * num_envs,
+            policy=policy,
+            frames_per_batch=frames_per_batch,
+            total_frames=total_frames,
+            max_batch_size=num_envs,
+            env_backend="threading",
+        )
+        total_collected = 0
+        for batch in collector:
+            assert batch is not None
+            total_collected += batch.numel()
+        collector.shutdown()
+        assert total_collected >= total_frames
+
+    def test_policy_factory(self):
+        """policy_factory is called to create the policy."""
+        num_envs = 2
+        collector = AsyncBatchedCollector(
+            create_env_fn=[_counting_env_factory] * num_envs,
+            policy_factory=_make_counting_policy,
+            frames_per_batch=10,
+            total_frames=20,
+            max_batch_size=num_envs,
+            env_backend="threading",
+        )
+        total_collected = 0
+        for batch in collector:
+            total_collected += batch.numel()
+        collector.shutdown()
+        assert total_collected >= 20
+
+    def test_policy_xor_factory(self):
+        """Providing both policy and policy_factory raises."""
+        policy = _make_counting_policy()
+        with pytest.raises(TypeError, match="mutually exclusive"):
+            AsyncBatchedCollector(
+                create_env_fn=[_counting_env_factory],
+                policy=policy,
+                policy_factory=_make_counting_policy,
+                frames_per_batch=10,
+            )
+
+    def test_neither_policy_nor_factory(self):
+        """Providing neither raises."""
+        with pytest.raises(TypeError, match="must be provided"):
+            AsyncBatchedCollector(
+                create_env_fn=[_counting_env_factory],
+                frames_per_batch=10,
+            )
+
+    def test_yield_completed_trajectories(self):
+        """With yield_completed_trajectories, collector yields done trajectories."""
+        num_envs = 3
+        max_steps = 5
+        policy = _make_counting_policy()
+
+        collector = AsyncBatchedCollector(
+            create_env_fn=[lambda: CountingEnv(max_steps=max_steps)] * num_envs,
+            policy=policy,
+            frames_per_batch=1,
+            total_frames=30,
+            yield_completed_trajectories=True,
+            max_batch_size=num_envs,
+            env_backend="threading",
+        )
+        count = 0
+        for batch in collector:
+            assert batch is not None
+            # Each trajectory should end with done=True
+            count += batch.numel()
+        collector.shutdown()
+        assert count >= 30
+
+    def test_shutdown_idempotent(self):
+        """Calling shutdown twice should not raise."""
+        policy = _make_counting_policy()
+        collector = AsyncBatchedCollector(
+            create_env_fn=[_counting_env_factory] * 2,
+            policy=policy,
+            frames_per_batch=10,
+            total_frames=10,
+            env_backend="threading",
+        )
+        # Consume one batch to start
+        for _batch in collector:
+            break
+        collector.shutdown()
+        collector.shutdown()  # should not raise
+
+    def test_endless_collector(self):
+        """total_frames=-1 creates an endless collector; verify manual break works."""
+        policy = _make_counting_policy()
+        collector = AsyncBatchedCollector(
+            create_env_fn=[_counting_env_factory] * 2,
+            policy=policy,
+            frames_per_batch=10,
+            total_frames=-1,
+            env_backend="threading",
+        )
+        collected = 0
+        for batch in collector:
+            collected += batch.numel()
+            if collected >= 50:
+                break
+        collector.shutdown()
+        assert collected >= 50
+
+    def test_num_envs(self):
+        """The collector knows the number of environments."""
+        policy = _make_counting_policy()
+        collector = AsyncBatchedCollector(
+            create_env_fn=[_counting_env_factory] * 2,
+            policy=policy,
+            frames_per_batch=10,
+            total_frames=10,
+        )
+        assert collector._num_envs == 2
+        collector.shutdown()
+
+    def test_postproc(self):
+        """Post-processing callable is applied to every batch."""
+        policy = _make_counting_policy()
+        called = {"count": 0}
+
+        def postproc(td):
+            called["count"] += 1
+            return td
+
+        collector = AsyncBatchedCollector(
+            create_env_fn=[_counting_env_factory] * 2,
+            policy=policy,
+            frames_per_batch=10,
+            total_frames=20,
+            postproc=postproc,
+            env_backend="threading",
+        )
+        for _ in collector:
+            pass
+        collector.shutdown()
+        assert called["count"] >= 1

torchrl/collectors/__init__.py

Lines changed: 3 additions & 1 deletion
@@ -6,13 +6,14 @@
 
 from torchrl.modules.tensordict_module.exploration import RandomPolicy
 
+from ._async_batched import AsyncBatchedCollector
 from ._base import BaseCollector, DataCollectorBase, ProfileConfig
 
 from ._multi_async import MultiAsyncCollector, MultiaSyncDataCollector
 from ._multi_base import MultiCollector, MultiCollector as _MultiDataCollector
 from ._multi_sync import MultiSyncCollector, MultiSyncDataCollector
 from ._single import Collector, SyncDataCollector
-
 from ._single_async import AsyncCollector, aSyncDataCollector
 from .weight_update import (
     MultiProcessedWeightUpdater,
@@ -29,6 +30,7 @@
     "AsyncCollector",
     "MultiCollector",
     "MultiSyncCollector",
+    "AsyncBatchedCollector",
     "MultiAsyncCollector",
     "ProfileConfig",
     # Legacy names (backward-compatible aliases)
0 commit comments
