Commit 122bc89

Update (base update)
[ghstack-poisoned]
1 parent 266e4aa commit 122bc89

File tree

18 files changed: +3064 −73 lines


benchmarks/bench_collectors.py

Lines changed: 419 additions & 0 deletions
Large diffs are not rendered by default.

docs/source/reference/collectors.rst

Lines changed: 1 addition & 0 deletions
@@ -12,6 +12,7 @@ making it easy to collect high-quality training data efficiently.

 TorchRL provides several collector implementations optimized for different scenarios:

 - :class:`Collector`: Single-process collection on the training worker
+- :class:`AsyncBatchedCollector`: Async environments + an auto-batching inference server
 - :class:`MultiCollector`: Parallel collection across multiple workers (see below)
 - **Distributed collectors**: For multi-node setups using Ray, RPC, or distributed backends (see :class:`DistributedCollector` / :class:`RPCCollector`)

docs/source/reference/collectors_single.rst

Lines changed: 44 additions & 0 deletions
@@ -15,6 +15,7 @@ Single node data collectors
    BaseCollector
    Collector
    AsyncCollector
+   AsyncBatchedCollector
    MultiCollector
    MultiSyncCollector
    MultiAsyncCollector
@@ -29,6 +30,49 @@ Single node data collectors
 - ``MultiSyncDataCollector`` → ``MultiSyncCollector``
 - ``MultiaSyncDataCollector`` → ``MultiAsyncCollector``

+Using AsyncBatchedCollector
+---------------------------
+
+The :class:`AsyncBatchedCollector` pairs an :class:`~torchrl.envs.AsyncEnvPool`
+with an :class:`~torchrl.modules.InferenceServer` to pipeline environment
+stepping and batched GPU inference. You only need to supply **env factories**
+and a **policy** -- all internal wiring is handled automatically:
+
+.. code-block:: python
+
+    from torchrl.collectors import AsyncBatchedCollector
+    from torchrl.envs import GymEnv
+    from tensordict.nn import TensorDictModule
+    import torch.nn as nn
+
+    policy = TensorDictModule(
+        nn.Sequential(nn.Linear(4, 64), nn.ReLU(), nn.Linear(64, 2)),
+        in_keys=["observation"],
+        out_keys=["action"],
+    )
+
+    collector = AsyncBatchedCollector(
+        create_env_fn=[lambda: GymEnv("CartPole-v1")] * 8,
+        policy=policy,
+        frames_per_batch=200,
+        total_frames=10000,
+        max_batch_size=8,
+    )
+
+    for data in collector:
+        # data is a lazy-stacked TensorDict of collected transitions
+        pass
+
+    collector.shutdown()
+
+**Key advantages over** :class:`Collector`:
+
+- The inference server automatically **batches policy forward passes** from
+  all environments, maximising GPU utilisation.
+- Environment stepping and inference run in **overlapping fashion**, reducing
+  idle time.
+- Supports ``yield_completed_trajectories=True`` for episode-level yields.
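The "no global synchronisation barrier" advantage can be illustrated with some toy arithmetic (illustrative numbers only, not TorchRL code): within a fixed time budget, a lock-step collector is throttled by its slowest environment at every step, while async collection lets each environment contribute frames at its own rate.

```python
# Toy throughput model (illustrative arithmetic only, not torchrl code):
# two envs with different per-step costs and a fixed time budget T.
step_cost = {"fast_env": 1.0, "slow_env": 4.0}
T = 100.0

# Synchronous collection: envs advance in lock-step, so every joint step
# costs as much as the slowest env, for both envs.
sync_frames = len(step_cost) * T / max(step_cost.values())

# Async collection: each env steps at its own pace within the same budget.
async_frames = sum(T / cost for cost in step_cost.values())

print(sync_frames, async_frames)  # 50.0 125.0
```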
 Using MultiCollector
 --------------------

docs/source/reference/modules.rst

Lines changed: 1 addition & 0 deletions
@@ -56,4 +56,5 @@ Documentation Sections
    modules_mcts
    modules_models
    modules_distributions
+   modules_inference_server
    modules_utils
docs/source/reference/modules_inference_server.rst

Lines changed: 120 additions & 0 deletions

@@ -0,0 +1,120 @@
+.. currentmodule:: torchrl.modules.inference_server
+
+Inference Server
+================
+
+.. _ref_inference_server:
+
+The inference server provides auto-batching model serving for RL actors.
+Multiple actors submit individual TensorDicts; the server transparently
+batches them, runs a single model forward pass, and routes results back.
+
+Core API
+--------
+
+.. autosummary::
+    :toctree: generated/
+    :template: rl_template_noinherit.rst
+
+    InferenceServer
+    InferenceClient
+    InferenceTransport
+
+Transport Backends
+------------------
+
+.. autosummary::
+    :toctree: generated/
+    :template: rl_template_noinherit.rst
+
+    ThreadingTransport
+    MPTransport
+    RayTransport
+    MonarchTransport
+
+Usage
+-----
+
+The simplest setup uses :class:`ThreadingTransport` for actors that are
+threads in the same process:
+
+.. code-block:: python
+
+    from tensordict.nn import TensorDictModule
+    from torchrl.modules.inference_server import (
+        InferenceServer,
+        ThreadingTransport,
+    )
+    import torch.nn as nn
+    import concurrent.futures
+
+    policy = TensorDictModule(
+        nn.Sequential(nn.Linear(8, 64), nn.ReLU(), nn.Linear(64, 4)),
+        in_keys=["observation"],
+        out_keys=["action"],
+    )
+
+    transport = ThreadingTransport()
+    server = InferenceServer(policy, transport, max_batch_size=32)
+    server.start()
+    client = transport.client()
+
+    # actor threads call client(td) -- batched automatically
+    with concurrent.futures.ThreadPoolExecutor(16) as pool:
+        ...
+
+    server.shutdown()
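The pool body above is intentionally elided; the auto-batching behaviour it relies on can be sketched in plain Python. This is a toy stand-in for the server/client pair, not the TorchRL implementation: `ToyBatchingServer` and its list-doubling "model" are invented for illustration.

```python
import queue
import threading
import concurrent.futures

class ToyBatchingServer:
    """Toy auto-batching server: actors submit single items, a server
    thread drains the queue up to max_batch_size, runs ONE call over the
    whole batch, and routes each result back to its caller."""

    def __init__(self, model, max_batch_size=32):
        self.model = model  # callable over a list of inputs
        self.max_batch_size = max_batch_size
        self._requests = queue.Queue()
        self._stop = threading.Event()
        self._thread = threading.Thread(target=self._serve, daemon=True)

    def start(self):
        self._thread.start()

    def __call__(self, item):
        # Client entry point: blocks until the batched result arrives.
        fut = concurrent.futures.Future()
        self._requests.put((item, fut))
        return fut.result()

    def _serve(self):
        while not self._stop.is_set():
            try:
                first = self._requests.get(timeout=0.1)
            except queue.Empty:
                continue
            batch = [first]
            # Opportunistically drain whatever else is already queued.
            while len(batch) < self.max_batch_size:
                try:
                    batch.append(self._requests.get_nowait())
                except queue.Empty:
                    break
            items, futs = zip(*batch)
            outputs = self.model(list(items))  # one "forward pass" per batch
            for fut, out in zip(futs, outputs):
                fut.set_result(out)

    def shutdown(self):
        self._stop.set()
        self._thread.join()

server = ToyBatchingServer(lambda xs: [x * 2 for x in xs], max_batch_size=8)
server.start()
with concurrent.futures.ThreadPoolExecutor(16) as pool:
    results = list(pool.map(server, range(100)))
server.shutdown()
print(results[:3])  # [0, 2, 4]
```

Each of the 16 worker threads blocks on its own future, so the server is free to group concurrent requests into a single model call, which is the same effect the real server exploits to batch GPU inference.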
+Weight Synchronisation
+^^^^^^^^^^^^^^^^^^^^^^
+
+The server integrates with :class:`~torchrl.weight_update.WeightSyncScheme`
+to receive updated model weights from a trainer between inference batches:
+
+.. code-block:: python
+
+    from torchrl.weight_update import SharedMemWeightSyncScheme
+
+    weight_sync = SharedMemWeightSyncScheme()
+    # Initialise on the trainer (sender) side first
+    weight_sync.init_on_sender(model=training_model, ...)
+
+    server = InferenceServer(
+        model=inference_model,
+        transport=ThreadingTransport(),
+        weight_sync=weight_sync,
+    )
+    server.start()
+
+    # Training loop
+    for batch in dataloader:
+        optimizer.zero_grad()
+        loss = loss_fn(training_model(batch))
+        loss.backward()
+        optimizer.step()
+        weight_sync.send(model=training_model)  # pushed to server
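The push-style flow above can be mimicked with a tiny versioned snapshot store. This is plain Python for illustration only, not the `WeightSyncScheme` API: `ToyWeightStore` and its method names are invented.

```python
import threading

class ToyWeightStore:
    """Toy weight sync: the trainer publishes a versioned snapshot under a
    lock; the server copies the latest snapshot before an inference batch."""

    def __init__(self, weights):
        self._lock = threading.Lock()
        self._weights = dict(weights)
        self._version = 0

    def send(self, weights):
        # Trainer side: publish a new snapshot (called after optimizer.step()).
        with self._lock:
            self._weights = dict(weights)
            self._version += 1

    def receive(self):
        # Server side: grab the latest snapshot and its version.
        with self._lock:
            return dict(self._weights), self._version

store = ToyWeightStore({"w": 0.0})
for step in range(3):                # stand-in for the training loop
    store.send({"w": float(step)})

weights, version = store.receive()   # server picks this up between batches
print(weights, version)  # {'w': 2.0} 3
```

The lock plus copy-on-read keeps the server's view consistent even if the trainer publishes mid-batch; shared-memory schemes achieve the same consistency without the copy.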
+Integration with Collectors
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The easiest way to use the inference server with RL data collection is
+through :class:`~torchrl.collectors.AsyncBatchedCollector`, which
+creates the server, transport, and env pool automatically:
+
+.. code-block:: python
+
+    from torchrl.collectors import AsyncBatchedCollector
+    from torchrl.envs import GymEnv
+
+    collector = AsyncBatchedCollector(
+        create_env_fn=[lambda: GymEnv("CartPole-v1")] * 8,
+        policy=my_policy,
+        frames_per_batch=200,
+        total_frames=10_000,
+        max_batch_size=8,
+    )
+
+    for data in collector:
+        # train on data ...
+        pass
+
+    collector.shutdown()
Lines changed: 63 additions & 0 deletions

@@ -0,0 +1,63 @@
+"""AsyncBatchedCollector example.
+
+Demonstrates how to use :class:`~torchrl.collectors.AsyncBatchedCollector` to
+run many environments in parallel while automatically batching policy inference
+through an :class:`~torchrl.modules.InferenceServer`.
+
+Architecture:
+- An :class:`~torchrl.envs.AsyncEnvPool` runs environments in parallel
+  using the chosen backend (``"threading"`` or ``"multiprocessing"``).
+- One lightweight coordinator thread per environment owns a slot in the pool
+  and an inference client.
+- An :class:`~torchrl.modules.InferenceServer` batches incoming observations
+  and runs a single forward pass.
+- There is no global synchronisation barrier -- fast envs keep stepping
+  while slow ones wait for inference.
+
+The user only supplies:
+- A list of environment factories
+- A policy (or policy factory)
+"""
+import torch.nn as nn
+from tensordict.nn import TensorDictModule
+
+from torchrl.collectors import AsyncBatchedCollector
+from torchrl.envs import GymEnv
+
+
+def make_env():
+    """Factory that returns a CartPole environment."""
+    return GymEnv("CartPole-v1")
+
+
+def main():
+    num_envs = 4
+    frames_per_batch = 200
+    total_frames = 1_000
+
+    # A simple linear policy (random weights -- just for demonstration)
+    policy = TensorDictModule(
+        nn.Linear(4, 2), in_keys=["observation"], out_keys=["action"]
+    )
+
+    collector = AsyncBatchedCollector(
+        create_env_fn=[make_env] * num_envs,
+        policy=policy,
+        frames_per_batch=frames_per_batch,
+        total_frames=total_frames,
+        max_batch_size=num_envs,
+        device="cpu",
+    )
+
+    total_collected = 0
+    for i, batch in enumerate(collector):
+        n = batch.numel()
+        total_collected += n
+        print(f"Batch {i}: {batch.shape} ({n} frames, total={total_collected})")
+
+    collector.shutdown()
+    print("Done!")
+
+
+if __name__ == "__main__":
+    main()
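The `batch.numel()` bookkeeping in the loop above counts frames as the product of the batch dimensions; a quick plain-Python sanity check of that accounting (toy shape, not torchrl code):

```python
import math

# Toy frame accounting: a lazy-stacked batch of shape
# [num_envs, steps_per_env] contributes num_envs * steps_per_env frames.
def frames_in(shape):
    return math.prod(shape)

batch_shape = (4, 50)  # hypothetical: 4 envs, 50 steps each
print(frames_in(batch_shape))  # 200
```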

setup.cfg

Lines changed: 1 addition & 0 deletions
@@ -36,6 +36,7 @@ per-file-ignores =
     .github/unittest/helpers/*.py: T001, T201
     docs/source/conf.py: T001
     test/test_libs.py: T001
+    benchmarks/*.py: T001, T201
     torchrl/_utils.py: T002

 exclude = venv

0 commit comments
