Skip to content

Commit 52dc84e

Browse files
vmoens and cursoragent committed
[Feature] Auto-batching inference server: Ray transport (#3495)
Adds RayTransport using ray.util.queue.Queue for distributed inference across Ray actors. Ray is imported lazily at instantiation time.

Co-authored-by: Cursor <cursoragent@cursor.com>
ghstack-source-id: 660ee98
Pull-Request: #3495
1 parent 7abafc5 commit 52dc84e

File tree

3 files changed

+151
-0
lines changed

3 files changed

+151
-0
lines changed

test/test_inference_server.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,16 @@
2020
InferenceServer,
2121
InferenceTransport,
2222
MPTransport,
23+
RayTransport,
2324
ThreadingTransport,
2425
)
2526

27+
_has_ray = True
28+
try:
29+
import ray
30+
except ImportError:
31+
_has_ray = False
32+
2633

2734
# =============================================================================
2835
# Helpers
@@ -398,3 +405,88 @@ def bad_model(td):
398405
td = TensorDict({"observation": torch.randn(4)})
399406
with pytest.raises(ValueError, match="mp model error"):
400407
client(td)
408+
409+
410+
# =============================================================================
411+
# Tests: RayTransport (Commit 4)
412+
# =============================================================================
413+
414+
415+
@pytest.mark.skipif(not _has_ray, reason="ray not installed")
class TestRayTransport:
    """End-to-end tests for :class:`RayTransport` against a live Ray runtime."""

    @classmethod
    def setup_class(cls):
        # Bring up a small local Ray runtime once for the whole class.
        if not ray.is_initialized():
            ray.init(num_cpus=4, ignore_reinit_error=True)

    def test_single_request(self):
        # A single client round-trip through the server yields an action.
        transport = RayTransport()
        client = transport.client()
        policy = _make_policy()
        with InferenceServer(policy, transport, max_batch_size=4):
            out = client(TensorDict({"observation": torch.randn(4)}))
            assert "action" in out.keys()
            assert out["action"].shape == (2,)

    def test_concurrent_clients(self):
        """Multiple clients submit concurrently from threads (simulating Ray actors)."""
        transport = RayTransport()
        policy = _make_policy()
        num_workers = 4
        per_worker = 20

        handles = [transport.client() for _ in range(num_workers)]
        collected: list[list[TensorDictBase]] = [[] for _ in range(num_workers)]

        def worker(idx):
            # Each thread drives its own client handle end to end.
            for _ in range(per_worker):
                request = TensorDict({"observation": torch.randn(4)})
                collected[idx].append(handles[idx](request))

        with InferenceServer(policy, transport, max_batch_size=8):
            with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as pool:
                pending = [pool.submit(worker, i) for i in range(num_workers)]
                concurrent.futures.wait(pending)
                # Re-raise any exception captured inside a worker thread.
                for fut in pending:
                    fut.result()

        for outputs in collected:
            assert len(outputs) == per_worker
            for out in outputs:
                assert "action" in out.keys()
                assert out["action"].shape == (2,)

    def test_ray_remote_actor(self):
        """A Ray remote actor can use the client to get inference results."""
        transport = RayTransport()
        client = transport.client()
        policy = _make_policy()

        @ray.remote
        def remote_actor_fn(client, n_requests):
            # Collect only the action shapes so the return value is picklable.
            shapes = []
            for _ in range(n_requests):
                td = TensorDict({"observation": torch.randn(4)})
                shapes.append(client(td)["action"].shape)
            return shapes

        with InferenceServer(policy, transport, max_batch_size=8):
            shapes = ray.get(remote_actor_fn.remote(client, 5), timeout=30.0)
            assert len(shapes) == 5
            assert all(s == (2,) for s in shapes)

    def test_ray_exception_propagates(self):
        # Server-side model failures must surface on the client call site.
        def bad_model(td):
            raise ValueError("ray model error")

        transport = RayTransport()
        client = transport.client()
        with InferenceServer(bad_model, transport, max_batch_size=4):
            with pytest.raises(ValueError, match="ray model error"):
                client(TensorDict({"observation": torch.randn(4)}))

torchrl/modules/inference_server/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
# LICENSE file in the root directory of this source tree.
55

66
from torchrl.modules.inference_server._mp import MPTransport
7+
from torchrl.modules.inference_server._ray import RayTransport
78
from torchrl.modules.inference_server._server import InferenceClient, InferenceServer
89
from torchrl.modules.inference_server._threading import ThreadingTransport
910
from torchrl.modules.inference_server._transport import InferenceTransport
@@ -13,5 +14,6 @@
1314
"InferenceServer",
1415
"InferenceTransport",
1516
"MPTransport",
17+
"RayTransport",
1618
"ThreadingTransport",
1719
]
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
#
3+
# This source code is licensed under the MIT license found in the
4+
# LICENSE file in the root directory of this source tree.
5+
from __future__ import annotations
6+
7+
from torchrl.modules.inference_server._queue_transport import (
8+
_QueueInferenceClient,
9+
QueueBasedTransport,
10+
)
11+
12+
13+
class RayTransport(QueueBasedTransport):
    """Transport using Ray queues for distributed inference.

    Uses ``ray.util.queue.Queue`` for both request submission and response
    routing. Per-actor response queues ensure correct result routing without
    serialising Queue objects through other queues.

    Ray is imported lazily at instantiation time; importing the class itself
    does not require Ray.

    Keyword Args:
        max_queue_size (int): maximum size of the request queue and of each
            per-client response queue. Default: ``1000``.

    Example:
        >>> import ray
        >>> ray.init()
        >>> transport = RayTransport()
        >>> client = transport.client()
        >>> # pass *client* to a Ray actor for remote inference requests
    """

    def __init__(self, *, max_queue_size: int = 1000):
        super().__init__()
        try:
            # Lazy import: Ray is only required once a transport is built.
            import ray.util.queue
        except ImportError as err:
            # Chain the original exception so the real import failure
            # (e.g. a broken partial install) stays visible in the traceback.
            raise ImportError(
                "Ray is required for RayTransport. Install it with: pip install ray"
            ) from err
        self._max_queue_size = max_queue_size
        self._request_queue = ray.util.queue.Queue(maxsize=max_queue_size)
        self._response_queues: dict[int, ray.util.queue.Queue] = {}
        # Keep a handle to the module so response queues can be created later
        # without re-importing in _make_response_queue.
        self._ray_queue_module = ray.util.queue

    def _make_response_queue(self):
        # Use the configured bound instead of a hard-coded 1000 so request and
        # response queues stay consistent (default is unchanged).
        return self._ray_queue_module.Queue(maxsize=self._max_queue_size)

    def client(self) -> _QueueInferenceClient:
        """Create an actor-side client with a dedicated Ray response queue.

        Returns:
            A :class:`_QueueInferenceClient` that can be used inside any Ray
            actor or the driver process.
        """
        return super().client()

0 commit comments

Comments (0)