Update

vmoens · vmoens · commit 8c2309c66fe6 · 2026-02-17T10:17:27.000Z
[ghstack-poisoned]
diff --git a/torchrl/modules/inference_server/_monarch.py b/torchrl/modules/inference_server/_monarch.py
@@ -4,49 +4,12 @@
 # LICENSE file in the root directory of this source tree.
 from __future__ import annotations
 
-import threading
-
 from torchrl.modules.inference_server._queue_transport import (
     _QueueInferenceClient,
     QueueBasedTransport,
 )
 
 
-class _MonarchRequestQueue:
-    """Wrapper around ``MonarchQueue`` that signals a :class:`threading.Event` on put.
-
-    Also adapts the Monarch queue API (``get(block=False)``) to the standard
-    ``get_nowait()`` expected by :class:`QueueBasedTransport`.
-    """
-
-    def __init__(self, monarch_queue, has_work: threading.Event):
-        self._queue = monarch_queue
-        self._has_work = has_work
-
-    def put(self, item):
-        self._queue.put(item)
-        self._has_work.set()
-
-    def get(self, timeout=None):
-        return self._queue.get(timeout=timeout)
-
-    def get_nowait(self):
-        return self._queue.get(block=False)
-
-
-class _MonarchResponseQueue:
-    """Thin wrapper adapting the MonarchQueue get API."""
-
-    def __init__(self, monarch_queue):
-        self._queue = monarch_queue
-
-    def put(self, item):
-        self._queue.put(item)
-
-    def get(self, timeout=None):
-        return self._queue.get(timeout=timeout)
-
-
 class MonarchTransport(QueueBasedTransport):
     """Transport using Monarch for distributed inference on GPU clusters.
 
@@ -74,15 +37,12 @@ def __init__(self, *, max_queue_size: int = 1000):
                 "Monarch is required for MonarchTransport. "
                 "Install it following the Monarch documentation."
             )
-        self._has_work = threading.Event()
-        self._request_queue = _MonarchRequestQueue(
-            MonarchQueue(maxsize=max_queue_size), self._has_work
-        )
-        self._response_queues: dict[int, _MonarchResponseQueue] = {}
+        self._request_queue = MonarchQueue(maxsize=max_queue_size)
+        self._response_queues: dict[int, MonarchQueue] = {}
         self._MonarchQueue = MonarchQueue
 
-    def _make_response_queue(self) -> _MonarchResponseQueue:
-        return _MonarchResponseQueue(self._MonarchQueue(maxsize=1000))
+    def _make_response_queue(self):
+        return self._MonarchQueue(maxsize=1000)
 
     def client(self) -> _QueueInferenceClient:
         """Create an actor-side client with a dedicated response queue.
diff --git a/torchrl/modules/inference_server/_mp.py b/torchrl/modules/inference_server/_mp.py
@@ -5,36 +5,13 @@
 from __future__ import annotations
 
 import multiprocessing as mp
-import threading
 
 from torchrl.modules.inference_server._queue_transport import (
     _QueueInferenceClient,
     QueueBasedTransport,
 )
 
 
-class _MPRequestQueue:
-    """Wrapper around ``mp.Queue`` that signals a :class:`threading.Event` on put.
-
-    This avoids the get-then-put anti-pattern in ``wait_for_work``: instead of
-    consuming an item just to peek, callers wait on the event.
-    """
-
-    def __init__(self, ctx: mp.context.BaseContext, has_work: threading.Event):
-        self._queue: mp.Queue = ctx.Queue()
-        self._has_work = has_work
-
-    def put(self, item):
-        self._queue.put(item)
-        self._has_work.set()
-
-    def get(self, timeout=None):
-        return self._queue.get(timeout=timeout)
-
-    def get_nowait(self):
-        return self._queue.get_nowait()
-
-
 class MPTransport(QueueBasedTransport):
     """Cross-process transport using :mod:`multiprocessing` queues.
 
@@ -58,8 +35,7 @@ class MPTransport(QueueBasedTransport):
     def __init__(self, ctx: mp.context.BaseContext | None = None):
         super().__init__()
         self._ctx = ctx if ctx is not None else mp.get_context("spawn")
-        self._has_work = threading.Event()
-        self._request_queue = _MPRequestQueue(self._ctx, self._has_work)
+        self._request_queue: mp.Queue = self._ctx.Queue()
         self._response_queues: dict[int, mp.Queue] = {}
 
     def _make_response_queue(self) -> mp.Queue:
diff --git a/torchrl/modules/inference_server/_queue_transport.py b/torchrl/modules/inference_server/_queue_transport.py
@@ -74,7 +74,7 @@ class _QueueInferenceClient:
     request-id.
 
     Args:
-        request_queue: the shared request queue.
+        request_queue: the shared request queue (any object with ``.put()``).
         response_queue: this client's dedicated response queue.
         actor_id: the unique identifier assigned by the transport.
     """
@@ -122,18 +122,23 @@ def _get_result(self, req_id: int, timeout: float | None = None) -> Any:
 class QueueBasedTransport(InferenceTransport):
     """Base class for transports that use a request queue and per-actor response queues.
 
-    Subclasses must set the following attributes before calling ``super().__init__()``:
+    Subclasses must set the following attributes in ``__init__`` (before or
+    after calling ``super().__init__()``):
 
-    * ``_request_queue`` -- the shared request queue (any object with ``.put()``,
-      ``.get(timeout=...)``, and ``.get_nowait()`` / ``.get(block=False)``).
+    * ``_request_queue`` -- the shared request queue (any object with
+      ``.put()``, ``.get(timeout=...)``, and ``.get(block=False)``).
     * ``_response_queues`` -- a ``dict[int, <queue>]`` mapping actor ids to
       per-actor response queues.
-    * ``_has_work`` -- a :class:`threading.Event` that is set whenever a new
-      request is enqueued (used for non-blocking ``wait_for_work``).
 
     Subclasses must implement:
 
     * :meth:`_make_response_queue` -- factory for creating a new response queue.
+
+    .. note::
+        ``wait_for_work`` peeks by doing a blocking ``get`` then a ``put``.
+        Nothing is lost because a single server thread calls ``wait_for_work``
+        and ``drain`` sequentially, with no other consumer in between; note
+        that on a FIFO queue the re-enqueued item moves behind newer requests.
     """
 
     def __init__(self):
@@ -178,7 +183,7 @@ def drain(
         callbacks: list[tuple[int, int]] = []
         for _ in range(max_items):
             try:
-                actor_id, req_id, td = self._request_queue.get_nowait()
+                actor_id, req_id, td = self._request_queue.get(block=False)
             except Exception:
                 break
             items.append(td)
@@ -187,8 +192,12 @@ def drain(
 
     def wait_for_work(self, timeout: float) -> None:
         """Block until at least one request is available or *timeout* elapses."""
-        self._has_work.wait(timeout=timeout)
-        self._has_work.clear()
+        try:
+            item = self._request_queue.get(timeout=timeout)
+            # Put it back so drain() can consume it.
+            self._request_queue.put(item)
+        except Exception:
+            pass
 
     def resolve(self, callback: tuple[int, int], result: TensorDictBase) -> None:
         """Route the result to the correct actor's response queue."""
diff --git a/torchrl/modules/inference_server/_ray.py b/torchrl/modules/inference_server/_ray.py
@@ -4,49 +4,12 @@
 # LICENSE file in the root directory of this source tree.
 from __future__ import annotations
 
-import threading
-
 from torchrl.modules.inference_server._queue_transport import (
     _QueueInferenceClient,
     QueueBasedTransport,
 )
 
 
-class _RayRequestQueue:
-    """Wrapper around ``ray.util.queue.Queue`` that signals a :class:`threading.Event` on put.
-
-    Also adapts the Ray queue API (``get(block=False)``) to the standard
-    ``get_nowait()`` expected by :class:`QueueBasedTransport`.
-    """
-
-    def __init__(self, ray_queue, has_work: threading.Event):
-        self._queue = ray_queue
-        self._has_work = has_work
-
-    def put(self, item):
-        self._queue.put(item)
-        self._has_work.set()
-
-    def get(self, timeout=None):
-        return self._queue.get(timeout=timeout)
-
-    def get_nowait(self):
-        return self._queue.get(block=False)
-
-
-class _RayResponseQueue:
-    """Thin wrapper around ``ray.util.queue.Queue`` that adapts the get API."""
-
-    def __init__(self, ray_queue):
-        self._queue = ray_queue
-
-    def put(self, item):
-        self._queue.put(item)
-
-    def get(self, timeout=None):
-        return self._queue.get(timeout=timeout)
-
-
 class RayTransport(QueueBasedTransport):
     """Transport using Ray queues for distributed inference.
 
@@ -77,15 +40,12 @@ def __init__(self, *, max_queue_size: int = 1000):
             raise ImportError(
                 "Ray is required for RayTransport. Install it with: pip install ray"
             )
-        self._has_work = threading.Event()
-        self._request_queue = _RayRequestQueue(
-            ray.util.queue.Queue(maxsize=max_queue_size), self._has_work
-        )
-        self._response_queues: dict[int, _RayResponseQueue] = {}
+        self._request_queue = ray.util.queue.Queue(maxsize=max_queue_size)
+        self._response_queues: dict[int, ray.util.queue.Queue] = {}
         self._ray_queue_module = ray.util.queue
 
-    def _make_response_queue(self) -> _RayResponseQueue:
-        return _RayResponseQueue(self._ray_queue_module.Queue(maxsize=1000))
+    def _make_response_queue(self):
+        return self._ray_queue_module.Queue(maxsize=1000)
 
     def client(self) -> _QueueInferenceClient:
         """Create an actor-side client with a dedicated Ray response queue.