Commit 0e4dabe

[Feature] Remote LLM wrappers and batching (#3116)
1 parent 744f061 commit 0e4dabe

6 files changed: +784 -14 lines changed

docs/source/reference/llms.rst

Lines changed: 95 additions & 0 deletions
@@ -436,12 +436,107 @@ The main goal of these primitives is to:
     LLMWrapperBase
     TransformersWrapper
     vLLMWrapper
+    RemoteTransformersWrapper
+    RemotevLLMWrapper
     ChatHistory
     Text
     LogProbs
     Masks
     Tokens
 
+Remote Wrappers
+^^^^^^^^^^^^^^^
+
+TorchRL provides remote wrapper classes that enable distributed execution of LLM wrappers via Ray. These wrappers expose a simplified interface that requires no explicit ``remote()`` and ``get()`` calls, which makes them easy to use in distributed settings.
+
+**Key Features:**
+
+- **Simplified Interface**: No need to call ``remote()`` and ``get()`` explicitly
+- **Full API Compatibility**: Exposes all public methods of the base ``LLMWrapperBase`` class
+- **Automatic Ray Management**: Handles Ray initialization and remote execution internally
+- **Property Access**: All properties are accessible through the remote wrapper
+- **Error Handling**: Errors raised on remote actors are propagated to the caller
+- **Resource Management**: Context-manager support for automatic cleanup
+
+**Model Parameter Requirements:**
+
+- **RemotevLLMWrapper**: Accepts string model names/paths (recommended) or remote vLLM LLM objects with Ray handles. Local vLLM models are not serializable.
+- **RemoteTransformersWrapper**: Accepts string model names/paths only, since Transformers models are not serializable.
+
+**Usage Examples:**
+
+.. code-block:: python
+
+    import ray
+    from tensordict import TensorDict
+
+    from torchrl.data.llm import History
+    from torchrl.modules.llm.policies import (
+        ChatHistory,
+        RemoteTransformersWrapper,
+        RemotevLLMWrapper,
+        Text,
+    )
+
+    # Initialize Ray (if not already done)
+    if not ray.is_initialized():
+        ray.init()
+
+    # Use a context manager for proper cleanup (recommended)
+    with RemotevLLMWrapper(
+        model="gpt2",
+        max_concurrency=16,  # control concurrent calls
+        input_mode="history",
+        generate=True,
+        generate_kwargs={"max_new_tokens": 50, "temperature": 0.7},
+    ) as remote_wrapper:
+
+        # Create a test input
+        history = History.from_chats([[
+            {"role": "user", "content": "Hello, how are you?"}
+        ]])
+        chat_history = ChatHistory(prompt=history)
+        tensordict_input = TensorDict(history=chat_history, batch_size=(1,))
+
+        # Use like a regular wrapper (no remote()/get() calls needed!)
+        result = remote_wrapper(tensordict_input)
+        print(result["text"].response)
+
+    # Transformers wrapper (string models only)
+    with RemoteTransformersWrapper(
+        model="gpt2",
+        max_concurrency=16,
+        input_mode="text",
+        generate=True,
+        generate_kwargs={"max_new_tokens": 30},
+    ) as remote_transformers:
+
+        text_input = TensorDict({"text": Text(prompt="Hello world")}, batch_size=(1,))
+        result = remote_transformers(text_input)
+        print(result["text"].response)
+
+**Cleanup and Resource Management:**
+
+The remote wrappers implement the context-manager protocol for proper resource cleanup:
+
+.. code-block:: python
+
+    # Context manager (recommended)
+    with RemotevLLMWrapper(model="gpt2") as wrapper:
+        result = wrapper(input_data)
+    # Cleanup is automatic when exiting the context
+
+    # Manual cleanup
+    wrapper = RemotevLLMWrapper(model="gpt2")
+    try:
+        result = wrapper(input_data)
+    finally:
+        wrapper.cleanup_batching()  # important: prevents hanging on batching locks
+
+**Performance Considerations:**
+
+- **Network Overhead**: Remote execution adds network communication overhead
+- **Serialization**: Data is serialized whenever it is sent to a remote actor
+- **Memory**: Each remote actor maintains its own copy of the model
+- **Concurrency**: Multiple remote wrappers can run concurrently
+- **Max Concurrency**: Use the ``max_concurrency`` parameter to bound the number of concurrent calls to each remote actor
+- **Cleanup**: Always use a context manager or call ``cleanup_batching()`` to avoid hanging on batching locks
+
 Utils
 ^^^^^
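
The documentation above demonstrates `max_concurrency`, while the new `batching` constructor flag introduced by this commit only appears in the test below. A minimal sketch of the batching usage, assuming the constructor arguments from this commit's test (`batching=True`, a string model name) and an already-initialized Ray cluster; an illustration, not an official recipe:

    from concurrent.futures import ThreadPoolExecutor

    from tensordict import TensorDict
    from torchrl.modules.llm.policies import RemotevLLMWrapper, Text

    # Sketch only: arguments mirror test/llm/test_wrapper.py below.
    with RemotevLLMWrapper(
        model="Qwen/Qwen2.5-0.5B",
        input_mode="text",
        generate=True,
        batching=True,  # coalesce concurrent single-item calls into one batch
        generate_kwargs={"max_new_tokens": 10},
    ) as wrapper:
        data = TensorDict(text=Text(prompt="Hello"), batch_size=())
        # Ten threads submit unbatched inputs; the wrapper batches them
        # internally and hands one unbatched result back to each caller.
        with ThreadPoolExecutor(max_workers=10) as executor:
            futures = [executor.submit(wrapper, data) for _ in range(10)]
            results = [f.result() for f in futures]
        assert all(r.batch_size == () for r in results)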

test/llm/test_wrapper.py

Lines changed: 47 additions & 0 deletions
@@ -2850,6 +2850,53 @@ def __init__(self):
         assert result["text"].prompt == "Single question without batch dimension?"
 
 
+class TestRayWrapper:
+    @pytest.mark.parametrize("backend", ["transformers", "vllm"])
+    def test_ray_wrapper(self, sample_text, backend):
+        import gc
+        from concurrent.futures import ThreadPoolExecutor
+
+        from torchrl import logger as torchrl_logger
+        from torchrl.modules.llm.policies import (
+            RemoteTransformersWrapper,
+            RemotevLLMWrapper,
+        )
+
+        # check that the wrapper is remote
+        if backend == "vllm":
+            cls = RemotevLLMWrapper
+        elif backend == "transformers":
+            cls = RemoteTransformersWrapper
+        else:
+            raise ValueError(f"Invalid backend: {backend}")
+        model = cls(
+            model="Qwen/Qwen2.5-0.5B",
+            generate=True,
+            input_mode="text",
+            batching=True,
+            generate_kwargs={"max_new_tokens": 10},
+        )
+        try:
+            # check batching
+            data = TensorDict(
+                text=Text(prompt=sample_text[0]),
+                batch_size=(),
+            )
+            with ThreadPoolExecutor(max_workers=10) as executor:
+                futures = [executor.submit(model, data) for _ in range(10)]
+                torchrl_logger.info(f"Futures: {futures}")
+                results = [future.result() for future in futures]
+                torchrl_logger.info(f"Results: {results}")
+            assert all(result.batch_size == () for result in results)
+            assert all(
+                isinstance(result["text"].response, str) for result in results
+            )
+            torchrl_logger.info("Batching test passed")
+        finally:
+            del model
+            gc.collect()
+
+
 if __name__ == "__main__":
     args, unknown = argparse.ArgumentParser().parse_known_args()
     pytest.main([__file__, "--capture", "no", "--exitfirst"] + unknown)

torchrl/modules/llm/policies/__init__.py

Lines changed: 4 additions & 2 deletions
@@ -6,13 +6,15 @@
 from __future__ import annotations
 
 from .common import ChatHistory, LLMWrapperBase, LogProbs, Masks, Text, Tokens
-from .transformers_wrapper import TransformersWrapper
+from .transformers_wrapper import RemoteTransformersWrapper, TransformersWrapper
 
-from .vllm_wrapper import vLLMWrapper
+from .vllm_wrapper import RemotevLLMWrapper, vLLMWrapper
 
 __all__ = [
     "TransformersWrapper",
+    "RemoteTransformersWrapper",
     "vLLMWrapper",
+    "RemotevLLMWrapper",
     "LLMWrapperBase",
     "Text",
     "LogProbs",

torchrl/modules/llm/policies/common.py

Lines changed: 7 additions & 1 deletion
@@ -7,7 +7,6 @@
 import threading
 import warnings
 import weakref
-
 from functools import wraps
 from typing import Any, Literal, overload
 
@@ -1334,6 +1333,13 @@ def _batched_func(self, td_input: TensorDictBase, **kwargs):
            ).unbind(0)
            for i, future in enumerate(not_done):
                future.set_result(results[i])
+           # prune the just-completed futures (and their queued inputs);
+           # `not_done` holds the futures resolved in this pass
+           self._batch_queue = [
+               q
+               for q, f in zip(self._batch_queue, futures)
+               if f not in not_done
+           ]
+           self._futures = [f for f in self._futures if f not in not_done]
        except Exception as e:
            # Set exception for remaining futures that haven't been completed yet
            for future in not_done:
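
For context on the hunk above: `not_done` holds the futures that were just resolved in this flush, so the two list comprehensions drop exactly those entries from `_batch_queue` and `_futures`. Without the pruning, the shared queue keeps growing and later flushes would re-process already-answered requests. A standalone sketch of the pattern, with hypothetical names (`BatchQueue`, `inputs`, `pending`) rather than TorchRL's actual internals:

    from concurrent.futures import Future

    class BatchQueue:
        """Toy stand-in for the wrapper's internal batching state."""

        def __init__(self) -> None:
            self.inputs: list[str] = []      # queued inputs, aligned with `pending`
            self.pending: list[Future] = []  # one future per queued input

        def submit(self, item: str) -> Future:
            fut: Future = Future()
            self.inputs.append(item)
            self.pending.append(fut)
            return fut

        def flush(self, batch_size: int) -> None:
            # Resolve the front slice of waiting futures...
            done = self.pending[:batch_size]
            for item, fut in zip(self.inputs, done):
                fut.set_result(item.upper())  # stand-in for model inference
            # ...then prune the resolved futures and their inputs, keeping the
            # two lists aligned. This mirrors the hunk above.
            self.inputs = [q for q, f in zip(self.inputs, self.pending) if f not in done]
            self.pending = [f for f in self.pending if f not in done]

    queue = BatchQueue()
    futs = [queue.submit(s) for s in ("a", "b", "c")]
    queue.flush(batch_size=2)
    assert futs[0].result() == "A" and not futs[2].done()
    assert len(queue.pending) == 1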

0 commit comments
