
Commit db9d283

[Feature] batching for vllm and transformers wrappers (#3103)
1 parent 0c05bba commit db9d283

File tree: 4 files changed, +315 -2 lines changed

test/llm/test_wrapper.py

Lines changed: 179 additions & 1 deletion
@@ -8,6 +8,7 @@
 import importlib.util

 import os
+import time
 from functools import partial

 import pytest
@@ -72,7 +73,11 @@ def vllm_instance():
     assert os.environ.get("VLLM_USE_V1") == "0"

     try:
-        model = LLM("Qwen/Qwen2.5-0.5B")
+        model = LLM(
+            "Qwen/Qwen2.5-0.5B",
+            max_num_batched_tokens=32768,  # Match max_model_len
+            max_model_len=32768,
+        )
         tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B")
         tokenizer.pad_token = tokenizer.eos_token
         return model, tokenizer
@@ -717,6 +722,179 @@ def test_generate_false_without_log_probs(
             return_log_probs=False,
         )

+    # ================================================
+    # Batching Tests
+    # ================================================
+
+    @pytest.mark.parametrize(
+        "wrapper_class",
+        [vLLMWrapper, TransformersWrapperMaxTokens],
+        ids=["vllm", "transformers"],
+    )
+    def test_batching(self, wrapper_class, vllm_instance, transformers_instance):
+        from concurrent.futures import ThreadPoolExecutor, wait
+
+        # Handle the case where vLLM is not available
+        if wrapper_class == vLLMWrapper:
+            try:
+                model, tokenizer = vllm_instance
+            except Exception as e:
+                if "vLLM compatibility issue" in str(e):
+                    pytest.skip("vLLM not available due to compatibility issues")
+                raise
+        else:
+            model, tokenizer = transformers_instance
+
+        wrapper = wrapper_class(
+            model,
+            tokenizer=tokenizer,
+            input_mode="text",
+            generate=True,
+            return_log_probs=True,
+            batch_size=4,
+        )
+        # Create 2 threads and send inputs
+        inputs = [
+            TensorDict(
+                text=Text(prompt=[f"Question {i}?", f"Question {i+2}?"]),
+                batch_size=(2,),
+            )
+            for i in range(2)
+        ]
+        pool = ThreadPoolExecutor(max_workers=2)
+        try:
+            futures = [pool.submit(wrapper, input) for input in inputs]
+            wait(futures)
+        finally:
+            pool.shutdown(wait=False, cancel_futures=True)
+
+    @pytest.mark.parametrize(
+        "wrapper_class",
+        [vLLMWrapper, TransformersWrapperMaxTokens],
+        ids=["vllm", "transformers"],
+    )
+    def test_batching_uneven(self, wrapper_class, vllm_instance, transformers_instance):
+        from concurrent.futures import ThreadPoolExecutor, wait
+
+        if wrapper_class == vLLMWrapper:
+            model, tokenizer = vllm_instance
+        else:
+            model, tokenizer = transformers_instance
+        wrapper = wrapper_class(
+            model,
+            tokenizer=tokenizer,
+            input_mode="text",
+            generate=True,
+            return_log_probs=True,
+            batch_size=5,
+            batching_timeout=5,  # Increased timeout for CI environments
+        )
+        inputs = [
+            TensorDict(text=Text(prompt=["Question 1?"]), batch_size=(1,)),
+            TensorDict(
+                text=Text(prompt=["Question 2?", "Question 3?", "Question 4?"]),
+                batch_size=(3,),
+            ),
+            TensorDict(
+                text=Text(prompt=["Question 5?", "Question 6?"]), batch_size=(2,)
+            ),
+        ]
+        pool = ThreadPoolExecutor(max_workers=3)
+        try:
+            futures = []
+            for input in inputs:
+                futures.append(pool.submit(wrapper, input))
+                time.sleep(0.05)  # Increased delay for more reliable timing
+
+            # Wait for first two futures with longer timeout
+            wait(futures[:2], timeout=3)
+
+            # Check results with more flexible assertions
+            result0 = futures[0].result()
+            result1 = futures[1].result()
+
+            assert result0["text"].prompt == ["Question 1?"]
+            assert result1["text"].prompt == [
+                "Question 2?",
+                "Question 3?",
+                "Question 4?",
+            ]
+
+            # The third future may or may not be done depending on timing
+            # Wait for it with a reasonable timeout
+            wait(futures[2:], timeout=10)
+            if not futures[2].done():
+                raise RuntimeError("Third future not done")
+            result2 = futures[2].result()
+            assert result2["text"].prompt == ["Question 5?", "Question 6?"]
+        finally:
+            pool.shutdown(wait=False, cancel_futures=True)
+
+    @pytest.mark.parametrize(
+        "wrapper_class",
+        [vLLMWrapper, TransformersWrapperMaxTokens],
+        ids=["vllm", "transformers"],
+    )
+    def test_batching_cleanup(
+        self, wrapper_class, vllm_instance, transformers_instance
+    ):
+        """Test batching cleanup functionality."""
+        if wrapper_class == vLLMWrapper:
+            model, tokenizer = vllm_instance
+        else:
+            model, tokenizer = transformers_instance
+
+        wrapper = wrapper_class(
+            model,
+            tokenizer=tokenizer,
+            input_mode="text",
+            generate=True,
+            return_log_probs=True,
+            batch_size=3,
+        )
+
+        # Check initial state
+        state = wrapper.get_batching_state()
+        assert state["batching_enabled"] is True
+        assert state["batch_size"] == 3
+        assert state["queue_size"] == 0
+        assert state["pending_futures"] == 0
+
+        # Add some inputs to the queue
+        input1 = TensorDict(text=Text(prompt=["Test 1"]), batch_size=(1,))
+        input2 = TensorDict(text=Text(prompt=["Test 2"]), batch_size=(1,))
+
+        # Submit inputs (they won't be processed immediately due to batch size)
+        from concurrent.futures import ThreadPoolExecutor
+
+        pool = ThreadPoolExecutor(max_workers=1)
+        try:
+            future1 = pool.submit(wrapper, input1)
+            future2 = pool.submit(wrapper, input2)
+
+            # Check state after adding inputs
+            state = wrapper.get_batching_state()
+            assert state["queue_size"] >= 0  # May be 0 if processed immediately
+            assert state["pending_futures"] >= 0
+
+            # Clean up
+            wrapper.cleanup_batching()
+
+            # Check state after cleanup
+            state = wrapper.get_batching_state()
+            assert state["queue_size"] == 0
+            assert state["pending_futures"] == 0
+
+            # Wait for futures to complete or fail
+            try:
+                future1.result(timeout=5)
+                future2.result(timeout=5)
+            except Exception:
+                # Futures may fail after cleanup, which is expected
+                pass
+        finally:
+            pool.shutdown(wait=False, cancel_futures=True)
+
     # ================================================
     # Batch Size Tests
     # ================================================

torchrl/modules/llm/policies/common.py

Lines changed: 105 additions & 0 deletions
@@ -6,6 +6,8 @@

 import warnings
 import weakref
+
+from functools import wraps
 from typing import Any, Literal, overload

 import torch
@@ -372,6 +374,12 @@ class LLMWrapperBase(TensorDictModuleBase):
         text_key (NestedKey | None, optional): The key for the action :class:`~torchrl.modules.llm.policies.Text` object. Defaults to `"text"`.
         tokens_key (NestedKey | None, optional): The key for the action :class:`~torchrl.modules.llm.policies.Tokens` object. Defaults to `"tokens"`.
         masks_key (NestedKey | None, optional): The key for the action :class:`~torchrl.modules.llm.policies.Masks` object. Defaults to `"masks"`.
+        batch_size (int | None, optional): The batch size to use for batching. If None, no batching is done. If provided, the module will batch the inputs and process them in batches of this size.
+            This means that a single call to the module will wait until enough inputs are available to form a batch of this size, and then process the batch.
+            This functionality uses concurrent futures to process the batches in parallel and therefore is best used in a multi-threaded environment.
+            Defaults to `None`.
+        batching_timeout (float, optional): The timeout for batching. If the batch isn't completed after `batching_timeout` seconds, the batch is processed as is.
+            Defaults to `10` seconds.

     Attributes:
         collector: The collector associated with the module, if it exists.
@@ -393,6 +401,7 @@ class LLMWrapperBase(TensorDictModuleBase):
     device: torch.device | None
     layout: torch.layout | None
     num_samples: int | None
+    _batching_timeout: float | None

     @overload
     def __init__(
@@ -419,6 +428,8 @@ def __init__(
         tokens_key: NestedKey | None = "tokens",
         masks_key: NestedKey | None = "masks",
         log_probs_key: NestedKey | None = "log_probs",
+        batch_size: int | None = None,
+        batching_timeout: float = 10.0,
     ):
         ...

@@ -907,6 +918,35 @@ def log_prob(self, data: TensorDictBase, **get_kwargs) -> TensorDictBase:
             return data.get((self.log_prob_key, "response"), **get_kwargs)
         raise RuntimeError("log_prob not callable when generate=True.")

+    def cleanup_batching(self):
+        """Clear batching queues to prevent memory leaks.
+
+        This method should be called when the wrapper is no longer needed
+        or when you want to reset the batching state.
+        """
+        if hasattr(self, "_batch_queue"):
+            self._batch_queue.clear()
+        if hasattr(self, "_futures"):
+            self._futures.clear()
+
+    def get_batching_state(self):
+        """Get the current batching state for debugging and monitoring.
+
+        Returns:
+            dict: A dictionary containing the current batching state including
+                queue size, number of pending futures, and batch size.
+        """
+        if not hasattr(self, "batch_size") or self.batch_size is None:
+            return {"batching_enabled": False}
+
+        return {
+            "batching_enabled": True,
+            "batch_size": self.batch_size,
+            "queue_size": len(getattr(self, "_batch_queue", [])),
+            "pending_futures": len(getattr(self, "_futures", [])),
+            "timeout": getattr(self, "_batching_timeout", None),
+        }
+

 def _extract_responses_from_full_histories(
     text_full: list[str],
@@ -973,3 +1013,68 @@ def _extract_responses_from_full_histories(
         return torch.stack(padded_responses)

     return torch.stack(response_histories)
+
+
+def _batching(func):
+    from concurrent.futures import Future, wait
+
+    @wraps(func)
+    def _batched_func(self, td_input: TensorDictBase, **kwargs):
+        if getattr(self, "batch_size", None) is not None:
+            # put elements in a queue until the batch size is reached
+            if td_input.batch_dims == 0:
+                inputs = [td_input]
+            else:
+                if td_input.batch_dims > 1:
+                    raise ValueError(
+                        f"Batching not supported for batch_dims > 1: {td_input.batch_dims}"
+                    )
+                inputs = list(td_input.unbind(0))
+
+            # Create as many futures as inputs
+            futures = [Future() for _ in inputs]
+
+            self._batch_queue.extend(inputs)
+            self._futures.extend(futures)
+
+            # Check if we have enough inputs to form a complete batch
+            if len(self._batch_queue) >= self.batch_size:
+                # Process full batch immediately
+                try:
+                    batch = lazy_stack(self._batch_queue[: self.batch_size])
+                    results = func(self, batch, **kwargs)
+                    batch_results = results.unbind(0)
+                    for i, future in enumerate(self._futures[: self.batch_size]):
+                        future.set_result(batch_results[i])
+                    self._batch_queue = self._batch_queue[self.batch_size :]
+                    self._futures = self._futures[self.batch_size :]
+                except Exception as e:
+                    # Set exception for all futures in this batch
+                    for future in futures:
+                        future.set_exception(e)
+                    raise
+
+            # Now wait for the current futures to complete (with timeout if needed)
+            _, not_done = wait(futures, timeout=self._batching_timeout)
+
+            # if there are still futures not done, process them as is
+            if not_done:
+                try:
+                    inputs_not_done = [
+                        inputs[futures.index(future)] for future in not_done
+                    ]
+                    results = func(self, torch.stack(inputs_not_done), **kwargs).unbind(
+                        0
+                    )
+                    for i, future in enumerate(not_done):
+                        future.set_result(results[i])
+                except Exception as e:
+                    # Set exception for remaining futures
+                    for future in not_done:
+                        future.set_exception(e)
+                    raise
+
+            return lazy_stack([future.result() for future in futures])
+        return func(self, td_input, **kwargs)
+
+    return _batched_func
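
The docstring added above notes that batching relies on concurrent futures and is meant for multi-threaded callers. To illustrate the queue/futures/timeout pattern that `_batching` follows, here is a minimal, self-contained sketch; `EchoBatcher` and its `_process` method are hypothetical stand-ins for the wrapper and the underlying model call, not part of torchrl.

# Minimal sketch (not torchrl code): queue inputs, release a full batch at once,
# and flush whatever is left after a timeout, with one Future per caller.
import threading
from concurrent.futures import Future, ThreadPoolExecutor, wait


class EchoBatcher:
    def __init__(self, batch_size=2, timeout=1.0):
        self.batch_size = batch_size
        self.timeout = timeout
        self._queue = []      # pending inputs
        self._futures = []    # one Future per pending input
        self._lock = threading.Lock()

    def _process(self, batch):
        # Stand-in for the expensive batched call (e.g. LLM generation).
        return [item.upper() for item in batch]

    def _flush(self, n):
        # Pop the first n queued inputs, run them as one batch, resolve their futures.
        batch, futs = self._queue[:n], self._futures[:n]
        self._queue, self._futures = self._queue[n:], self._futures[n:]
        for fut, res in zip(futs, self._process(batch)):
            fut.set_result(res)

    def __call__(self, item):
        fut = Future()
        with self._lock:
            self._queue.append(item)
            self._futures.append(fut)
            if len(self._queue) >= self.batch_size:
                self._flush(self.batch_size)  # a full batch is ready: run it now
        # Wait for our own future; if the batch never fills up, flush the stragglers.
        _, not_done = wait([fut], timeout=self.timeout)
        if not_done:
            with self._lock:
                if not fut.done():
                    self._flush(len(self._queue))
        return fut.result()


# Two threads submit one item each; the second call completes the batch of 2.
batcher = EchoBatcher(batch_size=2, timeout=0.5)
with ThreadPoolExecutor(max_workers=2) as pool:
    futures = [pool.submit(batcher, s) for s in ("hello", "world")]
    print([f.result() for f in futures])  # ['HELLO', 'WORLD']

As in the decorator above, whichever caller completes the batch runs the expensive call and sets the results, while the other callers simply wait on their futures.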

torchrl/modules/llm/policies/transformers_wrapper.py

Lines changed: 15 additions & 0 deletions
@@ -25,6 +25,7 @@
 from torch.nn.utils.rnn import pad_sequence

 from torchrl.modules.llm.policies.common import (
+    _batching,
     _extract_responses_from_full_histories,
     ChatHistory,
     LLMWrapperBase,
@@ -99,6 +100,12 @@ class TransformersWrapper(LLMWrapperBase):
         tokens_key (NestedKey | None, optional): The key for the action :class:`~torchrl.modules.llm.policies.Tokens` object. Defaults to `"tokens"`.
         masks_key (NestedKey | None, optional): The key for the action :class:`~torchrl.modules.llm.policies.Masks` object. Defaults to `"masks"`.
         history_key (NestedKey | None, optional): The key for the action :class:`~torchrl.modules.llm.policies.ChatHistory` object. Defaults to `"history"`.
+        batch_size (int | None, optional): The batch size to use for batching. If None, no batching is done. If provided, the module will batch the inputs and process them in batches of this size.
+            This means that a single call to the module will wait until enough inputs are available to form a batch of this size, and then process the batch.
+            This functionality uses concurrent futures to process the batches in parallel and therefore is best used in a multi-threaded environment.
+            Defaults to `None`.
+        batching_timeout (float, optional): The timeout for batching. If the batch isn't completed after `batching_timeout` seconds, the batch is processed as is.
+            Defaults to `10` seconds.

     Input Keys:
         The input key depends on both `input_mode` and `generate`:
@@ -188,9 +195,16 @@ def __init__(
         tokens_key: NestedKey | None = "tokens",
         masks_key: NestedKey | None = "masks",
         log_probs_key: NestedKey | None = "log_probs",
+        batch_size: int | None = None,
+        batching_timeout: float = 10.0,
     ):
         super().__init__()

+        self.batch_size = batch_size
+        self._batching_timeout = batching_timeout
+        self._batch_queue = []
+        self._futures = []
+
         if isinstance(model, str):
             from transformers import AutoModelForCausalLM

@@ -489,6 +503,7 @@ def get_new_version(self, **kwargs):
         return type(self)(**constructor_kwargs)

     @set_list_to_stack(True)
+    @_batching
     def forward(
         self,
         tensordict: TensorDictBase,
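
Putting the new arguments together, here is a usage sketch rather than a snippet from the repo. It assumes `TransformersWrapper` and `Text` are importable from `torchrl.modules.llm.policies` (the path used by the docstring cross-references) and that the `Qwen/Qwen2.5-0.5B` checkpoint from the test fixture is available.

# Hedged usage sketch: drive a batched TransformersWrapper from several threads.
from concurrent.futures import ThreadPoolExecutor

from tensordict import TensorDict
from transformers import AutoTokenizer

from torchrl.modules.llm.policies import Text, TransformersWrapper  # assumed import path

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B")
tokenizer.pad_token = tokenizer.eos_token

wrapper = TransformersWrapper(
    "Qwen/Qwen2.5-0.5B",   # a string model is loaded via AutoModelForCausalLM (see __init__)
    tokenizer=tokenizer,
    input_mode="text",
    generate=True,
    batch_size=2,          # hold calls until 2 prompts are queued...
    batching_timeout=5.0,  # ...or flush whatever is queued after 5 seconds
)

inputs = [
    TensorDict(text=Text(prompt=["Question 1?"]), batch_size=(1,)),
    TensorDict(text=Text(prompt=["Question 2?"]), batch_size=(1,)),
]

pool = ThreadPoolExecutor(max_workers=2)
try:
    futures = [pool.submit(wrapper, td) for td in inputs]
    print(wrapper.get_batching_state())  # batch size, queue size, pending futures, timeout
    results = [fut.result() for fut in futures]
    print([res["text"].prompt for res in results])
finally:
    wrapper.cleanup_batching()           # drop any queued inputs and pending futures
    pool.shutdown(wait=False, cancel_futures=True)

Because the wrapper waits for a full batch (or the timeout), calling it from a single thread with `batch_size` set would simply block until `batching_timeout` expires, which is why the docstring recommends multi-threaded use.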
