Commit 5a5f63d

[BugFix] Variable length vllm wrapper answer stacking (#3049)

1 parent f287bb3 · commit 5a5f63d

File tree: 7 files changed (+201, -26 lines)


README.md

Lines changed: 5 additions & 5 deletions
@@ -29,11 +29,11 @@
 
 TorchRL now includes a comprehensive **LLM API** for post-training and fine-tuning of language models! This new framework provides everything you need for RLHF, supervised fine-tuning, and tool-augmented training:
 
-- 🤖 **Unified LLM Wrappers**: Seamless integration with Hugging Face models and vLLM inference engines
-- 💬 **Conversation Management**: Advanced `History` class for multi-turn dialogue with automatic chat template detection
-- 🛠️ **Tool Integration**: Built-in support for Python code execution, function calling, and custom tool transforms
-- 🎯 **Specialized Objectives**: GRPO (Group Relative Policy Optimization) and SFT loss functions optimized for language models
-- **High-Performance Collectors**: Async data collection with distributed training support
+- 🤖 **Unified LLM Wrappers**: Seamless integration with Hugging Face models and vLLM inference engines - more to come!
+- 💬 **Conversation Management**: Advanced [`History`](torchrl/data/llm/history.py) class for multi-turn dialogue with automatic chat template detection
+- 🛠️ **Tool Integration**: [Built-in support](torchrl/envs/llm/transforms/) for Python code execution, function calling, and custom tool transforms
+- 🎯 **Specialized Objectives**: [GRPO](torchrl/objectives/llm/grpo.py) (Group Relative Policy Optimization) and [SFT](torchrl/objectives/llm/sft.py) loss functions optimized for language models
+- **High-Performance Collectors**: [Async data collection](torchrl/collectors/llm/) with distributed training support
 - 🔄 **Flexible Environments**: Transform-based architecture for reward computation, data loading, and conversation augmentation
 
 The LLM API follows TorchRL's modular design principles, allowing you to mix and match components for your specific use case. Check out the [complete documentation](https://pytorch.org/rl/main/reference/llms.html) and [GRPO implementation example](https://github.com/pytorch/rl/tree/main/sota-implementations/grpo) to get started!
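
The bullets above are backed by the `History` class that this commit also touches. For orientation, here is a minimal sketch of that API as exercised by the new test further down: `History.from_chats` builds a batch of conversations and `apply_chat_template` renders one of them. The Qwen tokenizer and `chat_template_name="qwen"` are borrowed from that test and assume the checkpoint can be downloaded; the exact rendered string depends on the template.

from torchrl.data.llm import History
from transformers import AutoTokenizer

# Two single-turn conversations, batched along the first dimension (shape (2, 1))
chats = History.from_chats(
    [
        [{"role": "user", "content": "Hello, how are you?"}],
        [{"role": "user", "content": "Tell me a joke."}],
    ]
)

# Render the first conversation with the Qwen chat template, as the test below does
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B")
prompt = chats[0].apply_chat_template(
    tokenizer=tokenizer, add_generation_prompt=True, chat_template_name="qwen"
)
print(prompt)  # ChatML-formatted prompt string ready for generation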

test/llm/test_data.py

Lines changed: 114 additions & 1 deletion
@@ -966,6 +966,119 @@ def norm(x):
             history.role[:-1]
         ), f"All roles except the last should match original. Original: {history.role[:-1]}, Parsed: {parsed.role[:-1]}"
 
+    @pytest.mark.skipif(not _has_transformers, reason="requires transformers library")
+    def test_extract_responses_from_full_histories_batch_issue(self):
+        """Test the isolated function for handling different response shapes in batch processing."""
+        from torchrl.modules.llm.policies.common import (
+            _extract_responses_from_full_histories,
+        )
+        from transformers import AutoTokenizer
+
+        # Create a batch of 2 prompt histories
+        prompt_histories = History.from_chats(
+            [
+                [
+                    {"role": "user", "content": "Hello, how are you?"},
+                ],
+                [
+                    {"role": "user", "content": "Tell me a joke."},
+                ],
+            ]
+        )
+
+        # Simulate generated text with different response counts
+        text_full = [
+            # First element: 1 assistant response
+            """<|im_start|>user
+Hello, how are you?<|im_end|>
+<|im_start|>assistant
+I'm doing well, thank you for asking!<|im_end|>""",
+            # Second element: 3 messages (1 assistant + 1 user + 1 assistant)
+            """<|im_start|>user
+Tell me a joke.<|im_end|>
+<|im_start|>assistant
+Why did the chicken cross the road?<|im_end|>
+<|im_start|>user
+I don't know, why?<|im_end|>
+<|im_start|>assistant
+To get to the other side!<|im_end|>""",
+        ]
+
+        # Test the isolated function
+        h_responses = _extract_responses_from_full_histories(
+            text_full, prompt_histories, chat_template_name="qwen"
+        )
+
+        # Verify the responses have the expected shapes and content
+        assert len(h_responses) == 2, f"Expected 2 responses, got {len(h_responses)}"
+
+        # Check first response (should be padded to match second response length)
+        response_0 = h_responses[0]
+        assert response_0.shape == (3,), f"Expected shape (3,), got {response_0.shape}"
+        assert response_0.role == [
+            "assistant",
+            "<none>",
+            "<none>",
+        ], f"Expected roles ['assistant', '<none>', '<none>'], got {response_0.role}"
+        assert response_0.content == [
+            "I'm doing well, thank you for asking!",
+            "",
+            "",
+        ], f"Expected content ['I\\'m doing well, thank you for asking!', '', ''], got {response_0.content}"
+
+        # Check second response (should have 3 messages)
+        response_1 = h_responses[1]
+        assert response_1.shape == (3,), f"Expected shape (3,), got {response_1.shape}"
+        assert response_1.role == [
+            "assistant",
+            "user",
+            "assistant",
+        ], f"Expected roles ['assistant', 'user', 'assistant'], got {response_1.role}"
+        assert response_1.content == [
+            "Why did the chicken cross the road?",
+            "I don't know, why?",
+            "To get to the other side!",
+        ], f"Expected content ['Why did the chicken cross the road?', 'I don\\'t know, why?', 'To get to the other side!'], got {response_1.content}"
+
+        assert isinstance(h_responses, History)
+        assert h_responses.shape == (
+            2,
+            3,
+        ), f"Expected stacked shape (2, 3), got {h_responses.shape}"
+
+        # Extract individual responses for testing
+        response_0 = h_responses[0]
+        response_1 = h_responses[1]
+
+        # Test chat template application
+        tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B")
+
+        # Test first response (should only show the assistant message, ignore padding)
+        template_0 = response_0.apply_chat_template(
+            tokenizer=tokenizer, add_generation_prompt=False, chat_template_name="qwen"
+        )
+        expected_0 = """<|im_start|>system
+You are a helpful assistant.<|im_end|>
+<|im_start|>assistant
+I'm doing well, thank you for asking!<|im_end|>
+"""
+        assert template_0 == expected_0
+
+        # Test second response (should show all 3 messages)
+        template_1 = response_1.apply_chat_template(
+            tokenizer=tokenizer, add_generation_prompt=False, chat_template_name="qwen"
+        )
+        expected_1 = """<|im_start|>system
+You are a helpful assistant.<|im_end|>
+<|im_start|>assistant
+Why did the chicken cross the road?<|im_end|>
+<|im_start|>user
+I don't know, why?<|im_end|>
+<|im_start|>assistant
+To get to the other side!<|im_end|>
+"""
+        assert template_1 == expected_1
+
 
 class TestTopK:
     @pytest.mark.parametrize("per_token_reward", [True, False])
@@ -989,7 +1102,7 @@ def _per_token_reward(i):
                     ("next", "done"): torch.full((1, 1), True),
                     ("next", "reward"): _per_token_reward(i),
                     # total of 10 dialogs per prompt
-                    "text": f"Prompt {i // 5}",
+                    ("text", "prompt"): f"Prompt {i // 5}",
                 }
             )
             for i in range(50)

torchrl/data/llm/history.py

Lines changed: 2 additions & 0 deletions
@@ -1166,6 +1166,8 @@ def append(
         Returns:
             History: The appended History object.
         """
+        # TODO: we should remove the <none> role from the history before appending / extending
+        # It works when keeping them, but it may lead to a lot of useless padding in between valid messages
         if not self.batch_dims:
             raise RuntimeError(
                 "Cannot append an element to a batchless History. Call unsqueeze(dim=0) first on self."

torchrl/data/replay_buffers/replay_buffers.py

Lines changed: 3 additions & 1 deletion
@@ -705,7 +705,9 @@ def add(self, data: Any) -> int:
         make_none = False
         # Transforms usually expect a time batch dimension when called within a RB, so we unsqueeze the data temporarily
         is_tc = is_tensor_collection(data)
-        with data.unsqueeze(-1) if is_tc else contextlib.nullcontext(data) as data_unsq:
+        with data.unsqueeze(-1) if is_tc else contextlib.nullcontext(
+            data
+        ) as data_unsq:
             data_unsq_r = self._transform.inv(data_unsq)
             if is_tc and data_unsq_r is not None:
                 # this is a no-op whenever the result matches the input

torchrl/modules/llm/policies/common.py

Lines changed: 67 additions & 0 deletions
@@ -854,3 +854,70 @@ def log_prob(self, data: TensorDictBase, **get_kwargs) -> TensorDictBase:
             data = self(data)
             return data.get((self.log_prob_key, "response"), **get_kwargs)
         raise RuntimeError("log_prob not callable when generate=True.")
+
+
+def _extract_responses_from_full_histories(
+    text_full: list[str],
+    prompt_histories,
+    chat_template_name: str | None = None,
+    tokenizer=None,
+) -> History:
+    """Extract response histories from full text histories.
+
+    This function parses the full text back to history objects and extracts
+    the response portions (everything after the prompt).
+
+    Args:
+        text_full: List of full text strings to parse
+        prompt_histories: The original prompt histories
+        chat_template_name: Optional chat template name for parsing
+        tokenizer: Optional tokenizer for template detection
+
+    Returns:
+        Stacked History object with response portions
+
+    Raises:
+        RuntimeError: If full history is shorter than prompt history
+        RuntimeError: If parsing produces inconsistent batch shapes
+    """
+    import torch
+    from tensordict.utils import _zip_strict
+    from torchrl.data.llm import History
+
+    # Extract response portions by processing each element individually.
+    # This avoids the stacking issue when different batch elements produce
+    # different numbers of responses.
+    response_histories = []
+    full_histories = History.from_text(
+        text_full, chat_template_name=chat_template_name, tokenizer=tokenizer
+    )
+    for h_prompt, h_full in _zip_strict(
+        prompt_histories.unbind(0), full_histories.unbind(0)
+    ):
+        if h_full.shape[0] <= h_prompt.shape[0]:
+            raise RuntimeError(
+                f"Full history is shorter than prompt history: {h_full.shape} <= {h_prompt.shape}"
+            )
+        # Note: there can be more than one response, so the response has the same number of dims as prompt
+        response_histories.append(h_full[h_prompt.shape[0] :])
+
+    # Check if all responses have the same shape
+    shapes = [r.shape for r in response_histories]
+    if len(set(shapes)) > 1:
+        # Different shapes detected - pad to the same length
+        max_length = max(r.shape[0] for r in response_histories)
+        padded_responses = []
+        for response in response_histories:
+            if response.shape[0] < max_length:
+                # Pad with empty messages using "<none>" role
+                padding_needed = max_length - response.shape[0]
+                padding_history = History(
+                    role="<none>", content="", batch_size=(padding_needed,)
+                )
+                padded_response = response.extend(padding_history, inplace=False)
+                padded_responses.append(padded_response)
+            else:
+                padded_responses.append(response)
+        return torch.stack(padded_responses)
+
+    return torch.stack(response_histories)
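
To make the padding strategy above concrete, here is a small sketch of the pad-then-stack step that `_extract_responses_from_full_histories` performs when batch elements yield different numbers of response messages. It reuses only calls that appear elsewhere in this commit (`History.from_chats`, `extend`, `torch.stack`, and the `"<none>"` placeholder role); the two hand-built conversations are illustrative, not taken from the codebase.

import torch

from torchrl.data.llm import History

# Two response histories of unequal length (1 vs 3 messages), as happens when
# one batch element ends after a single assistant turn while another contains
# an assistant/user/assistant exchange.
short = History.from_chats(
    [[{"role": "assistant", "content": "I'm doing well, thank you for asking!"}]]
)[0]
multi = History.from_chats(
    [
        [
            {"role": "assistant", "content": "Why did the chicken cross the road?"},
            {"role": "user", "content": "I don't know, why?"},
            {"role": "assistant", "content": "To get to the other side!"},
        ]
    ]
)[0]

responses = [short, multi]
max_length = max(r.shape[0] for r in responses)

padded = []
for r in responses:
    missing = max_length - r.shape[0]
    if missing:
        # Same trick as in _extract_responses_from_full_histories: fill the gap
        # with placeholder "<none>" messages carrying empty content.
        pad = History(role="<none>", content="", batch_size=(missing,))
        r = r.extend(pad, inplace=False)
    padded.append(r)

stacked = torch.stack(padded)  # now a regular (2, 3) History batch
print(stacked.shape, stacked.role)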

torchrl/modules/llm/policies/transformers_wrapper.py

Lines changed: 5 additions & 9 deletions
@@ -24,6 +24,7 @@
 from torch.nn.utils.rnn import pad_sequence
 
 from torchrl.modules.llm.policies.common import (
+    _extract_responses_from_full_histories,
     ChatHistory,
     LLMWrapperBase,
     LogProbs,
@@ -680,16 +681,11 @@ def _from_transformers_generate_history(self, td, cfg, out) -> TensorDictBase:
         for h in history_chat.unbind(1):
             h.prompt = history
         with history_chat.view(-1) as history_chat_flat:
-            history_chat_flat.full = full_histories = History.from_text(text_full)
             prompt_histories = history_chat_flat.prompt
-            # iterate over batch
-            h_responses = []
-            for h_full, h_prompt in _zip_strict(
-                full_histories.unbind(0), prompt_histories.unbind(0)
-            ):
-                if h_full.shape[0] <= h_prompt.shape[0]:
-                    raise RuntimeError("Full history is shorter than prompt history")
-                h_responses.append(h_full[h_prompt.shape[0] :])
+            # Extract response histories from full text
+            h_responses = _extract_responses_from_full_histories(
+                text_full, prompt_histories, self.chat_template_name, self.tokenizer
+            )
             history_chat_flat.response = torch.stack(h_responses)
     result.set(self.history_key, history_chat)
     return result

torchrl/modules/llm/policies/vllm_wrapper.py

Lines changed: 5 additions & 10 deletions
@@ -24,6 +24,7 @@
 
 from torchrl.envs.utils import _classproperty
 from torchrl.modules.llm.policies.common import (
+    _extract_responses_from_full_histories,
     ChatHistory,
     LLMWrapperBase,
     LogProbs,
@@ -720,17 +721,11 @@ def _from_vllm_generate_history(
         for h in history_chat.unbind(1):
             h.prompt = history
         with history_chat.view(-1) as history_chat_flat:
-            history_chat_flat.full = full_histories = History.from_text(text_full)
             prompt_histories = history_chat_flat.prompt
-            # iterate over batch
-            h_responses = []
-            for h_full, h_prompt in _zip_strict(
-                full_histories.unbind(0), prompt_histories.unbind(0)
-            ):
-                if h_full.shape[0] <= h_prompt.shape[0]:
-                    raise RuntimeError("Full history is shorter than prompt history")
-                # Note: there can be more than one response, so the response has the same number of dims as prompt
-                h_responses.append(h_full[h_prompt.shape[0] :])
+            # Extract response histories from full text
+            h_responses = _extract_responses_from_full_histories(
+                text_full, prompt_histories, self.chat_template_name, self.tokenizer
+            )
             history_chat_flat.response = torch.stack(h_responses)
     result.set(self.history_key, history_chat)
     return result
