[GRPO/RLOO] Tokenize before vLLM generation call (#5238)

qgallouedec · albertvillanova · web-flow · commit b77f36f3bb95 · 2026-03-10T12:48:43.000-06:00
Co-authored-by: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
diff --git a/trl/generation/vllm_generation.py b/trl/generation/vllm_generation.py
@@ -14,7 +14,6 @@
 
 """vLLM-based generation backend for TRL trainers."""
 
-import json
 import logging
 import math
 import os
@@ -29,7 +28,6 @@
 from transformers import PreTrainedModel, PreTrainedTokenizerBase, ProcessorMixin, is_bitsandbytes_available
 from transformers.utils import is_torch_mlu_available, is_torch_npu_available, is_torch_xpu_available
 
-from ..data_utils import is_conversational, prepare_multimodal_messages_vllm
 from ..extras.profiling import ProfilingContext
 from ..import_utils import is_vllm_available
 from ..trainer.utils import ensure_master_addr_port
@@ -245,10 +243,6 @@ def __init__(
         max_completion_length: int = 16,
         logprobs: int | None = 0,
         generation_kwargs: dict | None = None,
-        # Chat/tool configuration
-        chat_template: str | None = None,
-        chat_template_kwargs: dict | None = None,
-        tools: list | None = None,
     ):
         self.model = model
         self.accelerator = accelerator
@@ -284,11 +278,6 @@ def __init__(
         self.logprobs = logprobs
         self.generation_kwargs = generation_kwargs or {}
 
-        # Chat/tool configuration
-        self.chat_template = chat_template
-        self.chat_template_kwargs = chat_template_kwargs or {}
-        self.tools = tools
-
         self._init_vllm()
 
     def _init_vllm(self):
@@ -528,13 +517,21 @@ def sync_weights(self):
         elif self.mode == "colocate":
             self.llm.reset_prefix_cache()
 
-    def generate(self, prompts: list, num_generations: int, profiler: ProfilingContext | None = None) -> tuple:
+    def generate(
+        self,
+        prompts: list[list[int]],
+        images: list[list | None] | None,
+        num_generations: int,
+        profiler: ProfilingContext | None = None,
+    ) -> tuple:
         """Generate completions using vLLM.
 
         Args:
-            prompts: List of prompts (strings or chat conversations)
-            num_generations: Number of generations per prompt
-            profiler: Optional profiler for performance tracking
+            prompts: List of token ID lists, one per prompt (already tokenized).
+            images: Optional list of image lists for VLM support. Each element is a list of PIL images for the
+                corresponding prompt, or `None` if no images for that prompt. `None` if no images at all.
+            num_generations: Number of generations per prompt.
+            profiler: Optional profiler for performance tracking.
 
         Returns:
             Tuple of (prompt_ids, completion_ids, logprobs, logprob_token_ids, extra_fields).
@@ -567,9 +564,6 @@ def generate(self, prompts: list, num_generations: int, profiler: ProfilingConte
         min_p = self.min_p
         repetition_penalty = self.repetition_penalty
         max_completion_length = self.max_completion_length
-        chat_template_kwargs = self.chat_template_kwargs
-        tools = self.tools
-        chat_template = self.chat_template
 
         # Wake up colocated vLLM weights if needed (idempotent if already awake from sync_weights)
         if self.mode == "colocate" and self.enable_sleep_mode:
@@ -582,28 +576,21 @@ def generate(self, prompts: list, num_generations: int, profiler: ProfilingConte
                 # Non-CUDA vLLM backends (e.g., vllm-ascend's NPUWorkerV1), don't implement reload_weights
                 pass
 
-        if is_conversational({"prompt": prompts[0]}):
-            prompts = [prepare_multimodal_messages_vllm(prompt) for prompt in prompts]
-
-        # In vLLM, tool call arguments must be JSON strings. See https://github.com/vllm-project/vllm/pull/28820
-        for prompt in prompts:  # iterate over each conversation
-            if is_conversational({"prompt": prompt}):
-                for message in prompt:  # iterate over each message
-                    if "tool_calls" in message:  # check if message has tool calls
-                        for call in message["tool_calls"]:
-                            args_value = call["function"]["arguments"]
-                            if isinstance(args_value, dict):  # only convert dict → JSON string
-                                call["function"]["arguments"] = json.dumps(args_value)
-
         # Generate completions using vLLM: gather all prompts and use them in a single call in the main process
         if self.mode == "server":
             all_prompts = gather_object(prompts)
+            # Always gather images (even when None) to avoid deadlock: images may be None on some ranks
+            # and non-None on others in mixed datasets, and gather_object is a collective operation.
+            all_images = gather_object(images if images is not None else [None] * len(prompts))
+            if all(img is None for img in all_images):
+                all_images = None
 
             if accelerator.is_main_process:
-                # Since 'prompts' contains 'num_generations' duplicates, we first take unique prompts, and generate
-                # num_generations outputs for each one. This is faster than generating outputs for each duplicate
-                # prompt individually.
-                ordered_set_of_prompts = all_prompts[::num_generations]
+                # Since 'prompts' contains 'num_generations' duplicates, we first take unique prompts, and
+                # generate num_generations outputs for each one. This is faster than generating outputs for each
+                # duplicate prompt individually.
+                ordered_set_of_prompt_ids = all_prompts[::num_generations]
+                ordered_set_of_images = all_images[::num_generations] if all_images is not None else None
 
                 sampling_params = {
                     "n": num_generations,
@@ -617,18 +604,12 @@ def generate(self, prompts: list, num_generations: int, profiler: ProfilingConte
                     "structured_outputs_regex": self.structured_outputs_regex,
                     "generation_kwargs": self.generation_kwargs,
                 }
-                with profiler:  # TODO: profiling_context(trainer, "vLLM.generate"):
-                    if is_conversational({"prompt": ordered_set_of_prompts[0]}):
-                        output = self.vllm_client.chat(
-                            messages=ordered_set_of_prompts,
-                            **sampling_params,
-                            chat_template_kwargs=chat_template_kwargs,
-                            tools=tools,
-                            chat_template=chat_template,
-                        )
-                    else:
-                        ordered_set_of_prompt_ids = self.processing_class(text=ordered_set_of_prompts)["input_ids"]
-                        output = self.vllm_client.generate(prompts=ordered_set_of_prompt_ids, **sampling_params)
+                with profiler:
+                    output = self.vllm_client.generate(
+                        prompts=ordered_set_of_prompt_ids,
+                        images=ordered_set_of_images,
+                        **sampling_params,
+                    )
                     # Extract required fields and collect any extra fields for reward functions
                     required_keys = {"prompt_ids", "completion_ids", "logprobs", "logprob_token_ids"}
                     extra_fields = {k: v for k, v in output.items() if k not in required_keys}
@@ -647,7 +628,7 @@ def generate(self, prompts: list, num_generations: int, profiler: ProfilingConte
             broadcast_object_list(obj_list, from_process=0)
             all_prompt_ids, all_completion_ids, all_logprobs, all_logprob_token_ids, all_extra_fields = obj_list[0]
 
-            # vllm_client.generate/chat(n=num_generations) returns num_generations completions per prompt.
+            # vllm_client.generate(n=num_generations) returns num_generations completions per prompt.
             # Duplicate prompt_ids to align with per-completion entries.
             all_prompt_ids = [ids for ids in all_prompt_ids for _ in range(num_generations)]
 
@@ -702,24 +683,34 @@ def generate(self, prompts: list, num_generations: int, profiler: ProfilingConte
                 gathered_prompts = [None for _ in range(self.tensor_parallel_size)]
                 torch.distributed.all_gather_object(gathered_prompts, prompts, group=self.tp_group)
                 all_prompts = [p for sublist in gathered_prompts for p in sublist]
+                # Always gather images (even when None) to avoid deadlock: images may be None on some
+                # ranks and non-None on others in mixed datasets, and all_gather_object is collective.
+                local_images = images if images is not None else [None] * len(prompts)
+                gathered_images = [None for _ in range(self.tensor_parallel_size)]
+                torch.distributed.all_gather_object(gathered_images, local_images, group=self.tp_group)
+                all_images = [img for sublist in gathered_images for img in sublist]
+                if all(img is None for img in all_images):
+                    all_images = None
             else:
                 all_prompts = prompts
+                all_images = images
 
             if self.enable_sleep_mode:
                 self.llm.wake_up(tags=["kv_cache"])
 
-            with profiler:  # TODO: profiling_context(trainer, "vLLM.generate"):
-                if is_conversational({"prompt": prompts[0]}):
-                    all_outputs = self.llm.chat(
-                        all_prompts,
-                        sampling_params=sampling_params,
-                        use_tqdm=False,
-                        chat_template_kwargs=chat_template_kwargs,
-                        tools=tools,
-                        chat_template=chat_template,
-                    )
-                else:
-                    all_outputs = self.llm.generate(all_prompts, sampling_params=sampling_params, use_tqdm=False)
+            # Build vLLM-compatible prompt inputs with token IDs and optional multi-modal data
+            vllm_prompts = []
+            if all_images is not None:
+                for ids, img_list in zip(all_prompts, all_images, strict=True):
+                    row = {"prompt_token_ids": ids}
+                    if img_list is not None:
+                        row["multi_modal_data"] = {"image": img_list if len(img_list) > 1 else img_list[0]}
+                    vllm_prompts.append(row)
+            else:
+                vllm_prompts = [{"prompt_token_ids": ids} for ids in all_prompts]
+
+            with profiler:
+                all_outputs = self.llm.generate(vllm_prompts, sampling_params=sampling_params, use_tqdm=False)
 
             all_prompt_ids = [output.prompt_token_ids for output in all_outputs]
             all_completion_ids = [output.token_ids for outputs in all_outputs for output in outputs.outputs]
diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
@@ -741,10 +741,6 @@ def cast_outputs_to_original_dtype(module, args, output):
                 max_completion_length=self.max_completion_length,
                 logprobs=0,  # we only need the generated token logprobs for the importance sampling correction
                 generation_kwargs=args.generation_kwargs,
-                # Chat/tool configuration
-                chat_template=self.chat_template,
-                chat_template_kwargs=self.chat_template_kwargs,
-                tools=self.tools,
             )
             self._last_loaded_step = -1  # tag to avoid useless loading during grad accumulation
         else:
@@ -1226,10 +1222,43 @@ def _generate_single_turn(self, prompts: list):
                     self.vllm_generation.sync_weights()
                 self._last_loaded_step = self.state.global_step
 
-            # Generate using vLLM
+            # Tokenize prompts and extract images (for VLM) before calling vLLM
+            if is_conversational({"prompt": prompts[0]}):
+                # Extract images from messages for VLM support
+                images = []
+                has_images = False
+                for prompt in prompts:
+                    prompt_images = []
+                    for message in prompt:
+                        if isinstance(message["content"], list):
+                            for part in message["content"]:
+                                if part["type"] == "image":
+                                    prompt_images.append(part["image"])
+                                    has_images = True
+                    images.append(prompt_images if prompt_images else None)
+                images = images if has_images else None
+
+                tokenized = self.processing_class.apply_chat_template(
+                    conversation=prompts,
+                    tools=self.tools,
+                    chat_template=self.chat_template,
+                    add_generation_prompt=True,
+                    tokenize=True,
+                    return_dict=True,
+                    **self.chat_template_kwargs,
+                )
+                prompt_token_ids = tokenized["input_ids"]
+            else:
+                prompt_token_ids = self.processing_class(text=prompts)["input_ids"]
+                images = None
+
+            # Generate using vLLM with raw token IDs
             num_generations = self.num_generations if mode == "train" else self.num_generations_eval
             prompt_ids, completion_ids, logprobs, _, extra_fields = self.vllm_generation.generate(
-                prompts=prompts, num_generations=num_generations, profiler=profiling_context(self, "vLLM.generate")
+                prompts=prompt_token_ids,
+                images=images,
+                num_generations=num_generations,
+                profiler=profiling_context(self, "vLLM.generate"),
             )
             # vLLM returns per-token top-k logprobs; keep only the top-1 (sampled token) logprob
             logprobs = [[lp[0] for lp in seq] for seq in logprobs]
diff --git a/trl/trainer/rloo_trainer.py b/trl/trainer/rloo_trainer.py
@@ -525,8 +525,6 @@ def __init__(
                 max_completion_length=self.max_completion_length,
                 logprobs=None,  # we don't need logprobs from vLLM in RLOO
                 generation_kwargs=args.generation_kwargs,
-                # Chat/tool configuration
-                chat_template_kwargs=self.chat_template_kwargs,
             )
             self._last_loaded_step = -1  # tag to avoid useless loading during grad accumulation
         else:
@@ -898,10 +896,42 @@ def _generate_single_turn(self, prompts: list):
                     self.vllm_generation.sync_weights()
                 self._last_loaded_step = self.state.global_step
 
+            # Tokenize prompts and extract images (for VLM) before calling vLLM
+            if is_conversational({"prompt": prompts[0]}):
+                # Extract images from messages for VLM support
+                images = []
+                has_images = False
+                for prompt in prompts:
+                    prompt_images = []
+                    for message in prompt:
+                        if isinstance(message["content"], list):
+                            for part in message["content"]:
+                                if part["type"] == "image":
+                                    prompt_images.append(part["image"])
+                                    has_images = True
+                    images.append(prompt_images if prompt_images else None)
+                images = images if has_images else None
+
+                # RLOO does not support tools; omit tools/chat_template args
+                tokenized = self.processing_class.apply_chat_template(
+                    conversation=prompts,
+                    add_generation_prompt=True,
+                    tokenize=True,
+                    return_dict=True,
+                    **self.chat_template_kwargs,
+                )
+                prompt_token_ids = tokenized["input_ids"]
+            else:
+                prompt_token_ids = self.processing_class(text=prompts)["input_ids"]
+                images = None
+
             # Generate using vLLM (note: RLOO doesn't use logprobs from generation, so we ignore them)
             num_generations = self.num_generations if mode == "train" else self.num_generations_eval
             prompt_ids, completion_ids, _, _, _ = self.vllm_generation.generate(
-                prompts=prompts, num_generations=num_generations, profiler=profiling_context(self, "vLLM.generate")
+                prompts=prompt_token_ids,
+                images=images,
+                num_generations=num_generations,
+                profiler=profiling_context(self, "vLLM.generate"),
             )
 
         elif self.use_transformers_paged: