huggingface
diff --git a/docs/source/vllm_integration.md b/docs/source/vllm_integration.md
Lines changed: 1 addition & 1 deletion
diff --git a/pyproject.toml b/pyproject.toml
Lines changed: 1 addition & 1 deletion
diff --git a/tests/test_grpo_trainer.py b/tests/test_grpo_trainer.py
Lines changed: 38 additions & 12 deletions
diff --git a/tests/test_vllm_client_server.py b/tests/test_vllm_client_server.py
Lines changed: 97 additions & 1 deletion
diff --git a/trl/_compat.py b/trl/_compat.py
Lines changed: 1 addition & 1 deletion
diff --git a/trl/experimental/cpo/cpo_trainer.py b/trl/experimental/cpo/cpo_trainer.py
Lines changed: 1 addition & 1 deletion
diff --git a/trl/experimental/gkd/gkd_config.py b/trl/experimental/gkd/gkd_config.py
Lines changed: 3 additions & 5 deletions
diff --git a/trl/experimental/gold/gold_config.py b/trl/experimental/gold/gold_config.py
Lines changed: 6 additions & 8 deletions
diff --git a/trl/experimental/minillm/minillm_config.py b/trl/experimental/minillm/minillm_config.py
Lines changed: 4 additions & 2 deletions
diff --git a/trl/experimental/online_dpo/online_dpo_trainer.py b/trl/experimental/online_dpo/online_dpo_trainer.py
Lines changed: 8 additions & 4 deletions
@@ -3,7 +3,7 @@
 This document will guide you through the process of using vLLM with TRL for faster generation in online methods like GRPO and Online DPO. We first summarize a tl;dr on how to use vLLM with TRL, and then we will go into the details of how it works under the hood.
 
 > [!WARNING]
-> TRL currently only supports vLLM versions from `0.10.2` to `0.14.1`. Please ensure you have a version in this range installed to avoid compatibility issues.
+> TRL currently only supports vLLM versions from `0.10.2` to `0.17.0`. Please ensure you have a version in this range installed to avoid compatibility issues.
 
 > [!TIP]
 > The following trainers currently support generation with vLLM:
 
@@ -83,7 +83,7 @@ test = [
     "pytest"
 ]
 vllm = [
-    "vllm>=0.10.2,<=0.14.1",
+    "vllm>=0.10.2,<=0.17.0",
     "fastapi",
     "pydantic",
     "requests",
 
@@ -162,17 +162,44 @@ def test_compute_entropy_all_masked(self):
 class TestGRPORolloutDispatch:
     def _make_trainer(self):
         trainer = object.__new__(GRPOTrainer)
-        trainer.accelerator = SimpleNamespace(device=torch.device("cpu"), is_main_process=True)
+        trainer.accelerator = SimpleNamespace(
+            device=torch.device("cpu"),
+            is_main_process=True,
+            gather=lambda t: t,
+        )
         trainer.args = SimpleNamespace(report_to=[])
         trainer.model = SimpleNamespace(training=True)
-        trainer.state = SimpleNamespace(global_step=2)
+        trainer.state = SimpleNamespace(global_step=2, num_input_tokens_seen=0)
         trainer._last_loaded_step = 1
         trainer.use_vllm = False
         trainer.use_transformers_paged = False
         trainer.vllm_generation = SimpleNamespace(sync_weights=MagicMock())
+        trainer.processing_class = SimpleNamespace(
+            batch_decode=MagicMock(return_value=["decoded"]),
+        )
+        trainer.tools = None
+        trainer.eos_token_id = 2
+        trainer.pad_token_id = 0
+        trainer._metrics = {
+            "train": {
+                "num_tokens": [],
+                **{
+                    k: []
+                    for k in [
+                        "completions/mean_length",
+                        "completions/min_length",
+                        "completions/max_length",
+                        "completions/clipped_ratio",
+                        "completions/mean_terminated_length",
+                        "completions/min_terminated_length",
+                        "completions/max_terminated_length",
+                    ]
+                },
+            }
+        }
         return trainer
 
-    def test_generate_single_turn_prefers_rollout_func(self):
+    def test_generate_prefers_rollout_func(self):
         trainer = self._make_trainer()
         trainer.rollout_func = MagicMock(
             return_value={
@@ -183,33 +210,32 @@ def test_generate_single_turn_prefers_rollout_func(self):
             }
         )
 
-        prompt_ids, completion_ids, logprobs, extra_fields = trainer._generate_single_turn(["prompt"])
+        result = trainer._generate(["prompt"])
 
-        assert prompt_ids == [[1]]
-        assert completion_ids == [[2]]
-        assert logprobs == [[-0.1]]
-        assert extra_fields == {"env_mask": [[1]]}
+        assert result[0] == [[1]]  # prompt_ids
+        assert result[1] == [[2]]  # completion_ids
+        assert result[2] == [[1]]  # tool_mask (from env_mask)
         trainer.rollout_func.assert_called_once_with(["prompt"], trainer)
 
-    def test_generate_single_turn_rollout_func_syncs_vllm_weights_when_needed(self):
+    def test_generate_rollout_func_syncs_vllm_weights_when_needed(self):
         trainer = self._make_trainer()
         trainer.use_vllm = True
         trainer.rollout_func = MagicMock(
             return_value={"prompt_ids": [[1]], "completion_ids": [[2]], "logprobs": [[0.0]]}
         )
 
-        trainer._generate_single_turn(["prompt"])
+        trainer._generate(["prompt"])
 
         trainer.vllm_generation.sync_weights.assert_called_once()
         assert trainer._last_loaded_step == trainer.state.global_step
         trainer.rollout_func.assert_called_once_with(["prompt"], trainer)
 
-    def test_generate_single_turn_rollout_func_raises_when_required_keys_are_missing(self):
+    def test_generate_rollout_func_raises_when_required_keys_are_missing(self):
         trainer = self._make_trainer()
         trainer.rollout_func = MagicMock(return_value={"prompt_ids": [[1]], "completion_ids": [[2]]})
 
         with pytest.raises(ValueError, match="rollout_func must return keys"):
-            trainer._generate_single_turn(["prompt"])
+            trainer._generate(["prompt"])
 
 
 class TestGRPOTrainer(TrlTestCase):
 
@@ -18,7 +18,7 @@
 
 import pytest
 from packaging.version import Version
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer
 from transformers.testing_utils import torch_device
 
 from trl.generation.vllm_client import VLLMClient
@@ -31,6 +31,7 @@
     kill_process,
     require_3_accelerators,
     require_torch_multi_accelerator,
+    require_vision,
     require_vllm,
 )
 
@@ -874,3 +875,98 @@ def teardown_class(cls):
         # vLLM x pytest (or Popen) seems not to handle process termination well. To avoid zombie processes, we need to
         # kill the server process and its children explicitly.
         kill_process(cls.server_process)
+
+
+@pytest.mark.slow
+@require_vllm
+@require_vision
+class TestVLLMClientServerVLM(TrlTestCase):
+    model_id = "Qwen/Qwen2.5-VL-3B-Instruct"
+
+    @classmethod
+    def setup_class(cls):
+        # Start the server process
+        cls.server_process = subprocess.Popen(
+            ["trl", "vllm-serve", "--model", cls.model_id], stdout=subprocess.PIPE, stderr=subprocess.PIPE
+        )
+
+        # Initialize the client (no communicator needed for generation-only tests)
+        cls.client = VLLMClient(connection_timeout=240, host="localhost")
+
+    def test_generate_with_token_ids_and_image(self):
+        from PIL import Image
+
+        processor = AutoProcessor.from_pretrained(self.model_id)
+        image1 = Image.new("RGB", (64, 64), color="red")
+        image2 = Image.new("RGB", (64, 64), color="blue")
+        image3 = Image.new("RGB", (64, 64), color="green")
+        messages = [
+            [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image", "image": image1},
+                        {"type": "image", "image": image2},
+                        {"type": "text", "text": "What are the differences between these two images?"},
+                    ],
+                }
+            ],
+            [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image", "image": image3},
+                        {"type": "text", "text": "What is the color of this image?"},
+                    ],
+                }
+            ],
+        ]
+        prompt_token_ids = processor.apply_chat_template(
+            conversation=messages, tokenize=True, add_generation_prompt=True
+        )
+        outputs = self.client.generate(prompt_token_ids, images=[[image1, image2], [image3]], max_tokens=64)
+        prompt_ids = outputs["prompt_ids"]
+        completion_ids = outputs["completion_ids"]
+
+        assert len(prompt_ids) == 2
+        assert len(completion_ids) == 2
+        assert all(isinstance(tok, int) for tok in prompt_ids[0])
+        assert all(isinstance(tok, int) for tok in completion_ids[0])
+
+    def test_generate_with_token_ids_mixed_images(self):
+        """Test a batch where one prompt has an image and the other does not."""
+        from PIL import Image
+
+        processor = AutoProcessor.from_pretrained(self.model_id)
+        image = Image.new("RGB", (64, 64), color="red")
+        messages = [
+            [
+                {
+                    "role": "user",
+                    "content": [{"type": "image", "image": image}, {"type": "text", "text": "Describe this image."}],
+                }
+            ],
+            [
+                {
+                    "role": "user",
+                    "content": [{"type": "text", "text": "What is 1+1?"}],
+                }
+            ],
+        ]
+        prompt_token_ids = processor.apply_chat_template(
+            conversation=messages, tokenize=True, add_generation_prompt=True
+        )
+        outputs = self.client.generate(prompt_token_ids, images=[[image], None], max_tokens=64)
+        prompt_ids = outputs["prompt_ids"]
+        completion_ids = outputs["completion_ids"]
+
+        assert len(prompt_ids) == 2
+        assert len(completion_ids) == 2
+        assert all(isinstance(tok, int) for tok in prompt_ids[0])
+        assert all(isinstance(tok, int) for tok in prompt_ids[1])
+        assert all(isinstance(tok, int) for tok in completion_ids[0])
+        assert all(isinstance(tok, int) for tok in completion_ids[1])
+
+    @classmethod
+    def teardown_class(cls):
+        kill_process(cls.server_process)
@@ -89,7 +89,7 @@ def _patch_vllm_disabled_tqdm() -> None:
 
     - Bug introduced in https://github.com/vllm-project/vllm/pull/52
     - Fixed in https://github.com/vllm-project/vllm/pull/28471 (released in v0.11.1)
-    - Since TRL currently supports vLLM v0.10.2-0.14.1, we patch it here
+    - Since TRL currently supports vLLM v0.10.2-0.17.0, we patch it here
     - This can be removed when TRL requires vLLM>=0.11.1
     """
     if _is_package_version_below("vllm", "0.11.1"):
 
@@ -481,7 +481,7 @@ def tokenize_row(self, feature, model: PreTrainedModel | nn.Module | None = None
             # and length only differs by 1 at most
             num_diff_tokens = sum(
                 a != b
-                for a, b in zip(chosen_tokens["prompt_input_ids"], rejected_tokens["prompt_input_ids"], strict=True)
+                for a, b in zip(chosen_tokens["prompt_input_ids"], rejected_tokens["prompt_input_ids"], strict=False)
             )
             num_diff_len = abs(chosen_prompt_len_input_ids - rejected_prompt_len_input_ids)
             if num_diff_tokens > 1 or num_diff_len > 1:
 
@@ -15,8 +15,6 @@
 from dataclasses import dataclass, field
 from typing import Any
 
-from transformers import TrainingArguments
-
 from ...trainer.sft_config import SFTConfig
 
 
@@ -42,7 +40,7 @@ class GKDConfig(SFTConfig):
         teacher_model_name_or_path (`str`, *optional*):
             Model name or path of the teacher model. If `None`, the teacher model will be the same as the model being
             trained.
-        teacher_model_init_kwargs (`dict[str, Any]]`, *optional*):
+        teacher_model_init_kwargs (`dict[str, Any]`, *optional*):
             Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the teacher model
             from a string.
         disable_dropout (`bool`, *optional*, defaults to `True`):
@@ -52,7 +50,7 @@ class GKDConfig(SFTConfig):
             teacher-generated output).
     """
 
-    _VALID_DICT_FIELDS = TrainingArguments._VALID_DICT_FIELDS + ["teacher_model_init_kwargs"]
+    _VALID_DICT_FIELDS = SFTConfig._VALID_DICT_FIELDS + ["teacher_model_init_kwargs"]
 
     temperature: float = field(
         default=0.9,
@@ -84,7 +82,7 @@ class GKDConfig(SFTConfig):
             "model being trained."
         },
     )
-    teacher_model_init_kwargs: dict[str, Any] | None = field(
+    teacher_model_init_kwargs: dict[str, Any] | str | None = field(
         default=None,
         metadata={
             "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the "
 
@@ -15,8 +15,6 @@
 from dataclasses import dataclass, field
 from typing import Any
 
-from transformers import TrainingArguments
-
 from ...trainer.sft_config import SFTConfig
 
 
@@ -39,13 +37,13 @@ class GOLDConfig(SFTConfig):
             beta is `0.0`, the loss is the KL divergence. When beta is `1.0`, the loss is the Inverse KL Divergence.
         max_completion_length (`int`, *optional*, defaults to `128`):
             Maximum number of tokens to generate per completion.
-        teacher_model_name_or_path (`str` or `None`, *optional*, defaults to `None`):
+        teacher_model_name_or_path (`str`, *optional*):
             Model name or path of the teacher model. If `None`, the teacher model will be the same as the model being
             trained.
-        teacher_model_init_kwargs (`dict[str, Any]]` or `None`, *optional*, defaults to `None`):
+        teacher_model_init_kwargs (`dict[str, Any]`, *optional*):
             Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the teacher model
             from a string.
-        teacher_tokenizer_name_or_path (`str` or `None`, *optional*, defaults to `None`):
+        teacher_tokenizer_name_or_path (`str`, *optional*):
             Tokenizer name or path for the teacher model. If None when using ULD loss, will use the same tokenizer as
             the student model (not recommended for cross-tokenizer distillation).
         disable_dropout (`bool`, *optional*, defaults to `True`):
@@ -84,7 +82,7 @@ class GOLDConfig(SFTConfig):
             to set this to a low value if the student and teacher models share the same GPU.
         vllm_tensor_parallel_size (`int`, *optional*, defaults to `1`):
             Tensor parallel size for the colocated student vLLM engine (if `vllm_mode="colocate"`).
-        vllm_structured_outputs_regex (`str` or `None`, *optional*, defaults to `None`):
+        vllm_structured_outputs_regex (`str`, *optional*):
             Regex for vLLM structured outputs for the student model.
         vllm_sync_frequency (`int`, *optional*, defaults to `1`):
             Frequency (in training steps) to synchronize student model weights to vLLM engine. Set to 1 to sync after
@@ -94,7 +92,7 @@ class GOLDConfig(SFTConfig):
             low, but waking the engine adds host–device transfer latency.
     """
 
-    _VALID_DICT_FIELDS = TrainingArguments._VALID_DICT_FIELDS + ["teacher_model_init_kwargs"]
+    _VALID_DICT_FIELDS = SFTConfig._VALID_DICT_FIELDS + ["teacher_model_init_kwargs"]
 
     # Parameters whose default values are overridden from TrainingArguments
     learning_rate: float = field(
@@ -153,7 +151,7 @@ class GOLDConfig(SFTConfig):
             "model being trained."
         },
     )
-    teacher_model_init_kwargs: dict[str, Any] | None = field(
+    teacher_model_init_kwargs: dict[str, Any] | str | None = field(
         default=None,
         metadata={
             "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the "
 
@@ -29,7 +29,7 @@ class MiniLLMConfig(GRPOConfig):
     arguments, please refer to the [`~transformers.TrainingArguments`] and [`GRPOConfig`] documentation.
 
     Args:
-        teacher_model_init_kwargs (`dict[str, Any]]`, *optional*):
+        teacher_model_init_kwargs (`dict[str, Any]`, *optional*):
             Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the teacher model
             from a string.
         disable_dropout (`bool`, *optional*, defaults to `True`):
@@ -47,7 +47,9 @@ class MiniLLMConfig(GRPOConfig):
             Whether to apply length normalization to the rewards.
     """
 
-    teacher_model_init_kwargs: dict[str, Any] | None = field(
+    _VALID_DICT_FIELDS = TrainingArguments._VALID_DICT_FIELDS + ["teacher_model_init_kwargs"]
+
+    teacher_model_init_kwargs: dict[str, Any] | str | None = field(
         default=None,
         metadata={
             "help": "Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the "
 
@@ -96,9 +96,11 @@
 
 logger = logging.get_logger(__name__)
 
-# What we call a reward function is a callable that takes a list of prompts and completions and returns a list of
-# rewards. When it's a string, it's a model ID, so it's loaded as a pretrained model.
-RewardFunc = str | PreTrainedModel | Callable[[list, list], list[float]]
+# A reward function can be a string, interpreted as a model ID and loaded as a pretrained model, a pretrained model, or
+# a callable that returns a list of floats (the rewards). The callable receives prompts, completions, and additional
+# arguments from the trainer (refer to the trainer's source for details). To ensure forward compatibility, it should
+# accept **kwargs.
+RewardFunc = str | PreTrainedModel | Callable[..., list[float]]
 
 
 class OnlineDPOTrainer(_BaseTrainer):
@@ -750,7 +752,9 @@ def _generate_vllm_server(self, prompts, images=None):
             # prompt individually.
             ordered_set_of_prompts = all_prompts[:: self.num_generations]
             if has_images:
-                ordered_set_of_images = all_images[:: self.num_generations]
+                ordered_set_of_images = [
+                    [img] if img is not None else None for img in all_images[:: self.num_generations]
+                ]
             else:
                 ordered_set_of_images = None
             completion_ids = self.vllm_client.generate(
Original file line number	Diff line number	Diff line change
`@@ -83,7 +83,7 @@ test = [`
`83`	`83`	`"pytest"`
`84`	`84`	`]`
`85`	`85`	`vllm = [`
`86`		`- "vllm>=0.10.2,<=0.14.1",`
	`86`	`+ "vllm>=0.10.2,<=0.17.0",`
`87`	`87`	`"fastapi",`
`88`	`88`	`"pydantic",`
`89`	`89`	`"requests",`
Original file line number	Diff line number	Diff line change
`@@ -481,7 +481,7 @@ def tokenize_row(self, feature, model: PreTrainedModel \| nn.Module \| None = None`
`481`	`481`	`# and length only differs by 1 at most`
`482`	`482`	`num_diff_tokens = sum(`
`483`	`483`	`a != b`
`484`		`- for a, b in zip(chosen_tokens["prompt_input_ids"], rejected_tokens["prompt_input_ids"], strict=True)`
	`484`	`+ for a, b in zip(chosen_tokens["prompt_input_ids"], rejected_tokens["prompt_input_ids"], strict=False)`
`485`	`485`	`)`
`486`	`486`	`num_diff_len = abs(chosen_prompt_len_input_ids - rejected_prompt_len_input_ids)`
`487`	`487`	`if num_diff_tokens > 1 or num_diff_len > 1:`