huggingface
diff --git a/.github/workflows/tests-experimental.yml b/.github/workflows/tests-experimental.yml
Lines changed: 1 addition & 0 deletions
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
Lines changed: 1 addition & 0 deletions
diff --git a/.github/workflows/tests_transformers_branch.yml b/.github/workflows/tests_transformers_branch.yml
Lines changed: 1 addition & 0 deletions
diff --git a/pyproject.toml b/pyproject.toml
Lines changed: 1 addition & 1 deletion
diff --git a/tests/experimental/test_kto_trainer.py b/tests/experimental/test_kto_trainer.py
Lines changed: 6 additions & 0 deletions
diff --git a/tests/experimental/test_nash_md_trainer.py b/tests/experimental/test_nash_md_trainer.py
Lines changed: 1 addition & 1 deletion
diff --git a/tests/test_cli.py b/tests/test_cli.py
Lines changed: 2 additions & 1 deletion
diff --git a/tests/test_grpo_trainer.py b/tests/test_grpo_trainer.py
Lines changed: 108 additions & 0 deletions
diff --git a/tests/test_rloo_trainer.py b/tests/test_rloo_trainer.py
Lines changed: 67 additions & 0 deletions
diff --git a/tests/test_sft_trainer.py b/tests/test_sft_trainer.py
Lines changed: 2 additions & 2 deletions
@@ -10,6 +10,7 @@ on:
 env:
   TQDM_DISABLE: 1
   PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True"
+  PYTORCH_ALLOC_CONF: "expandable_segments:True"
   TRL_EXPERIMENTAL_SILENCE: 1
 
 jobs:
 
@@ -22,6 +22,7 @@ env:
   TQDM_DISABLE: 1
   CI_SLACK_CHANNEL: ${{ secrets.CI_PUSH_MAIN_CHANNEL }}
   PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True"
+  PYTORCH_ALLOC_CONF: "expandable_segments:True"
 
 jobs:
   check_code_quality:
 
@@ -12,6 +12,7 @@ env:
   TQDM_DISABLE: 1
   CI_SLACK_CHANNEL: ${{ secrets.CI_PUSH_MAIN_CHANNEL }}
   PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True"
+  PYTORCH_ALLOC_CONF: "expandable_segments:True"
 
 jobs:
   tests_transformers_branch:
 
@@ -109,7 +109,7 @@ dev = [
     # kernels
     "kernels",
     # liger
-    #"liger-kernel>=0.7.0",
+    "liger-kernel>=0.7.0",
     # peft
     "peft>=0.8.0",
     # quality
 
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import multiprocess
 import pytest
 import torch
 from datasets import load_dataset
@@ -98,6 +99,11 @@ def test_kto_trainer_with_ref_model_is_model(self):
             )
 
     def test_tokenize_and_process_tokens(self):
+        # Pytest/CI often starts background threads before tests run. Under Python 3.12+,
+        # using "fork" in a multi-threaded process emits a DeprecationWarning and may deadlock.
+        # Force "spawn" to keep this multiprocessing test safe while still exercising `num_proc=2`.
+        multiprocess.set_start_method("spawn", force=True)
+
         training_args = KTOConfig(
             output_dir=self.tmp_dir,
             per_device_train_batch_size=2,
 
@@ -20,7 +20,7 @@
 
 from trl.experimental.nash_md import NashMDConfig, NashMDTrainer
 from trl.experimental.nash_md.nash_md_trainer import GeometricMixtureWrapper
-from trl.models.utils import create_reference_model
+from trl.experimental.utils import create_reference_model
 
 from ..testing_utils import TrlTestCase, require_llm_blender, require_peft
 from .testing_utils import RandomPairwiseJudge
 
@@ -130,7 +130,8 @@ def test_vllm_serve_config_file(self):
         with open(config_path, "w") as f:
             yaml.dump({"model": "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"}, f)
 
-        with patch("trl.cli.commands.vllm_serve.vllm_serve_main") as mock_serve:
+        # Patch the actual function that `VllmServeCommand.run` imports as `vllm_serve_main`
+        with patch("trl.scripts.vllm_serve.main") as mock_serve:
             with patch("sys.argv", ["trl", "vllm-serve", "--config", config_path]):
                 main()
 
 
@@ -861,6 +861,36 @@ def test_training_beta_non_zero(self):
             new_param = trainer.model.get_parameter(n)
             assert not torch.equal(param, new_param), f"Parameter {n} has not changed."
 
+    def test_training_with_pad_to_multiple_of(self):
+        """
+        Train GRPO with `pad_to_multiple_of=8` and check that a train loss is logged and that every model parameter
+        differs from its value before training.
+        """
+        train_data = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train")
+
+        # The learning rate is raised because gradients are tiny and the default value can stall updates. Batch size,
+        # number of generations and completion length are all kept small to reduce memory usage.
+        config = GRPOConfig(
+            output_dir=self.tmp_dir,
+            learning_rate=0.1,
+            per_device_train_batch_size=3,
+            num_generations=3,
+            max_completion_length=8,
+            pad_to_multiple_of=8,
+            report_to="none",
+        )
+        trainer = GRPOTrainer(
+            model="trl-internal-testing/tiny-Qwen2ForCausalLM-2.5",
+            reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5",
+            args=config,
+            train_dataset=train_data,
+        )
+
+        # Snapshot every parameter before training so that updates can be detected afterwards.
+        params_before = {}
+        for name, weight in trainer.model.named_parameters():
+            params_before[name] = weight.clone()
+
+        trainer.train()
+
+        last_log = trainer.state.log_history[-1]
+        assert last_log["train_loss"] is not None
+
+        # Each snapshot must differ from the corresponding parameter after training.
+        for n, old_weight in params_before.items():
+            current_weight = trainer.model.get_parameter(n)
+            assert not torch.equal(old_weight, current_weight), f"Parameter {n} has not changed."
+
     def test_get_off_policy_mask(self):
         """
         Test the logic of off-policy masking:
@@ -1771,6 +1801,43 @@ def reward_func(completions, **kwargs):
             new_param = trainer.model.get_parameter(n)
             assert not torch.equal(param, new_param), f"Parameter {n} has not changed."
 
+    @require_vision
+    def test_training_vlm_with_pad_to_multiple_of(self):
+        """
+        Train GRPO on a vision-language model with `pad_to_multiple_of=7`.
+
+        Models like Gemma3 use other forward keyword arguments (such as token_type_ids) that also need to be padded
+        when `pad_to_multiple_of` is used, so this test runs training in that setting and checks that a train loss is
+        logged and that every model parameter differs from its value before training.
+        """
+
+        def reward_func(completions, **kwargs):
+            """Reward longer completions: one float per completion, the length of its first message's content."""
+            rewards = []
+            for completion in completions:
+                rewards.append(float(len(completion[0]["content"])))
+            return rewards
+
+        train_data = load_dataset("trl-internal-testing/zen-image", "conversational_prompt_only", split="train")
+
+        # The learning rate is raised because gradients are tiny and the default value can stall updates. Batch size,
+        # number of generations and completion length are all kept small to reduce memory usage.
+        config = GRPOConfig(
+            output_dir=self.tmp_dir,
+            learning_rate=0.1,
+            per_device_train_batch_size=3,
+            num_generations=3,
+            max_completion_length=8,
+            pad_to_multiple_of=7,
+            report_to="none",
+        )
+        trainer = GRPOTrainer(
+            model="trl-internal-testing/tiny-Gemma3ForConditionalGeneration",
+            reward_funcs=reward_func,
+            args=config,
+            train_dataset=train_data,
+        )
+
+        # Snapshot every parameter before training so that updates can be detected afterwards.
+        params_before = {}
+        for name, weight in trainer.model.named_parameters():
+            params_before[name] = weight.clone()
+
+        trainer.train()
+
+        last_log = trainer.state.log_history[-1]
+        assert last_log["train_loss"] is not None
+
+        # Each snapshot must differ from the corresponding parameter after training.
+        for n, old_weight in params_before.items():
+            current_weight = trainer.model.get_parameter(n)
+            assert not torch.equal(old_weight, current_weight), f"Parameter {n} has not changed."
+
     @pytest.mark.parametrize(
         "model_id",
         [
@@ -2554,6 +2621,47 @@ def test_training_with_liger_grpo_kernel_and_peft(self, model_name):
 
         release_memory(model, trainer)
 
+    @require_liger_kernel
+    def test_liger_grpo_kernel_importance_sampling(self):
+        """
+        Train GRPO with `use_liger_kernel=True` and `importance_sampling_level="sequence"`.
+
+        Checks that the trainer's `liger_grpo_loss` is a `LigerFusedLinearGRPOLoss` and that every model parameter
+        differs from its value before training.
+        """
+        checkpoint = "trl-internal-testing/tiny-LlamaForCausalLM-3.2"
+
+        config = GRPOConfig(
+            output_dir=self.tmp_dir,
+            per_device_train_batch_size=3,
+            num_generations=3,
+            use_liger_kernel=True,
+            max_completion_length=self.max_length,
+            importance_sampling_level="sequence",
+            report_to="none",
+            logging_strategy="no",
+        )
+
+        model = AutoModelForCausalLM.from_pretrained(checkpoint, dtype="float32")
+        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+        # Fall back to the EOS token when the tokenizer defines no padding token.
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+
+        trainer = GRPOTrainer(
+            model=model,
+            reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5",
+            args=config,
+            train_dataset=self.train_dataset,
+            eval_dataset=self.eval_dataset,
+            processing_class=tokenizer,
+        )
+        from liger_kernel.chunked_loss import LigerFusedLinearGRPOLoss
+
+        assert isinstance(trainer.liger_grpo_loss, LigerFusedLinearGRPOLoss)
+
+        # Snapshot every parameter before training so that updates can be detected afterwards.
+        params_before = {}
+        for name, weight in model.named_parameters():
+            params_before[name] = weight.clone()
+
+        trainer.train()
+
+        # Each snapshot must differ from the corresponding parameter after training.
+        for n, old_weight in params_before.items():
+            current_weight = model.get_parameter(n)
+            assert not torch.equal(old_weight, current_weight), f"Parameter {n} has not changed."
+
+        release_memory(model, trainer)
+
     @pytest.mark.parametrize(
         "model_name",
         [
 
@@ -678,6 +678,36 @@ def test_training_beta_zero(self):
             new_param = trainer.model.get_parameter(n)
             assert not torch.equal(param, new_param), f"Parameter {n} has not changed."
 
+    def test_training_with_pad_to_multiple_of(self):
+        """
+        Train RLOO with `pad_to_multiple_of=8` and check that a train loss is logged and that every model parameter
+        differs from its value before training.
+        """
+        train_data = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train")
+
+        # The learning rate is raised because gradients are tiny and the default value can stall updates. Batch size,
+        # number of generations and completion length are all kept small to reduce memory usage.
+        config = RLOOConfig(
+            output_dir=self.tmp_dir,
+            learning_rate=0.1,
+            per_device_train_batch_size=3,
+            num_generations=3,
+            max_completion_length=8,
+            pad_to_multiple_of=8,
+            report_to="none",
+        )
+        trainer = RLOOTrainer(
+            model="trl-internal-testing/tiny-Qwen2ForCausalLM-2.5",
+            reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5",
+            args=config,
+            train_dataset=train_data,
+        )
+
+        # Snapshot every parameter before training so that updates can be detected afterwards.
+        params_before = {}
+        for name, weight in trainer.model.named_parameters():
+            params_before[name] = weight.clone()
+
+        trainer.train()
+
+        last_log = trainer.state.log_history[-1]
+        assert last_log["train_loss"] is not None
+
+        # Each snapshot must differ from the corresponding parameter after training.
+        for n, old_weight in params_before.items():
+            current_weight = trainer.model.get_parameter(n)
+            assert not torch.equal(old_weight, current_weight), f"Parameter {n} has not changed."
+
     @require_peft
     @require_vllm
     @pytest.mark.skip(reason="We should add a mock for the vLLM server.")
@@ -1242,6 +1272,43 @@ def reward_func(completions, **kwargs):
             new_param = trainer.model.get_parameter(n)
             assert not torch.equal(param, new_param), f"Parameter {n} has not changed."
 
+    @require_vision
+    def test_training_vlm_with_pad_to_multiple_of(self):
+        """
+        Train RLOO on a vision-language model with `pad_to_multiple_of=7`.
+
+        Models like Gemma3 use other forward keyword arguments (such as token_type_ids) that also need to be padded
+        when `pad_to_multiple_of` is used, so this test runs training in that setting and checks that a train loss is
+        logged and that every model parameter differs from its value before training.
+        """
+
+        def reward_func(completions, **kwargs):
+            """Reward longer completions: one float per completion, the length of its first message's content."""
+            rewards = []
+            for completion in completions:
+                rewards.append(float(len(completion[0]["content"])))
+            return rewards
+
+        train_data = load_dataset("trl-internal-testing/zen-image", "conversational_prompt_only", split="train")
+
+        # The learning rate is raised because gradients are tiny and the default value can stall updates. Batch size,
+        # number of generations and completion length are all kept small to reduce memory usage.
+        config = RLOOConfig(
+            output_dir=self.tmp_dir,
+            learning_rate=0.1,
+            per_device_train_batch_size=3,
+            num_generations=3,
+            max_completion_length=8,
+            pad_to_multiple_of=7,
+            report_to="none",
+        )
+        trainer = RLOOTrainer(
+            model="trl-internal-testing/tiny-Gemma3ForConditionalGeneration",
+            reward_funcs=reward_func,
+            args=config,
+            train_dataset=train_data,
+        )
+
+        # Snapshot every parameter before training so that updates can be detected afterwards.
+        params_before = {}
+        for name, weight in trainer.model.named_parameters():
+            params_before[name] = weight.clone()
+
+        trainer.train()
+
+        last_log = trainer.state.log_history[-1]
+        assert last_log["train_loss"] is not None
+
+        # Each snapshot must differ from the corresponding parameter after training.
+        for n, old_weight in params_before.items():
+            current_weight = trainer.model.get_parameter(n)
+            assert not torch.equal(old_weight, current_weight), f"Parameter {n} has not changed."
+
     @pytest.mark.parametrize(
         "model_id",
         [
 
@@ -812,7 +812,7 @@ def mock_super_compute_loss(model, inputs, return_outputs=False, num_items_in_ba
             "attention_mask": torch.tensor([[1, 1, 1, 1, 1]]),
         }
 
-        with patch("trl.trainer.sft_trainer.BaseTrainer.compute_loss", side_effect=mock_super_compute_loss):
+        with patch("transformers.Trainer.compute_loss", side_effect=mock_super_compute_loss):
             trainer.compute_loss(trainer.model, inputs)
 
         assert captured["skip_logits"] is True
@@ -846,7 +846,7 @@ def mock_super_compute_loss(model, inputs, return_outputs=False, num_items_in_ba
             dummy_outputs = (dummy_loss, torch.randn(1, 5, trainer.model.config.vocab_size))
             return (dummy_loss, dummy_outputs)
 
-        with patch("trl.trainer.sft_trainer.BaseTrainer.compute_loss", side_effect=mock_super_compute_loss):
+        with patch("transformers.Trainer.compute_loss", side_effect=mock_super_compute_loss):
             trainer.predict(trainer.train_dataset)
 
         assert captured["skip_logits"] is False