@@ -861,6 +861,36 @@ def test_training_beta_non_zero(self):
861861 new_param = trainer .model .get_parameter (n )
862862 assert not torch .equal (param , new_param ), f"Parameter { n } has not changed."
863863
def test_training_with_pad_to_multiple_of(self):
    """Run GRPO training with `pad_to_multiple_of=8` and check that every model parameter gets updated."""
    train_split = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train")

    # Batch size, number of generations and completion length are kept small to reduce memory usage. The learning
    # rate is raised because gradients are tiny and the default lr can stall updates.
    config = GRPOConfig(
        output_dir=self.tmp_dir,
        learning_rate=0.1,
        per_device_train_batch_size=3,
        num_generations=3,
        max_completion_length=8,
        pad_to_multiple_of=8,
        report_to="none",
    )
    trainer = GRPOTrainer(
        model="trl-internal-testing/tiny-Qwen2ForCausalLM-2.5",
        reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5",
        args=config,
        train_dataset=train_split,
    )

    # Snapshot the weights before training so that updates can be detected afterwards
    weights_before = {}
    for name, tensor in trainer.model.named_parameters():
        weights_before[name] = tensor.clone()

    trainer.train()

    final_log = trainer.state.log_history[-1]
    assert final_log["train_loss"] is not None

    # Every parameter captured before training must now hold different values
    for n, old_value in weights_before.items():
        current_value = trainer.model.get_parameter(n)
        assert not torch.equal(old_value, current_value), f"Parameter {n} has not changed."
893+
864894 def test_get_off_policy_mask (self ):
865895 """
866896 Test the logic of off-policy masking:
@@ -1771,6 +1801,43 @@ def reward_func(completions, **kwargs):
17711801 new_param = trainer .model .get_parameter (n )
17721802 assert not torch .equal (param , new_param ), f"Parameter { n } has not changed."
17731803
@require_vision
def test_training_vlm_with_pad_to_multiple_of(self):
    """Run GRPO training of a tiny Gemma3 VLM with `pad_to_multiple_of=7` and check that all parameters update."""
    # Models like Gemma3 use other forward keyword arguments like token_type_ids that also need to be padded when
    # using pad_to_multiple_of, so this test checks that the trainer pads all the necessary inputs in this case.
    train_split = load_dataset("trl-internal-testing/zen-image", "conversational_prompt_only", split="train")

    def reward_func(completions, **kwargs):
        """Reward function that rewards longer completions."""
        rewards = []
        for completion in completions:
            rewards.append(float(len(completion[0]["content"])))
        return rewards

    # Batch size, number of generations and completion length are kept small to reduce memory usage. The learning
    # rate is raised because gradients are tiny and the default lr can stall updates.
    config = GRPOConfig(
        output_dir=self.tmp_dir,
        learning_rate=0.1,
        per_device_train_batch_size=3,
        num_generations=3,
        max_completion_length=8,
        pad_to_multiple_of=7,
        report_to="none",
    )
    trainer = GRPOTrainer(
        model="trl-internal-testing/tiny-Gemma3ForConditionalGeneration",
        reward_funcs=reward_func,
        args=config,
        train_dataset=train_split,
    )

    # Snapshot the weights before training so that updates can be detected afterwards
    weights_before = {}
    for name, tensor in trainer.model.named_parameters():
        weights_before[name] = tensor.clone()

    trainer.train()

    final_log = trainer.state.log_history[-1]
    assert final_log["train_loss"] is not None

    # Every parameter captured before training must now hold different values
    for n, old_value in weights_before.items():
        current_value = trainer.model.get_parameter(n)
        assert not torch.equal(old_value, current_value), f"Parameter {n} has not changed."
1840+
17741841 @pytest .mark .parametrize (
17751842 "model_id" ,
17761843 [
@@ -2554,6 +2621,47 @@ def test_training_with_liger_grpo_kernel_and_peft(self, model_name):
25542621
25552622 release_memory (model , trainer )
25562623
@require_liger_kernel
def test_liger_grpo_kernel_importance_sampling(self):
    """Train with the Liger GRPO loss and sequence-level importance sampling; all parameters must change."""
    checkpoint = "trl-internal-testing/tiny-LlamaForCausalLM-3.2"

    config = GRPOConfig(
        output_dir=self.tmp_dir,
        per_device_train_batch_size=3,
        num_generations=3,
        use_liger_kernel=True,
        max_completion_length=self.max_length,
        importance_sampling_level="sequence",
        report_to="none",
        logging_strategy="no",
    )

    model = AutoModelForCausalLM.from_pretrained(checkpoint, dtype="float32")
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    # Use the EOS token for padding when the tokenizer does not define a pad token
    current_pad_token = tokenizer.pad_token
    tokenizer.pad_token = tokenizer.eos_token if current_pad_token is None else current_pad_token

    trainer = GRPOTrainer(
        model=model,
        reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5",
        args=config,
        train_dataset=self.train_dataset,
        eval_dataset=self.eval_dataset,
        processing_class=tokenizer,
    )
    from liger_kernel.chunked_loss import LigerFusedLinearGRPOLoss

    # The trainer must have set up the fused Liger loss object
    assert isinstance(trainer.liger_grpo_loss, LigerFusedLinearGRPOLoss)

    # Snapshot the weights before training so that updates can be detected afterwards
    weights_before = {}
    for name, tensor in model.named_parameters():
        weights_before[name] = tensor.clone()

    trainer.train()

    for n, old_value in weights_before.items():
        current_value = model.get_parameter(n)
        assert not torch.equal(old_value, current_value), f"Parameter {n} has not changed."

    release_memory(model, trainer)
2664+
25572665 @pytest .mark .parametrize (
25582666 "model_name" ,
25592667 [
0 commit comments