File tree: 2 files changed, +14 −6 lines changed

@@ -184,15 +184,18 @@ def test_train(self, model_id):
184184 # Get the dataset
185185 dataset = load_dataset ("trl-internal-testing/zen" , "standard_preference" , split = "train" )
186186
187- # NemotronH does not support gradient checkpointing
188- gradient_checkpointing = "NemotronH" not in model_id
187+ # NemotronH (hybrid Mamba-Attention) does not support gradient checkpointing. The Mamba CUDA
188+ # kernels require strides to be multiples of 8, which is incompatible with tiny model dimensions.
189+ # Force CPU so that the model uses the pure PyTorch path (works fine on GPU without kernels).
190+ is_nemotron = "NemotronH" in model_id
189191
190192 # Initialize the trainer
191193 training_args = DPOConfig (
192194 output_dir = self .tmp_dir ,
193195 learning_rate = 0.1 , # use higher lr because gradients are tiny and default lr can stall updates
194196 report_to = "none" ,
195- gradient_checkpointing = gradient_checkpointing ,
197+ gradient_checkpointing = not is_nemotron ,
198+ use_cpu = is_nemotron ,
196199 )
197200 trainer = DPOTrainer (model = model_id , args = training_args , train_dataset = dataset )
198201
@@ -297,12 +297,17 @@ def test_train(self, model_id):
297297 # Get the dataset
298298 dataset = load_dataset ("trl-internal-testing/zen" , "standard_language_modeling" , split = "train" )
299299
300- # NemotronH does not support gradient checkpointing
301- gradient_checkpointing = "NemotronH" not in model_id
300+ # NemotronH (hybrid Mamba-Attention) does not support gradient checkpointing. The Mamba CUDA
301+ # kernels require strides to be multiples of 8, which is incompatible with tiny model dimensions.
302+ # Force CPU so that the model uses the pure PyTorch path (works fine on GPU without kernels).
303+ is_nemotron = "NemotronH" in model_id
302304
303305 # Initialize the trainer
304306 training_args = SFTConfig (
305- output_dir = self .tmp_dir , report_to = "none" , gradient_checkpointing = gradient_checkpointing
307+ output_dir = self .tmp_dir ,
308+ report_to = "none" ,
309+ gradient_checkpointing = not is_nemotron ,
310+ use_cpu = is_nemotron ,
306311 )
307312 trainer = SFTTrainer (model = model_id , args = training_args , train_dataset = dataset )
308313
0 commit comments