File tree: 2 files changed, +10 −10 lines changed

Original file line number | Diff line number | Diff line change
@@ -187,15 +187,17 @@ def test_train(self, model_id):
187187 # NemotronH (hybrid Mamba-Attention) does not support gradient checkpointing. The Mamba CUDA
188188 # kernels require strides to be multiples of 8, which is incompatible with tiny model dimensions.
189189 # Force CPU so that the model uses the pure PyTorch path (works fine on GPU without kernels).
190-     is_nemotron = "NemotronH" in model_id
190+     kwargs = {}
191+     if "NemotronH" in model_id:
192+         kwargs["gradient_checkpointing"] = False
193+         kwargs["use_cpu"] = True
191194
192195 # Initialize the trainer
193 196     training_args = DPOConfig(
194 197         output_dir=self.tmp_dir,
195 198         learning_rate=0.1,  # use higher lr because gradients are tiny and default lr can stall updates
196 199         report_to="none",
197-            gradient_checkpointing=not is_nemotron,
198-            use_cpu=is_nemotron,
    200+        **kwargs,
199 201     )
200202 trainer = DPOTrainer (model = model_id , args = training_args , train_dataset = dataset )
201203
Original file line number | Diff line number | Diff line change
@@ -300,15 +300,13 @@ def test_train(self, model_id):
300300 # NemotronH (hybrid Mamba-Attention) does not support gradient checkpointing. The Mamba CUDA
301301 # kernels require strides to be multiples of 8, which is incompatible with tiny model dimensions.
302302 # Force CPU so that the model uses the pure PyTorch path (works fine on GPU without kernels).
303-     is_nemotron = "NemotronH" in model_id
303+     kwargs = {}
304+     if "NemotronH" in model_id:
305+         kwargs["gradient_checkpointing"] = False
306+         kwargs["use_cpu"] = True
304307
305308 # Initialize the trainer
306-     training_args = SFTConfig(
307-         output_dir=self.tmp_dir,
308-         report_to="none",
309-         gradient_checkpointing=not is_nemotron,
310-         use_cpu=is_nemotron,
311-     )
    309+ training_args = SFTConfig(output_dir=self.tmp_dir, report_to="none", **kwargs)
312310 trainer = SFTTrainer (model = model_id , args = training_args , train_dataset = dataset )
313311
314312 # Save the initial parameters to compare them later
You can’t perform that action at this time.
0 commit comments