
Commit 4fe74cd

ci: add QLoRA test (#1017)
Signed-off-by: Alexandros Koumparoulis <[email protected]>
1 parent b7031bb commit 4fe74cd

File tree: 4 files changed, +225 −0 lines changed
L2_HF_QLORA_Tiny.sh (new file)
Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
#!/bin/bash
# Copyright (c) 2020-2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -xeuo pipefail  # Echo commands; exit on errors, unset variables, and pipeline failures

export PYTHONPATH=${PYTHONPATH:-}:$(pwd)
export CUDA_VISIBLE_DEVICES="0"

# NOTE: This functional test assumes the CI harness provides:
#   - $TEST_DATA_DIR (local tiny HF model fixtures)
#   - $HF_CACHE (local dataset fixtures)

TRANSFORMERS_OFFLINE=1 python -m torch.distributed.run \
    --master-port=29511 --nproc_per_node=1 --nnodes=1 \
    -m coverage run --data-file=/workspace/.coverage --source=/workspace \
    -m pytest tests/functional_tests/training/test_qlora_tiny.py \
    --config tests/functional_tests/hf_peft/qlora_tiny_squad.yaml \
    --model.pretrained_model_name_or_path $TEST_DATA_DIR/hf_mixtral_2l/ \
    --dataset.tokenizer.pretrained_model_name_or_path $TEST_DATA_DIR/hf_mixtral_2l/ \
    --validation_dataset.tokenizer.pretrained_model_name_or_path $TEST_DATA_DIR/hf_mixtral_2l/ \
    --dataset.dataset_name $HF_CACHE/squad/ \
    --validation_dataset.dataset_name $HF_CACHE/squad/ \
    --step_scheduler.max_steps 2 \
    --step_scheduler.global_batch_size 2 \
    --step_scheduler.local_batch_size 1 \
    --checkpoint.enabled true \
    --checkpoint.checkpoint_dir checkpoints/ \
    --checkpoint.model_save_format safetensors \
    --checkpoint.save_consolidated false
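The launcher overrides individual YAML keys with dotted CLI flags (for example --step_scheduler.max_steps 2). A minimal sketch of how such dotted overrides can be merged into a loaded config; the actual behavior of parse_args_and_load_config may differ:

# Sketch only: dotted-path CLI overrides merged into a YAML config.
# The real parse_args_and_load_config in nemo_automodel may behave differently.
import yaml

def apply_override(cfg: dict, dotted_key: str, value) -> None:
    """Set cfg['a']['b']['c'] = value for dotted_key 'a.b.c'."""
    node = cfg
    *parents, leaf = dotted_key.lstrip("-").split(".")
    for key in parents:
        node = node.setdefault(key, {})
    node[leaf] = value

with open("tests/functional_tests/hf_peft/qlora_tiny_squad.yaml") as f:
    cfg = yaml.safe_load(f)
apply_override(cfg, "step_scheduler.max_steps", 2)
apply_override(cfg, "checkpoint.model_save_format", "safetensors")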
tests/functional_tests/hf_peft/qlora_tiny_squad.yaml (new file)
Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Tiny QLoRA smoke config used by functional tests.
# The launcher script overrides the model + dataset paths to use local test fixtures.

step_scheduler:
  global_batch_size: 2
  local_batch_size: 1
  ckpt_every_steps: 2
  val_every_steps: null
  max_steps: 2

dist_env:
  backend: nccl
  timeout_minutes: 5

rng:
  _target_: nemo_automodel.components.training.rng.StatefulRNG
  seed: 42
  ranked: true

model:
  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
  # Overridden by the functional-test launcher to a tiny local HF model.
  pretrained_model_name_or_path: hf-internal-testing/tiny-random-gpt2

peft:
  _target_: nemo_automodel.components._peft.lora.PeftConfig
  match_all_linear: true
  dim: 4
  alpha: 8
  dropout: 0.0

distributed:
  _target_: nemo_automodel.components.distributed.fsdp2.FSDP2Manager
  dp_size: none
  dp_replicate_size: 1
  tp_size: 1
  cp_size: 1
  sequence_parallel: false

quantization:
  load_in_4bit: true
  load_in_8bit: false
  bnb_4bit_compute_dtype: bfloat16
  bnb_4bit_use_double_quant: true
  bnb_4bit_quant_type: nf4
  bnb_4bit_quant_storage: bfloat16

loss_fn:
  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy

dataset:
  _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
  dataset_name: rajpurkar/squad
  split: train
  limit_dataset_samples: 128

packed_sequence:
  packed_sequence_size: 0

dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader
  collate_fn: nemo_automodel.components.datasets.utils.default_collater
  shuffle: false

validation_dataset:
  _target_: nemo_automodel.components.datasets.llm.squad.make_squad_dataset
  dataset_name: rajpurkar/squad
  split: validation
  limit_dataset_samples: 64

validation_dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader
  collate_fn: nemo_automodel.components.datasets.utils.default_collater

optimizer:
  _target_: torch.optim.AdamW
  betas: [0.9, 0.999]
  eps: 1.0e-8
  lr: 1.0e-4
  weight_decay: 0.0

checkpoint:
  enabled: true
  checkpoint_dir: checkpoints/
  model_save_format: safetensors
  save_consolidated: false
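The quantization block uses the same field names as Hugging Face transformers' BitsAndBytesConfig. A minimal sketch of the equivalent object, assuming nemo_automodel forwards these fields to transformers/bitsandbytes (that plumbing is not part of this commit):

# Sketch only: the bitsandbytes setup described by the `quantization` block,
# expressed with transformers' BitsAndBytesConfig.
import torch
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    load_in_8bit=False,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_quant_storage=torch.bfloat16,
)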

tests/functional_tests/hf_peft/test_hf_peft.py

Lines changed: 8 additions & 0 deletions
@@ -20,6 +20,7 @@
 HF_PEFT_FSDP2_CHECKPOINT_QAT_FILENAME = "L2_HF_PEFT_FSDP2_Checkpoint_qat.sh"
 HF_PEFT_Triton_FSDP2_CHECKPOINT_FILENAME = "L2_HF_PEFT_Triton_FSDP2_Checkpoint.sh"
 HF_PEFT_VLM_FSDP2_CHECKPOINT_FILENAME = "L2_HF_PEFT_VLM_FSDP2_Checkpoint.sh"
+HF_QLORA_TINY_FILENAME = "L2_HF_QLORA_Tiny.sh"

 class TestHFPEFT:
     def test_hf_peft_fsdp2_checkpoint(self):
@@ -45,6 +46,13 @@ def test_hf_peft_triton_fsdp2_checkpoint(self):
     def test_hf_peft_vlm_fsdp2_checkpoint(self):
         try:
             run_test_script(TEST_FOLDER, HF_PEFT_VLM_FSDP2_CHECKPOINT_FILENAME)
+        finally:
+            # remove the checkpoint directory
+            shutil.rmtree("checkpoints/", ignore_errors=True)
+
+    def test_hf_qlora_tiny(self):
+        try:
+            run_test_script(TEST_FOLDER, HF_QLORA_TINY_FILENAME)
         finally:
             # remove the checkpoint directory
             shutil.rmtree("checkpoints/", ignore_errors=True)
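run_test_script is an existing helper in this test module and is not shown in the diff. A hypothetical stand-in, only to illustrate how the pytest case drives the new shell launcher:

# Hypothetical stand-in for the repo's run_test_script helper (not part of this
# commit); illustrates how a pytest case can invoke L2_HF_QLORA_Tiny.sh.
import subprocess
from pathlib import Path

def run_test_script(test_folder: str, filename: str) -> None:
    script = Path(test_folder) / filename
    subprocess.run(["bash", str(script)], check=True)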
tests/functional_tests/training/test_qlora_tiny.py (new file)
Lines changed: 73 additions & 0 deletions
@@ -0,0 +1,73 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tiny QLoRA smoke test (4-bit + LoRA) for the functional test suite."""

from __future__ import annotations

import sys

import datasets
import pytest
import torch

from nemo_automodel.components.config._arg_parser import parse_args_and_load_config
from nemo_automodel.components.quantization.qlora import verify_qlora_quantization
from nemo_automodel.recipes.llm.train_ft import TrainFinetuneRecipeForNextTokenPrediction

datasets.disable_caching()


def _get_cfg_path() -> str:
    argv = sys.argv[1:]
    for i, tok in enumerate(argv):
        if tok in ("--config", "-c"):
            if i + 1 >= len(argv):
                raise ValueError("Expected a path after --config")
            return argv[i + 1]
    raise ValueError("Expected --config/-c to be provided by the functional-test launcher")


@pytest.mark.skipif(not torch.cuda.is_available(), reason="QLoRA functional test requires CUDA")
def test_qlora_tiny_smoke():
    """
    End-to-end smoke test:
      - load a tiny HF model in 4-bit (bitsandbytes)
      - apply LoRA adapters
      - run a couple of training steps
      - assert: model is quantized and LoRA params are trainable
    """
    pytest.importorskip("bitsandbytes")

    cfg = parse_args_and_load_config(_get_cfg_path())
    trainer = TrainFinetuneRecipeForNextTokenPrediction(cfg)
    trainer.setup()

    # Single-stage model in this config
    model = trainer.model_parts[0]

    is_quantized = bool(getattr(model, "is_loaded_in_4bit", False)) or bool(
        getattr(getattr(model, "config", None), "quantization_config", None)
    )
    is_quantized = is_quantized or verify_qlora_quantization(model)
    assert is_quantized, "Expected 4-bit quantization to be active for QLoRA"

    trainable = [n for n, p in model.named_parameters() if p.requires_grad]
    assert any("lora" in n.lower() for n in trainable), f"Expected LoRA trainable params, got: {trainable[:20]}"

    # Run a very short training loop (max_steps is controlled by the config/CLI overrides)
    trainer.run_train_validation_loop()
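verify_qlora_quantization comes from nemo_automodel.components.quantization.qlora and its implementation is not part of this diff. A minimal sketch of one way such a check could work, assuming bitsandbytes replaces nn.Linear layers with Linear4bit when load_in_4bit is set:

# Sketch only: detect 4-bit layers in the spirit of verify_qlora_quantization
# (whose actual implementation is not shown in this commit).
import bitsandbytes as bnb
import torch.nn as nn

def has_4bit_layers(model: nn.Module) -> bool:
    # bitsandbytes swaps nn.Linear for Linear4bit when loading in 4-bit
    return any(isinstance(module, bnb.nn.Linear4bit) for module in model.modules())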

0 commit comments