
Commit 5beed13: updated readme

Signed-off-by: Suguna Velury <[email protected]>

1 parent 6254cad

File tree: 3 files changed (+9 -4 lines)


examples/llm_qat/README.md

Lines changed: 5 additions & 1 deletion
````diff
@@ -354,7 +354,11 @@ To perform QLoRA training, run:
     --lora True
 ```
 
-> **_NOTE:_** QLoRA is currently an experimental feature designed to reduce the memory footprint during training. Deployment functionality is not yet available.
+After performing QLoRA training, the final checkpoint is exported ready for deployment. For more details about QLoRA deployment using vLLM, refer to the documentation [here](https://docs.vllm.ai/en/latest/features/lora.html). To deploy with vLLM, run:
+
+```sh
+vllm serve llama3-fp4-qlora/base_model --enable-lora --lora-modules adapter=llama3-fp4-qlora --port 8000 --tokenizer llama3-fp4-qlora
+```
 
 ## Pre-Quantized Checkpoints
 
````
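Once the server is up, the adapter can be queried through vLLM's OpenAI-compatible endpoint. A minimal sketch using the `openai` Python client, assuming the serve command above is running (adapter name `adapter`, port 8000); the prompt and generation settings are illustrative:

```python
# Query the QLoRA adapter served by vLLM via its OpenAI-compatible API.
# Assumes `vllm serve ... --lora-modules adapter=llama3-fp4-qlora --port 8000`
# from the README diff above is already running.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

completion = client.completions.create(
    model="adapter",  # the LoRA module name registered with --lora-modules
    prompt="Explain quantization-aware training in one sentence.",
    max_tokens=64,
)
print(completion.choices[0].text)
```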

examples/llm_qat/main.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -274,7 +274,7 @@ def train():
     trainer.save_model(training_args.output_dir, **kwargs)
 
     if training_args.lora and getattr(quant_args, "compress", False):
-        trainer.export_base_model_hf_checkpoint()
+        trainer.export_base_model()
 
 
 if __name__ == "__main__":
```

modelopt/torch/quantization/plugins/transformers_trainer.py

Lines changed: 3 additions & 2 deletions
```diff
@@ -279,17 +279,18 @@ def save_model(self, *args, **kwargs):
         return outputs
 
     def _load_best_model(self, *args, **kwargs):
-        """Load the best model."""
+        """Load the best model for final evaluation."""
         is_lora = getattr(self.args, "lora", None)
         if not is_lora:
             super()._load_best_model(*args, **kwargs)
         else:
             # Custom logic for loading best model with LoRA
+            # TODO: Remove once we migrate to using get_peft_model()
             adapter_name = self.model.active_adapter()
             self.model.delete_adapter(adapter_name)
             self.model.load_adapter(self.state.best_model_checkpoint, adapter_name)
 
-    def export_base_model_hf_checkpoint(self):
+    def export_base_model(self):
         """Export the base model to a HF checkpoint for deployment."""
         # Save config.json
         if self.accelerator.is_main_process:
```
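For a quick local sanity check of the exported layout before serving, the base checkpoint and adapter can be loaded with `transformers` and `peft`. A hedged sketch: the paths reuse the example names from the README diff above, and loading a compressed FP4 base model through plain `transformers` may additionally require ModelOpt's restore utilities, so treat this as an outline rather than a verified recipe:

```python
# Sketch: load the exported base model plus LoRA adapter for a local check.
# Paths follow the README example; an FP4-compressed base checkpoint may need
# ModelOpt-specific restore hooks, so this assumes a loadable HF checkpoint.
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base = AutoModelForCausalLM.from_pretrained("llama3-fp4-qlora/base_model")
tokenizer = AutoTokenizer.from_pretrained("llama3-fp4-qlora")
model = PeftModel.from_pretrained(base, "llama3-fp4-qlora")  # attach adapter

inputs = tokenizer("Hello", return_tensors="pt")
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=16)[0]))
```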
