Commit b81b4de

Refactor to include QAT/QAD export too
Signed-off-by: Suguna Velury <[email protected]>
1 parent bb2d6ef commit b81b4de

2 files changed, +30 -26 lines


examples/llm_qat/export.py (+28 -17)
@@ -21,6 +21,7 @@
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer

+import modelopt.torch.opt as mto
 from modelopt.torch.export.convert_hf_config import convert_hf_quant_config_format
 from modelopt.torch.export.unified_export_hf import _export_hf_checkpoint
 from modelopt.torch.opt.conversion import restore_from_modelopt_state
@@ -29,6 +30,9 @@


 RAND_SEED = 1234

+# Enable automatic save/load of modelopt state during huggingface checkpointing
+mto.enable_huggingface_checkpointing()
+

 def get_lora_model(
     ckpt_path: str,
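Note: the module-level `mto.enable_huggingface_checkpointing()` call is what lets the QAT/QAD path below skip the manual restore. Once the hook is installed, `from_pretrained` reloads the modelopt state saved alongside the checkpoint. A minimal sketch of the intended usage, assuming a hypothetical checkpoint directory:

import modelopt.torch.opt as mto
from transformers import AutoModelForCausalLM

# Install modelopt's save/load hooks into HF checkpointing (module scope, as above).
mto.enable_huggingface_checkpointing()

# "./qat_ckpt" is a hypothetical directory saved by a modelopt-aware trainer;
# with the hooks installed, from_pretrained restores the quantized model state.
model = AutoModelForCausalLM.from_pretrained("./qat_ckpt")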
@@ -42,19 +46,20 @@ def get_lora_model(
     if device == "cpu":
         device_map = "cpu"

-    # Load model with adapters
+    # Load model
     model = AutoModelForCausalLM.from_pretrained(ckpt_path, device_map=device_map)

-    # Restore modelopt state
-    modelopt_state = torch.load(f"{ckpt_path}/modelopt_state_calib.pth", weights_only=False)
-    restore_from_modelopt_state(model, modelopt_state)
-    print_rank_0("Restored modelopt state")
+    # Restore modelopt state for LoRA models. For QAT/QAD models the from_pretrained call handles this
+    if hasattr(model, "peft_config"):
+        modelopt_state = torch.load(f"{ckpt_path}/modelopt_state_train.pth", weights_only=False)
+        restore_from_modelopt_state(model, modelopt_state)
+        print_rank_0("Restored modelopt state")

-    # Restore modelopt quantizer state dict
-    modelopt_weights = modelopt_state.pop("modelopt_state_weights", None)
-    if modelopt_weights is not None:
-        set_quantizer_state_dict(model, modelopt_weights)
-        print_rank_0("Restored modelopt quantizer state dict")
+        # Restore modelopt quantizer state dict
+        modelopt_weights = modelopt_state.pop("modelopt_state_weights", None)
+        if modelopt_weights is not None:
+            set_quantizer_state_dict(model, modelopt_weights)
+            print_rank_0("Restored modelopt quantizer state dict")

     return model

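For reference, the LoRA branch above composes two restore steps over the same saved file: one for the architecture-level modelopt state and one for the tensor-level quantizer buffers stored next to it. A standalone sketch under the same assumptions as the diff (the `set_quantizer_state_dict` import path is an assumption, and `./qlora_ckpt` is a hypothetical checkpoint directory):

import torch
from transformers import AutoModelForCausalLM
from modelopt.torch.opt.conversion import restore_from_modelopt_state
from modelopt.torch.quantization.utils import set_quantizer_state_dict  # assumed import path

ckpt_path = "./qlora_ckpt"  # hypothetical (Q)LoRA checkpoint directory
model = AutoModelForCausalLM.from_pretrained(ckpt_path)

# Architecture-level state: which modules carry quantizers and in which mode.
state = torch.load(f"{ckpt_path}/modelopt_state_train.pth", weights_only=False)
restore_from_modelopt_state(model, state)

# Tensor-level state: quantizer buffers saved alongside the architecture state.
weights = state.pop("modelopt_state_weights", None)
if weights is not None:
    set_quantizer_state_dict(model, weights)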

@@ -63,25 +68,31 @@ def main(args):
     # Load model
     model = get_lora_model(args.pyt_ckpt_path, args.device)
     tokenizer = AutoTokenizer.from_pretrained(args.pyt_ckpt_path)
+    is_qlora = hasattr(model, "peft_config")

     # Export HF checkpoint
     export_dir = Path(args.export_path)
     export_dir.mkdir(parents=True, exist_ok=True)
-    base_model_dir = export_dir / "base_model"
-    base_model_dir.mkdir(parents=True, exist_ok=True)
+    if is_qlora:
+        base_model_dir = export_dir / "base_model"
+        base_model_dir.mkdir(parents=True, exist_ok=True)
+    else:
+        base_model_dir = export_dir

     try:
-        post_state_dict, hf_quant_config = _export_hf_checkpoint(model, is_modelopt_qlora=True)
+        post_state_dict, hf_quant_config = _export_hf_checkpoint(model, is_modelopt_qlora=is_qlora)

         with open(f"{base_model_dir}/hf_quant_config.json", "w") as file:
             json.dump(hf_quant_config, file, indent=4)

         hf_quant_config = convert_hf_quant_config_format(hf_quant_config)

-        # Save base model
-        model.base_model.save_pretrained(f"{base_model_dir}", state_dict=post_state_dict)
-        # Save adapters
-        model.save_pretrained(export_dir)
+        # Save model
+        if is_qlora:
+            model.base_model.save_pretrained(f"{base_model_dir}", state_dict=post_state_dict)
+            model.save_pretrained(export_dir)
+        else:
+            model.save_pretrained(export_dir, state_dict=post_state_dict)

         config_path = f"{base_model_dir}/config.json"
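The `is_qlora` branch gives two export layouts: QLoRA checkpoints keep the quantized base model under `base_model/` with the adapters at the top level, while QAT/QAD checkpoints are written flat into the export directory. A hedged sketch of invoking the script both ways (the `--pyt_ckpt_path`/`--export_path` flag names are assumed from the argparse attributes visible above; the directories are hypothetical):

import subprocess

# QLoRA checkpoint: adapters at export_path/, quantized base model under
# export_path/base_model/ together with hf_quant_config.json.
subprocess.run(
    ["python", "examples/llm_qat/export.py",
     "--pyt_ckpt_path", "./qlora_ckpt", "--export_path", "./export_qlora"],
    check=True,
)

# QAT/QAD checkpoint: a single flat HF checkpoint at export_path/.
subprocess.run(
    ["python", "examples/llm_qat/export.py",
     "--pyt_ckpt_path", "./qat_ckpt", "--export_path", "./export_qat"],
    check=True,
)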

modelopt/torch/quantization/plugins/transformers_trainer.py (+2 -9)
@@ -209,14 +209,8 @@ def forward_loop(model):
         print_rank_0("Quantizing the model...")
         mtq.quantize(self.model, self.quant_cfg, forward_loop)  # type: ignore [arg-type]

-        # Save modelopt state before compression. This is used to later export the model for deployment.
-        modelopt_state = mto.modelopt_state(self.model)
-        modelopt_state["modelopt_state_weights"] = get_quantizer_state_dict(self.model)
-        torch.save(modelopt_state, f"{self.args.output_dir}/modelopt_state_calib.pth")
-
-        print_rank_0(
-            f"Saved modelopt state before compression to {f'{self.args.output_dir}/modelopt_state_calib.pth'}"
-        )
+        # Save modelopt state
+        self._save_modelopt_state_with_weights()

         if getattr(self.quant_args, "compress", False):
             print_rank_0("Compressing model after calibration")
@@ -225,7 +219,6 @@ def forward_loop(model):
             # Force garbage collection to free up memory
             gc.collect()

-            self._save_modelopt_state_with_weights()
             torch.cuda.empty_cache()

         if self.accelerator.is_main_process:
