Commit 3cf89cf

export for fp8 lora base model
Signed-off-by: Suguna Velury <[email protected]>
1 parent 70200f1 commit 3cf89cf

File tree: 5 files changed (+68, -22)

examples/llm_ptq/example_utils.py

Lines changed: 26 additions & 0 deletions
@@ -25,6 +25,7 @@
 import transformers
 from accelerate import infer_auto_device_map, init_empty_weights
 from accelerate.utils import get_max_memory
+from safetensors.torch import load_file
 from transformers import AutoConfig, AutoModelForCausalLM, AutoProcessor, AutoTokenizer

 try:

@@ -115,6 +116,31 @@ def get_dtype(dtype):
     return dtype


+def get_lora_model(
+    ckpt_path: str,
+    device="cuda",
+):
+    """
+    Loads a QLoRA model that has been trained using modelopt trainer.
+    """
+    device_map = "auto"
+    if device == "cpu":
+        device_map = "cpu"
+
+    # Load model with adapters
+    model = AutoModelForCausalLM.from_pretrained(ckpt_path, device_map=device_map)
+
+    # Restore modelopt state
+    modelopt_state = torch.load(f"{ckpt_path}/modelopt_state.pth", weights_only=False)
+    restore_from_modelopt_state(model, modelopt_state)
+
+    # Load compressed weights
+    state_dict = load_file(f"{ckpt_path}/model.safetensors")
+    model.load_state_dict(state_dict, strict=False)
+
+    return model
+
+
 def get_model(
     ckpt_path,
     device="cuda",

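Note: get_lora_model assumes the checkpoint directory written by the modelopt trainer contains modelopt_state.pth and model.safetensors alongside the adapter files. A minimal usage sketch (the checkpoint path is a placeholder, and the import assumes the caller sits next to example_utils.py, as hf_ptq.py does):

    from example_utils import get_lora_model

    # Placeholder path to a QLoRA checkpoint saved by the modelopt trainer; the
    # directory is expected to contain modelopt_state.pth and model.safetensors.
    model = get_lora_model("./qlora_ckpt", device="cuda")
    model.eval()
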
examples/llm_ptq/hf_ptq.py

Lines changed: 21 additions & 8 deletions
@@ -241,14 +241,20 @@ def main(args):
     # If low memory mode is enabled, we compress the model while loading the HF checkpoint.
     calibration_only = False
     if not args.low_memory_mode:
-        model = get_model(
-            args.pyt_ckpt_path,
-            args.device,
-            gpu_mem_percentage=args.gpu_max_mem_percentage,
-            trust_remote_code=args.trust_remote_code,
-            use_seq_device_map=args.use_seq_device_map,
-            attn_implementation=args.attn_implementation,
-        )
+        if args.lora:
+            model = get_lora_model(
+                args.pyt_ckpt_path,
+                args.device,
+            )
+        else:
+            model = get_model(
+                args.pyt_ckpt_path,
+                args.device,
+                gpu_mem_percentage=args.gpu_max_mem_percentage,
+                trust_remote_code=args.trust_remote_code,
+                use_seq_device_map=args.use_seq_device_map,
+                attn_implementation=args.attn_implementation,
+            )
     else:
         assert args.qformat in QUANT_CFG_CHOICES, (
             f"Quantization format is not supported for low memory mode. Supported formats: {QUANT_CFG_CHOICES.keys()}"

@@ -629,6 +635,7 @@ def output_decode(generated_ids, input_shape):
             "They will be set at deployment time."
         )

+    print("DEBUG LOG: Calling unified export hf checkpoint")
     export_hf_checkpoint(
         full_model,
         export_dir=export_path,

@@ -772,6 +779,12 @@ def output_decode(generated_ids, input_shape):
         default=None,
         type=str,
     )
+    parser.add_argument(
+        "--lora",
+        help="Specify the model to be exported is a LoRA model trained using modelopt.",
+        default=False,
+        action="store_true",
+    )

     args = parser.parse_args()

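Note: with the new flag, a QLoRA checkpoint produced by the modelopt trainer can be pushed through the existing PTQ export flow roughly as sketched below. The paths and the chosen qformat are placeholders; --pyt_ckpt_path and --qformat already exist in the script, and only --lora is introduced by this commit.

    python hf_ptq.py --pyt_ckpt_path ./qlora_ckpt --qformat fp8 --lora
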
modelopt/torch/export/quant_utils.py

Lines changed: 3 additions & 5 deletions
@@ -1087,11 +1087,9 @@ def get_quant_config(named_modules: nn.Module | dict[str, nn.Module]) -> dict[st
         if block_size == 0:
             block_size = get_weight_block_size(module)

-        # Handles case if default weight quantizer is not enabled or is None
-        if block_size != 0:
-            # Construct per layer config dictionary
-            layer_config_dict[name + ".quantization"] = quantization_format
-            layer_config_dict[name + ".awq_block_size"] = block_size
+        # Construct per layer config dictionary
+        layer_config_dict[name + ".quantization"] = quantization_format
+        layer_config_dict[name + ".awq_block_size"] = block_size

         # Find kv cache quant format
         if (

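Note: with the guard removed, get_quant_config records both per-layer keys for every quantized module it visits, leaving awq_block_size at 0 for formats such as FP8 that do not use block quantization. An illustrative sketch of the resulting entries (the layer name and format string are hypothetical):

    # Hypothetical per-layer entries in layer_config_dict after this change
    layer_config_dict = {
        "model.layers.0.mlp.down_proj.quantization": "fp8",
        "model.layers.0.mlp.down_proj.awq_block_size": 0,
    }
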
modelopt/torch/export/unified_export_hf.py

Lines changed: 15 additions & 7 deletions
@@ -518,32 +518,40 @@ def export_hf_checkpoint(
         export_dir: the target export path.
         save_modelopt_state: whether to save the modelopt state_dict.
     """
+    is_lora = hasattr(model, "base_model")
+    base_export_dir: Path | str = f"{export_dir}/base_model" if is_lora else export_dir
     export_dir = Path(export_dir)
     export_dir.mkdir(parents=True, exist_ok=True)
+    base_export_dir = Path(base_export_dir)
+    base_export_dir.mkdir(parents=True, exist_ok=True)
+
     try:
         post_state_dict, hf_quant_config = _export_hf_checkpoint(model, dtype)

         # NOTE: (hg) Should we save hf_quant_config when there's no quantization applied?
         # Save hf_quant_config.json for backward compatibility
-        with open(f"{export_dir}/hf_quant_config.json", "w") as file:
+        with open(f"{base_export_dir}/hf_quant_config.json", "w") as file:
             json.dump(hf_quant_config, file, indent=4)

         hf_quant_config = convert_hf_quant_config_format(hf_quant_config)

         post_state_dict = rename_and_prune_if_spec_decoding(model, post_state_dict)

-        # For QLoRA models we export the base model
-        if hasattr(model, "base_model"):
-            model = model.base_model
+        # In the case of LoRA model, we save the base model
+        if is_lora:
+            model.base_model.save_pretrained(
+                base_export_dir, state_dict=post_state_dict, save_modelopt_state=save_modelopt_state
+            )
+
         model.save_pretrained(
             export_dir, state_dict=post_state_dict, save_modelopt_state=save_modelopt_state
         )

-        original_config = f"{export_dir}/config.json"
+        original_config = f"{base_export_dir}/config.json"
         config_data = {}

-        with open(original_config) as file:
-            config_data = json.load(file)
+        # In the case of LoRA model.save_pretrained does not save the correct config.json
+        config_data = model.config.to_dict()

         config_data["quantization_config"] = hf_quant_config
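Note: a sketch of how the updated export is invoked for a QLoRA model and where the artifacts land, assuming export_hf_checkpoint is imported as in hf_ptq.py and model is the object returned by get_lora_model (the output directory is a placeholder):

    from modelopt.torch.export import export_hf_checkpoint

    # model has a .base_model attribute, so is_lora is True: the LoRA/adapter save
    # goes to ./exported, while hf_quant_config.json, the patched config.json, and
    # the quantized base-model weights go to ./exported/base_model (per the diff above).
    export_hf_checkpoint(model, export_dir="./exported")
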
modelopt/torch/quantization/plugins/transformers_trainer.py

Lines changed: 3 additions & 2 deletions
@@ -147,7 +147,7 @@ def __init__(
             self.model, "peft_config"
         ):
             # TODO: use get_peft_model here instead of add_adapter
-            self.model.add_adapter(self.args.lora_config, adapter_name="adapter")
+            self.model.add_adapter(self.args.lora_config)
             print_rank_0("Lora adapter added.")

         if hasattr(self.model, "peft_config") and self.quant_cfg is not None:

@@ -185,6 +185,7 @@ def _save_modelopt_state_with_weights(self):
         # Save base model compressed weights for QLoRA
         if getattr(self.quant_args, "compress", False):
             # Save base model config.json
+            # weight_quantizer = self.quant_cfg["quant_cfg"]["*weight_quantizer"]
             self.model.config.save_pretrained(self.args.output_dir)

             # Save base model compressed weights excluding lora weights

@@ -362,7 +363,7 @@ def __init__(
         if self.quant_cfg is not None and not is_quantized(self.model):
             self._quantize_model()
         if getattr(self.args, "lora_config", None) is not None:
-            self.model.add_adapter(self.args.lora_config, adapter_name="adapter")
+            self.model.add_adapter(self.args.lora_config)
             print_rank_0("Lora adapter added.")
         self._convert_to_distillation_model()

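Note: dropping the explicit adapter_name means transformers' add_adapter registers the LoRA adapter under its default name ("default"), which is what PEFT tooling assumes when no name is given. A minimal sketch (the base model path and LoraConfig values are illustrative, not the trainer's defaults):

    from peft import LoraConfig
    from transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained("path/to/base_model")  # placeholder
    lora_config = LoraConfig(r=16, lora_alpha=32, target_modules="all-linear")

    # With no adapter_name argument, transformers registers the adapter as "default",
    # mirroring the trainer's self.model.add_adapter(self.args.lora_config) call.
    model.add_adapter(lora_config)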