Commit 035117f (1 parent: 74061f5)

e2e example for qlora ddp export

4 files changed (+48 -5 lines)

examples/llm_qat/main.py

Lines changed: 3 additions & 0 deletions

@@ -273,6 +273,9 @@ def train():
     kwargs = {"export_student": True} if training_args.distill else {}
     trainer.save_model(training_args.output_dir, **kwargs)
 
+    if training_args.lora and getattr(quant_args, "compress", False):
+        trainer.export_base_model_hf_checkpoint()
+
 
 if __name__ == "__main__":
     train()
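
For context, the gate added here only fires for compressed QLoRA runs. A runnable toy mirroring the condition; the real objects are the example's HF trainer and parsed argument dataclasses, whose shapes are assumed here:

    # Toy sketch, not the shipped example: training_args/quant_args stand in
    # for the dataclasses produced by the example's HF argument parser.
    from types import SimpleNamespace

    training_args = SimpleNamespace(lora=True, output_dir="./out")
    quant_args = SimpleNamespace(compress=True)

    def export_base_model_hf_checkpoint():
        print("exporting quantized base model to ./out/base_model")

    # Same condition as the diff: only LoRA runs with weight compression
    # enabled trigger the base-model export after training.
    if training_args.lora and getattr(quant_args, "compress", False):
        export_base_model_hf_checkpoint()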

modelopt/torch/export/quant_utils.py

Lines changed: 9 additions & 0 deletions

@@ -826,6 +826,9 @@ def postprocess_state_dict(state_dict: dict, maxbound: float, quantization: str
         "k_bmm_quantizer._bias_value": "k_proj.k_bias",
         "v_bmm_quantizer._bias_value": "v_proj.v_bias",
         "input_quantizer._pre_quant_scale": "pre_quant_scale",
+        "base_layer.weight": "weight",
+        "base_layer.input_scale": "input_scale",
+        "base_layer.weight_scale": "weight_scale",
     }
 
     post_state_dict = {}
@@ -837,6 +840,7 @@ def postprocess_state_dict(state_dict: dict, maxbound: float, quantization: str
             and "_amax" not in key
             and "_bias_value" not in key
             and "input_quantizer._pre_quant_scale" not in key
+            and "base_layer" not in key
         ):
             post_state_dict[key] = value
             continue
@@ -888,6 +892,11 @@ def postprocess_state_dict(state_dict: dict, maxbound: float, quantization: str
         ):
             keys_to_delete.append(key)
 
+    # remove LoRA adapters from state dict
+    for key, value in post_state_dict.items():
+        if "lora" in key and key not in keys_to_delete:
+            keys_to_delete.append(key)
+
     # Check for tied weights and remove duplicates
     seen_tensors = {}
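
To see what the new base_layer handling does, consider a PEFT-style QLoRA state dict: the quantized weights live under base_layer.* and the adapters under lora_* keys. A toy sketch of the remap-and-prune behavior (key names and values are hypothetical; the real function also handles amax/bias keys not shown here):

    # Toy illustration of the remapping postprocess_state_dict now performs.
    replacements = {
        "base_layer.weight": "weight",
        "base_layer.input_scale": "input_scale",
        "base_layer.weight_scale": "weight_scale",
    }

    state_dict = {
        "layers.0.q_proj.base_layer.weight": "packed-int4",
        "layers.0.q_proj.base_layer.weight_scale": "scale",
        "layers.0.q_proj.lora_A.default.weight": "A",
        "layers.0.q_proj.lora_B.default.weight": "B",
    }

    post_state_dict = {}
    for key, value in state_dict.items():
        if "lora" in key:  # adapters are stripped from the exported base model
            continue
        for old, new in replacements.items():
            if key.endswith(old):  # "...q_proj.base_layer.weight" -> "...q_proj.weight"
                key = key[: -len(old)] + new
        post_state_dict[key] = value

    # post_state_dict == {"layers.0.q_proj.weight": "packed-int4",
    #                     "layers.0.q_proj.weight_scale": "scale"}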

modelopt/torch/export/unified_export_hf.py

Lines changed: 14 additions & 5 deletions

@@ -29,7 +29,7 @@
 
 from modelopt.torch.quantization import set_quantizer_by_cfg_context
 from modelopt.torch.quantization.nn import SequentialQuantizer, TensorQuantizer
-from modelopt.torch.quantization.qtensor import NVFP4QTensor
+from modelopt.torch.quantization.qtensor import NVFP4QTensor, QTensorWrapper
 from modelopt.torch.quantization.utils import quantizer_attr_names
 
 from .convert_hf_config import convert_hf_quant_config_format
@@ -85,6 +85,9 @@ def _is_enabled_quantizer(quantizer):
 
 def requantize_resmooth_fused_llm_layers(model: torch.nn.Module):
     """Group modules that take the same input and register shared parameters in module."""
+    # Skip for LoRA finetuned models
+    if hasattr(model, "base_model"):
+        return
     # TODO: Handle DBRX MoE
     input_to_linear = defaultdict(list)
     output_to_layernorm = defaultdict(None)
@@ -311,7 +314,7 @@ def _export_quantized_weight(
         )[0]
 
         quantized_weight = to_quantized_weight(
-            weight.to(dtype),
+            weight.to(dtype) if not isinstance(weight, QTensorWrapper) else weight,
             weight_scale,
             quantization_format,
             weight_scale_2,
@@ -323,7 +326,7 @@ def _export_quantized_weight(
         )
     else:
         quantized_weight = to_quantized_weight(
-            weight.to(dtype),
+            weight.to(dtype) if not isinstance(weight, QTensorWrapper) else weight,
            weight_scale,
             quantization_format,
             weight_scale_2,
@@ -457,7 +460,11 @@ def _export_hf_checkpoint(
     for name, sub_module in layer_pool.items():
         if get_quantization_format(sub_module) != QUANTIZATION_NONE:
             has_quantized_layers = True
-            if is_quantlinear(sub_module):
+            if (
+                is_quantlinear(sub_module)
+                and hasattr(sub_module, "weight_quantizer")
+                and sub_module.weight_quantizer.is_enabled
+            ):
                 _export_quantized_weight(sub_module, dtype)
             elif (
                 "Llama4TextExperts" in type(sub_module).__name__
@@ -519,7 +526,9 @@ def export_hf_checkpoint(
 
         post_state_dict = rename_and_prune_if_spec_decoding(model, post_state_dict)
 
-        # Save model
+        # For QLoRA models we export the base model
+        if hasattr(model, "base_model"):
+            model = model.base_model
         model.save_pretrained(
             export_dir, state_dict=post_state_dict, save_modelopt_state=save_modelopt_state
         )
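
Two of the hunks above replace weight.to(dtype) with a guarded form. The reason, presumably, is that with compression enabled a QLoRA base weight arrives already packed as a quantized tensor, so casting the packed payload to the export dtype would mangle it. A minimal runnable sketch of the dispatch, using a stand-in wrapper class rather than the real modelopt type:

    import torch

    class QTensorWrapper:  # stand-in for modelopt's wrapper, illustration only
        def __init__(self, packed: torch.Tensor):
            self.packed = packed  # e.g. a uint8-packed NVFP4 payload

    def prepare_weight(weight, dtype=torch.bfloat16):
        # Plain tensors are cast to the export dtype; already-packed weights
        # pass through untouched, mirroring the isinstance check in the diff.
        return weight.to(dtype) if not isinstance(weight, QTensorWrapper) else weight

    assert prepare_weight(torch.randn(4, 4)).dtype == torch.bfloat16
    packed = QTensorWrapper(torch.zeros(4, 2, dtype=torch.uint8))
    assert prepare_weight(packed) is packed  # no cast applied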

modelopt/torch/quantization/plugins/transformers_trainer.py

Lines changed: 22 additions & 0 deletions

@@ -16,6 +16,7 @@
 """ModelOpt plugin for transformers Trainer."""
 
 import gc
+import json
 import os
 import types
 from dataclasses import dataclass, field
@@ -28,6 +29,7 @@
 from modelopt.torch.distill import KDLossConfig
 from modelopt.torch.distill.mode import _convert_for_kd
 from modelopt.torch.distill.plugins.huggingface import KDTrainer
+from modelopt.torch.export.unified_export_hf import export_hf_checkpoint
 from modelopt.torch.opt.conversion import restore_from_modelopt_state
 from modelopt.torch.opt.plugins import ModelOptHFTrainer
 from modelopt.torch.quantization.config import QuantizeConfig
@@ -217,6 +219,7 @@ def forward_loop(model):
         gc.collect()
 
         self._save_modelopt_state_with_weights()
+
         torch.cuda.empty_cache()
 
         if self.accelerator.is_main_process:
@@ -275,6 +278,25 @@ def save_model(self, *args, **kwargs):
         outputs = super().save_model(*args, **kwargs)
         return outputs
 
+    def _load_best_model(self, *args, **kwargs):
+        """Load the best model."""
+        is_lora = getattr(self.args, "lora", None)
+        if not is_lora:
+            super()._load_best_model(*args, **kwargs)
+        else:
+            # Custom logic for loading best model with LoRA
+            adapter_name = self.model.active_adapter()
+            self.model.delete_adapter(adapter_name)
+            self.model.load_adapter(self.state.best_model_checkpoint, adapter_name)
+
+    def export_base_model_hf_checkpoint(self):
+        """Export the base model to a HF checkpoint for deployment."""
+        # Save config.json
+        if self.accelerator.is_main_process:
+            with open(f"{self.args.output_dir}/config.json", "w") as f:
+                json.dump(self.model.config.to_dict(), f, indent=2)
+        export_hf_checkpoint(self.model, export_dir=f"{self.args.output_dir}/base_model")
+
     def _patch_accelerate_for_fsdp2_fix(self):
         """Fixes for accelerate prepare.
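
Put together, a compressed QLoRA run now leaves two deployable artifacts under the output directory. A hedged sketch of the resulting layout (only config.json and base_model/ are named in the diff; the adapter file names are PEFT's defaults and assumed here):

    output_dir/
        config.json               # base model config, written on the main process
        adapter_config.json       # LoRA adapter from trainer.save_model (assumed name)
        adapter_model.safetensors # adapter weights (assumed name)
        base_model/               # quantized base model via export_hf_checkpoint

The adapter remains loadable for further finetuning, while base_model/ is the plain HF checkpoint a deployment runtime would consume.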
