
Commit 40202eb

e2e checkpoint tested for nvfp4 and fp8
Signed-off-by: Suguna Velury <[email protected]>
1 parent 7310346 commit 40202eb

File tree

6 files changed: +83 -73 lines changed

examples/llm_ptq/example_utils.py
examples/llm_ptq/hf_ptq.py
modelopt/torch/export/quant_utils.py
modelopt/torch/export/unified_export_hf.py
modelopt/torch/quantization/plugins/transformers_trainer.py
modelopt/torch/quantization/qtensor/nvfp4_tensor.py

examples/llm_ptq/example_utils.py

Lines changed: 6 additions & 5 deletions
@@ -118,15 +118,11 @@ def get_dtype(dtype):
 
 def get_lora_model(
     ckpt_path: str,
-    device="cuda",
+    device_map="cuda",
 ):
     """
     Loads a QLoRA model that has been trained using modelopt trainer.
     """
-    device_map = "auto"
-    if device == "cpu":
-        device_map = "cpu"
-
     # Load model with adapters
     model = AutoModelForCausalLM.from_pretrained(ckpt_path, device_map=device_map)
@@ -148,13 +144,18 @@ def get_model(
     trust_remote_code=False,
     use_seq_device_map=False,
     attn_implementation=None,
+    is_lora=False,
 ):
     print(f"Initializing model from {ckpt_path}")
 
     device_map = "auto"
     if device == "cpu":
         device_map = "cpu"
 
+    if is_lora:
+        model = get_lora_model(ckpt_path, device_map)
+        return model
+
     config_kwargs = {"trust_remote_code": trust_remote_code} if trust_remote_code else {}
     if attn_implementation is not None:
         config_kwargs["attn_implementation"] = attn_implementation
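Usage note (not part of the diff): with this change, callers select the QLoRA path through get_model instead of calling get_lora_model directly; is_lora=True forwards the device_map that get_model already resolved from the device argument. A minimal sketch, with a placeholder checkpoint path:

from example_utils import get_model

# Hypothetical checkpoint path for a modelopt-trained QLoRA model.
model = get_model(
    "/path/to/qlora_checkpoint",
    device="cuda",
    is_lora=True,  # delegates to get_lora_model(ckpt_path, device_map)
)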

examples/llm_ptq/hf_ptq.py

Lines changed: 17 additions & 19 deletions
@@ -241,20 +241,15 @@ def main(args):
     # If low memory mode is enabled, we compress the model while loading the HF checkpoint.
     calibration_only = False
     if not args.low_memory_mode:
-        if args.lora:
-            model = get_lora_model(
-                args.pyt_ckpt_path,
-                args.device,
-            )
-        else:
-            model = get_model(
-                args.pyt_ckpt_path,
-                args.device,
-                gpu_mem_percentage=args.gpu_max_mem_percentage,
-                trust_remote_code=args.trust_remote_code,
-                use_seq_device_map=args.use_seq_device_map,
-                attn_implementation=args.attn_implementation,
-            )
+        model = get_model(
+            args.pyt_ckpt_path,
+            args.device,
+            gpu_mem_percentage=args.gpu_max_mem_percentage,
+            trust_remote_code=args.trust_remote_code,
+            use_seq_device_map=args.use_seq_device_map,
+            attn_implementation=args.attn_implementation,
+            is_lora=args.lora,
+        )
     else:
         assert args.qformat in QUANT_CFG_CHOICES, (
             f"Quantization format is not supported for low memory mode. Supported formats: {QUANT_CFG_CHOICES.keys()}"
@@ -395,9 +390,6 @@ def main(args):
     sample_input_single_batch = None
 
     run_auto_quant = args.auto_quantize_bits is not None
-    print("DEBUG LOG: Entereing here")
-    for k, v in model.state_dict().items():
-        print(k, v.shape, v.dtype, v.device)
 
     args.batch_size = get_max_batch_size(
         model,
@@ -493,7 +485,7 @@ def main(args):
         quant_cfg["quant_cfg"]["*image*"] = {"enable": False}
         quant_cfg["quant_cfg"]["*vision*"] = {"enable": False}
 
-    if not model_is_already_quantized or calibration_only:
+    if calibration_only:
         # Only run single sample for preview
         input_ids = next(iter(calib_dataloader))[
             "input_features" if model_type == "whisper" else "input_ids"
@@ -567,7 +559,12 @@ def output_decode(generated_ids, input_shape):
 
     else:
         assert model_type != "dbrx", f"Does not support export {model_type} without quantizaton"
-        print(f"qformat: {args.qformat}. No quantization applied, export {device} model")
+        if model_is_already_quantized:
+            warnings.warn(
+                "Skipping quantization: Model is already quantized. Exporting the model..."
+            )
+        else:
+            print(f"qformat: {args.qformat}. No quantization applied, export {device} model")
 
     with torch.inference_mode():
         if model_type is None:
@@ -643,6 +640,7 @@ def output_decode(generated_ids, input_shape):
         export_hf_checkpoint(
             full_model,
             export_dir=export_path,
+            is_modelopt_trained_lora=args.lora,
         )
 
         # Copy custom model files (Python files and JSON configs) if trust_remote_code is used

modelopt/torch/export/quant_utils.py

Lines changed: 40 additions & 20 deletions
@@ -270,23 +270,28 @@ def get_weight_scaling_factor(module: nn.Module, weight_name: str = "weight") ->
         QUANTIZATION_NVFP4_AWQ,
         QUANTIZATION_W4A8_NVFP4_FP8,
     ]:
+        # If scale is already registered, indicates weights are already compressed.
+        # We convert to modelopt scale if necessary and return
         if hasattr(weight_quantizer, "_scale"):
             return NVFP4QTensor.get_modelopt_weights_scaling_factor(
                 weight_quantizer._scale, weight.metadata["shape"]
             )
-
-        return NVFP4QTensor.get_weights_scaling_factor(
-            weight,
-            weight_quantizer.block_sizes[-1],
-            NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer(weight_quantizer).to(
-                weight.device
-            ),
-        )[0]
+        else:
+            return NVFP4QTensor.get_weights_scaling_factor(
+                weight,
+                weight_quantizer.block_sizes[-1],
+                NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer(weight_quantizer).to(
+                    weight.device
+                ),
+            )[0]
 
     if quantization_format in [QUANTIZATION_W4A8_MXFP4_FP8, QUANTIZATION_MXFP4]:
-        return MXFP4QTensor.quantize(weight, block_size=weight_quantizer.block_sizes[-1])[
-            1
-        ].reshape(*weight.shape[:-1], -1)
+        if hasattr(weight_quantizer, "_scale"):
+            return weight_quantizer._scale.reshape(*weight.shape[:-1], -1)
+        else:
+            return MXFP4QTensor.quantize(weight, block_size=weight_quantizer.block_sizes[-1])[
+                1
+            ].reshape(*weight.shape[:-1], -1)
     return get_scaling_factor(weight_quantizer)
@@ -302,7 +307,10 @@ def get_weight_scaling_factor_2(module: nn.Module, weight_name: str = "weight")
         QUANTIZATION_NVFP4_AWQ,
         QUANTIZATION_W4A8_NVFP4_FP8,
     ]:
-        return NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer(weight_quantizer)
+        if hasattr(weight_quantizer, "_double_scale"):
+            return weight_quantizer._double_scale
+        else:
+            return NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer(weight_quantizer)
 
     # SequentialQuantizer is required
     if not isinstance(weight_quantizer, SequentialQuantizer) or not weight_quantizer[-1].is_enabled:
@@ -824,7 +832,12 @@ def from_quantized_weight(
     raise NotImplementedError(f"quantization format {quantization} not supported")
 
 
-def postprocess_state_dict(state_dict: dict, maxbound: float, quantization: str | None) -> dict:
+def postprocess_state_dict(
+    state_dict: dict,
+    maxbound: float,
+    quantization: str | None,
+    is_modelopt_trained_lora: bool = False,
+) -> dict:
     """Filters out keys related to weight quantizers and updates KV cache related keys.
 
     Args:
@@ -841,11 +854,18 @@ def postprocess_state_dict(state_dict: dict, maxbound: float, quantization: str
         "k_bmm_quantizer._bias_value": "k_proj.k_bias",
         "v_bmm_quantizer._bias_value": "v_proj.v_bias",
         "input_quantizer._pre_quant_scale": "pre_quant_scale",
-        "base_layer.weight": "weight",
-        "base_layer.input_scale": "input_scale",
-        "base_layer.weight_scale": "weight_scale",
     }
 
+    # For modelopt-trained LoRA models, we need to remove the base_layer prefix from the keys for deployment
+    if is_modelopt_trained_lora:
+        replacements.update(
+            {
+                "base_layer.weight": "weight",
+                "base_layer.input_scale": "input_scale",
+                "base_layer.weight_scale": "weight_scale",
+            }
+        )
+
     post_state_dict = {}
 
     for key, value in state_dict.items():
@@ -908,10 +928,10 @@ def postprocess_state_dict(state_dict: dict, maxbound: float, quantization: str
             keys_to_delete.append(key)
 
     # remove LoRA adapters from state dict
-    for key, value in post_state_dict.items():
-        if "lora" in key and key not in keys_to_delete:
-            keys_to_delete.append(key)
-
+    if is_modelopt_trained_lora:
+        for key, value in post_state_dict.items():
+            if "lora" in key and key not in keys_to_delete:
+                keys_to_delete.append(key)
     # Check for tied weights and remove duplicates
     seen_tensors = {}

modelopt/torch/export/unified_export_hf.py

Lines changed: 14 additions & 11 deletions
@@ -85,9 +85,6 @@ def _is_enabled_quantizer(quantizer):
 
 def requantize_resmooth_fused_llm_layers(model: torch.nn.Module):
     """Group modules that take the same input and register shared parameters in module."""
-    # Skip for LoRA finetuned models
-    if hasattr(model, "base_model"):
-        return
     # TODO: Handle DBRX MoE
     input_to_linear = defaultdict(list)
     output_to_layernorm = defaultdict(None)
@@ -343,7 +340,7 @@ def _export_quantized_weight(
 
 
 def _export_hf_checkpoint(
-    model: nn.Module, dtype: torch.dtype | None = None
+    model: nn.Module, dtype: torch.dtype | None = None, is_modelopt_trained_lora: bool = False
 ) -> tuple[dict[str, Any], dict[str, Any]]:
     """Exports the torch model to the packed checkpoint with original HF naming.
 
@@ -435,7 +432,9 @@ def _export_hf_checkpoint(
 
     # Resmooth and requantize fused layers
     # TODO: Handle mixed precision
-    requantize_resmooth_fused_llm_layers(model)
+    # TODO: Support requantize and resmooth for modelopt-trained LoRA models
+    if not is_modelopt_trained_lora:
+        requantize_resmooth_fused_llm_layers(model)
 
     # Remove all hooks from the model
     try:
@@ -494,7 +493,7 @@ def _export_hf_checkpoint(
     quantized_state_dict = model.state_dict()
 
     quantized_state_dict = postprocess_state_dict(
-        quantized_state_dict, kv_cache_max_bound, kv_cache_format
+        quantized_state_dict, kv_cache_max_bound, kv_cache_format, is_modelopt_trained_lora
     )
 
     # Check if any layers are quantized
@@ -509,6 +508,7 @@ def export_hf_checkpoint(
     dtype: torch.dtype | None = None,
     export_dir: Path | str = tempfile.gettempdir(),
     save_modelopt_state: bool = False,
+    is_modelopt_trained_lora: bool = False,
 ):
     """Exports the torch model to unified checkpoint and saves to export_dir.
 
@@ -518,15 +518,18 @@
         export_dir: the target export path.
         save_modelopt_state: whether to save the modelopt state_dict.
     """
-    is_lora = hasattr(model, "base_model")
-    base_export_dir: Path | str = f"{export_dir}/base_model" if is_lora else export_dir
+    base_export_dir: Path | str = (
+        f"{export_dir}/base_model" if is_modelopt_trained_lora else export_dir
+    )
     export_dir = Path(export_dir)
     export_dir.mkdir(parents=True, exist_ok=True)
     base_export_dir = Path(base_export_dir)
     base_export_dir.mkdir(parents=True, exist_ok=True)
 
     try:
-        post_state_dict, hf_quant_config = _export_hf_checkpoint(model, dtype)
+        post_state_dict, hf_quant_config = _export_hf_checkpoint(
+            model, dtype, is_modelopt_trained_lora
+        )
 
         # NOTE: (hg) Should we save hf_quant_config when there's no quantization applied?
         # Save hf_quant_config.json for backward compatibility
@@ -538,11 +541,11 @@ def export_hf_checkpoint(
         post_state_dict = rename_and_prune_if_spec_decoding(model, post_state_dict)
 
         # In the case of LoRA model, we save the base model
-        if is_lora:
+        if is_modelopt_trained_lora:
            model.base_model.save_pretrained(
                base_export_dir, state_dict=post_state_dict, save_modelopt_state=save_modelopt_state
            )
-            model.save_pretrained(export_dir, save_modelopt_state=save_modelopt_state)
+            model.save_pretrained(export_dir)
         else:
            model.save_pretrained(
                export_dir, state_dict=post_state_dict, save_modelopt_state=save_modelopt_state
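Usage note (not part of the diff): export_hf_checkpoint no longer infers the LoRA case from hasattr(model, "base_model"); the caller states it explicitly. A minimal sketch, assuming the public import path and with model being the quantized modelopt-trained LoRA model produced earlier (e.g. by hf_ptq.py):

from modelopt.torch.export import export_hf_checkpoint  # assumed import path

# The quantized base model is written to <export_dir>/base_model,
# the adapter checkpoint to <export_dir> itself.
export_hf_checkpoint(
    model,
    export_dir="exported_qlora",
    is_modelopt_trained_lora=True,
)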

modelopt/torch/quantization/plugins/transformers_trainer.py

Lines changed: 3 additions & 4 deletions
@@ -185,7 +185,6 @@ def _save_modelopt_state_with_weights(self):
         # Save base model compressed weights for QLoRA
         if getattr(self.quant_args, "compress", False):
             # Save base model config.json
-            # weight_quantizer = self.quant_cfg["quant_cfg"]["*weight_quantizer"]
             self.model.config.save_pretrained(self.args.output_dir)
 
             # Save base model compressed weights excluding lora weights
@@ -292,14 +291,14 @@ def save_model(self, *args, **kwargs):
     def _load_best_model(self, *args, **kwargs):
         """Load the best model for final evaluation."""
         is_lora = getattr(self.args, "lora", None)
-        if not is_lora:
-            super()._load_best_model(*args, **kwargs)
-        else:
+        if is_lora and not self.is_fsdp_enabled:
             # Custom logic for loading best model with LoRA
             # TODO: Remove once we migrate to using get_peft_model()
             adapter_name = self.model.active_adapter()
             self.model.delete_adapter(adapter_name)
             self.model.load_adapter(self.state.best_model_checkpoint, adapter_name)
+        else:
+            super()._load_best_model(*args, **kwargs)
 
     def _patch_accelerate_for_fsdp2_fix(self):
         """Fixes for accelerate prepare.

modelopt/torch/quantization/qtensor/nvfp4_tensor.py

Lines changed: 3 additions & 14 deletions
@@ -270,20 +270,9 @@ def _unpack_tensor(input: torch.Tensor):
             return unpacked.reshape(unpacked_shape)
 
         # Get scales from kwargs
-        if kwarg["scale"].dtype == torch.uint8 and kwarg["scale"].ndim == 1:
-            # If quantization is done by trtllm, convert cutlass fp4 scale to modelopt fp4 scale
-            try:
-                from tensorrt_llm._torch.auto_deploy.utils.quantization_utils import (
-                    cutlass_fp4_scale_to_modelopt_fp4_scale,
-                )
-
-                kwarg["scale"] = cutlass_fp4_scale_to_modelopt_fp4_scale(
-                    kwarg["scale"], self.metadata["shape"][-2:]
-                )
-            except ImportError as e:
-                raise ImportError(
-                    "This tensor is quantized by trtllm, but tensorrt_llm cannot be imported."
-                ) from e
+        kwarg["scale"] = self.get_modelopt_weights_scaling_factor(
+            kwarg["scale"], self.metadata["shape"]
+        )
 
         if fast:
             from ..triton.fp4_kernel import fp4_dequantize
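Background (not part of the diff): the removed branch detected a cutlass-layout FP4 scale by dtype and rank before converting it; that conversion is now expected to happen inside get_modelopt_weights_scaling_factor. For reference, a minimal sketch of the removed detection check only, with a hypothetical helper name:

import torch

def looks_like_cutlass_fp4_scale(scale: torch.Tensor) -> bool:
    # Cutlass-style FP4 scales arrive as a flat uint8 buffer, while
    # modelopt-style scales are already shaped per weight block.
    return scale.dtype == torch.uint8 and scale.ndim == 1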
