
Commit 7fb53bb

minor update
Signed-off-by: Suguna Velury <[email protected]>
1 parent 387ea68 commit 7fb53bb


6 files changed (+23 −43 lines)


examples/llm_ptq/hf_ptq.py

Lines changed: 1 addition & 7 deletions
@@ -471,7 +471,7 @@ def main(args):
                 "Please set the default input_mode to InputMode.LANGUAGE before quantizing."
             )
 
-    if not model_is_already_quantized and calibration_only:
+    if not model_is_already_quantized or calibration_only:
         # Only run single sample for preview
         input_ids = next(iter(calib_dataloader))[
             "input_features" if model_type == "whisper" else "input_ids"
@@ -755,12 +755,6 @@ def output_decode(generated_ids, input_shape):
         default=None,
         type=str,
     )
-    parser.add_argument(
-        "--qlora",
-        help="Specify the model to be exported is a QLoRA model trained using modelopt.",
-        default=False,
-        action="store_true",
-    )
 
     args = parser.parse_args()

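The flipped condition in the first hunk broadens when the single-sample preview runs: it now also triggers for a model that is already quantized, as long as calibration_only is requested. An illustrative sketch of the boolean difference (the values are made up for demonstration):

```python
# Illustrative only; mirrors the condition change in hf_ptq.py above.
model_is_already_quantized, calibration_only = True, True

old_condition = not model_is_already_quantized and calibration_only  # False: preview skipped
new_condition = not model_is_already_quantized or calibration_only   # True: preview runs
print(old_condition, new_condition)
```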
examples/llm_qat/README.md

Lines changed: 1 addition & 5 deletions
@@ -357,13 +357,9 @@ To perform QLoRA training, run:
 After performing QLoRA training the final checkpoint can be exported for deployment with vLLM using the following command.
 
 ```sh
-cd ../llm_ptq
-
-python hf_ptq.py \
+python export.py \
     --pyt_ckpt_path llama3-fp4-qlora \
-    --qformat nvfp4 \
     --export_dir llama3-fp4-qlora-hf \
-    --qlora
 
 ```

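For reference, a minimal sketch of loading the exported directory with vLLM as a quick smoke test; the model path comes from --export_dir above, while the prompt and sampling settings are illustrative assumptions:

```python
# Minimal sketch, assuming the exported checkpoint directory is directly loadable by vLLM.
from vllm import LLM, SamplingParams

llm = LLM(model="llama3-fp4-qlora-hf")  # directory produced by export.py --export_dir above
params = SamplingParams(temperature=0.0, max_tokens=64)
outputs = llm.generate(["What does QLoRA training do?"], params)
print(outputs[0].outputs[0].text)
```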
examples/llm_qat/export.py

Lines changed: 1 addition & 1 deletion
@@ -68,7 +68,7 @@ def main(args):
     base_model_dir.mkdir(parents=True, exist_ok=True)
 
     try:
-        post_state_dict, hf_quant_config = _export_hf_checkpoint(model, is_lora=True)
+        post_state_dict, hf_quant_config = _export_hf_checkpoint(model, is_modelopt_qlora=True)
 
         with open(f"{export_dir}/base_model/hf_quant_config.json", "w") as file:
             json.dump(hf_quant_config, file, indent=4)

modelopt/torch/export/unified_export_hf.py

Lines changed: 3 additions & 3 deletions
@@ -334,7 +334,7 @@ def _export_quantized_weight(
 
 
 def _export_hf_checkpoint(
-    model: nn.Module, dtype: torch.dtype | None = None, is_lora: bool = False
+    model: nn.Module, dtype: torch.dtype | None = None, is_modelopt_qlora: bool = False
 ) -> tuple[dict[str, Any], dict[str, Any]]:
     """Exports the torch model to the packed checkpoint with original HF naming.
 
@@ -427,7 +427,7 @@ def _export_hf_checkpoint(
     # Resmooth and requantize fused layers
     # TODO: Handle mixed precision
     # TODO: Support requantize and resmooth for modelopt-trained LoRA models
-    if not is_lora:
+    if not is_modelopt_qlora:
         requantize_resmooth_fused_llm_layers(model)
 
     # Remove all hooks from the model
@@ -487,7 +487,7 @@ def _export_hf_checkpoint(
     quantized_state_dict = model.state_dict()
 
     quantized_state_dict = postprocess_state_dict(
-        quantized_state_dict, kv_cache_max_bound, kv_cache_format, is_lora
+        quantized_state_dict, kv_cache_max_bound, kv_cache_format, is_modelopt_qlora
     )
 
     # Check if any layers are quantized

modelopt/torch/quantization/plugins/transformers_trainer.py

Lines changed: 3 additions & 3 deletions
@@ -209,13 +209,13 @@ def forward_loop(model):
         print_rank_0("Quantizing the model...")
         mtq.quantize(self.model, self.quant_cfg, forward_loop)  # type: ignore [arg-type]
 
-        # Save modelopt state before compression
+        # Save modelopt state before compression. This is used to later export the model for deployment.
         modelopt_state = mto.modelopt_state(self.model)
         modelopt_state["modelopt_state_weights"] = get_quantizer_state_dict(self.model)
-        torch.save(modelopt_state, f"{self.args.output_dir}/modelopt_state_calibration.pth")
+        torch.save(modelopt_state, f"{self.args.output_dir}/modelopt_state_calib.pth")
 
         print_rank_0(
-            f"Saved modelopt state before compression to {f'{self.args.output_dir}/modelopt_state_calibration.pth'}"
+            f"Saved modelopt state before compression to {f'{self.args.output_dir}/modelopt_state_calib.pth'}"
         )
 
         if getattr(self.quant_args, "compress", False):

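The renamed checkpoint (modelopt_state_calib.pth) captures the quantizer state before weight compression so the model can be reconstructed at export time. A minimal sketch of the restore side, assuming modelopt.torch.opt.restore_from_modelopt_state as the counterpart API and a placeholder "output_dir" path:

```python
# Minimal sketch (assumptions: restore_from_modelopt_state is the matching restore API,
# "output_dir" stands in for the trainer's args.output_dir, and `model` is the freshly
# loaded base HF model).
import torch
import modelopt.torch.opt as mto

state = torch.load("output_dir/modelopt_state_calib.pth", weights_only=False)
quantizer_weights = state.pop("modelopt_state_weights", None)  # saved via get_quantizer_state_dict
model = mto.restore_from_modelopt_state(model, state)  # re-attaches quantizer/calibration metadata
```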
modelopt/torch/quantization/qtensor/nvfp4_tensor.py

Lines changed: 14 additions & 24 deletions
@@ -94,27 +94,6 @@ def get_weights_scaling_factor_2(cls, input: torch.Tensor):
         """Returns per tensor weight scaling factor."""
         return reduce_amax(input).float() / (6.0 * 448.0)
 
-    @classmethod
-    def get_modelopt_weights_scaling_factor(cls, weight_scaling_factor: torch.Tensor, weight_shape):
-        """Returns the modelopt weights scaling factor if the quantization is done by trtllm."""
-        if weight_scaling_factor.dtype == torch.float8_e4m3fn:
-            return weight_scaling_factor
-
-        if weight_scaling_factor.dtype == torch.uint8 and weight_scaling_factor.ndim == 1:
-            # If quantization is done by trtllm, convert cutlass fp4 scale to modelopt fp4 scale
-            try:
-                from tensorrt_llm._torch.auto_deploy.utils.quantization_utils import (
-                    cutlass_fp4_scale_to_modelopt_fp4_scale,
-                )
-
-                return cutlass_fp4_scale_to_modelopt_fp4_scale(
-                    weight_scaling_factor, weight_shape[-2:]
-                )
-            except ImportError as e:
-                raise ImportError(
-                    "This tensor is quantized by trtllm, but tensorrt_llm cannot be imported."
-                ) from e
-
     @classmethod
     def get_activation_scaling_factor(cls, quantizer):
         """Returns the activation scaling factor for export."""
@@ -270,9 +249,20 @@ def _unpack_tensor(input: torch.Tensor):
             return unpacked.reshape(unpacked_shape)
 
         # Get scales from kwargs
-        kwarg["scale"] = self.get_modelopt_weights_scaling_factor(
-            kwarg["scale"], self.metadata["shape"]
-        )
+        if kwarg["scale"].dtype == torch.uint8 and kwarg["scale"].ndim == 1:
+            # If quantization is done by trtllm, convert cutlass fp4 scale to modelopt fp4 scale
+            try:
+                from tensorrt_llm._torch.auto_deploy.utils.quantization_utils import (
+                    cutlass_fp4_scale_to_modelopt_fp4_scale,
+                )
+
+                kwarg["scale"] = cutlass_fp4_scale_to_modelopt_fp4_scale(
+                    kwarg["scale"], self.metadata["shape"][-2:]
+                )
+            except ImportError as e:
+                raise ImportError(
+                    "This tensor is quantized by trtllm, but tensorrt_llm cannot be imported."
+                ) from e
 
         if fast:
             from ..triton.fp4_kernel import fp4_dequantize

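The inlined branch above distinguishes the two scale layouts by dtype and rank: modelopt stores fp4 block scales as float8_e4m3fn, while trtllm/cutlass hands back a flat uint8 buffer that still needs conversion. A small sketch of that check; the helper name is hypothetical and the real conversion stays inside tensorrt_llm's cutlass_fp4_scale_to_modelopt_fp4_scale:

```python
# Illustrative helper (hypothetical name); it only restates the dtype/ndim check from the diff.
import torch

def needs_cutlass_scale_conversion(scale: torch.Tensor) -> bool:
    """True if the scale looks like a trtllm/cutlass fp4 scale (flat uint8) rather than modelopt's."""
    if scale.dtype == torch.float8_e4m3fn:
        return False  # already a modelopt-style fp4 scale
    return scale.dtype == torch.uint8 and scale.ndim == 1
```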