Commit 387ea68

refactored
Signed-off-by: Suguna Velury <[email protected]>
1 parent d23963b commit 387ea68

File tree

6 files changed (+157, -110 lines)

examples/llm_ptq/example_utils.py

Lines changed: 0 additions & 28 deletions

@@ -22,10 +22,8 @@
 import transformers
 from accelerate import infer_auto_device_map, init_empty_weights
 from accelerate.utils import get_max_memory
-from safetensors.torch import load_file
 from transformers import AutoConfig, AutoModelForCausalLM, AutoProcessor, AutoTokenizer

-from modelopt.torch.opt.conversion import restore_from_modelopt_state
 from modelopt.torch.utils.image_processor import MllamaImageProcessor

 SPECULATIVE_MODEL_LIST = ["Eagle", "Medusa"]
@@ -124,46 +122,20 @@ def get_dtype(dtype):
     return dtype


-def get_lora_model(
-    ckpt_path: str,
-    device_map="cuda",
-):
-    """
-    Loads a QLoRA model that has been trained using modelopt trainer.
-    """
-    # Load model with adapters
-    model = AutoModelForCausalLM.from_pretrained(ckpt_path, device_map=device_map)
-
-    # Restore modelopt state
-    modelopt_state = torch.load(f"{ckpt_path}/modelopt_state.pth", weights_only=False)
-    restore_from_modelopt_state(model, modelopt_state)
-
-    # Load compressed weights
-    state_dict = load_file(f"{ckpt_path}/model.safetensors")
-    model.load_state_dict(state_dict, strict=False)
-
-    return model
-
-
 def get_model(
     ckpt_path,
     device="cuda",
     gpu_mem_percentage=0.8,
     trust_remote_code=False,
     use_seq_device_map=False,
     attn_implementation=None,
-    is_modelopt_qlora=False,
 ):
     print(f"Initializing model from {ckpt_path}")

     device_map = "auto"
     if device == "cpu":
         device_map = "cpu"

-    if is_modelopt_qlora:
-        model = get_lora_model(ckpt_path, device_map)
-        return model
-
     config_kwargs = {"trust_remote_code": trust_remote_code} if trust_remote_code else {}
     if attn_implementation is not None:
         config_kwargs["attn_implementation"] = attn_implementation
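
For context, a minimal sketch of how a caller uses the trimmed-down get_model after this change; the QLoRA-specific loading now lives in the new examples/llm_qat/export.py. The checkpoint path below is a placeholder, and the import assumes the code runs from examples/llm_ptq/ where example_utils.py resides.

# Sketch only: get_model keeps its generic arguments; the removed
# is_modelopt_qlora flag is no longer passed.
from example_utils import get_model  # assumes examples/llm_ptq/ is on sys.path

model = get_model(
    "path/to/hf_checkpoint",  # placeholder checkpoint path
    device="cuda",
    gpu_mem_percentage=0.8,
    trust_remote_code=False,
)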

examples/llm_ptq/hf_ptq.py

Lines changed: 3 additions & 12 deletions

@@ -238,7 +238,6 @@ def main(args):
             trust_remote_code=args.trust_remote_code,
             use_seq_device_map=args.use_seq_device_map,
             attn_implementation=args.attn_implementation,
-            is_modelopt_qlora=args.qlora,
         )
     else:
         assert args.qformat in QUANT_CFG_CHOICES, (
@@ -345,9 +344,7 @@
         )
         mts.export(model)

-    if (
-        args.auto_quantize_bits or args.qformat in QUANT_CFG_CHOICES
-    ) and not model_is_already_quantized:
+    if args.auto_quantize_bits or args.qformat in QUANT_CFG_CHOICES:
         if "awq" in args.qformat:
             print(
                 "\n####\nAWQ calibration could take longer than other calibration methods. "
@@ -474,7 +471,7 @@
                 "Please set the default input_mode to InputMode.LANGUAGE before quantizing."
             )

-        if calibration_only:
+        if not model_is_already_quantized and calibration_only:
             # Only run single sample for preview
             input_ids = next(iter(calib_dataloader))[
                 "input_features" if model_type == "whisper" else "input_ids"
@@ -548,12 +545,7 @@ def output_decode(generated_ids, input_shape):

     else:
         assert model_type != "dbrx", f"Does not support export {model_type} without quantizaton"
-        if model_is_already_quantized:
-            warnings.warn(
-                "Skipping quantization: Model is already quantized. Exporting the model..."
-            )
-        else:
-            print(f"qformat: {args.qformat}. No quantization applied, export {device} model")
+        print(f"qformat: {args.qformat}. No quantization applied, export {device} model")

     with torch.inference_mode():
         if model_type is None:
@@ -626,7 +618,6 @@ def output_decode(generated_ids, input_shape):
             export_hf_checkpoint(
                 full_model,
                 export_dir=export_path,
-                is_modelopt_qlora=args.qlora,
             )

     # Restore default padding and export the tokenizer as well.
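
Read together, these hf_ptq.py hunks change where an already-quantized checkpoint is handled: quantization setup still runs whenever a supported format is requested, but calibration is now skipped for such checkpoints, and the separate "already quantized" warning branch in the export path is gone. A condensed sketch of the resulting gating follows (variable names mirror the diff; this is not the literal script code).

# Condensed sketch only; mirrors the conditions visible in the hunks above.
def gate(args, quant_cfg_choices, model_is_already_quantized, calibration_only):
    if args.auto_quantize_bits or args.qformat in quant_cfg_choices:
        # Quantization config is prepared for any requested format ...
        run_calibration = not model_is_already_quantized and calibration_only
        return "quantize", run_calibration
    # ... otherwise the model is exported as-is, with no special-case warning.
    return "export_only", False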

examples/llm_qat/export.py

Lines changed: 119 additions & 0 deletions

@@ -0,0 +1,119 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import json
+import warnings
+from pathlib import Path
+
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from modelopt.torch.export.convert_hf_config import convert_hf_quant_config_format
+from modelopt.torch.export.unified_export_hf import _export_hf_checkpoint
+from modelopt.torch.opt.conversion import restore_from_modelopt_state
+from modelopt.torch.quantization.utils import set_quantizer_state_dict
+
+RAND_SEED = 1234
+
+
+def get_lora_model(
+    ckpt_path: str,
+    device="cuda",
+):
+    """
+    Loads a QLoRA model that has been trained using modelopt trainer.
+    """
+    device_map = "auto"
+    if device == "cpu":
+        device_map = "cpu"
+
+    # Load model with adapters
+    model = AutoModelForCausalLM.from_pretrained(ckpt_path, device_map=device_map)
+
+    # Restore modelopt state
+    modelopt_state = torch.load(f"{ckpt_path}/modelopt_state_calibration.pth", weights_only=False)
+    restore_from_modelopt_state(model, modelopt_state)
+
+    # Restore modelopt quantizer state dict
+    modelopt_weights = modelopt_state.pop("modelopt_state_weights", None)
+    if modelopt_weights is not None:
+        print("Restoring modelopt weights")
+        set_quantizer_state_dict(model, modelopt_weights)
+
+    return model
+
+
+def main(args):
+    # Load model
+    model = get_lora_model(args.pyt_ckpt_path, args.device)
+    tokenizer = AutoTokenizer.from_pretrained(args.pyt_ckpt_path)
+
+    # Export HF checkpoint
+    export_dir = Path(args.export_path)
+    export_dir.mkdir(parents=True, exist_ok=True)
+    base_model_dir = export_dir / "base_model"
+    base_model_dir.mkdir(parents=True, exist_ok=True)
+
+    try:
+        post_state_dict, hf_quant_config = _export_hf_checkpoint(model, is_lora=True)
+
+        with open(f"{export_dir}/base_model/hf_quant_config.json", "w") as file:
+            json.dump(hf_quant_config, file, indent=4)
+
+        hf_quant_config = convert_hf_quant_config_format(hf_quant_config)
+
+        # Save base model
+        model.base_model.save_pretrained(f"{export_dir}/base_model", state_dict=post_state_dict)
+        # Save adapters
+        model.save_pretrained(export_dir)
+
+        config_path = f"{export_dir}/base_model/config.json"
+
+        # In the case of LoRA model.save_pretrained does not save the correct config.json
+        config_data = model.config.to_dict()
+        print(config_data)
+
+        config_data["quantization_config"] = hf_quant_config
+
+        with open(config_path, "w") as file:
+            json.dump(config_data, file, indent=4)
+
+        # Save tokenizer
+        tokenizer.save_pretrained(export_dir)
+
+    except Exception as e:
+        warnings.warn(
+            "Cannot export model to the model_config. The modelopt-optimized model state_dict"
+            " can be saved with torch.save for further inspection."
+        )
+        raise e
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--pyt_ckpt_path",
+        help="Specify where the PyTorch checkpoint path is",
+        required=True,
+    )
+
+    parser.add_argument("--device", default="cuda")
+
+    parser.add_argument("--export_path", default="exported_model")
+
+    args = parser.parse_args()
+
+    main(args)
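
A hypothetical way to drive the new exporter programmatically, mirroring the argparse flags defined at the bottom of the script; the checkpoint and output paths are placeholders, and the `from export import main` line assumes the interpreter runs from examples/llm_qat/.

# Hypothetical invocation; pyt_ckpt_path must point at a modelopt QLoRA checkpoint
# that contains modelopt_state_calibration.pth alongside the adapter weights.
from argparse import Namespace

from export import main  # examples/llm_qat/export.py

main(
    Namespace(
        pyt_ckpt_path="path/to/qlora_checkpoint",  # placeholder
        device="cuda",
        export_path="exported_model",
    )
)

Per the script, the quantized base model, its config.json (with quantization_config), and hf_quant_config.json land under <export_path>/base_model/, while the LoRA adapters and tokenizer are saved under <export_path>/ itself.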

modelopt/torch/export/quant_utils.py

Lines changed: 11 additions & 28 deletions

@@ -269,28 +269,18 @@ def get_weight_scaling_factor(module: nn.Module, weight_name: str = "weight") ->
         QUANTIZATION_NVFP4_AWQ,
         QUANTIZATION_W4A8_NVFP4_FP8,
     ]:
-        # If scale is already registered, indicates weights are already compressed.
-        # We convert to modelopt scale if necessary and return
-        if hasattr(weight_quantizer, "_scale"):
-            return NVFP4QTensor.get_modelopt_weights_scaling_factor(
-                weight_quantizer._scale, weight.metadata["shape"]
-            )
-        else:
-            return NVFP4QTensor.get_weights_scaling_factor(
-                weight,
-                weight_quantizer.block_sizes[-1],
-                NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer(weight_quantizer).to(
-                    weight.device
-                ),
-            )[0]
+        return NVFP4QTensor.get_weights_scaling_factor(
+            weight,
+            weight_quantizer.block_sizes[-1],
+            NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer(weight_quantizer).to(
+                weight.device
+            ),
+        )[0]

     if quantization_format in [QUANTIZATION_W4A8_MXFP4_FP8, QUANTIZATION_MXFP4]:
-        if hasattr(weight_quantizer, "_scale"):
-            return weight_quantizer._scale.reshape(*weight.shape[:-1], -1)
-        else:
-            return MXFP4QTensor.quantize(weight, block_size=weight_quantizer.block_sizes[-1])[
-                1
-            ].reshape(*weight.shape[:-1], -1)
+        return MXFP4QTensor.quantize(weight, block_size=weight_quantizer.block_sizes[-1])[
+            1
+        ].reshape(*weight.shape[:-1], -1)
     return get_scaling_factor(weight_quantizer)


@@ -306,10 +296,7 @@ def get_weight_scaling_factor_2(module: nn.Module, weight_name: str = "weight")
         QUANTIZATION_NVFP4_AWQ,
         QUANTIZATION_W4A8_NVFP4_FP8,
     ]:
-        if hasattr(weight_quantizer, "_double_scale"):
-            return weight_quantizer._double_scale
-        else:
-            return NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer(weight_quantizer)
+        return NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer(weight_quantizer)

     # SequentialQuantizer is required
     if not isinstance(weight_quantizer, SequentialQuantizer) or not weight_quantizer[-1].is_enabled:
@@ -740,7 +727,6 @@ def to_quantized_weight(
     quantization: str,
     weights_scaling_factor2: torch.Tensor | None = None,
     block_size: int | None = None,
-    dtype: torch.dtype | None = None,
 ):
     """Converts the weight to the quantized (packed) format."""
     if weights_scaling_factor is not None:
@@ -753,9 +739,6 @@
     if isinstance(weight, QTensorWrapper):
         return weight.data

-    if dtype:
-        weight = weight.to(dtype)
-
     if quantization == QUANTIZATION_FP8:
         # Fix RuntimeError: Promotion for Float8 Types is not supported, attempted to promote Float8_e4m3fn and Float
         # in speculative decoding fp8 model export
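
The quant_utils.py change always recomputes NVFP4/MXFP4 per-block scales from the unquantized weight instead of reusing a pre-registered _scale buffer. As a rough illustration of what a per-block weight scale is, a generic sketch follows; this is not the modelopt API, and the FP4 (E2M1) maximum of 6.0 is an assumption about the target format.

import torch

# Generic illustration, not NVFP4QTensor.get_weights_scaling_factor: one scale per
# contiguous block of `block_size` weights along the last dimension, derived from
# the block's absolute maximum. Assumes the last dim is divisible by block_size.
def per_block_amax_scale(weight: torch.Tensor, block_size: int, fmt_max: float = 6.0) -> torch.Tensor:
    blocks = weight.reshape(*weight.shape[:-1], -1, block_size)
    return blocks.abs().amax(dim=-1) / fmt_max

# Example: a [128, 256] weight with block_size=16 yields a [128, 16] scale tensor.
scales = per_block_amax_scale(torch.randn(128, 256), block_size=16)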
