Commit d23963b

cleanup

Signed-off-by: Suguna Velury <[email protected]>

1 parent d00a7f6 commit d23963b

5 files changed: +47 -36 lines

examples/llm_ptq/example_utils.py (2 additions, 2 deletions)

@@ -152,15 +152,15 @@ def get_model(
     trust_remote_code=False,
     use_seq_device_map=False,
     attn_implementation=None,
-    is_lora=False,
+    is_modelopt_qlora=False,
 ):
     print(f"Initializing model from {ckpt_path}")

     device_map = "auto"
     if device == "cpu":
         device_map = "cpu"

-    if is_lora:
+    if is_modelopt_qlora:
         model = get_lora_model(ckpt_path, device_map)
         return model
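
For context, the rename only changes how callers opt into the QLoRA loading path. A minimal sketch of a call site follows; the import path and the illustrative checkpoint path are assumptions, while `ckpt_path`, `device`, and `is_modelopt_qlora` are the parameter names visible in the hunk above:

```python
# Hypothetical call site for the renamed flag: route a ModelOpt QLoRA
# checkpoint through get_lora_model() instead of the standard HF loading path.
from example_utils import get_model  # assumed local import within examples/llm_ptq

model = get_model(
    ckpt_path="llama3-fp4-qlora",  # illustrative QLoRA checkpoint directory
    device="cuda",                 # anything other than "cpu" keeps device_map="auto"
    is_modelopt_qlora=True,        # previously `is_lora=True`
)
```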

examples/llm_ptq/hf_ptq.py (4 additions, 4 deletions)

@@ -238,7 +238,7 @@ def main(args):
             trust_remote_code=args.trust_remote_code,
             use_seq_device_map=args.use_seq_device_map,
             attn_implementation=args.attn_implementation,
-            is_lora=args.lora,
+            is_modelopt_qlora=args.qlora,
         )
     else:
         assert args.qformat in QUANT_CFG_CHOICES, (

@@ -626,7 +626,7 @@ def output_decode(generated_ids, input_shape):
         export_hf_checkpoint(
             full_model,
             export_dir=export_path,
-            is_modelopt_trained_lora=args.lora,
+            is_modelopt_qlora=args.qlora,
         )

         # Restore default padding and export the tokenizer as well.

@@ -765,8 +765,8 @@ def output_decode(generated_ids, input_shape):
         type=str,
     )
     parser.add_argument(
-        "--lora",
-        help="Specify the model to be exported is a LoRA model trained using modelopt.",
+        "--qlora",
+        help="Specify the model to be exported is a QLoRA model trained using modelopt.",
         default=False,
         action="store_true",
     )

examples/llm_qat/README.md (15 additions, 2 deletions)

@@ -354,10 +354,23 @@ To perform QLoRA training, run:
     --lora True
 ```

-After performing QLoRA training the final checkpoint exported is ready for deployment using vLLM. For more details about QLoRA deployment using vLLM refer to the documentation [here](https://docs.vllm.ai/en/latest/features/lora.html). To deploy with vLLM, run:
+After performing QLoRA training the final checkpoint can be exported for deployment with vLLM using the following command.

 ```sh
-vllm serve llama3-fp4-qlora/base_model --enable-lora --lora-modules adapter=llama3-fp4-qlora --port 8000 --tokenizer llama3-fp4-qlora
+cd ../llm_ptq
+
+python hf_ptq.py \
+    --pyt_ckpt_path llama3-fp4-qlora \
+    --qformat nvfp4 \
+    --export_dir llama3-fp4-qlora-hf \
+    --qlora
+
+```
+
+To deploy with vLLM, run the following command. For more details about QLoRA deployment using vLLM refer to the documentation [here](https://docs.vllm.ai/en/latest/features/lora.html).
+
+```sh
+vllm serve llama3-fp4-qlora-hf/base_model --enable-lora --lora-modules adapter=llama3-fp4-qlora-hf --port 8000 --tokenizer llama3-fp4-qlora-hf
 ```

 ## Pre-Quantized Checkpoints
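
As a usage note beyond this diff, once the server from the `vllm serve` command above is running, the adapter registered under the name `adapter` can be queried through vLLM's OpenAI-compatible completions endpoint. A minimal sketch in Python; the prompt, token limit, and output handling are illustrative only:

```python
# Query the LoRA adapter registered as "adapter" by the `vllm serve` command
# above, via the OpenAI-compatible /v1/completions endpoint on port 8000.
import requests

response = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "model": "adapter",             # name assigned via --lora-modules adapter=...
        "prompt": "Hello, my name is",  # illustrative prompt
        "max_tokens": 32,
    },
    timeout=60,
)
print(response.json()["choices"][0]["text"])
```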

modelopt/torch/export/quant_utils.py (14 additions, 13 deletions)

@@ -830,7 +830,7 @@ def postprocess_state_dict(
     state_dict: dict,
     maxbound: float,
     quantization: str | None,
-    is_modelopt_trained_lora: bool = False,
+    is_modelopt_qlora: bool = False,
 ) -> dict:
     """Filters out keys related to weight quantizers and updates KV cache related keys.

@@ -849,28 +849,24 @@ def postprocess_state_dict(
         "v_bmm_quantizer._bias_value": "v_proj.v_bias",
         "input_quantizer._pre_quant_scale": "pre_quant_scale",
     }
+    skip_keys = ["output_quantizer", "_amax", "_bias_value", "input_quantizer._pre_quant_scale"]

     # For modelopt-trained LoRA models, we need to remove the base_layer prefix from the keys for deployment
-    if is_modelopt_trained_lora:
+    if is_modelopt_qlora:
         replacements.update(
             {
                 "base_layer.weight": "weight",
                 "base_layer.input_scale": "input_scale",
                 "base_layer.weight_scale": "weight_scale",
             }
         )
+        skip_keys.append("base_layer")

     post_state_dict = {}

     for key, value in state_dict.items():
         # Skip keys not related to quantizers
-        if (
-            "output_quantizer" not in key
-            and "_amax" not in key
-            and "_bias_value" not in key
-            and "input_quantizer._pre_quant_scale" not in key
-            and "base_layer" not in key
-        ):
+        if all(skip_key not in key for skip_key in skip_keys):
             post_state_dict[key] = value
             continue

@@ -922,8 +918,8 @@ def postprocess_state_dict(
             keys_to_delete.append(key)

     # remove LoRA adapters from state dict
-    if is_modelopt_trained_lora:
-        for key, value in post_state_dict.items():
+    if is_modelopt_qlora:
+        for key in post_state_dict:
             if "lora" in key and key not in keys_to_delete:
                 keys_to_delete.append(key)
     # Check for tied weights and remove duplicates

@@ -1104,10 +1100,15 @@ def get_quant_config(named_modules: nn.Module | dict[str, nn.Module]) -> dict[st
         if block_size == 0:
             block_size = get_weight_block_size(module)

-        # Construct per layer config dictionary
-        if block_size == 0 and quantization_format != QUANTIZATION_FP8:
+        # In the case of NVFP4, block_size 0 indicates weight_quantizer is not enabled
+        if block_size == 0 and quantization_format in [
+            QUANTIZATION_NVFP4,
+            QUANTIZATION_NVFP4_AWQ,
+            QUANTIZATION_W4A8_NVFP4_FP8,
+        ]:
             continue

+        # Construct per layer config dictionary
         layer_config_dict[name + ".quantization"] = quantization_format
         layer_config_dict[name + ".awq_block_size"] = block_size
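
To make the `postprocess_state_dict` refactor easier to follow: the consolidated `skip_keys` check is intended to be behaviorally equivalent to the old chained conditions, with `base_layer` added only on the QLoRA path. A standalone, runnable sketch with a toy state dict (the key names are illustrative, not taken from a real checkpoint):

```python
# Standalone sketch of the consolidated filter: a key passes through untouched
# only if it contains none of the skip substrings; otherwise it falls through
# to the rename/post-processing logic. With is_modelopt_qlora=True the
# "base_layer" keys are also routed to post-processing.
skip_keys = ["output_quantizer", "_amax", "_bias_value", "input_quantizer._pre_quant_scale"]
is_modelopt_qlora = True
if is_modelopt_qlora:
    skip_keys.append("base_layer")

toy_state_dict = {
    "layers.0.q_proj.weight": "kept as-is",
    "layers.0.q_proj.weight_quantizer._amax": "post-processed",
    "layers.0.q_proj.base_layer.weight": "post-processed (QLoRA rename)",
}

for key in toy_state_dict:
    if all(skip_key not in key for skip_key in skip_keys):
        print(f"pass through : {key}")
    else:
        print(f"post-process : {key}")
```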

modelopt/torch/export/unified_export_hf.py (12 additions, 15 deletions)

@@ -336,7 +336,7 @@ def _export_quantized_weight(


 def _export_hf_checkpoint(
-    model: nn.Module, dtype: torch.dtype | None = None, is_modelopt_trained_lora: bool = False
+    model: nn.Module, dtype: torch.dtype | None = None, is_modelopt_qlora: bool = False
 ) -> tuple[dict[str, Any], dict[str, Any]]:
     """Exports the torch model to the packed checkpoint with original HF naming.

@@ -429,7 +429,7 @@ def _export_hf_checkpoint(
     # Resmooth and requantize fused layers
     # TODO: Handle mixed precision
     # TODO: Support requantize and resmooth for modelopt-trained LoRA models
-    if not is_modelopt_trained_lora:
+    if not is_modelopt_qlora:
         requantize_resmooth_fused_llm_layers(model)

     # Remove all hooks from the model

@@ -489,7 +489,7 @@ def _export_hf_checkpoint(
     quantized_state_dict = model.state_dict()

     quantized_state_dict = postprocess_state_dict(
-        quantized_state_dict, kv_cache_max_bound, kv_cache_format, is_modelopt_trained_lora
+        quantized_state_dict, kv_cache_max_bound, kv_cache_format, is_modelopt_qlora
     )

     # Check if any layers are quantized

@@ -504,7 +504,7 @@ def export_hf_checkpoint(
     dtype: torch.dtype | None = None,
     export_dir: Path | str = tempfile.gettempdir(),
     save_modelopt_state: bool = False,
-    is_modelopt_trained_lora: bool = False,
+    is_modelopt_qlora: bool = False,
 ):
     """Exports the torch model to unified checkpoint and saves to export_dir.

@@ -514,18 +514,15 @@ def export_hf_checkpoint(
         export_dir: the target export path.
         save_modelopt_state: whether to save the modelopt state_dict.
     """
-    base_export_dir: Path | str = (
-        f"{export_dir}/base_model" if is_modelopt_trained_lora else export_dir
-    )
+    # Setup directories
     export_dir = Path(export_dir)
-    export_dir.mkdir(parents=True, exist_ok=True)
-    base_export_dir = Path(base_export_dir)
-    base_export_dir.mkdir(parents=True, exist_ok=True)
+    base_export_dir = export_dir / "base_model" if is_modelopt_qlora else export_dir
+
+    for dir_path in [export_dir, base_export_dir]:
+        dir_path.mkdir(parents=True, exist_ok=True)

     try:
-        post_state_dict, hf_quant_config = _export_hf_checkpoint(
-            model, dtype, is_modelopt_trained_lora
-        )
+        post_state_dict, hf_quant_config = _export_hf_checkpoint(model, dtype, is_modelopt_qlora)

         # NOTE: (hg) Should we save hf_quant_config when there's no quantization applied?
         # Save hf_quant_config.json for backward compatibility

@@ -536,8 +533,8 @@ def export_hf_checkpoint(

         post_state_dict = rename_and_prune_if_spec_decoding(model, post_state_dict)

-        # In the case of LoRA model, we save the base model
-        if is_modelopt_trained_lora:
+        if is_modelopt_qlora:
+            # In the case of LoRA model, we save the base model and adapters
             model.base_model.save_pretrained(
                 base_export_dir, state_dict=post_state_dict, save_modelopt_state=save_modelopt_state
             )
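
For context on the refactored directory handling, a short runnable sketch of the layout the QLoRA export path produces. The path name simply mirrors the README example above, and the note about where the adapters land is inferred from the README's `vllm serve` command rather than from code shown in this hunk:

```python
# Sketch of the directory behaviour introduced above: calling
# export_hf_checkpoint(model, export_dir="llama3-fp4-qlora-hf", is_modelopt_qlora=True)
# writes the quantized base model under <export_dir>/base_model, while the LoRA
# adapters sit in <export_dir> itself -- the layout the README's `vllm serve`
# command points at.
from pathlib import Path

export_dir = Path("llama3-fp4-qlora-hf")  # illustrative path from the README example
is_modelopt_qlora = True

base_export_dir = export_dir / "base_model" if is_modelopt_qlora else export_dir
for dir_path in [export_dir, base_export_dir]:
    dir_path.mkdir(parents=True, exist_ok=True)

print(f"adapters   -> {export_dir}")
print(f"base model -> {base_export_dir}")
```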
