From 7e5b704a30787fa8a62304dbae7580aa8096d3ad Mon Sep 17 00:00:00 2001 From: Suguna Velury <178320438+sugunav14@users.noreply.github.com> Date: Mon, 22 Sep 2025 15:07:23 +0000 Subject: [PATCH 01/26] e2e example for qlora ddp export Signed-off-by: Suguna Velury <178320438+sugunav14@users.noreply.github.com> --- examples/llm_qat/main.py | 3 +++ modelopt/torch/export/quant_utils.py | 9 ++++++++ modelopt/torch/export/unified_export_hf.py | 19 +++++++++++----- .../plugins/transformers_trainer.py | 22 +++++++++++++++++++ 4 files changed, 48 insertions(+), 5 deletions(-) diff --git a/examples/llm_qat/main.py b/examples/llm_qat/main.py index 30f49a6a5..f6727d76c 100644 --- a/examples/llm_qat/main.py +++ b/examples/llm_qat/main.py @@ -273,6 +273,9 @@ def train(): kwargs = {"export_student": True} if training_args.distill else {} trainer.save_model(training_args.output_dir, **kwargs) + if training_args.lora and getattr(quant_args, "compress", False): + trainer.export_base_model_hf_checkpoint() + if __name__ == "__main__": train() diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py index 885a12582..a5ab0158d 100755 --- a/modelopt/torch/export/quant_utils.py +++ b/modelopt/torch/export/quant_utils.py @@ -832,6 +832,9 @@ def postprocess_state_dict(state_dict: dict, maxbound: float, quantization: str "k_bmm_quantizer._bias_value": "k_proj.k_bias", "v_bmm_quantizer._bias_value": "v_proj.v_bias", "input_quantizer._pre_quant_scale": "pre_quant_scale", + "base_layer.weight": "weight", + "base_layer.input_scale": "input_scale", + "base_layer.weight_scale": "weight_scale", } post_state_dict = {} @@ -843,6 +846,7 @@ def postprocess_state_dict(state_dict: dict, maxbound: float, quantization: str and "_amax" not in key and "_bias_value" not in key and "input_quantizer._pre_quant_scale" not in key + and "base_layer" not in key ): post_state_dict[key] = value continue @@ -894,6 +898,11 @@ def postprocess_state_dict(state_dict: dict, maxbound: float, quantization: str ): keys_to_delete.append(key) + # remove LoRA adapters from state dict + for key, value in post_state_dict.items(): + if "lora" in key and key not in keys_to_delete: + keys_to_delete.append(key) + # Check for tied weights and remove duplicates seen_tensors = {} diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index 2a69831e9..af153158c 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -29,7 +29,7 @@ from modelopt.torch.quantization import set_quantizer_by_cfg_context from modelopt.torch.quantization.nn import SequentialQuantizer, TensorQuantizer -from modelopt.torch.quantization.qtensor import NVFP4QTensor +from modelopt.torch.quantization.qtensor import NVFP4QTensor, QTensorWrapper from modelopt.torch.quantization.utils import quantizer_attr_names from .convert_hf_config import convert_hf_quant_config_format @@ -85,6 +85,9 @@ def _is_enabled_quantizer(quantizer): def requantize_resmooth_fused_llm_layers(model: torch.nn.Module): """Group modules that take the same input and register shared parameters in module.""" + # Skip for LoRA finetuned models + if hasattr(model, "base_model"): + return # TODO: Handle DBRX MoE input_to_linear = defaultdict(list) output_to_layernorm = defaultdict(None) @@ -311,7 +314,7 @@ def _export_quantized_weight( )[0] quantized_weight = to_quantized_weight( - weight.to(dtype), + weight.to(dtype) if not isinstance(weight, QTensorWrapper) else weight, weight_scale, 
quantization_format, weight_scale_2, @@ -323,7 +326,7 @@ def _export_quantized_weight( ) else: quantized_weight = to_quantized_weight( - weight.to(dtype), + weight.to(dtype) if not isinstance(weight, QTensorWrapper) else weight, weight_scale, quantization_format, weight_scale_2, @@ -461,7 +464,11 @@ def _export_hf_checkpoint( for name, sub_module in layer_pool.items(): if get_quantization_format(sub_module) != QUANTIZATION_NONE: has_quantized_layers = True - if is_quantlinear(sub_module): + if ( + is_quantlinear(sub_module) + and hasattr(sub_module, "weight_quantizer") + and sub_module.weight_quantizer.is_enabled + ): _export_quantized_weight(sub_module, dtype) elif ( "Llama4TextExperts" in type(sub_module).__name__ @@ -523,7 +530,9 @@ def export_hf_checkpoint( post_state_dict = rename_and_prune_if_spec_decoding(model, post_state_dict) - # Save model + # For QLoRA models we export the base model + if hasattr(model, "base_model"): + model = model.base_model model.save_pretrained( export_dir, state_dict=post_state_dict, save_modelopt_state=save_modelopt_state ) diff --git a/modelopt/torch/quantization/plugins/transformers_trainer.py b/modelopt/torch/quantization/plugins/transformers_trainer.py index 9d429ffab..c4a04dad5 100644 --- a/modelopt/torch/quantization/plugins/transformers_trainer.py +++ b/modelopt/torch/quantization/plugins/transformers_trainer.py @@ -16,6 +16,7 @@ """ModelOpt plugin for transformers Trainer.""" import gc +import json import os import types from dataclasses import dataclass, field @@ -28,6 +29,7 @@ from modelopt.torch.distill import KDLossConfig from modelopt.torch.distill.mode import _convert_for_kd from modelopt.torch.distill.plugins.huggingface import KDTrainer +from modelopt.torch.export.unified_export_hf import export_hf_checkpoint from modelopt.torch.opt.conversion import restore_from_modelopt_state from modelopt.torch.opt.plugins import ModelOptHFTrainer from modelopt.torch.quantization.config import QuantizeConfig @@ -217,6 +219,7 @@ def forward_loop(model): gc.collect() self._save_modelopt_state_with_weights() + torch.cuda.empty_cache() if self.accelerator.is_main_process: @@ -275,6 +278,25 @@ def save_model(self, *args, **kwargs): outputs = super().save_model(*args, **kwargs) return outputs + def _load_best_model(self, *args, **kwargs): + """Load the best model.""" + is_lora = getattr(self.args, "lora", None) + if not is_lora: + super()._load_best_model(*args, **kwargs) + else: + # Custom logic for loading best model with LoRA + adapter_name = self.model.active_adapter() + self.model.delete_adapter(adapter_name) + self.model.load_adapter(self.state.best_model_checkpoint, adapter_name) + + def export_base_model_hf_checkpoint(self): + """Export the basemodel to HF checkpoint for deployment.""" + # Save config.json + if self.accelerator.is_main_process: + with open(f"{self.args.output_dir}/config.json", "w") as f: + json.dump(self.model.config.to_dict(), f, indent=2) + export_hf_checkpoint(self.model, export_dir=f"{self.args.output_dir}/base_model") + def _patch_accelerate_for_fsdp2_fix(self): """Fixes for accelerate prepare. 
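[PATCH 01/26] above reworks `postprocess_state_dict` so that a QLoRA checkpoint exports under plain base-model names: `base_layer.weight`, `base_layer.input_scale` and `base_layer.weight_scale` are renamed to their non-PEFT equivalents, and `lora` adapter tensors are dropped from the exported state dict. A minimal, self-contained sketch of that remapping follows; the key names and tensor shapes are made up for illustration and this is not part of the patch itself:

```python
# Sketch of the QLoRA key remapping done in postprocess_state_dict.
# Key names and shapes below are hypothetical, for illustration only.
import torch

state_dict = {
    "model.layers.0.self_attn.q_proj.base_layer.weight": torch.zeros(16, 16),
    "model.layers.0.self_attn.q_proj.base_layer.weight_scale": torch.ones(1),
    "model.layers.0.self_attn.q_proj.lora_A.default.weight": torch.zeros(4, 16),
    "model.layers.0.self_attn.q_proj.lora_B.default.weight": torch.zeros(16, 4),
}

# Mirrors the replacements added to postprocess_state_dict in this patch
replacements = {
    "base_layer.weight_scale": "weight_scale",
    "base_layer.input_scale": "input_scale",
    "base_layer.weight": "weight",
}

post_state_dict = {}
for key, value in state_dict.items():
    if "lora" in key:
        continue  # adapter tensors are kept out of the exported base-model checkpoint
    for old, new in replacements.items():
        if old in key:
            key = key.replace(old, new)
            break
    post_state_dict[key] = value

# post_state_dict now uses deployment-ready names:
#   model.layers.0.self_attn.q_proj.weight
#   model.layers.0.self_attn.q_proj.weight_scale
```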
From cc341d3b2a44eb0e7b58ac19cfaebe7ce65676ac Mon Sep 17 00:00:00 2001 From: Suguna Velury <178320438+sugunav14@users.noreply.github.com> Date: Mon, 22 Sep 2025 16:02:22 +0000 Subject: [PATCH 02/26] updated readme Signed-off-by: Suguna Velury <178320438+sugunav14@users.noreply.github.com> --- examples/llm_qat/README.md | 6 +++++- examples/llm_qat/main.py | 2 +- modelopt/torch/quantization/plugins/transformers_trainer.py | 5 +++-- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/examples/llm_qat/README.md b/examples/llm_qat/README.md index 44e3ceb6a..d5352f941 100644 --- a/examples/llm_qat/README.md +++ b/examples/llm_qat/README.md @@ -345,7 +345,11 @@ To perform QLoRA training, run: --lora True ``` -> **_NOTE:_** QLoRA is currently an experimental feature designed to reduce the memory footprint during training. Deployment functionality is not yet available. +After performing QLoRA training the final checkpoint is exported to be ready for deployment. For more details about QLoRA deployment using vLLM dere to the documentation [here](https://docs.vllm.ai/en/latest/features/lora.html). To deploy with vLLM, run: + +```sh +vllm serve llama3-fp4-qlora/base_model --enable-lora --lora-modules adapter=llama3-fp4-qlora --port 8000 --tokenizer llama3-fp4-qlora +``` ## Pre-Quantized Checkpoints diff --git a/examples/llm_qat/main.py b/examples/llm_qat/main.py index f6727d76c..66e8bc991 100644 --- a/examples/llm_qat/main.py +++ b/examples/llm_qat/main.py @@ -274,7 +274,7 @@ def train(): trainer.save_model(training_args.output_dir, **kwargs) if training_args.lora and getattr(quant_args, "compress", False): - trainer.export_base_model_hf_checkpoint() + trainer.export_base_model() if __name__ == "__main__": diff --git a/modelopt/torch/quantization/plugins/transformers_trainer.py b/modelopt/torch/quantization/plugins/transformers_trainer.py index c4a04dad5..54827f954 100644 --- a/modelopt/torch/quantization/plugins/transformers_trainer.py +++ b/modelopt/torch/quantization/plugins/transformers_trainer.py @@ -279,17 +279,18 @@ def save_model(self, *args, **kwargs): return outputs def _load_best_model(self, *args, **kwargs): - """Load the best model.""" + """Load the best model for final evaluation.""" is_lora = getattr(self.args, "lora", None) if not is_lora: super()._load_best_model(*args, **kwargs) else: # Custom logic for loading best model with LoRA + # TODO: Remove once we migrate to using get_peft_model() adapter_name = self.model.active_adapter() self.model.delete_adapter(adapter_name) self.model.load_adapter(self.state.best_model_checkpoint, adapter_name) - def export_base_model_hf_checkpoint(self): + def export_base_model(self): """Export the basemodel to HF checkpoint for deployment.""" # Save config.json if self.accelerator.is_main_process: From 20fb90f0ff1a618075d04c061af87d09aa0ee9aa Mon Sep 17 00:00:00 2001 From: Suguna Velury <178320438+sugunav14@users.noreply.github.com> Date: Mon, 22 Sep 2025 16:09:58 +0000 Subject: [PATCH 03/26] minor refactor Signed-off-by: Suguna Velury <178320438+sugunav14@users.noreply.github.com> --- modelopt/torch/export/quant_utils.py | 4 ++++ modelopt/torch/export/unified_export_hf.py | 8 +++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py index a5ab0158d..70f4b5acd 100755 --- a/modelopt/torch/export/quant_utils.py +++ b/modelopt/torch/export/quant_utils.py @@ -733,6 +733,7 @@ def to_quantized_weight( quantization: str, weights_scaling_factor2: torch.Tensor | 
None = None, block_size: int | None = None, + dtype: torch.dtype | None = None, ): """Converts the weight to the quantized (packed) format.""" if weights_scaling_factor is not None: @@ -745,6 +746,9 @@ def to_quantized_weight( if isinstance(weight, QTensorWrapper): return weight.data + if dtype: + weight = weight.to(dtype) + if quantization == QUANTIZATION_FP8: # Fix RuntimeError: Promotion for Float8 Types is not supported, attempted to promote Float8_e4m3fn and Float # in speculative decoding fp8 model export diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index af153158c..ac895e609 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -29,7 +29,7 @@ from modelopt.torch.quantization import set_quantizer_by_cfg_context from modelopt.torch.quantization.nn import SequentialQuantizer, TensorQuantizer -from modelopt.torch.quantization.qtensor import NVFP4QTensor, QTensorWrapper +from modelopt.torch.quantization.qtensor import NVFP4QTensor from modelopt.torch.quantization.utils import quantizer_attr_names from .convert_hf_config import convert_hf_quant_config_format @@ -314,11 +314,12 @@ def _export_quantized_weight( )[0] quantized_weight = to_quantized_weight( - weight.to(dtype) if not isinstance(weight, QTensorWrapper) else weight, + weight, weight_scale, quantization_format, weight_scale_2, block_size, + dtype, ) quantized_weight, weight_scale = maybe_transpose_expert_weight_dimensions( @@ -326,11 +327,12 @@ def _export_quantized_weight( ) else: quantized_weight = to_quantized_weight( - weight.to(dtype) if not isinstance(weight, QTensorWrapper) else weight, + weight, weight_scale, quantization_format, weight_scale_2, block_size, + dtype, ) setattr(sub_module, weight_name, nn.Parameter(quantized_weight, requires_grad=False)) From 3c560e695b262fabf3d4a09d9c499058d9cc6dcb Mon Sep 17 00:00:00 2001 From: Suguna Velury <178320438+sugunav14@users.noreply.github.com> Date: Mon, 22 Sep 2025 16:16:13 +0000 Subject: [PATCH 04/26] minor update Signed-off-by: Suguna Velury <178320438+sugunav14@users.noreply.github.com> --- examples/llm_qat/README.md | 2 +- modelopt/torch/quantization/plugins/transformers_trainer.py | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/examples/llm_qat/README.md b/examples/llm_qat/README.md index d5352f941..ca9f6c4aa 100644 --- a/examples/llm_qat/README.md +++ b/examples/llm_qat/README.md @@ -345,7 +345,7 @@ To perform QLoRA training, run: --lora True ``` -After performing QLoRA training the final checkpoint is exported to be ready for deployment. For more details about QLoRA deployment using vLLM dere to the documentation [here](https://docs.vllm.ai/en/latest/features/lora.html). To deploy with vLLM, run: +After performing QLoRA training the final checkpoint exported is ready for deployment using vLLM. For more details about QLoRA deployment using vLLM refer to the documentation [here](https://docs.vllm.ai/en/latest/features/lora.html). 
To deploy with vLLM, run: ```sh vllm serve llama3-fp4-qlora/base_model --enable-lora --lora-modules adapter=llama3-fp4-qlora --port 8000 --tokenizer llama3-fp4-qlora diff --git a/modelopt/torch/quantization/plugins/transformers_trainer.py b/modelopt/torch/quantization/plugins/transformers_trainer.py index 54827f954..29fbb6c72 100644 --- a/modelopt/torch/quantization/plugins/transformers_trainer.py +++ b/modelopt/torch/quantization/plugins/transformers_trainer.py @@ -16,7 +16,6 @@ """ModelOpt plugin for transformers Trainer.""" import gc -import json import os import types from dataclasses import dataclass, field @@ -219,7 +218,6 @@ def forward_loop(model): gc.collect() self._save_modelopt_state_with_weights() - torch.cuda.empty_cache() if self.accelerator.is_main_process: @@ -294,8 +292,6 @@ def export_base_model(self): """Export the basemodel to HF checkpoint for deployment.""" # Save config.json if self.accelerator.is_main_process: - with open(f"{self.args.output_dir}/config.json", "w") as f: - json.dump(self.model.config.to_dict(), f, indent=2) export_hf_checkpoint(self.model, export_dir=f"{self.args.output_dir}/base_model") def _patch_accelerate_for_fsdp2_fix(self): From d0218e7cbc5d8453b71b5bd1d327edca0cb86da5 Mon Sep 17 00:00:00 2001 From: Suguna Velury <178320438+sugunav14@users.noreply.github.com> Date: Mon, 22 Sep 2025 21:21:27 +0000 Subject: [PATCH 05/26] updated unit test Signed-off-by: Suguna Velury <178320438+sugunav14@users.noreply.github.com> --- tests/examples/llm_qat/test_llm_qat.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/examples/llm_qat/test_llm_qat.py b/tests/examples/llm_qat/test_llm_qat.py index c9fef976d..ed7548973 100644 --- a/tests/examples/llm_qat/test_llm_qat.py +++ b/tests/examples/llm_qat/test_llm_qat.py @@ -97,7 +97,6 @@ def test_llama_lora_qat_nvfp4(tiny_llama_path, tmp_path): ) -@pytest.mark.skip(reason="Fix QLoRa test failure") def test_llama_qlora_nvfp4(tiny_llama_path, tmp_path): _run_command( [ From 2c980b88bdd52def5dced354f24a1ed7c318d0d6 Mon Sep 17 00:00:00 2001 From: Suguna Velury <178320438+sugunav14@users.noreply.github.com> Date: Tue, 23 Sep 2025 00:44:04 +0000 Subject: [PATCH 06/26] Minor bug fix Signed-off-by: Suguna Velury <178320438+sugunav14@users.noreply.github.com> --- modelopt/torch/export/quant_utils.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py index 70f4b5acd..9b932e815 100755 --- a/modelopt/torch/export/quant_utils.py +++ b/modelopt/torch/export/quant_utils.py @@ -612,6 +612,8 @@ def process_layer_quant_config(layer_config_dict): # Get the corresponding AWQ block size block_size_value = layer_config_dict.get(awq_key, 0) + # print(f"DEBUG LOG: Processing layer {k} with quantization {v}, block size {block_size_value}") + if v == "fp8": layer_config = {"quant_algo": "FP8"} elif v == "fp8_pc_pt": @@ -1085,9 +1087,11 @@ def get_quant_config(named_modules: nn.Module | dict[str, nn.Module]) -> dict[st if block_size == 0: block_size = get_weight_block_size(module) - # Construct per layer config dictionary - layer_config_dict[name + ".quantization"] = quantization_format - layer_config_dict[name + ".awq_block_size"] = block_size + # Handles case if default weight quantizer is not enabled or is None + if block_size != 0: + # Construct per layer config dictionary + layer_config_dict[name + ".quantization"] = quantization_format + layer_config_dict[name + ".awq_block_size"] = block_size # Find kv cache quant format if ( From 
5ed0307bb7ffffb45a5882755043452ab6577a84 Mon Sep 17 00:00:00 2001 From: Suguna Velury <178320438+sugunav14@users.noreply.github.com> Date: Thu, 25 Sep 2025 19:32:18 +0000 Subject: [PATCH 07/26] Update trainer to save base model weights and config.json Signed-off-by: Suguna Velury <178320438+sugunav14@users.noreply.github.com> --- examples/llm_qat/main.py | 3 --- .../plugins/transformers_trainer.py | 20 ++++++++++++------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/examples/llm_qat/main.py b/examples/llm_qat/main.py index 66e8bc991..30f49a6a5 100644 --- a/examples/llm_qat/main.py +++ b/examples/llm_qat/main.py @@ -273,9 +273,6 @@ def train(): kwargs = {"export_student": True} if training_args.distill else {} trainer.save_model(training_args.output_dir, **kwargs) - if training_args.lora and getattr(quant_args, "compress", False): - trainer.export_base_model() - if __name__ == "__main__": train() diff --git a/modelopt/torch/quantization/plugins/transformers_trainer.py b/modelopt/torch/quantization/plugins/transformers_trainer.py index 29fbb6c72..583d1d6dd 100644 --- a/modelopt/torch/quantization/plugins/transformers_trainer.py +++ b/modelopt/torch/quantization/plugins/transformers_trainer.py @@ -21,6 +21,7 @@ from dataclasses import dataclass, field import torch +from safetensors.torch import save_file from tqdm import tqdm import modelopt.torch.opt as mto @@ -28,7 +29,6 @@ from modelopt.torch.distill import KDLossConfig from modelopt.torch.distill.mode import _convert_for_kd from modelopt.torch.distill.plugins.huggingface import KDTrainer -from modelopt.torch.export.unified_export_hf import export_hf_checkpoint from modelopt.torch.opt.conversion import restore_from_modelopt_state from modelopt.torch.opt.plugins import ModelOptHFTrainer from modelopt.torch.quantization.config import QuantizeConfig @@ -182,6 +182,18 @@ def _save_modelopt_state_with_weights(self): print_rank_0(f"Saved modelopt state to {self._modelopt_state_path}") + # Save base model compressed weights for QLoRA + if getattr(self.quant_args, "compress", False): + # Save base model config.json + self.model.config.save_pretrained(self.args.output_dir) + + # Save base model compressed weights excluding lora weights + state_dict = self.model.state_dict() + for k in [key for key in state_dict if "lora" in key]: + del state_dict[k] + + save_file(state_dict, f"{self.args.output_dir}/model.safetensors") + def _restore_modelopt_state_with_weights(self): modelopt_state = torch.load(self._modelopt_state_path, weights_only=False) modelopt_weights = modelopt_state.pop("modelopt_state_weights", None) @@ -288,12 +300,6 @@ def _load_best_model(self, *args, **kwargs): self.model.delete_adapter(adapter_name) self.model.load_adapter(self.state.best_model_checkpoint, adapter_name) - def export_base_model(self): - """Export the basemodel to HF checkpoint for deployment.""" - # Save config.json - if self.accelerator.is_main_process: - export_hf_checkpoint(self.model, export_dir=f"{self.args.output_dir}/base_model") - def _patch_accelerate_for_fsdp2_fix(self): """Fixes for accelerate prepare. 
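[PATCH 07/26] above moves the QLoRA artifacts into the trainer's save path: besides the modelopt state, it now writes the base model's `config.json` and a `model.safetensors` containing the compressed base weights with all `lora` tensors stripped. A hedged sketch of how such an output directory can be reloaded, mirroring the `get_lora_model` helper that the next patch adds to `examples/llm_ptq/example_utils.py`; the checkpoint path and the modelopt-state file name are assumptions:

```python
# Sketch: reload the QLoRA base-model artifacts written by the trainer above.
# The directory layout follows _save_modelopt_state_with_weights(); names are assumed.
import torch
from safetensors.torch import load_file
from transformers import AutoModelForCausalLM

from modelopt.torch.opt.conversion import restore_from_modelopt_state

ckpt_path = "llama3-fp4-qlora"  # output_dir used during QLoRA training (assumed)

# Base model architecture plus the LoRA adapters saved by the trainer
model = AutoModelForCausalLM.from_pretrained(ckpt_path, device_map="auto")

# Re-attach the quantizers recorded in the saved modelopt state
modelopt_state = torch.load(f"{ckpt_path}/modelopt_state.pth", weights_only=False)
restore_from_modelopt_state(model, modelopt_state)

# Load the compressed base weights; LoRA tensors were excluded at save time,
# hence strict=False
state_dict = load_file(f"{ckpt_path}/model.safetensors")
model.load_state_dict(state_dict, strict=False)
```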
From 41a5464d8a839d2226bb93e26f3a38fcee6ce66d Mon Sep 17 00:00:00 2001 From: Suguna Velury <178320438+sugunav14@users.noreply.github.com> Date: Fri, 26 Sep 2025 01:16:00 +0000 Subject: [PATCH 08/26] export for fp8 lora base model Signed-off-by: Suguna Velury <178320438+sugunav14@users.noreply.github.com> --- examples/llm_ptq/example_utils.py | 26 +++++++++++++++++ examples/llm_ptq/hf_ptq.py | 29 ++++++++++++++----- modelopt/torch/export/quant_utils.py | 8 ++--- modelopt/torch/export/unified_export_hf.py | 22 +++++++++----- .../plugins/transformers_trainer.py | 5 ++-- 5 files changed, 68 insertions(+), 22 deletions(-) diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py index 38e11a8e1..95c51006f 100755 --- a/examples/llm_ptq/example_utils.py +++ b/examples/llm_ptq/example_utils.py @@ -25,6 +25,7 @@ import transformers from accelerate import infer_auto_device_map, init_empty_weights from accelerate.utils import get_max_memory +from safetensors.torch import load_file from transformers import AutoConfig, AutoModelForCausalLM, AutoProcessor, AutoTokenizer try: @@ -115,6 +116,31 @@ def get_dtype(dtype): return dtype +def get_lora_model( + ckpt_path: str, + device="cuda", +): + """ + Loads a QLoRA model that has been trained using modelopt trainer. + """ + device_map = "auto" + if device == "cpu": + device_map = "cpu" + + # Load model with adapters + model = AutoModelForCausalLM.from_pretrained(ckpt_path, device_map=device_map) + + # Restore modelopt state + modelopt_state = torch.load(f"{ckpt_path}/modelopt_state.pth", weights_only=False) + restore_from_modelopt_state(model, modelopt_state) + + # Load compressed weights + state_dict = load_file(f"{ckpt_path}/model.safetensors") + model.load_state_dict(state_dict, strict=False) + + return model + + def get_model( ckpt_path, device="cuda", diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index da6761252..2ad474f86 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -241,14 +241,20 @@ def main(args): # If low memory mode is enabled, we compress the model while loading the HF checkpoint. calibration_only = False if not args.low_memory_mode: - model = get_model( - args.pyt_ckpt_path, - args.device, - gpu_mem_percentage=args.gpu_max_mem_percentage, - trust_remote_code=args.trust_remote_code, - use_seq_device_map=args.use_seq_device_map, - attn_implementation=args.attn_implementation, - ) + if args.lora: + model = get_lora_model( + args.pyt_ckpt_path, + args.device, + ) + else: + model = get_model( + args.pyt_ckpt_path, + args.device, + gpu_mem_percentage=args.gpu_max_mem_percentage, + trust_remote_code=args.trust_remote_code, + use_seq_device_map=args.use_seq_device_map, + attn_implementation=args.attn_implementation, + ) else: assert args.qformat in QUANT_CFG_CHOICES, ( f"Quantization format is not supported for low memory mode. Supported formats: {QUANT_CFG_CHOICES.keys()}" @@ -629,6 +635,7 @@ def output_decode(generated_ids, input_shape): "They will be set at deployment time." 
) + print("DEBUG LOG: Calling unified export hf checkpoint") export_hf_checkpoint( full_model, export_dir=export_path, @@ -772,6 +779,12 @@ def output_decode(generated_ids, input_shape): default=None, type=str, ) + parser.add_argument( + "--lora", + help="Specify the model to be exported is a LoRA model trained using modelopt.", + default=False, + action="store_true", + ) args = parser.parse_args() diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py index 9b932e815..41ed01c0c 100755 --- a/modelopt/torch/export/quant_utils.py +++ b/modelopt/torch/export/quant_utils.py @@ -1087,11 +1087,9 @@ def get_quant_config(named_modules: nn.Module | dict[str, nn.Module]) -> dict[st if block_size == 0: block_size = get_weight_block_size(module) - # Handles case if default weight quantizer is not enabled or is None - if block_size != 0: - # Construct per layer config dictionary - layer_config_dict[name + ".quantization"] = quantization_format - layer_config_dict[name + ".awq_block_size"] = block_size + # Construct per layer config dictionary + layer_config_dict[name + ".quantization"] = quantization_format + layer_config_dict[name + ".awq_block_size"] = block_size # Find kv cache quant format if ( diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index ac895e609..2e1ea8a51 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -518,32 +518,40 @@ def export_hf_checkpoint( export_dir: the target export path. save_modelopt_state: whether to save the modelopt state_dict. """ + is_lora = hasattr(model, "base_model") + base_export_dir: Path | str = f"{export_dir}/base_model" if is_lora else export_dir export_dir = Path(export_dir) export_dir.mkdir(parents=True, exist_ok=True) + base_export_dir = Path(base_export_dir) + base_export_dir.mkdir(parents=True, exist_ok=True) + try: post_state_dict, hf_quant_config = _export_hf_checkpoint(model, dtype) # NOTE: (hg) Should we save hf_quant_config when there's no quantization applied? 
# Save hf_quant_config.json for backward compatibility - with open(f"{export_dir}/hf_quant_config.json", "w") as file: + with open(f"{base_export_dir}/hf_quant_config.json", "w") as file: json.dump(hf_quant_config, file, indent=4) hf_quant_config = convert_hf_quant_config_format(hf_quant_config) post_state_dict = rename_and_prune_if_spec_decoding(model, post_state_dict) - # For QLoRA models we export the base model - if hasattr(model, "base_model"): - model = model.base_model + # In the case of LoRA model, we save the base model + if is_lora: + model.base_model.save_pretrained( + base_export_dir, state_dict=post_state_dict, save_modelopt_state=save_modelopt_state + ) + model.save_pretrained( export_dir, state_dict=post_state_dict, save_modelopt_state=save_modelopt_state ) - original_config = f"{export_dir}/config.json" + original_config = f"{base_export_dir}/config.json" config_data = {} - with open(original_config) as file: - config_data = json.load(file) + # In the case of LoRA model.save_pretrained does not save the correct config.json + config_data = model.config.to_dict() config_data["quantization_config"] = hf_quant_config diff --git a/modelopt/torch/quantization/plugins/transformers_trainer.py b/modelopt/torch/quantization/plugins/transformers_trainer.py index 583d1d6dd..6f839d22c 100644 --- a/modelopt/torch/quantization/plugins/transformers_trainer.py +++ b/modelopt/torch/quantization/plugins/transformers_trainer.py @@ -147,7 +147,7 @@ def __init__( self.model, "peft_config" ): # TODO: use get_peft_model here instead of add_adapter - self.model.add_adapter(self.args.lora_config, adapter_name="adapter") + self.model.add_adapter(self.args.lora_config) print_rank_0("Lora adapter added.") if hasattr(self.model, "peft_config") and self.quant_cfg is not None: @@ -185,6 +185,7 @@ def _save_modelopt_state_with_weights(self): # Save base model compressed weights for QLoRA if getattr(self.quant_args, "compress", False): # Save base model config.json + # weight_quantizer = self.quant_cfg["quant_cfg"]["*weight_quantizer"] self.model.config.save_pretrained(self.args.output_dir) # Save base model compressed weights excluding lora weights @@ -362,7 +363,7 @@ def __init__( if self.quant_cfg is not None and not is_quantized(self.model): self._quantize_model() if getattr(self.args, "lora_config", None) is not None: - self.model.add_adapter(self.args.lora_config, adapter_name="adapter") + self.model.add_adapter(self.args.lora_config) print_rank_0("Lora adapter added.") self._convert_to_distillation_model() From 2a6ce0a65cded80e05e8f6f69e249efdd0694251 Mon Sep 17 00:00:00 2001 From: Suguna Velury <178320438+sugunav14@users.noreply.github.com> Date: Fri, 26 Sep 2025 16:26:28 +0000 Subject: [PATCH 09/26] added support for nvfp4 export Signed-off-by: Suguna Velury <178320438+sugunav14@users.noreply.github.com> --- examples/llm_ptq/hf_ptq.py | 8 +++++-- modelopt/torch/export/quant_utils.py | 14 +++++++++++-- modelopt/torch/export/unified_export_hf.py | 9 ++++---- .../quantization/qtensor/nvfp4_tensor.py | 21 +++++++++++++++++++ 4 files changed, 44 insertions(+), 8 deletions(-) diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 2ad474f86..b1e988c26 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -364,7 +364,9 @@ def main(args): ) mts.export(model) - if args.auto_quantize_bits or args.qformat in QUANT_CFG_CHOICES: + if ( + args.auto_quantize_bits or args.qformat in QUANT_CFG_CHOICES + ) and not model_is_already_quantized: if "awq" in args.qformat: print( 
"\n####\nAWQ calibration could take longer than other calibration methods. " @@ -393,6 +395,9 @@ def main(args): sample_input_single_batch = None run_auto_quant = args.auto_quantize_bits is not None + print("DEBUG LOG: Entereing here") + for k, v in model.state_dict().items(): + print(k, v.shape, v.dtype, v.device) args.batch_size = get_max_batch_size( model, @@ -635,7 +640,6 @@ def output_decode(generated_ids, input_shape): "They will be set at deployment time." ) - print("DEBUG LOG: Calling unified export hf checkpoint") export_hf_checkpoint( full_model, export_dir=export_path, diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py index 41ed01c0c..2f7300884 100755 --- a/modelopt/torch/export/quant_utils.py +++ b/modelopt/torch/export/quant_utils.py @@ -270,6 +270,15 @@ def get_weight_scaling_factor(module: nn.Module, weight_name: str = "weight") -> QUANTIZATION_NVFP4_AWQ, QUANTIZATION_W4A8_NVFP4_FP8, ]: + if hasattr(weight_quantizer, "_scale"): + # In this case, weight must be a QTensorWrapper + original_shape = weight.metadata["shape"] + ws = NVFP4QTensor.get_modelopt_weights_scaling_factor( + weight_quantizer._scale, original_shape + ) + print(f"weight_quantizer._scale: {ws.shape}") + return ws + return NVFP4QTensor.get_weights_scaling_factor( weight, weight_quantizer.block_sizes[-1], @@ -612,8 +621,6 @@ def process_layer_quant_config(layer_config_dict): # Get the corresponding AWQ block size block_size_value = layer_config_dict.get(awq_key, 0) - # print(f"DEBUG LOG: Processing layer {k} with quantization {v}, block size {block_size_value}") - if v == "fp8": layer_config = {"quant_algo": "FP8"} elif v == "fp8_pc_pt": @@ -1088,6 +1095,9 @@ def get_quant_config(named_modules: nn.Module | dict[str, nn.Module]) -> dict[st block_size = get_weight_block_size(module) # Construct per layer config dictionary + if block_size == 0 and quantization_format != QUANTIZATION_FP8: + continue + layer_config_dict[name + ".quantization"] = quantization_format layer_config_dict[name + ".awq_block_size"] = block_size diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index 2e1ea8a51..ae6537eda 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -542,10 +542,11 @@ def export_hf_checkpoint( model.base_model.save_pretrained( base_export_dir, state_dict=post_state_dict, save_modelopt_state=save_modelopt_state ) - - model.save_pretrained( - export_dir, state_dict=post_state_dict, save_modelopt_state=save_modelopt_state - ) + model.save_pretrained(export_dir, save_modelopt_state=save_modelopt_state) + else: + model.save_pretrained( + export_dir, state_dict=post_state_dict, save_modelopt_state=save_modelopt_state + ) original_config = f"{base_export_dir}/config.json" config_data = {} diff --git a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py index 65861695f..b98a32a3d 100644 --- a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py +++ b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py @@ -94,6 +94,27 @@ def get_weights_scaling_factor_2(cls, input: torch.Tensor): """Returns per tensor weight scaling factor.""" return reduce_amax(input).float() / (6.0 * 448.0) + @classmethod + def get_modelopt_weights_scaling_factor(cls, weight_scaling_factor: torch.Tensor, weight_shape): + """Returns the modelopt weights scaling factor if the quantization is done by trtllm.""" + if weight_scaling_factor.dtype == 
torch.float8_e4m3fn: + return weight_scaling_factor + + if weight_scaling_factor.dtype == torch.uint8 and weight_scaling_factor.ndim == 1: + # If quantization is done by trtllm, convert cutlass fp4 scale to modelopt fp4 scale + try: + from tensorrt_llm._torch.auto_deploy.utils.quantization_utils import ( + cutlass_fp4_scale_to_modelopt_fp4_scale, + ) + + return cutlass_fp4_scale_to_modelopt_fp4_scale( + weight_scaling_factor, weight_shape[-2:] + ) + except ImportError as e: + raise ImportError( + "This tensor is quantized by trtllm, but tensorrt_llm cannot be imported." + ) from e + @classmethod def get_activation_scaling_factor(cls, quantizer): """Returns the activation scaling factor for export.""" From 1480f8680a9e6f704375f0d7dba29adcff15e5de Mon Sep 17 00:00:00 2001 From: Suguna Velury <178320438+sugunav14@users.noreply.github.com> Date: Fri, 26 Sep 2025 17:08:04 +0000 Subject: [PATCH 10/26] minor Signed-off-by: Suguna Velury <178320438+sugunav14@users.noreply.github.com> --- modelopt/torch/export/quant_utils.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py index 2f7300884..ad7ed1439 100755 --- a/modelopt/torch/export/quant_utils.py +++ b/modelopt/torch/export/quant_utils.py @@ -271,13 +271,9 @@ def get_weight_scaling_factor(module: nn.Module, weight_name: str = "weight") -> QUANTIZATION_W4A8_NVFP4_FP8, ]: if hasattr(weight_quantizer, "_scale"): - # In this case, weight must be a QTensorWrapper - original_shape = weight.metadata["shape"] - ws = NVFP4QTensor.get_modelopt_weights_scaling_factor( - weight_quantizer._scale, original_shape + return NVFP4QTensor.get_modelopt_weights_scaling_factor( + weight_quantizer._scale, weight.metadata["shape"] ) - print(f"weight_quantizer._scale: {ws.shape}") - return ws return NVFP4QTensor.get_weights_scaling_factor( weight, From 05ab948420d840fbcaa4b32cb5948d99408ed7a6 Mon Sep 17 00:00:00 2001 From: Suguna Velury <178320438+sugunav14@users.noreply.github.com> Date: Mon, 29 Sep 2025 05:20:33 +0000 Subject: [PATCH 11/26] e2e checkpoint tested for nvfp4 and fp8 Signed-off-by: Suguna Velury <178320438+sugunav14@users.noreply.github.com> --- examples/llm_ptq/example_utils.py | 11 ++-- examples/llm_ptq/hf_ptq.py | 36 ++++++----- modelopt/torch/export/quant_utils.py | 60 ++++++++++++------- modelopt/torch/export/unified_export_hf.py | 25 ++++---- .../plugins/transformers_trainer.py | 7 +-- .../quantization/qtensor/nvfp4_tensor.py | 17 +----- 6 files changed, 83 insertions(+), 73 deletions(-) diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py index 95c51006f..537791346 100755 --- a/examples/llm_ptq/example_utils.py +++ b/examples/llm_ptq/example_utils.py @@ -118,15 +118,11 @@ def get_dtype(dtype): def get_lora_model( ckpt_path: str, - device="cuda", + device_map="cuda", ): """ Loads a QLoRA model that has been trained using modelopt trainer. 
""" - device_map = "auto" - if device == "cpu": - device_map = "cpu" - # Load model with adapters model = AutoModelForCausalLM.from_pretrained(ckpt_path, device_map=device_map) @@ -148,6 +144,7 @@ def get_model( trust_remote_code=False, use_seq_device_map=False, attn_implementation=None, + is_lora=False, ): print(f"Initializing model from {ckpt_path}") @@ -155,6 +152,10 @@ def get_model( if device == "cpu": device_map = "cpu" + if is_lora: + model = get_lora_model(ckpt_path, device_map) + return model + config_kwargs = {"trust_remote_code": trust_remote_code} if trust_remote_code else {} if attn_implementation is not None: config_kwargs["attn_implementation"] = attn_implementation diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index b1e988c26..47510273b 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -241,20 +241,15 @@ def main(args): # If low memory mode is enabled, we compress the model while loading the HF checkpoint. calibration_only = False if not args.low_memory_mode: - if args.lora: - model = get_lora_model( - args.pyt_ckpt_path, - args.device, - ) - else: - model = get_model( - args.pyt_ckpt_path, - args.device, - gpu_mem_percentage=args.gpu_max_mem_percentage, - trust_remote_code=args.trust_remote_code, - use_seq_device_map=args.use_seq_device_map, - attn_implementation=args.attn_implementation, - ) + model = get_model( + args.pyt_ckpt_path, + args.device, + gpu_mem_percentage=args.gpu_max_mem_percentage, + trust_remote_code=args.trust_remote_code, + use_seq_device_map=args.use_seq_device_map, + attn_implementation=args.attn_implementation, + is_lora=args.lora, + ) else: assert args.qformat in QUANT_CFG_CHOICES, ( f"Quantization format is not supported for low memory mode. Supported formats: {QUANT_CFG_CHOICES.keys()}" @@ -395,9 +390,6 @@ def main(args): sample_input_single_batch = None run_auto_quant = args.auto_quantize_bits is not None - print("DEBUG LOG: Entereing here") - for k, v in model.state_dict().items(): - print(k, v.shape, v.dtype, v.device) args.batch_size = get_max_batch_size( model, @@ -493,7 +485,7 @@ def main(args): quant_cfg["quant_cfg"]["*image*"] = {"enable": False} quant_cfg["quant_cfg"]["*vision*"] = {"enable": False} - if not model_is_already_quantized or calibration_only: + if calibration_only: # Only run single sample for preview input_ids = next(iter(calib_dataloader))[ "input_features" if model_type == "whisper" else "input_ids" @@ -567,7 +559,12 @@ def output_decode(generated_ids, input_shape): else: assert model_type != "dbrx", f"Does not support export {model_type} without quantizaton" - print(f"qformat: {args.qformat}. No quantization applied, export {device} model") + if model_is_already_quantized: + warnings.warn( + "Skipping quantization: Model is already quantized. Exporting the model..." + ) + else: + print(f"qformat: {args.qformat}. 
No quantization applied, export {device} model") with torch.inference_mode(): if model_type is None: @@ -643,6 +640,7 @@ def output_decode(generated_ids, input_shape): export_hf_checkpoint( full_model, export_dir=export_path, + is_modelopt_trained_lora=args.lora, ) # Copy custom model files (Python files and JSON configs) if trust_remote_code is used diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py index ad7ed1439..23bfd060c 100755 --- a/modelopt/torch/export/quant_utils.py +++ b/modelopt/torch/export/quant_utils.py @@ -270,23 +270,28 @@ def get_weight_scaling_factor(module: nn.Module, weight_name: str = "weight") -> QUANTIZATION_NVFP4_AWQ, QUANTIZATION_W4A8_NVFP4_FP8, ]: + # If scale is already registered, indicates weights are already compressed. + # We convert to modelopt scale if necessary and return if hasattr(weight_quantizer, "_scale"): return NVFP4QTensor.get_modelopt_weights_scaling_factor( weight_quantizer._scale, weight.metadata["shape"] ) - - return NVFP4QTensor.get_weights_scaling_factor( - weight, - weight_quantizer.block_sizes[-1], - NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer(weight_quantizer).to( - weight.device - ), - )[0] + else: + return NVFP4QTensor.get_weights_scaling_factor( + weight, + weight_quantizer.block_sizes[-1], + NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer(weight_quantizer).to( + weight.device + ), + )[0] if quantization_format in [QUANTIZATION_W4A8_MXFP4_FP8, QUANTIZATION_MXFP4]: - return MXFP4QTensor.quantize(weight, block_size=weight_quantizer.block_sizes[-1])[ - 1 - ].reshape(*weight.shape[:-1], -1) + if hasattr(weight_quantizer, "_scale"): + return weight_quantizer._scale.reshape(*weight.shape[:-1], -1) + else: + return MXFP4QTensor.quantize(weight, block_size=weight_quantizer.block_sizes[-1])[ + 1 + ].reshape(*weight.shape[:-1], -1) return get_scaling_factor(weight_quantizer) @@ -302,7 +307,10 @@ def get_weight_scaling_factor_2(module: nn.Module, weight_name: str = "weight") QUANTIZATION_NVFP4_AWQ, QUANTIZATION_W4A8_NVFP4_FP8, ]: - return NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer(weight_quantizer) + if hasattr(weight_quantizer, "_double_scale"): + return weight_quantizer._double_scale + else: + return NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer(weight_quantizer) # SequentialQuantizer is required if not isinstance(weight_quantizer, SequentialQuantizer) or not weight_quantizer[-1].is_enabled: @@ -824,7 +832,12 @@ def from_quantized_weight( raise NotImplementedError(f"quantization format {quantization} not supported") -def postprocess_state_dict(state_dict: dict, maxbound: float, quantization: str | None) -> dict: +def postprocess_state_dict( + state_dict: dict, + maxbound: float, + quantization: str | None, + is_modelopt_trained_lora: bool = False, +) -> dict: """Filters out keys related to weight quantizers and updates KV cache related keys. 
Args: @@ -841,11 +854,18 @@ def postprocess_state_dict(state_dict: dict, maxbound: float, quantization: str "k_bmm_quantizer._bias_value": "k_proj.k_bias", "v_bmm_quantizer._bias_value": "v_proj.v_bias", "input_quantizer._pre_quant_scale": "pre_quant_scale", - "base_layer.weight": "weight", - "base_layer.input_scale": "input_scale", - "base_layer.weight_scale": "weight_scale", } + # For modelopt-trained LoRA models, we need to remove the base_layer prefix from the keys for deployment + if is_modelopt_trained_lora: + replacements.update( + { + "base_layer.weight": "weight", + "base_layer.input_scale": "input_scale", + "base_layer.weight_scale": "weight_scale", + } + ) + post_state_dict = {} for key, value in state_dict.items(): @@ -908,10 +928,10 @@ def postprocess_state_dict(state_dict: dict, maxbound: float, quantization: str keys_to_delete.append(key) # remove LoRA adapters from state dict - for key, value in post_state_dict.items(): - if "lora" in key and key not in keys_to_delete: - keys_to_delete.append(key) - + if is_modelopt_trained_lora: + for key, value in post_state_dict.items(): + if "lora" in key and key not in keys_to_delete: + keys_to_delete.append(key) # Check for tied weights and remove duplicates seen_tensors = {} diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index ae6537eda..9dfe97aaa 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -85,9 +85,6 @@ def _is_enabled_quantizer(quantizer): def requantize_resmooth_fused_llm_layers(model: torch.nn.Module): """Group modules that take the same input and register shared parameters in module.""" - # Skip for LoRA finetuned models - if hasattr(model, "base_model"): - return # TODO: Handle DBRX MoE input_to_linear = defaultdict(list) output_to_layernorm = defaultdict(None) @@ -343,7 +340,7 @@ def _export_quantized_weight( def _export_hf_checkpoint( - model: nn.Module, dtype: torch.dtype | None = None + model: nn.Module, dtype: torch.dtype | None = None, is_modelopt_trained_lora: bool = False ) -> tuple[dict[str, Any], dict[str, Any]]: """Exports the torch model to the packed checkpoint with original HF naming. @@ -435,7 +432,9 @@ def _export_hf_checkpoint( # Resmooth and requantize fused layers # TODO: Handle mixed precision - requantize_resmooth_fused_llm_layers(model) + # TODO: Support requantize and resmooth for modelopt-trained LoRA models + if not is_modelopt_trained_lora: + requantize_resmooth_fused_llm_layers(model) # Remove all hooks from the model try: @@ -494,7 +493,7 @@ def _export_hf_checkpoint( quantized_state_dict = model.state_dict() quantized_state_dict = postprocess_state_dict( - quantized_state_dict, kv_cache_max_bound, kv_cache_format + quantized_state_dict, kv_cache_max_bound, kv_cache_format, is_modelopt_trained_lora ) # Check if any layers are quantized @@ -509,6 +508,7 @@ def export_hf_checkpoint( dtype: torch.dtype | None = None, export_dir: Path | str = tempfile.gettempdir(), save_modelopt_state: bool = False, + is_modelopt_trained_lora: bool = False, ): """Exports the torch model to unified checkpoint and saves to export_dir. @@ -518,15 +518,18 @@ def export_hf_checkpoint( export_dir: the target export path. save_modelopt_state: whether to save the modelopt state_dict. 
""" - is_lora = hasattr(model, "base_model") - base_export_dir: Path | str = f"{export_dir}/base_model" if is_lora else export_dir + base_export_dir: Path | str = ( + f"{export_dir}/base_model" if is_modelopt_trained_lora else export_dir + ) export_dir = Path(export_dir) export_dir.mkdir(parents=True, exist_ok=True) base_export_dir = Path(base_export_dir) base_export_dir.mkdir(parents=True, exist_ok=True) try: - post_state_dict, hf_quant_config = _export_hf_checkpoint(model, dtype) + post_state_dict, hf_quant_config = _export_hf_checkpoint( + model, dtype, is_modelopt_trained_lora + ) # NOTE: (hg) Should we save hf_quant_config when there's no quantization applied? # Save hf_quant_config.json for backward compatibility @@ -538,11 +541,11 @@ def export_hf_checkpoint( post_state_dict = rename_and_prune_if_spec_decoding(model, post_state_dict) # In the case of LoRA model, we save the base model - if is_lora: + if is_modelopt_trained_lora: model.base_model.save_pretrained( base_export_dir, state_dict=post_state_dict, save_modelopt_state=save_modelopt_state ) - model.save_pretrained(export_dir, save_modelopt_state=save_modelopt_state) + model.save_pretrained(export_dir) else: model.save_pretrained( export_dir, state_dict=post_state_dict, save_modelopt_state=save_modelopt_state diff --git a/modelopt/torch/quantization/plugins/transformers_trainer.py b/modelopt/torch/quantization/plugins/transformers_trainer.py index 6f839d22c..85323910e 100644 --- a/modelopt/torch/quantization/plugins/transformers_trainer.py +++ b/modelopt/torch/quantization/plugins/transformers_trainer.py @@ -185,7 +185,6 @@ def _save_modelopt_state_with_weights(self): # Save base model compressed weights for QLoRA if getattr(self.quant_args, "compress", False): # Save base model config.json - # weight_quantizer = self.quant_cfg["quant_cfg"]["*weight_quantizer"] self.model.config.save_pretrained(self.args.output_dir) # Save base model compressed weights excluding lora weights @@ -292,14 +291,14 @@ def save_model(self, *args, **kwargs): def _load_best_model(self, *args, **kwargs): """Load the best model for final evaluation.""" is_lora = getattr(self.args, "lora", None) - if not is_lora: - super()._load_best_model(*args, **kwargs) - else: + if is_lora and not self.is_fsdp_enabled: # Custom logic for loading best model with LoRA # TODO: Remove once we migrate to using get_peft_model() adapter_name = self.model.active_adapter() self.model.delete_adapter(adapter_name) self.model.load_adapter(self.state.best_model_checkpoint, adapter_name) + else: + super()._load_best_model(*args, **kwargs) def _patch_accelerate_for_fsdp2_fix(self): """Fixes for accelerate prepare. 
diff --git a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py index b98a32a3d..f134b6bd8 100644 --- a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py +++ b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py @@ -270,20 +270,9 @@ def _unpack_tensor(input: torch.Tensor): return unpacked.reshape(unpacked_shape) # Get scales from kwargs - if kwarg["scale"].dtype == torch.uint8 and kwarg["scale"].ndim == 1: - # If quantization is done by trtllm, convert cutlass fp4 scale to modelopt fp4 scale - try: - from tensorrt_llm._torch.auto_deploy.utils.quantization_utils import ( - cutlass_fp4_scale_to_modelopt_fp4_scale, - ) - - kwarg["scale"] = cutlass_fp4_scale_to_modelopt_fp4_scale( - kwarg["scale"], self.metadata["shape"][-2:] - ) - except ImportError as e: - raise ImportError( - "This tensor is quantized by trtllm, but tensorrt_llm cannot be imported." - ) from e + kwarg["scale"] = self.get_modelopt_weights_scaling_factor( + kwarg["scale"], self.metadata["shape"] + ) if fast: from ..triton.fp4_kernel import fp4_dequantize From 32cbf9aef5bd8a2dfc2274859db02800abf0eabe Mon Sep 17 00:00:00 2001 From: Suguna Velury <178320438+sugunav14@users.noreply.github.com> Date: Mon, 29 Sep 2025 15:55:42 +0000 Subject: [PATCH 12/26] cleanup Signed-off-by: Suguna Velury <178320438+sugunav14@users.noreply.github.com> --- examples/llm_ptq/example_utils.py | 4 ++-- examples/llm_ptq/hf_ptq.py | 8 +++---- examples/llm_qat/README.md | 17 ++++++++++++-- modelopt/torch/export/quant_utils.py | 27 +++++++++++----------- modelopt/torch/export/unified_export_hf.py | 27 ++++++++++------------ 5 files changed, 47 insertions(+), 36 deletions(-) diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py index 537791346..528c4a7a8 100755 --- a/examples/llm_ptq/example_utils.py +++ b/examples/llm_ptq/example_utils.py @@ -144,7 +144,7 @@ def get_model( trust_remote_code=False, use_seq_device_map=False, attn_implementation=None, - is_lora=False, + is_modelopt_qlora=False, ): print(f"Initializing model from {ckpt_path}") @@ -152,7 +152,7 @@ def get_model( if device == "cpu": device_map = "cpu" - if is_lora: + if is_modelopt_qlora: model = get_lora_model(ckpt_path, device_map) return model diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 47510273b..860cabbf7 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -248,7 +248,7 @@ def main(args): trust_remote_code=args.trust_remote_code, use_seq_device_map=args.use_seq_device_map, attn_implementation=args.attn_implementation, - is_lora=args.lora, + is_modelopt_qlora=args.qlora, ) else: assert args.qformat in QUANT_CFG_CHOICES, ( @@ -640,7 +640,7 @@ def output_decode(generated_ids, input_shape): export_hf_checkpoint( full_model, export_dir=export_path, - is_modelopt_trained_lora=args.lora, + is_modelopt_qlora=args.qlora, ) # Copy custom model files (Python files and JSON configs) if trust_remote_code is used @@ -782,8 +782,8 @@ def output_decode(generated_ids, input_shape): type=str, ) parser.add_argument( - "--lora", - help="Specify the model to be exported is a LoRA model trained using modelopt.", + "--qlora", + help="Specify the model to be exported is a QLoRA model trained using modelopt.", default=False, action="store_true", ) diff --git a/examples/llm_qat/README.md b/examples/llm_qat/README.md index ca9f6c4aa..20422a604 100644 --- a/examples/llm_qat/README.md +++ b/examples/llm_qat/README.md @@ -345,10 +345,23 @@ To perform QLoRA training, run: --lora 
True ``` -After performing QLoRA training the final checkpoint exported is ready for deployment using vLLM. For more details about QLoRA deployment using vLLM refer to the documentation [here](https://docs.vllm.ai/en/latest/features/lora.html). To deploy with vLLM, run: +After performing QLoRA training the final checkpoint can be exported for deployment with vLLM using the following command. ```sh -vllm serve llama3-fp4-qlora/base_model --enable-lora --lora-modules adapter=llama3-fp4-qlora --port 8000 --tokenizer llama3-fp4-qlora +cd ../llm_ptq + +python hf_ptq.py \ + --pyt_ckpt_path llama3-fp4-qlora \ + --qformat nvfp4 \ + --export_dir llama3-fp4-qlora-hf \ + --qlora + +``` + +To deploy with vLLM, run the following command. For more details about QLoRA deployment using vLLM refer to the documentation [here](https://docs.vllm.ai/en/latest/features/lora.html). + +```sh +vllm serve llama3-fp4-qlora-hf/base_model --enable-lora --lora-modules adapter=llama3-fp4-qlora-hf --port 8000 --tokenizer llama3-fp4-qlora-hf ``` ## Pre-Quantized Checkpoints diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py index 23bfd060c..491fdcda3 100755 --- a/modelopt/torch/export/quant_utils.py +++ b/modelopt/torch/export/quant_utils.py @@ -836,7 +836,7 @@ def postprocess_state_dict( state_dict: dict, maxbound: float, quantization: str | None, - is_modelopt_trained_lora: bool = False, + is_modelopt_qlora: bool = False, ) -> dict: """Filters out keys related to weight quantizers and updates KV cache related keys. @@ -855,9 +855,10 @@ def postprocess_state_dict( "v_bmm_quantizer._bias_value": "v_proj.v_bias", "input_quantizer._pre_quant_scale": "pre_quant_scale", } + skip_keys = ["output_quantizer", "_amax", "_bias_value", "input_quantizer._pre_quant_scale"] # For modelopt-trained LoRA models, we need to remove the base_layer prefix from the keys for deployment - if is_modelopt_trained_lora: + if is_modelopt_qlora: replacements.update( { "base_layer.weight": "weight", @@ -865,18 +866,13 @@ def postprocess_state_dict( "base_layer.weight_scale": "weight_scale", } ) + skip_keys.append("base_layer") post_state_dict = {} for key, value in state_dict.items(): # Skip keys not related to quantizers - if ( - "output_quantizer" not in key - and "_amax" not in key - and "_bias_value" not in key - and "input_quantizer._pre_quant_scale" not in key - and "base_layer" not in key - ): + if all(skip_key not in key for skip_key in skip_keys): post_state_dict[key] = value continue @@ -928,8 +924,8 @@ def postprocess_state_dict( keys_to_delete.append(key) # remove LoRA adapters from state dict - if is_modelopt_trained_lora: - for key, value in post_state_dict.items(): + if is_modelopt_qlora: + for key in post_state_dict: if "lora" in key and key not in keys_to_delete: keys_to_delete.append(key) # Check for tied weights and remove duplicates @@ -1110,10 +1106,15 @@ def get_quant_config(named_modules: nn.Module | dict[str, nn.Module]) -> dict[st if block_size == 0: block_size = get_weight_block_size(module) - # Construct per layer config dictionary - if block_size == 0 and quantization_format != QUANTIZATION_FP8: + # In the case of NVFP4, block_size 0 indicates weight_quantizer is not enabled + if block_size == 0 and quantization_format in [ + QUANTIZATION_NVFP4, + QUANTIZATION_NVFP4_AWQ, + QUANTIZATION_W4A8_NVFP4_FP8, + ]: continue + # Construct per layer config dictionary layer_config_dict[name + ".quantization"] = quantization_format layer_config_dict[name + ".awq_block_size"] = block_size diff --git 
a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index 9dfe97aaa..12f13410b 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -340,7 +340,7 @@ def _export_quantized_weight( def _export_hf_checkpoint( - model: nn.Module, dtype: torch.dtype | None = None, is_modelopt_trained_lora: bool = False + model: nn.Module, dtype: torch.dtype | None = None, is_modelopt_qlora: bool = False ) -> tuple[dict[str, Any], dict[str, Any]]: """Exports the torch model to the packed checkpoint with original HF naming. @@ -433,7 +433,7 @@ def _export_hf_checkpoint( # Resmooth and requantize fused layers # TODO: Handle mixed precision # TODO: Support requantize and resmooth for modelopt-trained LoRA models - if not is_modelopt_trained_lora: + if not is_modelopt_qlora: requantize_resmooth_fused_llm_layers(model) # Remove all hooks from the model @@ -493,7 +493,7 @@ def _export_hf_checkpoint( quantized_state_dict = model.state_dict() quantized_state_dict = postprocess_state_dict( - quantized_state_dict, kv_cache_max_bound, kv_cache_format, is_modelopt_trained_lora + quantized_state_dict, kv_cache_max_bound, kv_cache_format, is_modelopt_qlora ) # Check if any layers are quantized @@ -508,7 +508,7 @@ def export_hf_checkpoint( dtype: torch.dtype | None = None, export_dir: Path | str = tempfile.gettempdir(), save_modelopt_state: bool = False, - is_modelopt_trained_lora: bool = False, + is_modelopt_qlora: bool = False, ): """Exports the torch model to unified checkpoint and saves to export_dir. @@ -518,18 +518,15 @@ def export_hf_checkpoint( export_dir: the target export path. save_modelopt_state: whether to save the modelopt state_dict. """ - base_export_dir: Path | str = ( - f"{export_dir}/base_model" if is_modelopt_trained_lora else export_dir - ) + # Setup directories export_dir = Path(export_dir) - export_dir.mkdir(parents=True, exist_ok=True) - base_export_dir = Path(base_export_dir) - base_export_dir.mkdir(parents=True, exist_ok=True) + base_export_dir = export_dir / "base_model" if is_modelopt_qlora else export_dir + + for dir_path in [export_dir, base_export_dir]: + dir_path.mkdir(parents=True, exist_ok=True) try: - post_state_dict, hf_quant_config = _export_hf_checkpoint( - model, dtype, is_modelopt_trained_lora - ) + post_state_dict, hf_quant_config = _export_hf_checkpoint(model, dtype, is_modelopt_qlora) # NOTE: (hg) Should we save hf_quant_config when there's no quantization applied? 
# Save hf_quant_config.json for backward compatibility @@ -540,8 +537,8 @@ def export_hf_checkpoint( post_state_dict = rename_and_prune_if_spec_decoding(model, post_state_dict) - # In the case of LoRA model, we save the base model - if is_modelopt_trained_lora: + if is_modelopt_qlora: + # In the case of LoRA model, we save the base model and adapters model.base_model.save_pretrained( base_export_dir, state_dict=post_state_dict, save_modelopt_state=save_modelopt_state ) From 9b4c8a30c1d76c8893d05deb245c7d7c6928fedf Mon Sep 17 00:00:00 2001 From: Suguna Velury <178320438+sugunav14@users.noreply.github.com> Date: Tue, 30 Sep 2025 02:39:11 +0000 Subject: [PATCH 13/26] refactored Signed-off-by: Suguna Velury <178320438+sugunav14@users.noreply.github.com> --- examples/llm_ptq/example_utils.py | 27 ---- examples/llm_ptq/hf_ptq.py | 15 +-- examples/llm_qat/export.py | 119 ++++++++++++++++++ modelopt/torch/export/quant_utils.py | 39 ++---- modelopt/torch/export/unified_export_hf.py | 44 +++---- .../plugins/transformers_trainer.py | 22 ++-- 6 files changed, 157 insertions(+), 109 deletions(-) create mode 100644 examples/llm_qat/export.py diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py index 528c4a7a8..38e11a8e1 100755 --- a/examples/llm_ptq/example_utils.py +++ b/examples/llm_ptq/example_utils.py @@ -25,7 +25,6 @@ import transformers from accelerate import infer_auto_device_map, init_empty_weights from accelerate.utils import get_max_memory -from safetensors.torch import load_file from transformers import AutoConfig, AutoModelForCausalLM, AutoProcessor, AutoTokenizer try: @@ -116,27 +115,6 @@ def get_dtype(dtype): return dtype -def get_lora_model( - ckpt_path: str, - device_map="cuda", -): - """ - Loads a QLoRA model that has been trained using modelopt trainer. 
- """ - # Load model with adapters - model = AutoModelForCausalLM.from_pretrained(ckpt_path, device_map=device_map) - - # Restore modelopt state - modelopt_state = torch.load(f"{ckpt_path}/modelopt_state.pth", weights_only=False) - restore_from_modelopt_state(model, modelopt_state) - - # Load compressed weights - state_dict = load_file(f"{ckpt_path}/model.safetensors") - model.load_state_dict(state_dict, strict=False) - - return model - - def get_model( ckpt_path, device="cuda", @@ -144,7 +122,6 @@ def get_model( trust_remote_code=False, use_seq_device_map=False, attn_implementation=None, - is_modelopt_qlora=False, ): print(f"Initializing model from {ckpt_path}") @@ -152,10 +129,6 @@ def get_model( if device == "cpu": device_map = "cpu" - if is_modelopt_qlora: - model = get_lora_model(ckpt_path, device_map) - return model - config_kwargs = {"trust_remote_code": trust_remote_code} if trust_remote_code else {} if attn_implementation is not None: config_kwargs["attn_implementation"] = attn_implementation diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 860cabbf7..c08c7957b 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -248,7 +248,6 @@ def main(args): trust_remote_code=args.trust_remote_code, use_seq_device_map=args.use_seq_device_map, attn_implementation=args.attn_implementation, - is_modelopt_qlora=args.qlora, ) else: assert args.qformat in QUANT_CFG_CHOICES, ( @@ -359,9 +358,7 @@ def main(args): ) mts.export(model) - if ( - args.auto_quantize_bits or args.qformat in QUANT_CFG_CHOICES - ) and not model_is_already_quantized: + if args.auto_quantize_bits or args.qformat in QUANT_CFG_CHOICES: if "awq" in args.qformat: print( "\n####\nAWQ calibration could take longer than other calibration methods. " @@ -485,7 +482,7 @@ def main(args): quant_cfg["quant_cfg"]["*image*"] = {"enable": False} quant_cfg["quant_cfg"]["*vision*"] = {"enable": False} - if calibration_only: + if not model_is_already_quantized and calibration_only: # Only run single sample for preview input_ids = next(iter(calib_dataloader))[ "input_features" if model_type == "whisper" else "input_ids" @@ -559,12 +556,7 @@ def output_decode(generated_ids, input_shape): else: assert model_type != "dbrx", f"Does not support export {model_type} without quantizaton" - if model_is_already_quantized: - warnings.warn( - "Skipping quantization: Model is already quantized. Exporting the model..." - ) - else: - print(f"qformat: {args.qformat}. No quantization applied, export {device} model") + print(f"qformat: {args.qformat}. No quantization applied, export {device} model") with torch.inference_mode(): if model_type is None: @@ -640,7 +632,6 @@ def output_decode(generated_ids, input_shape): export_hf_checkpoint( full_model, export_dir=export_path, - is_modelopt_qlora=args.qlora, ) # Copy custom model files (Python files and JSON configs) if trust_remote_code is used diff --git a/examples/llm_qat/export.py b/examples/llm_qat/export.py new file mode 100644 index 000000000..2d032627b --- /dev/null +++ b/examples/llm_qat/export.py @@ -0,0 +1,119 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import json +import warnings +from pathlib import Path + +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + +from modelopt.torch.export.convert_hf_config import convert_hf_quant_config_format +from modelopt.torch.export.unified_export_hf import _export_hf_checkpoint +from modelopt.torch.opt.conversion import restore_from_modelopt_state +from modelopt.torch.quantization.utils import set_quantizer_state_dict + +RAND_SEED = 1234 + + +def get_lora_model( + ckpt_path: str, + device="cuda", +): + """ + Loads a QLoRA model that has been trained using modelopt trainer. + """ + device_map = "auto" + if device == "cpu": + device_map = "cpu" + + # Load model with adapters + model = AutoModelForCausalLM.from_pretrained(ckpt_path, device_map=device_map) + + # Restore modelopt state + modelopt_state = torch.load(f"{ckpt_path}/modelopt_state_calibration.pth", weights_only=False) + restore_from_modelopt_state(model, modelopt_state) + + # Restore modelopt quantizer state dict + modelopt_weights = modelopt_state.pop("modelopt_state_weights", None) + if modelopt_weights is not None: + print("Restoring modelopt weights") + set_quantizer_state_dict(model, modelopt_weights) + + return model + + +def main(args): + # Load model + model = get_lora_model(args.pyt_ckpt_path, args.device) + tokenizer = AutoTokenizer.from_pretrained(args.pyt_ckpt_path) + + # Export HF checkpoint + export_dir = Path(args.export_path) + export_dir.mkdir(parents=True, exist_ok=True) + base_model_dir = export_dir / "base_model" + base_model_dir.mkdir(parents=True, exist_ok=True) + + try: + post_state_dict, hf_quant_config = _export_hf_checkpoint(model, is_lora=True) + + with open(f"{export_dir}/base_model/hf_quant_config.json", "w") as file: + json.dump(hf_quant_config, file, indent=4) + + hf_quant_config = convert_hf_quant_config_format(hf_quant_config) + + # Save base model + model.base_model.save_pretrained(f"{export_dir}/base_model", state_dict=post_state_dict) + # Save adapters + model.save_pretrained(export_dir) + + config_path = f"{export_dir}/base_model/config.json" + + # In the case of LoRA model.save_pretrained does not save the correct config.json + config_data = model.config.to_dict() + print(config_data) + + config_data["quantization_config"] = hf_quant_config + + with open(config_path, "w") as file: + json.dump(config_data, file, indent=4) + + # Save tokenizer + tokenizer.save_pretrained(export_dir) + + except Exception as e: + warnings.warn( + "Cannot export model to the model_config. The modelopt-optimized model state_dict" + " can be saved with torch.save for further inspection." 
+ ) + raise e + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--pyt_ckpt_path", + help="Specify where the PyTorch checkpoint path is", + required=True, + ) + + parser.add_argument("--device", default="cuda") + + parser.add_argument("--export_path", default="exported_model") + + args = parser.parse_args() + + main(args) diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py index 491fdcda3..5253cd2d6 100755 --- a/modelopt/torch/export/quant_utils.py +++ b/modelopt/torch/export/quant_utils.py @@ -270,28 +270,18 @@ def get_weight_scaling_factor(module: nn.Module, weight_name: str = "weight") -> QUANTIZATION_NVFP4_AWQ, QUANTIZATION_W4A8_NVFP4_FP8, ]: - # If scale is already registered, indicates weights are already compressed. - # We convert to modelopt scale if necessary and return - if hasattr(weight_quantizer, "_scale"): - return NVFP4QTensor.get_modelopt_weights_scaling_factor( - weight_quantizer._scale, weight.metadata["shape"] - ) - else: - return NVFP4QTensor.get_weights_scaling_factor( - weight, - weight_quantizer.block_sizes[-1], - NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer(weight_quantizer).to( - weight.device - ), - )[0] + return NVFP4QTensor.get_weights_scaling_factor( + weight, + weight_quantizer.block_sizes[-1], + NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer(weight_quantizer).to( + weight.device + ), + )[0] if quantization_format in [QUANTIZATION_W4A8_MXFP4_FP8, QUANTIZATION_MXFP4]: - if hasattr(weight_quantizer, "_scale"): - return weight_quantizer._scale.reshape(*weight.shape[:-1], -1) - else: - return MXFP4QTensor.quantize(weight, block_size=weight_quantizer.block_sizes[-1])[ - 1 - ].reshape(*weight.shape[:-1], -1) + return MXFP4QTensor.quantize(weight, block_size=weight_quantizer.block_sizes[-1])[ + 1 + ].reshape(*weight.shape[:-1], -1) return get_scaling_factor(weight_quantizer) @@ -307,10 +297,7 @@ def get_weight_scaling_factor_2(module: nn.Module, weight_name: str = "weight") QUANTIZATION_NVFP4_AWQ, QUANTIZATION_W4A8_NVFP4_FP8, ]: - if hasattr(weight_quantizer, "_double_scale"): - return weight_quantizer._double_scale - else: - return NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer(weight_quantizer) + return NVFP4QTensor.get_weights_scaling_factor_2_from_quantizer(weight_quantizer) # SequentialQuantizer is required if not isinstance(weight_quantizer, SequentialQuantizer) or not weight_quantizer[-1].is_enabled: @@ -746,7 +733,6 @@ def to_quantized_weight( quantization: str, weights_scaling_factor2: torch.Tensor | None = None, block_size: int | None = None, - dtype: torch.dtype | None = None, ): """Converts the weight to the quantized (packed) format.""" if weights_scaling_factor is not None: @@ -759,9 +745,6 @@ def to_quantized_weight( if isinstance(weight, QTensorWrapper): return weight.data - if dtype: - weight = weight.to(dtype) - if quantization == QUANTIZATION_FP8: # Fix RuntimeError: Promotion for Float8 Types is not supported, attempted to promote Float8_e4m3fn and Float # in speculative decoding fp8 model export diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index 12f13410b..30ad9ddbb 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -311,12 +311,11 @@ def _export_quantized_weight( )[0] quantized_weight = to_quantized_weight( - weight, + weight.to(dtype), weight_scale, quantization_format, weight_scale_2, block_size, - 
dtype, ) quantized_weight, weight_scale = maybe_transpose_expert_weight_dimensions( @@ -324,12 +323,11 @@ def _export_quantized_weight( ) else: quantized_weight = to_quantized_weight( - weight, + weight.to(dtype), weight_scale, quantization_format, weight_scale_2, block_size, - dtype, ) setattr(sub_module, weight_name, nn.Parameter(quantized_weight, requires_grad=False)) @@ -340,7 +338,7 @@ def _export_quantized_weight( def _export_hf_checkpoint( - model: nn.Module, dtype: torch.dtype | None = None, is_modelopt_qlora: bool = False + model: nn.Module, dtype: torch.dtype | None = None, is_lora: bool = False ) -> tuple[dict[str, Any], dict[str, Any]]: """Exports the torch model to the packed checkpoint with original HF naming. @@ -433,7 +431,7 @@ def _export_hf_checkpoint( # Resmooth and requantize fused layers # TODO: Handle mixed precision # TODO: Support requantize and resmooth for modelopt-trained LoRA models - if not is_modelopt_qlora: + if not is_lora: requantize_resmooth_fused_llm_layers(model) # Remove all hooks from the model @@ -493,7 +491,7 @@ def _export_hf_checkpoint( quantized_state_dict = model.state_dict() quantized_state_dict = postprocess_state_dict( - quantized_state_dict, kv_cache_max_bound, kv_cache_format, is_modelopt_qlora + quantized_state_dict, kv_cache_max_bound, kv_cache_format, is_lora ) # Check if any layers are quantized @@ -508,7 +506,6 @@ def export_hf_checkpoint( dtype: torch.dtype | None = None, export_dir: Path | str = tempfile.gettempdir(), save_modelopt_state: bool = False, - is_modelopt_qlora: bool = False, ): """Exports the torch model to unified checkpoint and saves to export_dir. @@ -518,41 +515,30 @@ def export_hf_checkpoint( export_dir: the target export path. save_modelopt_state: whether to save the modelopt state_dict. """ - # Setup directories export_dir = Path(export_dir) - base_export_dir = export_dir / "base_model" if is_modelopt_qlora else export_dir - - for dir_path in [export_dir, base_export_dir]: - dir_path.mkdir(parents=True, exist_ok=True) - + export_dir.mkdir(parents=True, exist_ok=True) try: - post_state_dict, hf_quant_config = _export_hf_checkpoint(model, dtype, is_modelopt_qlora) + post_state_dict, hf_quant_config = _export_hf_checkpoint(model, dtype) # NOTE: (hg) Should we save hf_quant_config when there's no quantization applied? 
# Save hf_quant_config.json for backward compatibility - with open(f"{base_export_dir}/hf_quant_config.json", "w") as file: + with open(f"{export_dir}/hf_quant_config.json", "w") as file: json.dump(hf_quant_config, file, indent=4) hf_quant_config = convert_hf_quant_config_format(hf_quant_config) post_state_dict = rename_and_prune_if_spec_decoding(model, post_state_dict) - if is_modelopt_qlora: - # In the case of LoRA model, we save the base model and adapters - model.base_model.save_pretrained( - base_export_dir, state_dict=post_state_dict, save_modelopt_state=save_modelopt_state - ) - model.save_pretrained(export_dir) - else: - model.save_pretrained( - export_dir, state_dict=post_state_dict, save_modelopt_state=save_modelopt_state - ) + # Save model + model.save_pretrained( + export_dir, state_dict=post_state_dict, save_modelopt_state=save_modelopt_state + ) - original_config = f"{base_export_dir}/config.json" + original_config = f"{export_dir}/config.json" config_data = {} - # In the case of LoRA model.save_pretrained does not save the correct config.json - config_data = model.config.to_dict() + with open(original_config) as file: + config_data = json.load(file) config_data["quantization_config"] = hf_quant_config diff --git a/modelopt/torch/quantization/plugins/transformers_trainer.py b/modelopt/torch/quantization/plugins/transformers_trainer.py index 85323910e..8144e9f8a 100644 --- a/modelopt/torch/quantization/plugins/transformers_trainer.py +++ b/modelopt/torch/quantization/plugins/transformers_trainer.py @@ -21,7 +21,6 @@ from dataclasses import dataclass, field import torch -from safetensors.torch import save_file from tqdm import tqdm import modelopt.torch.opt as mto @@ -182,18 +181,6 @@ def _save_modelopt_state_with_weights(self): print_rank_0(f"Saved modelopt state to {self._modelopt_state_path}") - # Save base model compressed weights for QLoRA - if getattr(self.quant_args, "compress", False): - # Save base model config.json - self.model.config.save_pretrained(self.args.output_dir) - - # Save base model compressed weights excluding lora weights - state_dict = self.model.state_dict() - for k in [key for key in state_dict if "lora" in key]: - del state_dict[k] - - save_file(state_dict, f"{self.args.output_dir}/model.safetensors") - def _restore_modelopt_state_with_weights(self): modelopt_state = torch.load(self._modelopt_state_path, weights_only=False) modelopt_weights = modelopt_state.pop("modelopt_state_weights", None) @@ -222,6 +209,15 @@ def forward_loop(model): print_rank_0("Quantizing the model...") mtq.quantize(self.model, self.quant_cfg, forward_loop) # type: ignore [arg-type] + # Save modelopt state before compression + modelopt_state = mto.modelopt_state(self.model) + modelopt_state["modelopt_state_weights"] = get_quantizer_state_dict(self.model) + torch.save(modelopt_state, f"{self.args.output_dir}/modelopt_state_calibration.pth") + + print_rank_0( + f"Saved modelopt state before compression to {f'{self.args.output_dir}/modelopt_state_calibration.pth'}" + ) + if getattr(self.quant_args, "compress", False): print_rank_0("Compressing model after calibration") mtq.compress(self.model) From 6045e4e04da6447e640af34cc873afa814ab21d3 Mon Sep 17 00:00:00 2001 From: Suguna Velury <178320438+sugunav14@users.noreply.github.com> Date: Tue, 30 Sep 2025 02:49:07 +0000 Subject: [PATCH 14/26] minor update Signed-off-by: Suguna Velury <178320438+sugunav14@users.noreply.github.com> --- examples/llm_ptq/hf_ptq.py | 8 +--- examples/llm_qat/README.md | 6 +-- examples/llm_qat/export.py | 2 
+- modelopt/torch/export/unified_export_hf.py | 6 +-- .../plugins/transformers_trainer.py | 6 +-- .../quantization/qtensor/nvfp4_tensor.py | 38 +++++++------------ 6 files changed, 23 insertions(+), 43 deletions(-) diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index c08c7957b..da6761252 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -482,7 +482,7 @@ def main(args): quant_cfg["quant_cfg"]["*image*"] = {"enable": False} quant_cfg["quant_cfg"]["*vision*"] = {"enable": False} - if not model_is_already_quantized and calibration_only: + if not model_is_already_quantized or calibration_only: # Only run single sample for preview input_ids = next(iter(calib_dataloader))[ "input_features" if model_type == "whisper" else "input_ids" @@ -772,12 +772,6 @@ def output_decode(generated_ids, input_shape): default=None, type=str, ) - parser.add_argument( - "--qlora", - help="Specify the model to be exported is a QLoRA model trained using modelopt.", - default=False, - action="store_true", - ) args = parser.parse_args() diff --git a/examples/llm_qat/README.md b/examples/llm_qat/README.md index 20422a604..54dffbd5b 100644 --- a/examples/llm_qat/README.md +++ b/examples/llm_qat/README.md @@ -348,13 +348,9 @@ To perform QLoRA training, run: After performing QLoRA training the final checkpoint can be exported for deployment with vLLM using the following command. ```sh -cd ../llm_ptq - -python hf_ptq.py \ +python export.py \ --pyt_ckpt_path llama3-fp4-qlora \ - --qformat nvfp4 \ --export_dir llama3-fp4-qlora-hf \ - --qlora ``` diff --git a/examples/llm_qat/export.py b/examples/llm_qat/export.py index 2d032627b..566590dd6 100644 --- a/examples/llm_qat/export.py +++ b/examples/llm_qat/export.py @@ -68,7 +68,7 @@ def main(args): base_model_dir.mkdir(parents=True, exist_ok=True) try: - post_state_dict, hf_quant_config = _export_hf_checkpoint(model, is_lora=True) + post_state_dict, hf_quant_config = _export_hf_checkpoint(model, is_modelopt_qlora=True) with open(f"{export_dir}/base_model/hf_quant_config.json", "w") as file: json.dump(hf_quant_config, file, indent=4) diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index 30ad9ddbb..4ad25e96c 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -338,7 +338,7 @@ def _export_quantized_weight( def _export_hf_checkpoint( - model: nn.Module, dtype: torch.dtype | None = None, is_lora: bool = False + model: nn.Module, dtype: torch.dtype | None = None, is_modelopt_qlora: bool = False ) -> tuple[dict[str, Any], dict[str, Any]]: """Exports the torch model to the packed checkpoint with original HF naming. 
@@ -431,7 +431,7 @@ def _export_hf_checkpoint( # Resmooth and requantize fused layers # TODO: Handle mixed precision # TODO: Support requantize and resmooth for modelopt-trained LoRA models - if not is_lora: + if not is_modelopt_qlora: requantize_resmooth_fused_llm_layers(model) # Remove all hooks from the model @@ -491,7 +491,7 @@ def _export_hf_checkpoint( quantized_state_dict = model.state_dict() quantized_state_dict = postprocess_state_dict( - quantized_state_dict, kv_cache_max_bound, kv_cache_format, is_lora + quantized_state_dict, kv_cache_max_bound, kv_cache_format, is_modelopt_qlora ) # Check if any layers are quantized diff --git a/modelopt/torch/quantization/plugins/transformers_trainer.py b/modelopt/torch/quantization/plugins/transformers_trainer.py index 8144e9f8a..78a56e4e4 100644 --- a/modelopt/torch/quantization/plugins/transformers_trainer.py +++ b/modelopt/torch/quantization/plugins/transformers_trainer.py @@ -209,13 +209,13 @@ def forward_loop(model): print_rank_0("Quantizing the model...") mtq.quantize(self.model, self.quant_cfg, forward_loop) # type: ignore [arg-type] - # Save modelopt state before compression + # Save modelopt state before compression. This is used to later export the model for deployment. modelopt_state = mto.modelopt_state(self.model) modelopt_state["modelopt_state_weights"] = get_quantizer_state_dict(self.model) - torch.save(modelopt_state, f"{self.args.output_dir}/modelopt_state_calibration.pth") + torch.save(modelopt_state, f"{self.args.output_dir}/modelopt_state_calib.pth") print_rank_0( - f"Saved modelopt state before compression to {f'{self.args.output_dir}/modelopt_state_calibration.pth'}" + f"Saved modelopt state before compression to {f'{self.args.output_dir}/modelopt_state_calib.pth'}" ) if getattr(self.quant_args, "compress", False): diff --git a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py index f134b6bd8..65861695f 100644 --- a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py +++ b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py @@ -94,27 +94,6 @@ def get_weights_scaling_factor_2(cls, input: torch.Tensor): """Returns per tensor weight scaling factor.""" return reduce_amax(input).float() / (6.0 * 448.0) - @classmethod - def get_modelopt_weights_scaling_factor(cls, weight_scaling_factor: torch.Tensor, weight_shape): - """Returns the modelopt weights scaling factor if the quantization is done by trtllm.""" - if weight_scaling_factor.dtype == torch.float8_e4m3fn: - return weight_scaling_factor - - if weight_scaling_factor.dtype == torch.uint8 and weight_scaling_factor.ndim == 1: - # If quantization is done by trtllm, convert cutlass fp4 scale to modelopt fp4 scale - try: - from tensorrt_llm._torch.auto_deploy.utils.quantization_utils import ( - cutlass_fp4_scale_to_modelopt_fp4_scale, - ) - - return cutlass_fp4_scale_to_modelopt_fp4_scale( - weight_scaling_factor, weight_shape[-2:] - ) - except ImportError as e: - raise ImportError( - "This tensor is quantized by trtllm, but tensorrt_llm cannot be imported." 
- ) from e - @classmethod def get_activation_scaling_factor(cls, quantizer): """Returns the activation scaling factor for export.""" @@ -270,9 +249,20 @@ def _unpack_tensor(input: torch.Tensor): return unpacked.reshape(unpacked_shape) # Get scales from kwargs - kwarg["scale"] = self.get_modelopt_weights_scaling_factor( - kwarg["scale"], self.metadata["shape"] - ) + if kwarg["scale"].dtype == torch.uint8 and kwarg["scale"].ndim == 1: + # If quantization is done by trtllm, convert cutlass fp4 scale to modelopt fp4 scale + try: + from tensorrt_llm._torch.auto_deploy.utils.quantization_utils import ( + cutlass_fp4_scale_to_modelopt_fp4_scale, + ) + + kwarg["scale"] = cutlass_fp4_scale_to_modelopt_fp4_scale( + kwarg["scale"], self.metadata["shape"][-2:] + ) + except ImportError as e: + raise ImportError( + "This tensor is quantized by trtllm, but tensorrt_llm cannot be imported." + ) from e if fast: from ..triton.fp4_kernel import fp4_dequantize From 2dbfe9fd4b341fdfcbf35275732c38507612593c Mon Sep 17 00:00:00 2001 From: Suguna Velury <178320438+sugunav14@users.noreply.github.com> Date: Tue, 30 Sep 2025 02:54:05 +0000 Subject: [PATCH 15/26] added requantize/resmooth for qlora export Signed-off-by: Suguna Velury <178320438+sugunav14@users.noreply.github.com> --- modelopt/torch/export/unified_export_hf.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index 4ad25e96c..b765aecc1 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -430,9 +430,7 @@ def _export_hf_checkpoint( # Resmooth and requantize fused layers # TODO: Handle mixed precision - # TODO: Support requantize and resmooth for modelopt-trained LoRA models - if not is_modelopt_qlora: - requantize_resmooth_fused_llm_layers(model) + requantize_resmooth_fused_llm_layers(model) # Remove all hooks from the model try: From 9f8257ab6321b6e83439a4029340769da37e5c71 Mon Sep 17 00:00:00 2001 From: Suguna Velury <178320438+sugunav14@users.noreply.github.com> Date: Tue, 30 Sep 2025 16:09:52 +0000 Subject: [PATCH 16/26] removed stray print Signed-off-by: Suguna Velury <178320438+sugunav14@users.noreply.github.com> --- examples/llm_qat/export.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/llm_qat/export.py b/examples/llm_qat/export.py index 566590dd6..c327e0488 100644 --- a/examples/llm_qat/export.py +++ b/examples/llm_qat/export.py @@ -84,7 +84,6 @@ def main(args): # In the case of LoRA model.save_pretrained does not save the correct config.json config_data = model.config.to_dict() - print(config_data) config_data["quantization_config"] = hf_quant_config From 5e17c9d8b90b8ea5d718532a16c6aab5ee939d0a Mon Sep 17 00:00:00 2001 From: Suguna Velury <178320438+sugunav14@users.noreply.github.com> Date: Tue, 30 Sep 2025 16:12:52 +0000 Subject: [PATCH 17/26] update readme and documentation Signed-off-by: Suguna Velury <178320438+sugunav14@users.noreply.github.com> --- examples/llm_qat/README.md | 2 +- modelopt/torch/export/quant_utils.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/llm_qat/README.md b/examples/llm_qat/README.md index 54dffbd5b..674e20c19 100644 --- a/examples/llm_qat/README.md +++ b/examples/llm_qat/README.md @@ -350,7 +350,7 @@ After performing QLoRA training the final checkpoint can be exported for deploym ```sh python export.py \ --pyt_ckpt_path llama3-fp4-qlora \ - --export_dir llama3-fp4-qlora-hf \ + --export_path 
llama3-fp4-qlora-hf \ ``` diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py index 5253cd2d6..810a728a8 100755 --- a/modelopt/torch/export/quant_utils.py +++ b/modelopt/torch/export/quant_utils.py @@ -827,6 +827,7 @@ def postprocess_state_dict( state_dict: The full model state_dict. maxbound: The maximum bound value for the output quantizer. quantization: The KV cache quantization format. + is_modelopt_qlora: Whether the model is a modelopt-trained QLoRA model. Returns: The filtered state_dict without unnecessary keys like '_amax' and non KV cache output quantizers. From c6e499e5b4caa7ddb457d3e18a5d59c011a34eb5 Mon Sep 17 00:00:00 2001 From: Suguna Velury <178320438+sugunav14@users.noreply.github.com> Date: Tue, 30 Sep 2025 16:16:13 +0000 Subject: [PATCH 18/26] minor fix Signed-off-by: Suguna Velury <178320438+sugunav14@users.noreply.github.com> --- examples/llm_qat/export.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/llm_qat/export.py b/examples/llm_qat/export.py index c327e0488..c16fe14fe 100644 --- a/examples/llm_qat/export.py +++ b/examples/llm_qat/export.py @@ -44,7 +44,7 @@ def get_lora_model( model = AutoModelForCausalLM.from_pretrained(ckpt_path, device_map=device_map) # Restore modelopt state - modelopt_state = torch.load(f"{ckpt_path}/modelopt_state_calibration.pth", weights_only=False) + modelopt_state = torch.load(f"{ckpt_path}/modelopt_state_calib.pth", weights_only=False) restore_from_modelopt_state(model, modelopt_state) # Restore modelopt quantizer state dict From 7b8f1da823926a5114279c7c23c8a9589f0af21a Mon Sep 17 00:00:00 2001 From: Suguna Velury <178320438+sugunav14@users.noreply.github.com> Date: Tue, 30 Sep 2025 16:19:37 +0000 Subject: [PATCH 19/26] added logging statements Signed-off-by: Suguna Velury <178320438+sugunav14@users.noreply.github.com> --- examples/llm_qat/export.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/llm_qat/export.py b/examples/llm_qat/export.py index c16fe14fe..78d71ada6 100644 --- a/examples/llm_qat/export.py +++ b/examples/llm_qat/export.py @@ -25,6 +25,7 @@ from modelopt.torch.export.unified_export_hf import _export_hf_checkpoint from modelopt.torch.opt.conversion import restore_from_modelopt_state from modelopt.torch.quantization.utils import set_quantizer_state_dict +from modelopt.torch.utils import print_rank_0 RAND_SEED = 1234 @@ -46,12 +47,13 @@ def get_lora_model( # Restore modelopt state modelopt_state = torch.load(f"{ckpt_path}/modelopt_state_calib.pth", weights_only=False) restore_from_modelopt_state(model, modelopt_state) + print_rank_0("Restored modelopt state") # Restore modelopt quantizer state dict modelopt_weights = modelopt_state.pop("modelopt_state_weights", None) if modelopt_weights is not None: - print("Restoring modelopt weights") set_quantizer_state_dict(model, modelopt_weights) + print_rank_0("Restored modelopt quantizer state dict") return model From fdec2f0b92fce13bdde74147427bd6431928e6d4 Mon Sep 17 00:00:00 2001 From: Suguna Velury <178320438+sugunav14@users.noreply.github.com> Date: Tue, 30 Sep 2025 16:21:31 +0000 Subject: [PATCH 20/26] minor Signed-off-by: Suguna Velury <178320438+sugunav14@users.noreply.github.com> --- examples/llm_qat/export.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/llm_qat/export.py b/examples/llm_qat/export.py index 78d71ada6..d3a176e2d 100644 --- a/examples/llm_qat/export.py +++ b/examples/llm_qat/export.py @@ -84,7 +84,6 @@ def main(args): config_path = 
f"{export_dir}/base_model/config.json" - # In the case of LoRA model.save_pretrained does not save the correct config.json config_data = model.config.to_dict() config_data["quantization_config"] = hf_quant_config From 9f986a7e5f4c0ebd1be5466ebe021f9c72372d81 Mon Sep 17 00:00:00 2001 From: Suguna Velury <178320438+sugunav14@users.noreply.github.com> Date: Tue, 30 Sep 2025 16:41:44 +0000 Subject: [PATCH 21/26] added TODO Signed-off-by: Suguna Velury <178320438+sugunav14@users.noreply.github.com> --- examples/llm_qat/export.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/examples/llm_qat/export.py b/examples/llm_qat/export.py index d3a176e2d..00c6567e8 100644 --- a/examples/llm_qat/export.py +++ b/examples/llm_qat/export.py @@ -37,6 +37,7 @@ def get_lora_model( """ Loads a QLoRA model that has been trained using modelopt trainer. """ + # TODO: Add support for merging adapters in BF16 and merging adapters with quantization for deployment device_map = "auto" if device == "cpu": device_map = "cpu" @@ -72,17 +73,17 @@ def main(args): try: post_state_dict, hf_quant_config = _export_hf_checkpoint(model, is_modelopt_qlora=True) - with open(f"{export_dir}/base_model/hf_quant_config.json", "w") as file: + with open(f"{base_model_dir}/hf_quant_config.json", "w") as file: json.dump(hf_quant_config, file, indent=4) hf_quant_config = convert_hf_quant_config_format(hf_quant_config) # Save base model - model.base_model.save_pretrained(f"{export_dir}/base_model", state_dict=post_state_dict) + model.base_model.save_pretrained(f"{base_model_dir}", state_dict=post_state_dict) # Save adapters model.save_pretrained(export_dir) - config_path = f"{export_dir}/base_model/config.json" + config_path = f"{base_model_dir}/config.json" config_data = model.config.to_dict() @@ -112,7 +113,11 @@ def main(args): parser.add_argument("--device", default="cuda") - parser.add_argument("--export_path", default="exported_model") + parser.add_argument( + "--export_path", + default="exported_model", + help="Path to save the exported model", + ) args = parser.parse_args() From afbadaecd18fef6ca5f7c0bb0325852b54602d49 Mon Sep 17 00:00:00 2001 From: Suguna Velury <178320438+sugunav14@users.noreply.github.com> Date: Mon, 6 Oct 2025 05:47:22 +0000 Subject: [PATCH 22/26] Refactor to include QAT/QAD export too Signed-off-by: Suguna Velury <178320438+sugunav14@users.noreply.github.com> --- examples/llm_qat/export.py | 45 ++++++++++++------- .../plugins/transformers_trainer.py | 11 +---- 2 files changed, 30 insertions(+), 26 deletions(-) diff --git a/examples/llm_qat/export.py b/examples/llm_qat/export.py index 00c6567e8..43ec5b407 100644 --- a/examples/llm_qat/export.py +++ b/examples/llm_qat/export.py @@ -21,6 +21,7 @@ import torch from transformers import AutoModelForCausalLM, AutoTokenizer +import modelopt.torch.opt as mto from modelopt.torch.export.convert_hf_config import convert_hf_quant_config_format from modelopt.torch.export.unified_export_hf import _export_hf_checkpoint from modelopt.torch.opt.conversion import restore_from_modelopt_state @@ -29,6 +30,9 @@ RAND_SEED = 1234 +# Enable automatic save/load of modelopt state huggingface checkpointing +mto.enable_huggingface_checkpointing() + def get_lora_model( ckpt_path: str, @@ -42,19 +46,20 @@ def get_lora_model( if device == "cpu": device_map = "cpu" - # Load model with adapters + # Load model model = AutoModelForCausalLM.from_pretrained(ckpt_path, device_map=device_map) - # Restore modelopt state - modelopt_state = 
torch.load(f"{ckpt_path}/modelopt_state_calib.pth", weights_only=False) - restore_from_modelopt_state(model, modelopt_state) - print_rank_0("Restored modelopt state") + # Restore modelopt state for LoRA models. For QAT/QAD models from_pretrained call handles this + if hasattr(model, "peft_config"): + modelopt_state = torch.load(f"{ckpt_path}/modelopt_state_train.pth", weights_only=False) + restore_from_modelopt_state(model, modelopt_state) + print_rank_0("Restored modelopt state") - # Restore modelopt quantizer state dict - modelopt_weights = modelopt_state.pop("modelopt_state_weights", None) - if modelopt_weights is not None: - set_quantizer_state_dict(model, modelopt_weights) - print_rank_0("Restored modelopt quantizer state dict") + # Restore modelopt quantizer state dict + modelopt_weights = modelopt_state.pop("modelopt_state_weights", None) + if modelopt_weights is not None: + set_quantizer_state_dict(model, modelopt_weights) + print_rank_0("Restored modelopt quantizer state dict") return model @@ -63,25 +68,31 @@ def main(args): # Load model model = get_lora_model(args.pyt_ckpt_path, args.device) tokenizer = AutoTokenizer.from_pretrained(args.pyt_ckpt_path) + is_qlora = hasattr(model, "peft_config") # Export HF checkpoint export_dir = Path(args.export_path) export_dir.mkdir(parents=True, exist_ok=True) - base_model_dir = export_dir / "base_model" - base_model_dir.mkdir(parents=True, exist_ok=True) + if is_qlora: + base_model_dir = export_dir / "base_model" + base_model_dir.mkdir(parents=True, exist_ok=True) + else: + base_model_dir = export_dir try: - post_state_dict, hf_quant_config = _export_hf_checkpoint(model, is_modelopt_qlora=True) + post_state_dict, hf_quant_config = _export_hf_checkpoint(model, is_modelopt_qlora=is_qlora) with open(f"{base_model_dir}/hf_quant_config.json", "w") as file: json.dump(hf_quant_config, file, indent=4) hf_quant_config = convert_hf_quant_config_format(hf_quant_config) - # Save base model - model.base_model.save_pretrained(f"{base_model_dir}", state_dict=post_state_dict) - # Save adapters - model.save_pretrained(export_dir) + # Save model + if is_qlora: + model.base_model.save_pretrained(f"{base_model_dir}", state_dict=post_state_dict) + model.save_pretrained(export_dir) + else: + model.save_pretrained(export_dir, state_dict=post_state_dict) config_path = f"{base_model_dir}/config.json" diff --git a/modelopt/torch/quantization/plugins/transformers_trainer.py b/modelopt/torch/quantization/plugins/transformers_trainer.py index 78a56e4e4..6877fe35e 100644 --- a/modelopt/torch/quantization/plugins/transformers_trainer.py +++ b/modelopt/torch/quantization/plugins/transformers_trainer.py @@ -209,14 +209,8 @@ def forward_loop(model): print_rank_0("Quantizing the model...") mtq.quantize(self.model, self.quant_cfg, forward_loop) # type: ignore [arg-type] - # Save modelopt state before compression. This is used to later export the model for deployment. 
- modelopt_state = mto.modelopt_state(self.model) - modelopt_state["modelopt_state_weights"] = get_quantizer_state_dict(self.model) - torch.save(modelopt_state, f"{self.args.output_dir}/modelopt_state_calib.pth") - - print_rank_0( - f"Saved modelopt state before compression to {f'{self.args.output_dir}/modelopt_state_calib.pth'}" - ) + # Save modelopt state + self._save_modelopt_state_with_weights() if getattr(self.quant_args, "compress", False): print_rank_0("Compressing model after calibration") @@ -225,7 +219,6 @@ def forward_loop(model): # Force garbage collection to free up memory gc.collect() - self._save_modelopt_state_with_weights() torch.cuda.empty_cache() if self.accelerator.is_main_process: From 231c147e65935a93c4c7de81042057a05cbdc48a Mon Sep 17 00:00:00 2001 From: Suguna Velury <178320438+sugunav14@users.noreply.github.com> Date: Mon, 6 Oct 2025 05:55:37 +0000 Subject: [PATCH 23/26] Updated README Signed-off-by: Suguna Velury <178320438+sugunav14@users.noreply.github.com> --- examples/llm_qat/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/llm_qat/README.md b/examples/llm_qat/README.md index 674e20c19..8504bb851 100644 --- a/examples/llm_qat/README.md +++ b/examples/llm_qat/README.md @@ -303,12 +303,10 @@ See more details on running LLM evaluation benchmarks [here](../llm_eval/README. The final model after QAT is similar in architecture to that of PTQ model. QAT model simply have updated weights as compared to the PTQ model. It can be deployed to TensorRT-LLM (TRTLLM) or to TensorRT just like a regular **ModelOpt** PTQ model if the quantization format is supported for deployment. -To run QAT model with TRTLLM, run: +To run QAT model with vLLM/TRTLLM, run: ```sh -cd ../llm_ptq - -./scripts/huggingface_example.sh --model ../llm_qat/llama3-qat --quant w4a8_awq +python export.py --pyt_ckpt_path llama3-qat --export_path llama3-qat-deploy ``` Note: The QAT checkpoint for `w4a8_awq` config can be created by using `--quant_cfg W4A8_AWQ_BETA_CFG` in [QAT example](#end-to-end-qat-example). @@ -345,6 +343,8 @@ To perform QLoRA training, run: --lora True ``` +## QLoRA deployment + After performing QLoRA training the final checkpoint can be exported for deployment with vLLM using the following command. 
```sh From 1853f43a067b2fe5241a8fe8cea65cb73765188b Mon Sep 17 00:00:00 2001 From: Suguna Velury <178320438+sugunav14@users.noreply.github.com> Date: Tue, 7 Oct 2025 19:34:10 +0000 Subject: [PATCH 24/26] updated condition in get_quant_config Signed-off-by: Suguna Velury <178320438+sugunav14@users.noreply.github.com> --- modelopt/torch/export/quant_utils.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py index 810a728a8..2458c7d1a 100755 --- a/modelopt/torch/export/quant_utils.py +++ b/modelopt/torch/export/quant_utils.py @@ -1079,28 +1079,30 @@ def get_quant_config(named_modules: nn.Module | dict[str, nn.Module]) -> dict[st # Try to get block size from each weight attribute (e.g., gate_up_proj, down_proj) block_size = 0 weight_names = list(weight_attr_names(module)) + weight_quantizer_enabled = False for weight_name in weight_names: weight_block_size = get_weight_block_size(module, weight_name) if weight_block_size > 0: block_size = weight_block_size + weight_quantizer_enabled = True break # Fallback to default weight quantizer if no specific weight quantizer found if block_size == 0: block_size = get_weight_block_size(module) + weight_quantizer = getattr( + module, quantizer_attr_names("weight").weight_quantizer, None + ) + # Check if weight_quantizer is enabled + weight_quantizer_enabled = block_size > 0 or ( + weight_quantizer is not None and weight_quantizer.is_enabled + ) - # In the case of NVFP4, block_size 0 indicates weight_quantizer is not enabled - if block_size == 0 and quantization_format in [ - QUANTIZATION_NVFP4, - QUANTIZATION_NVFP4_AWQ, - QUANTIZATION_W4A8_NVFP4_FP8, - ]: - continue - - # Construct per layer config dictionary - layer_config_dict[name + ".quantization"] = quantization_format - layer_config_dict[name + ".awq_block_size"] = block_size + if weight_quantizer_enabled: + # Construct per layer config dictionary + layer_config_dict[name + ".quantization"] = quantization_format + layer_config_dict[name + ".awq_block_size"] = block_size # Find kv cache quant format if ( From 9272369f678fdc2c20dfdad5050e03de50685e54 Mon Sep 17 00:00:00 2001 From: Suguna Velury <178320438+sugunav14@users.noreply.github.com> Date: Tue, 7 Oct 2025 19:39:55 +0000 Subject: [PATCH 25/26] added check for frozen base model Signed-off-by: Suguna Velury <178320438+sugunav14@users.noreply.github.com> --- modelopt/torch/quantization/plugins/transformers_trainer.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/modelopt/torch/quantization/plugins/transformers_trainer.py b/modelopt/torch/quantization/plugins/transformers_trainer.py index 6877fe35e..7b97d6009 100644 --- a/modelopt/torch/quantization/plugins/transformers_trainer.py +++ b/modelopt/torch/quantization/plugins/transformers_trainer.py @@ -283,6 +283,10 @@ def _load_best_model(self, *args, **kwargs): if is_lora and not self.is_fsdp_enabled: # Custom logic for loading best model with LoRA # TODO: Remove once we migrate to using get_peft_model() + # This custom logic only loads best adapters. 
Ensure base model is frozen + assert all( + param.requires_grad is False for param in self.model.base_model.parameters() + ), "Base model must be frozen for lora" adapter_name = self.model.active_adapter() self.model.delete_adapter(adapter_name) self.model.load_adapter(self.state.best_model_checkpoint, adapter_name) From d59567a52b91eb7c40f516f909ce0bf65939b811 Mon Sep 17 00:00:00 2001 From: Suguna Velury <178320438+sugunav14@users.noreply.github.com> Date: Tue, 7 Oct 2025 20:03:33 +0000 Subject: [PATCH 26/26] updated check Signed-off-by: Suguna Velury <178320438+sugunav14@users.noreply.github.com> --- .../torch/quantization/plugins/transformers_trainer.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/modelopt/torch/quantization/plugins/transformers_trainer.py b/modelopt/torch/quantization/plugins/transformers_trainer.py index 7b97d6009..7be3f2306 100644 --- a/modelopt/torch/quantization/plugins/transformers_trainer.py +++ b/modelopt/torch/quantization/plugins/transformers_trainer.py @@ -285,8 +285,11 @@ def _load_best_model(self, *args, **kwargs): # TODO: Remove once we migrate to using get_peft_model() # This custom logic only loads best adapters. Ensure base model is frozen assert all( - param.requires_grad is False for param in self.model.base_model.parameters() - ), "Base model must be frozen for lora" + not param.requires_grad + for name, param in self.model.base_model.named_parameters() + if "base_layer" in name + ), "Some base_layer parameters are not frozen" + adapter_name = self.model.active_adapter() self.model.delete_adapter(adapter_name) self.model.load_adapter(self.state.best_model_checkpoint, adapter_name)
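
---

Note on the resulting layout (not part of the patch series): after these changes, a QLoRA export produced by `examples/llm_qat/export.py` is expected to place the packed base model, its `config.json` (carrying a `quantization_config` entry) and `hf_quant_config.json` under `<export_path>/base_model`, while the LoRA adapters and tokenizer files are saved at the export root. The stdlib-only sketch below is a quick sanity check of that layout before handing the directory to a deployment runtime such as vLLM or TensorRT-LLM. The adapter and tokenizer file names (`adapter_config.json`, `tokenizer_config.json`) are assumptions based on the usual PEFT/transformers defaults, and the script name used in the usage line is hypothetical; neither is pinned down by the patches themselves.

```python
import json
import sys
from pathlib import Path


def check_qlora_export(export_path: str) -> bool:
    """Sanity-check the directory produced by examples/llm_qat/export.py for a QLoRA run."""
    root = Path(export_path)
    base = root / "base_model"
    ok = True

    # Base model directory: packed weights plus quantization metadata.
    for required in (base / "config.json", base / "hf_quant_config.json"):
        if not required.exists():
            print(f"missing: {required}")
            ok = False

    # config.json should carry the converted quantization_config written by export.py.
    config_path = base / "config.json"
    if config_path.exists():
        config = json.loads(config_path.read_text())
        if "quantization_config" not in config:
            print("base_model/config.json has no quantization_config entry")
            ok = False

    # Adapters and tokenizer are saved at the export root (assumed PEFT/transformers default names).
    for assumed in (root / "adapter_config.json", root / "tokenizer_config.json"):
        if not assumed.exists():
            print(f"missing (expected by convention): {assumed}")
            ok = False

    return ok


if __name__ == "__main__":
    # Usage (hypothetical script name): python check_export.py llama3-fp4-qlora-hf
    sys.exit(0 if check_qlora_export(sys.argv[1]) else 1)
```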