Commit d23963b

cleanup

Signed-off-by: Suguna Velury <[email protected]>

1 parent d00a7f6 commit d23963b

5 files changed: +47 -36 lines

examples/llm_ptq/example_utils.py (2 additions, 2 deletions)

@@ -152,15 +152,15 @@ def get_model(
     trust_remote_code=False,
     use_seq_device_map=False,
     attn_implementation=None,
-    is_lora=False,
+    is_modelopt_qlora=False,
 ):
     print(f"Initializing model from {ckpt_path}")

     device_map = "auto"
     if device == "cpu":
         device_map = "cpu"

-    if is_lora:
+    if is_modelopt_qlora:
         model = get_lora_model(ckpt_path, device_map)
         return model
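
For context, the rename only changes how callers opt into the QLoRA loading path. A minimal sketch of a call site follows; the import path and the illustrative checkpoint path are assumptions, while `ckpt_path`, `device`, and `is_modelopt_qlora` are the parameter names visible in the hunk above:

```python
# Hypothetical call site for the renamed flag: route a ModelOpt QLoRA
# checkpoint through get_lora_model() instead of the standard HF loading path.
from example_utils import get_model  # assumed local import within examples/llm_ptq

model = get_model(
    ckpt_path="llama3-fp4-qlora",  # illustrative QLoRA checkpoint directory
    device="cuda",                 # anything other than "cpu" keeps device_map="auto"
    is_modelopt_qlora=True,        # previously `is_lora=True`
)
```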

examples/llm_ptq/hf_ptq.py (4 additions, 4 deletions)

@@ -238,7 +238,7 @@ def main(args):
             trust_remote_code=args.trust_remote_code,
             use_seq_device_map=args.use_seq_device_map,
             attn_implementation=args.attn_implementation,
-            is_lora=args.lora,
+            is_modelopt_qlora=args.qlora,
         )
     else:
         assert args.qformat in QUANT_CFG_CHOICES, (

@@ -626,7 +626,7 @@ def output_decode(generated_ids, input_shape):
         export_hf_checkpoint(
             full_model,
             export_dir=export_path,
-            is_modelopt_trained_lora=args.lora,
+            is_modelopt_qlora=args.qlora,
         )

         # Restore default padding and export the tokenizer as well.

@@ -765,8 +765,8 @@ def output_decode(generated_ids, input_shape):
         type=str,
     )
     parser.add_argument(
-        "--lora",
-        help="Specify the model to be exported is a LoRA model trained using modelopt.",
+        "--qlora",
+        help="Specify the model to be exported is a QLoRA model trained using modelopt.",
         default=False,
         action="store_true",
     )

examples/llm_qat/README.md (15 additions, 2 deletions)

@@ -354,10 +354,23 @@ To perform QLoRA training, run:
     --lora True
 ```

-After performing QLoRA training the final checkpoint exported is ready for deployment using vLLM. For more details about QLoRA deployment using vLLM refer to the documentation [here](https://docs.vllm.ai/en/latest/features/lora.html). To deploy with vLLM, run:
+After performing QLoRA training the final checkpoint can be exported for deployment with vLLM using the following command.

 ```sh
-vllm serve llama3-fp4-qlora/base_model --enable-lora --lora-modules adapter=llama3-fp4-qlora --port 8000 --tokenizer llama3-fp4-qlora
+cd ../llm_ptq
+
+python hf_ptq.py \
+    --pyt_ckpt_path llama3-fp4-qlora \
+    --qformat nvfp4 \
+    --export_dir llama3-fp4-qlora-hf \
+    --qlora
+
+```
+
+To deploy with vLLM, run the following command. For more details about QLoRA deployment using vLLM refer to the documentation [here](https://docs.vllm.ai/en/latest/features/lora.html).
+
+```sh
+vllm serve llama3-fp4-qlora-hf/base_model --enable-lora --lora-modules adapter=llama3-fp4-qlora-hf --port 8000 --tokenizer llama3-fp4-qlora-hf
 ```

 ## Pre-Quantized Checkpoints
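
As a usage note beyond this diff, once the server from the `vllm serve` command above is running, the adapter registered under the name `adapter` can be queried through vLLM's OpenAI-compatible completions endpoint. A minimal sketch in Python; the prompt, token limit, and output handling are illustrative only:

```python
# Query the LoRA adapter registered as "adapter" by the `vllm serve` command
# above, via the OpenAI-compatible /v1/completions endpoint on port 8000.
import requests

response = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "model": "adapter",             # name assigned via --lora-modules adapter=...
        "prompt": "Hello, my name is",  # illustrative prompt
        "max_tokens": 32,
    },
    timeout=60,
)
print(response.json()["choices"][0]["text"])
```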

modelopt/torch/export/quant_utils.py (14 additions, 13 deletions)

@@ -830,7 +830,7 @@ def postprocess_state_dict(
     state_dict: dict,
     maxbound: float,
     quantization: str | None,
-    is_modelopt_trained_lora: bool = False,
+    is_modelopt_qlora: bool = False,
 ) -> dict:
     """Filters out keys related to weight quantizers and updates KV cache related keys.

@@ -849,28 +849,24 @@ def postprocess_state_dict(
         "v_bmm_quantizer._bias_value": "v_proj.v_bias",
         "input_quantizer._pre_quant_scale": "pre_quant_scale",
     }
+    skip_keys = ["output_quantizer", "_amax", "_bias_value", "input_quantizer._pre_quant_scale"]

     # For modelopt-trained LoRA models, we need to remove the base_layer prefix from the keys for deployment
-    if is_modelopt_trained_lora:
+    if is_modelopt_qlora:
         replacements.update(
             {
                 "base_layer.weight": "weight",
                 "base_layer.input_scale": "input_scale",
                 "base_layer.weight_scale": "weight_scale",
             }
         )
+        skip_keys.append("base_layer")

     post_state_dict = {}

     for key, value in state_dict.items():
         # Skip keys not related to quantizers
-        if (
-            "output_quantizer" not in key
-            and "_amax" not in key
-            and "_bias_value" not in key
-            and "input_quantizer._pre_quant_scale" not in key
-            and "base_layer" not in key
-        ):
+        if all(skip_key not in key for skip_key in skip_keys):
             post_state_dict[key] = value
             continue

@@ -922,8 +918,8 @@ def postprocess_state_dict(
             keys_to_delete.append(key)

     # remove LoRA adapters from state dict
-    if is_modelopt_trained_lora:
-        for key, value in post_state_dict.items():
+    if is_modelopt_qlora:
+        for key in post_state_dict:
             if "lora" in key and key not in keys_to_delete:
                 keys_to_delete.append(key)
     # Check for tied weights and remove duplicates

@@ -1104,10 +1100,15 @@ def get_quant_config(named_modules: nn.Module | dict[str, nn.Module]) -> dict[st
         if block_size == 0:
             block_size = get_weight_block_size(module)

-        # Construct per layer config dictionary
-        if block_size == 0 and quantization_format != QUANTIZATION_FP8:
+        # In the case of NVFP4, block_size 0 indicates weight_quantizer is not enabled
+        if block_size == 0 and quantization_format in [
+            QUANTIZATION_NVFP4,
+            QUANTIZATION_NVFP4_AWQ,
+            QUANTIZATION_W4A8_NVFP4_FP8,
+        ]:
             continue

+        # Construct per layer config dictionary
         layer_config_dict[name + ".quantization"] = quantization_format
         layer_config_dict[name + ".awq_block_size"] = block_size
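
To make the `postprocess_state_dict` refactor easier to follow: the consolidated `skip_keys` check is intended to be behaviorally equivalent to the old chained conditions, with `base_layer` added only on the QLoRA path. A standalone, runnable sketch with a toy state dict (the key names are illustrative, not taken from a real checkpoint):

```python
# Standalone sketch of the consolidated filter: a key passes through untouched
# only if it contains none of the skip substrings; otherwise it falls through
# to the rename/post-processing logic. With is_modelopt_qlora=True the
# "base_layer" keys are also routed to post-processing.
skip_keys = ["output_quantizer", "_amax", "_bias_value", "input_quantizer._pre_quant_scale"]
is_modelopt_qlora = True
if is_modelopt_qlora:
    skip_keys.append("base_layer")

toy_state_dict = {
    "layers.0.q_proj.weight": "kept as-is",
    "layers.0.q_proj.weight_quantizer._amax": "post-processed",
    "layers.0.q_proj.base_layer.weight": "post-processed (QLoRA rename)",
}

for key in toy_state_dict:
    if all(skip_key not in key for skip_key in skip_keys):
        print(f"pass through : {key}")
    else:
        print(f"post-process : {key}")
```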

modelopt/torch/export/unified_export_hf.py (12 additions, 15 deletions)

@@ -336,7 +336,7 @@ def _export_quantized_weight(


 def _export_hf_checkpoint(
-    model: nn.Module, dtype: torch.dtype | None = None, is_modelopt_trained_lora: bool = False
+    model: nn.Module, dtype: torch.dtype | None = None, is_modelopt_qlora: bool = False
 ) -> tuple[dict[str, Any], dict[str, Any]]:
     """Exports the torch model to the packed checkpoint with original HF naming.

@@ -429,7 +429,7 @@ def _export_hf_checkpoint(
     # Resmooth and requantize fused layers
     # TODO: Handle mixed precision
     # TODO: Support requantize and resmooth for modelopt-trained LoRA models
-    if not is_modelopt_trained_lora:
+    if not is_modelopt_qlora:
         requantize_resmooth_fused_llm_layers(model)

     # Remove all hooks from the model

@@ -489,7 +489,7 @@ def _export_hf_checkpoint(
     quantized_state_dict = model.state_dict()

     quantized_state_dict = postprocess_state_dict(
-        quantized_state_dict, kv_cache_max_bound, kv_cache_format, is_modelopt_trained_lora
+        quantized_state_dict, kv_cache_max_bound, kv_cache_format, is_modelopt_qlora
     )

     # Check if any layers are quantized

@@ -504,7 +504,7 @@ def export_hf_checkpoint(
     dtype: torch.dtype | None = None,
     export_dir: Path | str = tempfile.gettempdir(),
     save_modelopt_state: bool = False,
-    is_modelopt_trained_lora: bool = False,
+    is_modelopt_qlora: bool = False,
 ):
     """Exports the torch model to unified checkpoint and saves to export_dir.

@@ -514,18 +514,15 @@ def export_hf_checkpoint(
         export_dir: the target export path.
         save_modelopt_state: whether to save the modelopt state_dict.
     """
-    base_export_dir: Path | str = (
-        f"{export_dir}/base_model" if is_modelopt_trained_lora else export_dir
-    )
+    # Setup directories
     export_dir = Path(export_dir)
-    export_dir.mkdir(parents=True, exist_ok=True)
-    base_export_dir = Path(base_export_dir)
-    base_export_dir.mkdir(parents=True, exist_ok=True)
+    base_export_dir = export_dir / "base_model" if is_modelopt_qlora else export_dir
+
+    for dir_path in [export_dir, base_export_dir]:
+        dir_path.mkdir(parents=True, exist_ok=True)

     try:
-        post_state_dict, hf_quant_config = _export_hf_checkpoint(
-            model, dtype, is_modelopt_trained_lora
-        )
+        post_state_dict, hf_quant_config = _export_hf_checkpoint(model, dtype, is_modelopt_qlora)

         # NOTE: (hg) Should we save hf_quant_config when there's no quantization applied?
         # Save hf_quant_config.json for backward compatibility

@@ -536,8 +533,8 @@ def export_hf_checkpoint(

         post_state_dict = rename_and_prune_if_spec_decoding(model, post_state_dict)

-        # In the case of LoRA model, we save the base model
-        if is_modelopt_trained_lora:
+        if is_modelopt_qlora:
+            # In the case of LoRA model, we save the base model and adapters
             model.base_model.save_pretrained(
                 base_export_dir, state_dict=post_state_dict, save_modelopt_state=save_modelopt_state
             )
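
For context on the refactored directory handling, a short runnable sketch of the layout the QLoRA export path produces. The path name simply mirrors the README example above, and the note about where the adapters land is inferred from the README's `vllm serve` command rather than from code shown in this hunk:

```python
# Sketch of the directory behaviour introduced above: calling
# export_hf_checkpoint(model, export_dir="llama3-fp4-qlora-hf", is_modelopt_qlora=True)
# writes the quantized base model under <export_dir>/base_model, while the LoRA
# adapters sit in <export_dir> itself -- the layout the README's `vllm serve`
# command points at.
from pathlib import Path

export_dir = Path("llama3-fp4-qlora-hf")  # illustrative path from the README example
is_modelopt_qlora = True

base_export_dir = export_dir / "base_model" if is_modelopt_qlora else export_dir
for dir_path in [export_dir, base_export_dir]:
    dir_path.mkdir(parents=True, exist_ok=True)

print(f"adapters   -> {export_dir}")
print(f"base model -> {base_export_dir}")
```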
