Commit f1a2ff3

cleanup
Signed-off-by: Suguna Velury <[email protected]>
1 parent 40202eb commit f1a2ff3

5 files changed: +47 −36 lines changed


examples/llm_ptq/example_utils.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -144,15 +144,15 @@ def get_model(
     trust_remote_code=False,
     use_seq_device_map=False,
     attn_implementation=None,
-    is_lora=False,
+    is_modelopt_qlora=False,
 ):
     print(f"Initializing model from {ckpt_path}")
 
     device_map = "auto"
     if device == "cpu":
         device_map = "cpu"
 
-    if is_lora:
+    if is_modelopt_qlora:
         model = get_lora_model(ckpt_path, device_map)
         return model
 
```
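
For orientation, a minimal call sketch of the renamed keyword (the import path, the `device` keyword, and the checkpoint path are assumptions for illustration, not taken from the commit):

```python
# Hypothetical usage sketch; assumes it runs from examples/llm_ptq/ where
# example_utils.py lives, and that ckpt_path/device are keyword parameters.
from example_utils import get_model

model = get_model(
    ckpt_path="llama3-fp4-qlora",  # illustrative checkpoint path
    device="cuda",                 # "cpu" would force device_map="cpu"
    is_modelopt_qlora=True,        # routes loading through get_lora_model(...)
)
```
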
examples/llm_ptq/hf_ptq.py

Lines changed: 4 additions & 4 deletions

```diff
@@ -248,7 +248,7 @@ def main(args):
             trust_remote_code=args.trust_remote_code,
             use_seq_device_map=args.use_seq_device_map,
             attn_implementation=args.attn_implementation,
-            is_lora=args.lora,
+            is_modelopt_qlora=args.qlora,
         )
     else:
         assert args.qformat in QUANT_CFG_CHOICES, (
@@ -640,7 +640,7 @@ def output_decode(generated_ids, input_shape):
         export_hf_checkpoint(
             full_model,
             export_dir=export_path,
-            is_modelopt_trained_lora=args.lora,
+            is_modelopt_qlora=args.qlora,
         )
 
         # Copy custom model files (Python files and JSON configs) if trust_remote_code is used
@@ -782,8 +782,8 @@ def output_decode(generated_ids, input_shape):
         type=str,
     )
     parser.add_argument(
-        "--lora",
-        help="Specify the model to be exported is a LoRA model trained using modelopt.",
+        "--qlora",
+        help="Specify the model to be exported is a QLoRA model trained using modelopt.",
         default=False,
         action="store_true",
     )
```

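The user-facing part of this change is the CLI flag rename from `--lora` to `--qlora`, which is forwarded as `is_modelopt_qlora=args.qlora`. A standalone sketch of just this argparse flag (the real parser in hf_ptq.py defines many more arguments):

```python
import argparse

# Minimal reproduction of the renamed flag; illustrative only.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--qlora",
    help="Specify the model to be exported is a QLoRA model trained using modelopt.",
    default=False,
    action="store_true",
)

# Passing --qlora flips the boolean; it is then forwarded as
# is_modelopt_qlora=args.qlora to get_model() and export_hf_checkpoint().
args = parser.parse_args(["--qlora"])
print(args.qlora)  # True
```
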
examples/llm_qat/README.md

Lines changed: 15 additions & 2 deletions

````diff
@@ -345,10 +345,23 @@ To perform QLoRA training, run:
     --lora True
 ```
 
-After performing QLoRA training the final checkpoint exported is ready for deployment using vLLM. For more details about QLoRA deployment using vLLM refer to the documentation [here](https://docs.vllm.ai/en/latest/features/lora.html). To deploy with vLLM, run:
+After performing QLoRA training the final checkpoint can be exported for deployment with vLLM using the following command.
 
 ```sh
-vllm serve llama3-fp4-qlora/base_model --enable-lora --lora-modules adapter=llama3-fp4-qlora --port 8000 --tokenizer llama3-fp4-qlora
+cd ../llm_ptq
+
+python hf_ptq.py \
+    --pyt_ckpt_path llama3-fp4-qlora \
+    --qformat nvfp4 \
+    --export_dir llama3-fp4-qlora-hf \
+    --qlora
+
+```
+
+To deploy with vLLM, run the following command. For more details about QLoRA deployment using vLLM refer to the documentation [here](https://docs.vllm.ai/en/latest/features/lora.html).
+
+```sh
+vllm serve llama3-fp4-qlora-hf/base_model --enable-lora --lora-modules adapter=llama3-fp4-qlora-hf --port 8000 --tokenizer llama3-fp4-qlora-hf
 ```
 
 ## Pre-Quantized Checkpoints
````

modelopt/torch/export/quant_utils.py

Lines changed: 14 additions & 13 deletions

```diff
@@ -836,7 +836,7 @@ def postprocess_state_dict(
     state_dict: dict,
     maxbound: float,
     quantization: str | None,
-    is_modelopt_trained_lora: bool = False,
+    is_modelopt_qlora: bool = False,
 ) -> dict:
     """Filters out keys related to weight quantizers and updates KV cache related keys.
 
@@ -855,28 +855,24 @@ def postprocess_state_dict(
         "v_bmm_quantizer._bias_value": "v_proj.v_bias",
         "input_quantizer._pre_quant_scale": "pre_quant_scale",
     }
+    skip_keys = ["output_quantizer", "_amax", "_bias_value", "input_quantizer._pre_quant_scale"]
 
     # For modelopt-trained LoRA models, we need to remove the base_layer prefix from the keys for deployment
-    if is_modelopt_trained_lora:
+    if is_modelopt_qlora:
         replacements.update(
             {
                 "base_layer.weight": "weight",
                 "base_layer.input_scale": "input_scale",
                 "base_layer.weight_scale": "weight_scale",
             }
         )
+        skip_keys.append("base_layer")
 
     post_state_dict = {}
 
     for key, value in state_dict.items():
         # Skip keys not related to quantizers
-        if (
-            "output_quantizer" not in key
-            and "_amax" not in key
-            and "_bias_value" not in key
-            and "input_quantizer._pre_quant_scale" not in key
-            and "base_layer" not in key
-        ):
+        if all(skip_key not in key for skip_key in skip_keys):
             post_state_dict[key] = value
             continue
 
@@ -928,8 +924,8 @@ def postprocess_state_dict(
             keys_to_delete.append(key)
 
     # remove LoRA adapters from state dict
-    if is_modelopt_trained_lora:
-        for key, value in post_state_dict.items():
+    if is_modelopt_qlora:
+        for key in post_state_dict:
             if "lora" in key and key not in keys_to_delete:
                 keys_to_delete.append(key)
     # Check for tied weights and remove duplicates
@@ -1110,10 +1106,15 @@ def get_quant_config(named_modules: nn.Module | dict[str, nn.Module]) -> dict[st
         if block_size == 0:
             block_size = get_weight_block_size(module)
 
-        # Construct per layer config dictionary
-        if block_size == 0 and quantization_format != QUANTIZATION_FP8:
+        # In the case of NVFP4, block_size 0 indicates weight_quantizer is not enabled
+        if block_size == 0 and quantization_format in [
+            QUANTIZATION_NVFP4,
+            QUANTIZATION_NVFP4_AWQ,
+            QUANTIZATION_W4A8_NVFP4_FP8,
+        ]:
             continue
 
+        # Construct per layer config dictionary
         layer_config_dict[name + ".quantization"] = quantization_format
         layer_config_dict[name + ".awq_block_size"] = block_size
 
```
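
The state-dict filter above collapses the chained `not in` checks into a single `all(...)` over a `skip_keys` list, so the QLoRA path can simply append `"base_layer"` to that list. A standalone sketch of the idiom with dummy keys (not the real ModelOpt state dict):

```python
skip_keys = ["output_quantizer", "_amax", "_bias_value", "input_quantizer._pre_quant_scale"]
is_modelopt_qlora = True
if is_modelopt_qlora:
    skip_keys.append("base_layer")

# Dummy keys for illustration only.
state_dict = {
    "layers.0.q_proj.weight": "w",
    "layers.0.q_proj.weight_quantizer._amax": "amax",
    "layers.0.q_proj.base_layer.weight": "base_w",
}

# A key is kept only when none of the skip substrings occur in it, which is
# equivalent to the previous chain of `"..." not in key and ...` conditions.
post_state_dict = {k: v for k, v in state_dict.items() if all(s not in k for s in skip_keys)}
print(list(post_state_dict))  # ['layers.0.q_proj.weight']
```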

modelopt/torch/export/unified_export_hf.py

Lines changed: 12 additions & 15 deletions

```diff
@@ -340,7 +340,7 @@ def _export_quantized_weight(
 
 
 def _export_hf_checkpoint(
-    model: nn.Module, dtype: torch.dtype | None = None, is_modelopt_trained_lora: bool = False
+    model: nn.Module, dtype: torch.dtype | None = None, is_modelopt_qlora: bool = False
 ) -> tuple[dict[str, Any], dict[str, Any]]:
     """Exports the torch model to the packed checkpoint with original HF naming.
 
@@ -433,7 +433,7 @@ def _export_hf_checkpoint(
     # Resmooth and requantize fused layers
     # TODO: Handle mixed precision
     # TODO: Support requantize and resmooth for modelopt-trained LoRA models
-    if not is_modelopt_trained_lora:
+    if not is_modelopt_qlora:
         requantize_resmooth_fused_llm_layers(model)
 
     # Remove all hooks from the model
@@ -493,7 +493,7 @@ def _export_hf_checkpoint(
     quantized_state_dict = model.state_dict()
 
     quantized_state_dict = postprocess_state_dict(
-        quantized_state_dict, kv_cache_max_bound, kv_cache_format, is_modelopt_trained_lora
+        quantized_state_dict, kv_cache_max_bound, kv_cache_format, is_modelopt_qlora
     )
 
     # Check if any layers are quantized
@@ -508,7 +508,7 @@ def export_hf_checkpoint(
     dtype: torch.dtype | None = None,
     export_dir: Path | str = tempfile.gettempdir(),
     save_modelopt_state: bool = False,
-    is_modelopt_trained_lora: bool = False,
+    is_modelopt_qlora: bool = False,
 ):
     """Exports the torch model to unified checkpoint and saves to export_dir.
 
@@ -518,18 +518,15 @@ def export_hf_checkpoint(
         export_dir: the target export path.
         save_modelopt_state: whether to save the modelopt state_dict.
     """
-    base_export_dir: Path | str = (
-        f"{export_dir}/base_model" if is_modelopt_trained_lora else export_dir
-    )
+    # Setup directories
     export_dir = Path(export_dir)
-    export_dir.mkdir(parents=True, exist_ok=True)
-    base_export_dir = Path(base_export_dir)
-    base_export_dir.mkdir(parents=True, exist_ok=True)
+    base_export_dir = export_dir / "base_model" if is_modelopt_qlora else export_dir
+
+    for dir_path in [export_dir, base_export_dir]:
+        dir_path.mkdir(parents=True, exist_ok=True)
 
     try:
-        post_state_dict, hf_quant_config = _export_hf_checkpoint(
-            model, dtype, is_modelopt_trained_lora
-        )
+        post_state_dict, hf_quant_config = _export_hf_checkpoint(model, dtype, is_modelopt_qlora)
 
         # NOTE: (hg) Should we save hf_quant_config when there's no quantization applied?
         # Save hf_quant_config.json for backward compatibility
@@ -540,8 +537,8 @@ def export_hf_checkpoint(
 
         post_state_dict = rename_and_prune_if_spec_decoding(model, post_state_dict)
 
-        # In the case of LoRA model, we save the base model
-        if is_modelopt_trained_lora:
+        if is_modelopt_qlora:
+            # In the case of LoRA model, we save the base model and adapters
             model.base_model.save_pretrained(
                 base_export_dir, state_dict=post_state_dict, save_modelopt_state=save_modelopt_state
             )
```
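
The directory setup now derives `base_export_dir` with `pathlib` instead of an f-string and creates both directories in one loop. A standalone sketch with illustrative paths:

```python
from pathlib import Path

is_modelopt_qlora = True
export_dir = Path("llama3-fp4-qlora-hf")  # illustrative export directory

# For a QLoRA export the base model goes under <export_dir>/base_model;
# otherwise both names refer to the same directory.
base_export_dir = export_dir / "base_model" if is_modelopt_qlora else export_dir

# mkdir(parents=True, exist_ok=True) is idempotent, so creating export_dir
# and base_export_dir in one loop is safe in either branch.
for dir_path in [export_dir, base_export_dir]:
    dir_path.mkdir(parents=True, exist_ok=True)

print(base_export_dir)  # llama3-fp4-qlora-hf/base_model
```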
