
Commit 8745a3c

Edwardf0t1, ajrasane, and kevalmorabia97 authored
Enable Nemotron nano vlm v1&v2 nvfp4 PTQ workflow (#347)
Signed-off-by: ajrasane <[email protected]>
Signed-off-by: Zhiyu Cheng <[email protected]>
Signed-off-by: Keval Morabia <[email protected]>
Co-authored-by: ajrasane <[email protected]>
Co-authored-by: Keval Morabia <[email protected]>
1 parent f5c209d commit 8745a3c
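
At its core, the enabled workflow applies Model Optimizer PTQ to the language model inside the VL checkpoint while the vision stack stays in higher precision. A minimal sketch of that step, assuming mtq.NVFP4_DEFAULT_CFG as the base recipe; model and calib_dataloader are placeholders supplied by the caller, as in hf_ptq.py below:

import modelopt.torch.quantization as mtq

def forward_loop(model):
    # Calibration pass: run a few batches through the (language) model.
    # calib_dataloader is assumed to be built elsewhere, e.g. by hf_ptq.py.
    for batch in calib_dataloader:
        model(batch["input_ids"])

# NVFP4_DEFAULT_CFG is the assumed base recipe; FP8 PTQ follows the same shape.
quant_cfg = mtq.NVFP4_DEFAULT_CFG
model = mtq.quantize(model, quant_cfg, forward_loop=forward_loop)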

File tree

7 files changed: +494 −41 lines changed


.github/workflows/gpu_tests.yml

Lines changed: 2 additions & 2 deletions
@@ -61,7 +61,7 @@ jobs:
     if: needs.check-file-changes.outputs.any_changed == 'true'
     # Runner list at https://github.com/nv-gha-runners/enterprise-runner-configuration/blob/main/docs/runner-groups.md
     runs-on: linux-amd64-gpu-l4-latest-1
-    timeout-minutes: 90
+    timeout-minutes: 120
     container: &gpu_container
       image: nvcr.io/nvidia/pytorch:25.06-py3
       env:
@@ -80,7 +80,7 @@ jobs:
     if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
     # Runner list at https://github.com/nv-gha-runners/enterprise-runner-configuration/blob/main/docs/runner-groups.md
     runs-on: linux-amd64-gpu-h100-latest-1
-    timeout-minutes: 90
+    timeout-minutes: 120
     container: *gpu_container
     steps: *gpu_steps
   gpu-pr-required-check:

CHANGELOG.rst

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@ Model Optimizer Changelog (Linux)
 - Allow specifying ``calib_seq`` in ``examples/llm_ptq`` to set the maximum sequence length for calibration.
 - Add support for MCore MoE PTQ/QAT/QAD.
 - Add support for multi-node PTQ and export with FSDP2 in ``examples/llm_ptq/multinode_ptq.py``. See `examples/llm_ptq/README.md <https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/main/examples/llm_ptq#multi-node-post-training-quantization-with-fsdp2>`_ for more details.
+- Add support for Nemotron Nano VL v1 & v2 models in FP8/NVFP4 PTQ workflow.
 
 **Documentation**
 
examples/llm_ptq/example_utils.py

Lines changed: 105 additions & 5 deletions
@@ -39,6 +39,91 @@
 SPECULATIVE_MODEL_LIST = ["Eagle", "Medusa"]
 
 
+def run_nemotron_vl_preview(
+    full_model, tokenizer, input_ids, pyt_ckpt_path, stage_name, allow_fallback=False
+):
+    """Run text-only and VL preview generation for Nemotron VL models.
+
+    Args:
+        full_model: The full VL model
+        tokenizer: The tokenizer
+        input_ids: Input tensor for generation
+        pyt_ckpt_path: Path to the model checkpoint
+        stage_name: Description of the stage (e.g., "before quantization", "after quantization")
+        allow_fallback: Whether to allow fallback to standard generate on failure
+
+    Returns:
+        Generated text response or None if generation failed
+    """
+    from vlm_utils import run_text_only_generation, run_vl_preview_generation
+
+    print(f"Running text-only preview generation for Nemotron VL model ({stage_name})...")
+    question = tokenizer.decode(input_ids[0], skip_special_tokens=True)
+    generation_config = {
+        "max_new_tokens": 100,
+        "do_sample": False,
+        "eos_token_id": tokenizer.eos_token_id,
+    }
+
+    # Try text-only generation
+    text_response = run_text_only_generation(
+        full_model, tokenizer, question, generation_config, pyt_ckpt_path
+    )
+
+    if text_response is not None:
+        print(f"✅ Text-only generation successful: {text_response[:100]}...")
+        generated_ids = text_response
+    elif allow_fallback:
+        print("Text-only generation failed, falling back to standard generate...")
+        generated_ids = full_model.generate(input_ids, max_new_tokens=100)
+    else:
+        generated_ids = None
+
+    # Run additional VL test with images
+    print(f"Running additional VL test with images ({stage_name})...")
+    run_vl_preview_generation(full_model, tokenizer, pyt_ckpt_path, stage_name)
+
+    return generated_ids
+
+
+def _is_multimodal_config(config):
+    """Check if a config indicates a multimodal model (config-only version of is_multimodal_model)."""
+    return (
+        hasattr(config, "vision_config")  # Standard vision config (e.g., Qwen2.5-VL)
+        or getattr(config, "model_type", "") == "phi4mm"  # Phi-4 multimodal
+        or hasattr(config, "vision_lora")  # Vision LoRA configurations
+        or hasattr(config, "audio_processor")  # Audio processing capabilities
+        or (
+            hasattr(config, "embd_layer") and hasattr(config.embd_layer, "image_embd_layer")
+        )  # Image embedding layers
+    )
+
+
+def is_nemotron_vl(model_or_config):
+    """Check if model or config indicates a Nemotron VL model.
+
+    Args:
+        model_or_config: Either a model instance or a config object.
+
+    Returns:
+        bool: True if it's a Nemotron VL model, False otherwise.
+    """
+    # Try to get config from model, or use directly if it's a config
+    if hasattr(model_or_config, "config"):
+        config = model_or_config.config
+        from modelopt.torch.export.model_utils import is_multimodal_model
+
+        if not is_multimodal_model(model_or_config):
+            return False
+    else:
+        config = model_or_config
+        if not _is_multimodal_config(config):
+            return False
+
+    architectures = getattr(config, "architectures", [])
+    return any("nemotron" in arch.lower() for arch in architectures)
+
+
 def build_quant_cfg(
     qformat,
     kv_cache_qformat,
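
A hypothetical usage sketch of the helpers added above; the checkpoint path is a placeholder, and is_nemotron_vl accepts either a loaded model or a bare config:

from transformers import AutoConfig

from example_utils import is_nemotron_vl  # assumes examples/llm_ptq is on the path

ckpt_path = "path/to/nemotron-nano-vl-checkpoint"  # placeholder
config = AutoConfig.from_pretrained(ckpt_path, trust_remote_code=True)

# Config-only detection avoids instantiating the full VL model just to decide
# how it should be loaded; this mirrors the device_map handling added to get_model().
device_map = None if is_nemotron_vl(config) else "auto"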
@@ -185,7 +270,21 @@ def get_model(
     if device == "cpu":
         device_map = "cpu"
 
+    # Prepare config kwargs for loading
     config_kwargs = {"trust_remote_code": trust_remote_code} if trust_remote_code else {}
+
+    # Load config once and handle VL model detection
+    try:
+        hf_config = AutoConfig.from_pretrained(ckpt_path, **config_kwargs)
+        if is_nemotron_vl(hf_config):
+            print(
+                "Detected Nemotron VL model from config. "
+                "Disabling automatic device mapping for compatibility."
+            )
+            device_map = None
+    except Exception as e:
+        print(f"Error: Could not load config from {ckpt_path}: {e}")
+        raise RuntimeError(f"Failed to load model configuration from {ckpt_path}") from e
     if attn_implementation is not None:
         config_kwargs["attn_implementation"] = attn_implementation
 
@@ -207,11 +306,6 @@ def get_model(
         )
         model = hf_vila.llm
     else:
-        hf_config = AutoConfig.from_pretrained(
-            ckpt_path,
-            **config_kwargs,
-        )
-
         if use_seq_device_map:
             device_map = "sequential"
             # If we use sequential, set max_memory limit to ensure that the model does not occupy the full GPU
@@ -282,6 +376,12 @@ def get_model(
                 **model_kwargs,
             )
     model.eval()
+
+    # If device_map was disabled (None), manually move model to target device
+    if device_map is None and device != "cpu":
+        print(f"Moving model to {device} device...")
+        model = model.to(device)
+
     if device == "cuda" and not is_model_on_gpu(model):
         print("Warning: Some parameters are not on a GPU. Calibration can be slow or hit OOM")
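
Taken together, the get_model() changes above reduce to the following loading pattern for Nemotron VL checkpoints. This is a simplified sketch; the model class, dtype, and checkpoint path are assumptions, not taken from the diff:

import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    ckpt_path,                    # placeholder path from the sketch above
    torch_dtype=torch.bfloat16,   # assumed dtype
    trust_remote_code=True,
    device_map=None,              # automatic mapping disabled for Nemotron VL
)
model.eval()
if torch.cuda.is_available():
    model = model.to("cuda")      # explicit move replaces the automatic mapping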

examples/llm_ptq/hf_ptq.py

Lines changed: 75 additions & 33 deletions
@@ -30,6 +30,8 @@
     get_processor,
     get_tokenizer,
     is_enc_dec,
+    is_nemotron_vl,
+    run_nemotron_vl_preview,
 )
 from transformers import (
     AutoConfig,
@@ -48,7 +50,7 @@
     export_tensorrt_llm_checkpoint,
     get_model_type,
 )
-from modelopt.torch.export.model_utils import is_multimodal_model
+from modelopt.torch.export.model_utils import get_language_model_from_vl, is_multimodal_model
 from modelopt.torch.quantization.config import need_calibration
 from modelopt.torch.quantization.plugins.accelerate import init_quantized_weights
 from modelopt.torch.quantization.utils import is_quantized
@@ -283,6 +285,9 @@ def main(args):
 
     full_model = model
 
+    # Detect if this is a Nemotron VL model using architecture-based detection
+    is_nemotron_vl_model = is_nemotron_vl(full_model)
+
     if model_type == "mllama":
         processor = get_processor(
             args.pyt_ckpt_path,
@@ -312,15 +317,8 @@ def main(args):
         tokenizer.padding_side = "left"
 
     # We only quantize the language model for VLMs other than the type supported above.
-    if hasattr(model, "language_model"):
-        parent_model = model  # llama4 case
-        if isinstance(type(model).__dict__.get("language_model"), property):
-            assert hasattr(model, "model") and hasattr(model.model, "language_model"), (
-                "Expected language_model in model.model, but attribute not found. "
-                "This may indicate an unsupported model structure."
-            )
-            parent_model = model.model  # gemma3, qwen2.5 VL case
-
+    language_model, parent_model = get_language_model_from_vl(model)
+    if language_model is not None:
         disabled_quant_cfg = {
             "quant_cfg": {"default": {"enable": False}},
             "algorithm": "max",
@@ -331,7 +329,7 @@
             if name != "language_model":
                 mtq.quantize(child, disabled_quant_cfg, forward_loop=None)
 
-        model = model.language_model
+        model = language_model
         model_type = get_model_type(model)
 
     if model_type == "phi4mm":
@@ -458,34 +456,65 @@ def main(args):
         KV_QUANT_CFG_CHOICES,
     )
 
+    # For Nemotron VL models, disable quantization of vision components
+    if is_nemotron_vl_model:
+        print("Disabling quantization for vision components in Nemotron VL model")
+        quant_cfg["quant_cfg"]["*vision*"] = {"enable": False}
+        quant_cfg["quant_cfg"]["*image*"] = {"enable": False}
+        # Also disable radio model components specifically
+        quant_cfg["quant_cfg"]["*radio*"] = {"enable": False}
+        quant_cfg["quant_cfg"]["*visual*"] = {"enable": False}
+
     if not model_is_already_quantized or calibration_only:
         # Only run single sample for preview
         input_ids = next(iter(calib_dataloader))[
             "input_features" if model_type == "whisper" else "input_ids"
         ][0:1]
-        try:
-            generated_ids_before_ptq = full_model.generate(input_ids, max_new_tokens=100)
-        except Exception as e:
-            print(
-                "Error during model generation. Please check if your transformers version is "
-                "compatible with the model."
+
+        # Generate preview before quantization
+        if is_nemotron_vl_model and tokenizer is not None:
+            generated_ids_before_ptq = run_nemotron_vl_preview(
+                full_model,
+                tokenizer,
+                input_ids,
+                args.pyt_ckpt_path,
+                "before quantization",
+                allow_fallback=True,
             )
-            print(f"Error details: {e}")
-            raise
+        else:
+            # Standard generation for non-Nemotron VL models
+            generated_ids_before_ptq = full_model.generate(input_ids, max_new_tokens=100)
         if model_type == "gptoss" and args.qformat == "nvfp4_mlp_only":
             print("Applying nvfp4 quantization (MoE only) for gpt-oss")
 
         # quantize the model
         model = quantize_model(model, quant_cfg, args, calib_dataloader, calibration_only)
+
+        # For VL models, update full_model to use the quantized language model
+        if is_nemotron_vl_model:
+            _, parent_model = get_language_model_from_vl(full_model)
+            if parent_model is not None:
+                print("Updating full_model with quantized language_model...")
+                parent_model.language_model = model
+
         if args.verbose:
             mtq.print_quant_summary(model)
 
         # Run some samples
         torch.cuda.empty_cache()
         generated_ids_after_ptq = None
-        if model_type != "llama4":
+        if model_type != "llama4" and not is_nemotron_vl_model:
             # Our fake quantizer may not be fully compatible with torch.compile.
             generated_ids_after_ptq = full_model.generate(input_ids, max_new_tokens=100)
+        elif is_nemotron_vl_model and tokenizer is not None:
+            generated_ids_after_ptq = run_nemotron_vl_preview(
+                full_model,
+                tokenizer,
+                input_ids,
+                args.pyt_ckpt_path,
+                "after quantization",
+                allow_fallback=False,
+            )
         else:
             warnings.warn(
                 "Llama4 Maverick generation after quantization has a bug. Skipping generation sample."
@@ -518,15 +547,25 @@ def output_decode(generated_ids, input_shape):
 
         if generated_ids_after_ptq is not None:
             print("--------")
-            print(f"example test input: {input_decode(input_ids)}")
-            print("--------")
-            print(
-                f"example outputs before ptq: {output_decode(generated_ids_before_ptq, input_ids.shape[1])}"
-            )
-            print("--------")
-            print(
-                f"example outputs after ptq: {output_decode(generated_ids_after_ptq, input_ids.shape[1])}"
-            )
+            if is_nemotron_vl_model:
+                # For Nemotron VL models, generated_ids are text strings from model.chat()
+                print("Nemotron VL model text-only generation results:")
+                print(f"Text response before quantization: {generated_ids_before_ptq}")
+                print("--------")
+                print(f"Text response after quantization: {generated_ids_after_ptq}")
+                print("--------")
+                print("Note: Additional VL tests with images were run separately above")
+            else:
+                # For regular LLMs, generated_ids are token tensors that need decoding
+                print(f"example test input: {input_decode(input_ids)}")
+                print("--------")
+                print(
+                    f"example outputs before ptq: {output_decode(generated_ids_before_ptq, input_ids.shape[1])}"
+                )
+                print("--------")
+                print(
+                    f"example outputs after ptq: {output_decode(generated_ids_after_ptq, input_ids.shape[1])}"
+                )
     else:
         warnings.warn("Skipping quantization: model is already quantized.")
 
@@ -548,9 +587,12 @@ def output_decode(generated_ids, input_shape):
         # Save original model config and the processor config to the export path for VLMs.
         print(f"Saving original model config to {export_path}")
 
-        AutoConfig.from_pretrained(
-            args.pyt_ckpt_path, trust_remote_code=args.trust_remote_code
-        ).save_pretrained(export_path)
+        config_kwargs = {"trust_remote_code": args.trust_remote_code}
+        if args.attn_implementation is not None:
+            config_kwargs["attn_implementation"] = args.attn_implementation
+        AutoConfig.from_pretrained(args.pyt_ckpt_path, **config_kwargs).save_pretrained(
+            export_path
+        )
 
         # Try to save processor config if available
         try:
@@ -748,7 +790,7 @@
     parser.add_argument(
         "--attn_implementation",
        help=(
-            "Specify the attention implementation to use."
+            "Specify the attention implementation to use. "
            "This arg will be passed to the HF model loading if specified."
        ),
        default=None,
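
The wildcard overrides added earlier in this file (*vision*, *image*, *radio*, *visual*) layer on top of whatever base recipe --qformat selects. A hedged sketch of the same pattern in isolation, assuming NVFP4_DEFAULT_CFG as the base; the deep copy keeps the library default untouched:

import copy

import modelopt.torch.quantization as mtq

quant_cfg = copy.deepcopy(mtq.NVFP4_DEFAULT_CFG)
for pattern in ("*vision*", "*image*", "*radio*", "*visual*"):
    # Any module whose name matches one of these patterns keeps full precision.
    quant_cfg["quant_cfg"][pattern] = {"enable": False}

mtq.quantize() then leaves every matching submodule unquantized during calibration and export.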
