Merged
Commits (30 total; changes shown from 25 commits)
476b59f
Add option to benchmark pipeline in diffusion_trt.py (#457)
ajrasane Oct 22, 2025
b583d98
default attn_implementation to eager to avoid issues
Edwardf0t1 Sep 17, 2025
8b76102
add proper detection and handling for nemotron VL model in ptq examples
Edwardf0t1 Sep 19, 2025
89d207c
create fake vl inputs in export for nemotron VL model
Edwardf0t1 Sep 19, 2025
6991fdf
update fake inputs generation, initialize distributed for Nemotron mo…
Edwardf0t1 Sep 19, 2025
80fecf0
remove distributed processing setup and vision input generation since …
Edwardf0t1 Sep 20, 2025
15f0d61
special handling for nemotron VL preview generation in hf_ptq
Edwardf0t1 Sep 21, 2025
ae42b9b
fix mypy error
Edwardf0t1 Sep 21, 2025
587d427
add support for v2 model inference (.generate) with image inputs
Edwardf0t1 Oct 15, 2025
208cb9e
debug loading v2 converted nvfp4 weights from mcore
Edwardf0t1 Oct 17, 2025
f94558f
load scalers only for v2 fp4
Edwardf0t1 Oct 19, 2025
31c4f75
re-use existing vlm detection util function
Edwardf0t1 Oct 23, 2025
ec4a0ef
refactor and create a utils script for vlm
Edwardf0t1 Oct 23, 2025
5f0ea72
remove duplicated is_nemotron_vl usage
Edwardf0t1 Oct 23, 2025
60a698a
update
Edwardf0t1 Oct 23, 2025
446e135
add a util function to extract language model from VLM, update changelog
Edwardf0t1 Oct 23, 2025
f849c17
fix format
Edwardf0t1 Oct 23, 2025
96e1613
update
Edwardf0t1 Oct 23, 2025
c572513
update
Edwardf0t1 Oct 23, 2025
8e6dea3
update
Edwardf0t1 Oct 23, 2025
16bea91
WIP: local changes before pulling remote updates
Edwardf0t1 Oct 23, 2025
8e1d6cb
Increase gpu_tests timeout from 90 to 120 mins
kevalmorabia97 Oct 23, 2025
4561de9
revert torch_onnx.py
Edwardf0t1 Oct 24, 2025
57d388e
revert diffusion_trt.py
Edwardf0t1 Oct 24, 2025
f9b88fd
minor
Edwardf0t1 Oct 24, 2025
0e00954
update
Edwardf0t1 Oct 24, 2025
1a3bac1
update
Edwardf0t1 Oct 24, 2025
a4fa12d
update
Edwardf0t1 Oct 24, 2025
6216038
update
Edwardf0t1 Oct 24, 2025
4352ab6
update
Edwardf0t1 Oct 24, 2025
4 changes: 2 additions & 2 deletions .github/workflows/gpu_tests.yml
@@ -61,7 +61,7 @@ jobs:
if: needs.check-file-changes.outputs.any_changed == 'true'
# Runner list at https://github.com/nv-gha-runners/enterprise-runner-configuration/blob/main/docs/runner-groups.md
runs-on: linux-amd64-gpu-l4-latest-1
timeout-minutes: 90
timeout-minutes: 120
container: &gpu_container
image: nvcr.io/nvidia/pytorch:25.06-py3
env:
@@ -80,7 +80,7 @@ jobs:
if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
# Runner list at https://github.com/nv-gha-runners/enterprise-runner-configuration/blob/main/docs/runner-groups.md
runs-on: linux-amd64-gpu-h100-latest-1
timeout-minutes: 90
timeout-minutes: 120
container: *gpu_container
steps: *gpu_steps
gpu-pr-required-check:
1 change: 1 addition & 0 deletions CHANGELOG.rst
@@ -13,6 +13,7 @@ Model Optimizer Changelog (Linux)
- Allow specifying ``calib_seq`` in ``examples/llm_ptq`` to set the maximum sequence length for calibration.
- Add support for MCore MoE PTQ/QAT/QAD.
- Add support for multi-node PTQ and export with FSDP2 in ``examples/llm_ptq/multinode_ptq.py``. See `examples/llm_ptq/README.md <https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/main/examples/llm_ptq#multi-node-post-training-quantization-with-fsdp2>`_ for more details.
- Add support for Nemotron Nano VL v1 & v2 models in FP8/NVFP4 PTQ workflow.
Contributor:
⚠️ Potential issue | 🟡 Minor

Forward-dated release entry

0.39 (2025-11-07) is in the future (today is 2025-10-23). Please mark this as Unreleased/TBD to avoid confusion until the release is cut.

-0.39 (2025-11-07)
+0.39 (Unreleased)

Committable suggestion skipped: line range outside the PR's diff.

🤖 Prompt for AI Agents
In CHANGELOG.rst around line 16, the release entry "0.39 (2025-11-07)" is
forward-dated; change the header to indicate it is not yet released (e.g., "0.39
(Unreleased)" or "0.39 (TBD)") and leave the content line "Add support for
Nemotron Nano VL v1 & v2 models in FP8/NVFP4 PTQ workflow." under that
Unreleased/TBD heading so the changelog does not show a future date.


**Documentation**

49 changes: 44 additions & 5 deletions examples/llm_ptq/example_utils.py
@@ -39,6 +39,30 @@
SPECULATIVE_MODEL_LIST = ["Eagle", "Medusa"]


def _is_multimodal_config(config):
"""Check if a config indicates a multimodal model (config-only version of is_multimodal_model)."""
return (
hasattr(config, "vision_config") # Standard vision config (e.g., Qwen2.5-VL)
or getattr(config, "model_type", "") == "phi4mm" # Phi-4 multimodal
or hasattr(config, "vision_lora") # Vision LoRA configurations
or hasattr(config, "audio_processor") # Audio processing capabilities
or (
hasattr(config, "embd_layer") and hasattr(config.embd_layer, "image_embd_layer")
) # Image embedding layers
)


def is_nemotron_vl_model(model):
"""Check if model is a Nemotron VL model based on config architectures."""
from modelopt.torch.export.model_utils import is_multimodal_model

if not is_multimodal_model(model):
return False

architectures = getattr(model.config, "architectures", [])
return any("nemotron" in arch.lower() for arch in architectures)


def build_quant_cfg(
qformat,
kv_cache_qformat,
@@ -185,7 +209,21 @@ def get_model(
if device == "cpu":
device_map = "cpu"

# Prepare config kwargs for loading
config_kwargs = {"trust_remote_code": trust_remote_code} if trust_remote_code else {}

# Load config once and handle VL model detection
try:
hf_config = AutoConfig.from_pretrained(ckpt_path, **config_kwargs)
if _is_multimodal_config(hf_config):
Collaborator:
is the following code nemotron specific? I don't think we have to handle it for other VL models.

Contributor Author:
Yes, we can refactor it in a follow-up PR. There are a couple of VLM-model-specific code paths (e.g., mllama, vila).

Collaborator:
please use is_nemotron_vl_model here. Other VLMs do not need this.

print(
"Detected vision-language model from config. "
"Disabling automatic device mapping to avoid device_map errors."
)
device_map = None
except Exception as e:
print(f"Error: Could not load config from {ckpt_path}: {e}")
raise RuntimeError(f"Failed to load model configuration from {ckpt_path}") from e
if attn_implementation is not None:
config_kwargs["attn_implementation"] = attn_implementation

@@ -207,11 +245,6 @@
)
model = hf_vila.llm
else:
hf_config = AutoConfig.from_pretrained(
ckpt_path,
**config_kwargs,
)

if use_seq_device_map:
device_map = "sequential"
# If we use sequential, set max_memory limit to ensure that the model does not occupy the full GPU
@@ -282,6 +315,12 @@ def get_model(
**model_kwargs,
)
model.eval()

# If device_map was disabled (None), manually move model to target device
if device_map is None and device != "cpu":
Collaborator:
what if device == "cpu"

Contributor Author:
That was handled by HF's device_map="cpu" in L210.

print(f"Moving model to {device} device...")
model = model.to(device)

if device == "cuda" and not is_model_on_gpu(model):
print("Warning: Some parameters are not on a GPU. Calibration can be slow or hit OOM")

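Following the review thread above (gate the device_map workaround on Nemotron VL models only, and leave device == "cpu" on HF's device_map="cpu" path), a minimal config-only sketch could look like the following. The helper names _is_nemotron_vl_config and _resolve_device_map are illustrative and not part of this PR; the sketch assumes the _is_multimodal_config helper added in this file.

from transformers import AutoConfig

def _is_nemotron_vl_config(config) -> bool:
    """Config-only Nemotron VL check; reuses _is_multimodal_config defined above."""
    architectures = getattr(config, "architectures", None) or []
    return _is_multimodal_config(config) and any(
        "nemotron" in arch.lower() for arch in architectures
    )

def _resolve_device_map(ckpt_path: str, device: str, trust_remote_code: bool = False):
    """Keep HF's "cpu" mapping; disable automatic device mapping only for Nemotron VL."""
    if device == "cpu":
        return "cpu"
    hf_config = AutoConfig.from_pretrained(ckpt_path, trust_remote_code=trust_remote_code)
    if _is_nemotron_vl_config(hf_config):
        # The caller then moves the loaded model to `device` manually, as in the diff above.
        return None
    return "auto"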
151 changes: 118 additions & 33 deletions examples/llm_ptq/hf_ptq.py
@@ -30,6 +30,7 @@
get_processor,
get_tokenizer,
is_enc_dec,
is_nemotron_vl_model,
)
from transformers import (
AutoConfig,
@@ -39,6 +40,7 @@
PreTrainedTokenizerFast,
WhisperProcessor,
)
from vlm_utils import run_text_only_generation, run_vl_preview_generation

import modelopt.torch.opt as mto
import modelopt.torch.quantization as mtq
@@ -48,7 +50,7 @@
export_tensorrt_llm_checkpoint,
get_model_type,
)
from modelopt.torch.export.model_utils import is_multimodal_model
from modelopt.torch.export.model_utils import get_language_model_from_vl, is_multimodal_model
from modelopt.torch.quantization.config import need_calibration
from modelopt.torch.quantization.plugins.accelerate import init_quantized_weights
from modelopt.torch.quantization.utils import is_quantized
@@ -283,6 +285,9 @@ def main(args):

full_model = model

# Detect if this is a Nemotron VL model using architecture-based detection
is_nemotron_vl = is_nemotron_vl_model(full_model)

if model_type == "mllama":
processor = get_processor(
args.pyt_ckpt_path,
@@ -312,15 +317,8 @@
tokenizer.padding_side = "left"

# We only quantize the language model for VLMs other than the type supported above.
if hasattr(model, "language_model"):
parent_model = model # llama4 case
if isinstance(type(model).__dict__.get("language_model"), property):
assert hasattr(model, "model") and hasattr(model.model, "language_model"), (
"Expected language_model in model.model, but attribute not found. "
"This may indicate an unsupported model structure."
)
parent_model = model.model # gemma3, qwen2.5 VL case

language_model, parent_model = get_language_model_from_vl(model)
if language_model is not None:
disabled_quant_cfg = {
"quant_cfg": {"default": {"enable": False}},
"algorithm": "max",
Expand All @@ -331,7 +329,7 @@ def main(args):
if name != "language_model":
mtq.quantize(child, disabled_quant_cfg, forward_loop=None)

model = model.language_model
model = language_model
model_type = get_model_type(model)

if model_type == "phi4mm":
@@ -458,34 +456,108 @@ def main(args):
KV_QUANT_CFG_CHOICES,
)

# For Nemotron VL models, disable quantization of vision components
if is_nemotron_vl:
print("Disabling quantization for vision components in Nemotron VL model")
quant_cfg["quant_cfg"]["*vision*"] = {"enable": False}
quant_cfg["quant_cfg"]["*image*"] = {"enable": False}
# Also disable radio model components specifically
quant_cfg["quant_cfg"]["*radio*"] = {"enable": False}
quant_cfg["quant_cfg"]["*visual*"] = {"enable": False}

Comment on lines 459 to 467
Contributor:
⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

Bug: mutating quant_cfg when auto_quantize_bits is set (quant_cfg={})

build_quant_cfg returns {} for auto-quant mode, so writing quant_cfg["quant_cfg"][...] raises KeyError and vision modules aren’t disabled. In auto-quant, disable modules via disabled_layers.

Apply this diff to cover both code paths:

-        if is_nemotron_vl:
-            print("Disabling quantization for vision components in Nemotron VL model")
-            quant_cfg["quant_cfg"]["*vision*"] = {"enable": False}
-            quant_cfg["quant_cfg"]["*image*"] = {"enable": False}
-            # Also disable radio model components specifically
-            quant_cfg["quant_cfg"]["*radio*"] = {"enable": False}
-            quant_cfg["quant_cfg"]["*visual*"] = {"enable": False}
+        if is_nemotron_vl:
+            print("Disabling quantization for vision components in Nemotron VL model")
+            nemotron_disabled = {
+                "*vision*": {"enable": False},
+                "*image*": {"enable": False},
+                "*radio*": {"enable": False},
+                "*visual*": {"enable": False},
+            }
+            if not args.auto_quantize_bits:
+                quant_cfg.setdefault("quant_cfg", {}).update(nemotron_disabled)
+            else:
+                # Pass patterns to auto_quant to skip these modules during search
+                extra_disabled_patterns = list(nemotron_disabled.keys())

And extend auto_quantize() to accept these patterns:
(addition outside this hunk)

 def auto_quantize(
-    model, qformat, auto_quantize_bits, calib_dataloader, calibrate_loop, batch_size=1
+    model, qformat, auto_quantize_bits, calib_dataloader, calibrate_loop, batch_size=1,
+    extra_disabled_layers=None,
 ):
@@
-        disabled_layers=["*lm_head*"],
+        disabled_layers=(["*lm_head*"] + (extra_disabled_layers or [])),

Finally, pass extra_disabled_patterns when calling auto_quantize():
(addition near the call site)

-        model = auto_quantize(
+        model = auto_quantize(
             model,
             args.qformat,
             args.auto_quantize_bits,
             calib_dataloader,
             calibrate_loop,
-            args.batch_size,
+            args.batch_size,
+            extra_disabled_layers=(extra_disabled_patterns if is_nemotron_vl else None),
         )

Please run a short auto-quant session on a Nemotron VL to confirm vision blocks remain unquantized.


I'll verify the concerns in this review comment by examining the codebase structure and the specific functions involved.

I'll search the codebase to verify the claims about the build_quant_cfg return value and the auto_quantize function signature.


🏁 Script executed:

#!/bin/bash
# Find and examine the relevant function definitions and code
echo "=== Checking file structure and examining hf_ptq.py ==="
fd "hf_ptq.py" --type f

echo -e "\n=== Examining lines 460-480 to see context ==="
sed -n '460,480p' examples/llm_ptq/hf_ptq.py

echo -e "\n=== Finding build_quant_cfg function ==="
rg -n "def build_quant_cfg" --type py

echo -e "\n=== Finding auto_quantize function signature ==="
rg -n "def auto_quantize" --type py -A 5

echo -e "\n=== Finding calls to auto_quantize ==="
rg -n "auto_quantize\(" --type py -B 2 -A 5

echo -e "\n=== Searching for args.auto_quantize_bits usage ==="
rg -n "auto_quantize_bits" examples/llm_ptq/hf_ptq.py --type py -B 2 -A 2

Length of output: 19066


🏁 Script executed:

#!/bin/bash
# Examine the key functions and usage patterns

echo "=== Lines 460-480 in hf_ptq.py ==="
sed -n '460,480p' examples/llm_ptq/hf_ptq.py

echo -e "\n=== Finding build_quant_cfg function definition ==="
rg -n "def build_quant_cfg" --type py -A 20

echo -e "\n=== Finding auto_quantize function definition ==="
rg -n "def auto_quantize" --type py -A 10

echo -e "\n=== Finding calls to auto_quantize ==="
rg -n "model = auto_quantize" --type py -B 3 -A 8

echo -e "\n=== Checking quant_cfg initialization and conditional branches ==="
rg -n "if.*auto_quantize_bits" examples/llm_ptq/hf_ptq.py --type py -B 3 -A 8

Length of output: 10706



🏁 Script executed:

#!/bin/bash
# Get complete build_quant_cfg function
echo "=== Full build_quant_cfg function ==="
sed -n '55,100p' examples/llm_ptq/example_utils.py

echo -e "\n=== Check control flow around Nemotron VL code (lines 450-480) ==="
sed -n '450,480p' examples/llm_ptq/hf_ptq.py

echo -e "\n=== Full auto_quantize function signature in hf_ptq.py ==="
sed -n '96,120p' examples/llm_ptq/hf_ptq.py

echo -e "\n=== Check disabled_layers usage in auto_quantize ==="
rg -n "disabled_layers" examples/llm_ptq/hf_ptq.py -B 2 -A 2

Length of output: 4324


Bug confirmed: KeyError when accessing quant_cfg["quant_cfg"] in auto-quantize mode

The issue is real. When args.auto_quantize_bits is set, build_quant_cfg() returns an empty dict {} (examples/llm_ptq/example_utils.py:64-65). The Nemotron VL code at lines 466-474 unconditionally accesses quant_cfg["quant_cfg"][...] without checking if the key exists, causing a KeyError. Additionally, the auto_quantize() function (line 96) has no mechanism to accept and pass through extra disabled layer patterns—disabled_layers is hardcoded to ["*lm_head*"] at line 135.

The proposed diff correctly addresses this by:

  1. Adding conditional logic to handle both auto-quant and non-auto-quant paths
  2. Extending auto_quantize() to accept extra_disabled_layers parameter
  3. Passing the Nemotron VL patterns when calling auto_quantize()
🤖 Prompt for AI Agents
In examples/llm_ptq/hf_ptq.py around lines 466 to 474, the code unconditionally
indexes quant_cfg["quant_cfg"] which raises KeyError in auto-quantize mode
(build_quant_cfg returns {}); update the Nemotron VL branch to first detect if
args.auto_quantize_bits is set and, if so, call auto_quantize(model, bits,
extra_disabled_layers=patterns) passing the vision/image/radio/visual patterns;
otherwise ensure quant_cfg has a "quant_cfg" dict (create one if missing) before
assigning the pattern keys and values. Also modify the auto_quantize(...)
function signature to accept an extra_disabled_layers param that merges with the
existing ["*lm_head*"] list and uses it when building disabled_layers so the
Nemotron VL disables those extra patterns during auto quantization.

if not model_is_already_quantized or calibration_only:
# Only run single sample for preview
input_ids = next(iter(calib_dataloader))[
"input_features" if model_type == "whisper" else "input_ids"
][0:1]
try:
generated_ids_before_ptq = full_model.generate(input_ids, max_new_tokens=100)
except Exception as e:
print(
"Error during model generation. Please check if your transformers version is "
"compatible with the model."

# Generate preview before quantization
if is_nemotron_vl and tokenizer is not None:
print("Running text-only preview generation for Nemotron VL model...")
Collaborator:
can you abstract lines 476-499 into a helper function? It can be re-used at 527-559.

question = tokenizer.decode(input_ids[0], skip_special_tokens=True)
generation_config = {
"max_new_tokens": 100,
"do_sample": False,
"eos_token_id": tokenizer.eos_token_id,
}

# Try text-only generation first, fall back to standard generate
text_response = run_text_only_generation(
full_model, tokenizer, question, generation_config, args.pyt_ckpt_path
)

if text_response is not None:
generated_ids_before_ptq = text_response
print(f"✅ Text-only generation successful: {text_response[:100]}...")
else:
print("Text-only generation failed, falling back to standard generate...")
generated_ids_before_ptq = full_model.generate(input_ids, max_new_tokens=100)

# Run additional VL test with images
print("Running additional VL test with images...")
run_vl_preview_generation(
full_model, tokenizer, args.pyt_ckpt_path, "before quantization (VL test)"
)
print(f"Error details: {e}")
raise
else:
# Standard generation for non-Nemotron VL models
generated_ids_before_ptq = full_model.generate(input_ids, max_new_tokens=100)
if model_type == "gptoss" and args.qformat == "nvfp4_mlp_only":
print("Applying nvfp4 quantization (MoE only) for gpt-oss")

# quantize the model
model = quantize_model(model, quant_cfg, args, calib_dataloader, calibration_only)

# For VL models, update full_model to use the quantized language model
if is_nemotron_vl:
_, parent_model = get_language_model_from_vl(full_model)
if parent_model is not None:
print("Updating full_model with quantized language_model...")
parent_model.language_model = model

if args.verbose:
mtq.print_quant_summary(model)

# Run some samples
torch.cuda.empty_cache()
generated_ids_after_ptq = None
if model_type != "llama4":
if model_type != "llama4" and not is_nemotron_vl:
# Our fake quantizer may not be fully compatible with torch.compile.
generated_ids_after_ptq = full_model.generate(input_ids, max_new_tokens=100)
elif is_nemotron_vl:
print("Running text-only preview generation for quantized Nemotron VL model...")
try:
# Try text-only generation using helper function that supports both v1 and v2
if tokenizer is None:
raise ValueError("Tokenizer is required for Nemotron VL text generation")

question = tokenizer.decode(input_ids[0], skip_special_tokens=True)
generation_config = {
"max_new_tokens": 100,
"do_sample": False,
"eos_token_id": tokenizer.eos_token_id,
}

# Use helper function that supports both v1 and v2 models
text_response = run_text_only_generation(
full_model, tokenizer, question, generation_config, args.pyt_ckpt_path
)

if text_response is not None:
generated_ids_after_ptq = text_response # Store text response
print(f"✅ Text-only generation successful: {text_response[:100]}...")
else:
generated_ids_after_ptq = None

except Exception as e:
print(f"Text-only generation failed: {e}")
generated_ids_after_ptq = None

# Run additional VL test with images
print("Running additional VL test with images...")
run_vl_preview_generation(
full_model, tokenizer, args.pyt_ckpt_path, "after quantization (VL test)"
)

else:
warnings.warn(
"Llama4 Maverick generation after quantization has a bug. Skipping generation sample."
@@ -518,15 +590,25 @@ def output_decode(generated_ids, input_shape):

if generated_ids_after_ptq is not None:
print("--------")
print(f"example test input: {input_decode(input_ids)}")
print("--------")
print(
f"example outputs before ptq: {output_decode(generated_ids_before_ptq, input_ids.shape[1])}"
)
print("--------")
print(
f"example outputs after ptq: {output_decode(generated_ids_after_ptq, input_ids.shape[1])}"
)
if is_nemotron_vl:
# For Nemotron VL models, generated_ids are text strings from model.chat()
print("Nemotron VL model text-only generation results:")
print(f"Text response before quantization: {generated_ids_before_ptq}")
print("--------")
print(f"Text response after quantization: {generated_ids_after_ptq}")
print("--------")
print("Note: Additional VL tests with images were run separately above")
else:
# For regular LLMs, generated_ids are token tensors that need decoding
print(f"example test input: {input_decode(input_ids)}")
print("--------")
print(
f"example outputs before ptq: {output_decode(generated_ids_before_ptq, input_ids.shape[1])}"
)
print("--------")
print(
f"example outputs after ptq: {output_decode(generated_ids_after_ptq, input_ids.shape[1])}"
)
else:
warnings.warn("Skipping quantization: model is already quantized.")

@@ -548,9 +630,12 @@ def output_decode(generated_ids, input_shape):
# Save original model config and the processor config to the export path for VLMs.
print(f"Saving original model config to {export_path}")

AutoConfig.from_pretrained(
args.pyt_ckpt_path, trust_remote_code=args.trust_remote_code
).save_pretrained(export_path)
config_kwargs = {"trust_remote_code": args.trust_remote_code}
if args.attn_implementation is not None:
config_kwargs["attn_implementation"] = args.attn_implementation
AutoConfig.from_pretrained(args.pyt_ckpt_path, **config_kwargs).save_pretrained(
export_path
)

# Try to save processor config if available
try:
@@ -748,7 +833,7 @@
parser.add_argument(
"--attn_implementation",
help=(
"Specify the attention implementation to use."
"Specify the attention implementation to use. "
"This arg will be passed to the HF model loading if specified."
),
default=None,