Merged
Commits (30 total; changes shown from 25 commits)
476b59f
Add option to benchmark pipeline in diffusion_trt.py (#457)
ajrasane Oct 22, 2025
b583d98
default attn_implementation to eager to avoid issues
Edwardf0t1 Sep 17, 2025
8b76102
add proper detection and handling for nemotron VL model in ptq examples
Edwardf0t1 Sep 19, 2025
89d207c
create fake vl inputs in export for nemotron VL model
Edwardf0t1 Sep 19, 2025
6991fdf
update fake inputs generation, initialize distributed for Nemotron mo…
Edwardf0t1 Sep 19, 2025
80fecf0
remove distributed processing setup and vision input generation since …
Edwardf0t1 Sep 20, 2025
15f0d61
special handling for nemotron VL preview generation in hf_ptq
Edwardf0t1 Sep 21, 2025
ae42b9b
fix mypy error
Edwardf0t1 Sep 21, 2025
587d427
add support for v2 model inference (.generate) with image inputs
Edwardf0t1 Oct 15, 2025
208cb9e
debug loading v2 converted nvfp4 weights from mcore
Edwardf0t1 Oct 17, 2025
f94558f
load scalers only for v2 fp4
Edwardf0t1 Oct 19, 2025
31c4f75
re-use existing vlm detection util function
Edwardf0t1 Oct 23, 2025
ec4a0ef
refactor and create a utils script for vlm
Edwardf0t1 Oct 23, 2025
5f0ea72
remove duplicated is_nemotron_vl usage
Edwardf0t1 Oct 23, 2025
60a698a
update
Edwardf0t1 Oct 23, 2025
446e135
add a util function to extract language model from VLM, update changelog
Edwardf0t1 Oct 23, 2025
f849c17
fix format
Edwardf0t1 Oct 23, 2025
96e1613
update
Edwardf0t1 Oct 23, 2025
c572513
update
Edwardf0t1 Oct 23, 2025
8e6dea3
update
Edwardf0t1 Oct 23, 2025
16bea91
WIP: local changes before pulling remote updates
Edwardf0t1 Oct 23, 2025
8e1d6cb
Increase gpu_tests timeout from 90 to 120 mins
kevalmorabia97 Oct 23, 2025
4561de9
revert torch_onnx.py
Edwardf0t1 Oct 24, 2025
57d388e
revert diffusion_trt.py
Edwardf0t1 Oct 24, 2025
f9b88fd
minor
Edwardf0t1 Oct 24, 2025
0e00954
update
Edwardf0t1 Oct 24, 2025
1a3bac1
update
Edwardf0t1 Oct 24, 2025
a4fa12d
update
Edwardf0t1 Oct 24, 2025
6216038
update
Edwardf0t1 Oct 24, 2025
4352ab6
update
Edwardf0t1 Oct 24, 2025
4 changes: 2 additions & 2 deletions .github/workflows/gpu_tests.yml
@@ -61,7 +61,7 @@ jobs:
if: needs.check-file-changes.outputs.any_changed == 'true'
# Runner list at https://github.com/nv-gha-runners/enterprise-runner-configuration/blob/main/docs/runner-groups.md
runs-on: linux-amd64-gpu-l4-latest-1
timeout-minutes: 90
timeout-minutes: 120
container: &gpu_container
image: nvcr.io/nvidia/pytorch:25.06-py3
env:
@@ -80,7 +80,7 @@ jobs:
if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
# Runner list at https://github.com/nv-gha-runners/enterprise-runner-configuration/blob/main/docs/runner-groups.md
runs-on: linux-amd64-gpu-h100-latest-1
timeout-minutes: 90
timeout-minutes: 120
container: *gpu_container
steps: *gpu_steps
gpu-pr-required-check:
1 change: 1 addition & 0 deletions CHANGELOG.rst
@@ -13,6 +13,7 @@ Model Optimizer Changelog (Linux)
- Allow specifying ``calib_seq`` in ``examples/llm_ptq`` to set the maximum sequence length for calibration.
- Add support for MCore MoE PTQ/QAT/QAD.
- Add support for multi-node PTQ and export with FSDP2 in ``examples/llm_ptq/multinode_ptq.py``. See `examples/llm_ptq/README.md <https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/main/examples/llm_ptq#multi-node-post-training-quantization-with-fsdp2>`_ for more details.
- Add support for Nemotron Nano VL v1 & v2 models in FP8/NVFP4 PTQ workflow.
Contributor:
⚠️ Potential issue | 🟡 Minor

Forward-dated release entry

0.39 (2025-11-07) is in the future (today is 2025-10-23). Please mark this as Unreleased/TBD to avoid confusion until the release is cut.

-0.39 (2025-11-07)
+0.39 (Unreleased)

Committable suggestion skipped: line range outside the PR's diff.

🤖 Prompt for AI Agents
In CHANGELOG.rst around line 16, the release entry "0.39 (2025-11-07)" is
forward-dated; change the header to indicate it is not yet released (e.g., "0.39
(Unreleased)" or "0.39 (TBD)") and leave the content line "Add support for
Nemotron Nano VL v1 & v2 models in FP8/NVFP4 PTQ workflow." under that
Unreleased/TBD heading so the changelog does not show a future date.


**Documentation**

49 changes: 44 additions & 5 deletions examples/llm_ptq/example_utils.py
@@ -39,6 +39,30 @@
SPECULATIVE_MODEL_LIST = ["Eagle", "Medusa"]


def _is_multimodal_config(config):
"""Check if a config indicates a multimodal model (config-only version of is_multimodal_model)."""
return (
hasattr(config, "vision_config") # Standard vision config (e.g., Qwen2.5-VL)
or getattr(config, "model_type", "") == "phi4mm" # Phi-4 multimodal
or hasattr(config, "vision_lora") # Vision LoRA configurations
or hasattr(config, "audio_processor") # Audio processing capabilities
or (
hasattr(config, "embd_layer") and hasattr(config.embd_layer, "image_embd_layer")
) # Image embedding layers
)


def is_nemotron_vl_model(model):
"""Check if model is a Nemotron VL model based on config architectures."""
from modelopt.torch.export.model_utils import is_multimodal_model

if not is_multimodal_model(model):
return False

architectures = getattr(model.config, "architectures", [])
return any("nemotron" in arch.lower() for arch in architectures)


def build_quant_cfg(
qformat,
kv_cache_qformat,
@@ -185,7 +209,21 @@ def get_model(
if device == "cpu":
device_map = "cpu"

# Prepare config kwargs for loading
config_kwargs = {"trust_remote_code": trust_remote_code} if trust_remote_code else {}

# Load config once and handle VL model detection
try:
hf_config = AutoConfig.from_pretrained(ckpt_path, **config_kwargs)
if _is_multimodal_config(hf_config):
Collaborator:
is the following code nemotron specific? I don't think we have to handle it for other VL models.

Contributor Author:
Yes, we can refactor it in a follow-up PR. There are a couple of VLM-model-specific code paths (e.g., mllama, vila).

Collaborator:
please use is_nemotron_vl_model here. Other VLMs do not need this.

print(
"Detected vision-language model from config. "
"Disabling automatic device mapping to avoid device_map errors."
)
device_map = None
except Exception as e:
print(f"Error: Could not load config from {ckpt_path}: {e}")
raise RuntimeError(f"Failed to load model configuration from {ckpt_path}") from e
if attn_implementation is not None:
config_kwargs["attn_implementation"] = attn_implementation

@@ -207,11 +245,6 @@
)
model = hf_vila.llm
else:
hf_config = AutoConfig.from_pretrained(
ckpt_path,
**config_kwargs,
)

if use_seq_device_map:
device_map = "sequential"
# If we use sequential, set max_memory limit to ensure that the model does not occupy the full GPU
@@ -282,6 +315,12 @@ def get_model(
**model_kwargs,
)
model.eval()

# If device_map was disabled (None), manually move model to target device
if device_map is None and device != "cpu":
Collaborator:
what if device == "cpu"

Contributor Author:
That was handled by HF's device_map="cpu" in L210.

print(f"Moving model to {device} device...")
model = model.to(device)

if device == "cuda" and not is_model_on_gpu(model):
print("Warning: Some parameters are not on a GPU. Calibration can be slow or hit OOM")

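Following the review thread above (gate the device_map workaround on Nemotron VL models only, and leave device == "cpu" on HF's device_map="cpu" path), a minimal config-only sketch could look like the following. The helper names _is_nemotron_vl_config and _resolve_device_map are illustrative and not part of this PR; the sketch assumes the _is_multimodal_config helper added in this file.

from transformers import AutoConfig

def _is_nemotron_vl_config(config) -> bool:
    """Config-only Nemotron VL check; reuses _is_multimodal_config defined above."""
    architectures = getattr(config, "architectures", None) or []
    return _is_multimodal_config(config) and any(
        "nemotron" in arch.lower() for arch in architectures
    )

def _resolve_device_map(ckpt_path: str, device: str, trust_remote_code: bool = False):
    """Keep HF's "cpu" mapping; disable automatic device mapping only for Nemotron VL."""
    if device == "cpu":
        return "cpu"
    hf_config = AutoConfig.from_pretrained(ckpt_path, trust_remote_code=trust_remote_code)
    if _is_nemotron_vl_config(hf_config):
        # The caller then moves the loaded model to `device` manually, as in the diff above.
        return None
    return "auto"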
151 changes: 118 additions & 33 deletions examples/llm_ptq/hf_ptq.py
@@ -30,6 +30,7 @@
get_processor,
get_tokenizer,
is_enc_dec,
is_nemotron_vl_model,
)
from transformers import (
AutoConfig,
@@ -39,6 +40,7 @@
PreTrainedTokenizerFast,
WhisperProcessor,
)
from vlm_utils import run_text_only_generation, run_vl_preview_generation

import modelopt.torch.opt as mto
import modelopt.torch.quantization as mtq
@@ -48,7 +50,7 @@
export_tensorrt_llm_checkpoint,
get_model_type,
)
from modelopt.torch.export.model_utils import is_multimodal_model
from modelopt.torch.export.model_utils import get_language_model_from_vl, is_multimodal_model
from modelopt.torch.quantization.config import need_calibration
from modelopt.torch.quantization.plugins.accelerate import init_quantized_weights
from modelopt.torch.quantization.utils import is_quantized
@@ -283,6 +285,9 @@ def main(args):

full_model = model

# Detect if this is a Nemotron VL model using architecture-based detection
is_nemotron_vl = is_nemotron_vl_model(full_model)

if model_type == "mllama":
processor = get_processor(
args.pyt_ckpt_path,
@@ -312,15 +317,8 @@
tokenizer.padding_side = "left"

# We only quantize the language model for VLMs other than the type supported above.
if hasattr(model, "language_model"):
parent_model = model # llama4 case
if isinstance(type(model).__dict__.get("language_model"), property):
assert hasattr(model, "model") and hasattr(model.model, "language_model"), (
"Expected language_model in model.model, but attribute not found. "
"This may indicate an unsupported model structure."
)
parent_model = model.model # gemma3, qwen2.5 VL case

language_model, parent_model = get_language_model_from_vl(model)
if language_model is not None:
disabled_quant_cfg = {
"quant_cfg": {"default": {"enable": False}},
"algorithm": "max",
Expand All @@ -331,7 +329,7 @@ def main(args):
if name != "language_model":
mtq.quantize(child, disabled_quant_cfg, forward_loop=None)

model = model.language_model
model = language_model
model_type = get_model_type(model)

if model_type == "phi4mm":
@@ -458,34 +456,108 @@ def main(args):
KV_QUANT_CFG_CHOICES,
)

# For Nemotron VL models, disable quantization of vision components
if is_nemotron_vl:
print("Disabling quantization for vision components in Nemotron VL model")
quant_cfg["quant_cfg"]["*vision*"] = {"enable": False}
quant_cfg["quant_cfg"]["*image*"] = {"enable": False}
# Also disable radio model components specifically
quant_cfg["quant_cfg"]["*radio*"] = {"enable": False}
quant_cfg["quant_cfg"]["*visual*"] = {"enable": False}

Comment on lines 459 to 467
Contributor:
⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

Bug: mutating quant_cfg when auto_quantize_bits is set (quant_cfg={})

build_quant_cfg returns {} for auto-quant mode, so writing quant_cfg["quant_cfg"][...] raises KeyError and vision modules aren’t disabled. In auto-quant, disable modules via disabled_layers.

Apply this diff to cover both code paths:

-        if is_nemotron_vl:
-            print("Disabling quantization for vision components in Nemotron VL model")
-            quant_cfg["quant_cfg"]["*vision*"] = {"enable": False}
-            quant_cfg["quant_cfg"]["*image*"] = {"enable": False}
-            # Also disable radio model components specifically
-            quant_cfg["quant_cfg"]["*radio*"] = {"enable": False}
-            quant_cfg["quant_cfg"]["*visual*"] = {"enable": False}
+        if is_nemotron_vl:
+            print("Disabling quantization for vision components in Nemotron VL model")
+            nemotron_disabled = {
+                "*vision*": {"enable": False},
+                "*image*": {"enable": False},
+                "*radio*": {"enable": False},
+                "*visual*": {"enable": False},
+            }
+            if not args.auto_quantize_bits:
+                quant_cfg.setdefault("quant_cfg", {}).update(nemotron_disabled)
+            else:
+                # Pass patterns to auto_quant to skip these modules during search
+                extra_disabled_patterns = list(nemotron_disabled.keys())

And extend auto_quantize() to accept these patterns:
(addition outside this hunk)

 def auto_quantize(
-    model, qformat, auto_quantize_bits, calib_dataloader, calibrate_loop, batch_size=1
+    model, qformat, auto_quantize_bits, calib_dataloader, calibrate_loop, batch_size=1,
+    extra_disabled_layers=None,
 ):
@@
-        disabled_layers=["*lm_head*"],
+        disabled_layers=(["*lm_head*"] + (extra_disabled_layers or [])),

Finally, pass extra_disabled_patterns when calling auto_quantize():
(addition near the call site)

-        model = auto_quantize(
+        model = auto_quantize(
             model,
             args.qformat,
             args.auto_quantize_bits,
             calib_dataloader,
             calibrate_loop,
-            args.batch_size,
+            args.batch_size,
+            extra_disabled_layers=(extra_disabled_patterns if is_nemotron_vl else None),
         )

Please run a short auto-quant session on a Nemotron VL to confirm vision blocks remain unquantized.


I'll verify the concerns in this review comment by examining the codebase structure and the specific functions involved.

I'll search the codebase to verify the claims about the build_quant_cfg return value and the auto_quantize function signature.


🏁 Script executed:

#!/bin/bash
# Find and examine the relevant function definitions and code
echo "=== Checking file structure and examining hf_ptq.py ==="
fd "hf_ptq.py" --type f

echo -e "\n=== Examining lines 460-480 to see context ==="
sed -n '460,480p' examples/llm_ptq/hf_ptq.py

echo -e "\n=== Finding build_quant_cfg function ==="
rg -n "def build_quant_cfg" --type py

echo -e "\n=== Finding auto_quantize function signature ==="
rg -n "def auto_quantize" --type py -A 5

echo -e "\n=== Finding calls to auto_quantize ==="
rg -n "auto_quantize\(" --type py -B 2 -A 5

echo -e "\n=== Searching for args.auto_quantize_bits usage ==="
rg -n "auto_quantize_bits" examples/llm_ptq/hf_ptq.py --type py -B 2 -A 2

Length of output: 19066


🏁 Script executed:

#!/bin/bash
# Examine the key functions and usage patterns

echo "=== Lines 460-480 in hf_ptq.py ==="
sed -n '460,480p' examples/llm_ptq/hf_ptq.py

echo -e "\n=== Finding build_quant_cfg function definition ==="
rg -n "def build_quant_cfg" --type py -A 20

echo -e "\n=== Finding auto_quantize function definition ==="
rg -n "def auto_quantize" --type py -A 10

echo -e "\n=== Finding calls to auto_quantize ==="
rg -n "model = auto_quantize" --type py -B 3 -A 8

echo -e "\n=== Checking quant_cfg initialization and conditional branches ==="
rg -n "if.*auto_quantize_bits" examples/llm_ptq/hf_ptq.py --type py -B 3 -A 8

Length of output: 10706



🏁 Script executed:

#!/bin/bash
# Get complete build_quant_cfg function
echo "=== Full build_quant_cfg function ==="
sed -n '55,100p' examples/llm_ptq/example_utils.py

echo -e "\n=== Check control flow around Nemotron VL code (lines 450-480) ==="
sed -n '450,480p' examples/llm_ptq/hf_ptq.py

echo -e "\n=== Full auto_quantize function signature in hf_ptq.py ==="
sed -n '96,120p' examples/llm_ptq/hf_ptq.py

echo -e "\n=== Check disabled_layers usage in auto_quantize ==="
rg -n "disabled_layers" examples/llm_ptq/hf_ptq.py -B 2 -A 2

Length of output: 4324


Bug confirmed: KeyError when accessing quant_cfg["quant_cfg"] in auto-quantize mode

The issue is real. When args.auto_quantize_bits is set, build_quant_cfg() returns an empty dict {} (examples/llm_ptq/example_utils.py:64-65). The Nemotron VL code at lines 466-474 unconditionally accesses quant_cfg["quant_cfg"][...] without checking if the key exists, causing a KeyError. Additionally, the auto_quantize() function (line 96) has no mechanism to accept and pass through extra disabled layer patterns—disabled_layers is hardcoded to ["*lm_head*"] at line 135.

The proposed diff correctly addresses this by:

  1. Adding conditional logic to handle both auto-quant and non-auto-quant paths
  2. Extending auto_quantize() to accept extra_disabled_layers parameter
  3. Passing the Nemotron VL patterns when calling auto_quantize()
🤖 Prompt for AI Agents
In examples/llm_ptq/hf_ptq.py around lines 466 to 474, the code unconditionally
indexes quant_cfg["quant_cfg"] which raises KeyError in auto-quantize mode
(build_quant_cfg returns {}); update the Nemotron VL branch to first detect if
args.auto_quantize_bits is set and, if so, call auto_quantize(model, bits,
extra_disabled_layers=patterns) passing the vision/image/radio/visual patterns;
otherwise ensure quant_cfg has a "quant_cfg" dict (create one if missing) before
assigning the pattern keys and values. Also modify the auto_quantize(...)
function signature to accept an extra_disabled_layers param that merges with the
existing ["*lm_head*"] list and uses it when building disabled_layers so the
Nemotron VL disables those extra patterns during auto quantization.

if not model_is_already_quantized or calibration_only:
# Only run single sample for preview
input_ids = next(iter(calib_dataloader))[
"input_features" if model_type == "whisper" else "input_ids"
][0:1]
try:
generated_ids_before_ptq = full_model.generate(input_ids, max_new_tokens=100)
except Exception as e:
print(
"Error during model generation. Please check if your transformers version is "
"compatible with the model."

# Generate preview before quantization
if is_nemotron_vl and tokenizer is not None:
print("Running text-only preview generation for Nemotron VL model...")
Collaborator:
can you abstract lines 476-499 into a helper function? It can be re-used at 527-559.

question = tokenizer.decode(input_ids[0], skip_special_tokens=True)
generation_config = {
"max_new_tokens": 100,
"do_sample": False,
"eos_token_id": tokenizer.eos_token_id,
}

# Try text-only generation first, fall back to standard generate
text_response = run_text_only_generation(
full_model, tokenizer, question, generation_config, args.pyt_ckpt_path
)

if text_response is not None:
generated_ids_before_ptq = text_response
print(f"✅ Text-only generation successful: {text_response[:100]}...")
else:
print("Text-only generation failed, falling back to standard generate...")
generated_ids_before_ptq = full_model.generate(input_ids, max_new_tokens=100)

# Run additional VL test with images
print("Running additional VL test with images...")
run_vl_preview_generation(
full_model, tokenizer, args.pyt_ckpt_path, "before quantization (VL test)"
)
print(f"Error details: {e}")
raise
else:
# Standard generation for non-Nemotron VL models
generated_ids_before_ptq = full_model.generate(input_ids, max_new_tokens=100)
if model_type == "gptoss" and args.qformat == "nvfp4_mlp_only":
print("Applying nvfp4 quantization (MoE only) for gpt-oss")

# quantize the model
model = quantize_model(model, quant_cfg, args, calib_dataloader, calibration_only)

# For VL models, update full_model to use the quantized language model
if is_nemotron_vl:
_, parent_model = get_language_model_from_vl(full_model)
if parent_model is not None:
print("Updating full_model with quantized language_model...")
parent_model.language_model = model

if args.verbose:
mtq.print_quant_summary(model)

# Run some samples
torch.cuda.empty_cache()
generated_ids_after_ptq = None
if model_type != "llama4":
if model_type != "llama4" and not is_nemotron_vl:
# Our fake quantizer may not be fully compatible with torch.compile.
generated_ids_after_ptq = full_model.generate(input_ids, max_new_tokens=100)
elif is_nemotron_vl:
print("Running text-only preview generation for quantized Nemotron VL model...")
try:
# Try text-only generation using helper function that supports both v1 and v2
if tokenizer is None:
raise ValueError("Tokenizer is required for Nemotron VL text generation")

question = tokenizer.decode(input_ids[0], skip_special_tokens=True)
generation_config = {
"max_new_tokens": 100,
"do_sample": False,
"eos_token_id": tokenizer.eos_token_id,
}

# Use helper function that supports both v1 and v2 models
text_response = run_text_only_generation(
full_model, tokenizer, question, generation_config, args.pyt_ckpt_path
)

if text_response is not None:
generated_ids_after_ptq = text_response # Store text response
print(f"✅ Text-only generation successful: {text_response[:100]}...")
else:
generated_ids_after_ptq = None

except Exception as e:
print(f"Text-only generation failed: {e}")
generated_ids_after_ptq = None

# Run additional VL test with images
print("Running additional VL test with images...")
run_vl_preview_generation(
full_model, tokenizer, args.pyt_ckpt_path, "after quantization (VL test)"
)

else:
warnings.warn(
"Llama4 Maverick generation after quantization has a bug. Skipping generation sample."
@@ -518,15 +590,25 @@ def output_decode(generated_ids, input_shape):

if generated_ids_after_ptq is not None:
print("--------")
print(f"example test input: {input_decode(input_ids)}")
print("--------")
print(
f"example outputs before ptq: {output_decode(generated_ids_before_ptq, input_ids.shape[1])}"
)
print("--------")
print(
f"example outputs after ptq: {output_decode(generated_ids_after_ptq, input_ids.shape[1])}"
)
if is_nemotron_vl:
# For Nemotron VL models, generated_ids are text strings from model.chat()
print("Nemotron VL model text-only generation results:")
print(f"Text response before quantization: {generated_ids_before_ptq}")
print("--------")
print(f"Text response after quantization: {generated_ids_after_ptq}")
print("--------")
print("Note: Additional VL tests with images were run separately above")
else:
# For regular LLMs, generated_ids are token tensors that need decoding
print(f"example test input: {input_decode(input_ids)}")
print("--------")
print(
f"example outputs before ptq: {output_decode(generated_ids_before_ptq, input_ids.shape[1])}"
)
print("--------")
print(
f"example outputs after ptq: {output_decode(generated_ids_after_ptq, input_ids.shape[1])}"
)
else:
warnings.warn("Skipping quantization: model is already quantized.")

@@ -548,9 +630,12 @@ def output_decode(generated_ids, input_shape):
# Save original model config and the processor config to the export path for VLMs.
print(f"Saving original model config to {export_path}")

AutoConfig.from_pretrained(
args.pyt_ckpt_path, trust_remote_code=args.trust_remote_code
).save_pretrained(export_path)
config_kwargs = {"trust_remote_code": args.trust_remote_code}
if args.attn_implementation is not None:
config_kwargs["attn_implementation"] = args.attn_implementation
AutoConfig.from_pretrained(args.pyt_ckpt_path, **config_kwargs).save_pretrained(
export_path
)

# Try to save processor config if available
try:
@@ -748,7 +833,7 @@
parser.add_argument(
"--attn_implementation",
help=(
"Specify the attention implementation to use."
"Specify the attention implementation to use. "
"This arg will be passed to the HF model loading if specified."
),
default=None,