Commit 446e135

add a util function to extract language model from VLM, update changelog
Signed-off-by: Zhiyu Cheng <[email protected]>
1 parent 60a698a

5 files changed: +64 -29 lines


CHANGELOG.rst

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@ Model Optimizer Changelog (Linux)
 - Allow specifying ``calib_seq`` in ``examples/llm_ptq`` to set the maximum sequence length for calibration.
 - Add support for MCore MoE PTQ/QAT/QAD.
 - Add support for multi-node PTQ and export with FSDP2 in ``examples/llm_ptq/multinode_ptq.py``. See `examples/llm_ptq/README.md <https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/main/examples/llm_ptq#multi-node-post-training-quantization-with-fsdp2>`_ for more details.
+- Add support for Nemotron Nano VL v1 & v2 models in FP8/NVFP4 PTQ workflow.

 **Documentation**


examples/llm_ptq/hf_ptq.py

Lines changed: 9 additions & 14 deletions
@@ -50,7 +50,7 @@
     export_tensorrt_llm_checkpoint,
     get_model_type,
 )
-from modelopt.torch.export.model_utils import is_multimodal_model
+from modelopt.torch.export.model_utils import get_language_model_from_vl, is_multimodal_model
 from modelopt.torch.quantization.config import need_calibration
 from modelopt.torch.quantization.plugins.accelerate import init_quantized_weights
 from modelopt.torch.quantization.utils import is_quantized
@@ -317,15 +317,8 @@ def main(args):
         tokenizer.padding_side = "left"

     # We only quantize the language model for VLMs other than the type supported above.
-    if hasattr(model, "language_model"):
-        parent_model = model  # llama4 case
-        if isinstance(type(model).__dict__.get("language_model"), property):
-            assert hasattr(model, "model") and hasattr(model.model, "language_model"), (
-                "Expected language_model in model.model, but attribute not found. "
-                "This may indicate an unsupported model structure."
-            )
-            parent_model = model.model  # gemma3, qwen2.5 VL case
-
+    language_model, parent_model = get_language_model_from_vl(model)
+    if language_model is not None:
         disabled_quant_cfg = {
             "quant_cfg": {"default": {"enable": False}},
             "algorithm": "max",
@@ -336,7 +329,7 @@ def main(args):
            if name != "language_model":
                mtq.quantize(child, disabled_quant_cfg, forward_loop=None)

-        model = model.language_model
+        model = language_model
        model_type = get_model_type(model)

        if model_type == "phi4mm":
@@ -538,9 +531,11 @@ def main(args):
        model = quantize_model(model, quant_cfg, args, calib_dataloader, calibration_only)

        # For VL models, update full_model to use the quantized language model
-        if is_nemotron_vl and hasattr(full_model, "language_model"):
-            print("Updating full_model with quantized language_model...")
-            full_model.language_model = model
+        if is_nemotron_vl:
+            _, parent_model = get_language_model_from_vl(full_model)
+            if parent_model is not None:
+                print("Updating full_model with quantized language_model...")
+                parent_model.language_model = model

    if args.verbose:
        mtq.print_quant_summary(model)
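
Taken together, these hunks collapse the old attribute probing in hf_ptq.py into a single helper call. The sketch below condenses the resulting flow; it is not the script's real argument plumbing — `quantize_vl_language_model` and `calib_loop` are names invented for illustration, while `get_language_model_from_vl`, `mtq.quantize`, and the disabled-quantization config mirror the diff:

import modelopt.torch.quantization as mtq
from modelopt.torch.export.model_utils import get_language_model_from_vl

def quantize_vl_language_model(model, quant_cfg, calib_loop):
    # Locate the language model and its parent, regardless of VLM layout.
    language_model, parent_model = get_language_model_from_vl(model)
    if language_model is None:
        return model  # no separable language model; handle elsewhere

    # Explicitly disable quantization on every sibling component
    # (vision tower, multimodal projector, ...), as the script does.
    disabled_quant_cfg = {
        "quant_cfg": {"default": {"enable": False}},
        "algorithm": "max",
    }
    for name, child in parent_model.named_children():
        if name != "language_model":
            mtq.quantize(child, disabled_quant_cfg, forward_loop=None)

    # Quantize only the language model, then write it back so the
    # full model reflects the quantized component.
    mtq.quantize(language_model, quant_cfg, forward_loop=calib_loop)
    parent_model.language_model = language_model
    return model

Writing the quantized module back through `parent_model` (rather than `full_model.language_model = ...`) is the point of returning the parent from the helper: for property-based wrappers, assignment on the wrapper would fail or be silently shadowed.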

examples/llm_ptq/vlm_utils.py

Lines changed: 2 additions & 0 deletions
@@ -231,3 +231,5 @@ def run_text_only_generation(model, tokenizer, question, generation_config, mode
     except Exception as e:
         print(f"Text-only generation failed: {e}")
         return None
+
+

modelopt/torch/export/model_utils.py

Lines changed: 47 additions & 1 deletion
@@ -60,7 +60,7 @@
     {MODEL_NAME_TO_TYPE=}
 """

-__all__ = ["get_model_type", "is_multimodal_model"]
+__all__ = ["get_model_type", "is_multimodal_model", "get_language_model_from_vl"]


 def get_model_type(model):
@@ -109,3 +109,49 @@ def is_multimodal_model(model):
             hasattr(config, "embd_layer") and hasattr(config.embd_layer, "image_embd_layer")
         )  # Image embedding layers
     )
+
+
+def get_language_model_from_vl(model):
+    """Extract the language model component from a Vision-Language Model (VLM).
+
+    This function handles the common patterns for accessing the language model component
+    in various VLM architectures. It checks multiple possible locations where the
+    language model might be stored.
+
+    Args:
+        model: The VLM model instance to extract the language model from
+
+    Returns:
+        tuple: (language_model, parent_model) where:
+            - language_model: The extracted language model component, or None if not found
+            - parent_model: The parent model containing the language_model attribute
+
+    Examples:
+        >>> # For LLaVA-style models
+        >>> lang_model, parent = get_language_model_from_vl(vlm_model)
+        >>> if lang_model is not None:
+        ...     # Work with the language model component
+        ...     quantized_lang_model = quantize(lang_model)
+        ...     # Update the parent model
+        ...     parent.language_model = quantized_lang_model
+    """
+    # Pattern 1: Direct language_model attribute (e.g., LLaVA, some Nemotron models)
+    if hasattr(model, "language_model"):
+        # Check if it's a property that might need special handling
+        if isinstance(type(model).__dict__.get("language_model"), property):
+            # Some models have language_model as a property that points to model.model.language_model
+            if hasattr(model, "model") and hasattr(model.model, "language_model"):
+                return model.model.language_model, model.model
+            else:
+                # Property exists but no nested structure found
+                return model.language_model, model
+        else:
+            # Direct attribute access
+            return model.language_model, model

+    # Pattern 2: Nested in model.model.language_model (e.g., some Gemma3, Qwen2.5-VL models)
+    elif hasattr(model, "model") and hasattr(model.model, "language_model"):
+        return model.model.language_model, model.model
+
+    # Pattern 3: No language_model found
+    return None, None
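
The property branch is the subtle part: some wrappers expose `language_model` as a read-only property that forwards to `model.language_model`, so any assignment must target the nested parent. A toy illustration of the two layouts the helper distinguishes — the class names here are invented for this sketch, not taken from the repo:

import torch.nn as nn
from modelopt.torch.export.model_utils import get_language_model_from_vl

class _Inner(nn.Module):
    def __init__(self):
        super().__init__()
        self.language_model = nn.Linear(8, 8)

class DirectAttr(nn.Module):
    # Pattern 1: language_model is a plain child module (llama4-style).
    def __init__(self):
        super().__init__()
        self.language_model = nn.Linear(8, 8)

class PropertyWrapper(nn.Module):
    # Property case (gemma3 / qwen2.5-VL-style): language_model forwards
    # to model.language_model, so the assignable parent is the nested module.
    def __init__(self):
        super().__init__()
        self.model = _Inner()

    @property
    def language_model(self):
        return self.model.language_model

lm, parent = get_language_model_from_vl(DirectAttr())
assert lm is parent.language_model  # parent is the wrapper itself

lm, parent = get_language_model_from_vl(PropertyWrapper())
assert isinstance(parent, _Inner)  # assignments go to the nested parent

Note that `type(model).__dict__.get("language_model")` inspects the class, not the instance: plain submodules live in the instance's `_modules`, so only a genuine class-level property triggers the nested lookup.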

modelopt/torch/export/unified_export_hf.py

Lines changed: 5 additions & 14 deletions
@@ -43,6 +43,7 @@
     is_quantlinear,
     set_expert_quantizer_amax,
 )
+from .model_utils import get_language_model_from_vl, is_multimodal_model
 from .model_config import (
     KV_CACHE_FP8,
     KV_CACHE_NVFP4,
@@ -136,11 +137,7 @@ def _output_hook(module, input, output):
         decoder_fake_input = fake_input

     # Check if this is a VL model that needs special input handling
-    is_vl_model = (
-        hasattr(model.config, "vision_config")
-        or hasattr(model, "vision_model")
-        or "nemotron" in getattr(model, "name_or_path", "").lower()
-    )
+    is_vl_model = is_multimodal_model(model)

     if model_type.startswith("whisper"):
         # For Whisper models, we need to pass a fake input with the specific sequence length
@@ -159,16 +156,10 @@ def _output_hook(module, input, output):
        model(fake_input, decoder_input_ids=decoder_fake_input)
    elif is_vl_model:
        # For VL models, try to run optimization on just the language model part
-        language_model = None
-        if hasattr(model, "language_model"):
-            language_model = model.language_model
-            print(
-                "Found language_model attribute - running optimization on language model only"
-            )
-        elif hasattr(model, "model") and hasattr(model.model, "language_model"):
-            language_model = model.model.language_model
+        language_model, _ = get_language_model_from_vl(model)
+        if language_model is not None:
            print(
-                "Found language_model in model.model - running optimization on language model only"
+                "Found language_model component - running optimization on language model only"
            )

        if language_model is not None:
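
In the export path the same helper replaces a second copy of the probing logic, and VL detection now goes through the shared `is_multimodal_model` check instead of the old ad-hoc heuristic (config attributes plus a "nemotron" name match). A condensed sketch of the resulting control flow — `resolve_optimization_target` is a name invented for illustration, not a function in the diff:

from modelopt.torch.export.model_utils import (
    get_language_model_from_vl,
    is_multimodal_model,
)

def resolve_optimization_target(model):
    # Non-multimodal models are optimized as a whole.
    if not is_multimodal_model(model):
        return model
    # For VL models, narrow to the language model when one can be found.
    language_model, _ = get_language_model_from_vl(model)
    if language_model is not None:
        print("Found language_model component - running optimization on language model only")
        return language_model
    # VL model without a separable language model: fall back to the full model.
    return model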
