
Commit 40afd8f

Merge branch 'main' into QAT-Walkthrough-Notebook
2 parents e0caa1c + d5c88e7 commit 40afd8f

File tree: 15 files changed (+439, −189 lines)


.github/workflows/gpu_tests.yml

Lines changed: 21 additions & 1 deletion
@@ -22,20 +22,31 @@ jobs:
       any_changed: ${{ steps.changed-tests.outputs.any_changed }}
     steps:
       - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
       - id: get-pr-info
         uses: nv-gha-runners/get-pr-info@main
+      # Get commit from main branch that is present in the PR to use as base for changed files
+      - id: calculate-merge-base
+        env:
+          PR_SHA: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).head.sha }}
+          BASE_SHA: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }}
+        run: |
+          (echo -n "merge-base="; git merge-base "$BASE_SHA" "$PR_SHA") | tee --append "${GITHUB_OUTPUT}"
       - name: Check for changes in test-relevant directories
         id: changed-tests
         uses: step-security/[email protected]
         with:
+          base_sha: ${{ steps.calculate-merge-base.outputs.merge-base }}
+          sha: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).head.sha }}
           files: |
             .github/workflows/gpu_tests.yml
             modelopt/**
             tests/gpu/**
             tox.ini
             pyproject.toml
             setup.py
-          base_sha: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.ref }}
+          fail_on_initial_diff_error: true
   wait-checks:
     needs: [check-file-changes]
     if: needs.check-file-changes.outputs.any_changed == 'true'
@@ -70,3 +81,12 @@ jobs:
     timeout-minutes: 90
     container: *gpu_container
     steps: *gpu_steps
+  gpu-pr-required-check:
+    # Run even if gpu-tests-pr is skipped
+    if: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && always() }}
+    needs: [check-file-changes, gpu-tests-pr]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Required GPU tests did not succeed
+        if: ${{ needs.check-file-changes.result != 'success' || (needs.check-file-changes.outputs.any_changed == 'true' && needs.gpu-tests-pr.result != 'success') }}
+        run: exit 1
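
For reference, a minimal Python sketch of what the new calculate-merge-base step computes (the workflow step runs git merge-base in shell; this equivalent assumes a full clone, which is why the checkout step now sets fetch-depth: 0, and the refs in the commented call are placeholders):

import subprocess

def merge_base(base_sha: str, pr_sha: str) -> str:
    # Same result as the workflow's: git merge-base "$BASE_SHA" "$PR_SHA"
    result = subprocess.run(
        ["git", "merge-base", base_sha, pr_sha],
        check=True,
        capture_output=True,
        text=True,
    )
    return result.stdout.strip()

# The returned commit becomes the base_sha that changed-files diffs against,
# instead of the moving tip of the base branch.
# print(merge_base("origin/main", "HEAD"))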

.github/workflows/unit_tests.yml

Lines changed: 6 additions & 0 deletions
@@ -126,3 +126,9 @@ jobs:
           python-version: "3.12"
       - name: Run unit tests
         run: pip install tox && tox -e py312-partial-unit-${{ matrix.test-env }}
+  unit-pr-required-check:
+    if: github.event_name == 'pull_request'
+    needs: [linux, windows, multi-py, multi-torch, multi-transformers, partial-install]
+    runs-on: ubuntu-latest
+    steps:
+      - run: echo "All PR unit test jobs completed"

.gitlab/tests.yml

Lines changed: 4 additions & 4 deletions
@@ -2,10 +2,10 @@
 .tests-default:
   stage: tests
   rules:
-    - if: $JET_ONLY != null
-      when: never
-    - if: $CI_COMMIT_TAG =~ /^\d+\.\d+\.\d+$/
-    - if: $CI_PIPELINE_SOURCE == "web" || $CI_PIPELINE_SOURCE == "schedule"
+    - if: $CI_PIPELINE_SOURCE == "schedule"
+      when: always
+    - if: $CI_PIPELINE_SOURCE != "schedule"
+      when: manual
 
 ##### Unit Tests #####
 unit:

examples/llm_ptq/hf_ptq.py

Lines changed: 17 additions & 7 deletions
@@ -25,7 +25,9 @@
 from accelerate.hooks import remove_hook_from_module
 from example_utils import apply_kv_cache_quant, get_model, get_processor, get_tokenizer, is_enc_dec
 from transformers import (
+    AutoConfig,
     AutoModelForCausalLM,
+    AutoProcessor,
     PreTrainedTokenizer,
     PreTrainedTokenizerFast,
     WhisperProcessor,
@@ -39,6 +41,7 @@
     export_tensorrt_llm_checkpoint,
     get_model_type,
 )
+from modelopt.torch.export.model_utils import is_multimodal_model
 from modelopt.torch.quantization.config import need_calibration
 from modelopt.torch.quantization.plugins.accelerate import init_quantized_weights
 from modelopt.torch.quantization.utils import is_quantized
@@ -567,19 +570,26 @@ def output_decode(generated_ids, input_shape):
 
     export_path = args.export_path
 
-    if hasattr(full_model, "language_model"):
-        # Save original model config and the preprocessor config to the export path for VLMs.
-        from transformers import AutoConfig, AutoProcessor
+    # Check if the model is a multimodal/VLM model
+    is_vlm = is_multimodal_model(full_model)
 
-        print(f"Saving original model and processor configs to {export_path}")
+    if is_vlm:
+        # Save original model config and the processor config to the export path for VLMs.
+        print(f"Saving original model config to {export_path}")
 
         AutoConfig.from_pretrained(
             args.pyt_ckpt_path, trust_remote_code=args.trust_remote_code
         ).save_pretrained(export_path)
 
-        AutoProcessor.from_pretrained(
-            args.pyt_ckpt_path, trust_remote_code=args.trust_remote_code
-        ).save_pretrained(export_path)
+        # Try to save processor config if available
+        try:
+            print(f"Saving processor config to {export_path}")
+            AutoProcessor.from_pretrained(
+                args.pyt_ckpt_path, trust_remote_code=args.trust_remote_code
+            ).save_pretrained(export_path)
+        except Exception as e:
+            print(f"Warning: Could not save processor config: {e}")
+            print("This is normal for some VLM architectures that don't use AutoProcessor")
 
     if model_type == "mllama":
         full_model_config = model.config
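
The gist of the new export path, condensed into a hedged standalone sketch (save_vlm_configs is a hypothetical helper, not part of hf_ptq.py): the model config is always saved, while the processor save is best-effort because some VLM architectures ship no AutoProcessor mapping.

from transformers import AutoConfig, AutoProcessor

def save_vlm_configs(ckpt_path: str, export_path: str, trust_remote_code: bool = False) -> None:
    # Always save the original model config next to the quantized export.
    AutoConfig.from_pretrained(ckpt_path, trust_remote_code=trust_remote_code).save_pretrained(export_path)
    try:
        # Best-effort: architectures without an AutoProcessor entry raise here.
        AutoProcessor.from_pretrained(ckpt_path, trust_remote_code=trust_remote_code).save_pretrained(export_path)
    except Exception as err:
        print(f"Warning: could not save processor config: {err}")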

modelopt/onnx/quantization/qdq_utils.py

Lines changed: 9 additions & 4 deletions
@@ -790,8 +790,10 @@ def remove_input_dq_and_output_q(
                     if cons_idx in quantizable_custom_ops[consumer.op_type]["inp"]:
                         consumer.input[cons_idx] = q_node.output[0]
                     else:
-                        q_node_prev = tensor_producers[q_node.input[0]]
-                        consumer.input[cons_idx] = q_node_prev.output[0]
+                        q_node_prev = tensor_producers.get(q_node.input[0], None)
+                        consumer.input[cons_idx] = (
+                            q_node_prev.output[0] if q_node_prev else q_node.input[0]
+                        )
                     break
 
             # Track DequantizeLinear node indices for cleanup
@@ -828,8 +830,11 @@ def remove_input_dq_and_output_q(
                     if quantizable_custom_ops[producer.op_type]["out"]:
                         dq_node[0].input[0] = producer.output[0]
                     else:
-                        dq_node_next = tensor_consumers[dq_node[0].output[0]]
-                        dq_node_next[0].input[0] = producer.output[0]
+                        dq_node_next = tensor_consumers.get(dq_node[0].output[0], None)
+                        if dq_node_next:
+                            dq_node_next[0].input[0] = producer.output[0]
+                        else:
+                            dq_node[0].input[0] = producer.output[0]
 
             # Track QuantizeLinear node indices for cleanup
             q_indices.append(node_idx)
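
A self-contained sketch of the fallback introduced above: when a Q/DQ node's neighbor tensor has no entry in the producer/consumer maps (for example, the tensor is a graph input or output), the rewiring now falls back to the original tensor name instead of raising a KeyError. All names below are illustrative, not taken from the real graph utilities.

from types import SimpleNamespace

# tensor name -> node that produces it (graph inputs have no producer entry)
prev_node = SimpleNamespace(output=["prev_node_out"])
tensor_producers = {"prev_node_out": prev_node}

def resolve_rewire_target(tensor_name: str) -> str:
    producer = tensor_producers.get(tensor_name, None)
    # Old behavior: tensor_producers[tensor_name] raised KeyError for graph inputs.
    return producer.output[0] if producer else tensor_name

print(resolve_rewire_target("prev_node_out"))  # "prev_node_out" (taken from the producer node)
print(resolve_rewire_target("graph_input"))    # "graph_input"  (no producer -> keep the tensor name)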

modelopt/onnx/trt_utils.py

Lines changed: 5 additions & 4 deletions
@@ -416,9 +416,10 @@ def interpret_trt_plugins_precision_flag(
             # Will add Q/DQ nodes in the requested I/O indices
             inp_precision_quant = [i for i, p in enumerate(inp_precision) if p in ["int8", "fp8"]]
             out_precision_quant = [i for i, p in enumerate(out_precision) if p in ["int8", "fp8"]]
-            custom_ops_to_quantize[op_type] = {
-                "inp": inp_precision_quant,
-                "out": out_precision_quant,
-            }
+            if inp_precision_quant or out_precision_quant:
+                custom_ops_to_quantize[op_type] = {
+                    "inp": inp_precision_quant,
+                    "out": out_precision_quant,
+                }
 
     return custom_ops_to_cast, custom_ops_to_quantize
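
A small illustration of the guard added above (the precision lists and plugin name are made up): an op type only enters custom_ops_to_quantize when at least one input or output actually requests int8/fp8, so plugins that are merely cast no longer get an empty entry for downstream Q/DQ rewriting to process.

inp_precision = ["fp16", "fp16"]  # no quantized inputs requested
out_precision = ["fp16"]          # no quantized outputs requested

inp_precision_quant = [i for i, p in enumerate(inp_precision) if p in ("int8", "fp8")]
out_precision_quant = [i for i, p in enumerate(out_precision) if p in ("int8", "fp8")]

custom_ops_to_quantize = {}
if inp_precision_quant or out_precision_quant:
    custom_ops_to_quantize["MyPlugin"] = {"inp": inp_precision_quant, "out": out_precision_quant}

print(custom_ops_to_quantize)  # {} -> "MyPlugin" is cast only, never wrapped in Q/DQ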

modelopt/onnx/utils.py

Lines changed: 52 additions & 3 deletions
@@ -25,7 +25,6 @@
 import numpy as np
 import onnx
 import onnx_graphsurgeon as gs
-from onnx import TensorProto, ValueInfoProto, numpy_helper
 from onnx.helper import get_attribute_value
 from onnx_graphsurgeon import Constant, Node, Variable
 
@@ -289,7 +288,7 @@ def _convert_types_to_np(types: dict[str, int] | list[int] | int) -> Any:
 
 def get_tensor_by_name(
     onnx_model: onnx.ModelProto, tensor_name: str
-) -> ValueInfoProto | TensorProto | None:
+) -> onnx.ValueInfoProto | onnx.TensorProto | None:
     """This function returns a tensor from its name.
 
     This function searches for a tensor in the model's:
@@ -438,7 +437,7 @@ def randomize_weights_onnx_bytes(onnx_bytes: bytes, seed: int = 0) -> bytes:
             numpy_array = np.random.normal(float(avg), float(var), size=init.dims).astype(
                 dtype
             )
-            tensor = numpy_helper.from_array(numpy_array, init.name)
+            tensor = onnx.numpy_helper.from_array(numpy_array, init.name)
             model.graph.initializer[idx].CopyFrom(tensor)
 
     buffer = io.BytesIO()
@@ -751,3 +750,53 @@ def onnx_type_str_to_enum(dtype: str) -> int:
     dtype = dtype.split("tensor(")[-1].split(")")[0]
     dtype = "FLOAT" if dtype == "float32" else dtype.upper()
     return getattr(onnx.TensorProto, dtype)
+
+
+def remove_node_training_mode(onnx_model: onnx.ModelProto, node_op_type: str) -> onnx.ModelProto:
+    """Remove `training_mode` attribute and extra training outputs from nodes of a given op type.
+
+    This also removes the unused outputs from the training_mode nodes.
+
+    Args:
+        onnx_model: The onnx model.
+        node_op_type: The node type to remove training_mode attribute from.
+
+    Returns:
+        The onnx model with the training_mode attribute removed.
+    """
+    removed_output_names = set()
+    all_inputs = {inp for n in onnx_model.graph.node for inp in n.input}
+    graph_outputs = {o.name for o in onnx_model.graph.output}
+    keep = all_inputs | graph_outputs
+
+    for node in onnx_model.graph.node:
+        if node.op_type != node_op_type:
+            continue
+
+        is_training_mode = False
+        # Drop the 'training_mode' attribute if present
+        for idx, attr in enumerate(list(node.attribute)):
+            if attr.name == "training_mode":
+                del node.attribute[idx]
+                if attr.i == 1:
+                    is_training_mode = True
+                break
+
+        # If the node has extra outputs, remove them all including the training outputs
+        if is_training_mode:
+            to_remove = []
+            for name in node.output:
+                if name not in keep:
+                    removed_output_names.add(name)
+                    to_remove.append(name)
+
+            for name in to_remove:
+                node.output.remove(name)
+
+    if removed_output_names:
+        # Clean up corresponding value_info entries
+        keep = [vi for vi in onnx_model.graph.value_info if vi.name not in removed_output_names]
+        del onnx_model.graph.value_info[:]
+        onnx_model.graph.value_info.extend(keep)
+
+    return onnx_model
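
A hedged usage sketch of the relocated helper (the model built here is synthetic; only the call to remove_node_training_mode comes from this commit): a BatchNormalization node exported in training mode carries a training_mode attribute plus extra running-stat outputs, and the helper strips both when nothing else consumes them.

import onnx
from onnx import TensorProto, helper

from modelopt.onnx.utils import remove_node_training_mode

# BatchNormalization exported in training mode: extra outputs and training_mode=1.
bn = helper.make_node(
    "BatchNormalization",
    inputs=["x", "scale", "bias", "mean", "var"],
    outputs=["y", "running_mean", "running_var"],
    training_mode=1,
)
graph = helper.make_graph(
    [bn],
    "bn_graph",
    inputs=[helper.make_tensor_value_info("x", TensorProto.FLOAT, [1, 3, 8, 8])],
    outputs=[helper.make_tensor_value_info("y", TensorProto.FLOAT, [1, 3, 8, 8])],
    initializer=[
        helper.make_tensor(name, TensorProto.FLOAT, [3], [1.0, 1.0, 1.0])
        for name in ["scale", "bias", "mean", "var"]
    ],
)
model = helper.make_model(graph)

model = remove_node_training_mode(model, "BatchNormalization")
bn_node = model.graph.node[0]
assert all(attr.name != "training_mode" for attr in bn_node.attribute)
assert list(bn_node.output) == ["y"]  # unused training outputs were dropped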

modelopt/torch/_deploy/utils/torch_onnx.py

Lines changed: 1 addition & 22 deletions
@@ -45,6 +45,7 @@
     get_node_names,
     get_output_names,
     get_output_shapes,
+    remove_node_training_mode,
 )
 from modelopt.torch.quantization.export_onnx import configure_linear_module_onnx_quantizers
 from modelopt.torch.utils import flatten_tree, standardize_named_model_args
@@ -569,25 +570,3 @@ def get_onnx_bytes(*args, **kwargs) -> bytes:
     onnx_bytes = get_onnx_bytes_and_metadata(*args, **kwargs)[0]
     onnx_bytes_obj = OnnxBytes.from_bytes(onnx_bytes)
     return onnx_bytes_obj.get_onnx_model_file_bytes()
-
-
-def remove_node_training_mode(onnx_model: ModelProto, node_op_type: str) -> ModelProto:
-    """Remove training_mode attribute from selected node type.
-
-    Args:
-        onnx_model: The onnx model.
-        node_op_type: The node type to remove training_mode attribute from.
-
-    Returns:
-        The onnx model with the training_mode attribute removed.
-    """
-    for node in onnx_model.graph.node:
-        if node.op_type == node_op_type:
-            for attribute in node.attribute:
-                if attribute.name == "training_mode":
-                    if attribute.i == 1:
-                        node.output.remove(node.output[1])
-                        node.output.remove(node.output[1])
-                    attribute.i = 0
-
-    return onnx_model

modelopt/torch/export/model_utils.py

Lines changed: 41 additions & 1 deletion
@@ -60,7 +60,7 @@
 {MODEL_NAME_TO_TYPE=}
 """
 
-__all__ = ["get_model_type"]
+__all__ = ["get_model_type", "is_multimodal_model"]
 
 
 def get_model_type(model):
@@ -69,3 +69,43 @@ def get_model_type(model):
         if k.lower() in type(model).__name__.lower():
             return v
     return None
+
+
+def is_multimodal_model(model):
+    """Check if a model is a Vision-Language Model (VLM) or multimodal model.
+
+    This function detects various multimodal model architectures by checking for:
+    - Standard vision configurations (vision_config)
+    - Language model attributes (language_model)
+    - Specific multimodal model types (phi4mm)
+    - Vision LoRA configurations
+    - Audio processing capabilities
+    - Image embedding layers
+
+    Args:
+        model: The HuggingFace model instance to check
+
+    Returns:
+        bool: True if the model is detected as multimodal, False otherwise
+
+    Examples:
+        >>> model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
+        >>> is_multimodal_model(model)
+        True
+
+        >>> model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-4-multimodal-instruct")
+        >>> is_multimodal_model(model)
+        True
+    """
+    config = model.config
+
+    return (
+        hasattr(config, "vision_config")  # Standard vision config (e.g., Qwen2.5-VL)
+        or hasattr(model, "language_model")  # Language model attribute (e.g., LLaVA)
+        or getattr(config, "model_type", "") == "phi4mm"  # Phi-4 multimodal
+        or hasattr(config, "vision_lora")  # Vision LoRA configurations
+        or hasattr(config, "audio_processor")  # Audio processing capabilities
+        or (
+            hasattr(config, "embd_layer") and hasattr(config.embd_layer, "image_embd_layer")
+        )  # Image embedding layers
+    )
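
A lightweight way to exercise the new detector without downloading the checkpoints from the docstring examples; the stand-in classes below are hypothetical and only mimic the attributes is_multimodal_model inspects:

from types import SimpleNamespace

from modelopt.torch.export.model_utils import is_multimodal_model

class FakeVLM:
    # Mimics a Qwen2.5-VL-style model: the config exposes a vision_config.
    config = SimpleNamespace(vision_config=SimpleNamespace(), model_type="qwen2_5_vl")

class FakeLLM:
    # Plain text-only model: none of the multimodal markers are present.
    config = SimpleNamespace(model_type="llama")

print(is_multimodal_model(FakeVLM()))  # True  (vision_config present)
print(is_multimodal_model(FakeLLM()))  # False (no multimodal markers)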

modelopt/torch/prune/plugins/mcore_minitron.py

Lines changed: 1 addition & 34 deletions
@@ -58,30 +58,6 @@
     "num_layers",
 }
 
-SUPPORTED_MODELS = set()
-
-try:
-    from megatron.core.models.gpt import GPTModel
-
-    SUPPORTED_MODELS.add(GPTModel)
-except Exception:
-    pass
-
-try:
-    from megatron.core.models.mamba import MambaModel
-
-    SUPPORTED_MODELS.add(MambaModel)
-except Exception:
-    pass
-
-try:
-    from nemo.collections import llm
-
-    # NOTE: llm.MambaModel is a subclass of llm.GPTModel
-    SUPPORTED_MODELS.add(llm.GPTModel)
-except Exception:
-    pass
-
 
 class MCoreMinitronSearcher(BaseSearcher):
     """Searcher for Minitron pruning algorithm."""
@@ -150,16 +126,6 @@ def before_search(self) -> None:
     def run_search(self) -> None:
         """Run actual search."""
         # Run forward loop to collect activations and sort parameters
-        model_cfg = None
-        for m_type in SUPPORTED_MODELS:
-            if isinstance(self.model, m_type):
-                model_cfg = self.model.config
-                break
-        if model_cfg is None:
-            raise NotImplementedError(
-                f"Only {SUPPORTED_MODELS} models are supported! Got: {type(self.model)}"
-            )
-
         assert self.forward_loop is not None
         is_training = self.model.training
         self.model.eval()
@@ -178,6 +144,7 @@ def run_search(self) -> None:
             hp.active = export_config[hp_name]
 
         # kv_channels can be None so we need to save original from original hidden_size and num_attention_heads
+        model_cfg = self.model.config
         orig_kv_channels = getattr(model_cfg, "kv_channels")
         if orig_kv_channels is None:
             orig_kv_channels = getattr(model_cfg, "hidden_size") // getattr(
