Commit 85f8795

Merge branch 'main' into rislam/import-fix
2 parents 009865b + 3524732 commit 85f8795

92 files changed: +2767 -1105 lines changed


.github/CODEOWNERS

Lines changed: 21 additions & 21 deletions
@@ -32,24 +32,24 @@ modelopt/torch/utils @NVIDIA/modelopt-torch-utils-codeowners
 # Examples
 /docker @NVIDIA/modelopt-docker-codeowners
 /README.md @NVIDIA/modelopt-examples-codeowners
-examples @NVIDIA/modelopt-examples-codeowners
-examples/chained_optimizations @NVIDIA/modelopt-torch-nas-prune-codeowners
-examples/cnn_qat @NVIDIA/modelopt-examples-cnn_qat-codeowners
-examples/deepseek @NVIDIA/modelopt-deploy-codeowners
-examples/diffusers @NVIDIA/modelopt-examples-diffusers-codeowners
-examples/gpt-oss @NVIDIA/modelopt-examples-gpt-oss-codeowners
-examples/llm_autodeploy @NVIDIA/modelopt-deploy-codeowners
-examples/llm_distill @NVIDIA/modelopt-torch-distill-codeowners
-examples/llm_eval @NVIDIA/modelopt-examples-llm_ptq-codeowners
-examples/llm_ptq @NVIDIA/modelopt-examples-llm_ptq-codeowners
-examples/llm_qat @NVIDIA/modelopt-examples-llm_qat-codeowners
-examples/llm_sparsity @NVIDIA/modelopt-torch-sparsity-codeowners
-examples/megatron-lm @NVIDIA/modelopt-examples-megatron-codeowners
-examples/model_hub @NVIDIA/modelopt-examples-model_hub-codeowners
-examples/nemo_run @NVIDIA/modelopt-examples-megatron-codeowners
-examples/onnx_ptq @NVIDIA/modelopt-onnx-codeowners
-examples/pruning @NVIDIA/modelopt-torch-nas-prune-codeowners
-examples/speculative_decoding @NVIDIA/modelopt-torch-speculative-codeowners
-examples/vlm_eval @NVIDIA/modelopt-examples-vlm-codeowners
-examples/vlm_ptq @NVIDIA/modelopt-examples-vlm-codeowners
-examples/windows @NVIDIA/modelopt-windows-codeowners
+/examples @NVIDIA/modelopt-examples-codeowners
+/examples/chained_optimizations @NVIDIA/modelopt-torch-nas-prune-codeowners
+/examples/cnn_qat @NVIDIA/modelopt-examples-cnn_qat-codeowners
+/examples/deepseek @NVIDIA/modelopt-deploy-codeowners
+/examples/diffusers @NVIDIA/modelopt-examples-diffusers-codeowners
+/examples/gpt-oss @NVIDIA/modelopt-examples-gpt-oss-codeowners
+/examples/llm_autodeploy @NVIDIA/modelopt-deploy-codeowners
+/examples/llm_distill @NVIDIA/modelopt-torch-distill-codeowners
+/examples/llm_eval @NVIDIA/modelopt-examples-llm_ptq-codeowners
+/examples/llm_ptq @NVIDIA/modelopt-examples-llm_ptq-codeowners
+/examples/llm_qat @NVIDIA/modelopt-examples-llm_qat-codeowners
+/examples/llm_sparsity @NVIDIA/modelopt-torch-sparsity-codeowners
+/examples/megatron-lm @NVIDIA/modelopt-examples-megatron-codeowners
+/examples/model_hub @NVIDIA/modelopt-examples-model_hub-codeowners
+/examples/nemo_run @NVIDIA/modelopt-examples-megatron-codeowners
+/examples/onnx_ptq @NVIDIA/modelopt-onnx-codeowners
+/examples/pruning @NVIDIA/modelopt-torch-nas-prune-codeowners
+/examples/speculative_decoding @NVIDIA/modelopt-torch-speculative-codeowners
+/examples/vlm_eval @NVIDIA/modelopt-examples-vlm-codeowners
+/examples/vlm_ptq @NVIDIA/modelopt-examples-vlm-codeowners
+/examples/windows @NVIDIA/modelopt-windows-codeowners

.github/workflows/gpu_tests.yml

Lines changed: 21 additions & 1 deletion
@@ -22,20 +22,31 @@ jobs:
       any_changed: ${{ steps.changed-tests.outputs.any_changed }}
     steps:
       - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
      - id: get-pr-info
        uses: nv-gha-runners/get-pr-info@main
+      # Get commit from main branch that is present in the PR to use as base for changed files
+      - id: calculate-merge-base
+        env:
+          PR_SHA: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).head.sha }}
+          BASE_SHA: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }}
+        run: |
+          (echo -n "merge-base="; git merge-base "$BASE_SHA" "$PR_SHA") | tee --append "${GITHUB_OUTPUT}"
      - name: Check for changes in test-relevant directories
        id: changed-tests
        uses: step-security/[email protected]
        with:
+          base_sha: ${{ steps.calculate-merge-base.outputs.merge-base }}
+          sha: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).head.sha }}
          files: |
            .github/workflows/gpu_tests.yml
            modelopt/**
            tests/gpu/**
            tox.ini
            pyproject.toml
            setup.py
-          base_sha: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.ref }}
+          fail_on_initial_diff_error: true
   wait-checks:
     needs: [check-file-changes]
     if: needs.check-file-changes.outputs.any_changed == 'true'
@@ -70,3 +81,12 @@ jobs:
     timeout-minutes: 90
     container: *gpu_container
     steps: *gpu_steps
+  gpu-pr-required-check:
+    # Run even if gpu-tests-pr is skipped
+    if: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && always() }}
+    needs: [check-file-changes, gpu-tests-pr]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Required GPU tests did not succeed
+        if: ${{ needs.check-file-changes.result != 'success' || (needs.check-file-changes.outputs.any_changed == 'true' && needs.gpu-tests-pr.result != 'success') }}
+        run: exit 1
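
The calculate-merge-base step above replaces the old comparison against the branch tip: changed files are now computed against the merge-base of the PR head and its target branch, so commits that landed on main after the PR branched off are not counted as PR changes. A rough Python equivalent of that comparison (the helper name and argument handling are illustrative, not part of the workflow; it assumes a full clone, i.e. fetch-depth: 0):

# Rough sketch of the merge-base comparison performed by the workflow above.
import subprocess

def pr_changed_files(base_sha: str, head_sha: str) -> list[str]:
    # Find the common ancestor of the PR head and the target branch.
    merge_base = subprocess.run(
        ["git", "merge-base", base_sha, head_sha],
        capture_output=True, text=True, check=True,
    ).stdout.strip()
    # Diff the PR head against that ancestor, listing file names only.
    diff = subprocess.run(
        ["git", "diff", "--name-only", merge_base, head_sha],
        capture_output=True, text=True, check=True,
    ).stdout
    return [name for name in diff.splitlines() if name]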

.github/workflows/unit_tests.yml

Lines changed: 6 additions & 7 deletions
@@ -4,13 +4,6 @@ name: Unit tests
 on:
   pull_request:
     branches: [main, release/*]
-    paths:
-      - ".github/workflows/unit_tests.yml"
-      - "modelopt/**"
-      - "tests/unit/**"
-      - "pyproject.toml"
-      - "setup.py"
-      - "tox.ini"
   push:
     branches: [main, release/*]
     paths:
@@ -126,3 +119,9 @@ jobs:
         python-version: "3.12"
     - name: Run unit tests
      run: pip install tox && tox -e py312-partial-unit-${{ matrix.test-env }}
+  unit-pr-required-check:
+    if: github.event_name == 'pull_request'
+    needs: [linux, windows, multi-py, multi-torch, multi-transformers, partial-install]
+    runs-on: ubuntu-latest
+    steps:
+      - run: echo "All PR unit test jobs completed"

CHANGELOG.rst

Lines changed: 3 additions & 0 deletions
@@ -5,10 +5,12 @@ Model Optimizer Changelog (Linux)
 ^^^^^^^^^^^^^^^^^

 **Deprecations**
+- Deprecated the ``quantize_mode`` argument in ``examples/onnx_ptq/evaluate.py`` in favor of strong typing. Use ``engine_precision`` instead.

 **Bug Fixes**

 **New Features**
+- ``high_precision_dtype`` now defaults to fp16 in ONNX quantization, i.e. quantized output model weights are FP16 by default.

 0.35 (2025-09-04)
 ^^^^^^^^^^^^^^^^^
@@ -21,6 +23,7 @@ Model Optimizer Changelog (Linux)
 **Bug Fixes**

 - Fix attention head ranking logic for pruning Megatron Core GPT models.
+- Upgrade TensorRT-LLM dependency to 1.1.0rc2.

 **New Features**

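For the new fp16 default, callers that relied on the previous behavior can pass the dtype explicitly. A minimal sketch, assuming the ONNX PTQ entry point ``modelopt.onnx.quantization.quantize`` accepts ``high_precision_dtype`` as a keyword; ``model.onnx`` is a placeholder input path:

# Minimal sketch; keyword and entry point assumed from the changelog entry above.
from modelopt.onnx.quantization import quantize

quantize(
    onnx_path="model.onnx",       # placeholder input model
    high_precision_dtype="fp32",  # override the new fp16 default if needed
)
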
docker/Dockerfile

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-FROM nvcr.io/nvidia/tensorrt-llm/release:1.0.0rc6
+FROM nvcr.io/nvidia/tensorrt-llm/release:1.1.0rc2.post2

 ARG PIP_EXTRA_INDEX_URL="https://pypi.nvidia.com"
 ENV PIP_EXTRA_INDEX_URL=$PIP_EXTRA_INDEX_URL \

examples/diffusers/quantization/diffusion_trt.py

Lines changed: 5 additions & 1 deletion
@@ -105,7 +105,11 @@ def main():

     image_name = args.save_image_as if args.save_image_as else f"{args.model}.png"

-    pipe = PipelineManager.create_pipeline_from(MODEL_ID[args.model], dtype_map[args.model_dtype])
+    pipe = PipelineManager.create_pipeline_from(
+        MODEL_ID[args.model],
+        dtype_map[args.model_dtype],
+        override_model_path=args.override_model_path,
+    )

     # Save the backbone of the pipeline and move it to the GPU
     add_embedding = None

examples/diffusers/quantization/quantize.py

Lines changed: 6 additions & 2 deletions
@@ -309,7 +309,9 @@ def __init__(self, config: ModelConfig, logger: logging.Logger):

     @staticmethod
     def create_pipeline_from(
-        model_type: ModelType, torch_dtype: torch.dtype = torch.bfloat16
+        model_type: ModelType,
+        torch_dtype: torch.dtype = torch.bfloat16,
+        override_model_path: str | None = None,
     ) -> DiffusionPipeline:
         """
         Create and return an appropriate pipeline based on configuration.
@@ -321,7 +323,9 @@ def create_pipeline_from(
             ValueError: If model type is unsupported
         """
         try:
-            model_id = MODEL_REGISTRY[model_type]
+            model_id = (
+                MODEL_REGISTRY[model_type] if override_model_path is None else override_model_path
+            )
             if model_type == ModelType.SD3_MEDIUM:
                 pipe = StableDiffusion3Pipeline.from_pretrained(model_id, torch_dtype=torch_dtype)
             elif model_type in [ModelType.FLUX_DEV, ModelType.FLUX_SCHNELL]:
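
With the new ``override_model_path`` argument, ``create_pipeline_from`` can load a local or custom checkpoint instead of the registry entry for the model type. A hedged usage sketch (the checkpoint path is hypothetical, and the import assumes the example's quantize.py module is on the path):

# Hypothetical caller of the new signature; the checkpoint path is a placeholder.
import torch
from quantize import ModelType, PipelineManager

pipe = PipelineManager.create_pipeline_from(
    ModelType.FLUX_DEV,
    torch_dtype=torch.bfloat16,
    override_model_path="/ckpts/flux-dev-local",  # falls back to MODEL_REGISTRY when None
)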

examples/diffusers/quantization/requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -1,4 +1,5 @@
 cuda-python
+diffusers<=0.34.0
 nvtx
 onnx_graphsurgeon
 opencv-python>=4.8.1.78,<4.12.0.88

examples/llm_ptq/hf_ptq.py

Lines changed: 18 additions & 8 deletions
@@ -25,7 +25,9 @@
 from accelerate.hooks import remove_hook_from_module
 from example_utils import apply_kv_cache_quant, get_model, get_processor, get_tokenizer, is_enc_dec
 from transformers import (
+    AutoConfig,
     AutoModelForCausalLM,
+    AutoProcessor,
     PreTrainedTokenizer,
     PreTrainedTokenizerFast,
     WhisperProcessor,
@@ -39,6 +41,7 @@
     export_tensorrt_llm_checkpoint,
     get_model_type,
 )
+from modelopt.torch.export.model_utils import is_multimodal_model
 from modelopt.torch.quantization.config import need_calibration
 from modelopt.torch.quantization.plugins.accelerate import init_quantized_weights
 from modelopt.torch.quantization.utils import is_quantized
@@ -567,19 +570,26 @@ def output_decode(generated_ids, input_shape):

     export_path = args.export_path

-    if hasattr(full_model, "language_model"):
-        # Save original model config and the preprocessor config to the export path for VLMs.
-        from transformers import AutoConfig, AutoProcessor
+    # Check if the model is a multimodal/VLM model
+    is_vlm = is_multimodal_model(full_model)

-        print(f"Saving original model and processor configs to {export_path}")
+    if is_vlm:
+        # Save original model config and the processor config to the export path for VLMs.
+        print(f"Saving original model config to {export_path}")

         AutoConfig.from_pretrained(
             args.pyt_ckpt_path, trust_remote_code=args.trust_remote_code
         ).save_pretrained(export_path)

-        AutoProcessor.from_pretrained(
-            args.pyt_ckpt_path, trust_remote_code=args.trust_remote_code
-        ).save_pretrained(export_path)
+        # Try to save processor config if available
+        try:
+            print(f"Saving processor config to {export_path}")
+            AutoProcessor.from_pretrained(
+                args.pyt_ckpt_path, trust_remote_code=args.trust_remote_code
+            ).save_pretrained(export_path)
+        except Exception as e:
+            print(f"Warning: Could not save processor config: {e}")
+            print("This is normal for some VLM architectures that don't use AutoProcessor")

     if model_type == "mllama":
         full_model_config = model.config
@@ -732,7 +742,7 @@ def output_decode(generated_ids, input_shape):
     )
     parser.add_argument(
         "--verbose",
-        help="Print verbose output (e.g. quantization summary). Disable by --no_verbose.",
+        help="Print verbose output (e.g. quantization summary). Disable by --no-verbose.",
         default=True,
         action=argparse.BooleanOptionalAction,
     )
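
The help-text fix in the last hunk matches how ``argparse.BooleanOptionalAction`` actually spells the negated flag: it generates ``--no-verbose`` (with a dash), not ``--no_verbose``. A small standalone check:

# Standalone check of the flag spelling referenced in the help text above.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--verbose", default=True, action=argparse.BooleanOptionalAction)
print(parser.parse_args(["--no-verbose"]).verbose)  # prints: False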

examples/llm_qat/README.md

Lines changed: 0 additions & 1 deletion
@@ -82,7 +82,6 @@ def forward_loop(model):


 # Quantize the model in-place; The model should be unwrapped from any distributed wrapper
-# The model may be wrapped in a DataParallel or DistributedDataParallel after `mtq.quantize`
 model = mtq.quantize(model, mtq.INT8_DEFAULT_CFG, forward_loop)

 # Save the modelopt quantizer states
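
The removed comment was misleading; the remaining one states the actual requirement: quantize the bare module first, then apply any distributed wrapper. A short sketch of that ordering, reusing ``model`` and ``forward_loop`` from the README snippet above (the DDP wrap is an illustrative choice, not prescribed by the README):

# Quantize the unwrapped module first, then wrap it for distributed training.
import torch
import modelopt.torch.quantization as mtq

model = mtq.quantize(model, mtq.INT8_DEFAULT_CFG, forward_loop)
model = torch.nn.parallel.DistributedDataParallel(model)  # illustrative wrapper choice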
