
Commit 9c5bbdf

Merge branch 'main' into jingyux/megatron-lora
2 parents 8c59aca + 615f3c0

118 files changed: +4207 -2430 lines


.github/CODEOWNERS

Lines changed: 0 additions & 1 deletion
@@ -30,7 +30,6 @@ modelopt/torch/trace @NVIDIA/modelopt-torch-nas-prune-codeowners
 modelopt/torch/utils @NVIDIA/modelopt-torch-utils-codeowners
 
 # Examples
-/docker @NVIDIA/modelopt-docker-codeowners
 /README.md @NVIDIA/modelopt-examples-codeowners
 /examples @NVIDIA/modelopt-examples-codeowners
 /examples/chained_optimizations @NVIDIA/modelopt-torch-nas-prune-codeowners

.github/workflows/code_quality.yml

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ concurrency:
 jobs:
   code-quality:
     runs-on: ubuntu-latest
-    timeout-minutes: 15
+    timeout-minutes: 30
     steps:
       - uses: actions/checkout@v4
       - uses: actions/setup-python@v5
.github/workflows/example_tests.yml

Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@
+# NOTE: Make sure this file is consistent with .gitlab/tests.yml
+name: E2E Example tests
+
+on:
+  push:
+    branches: ["pull-request/[0-9]+"]
+    # NOTE: paths cannot be used since push happens to copied PR and only latest commit to PR is used
+  schedule:
+    - cron: "0 0 * * *" # Nightly
+  workflow_dispatch: # On-demand
+
+# Cancel previous runs if new commit is pushed to the same PR
+concurrency:
+  group: ${{ github.workflow }}-${{ startsWith(github.ref, 'refs/heads/pull-request/') && github.ref || github.sha }}
+  cancel-in-progress: true
+
+jobs:
+  check-file-changes:
+    if: startsWith(github.ref, 'refs/heads/pull-request/')
+    runs-on: ubuntu-latest
+    outputs:
+      any_changed: ${{ steps.changed-tests.outputs.any_changed }}
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+      - id: get-pr-info
+        uses: nv-gha-runners/get-pr-info@main
+      # Get commit from main branch that is present in the PR to use as base for changed files
+      - id: calculate-merge-base
+        env:
+          PR_SHA: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).head.sha }}
+          BASE_SHA: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }}
+        run: |
+          (echo -n "merge-base="; git merge-base "$BASE_SHA" "$PR_SHA") | tee --append "${GITHUB_OUTPUT}"
+      - name: Check for changes in test-relevant directories
+        id: changed-tests
+        uses: step-security/[email protected]
+        with:
+          base_sha: ${{ steps.calculate-merge-base.outputs.merge-base }}
+          sha: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).head.sha }}
+          files: |
+            .github/workflows/example_tests.yml
+            examples/llm_ptq/**
+            modelopt/torch/**
+            tests/examples/llm_ptq/**
+            setup.py
+          fail_on_initial_diff_error: true
+  wait-checks:
+    needs: [check-file-changes]
+    if: needs.check-file-changes.outputs.any_changed == 'true'
+    uses: ./.github/workflows/_wait_for_checks.yml
+    permissions:
+      checks: read
+    secrets: inherit
+    with:
+      match_pattern: '^DCO$|^linux$' # Wait for DCO and Unit tests / linux to pass
+      delay: 300s
+  example-tests-pr:
+    needs: [check-file-changes, wait-checks]
+    if: needs.check-file-changes.outputs.any_changed == 'true'
+    # Runner list at https://github.com/nv-gha-runners/enterprise-runner-configuration/blob/main/docs/runner-groups.md
+    runs-on: linux-amd64-gpu-h100-latest-1
+    timeout-minutes: 90
+    strategy:
+      matrix:
+        EXAMPLE: [llm_ptq]
+    container: &example_container
+      image: nvcr.io/nvidia/tensorrt-llm/release:1.1.0rc2.post2
+      env:
+        PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages
+    steps: &example_steps
+      - uses: actions/checkout@v4
+      - uses: nv-gha-runners/setup-proxy-cache@main
+      - name: Setup environment variables
+        run: |
+          echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu:/usr/local/tensorrt/targets/x86_64-linux-gnu/lib" >> $GITHUB_ENV
+          echo "PATH=${PATH}:/usr/local/tensorrt/targets/x86_64-linux-gnu/bin" >> $GITHUB_ENV
+      - name: Run example tests
+        run: |
+          pip install ".[hf,dev-test]"
+          find examples/${{ matrix.EXAMPLE }} -name "requirements.txt" | while read req_file; do pip install -r "$req_file" || exit 1; done
+          pytest -s tests/examples/${{ matrix.EXAMPLE }}
+  example-tests-non-pr:
+    if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
+    # Runner list at https://github.com/nv-gha-runners/enterprise-runner-configuration/blob/main/docs/runner-groups.md
+    runs-on: linux-amd64-gpu-h100-latest-1
+    timeout-minutes: 90
+    strategy:
+      matrix:
+        EXAMPLE: [llm_ptq]
+    container: *example_container
+    steps: *example_steps
+  example-pr-required-check:
+    # Run even if example-tests-pr is skipped
+    if: ${{ startsWith(github.ref, 'refs/heads/pull-request/') && always() }}
+    needs: [check-file-changes, example-tests-pr]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Required GPU tests did not succeed
+        if: ${{ needs.check-file-changes.result != 'success' || (needs.check-file-changes.outputs.any_changed == 'true' && needs.example-tests-pr.result != 'success') }}
+        run: exit 1
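Two details of this new workflow are worth spelling out. Because the PR jobs run against a copied `pull-request/N` branch, `paths:` filtering is unavailable, so the workflow computes the merge base itself and feeds it to `changed-files`. Below is a rough local equivalent of that gate, using only standard git and the same path list as the workflow's `files:` filter; the `BASE_SHA`/`PR_SHA` defaults are assumptions for local use, not part of the workflow:

```bash
#!/usr/bin/env bash
# Sketch: reproduce the workflow's changed-file gate locally.
set -euo pipefail

BASE_SHA="${BASE_SHA:-origin/main}"
PR_SHA="${PR_SHA:-HEAD}"

# Same idea as the calculate-merge-base step: find the commit on main that
# the PR branched from, so unrelated newer commits on main do not show up
# as "changes" in the PR.
merge_base="$(git merge-base "$BASE_SHA" "$PR_SHA")"

# Paths mirrored from the workflow's `files:` filter.
changed="$(git diff --name-only "$merge_base" "$PR_SHA" -- \
  .github/workflows/example_tests.yml \
  examples/llm_ptq \
  modelopt/torch \
  tests/examples/llm_ptq \
  setup.py)"

if [ -n "$changed" ]; then
  echo "any_changed=true; GPU example tests would run for:"
  echo "$changed"
else
  echo "any_changed=false; example tests would be skipped."
fi
```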

.github/workflows/gpu_tests.yml

Lines changed: 4 additions & 2 deletions
@@ -44,7 +44,6 @@ jobs:
             modelopt/**
             tests/gpu/**
             tox.ini
-            pyproject.toml
             setup.py
           fail_on_initial_diff_error: true
   wait-checks:
@@ -67,11 +66,14 @@ jobs:
       image: nvcr.io/nvidia/pytorch:25.06-py3
       env:
         GIT_DEPTH: 1000 # For correct version for tests/gpu/torch/quantization/plugins/test_megatron.py
-        LD_LIBRARY_PATH: "/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" # Add libcudnn*.so and libnv*.so to path.
         PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages
     steps: &gpu_steps
       - uses: actions/checkout@v4
       - uses: nv-gha-runners/setup-proxy-cache@main
+      - name: Setup environment variables
+        run: |
+          echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu:/usr/local/tensorrt/targets/x86_64-linux-gnu/lib" >> $GITHUB_ENV
+          echo "PATH=${PATH}:/usr/local/tensorrt/targets/x86_64-linux-gnu/bin" >> $GITHUB_ENV
       - name: Run gpu tests
         run: pip install tox-current-env && tox -e py312-cuda12-gpu --current-env
   gpu-tests-non-pr:
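Note that `LD_LIBRARY_PATH` moves from the container's static `env:` block into a step that appends to `$GITHUB_ENV`, presumably so that `${LD_LIBRARY_PATH}` is expanded by the shell inside the running container (preserving the image's own value) rather than passed through literally from the YAML. A minimal sketch of the `$GITHUB_ENV` mechanism itself; the `mktemp` stand-in and the `source` step are local approximations of what the runner does between steps:

```bash
#!/usr/bin/env bash
# Sketch: how the "Setup environment variables" step propagates values.
# On GitHub Actions, $GITHUB_ENV names a file; KEY=VALUE lines appended to
# it become environment variables for all *subsequent* steps in the job.
GITHUB_ENV="$(mktemp)"   # stand-in for the runner-provided file

# ${LD_LIBRARY_PATH} and ${PATH} expand at run time, inside the container,
# so the image's existing values are kept.
echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu:/usr/local/tensorrt/targets/x86_64-linux-gnu/lib" >> "$GITHUB_ENV"
echo "PATH=${PATH}:/usr/local/tensorrt/targets/x86_64-linux-gnu/bin" >> "$GITHUB_ENV"

# Roughly what the runner does before the next step: re-export each line.
set -a; source "$GITHUB_ENV"; set +a
echo "$LD_LIBRARY_PATH"
```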

.github/workflows/pages.yml

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@ permissions:
 jobs:
   build-docs:
     runs-on: ubuntu-latest
-    timeout-minutes: 15
+    timeout-minutes: 30
     steps:
       - uses: actions/checkout@v4
       - uses: actions/setup-python@v5

.github/workflows/unit_tests.yml

Lines changed: 0 additions & 1 deletion
@@ -10,7 +10,6 @@ on:
       - ".github/workflows/unit_tests.yml"
       - "modelopt/**"
       - "tests/unit/**"
-      - "pyproject.toml"
       - "setup.py"
       - "tox.ini"
   schedule:

.gitlab/tests.yml

Lines changed: 53 additions & 27 deletions
@@ -1,11 +1,12 @@
-# NOTE: Make sure this file is consistent with .github/workflows/{unit,gpu}_tests.yml
+# NOTE: Make sure this file is consistent with .github/workflows/{unit,gpu,example}_tests.yml
 .tests-default:
+  variables:
+    PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages
   stage: tests
   rules:
     - if: $CI_PIPELINE_SOURCE == "schedule"
-      when: always
-    - if: $CI_PIPELINE_SOURCE != "schedule"
-      when: manual
+    - if: $CI_COMMIT_TAG =~ /^\d+\.\d+\.\d+$/
+    - when: manual
 
 ##### Unit Tests #####
 unit:
@@ -24,49 +25,74 @@ unit:
     - tox -e py3$PYTHON-torch$TORCH-tf_$TRANSFORMERS-unit
 
 ##### GPU Tests #####
-gpu:
+.multi-gpu-tests-default:
   extends: .tests-default
-  timeout: 60m
+  timeout: 90m
   image: nvcr.io/nvidia/pytorch:25.06-py3
   variables:
     GIT_DEPTH: 1000 # For correct version for tests/gpu/torch/quantization/plugins/test_megatron.py
-    LD_LIBRARY_PATH: "/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" # Add libcudnn*.so and libnv*.so to path.
-    PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages
   tags: [docker, linux, 2-gpu]
+  before_script:
+    # Add libcudnn*.so and libnv*.so to path
+    - export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu:/usr/local/tensorrt/targets/x86_64-linux-gnu/lib"
+    # Add trtexec to path
+    - export PATH="${PATH}:/usr/local/tensorrt/targets/x86_64-linux-gnu/bin"
+    # Install git-lfs for Daring-Anteater dataset
+    - apt-get update && apt-get install -y git-lfs
+    - git lfs install --system
+
+multi-gpu:
+  extends: .multi-gpu-tests-default
   script:
     # Use pre-installed packages without a new venv with tox-current-env
     - pip install tox-current-env
     - tox -e py312-cuda12-gpu --current-env
 
 ##### Example Tests #####
-example:
-  extends: .tests-default
-  stage: tests
-  timeout: 45m
-  image: gitlab-master.nvidia.com:5005/omniml/modelopt/modelopt_examples:latest
-  variables:
-    TEST_TYPE: pytest
-  tags: [docker, linux, 2-gpu, sm<89]
+example-torch:
+  extends: .multi-gpu-tests-default
+  timeout: 30m
   parallel:
     matrix:
-      - TEST: [diffusers, llm_distill, llm_qat, llm_sparsity, onnx_ptq, speculative_decoding]
-  allow_failure: true # Allow to continue next stages even if job is canceled (e.g. during release)
-  before_script:
-    - pip install ".[all]" -U
+      - EXAMPLE: [llm_distill, llm_sparsity, speculative_decoding]
   script:
-    # Uninstall apex since T5 Int8 (PixArt) + Apex is not supported as per https://github.com/huggingface/transformers/issues/21391
-    - if [ "$TEST" = "diffusers" ]; then pip uninstall -y apex; fi
-    - if [ "$TEST_TYPE" = "pytest" ]; then pytest -s tests/examples/$TEST; else bash tests/examples/test_$TEST.sh; fi
+    - pip install ".[hf,dev-test]"
+    - find examples/$EXAMPLE -name "requirements.txt" | while read req_file; do pip install -r "$req_file" || exit 1; done
+    - pytest -s tests/examples/$EXAMPLE
 
-example-ada:
-  extends: example
+# TODO: Fix llm_qat test hang in GitLab CI
+example-failing:
+  extends: example-torch
+  allow_failure: true
+  parallel:
+    matrix:
+      - EXAMPLE: [llm_qat]
+
+example-trtllm:
+  extends: example-torch
   timeout: 60m
+  image: nvcr.io/nvidia/tensorrt-llm/release:1.1.0rc2.post2
+  tags: [docker, linux, 2-gpu, sm>=89]
+  parallel:
+    matrix:
+      - EXAMPLE: [llm_autodeploy, llm_eval, llm_ptq, vlm_ptq]
+
+example-onnx:
+  extends: example-torch
+  image: nvcr.io/nvidia/tensorrt:25.08-py3
   tags: [docker, linux, 2-gpu, sm>=89]
   parallel:
     matrix:
-      - TEST: [llm_eval, llm_ptq, vlm_ptq, llm_autodeploy]
-      - TEST: [onnx_ptq]
+      - EXAMPLE: [diffusers, onnx_ptq]
+        TEST_TYPE: pytest
+      - EXAMPLE: [onnx_ptq]
         TEST_TYPE: bash
+  script:
+    # Uninstall apex since T5 Int8 (PixArt) + Apex is not supported as per https://github.com/huggingface/transformers/issues/21391
+    - if [ "$EXAMPLE" = "diffusers" ]; then pip uninstall -y apex; fi
+    - pip install ".[all,dev-test]"
+    - find examples/$EXAMPLE -name "requirements.txt" | while read req_file; do pip install -r "$req_file" || exit 1; done
+    - if [ "$TEST_TYPE" = "pytest" ]; then pytest -s tests/examples/$EXAMPLE; else bash tests/examples/test_$EXAMPLE.sh; fi
 
 ##### Megatron / NeMo Integration Tests #####
 megatron-nemo-integration:
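The `find ... | while read ...` line that the example jobs share deserves a note: because the loop is the last stage of a pipeline, it runs in a subshell, so the `|| exit 1` ends that subshell with status 1, the pipeline as a whole returns 1, and the CI runner fails the job instead of silently skipping a bad requirements file. A standalone sketch of the same idiom; the `EXAMPLE` default is an assumption for local use:

```bash
#!/usr/bin/env bash
# Sketch: the requirements-install idiom used by the example jobs above.
set -e

EXAMPLE="${EXAMPLE:-llm_ptq}"   # matrix variable in the real job

# The while loop is a pipeline stage, hence a subshell: `exit 1` ends the
# subshell with status 1, which becomes the pipeline's exit status and
# aborts the script under `set -e`.
find "examples/$EXAMPLE" -name "requirements.txt" | while read -r req_file; do
  pip install -r "$req_file" || exit 1
done
echo "all requirements for $EXAMPLE installed"
```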

CHANGELOG.rst

Lines changed: 13 additions & 1 deletion
@@ -1,13 +1,23 @@
 Model Optimizer Changelog (Linux)
 =================================
 
+0.39 (2025-10-xx)
+^^^^^^^^^^^^^^^^^
+
+**Deprecations**
+
+**New Features**
+
+- Add flag ``op_types_to_exclude_fp16`` in ONNX quantization to exclude ops from being converted to FP16/BF16. Alternatively, for custom TensorRT ops, this can also be done by indicating ``'fp32'`` precision in ``trt_plugins_precision``.
+
 0.37 (2025-09-xx)
 ^^^^^^^^^^^^^^^^^
 
 **Deprecations**
 
+- Deprecated ModelOpt's custom docker images. Please use the PyTorch, TensorRT-LLM or TensorRT docker image directly or refer to the `installation guide <https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/2_installation.html>`_ for more details.
 - Deprecated ``quantize_mode`` argument in ``examples/onnx_ptq/evaluate.py`` to support strongly typing. Use ``engine_precision`` instead.
-- Deprecated TRT-LLM's TRT backend in ``examples/llm_ptq`` and ``examples/vlm_ptq``. Tasks ``build`` and ``benchmark`` support are removed and replaced with ``quant``. For performance evaluation, please use ``trtllm-bench`` directly.
+- Deprecated TRT-LLM's TRT backend in ``examples/llm_ptq`` and ``examples/vlm_ptq``. Tasks ``build`` and ``benchmark`` support are removed and replaced with ``quant``. ``engine_dir`` is replaced with ``checkpoint_dir`` in ``examples/llm_ptq`` and ``examples/vlm_ptq``. For performance evaluation, please use ``trtllm-bench`` directly.
 - ``--export_fmt`` flag in ``examples/llm_ptq`` is removed. By default we export to the unified Hugging Face checkpoint format.
 - Deprecated ``examples/vlm_eval`` as it depends on the deprecated TRT-LLM's TRT backend.
 
@@ -16,6 +26,8 @@ Model Optimizer Changelog (Linux)
 - ``high_precision_dtype`` default to fp16 in ONNX quantization, i.e. quantized output model weights are now FP16 by default.
 - Upgrade TensorRT-LLM dependency to 1.1.0rc2.
 - Support Phi-4-multimodal and Qwen2.5-VL quantized HF checkpoint export in ``examples/vlm_ptq``.
+- Support storing and restoring Minitron pruning activations and scores for re-pruning without running the forward loop again.
+- Add Minitron pruning example for Megatron-LM framework. See ``examples/megatron-lm`` for more details.
 
 0.35 (2025-09-04)
 ^^^^^^^^^^^^^^^^^
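For the new `op_types_to_exclude_fp16` flag in the 0.39 notes, a minimal usage sketch. This assumes the flag is exposed on the `modelopt.onnx.quantization` command line under the same name as the changelog entry, and the op types shown (`Resize`, `Softmax`) are placeholders; check `--help` on your installed version before relying on it:

```bash
# Sketch: quantize an ONNX model while keeping selected op types out of the
# FP16 conversion that `high_precision_dtype fp16` otherwise applies.
python -m modelopt.onnx.quantization \
  --onnx_path model.onnx \
  --quantize_mode int8 \
  --high_precision_dtype fp16 \
  --op_types_to_exclude_fp16 Resize Softmax \
  --output_path model.quant.onnx
```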

CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ pip install -e ".[dev]"
 ```
 
 If you are working on features that require dependencies like TensorRT-LLM or Megatron-Core, consider using a docker container to simplify the setup process.
-See [docker README](./README.md#installation--docker) for more details.
+Visit our [installation docs](https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/2_installation.html) for more information.
 
 ## 🧹 Code linting and formatting
 
README.md

Lines changed: 8 additions & 4 deletions
@@ -61,20 +61,24 @@ Model Optimizer is also integrated with [NVIDIA NeMo](https://github.com/NVIDIA-
 To install stable release packages for Model Optimizer with `pip` from [PyPI](https://pypi.org/project/nvidia-modelopt/):
 
 ```bash
-pip install nvidia-modelopt[all]
+pip install -U nvidia-modelopt[all]
 ```
 
-To install from source in editable mode with all development dependencies or to test the latest changes, run:
+To install from source in editable mode with all development dependencies or to use the latest features, run:
 
 ```bash
 # Clone the Model Optimizer repository
-git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git
+git clone git@github.com:NVIDIA/TensorRT-Model-Optimizer.git
 cd TensorRT-Model-Optimizer
 
 pip install -e .[dev]
 ```
 
-Visit our [installation guide](https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/2_installation.html) for more fine-grained control on installed dependencies or view our pre-made [dockerfiles](docker/README.md) for more information.
+You can also directly use the [TensorRT-LLM docker images](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release/tags)
+(e.g., `nvcr.io/nvidia/tensorrt-llm/release:<version>`), which have Model Optimizer pre-installed.
+Make sure to upgrade Model Optimizer to the latest version using ``pip`` as described above.
+Visit our [installation guide](https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/2_installation.html) for
+more fine-grained control on installed dependencies or for alternative docker images and environment variables to setup.
 
 ## Techniques
 
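A minimal sketch of the docker-based setup the README changes describe. The image tag stays a placeholder as in the README, and the `docker run` flags (`--rm -it --gpus all`) are assumptions about local interactive use, not taken from the README:

```bash
# Start a TensorRT-LLM release container, which ships with Model Optimizer
# pre-installed. Replace <version> with a real tag from the NGC catalog.
docker run --rm -it --gpus all \
  nvcr.io/nvidia/tensorrt-llm/release:<version> bash

# Inside the container, upgrade the pre-installed Model Optimizer to the
# latest release, as the README recommends:
pip install -U nvidia-modelopt[all]
```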