Commit cd6fce2

Merge branch 'main' into kaix/sparse_attention_core

2 parents 40c1b7d + d0b0c0f
File tree: 138 files changed, +6259 -2849 lines changed

.github/CODEOWNERS

Lines changed: 0 additions & 1 deletion

@@ -44,7 +44,6 @@ modelopt/torch/utils @NVIDIA/modelopt-torch-utils-codeowners
 /examples/llm_ptq @NVIDIA/modelopt-examples-llm_ptq-codeowners
 /examples/llm_qat @NVIDIA/modelopt-examples-llm_qat-codeowners
 /examples/llm_sparsity @NVIDIA/modelopt-torch-sparsity-codeowners
-/examples/megatron-lm @NVIDIA/modelopt-examples-megatron-codeowners
 /examples/model_hub @NVIDIA/modelopt-examples-model_hub-codeowners
 /examples/nemo_run @NVIDIA/modelopt-examples-megatron-codeowners
 /examples/onnx_ptq @NVIDIA/modelopt-onnx-codeowners

.github/workflows/example_tests.yml

Lines changed: 2 additions & 4 deletions

@@ -54,12 +54,11 @@ jobs:
     checks: read
     secrets: inherit
     with:
-      match_pattern: '^DCO$|^linux$' # Wait for DCO and Unit tests / linux to pass
+      match_pattern: "^DCO$|^linux$" # Wait for DCO and Unit tests / linux to pass
       delay: 300s
   example-tests-pr:
     needs: [check-file-changes, wait-checks]
     if: needs.check-file-changes.outputs.any_changed == 'true'
-    # Runner list at https://github.com/nv-gha-runners/enterprise-runner-configuration/blob/main/docs/runner-groups.md
     runs-on: linux-amd64-gpu-h100-latest-1
     timeout-minutes: 90
     strategy:
@@ -84,8 +83,7 @@ jobs:
         pytest -s tests/examples/${{ matrix.EXAMPLE }}
   example-tests-non-pr:
     if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
-    # Runner list at https://github.com/nv-gha-runners/enterprise-runner-configuration/blob/main/docs/runner-groups.md
-    runs-on: linux-amd64-gpu-h100-latest-1
+    runs-on: linux-amd64-gpu-h100-latest-2
     timeout-minutes: 90
     strategy:
       matrix:
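The quoting change on `match_pattern` is stylistic only (the regex is unchanged); the pattern gates the job on checks whose names match the whole expression exactly. A quick illustrative check of the alternation, using `grep -E` as a stand-in for whatever matcher the reusable wait-checks workflow applies internally (an assumption here):

```bash
# Anchored alternation: a check name matches only if it is exactly "DCO" or exactly "linux".
for name in "DCO" "linux" "linux (py312)" "windows"; do
  if echo "$name" | grep -qE '^DCO$|^linux$'; then
    echo "wait for: $name"
  else
    echo "skip:     $name"
  fi
done
```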

.github/workflows/gpu_tests.yml

Lines changed: 2 additions & 4 deletions

@@ -54,12 +54,11 @@ jobs:
     checks: read
     secrets: inherit
     with:
-      match_pattern: '^DCO$|^linux$' # Wait for DCO and Unit tests / linux to pass
+      match_pattern: "^DCO$|^linux$" # Wait for DCO and Unit tests / linux to pass
       delay: 300s
   gpu-tests-pr:
     needs: [check-file-changes, wait-checks]
     if: needs.check-file-changes.outputs.any_changed == 'true'
-    # Runner list at https://github.com/nv-gha-runners/enterprise-runner-configuration/blob/main/docs/runner-groups.md
     runs-on: linux-amd64-gpu-l4-latest-1
     timeout-minutes: 120
     container: &gpu_container
@@ -78,8 +77,7 @@ jobs:
       run: pip install tox-current-env && tox -e py312-cuda12-gpu --current-env
   gpu-tests-non-pr:
     if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
-    # Runner list at https://github.com/nv-gha-runners/enterprise-runner-configuration/blob/main/docs/runner-groups.md
-    runs-on: linux-amd64-gpu-h100-latest-1
+    runs-on: linux-amd64-gpu-h100-latest-2
     timeout-minutes: 120
     container: *gpu_container
     steps: *gpu_steps

.github/workflows/unit_tests.yml

Lines changed: 17 additions & 8 deletions

@@ -28,7 +28,7 @@ jobs:
     checks: read
     secrets: inherit
     with:
-      match_pattern: '^DCO$'
+      match_pattern: "^DCO$"
  linux:
    needs: [check-dco]
    runs-on: ubuntu-latest
@@ -39,7 +39,7 @@ jobs:
        with:
          python-version: "3.12"
      - name: Run unit tests
-        run: pip install tox && COV_ARGS="--cov" tox -e py312-torch28-tf_latest-unit
+        run: pip install tox && COV_ARGS="--cov" tox -e py312-torch29-tf_latest-unit
      - name: Upload coverage reports to Codecov
        uses: codecov/codecov-action@v5
        with:
@@ -57,7 +57,7 @@ jobs:
        with:
          python-version: "3.12"
      - name: Run unit tests (without coverage)
-        run: pip install tox && tox -e py312-torch28-tf_latest-unit
+        run: pip install tox && tox -e py312-torch29-tf_latest-unit
  multi-py:
    if: github.event_name == 'pull_request'
    needs: [linux]
@@ -72,15 +72,15 @@ jobs:
        with:
          python-version: "3.${{ matrix.py }}"
      - name: Run unit tests
-        run: pip install tox && tox -e py3${{ matrix.py }}-torch28-tf_latest-unit
+        run: pip install tox && tox -e py3${{ matrix.py }}-torch29-tf_latest-unit
  multi-torch:
    if: github.event_name == 'pull_request'
    needs: [linux]
    runs-on: ubuntu-latest
    timeout-minutes: 30
    strategy:
      matrix:
-        torch: [26, 27]
+        torch: [26, 27, 28]
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
@@ -102,7 +102,7 @@ jobs:
        with:
          python-version: "3.12"
      - name: Run unit tests
-        run: pip install tox && tox -e py312-torch28-tf_${{ matrix.tf }}-unit
+        run: pip install tox && tox -e py312-torch29-tf_${{ matrix.tf }}-unit
  partial-install:
    if: github.event_name == 'pull_request'
    needs: [linux]
@@ -119,8 +119,17 @@ jobs:
      - name: Run unit tests
        run: pip install tox && tox -e py312-partial-unit-${{ matrix.test-env }}
  unit-pr-required-check:
-    if: github.event_name == 'pull_request'
+    # Run even if some jobs are skipped
+    if: ${{ github.event_name == 'pull_request' && always() }}
    needs: [linux, windows, multi-py, multi-torch, multi-transformers, partial-install]
    runs-on: ubuntu-latest
    steps:
-      - run: echo "All PR unit test jobs completed"
+      - name: Required unit tests did not succeed
+        if: >-
+          ${{ needs.linux.result != 'success' ||
+              needs.windows.result != 'success' ||
+              needs.multi-py.result != 'success' ||
+              needs.multi-torch.result != 'success' ||
+              needs.multi-transformers.result != 'success' ||
+              needs.partial-install.result != 'success' }}
+        run: exit 1
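The bump from `torch28` to `torch29` changes the default tox environment everywhere the workflow invokes it, and the same targets can be reproduced locally. A sketch, assuming a checkout of the repository with Python 3.12 on the PATH:

```bash
# Default unit-test environment the CI now runs on every push
pip install tox
tox -e py312-torch29-tf_latest-unit

# The PR-only multi-torch matrix now also exercises torch 2.8 as a back-version
tox -e py312-torch28-tf_latest-unit
```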

.gitlab/release.yml

Lines changed: 1 addition & 1 deletion

@@ -17,8 +17,8 @@ build-and-upload-wheels:
      TWINE_PASSWORD: $ARTIFACTORY_TOKEN # Configured in GitLab > Settings > CI/CD
      REPO_URL: https://urm.nvidia.com/artifactory/api/pypi/sw-dl-algo-ammo-pypi-local
  - if: $CI_PIPELINE_SOURCE == "schedule"
+    when: manual
    variables:
-      when: manual
      RELEASE: "false"
      TWINE_USERNAME: gitlab-ci-token
      TWINE_PASSWORD: $CI_JOB_TOKEN

.gitlab/tests.yml

Lines changed: 1 addition & 3 deletions

@@ -15,12 +15,10 @@ unit:
  timeout: 30m
  variables:
    PYTHON: 12
-    TORCH: 28
+    TORCH: 29
    TRANSFORMERS: latest
  image: python:3.$PYTHON
  before_script:
-    # Install cmake to build onnxsim from sdists for Python 3.12 until http://github.com/daquexian/onnx-simplifier/pull/353
-    - if [ "$PYTHON" = "12" ]; then apt-get update && apt-get install -y cmake; fi
    - pip install tox
  script:
    - tox -e py3$PYTHON-torch$TORCH-tf_$TRANSFORMERS-unit
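The GitLab job composes its tox environment name from the `PYTHON`, `TORCH`, and `TRANSFORMERS` variables, so the `TORCH: 29` bump selects the same environment as the GitHub workflows above. Expanding the script line by hand (illustrative only):

```bash
# Mirror the job's variables after this change
PYTHON=12
TORCH=29
TRANSFORMERS=latest
echo "py3$PYTHON-torch$TORCH-tf_$TRANSFORMERS-unit"   # -> py312-torch29-tf_latest-unit
```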

CHANGELOG.rst

Lines changed: 19 additions & 15 deletions

@@ -1,31 +1,34 @@
 Model Optimizer Changelog (Linux)
 =================================
-0.41 (2025-12-xx)
-^^^^^^^^^^^^^^^^^
-
-**Deprecations**
-
-**New Features**
-
-- Add support for PyTorch Geometric quantization.
-
-**Misc**
-
-- Bump minimum recommended transformers version to 4.53.
-
 
-0.40 (2025-12-xx)
+0.40 (2025-12-11)
 ^^^^^^^^^^^^^^^^^
 
 **Bug Fixes**
 
 - Fix a bug in FastNAS pruning (computer vision models) where the model parameters were sorted twice messing up the ordering.
+- Fix Q/DQ/Cast node placements in 'FP32 required' tensors in custom ops in the ONNX quantization workflow.
 
 **New Features**
 
 - Add MoE (e.g. Qwen3-30B-A3B, gpt-oss-20b) pruning support for ``num_moe_experts``, ``moe_ffn_hidden_size`` and ``moe_shared_expert_intermediate_size`` parameters in Minitron pruning (``mcore_minitron``).
 - Add ``specdec_bench`` example to benchmark speculative decoding performance. See `examples/specdec_bench/README.md <https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/main/examples/specdec_bench#speculative-decoding-benchmark>`_ for more details.
 - Add FP8/NVFP4 KV cache quantization support for Megatron Core models.
+- Add KL Divergence loss based auto_quantize method. See `auto_quantize API docs <https://nvidia.github.io/TensorRT-Model-Optimizer/reference/generated/modelopt.torch.quantization.model_quant.html#modelopt.torch.quantization.model_quant.auto_quantize>`_ for more details.
+- Add support for saving and resuming auto_quantize search state. This speeds up the auto_quantize process by skipping the score estimation step if the search state is provided.
+- Add flag ``trt_plugins_precision`` in ONNX autocast to indicate custom ops precision. This is similar to the flag already existing in the quantization workflow.
+- Add support for PyTorch Geometric quantization.
+- Add per tensor and per channel MSE calibrator support.
+- Added support for PTQ/QAT checkpoint export and loading for running fakequant evaluation in vLLM. See ``examples/vllm_serve/README.md#load-qatptq-model-and-serve-in-vllm-wip`` for more details.
+
+**Documentation**
+
+- Deprecate ``examples/megatron-lm`` in favor of more detailed documentation in `Megatron-LM/examples/post_training/modelopt <https://github.com/NVIDIA/Megatron-LM/tree/main/examples/post_training/modelopt>`_.
+
+**Misc**
+
+- Bump minimum recommended transformers version to 4.53.
+- Replace ONNX simplification package from ``onnxsim`` to ``onnxslim``.
 
 0.39 (2025-11-11)
 ^^^^^^^^^^^^^^^^^
@@ -49,6 +52,7 @@ Model Optimizer Changelog (Linux)
 - Enabled native Modelopt quantization support for FP8 and NVFP4 formats in SGLang. See `SGLang quantization documentation <https://github.com/sgl-project/sglang/blob/main/docs/advanced_features/quantization.md#using-nvidia-modelopt>`_ for more details.
 - Added modelopt quantized checkpoints in vLLM/SGLang CI/CD pipelines (PRs are under review).
 - Add support for exporting QLoRA checkpoint fintuned using ModelOpt.
+- Update NVFP4 AWQ checkpoint export. It now fuses scaling factors of o_proj and down_proj layers into the model when possible to facilitate deployment.
 
 **Documentation**
 
@@ -72,7 +76,7 @@ Model Optimizer Changelog (Linux)
 - Upgrade TensorRT-LLM dependency to 1.1.0rc2.
 - Support Phi-4-multimodal and Qwen2.5-VL quantized HF checkpoint export in ``examples/vlm_ptq``.
 - Support storing and restoring Minitron pruning activations and scores for re-pruning without running the forward loop again.
-- Add Minitron pruning example for Megatron-LM framework. See ``examples/megatron-lm`` for more details.
+- Add Minitron pruning example for Megatron-LM framework. See `Megatron-LM/examples/post_training/modelopt <https://github.com/NVIDIA/Megatron-LM/tree/main/examples/post_training/modelopt>`_ for more details.
 
 0.35 (2025-09-04)
 ^^^^^^^^^^^^^^^^^
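One Misc entry swaps the ONNX simplification dependency from ``onnxsim`` to ``onnxslim``, consistent with the cmake install step removed from `.gitlab/tests.yml` above (cmake was only needed to build ``onnxsim`` from sdists). A minimal usage sketch; the CLI form and the `model.onnx` path are assumptions based on onnxslim's documented interface, not something this commit shows:

```bash
pip install onnxslim
# Simplify a model, analogous to the old `onnxsim model.onnx model_sim.onnx` invocation
onnxslim model.onnx model_slim.onnx
```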

examples/deepseek/.gitignore

Lines changed: 1 addition & 0 deletions

@@ -1 +1,2 @@
 DeepSeek-V3/
+DeepSeek-V3.2-Exp/

examples/deepseek/README.md

Lines changed: 37 additions & 7 deletions

@@ -1,39 +1,69 @@
-# Quantize Deepseek R1 to FP4
+# Quantize Deepseek models to FP4
 
-This example will demonstrate the steps to quantize DeepSeek R1 model to FP4 and export a unified checkpoint that can be deployed with TRT-LLM.
+This example will demonstrate the steps to quantize DeepSeek models to FP4 and export a unified checkpoint that can be deployed with TRT-LLM.
 
 ## Setup
 
 Due to the model size, currently it requires 8xH200 or 16xH100 to quantize the FP8 model, we will use 8xH200 as example.
 
-### Convert the HF checkpoint for deepseek FP8 inference
+## Convert the HF checkpoint for deepseek FP8 inference
 
 ```bash
 # set up variables to run the example
 export HF_FP8_CKPT={path_to_downloaded_hf_checkpoint}
 export DS_CKPT={path_to_save_converted_checkpoint}
 export FP4_QUANT_PATH={path_to_save_quantization_results}
 export HF_FP4_PATH={path_to_save_the_final_FP4_checkpoint}
+```
+
+### DeepSeek V3 R1 V3.1
 
-# download the FP8 checkpoint from Hugginface
+```bash
+# download the FP8 checkpoint from Hugginface. This is an example of DeepSeek-R1
 huggingface-cli download deepseek-ai/DeepSeek-R1 --local-dir $HF_FP8_CKPT
 
 # clone DeepSeek-V3 (base model of R1) Github repository for FP8 inference,
 git clone https://github.com/deepseek-ai/DeepSeek-V3.git && cd DeepSeek-V3 && git checkout 1398800
+```
+
+### [Experimental] DeepSeek V3.2
 
+```bash
+# download the FP8 checkpoint from Hugginface.
+huggingface-cli download deepseek-ai/DeepSeek-V3.2-Exp --local-dir $HF_FP8_CKPT
+
+# clone DeepSeek-V3.2 Github repository for FP8 inference,
+git clone https://github.com/deepseek-ai/DeepSeek-V3.2-Exp.git && cd DeepSeek-V3.2-Exp && git checkout 3b99a53
+
+# Install requirements
+pip install git+https://github.com/Dao-AILab/fast-hadamard-transform.git
+pip install -r inference/requirements.txt
+```
+
+### Convert the Checkpoint
+
+```bash
 # convert the HF checkpoint to a specific format for Deepseek
 python inference/convert.py --hf-ckpt-path $HF_FP8_CKPT --save-path $DS_CKPT --n-experts 256 --model-parallel 8
 ```
 
-### Post-training quantization
+## Post-training quantization
+
+### Run the calibration scripts
 
-#### Run the calibration scripts
+DeepSeek V3, R1, V3.1
 
 ```bash
 torchrun --nproc-per-node 8 --master_port=12346 ptq.py --model_path $DS_CKPT --config DeepSeek-V3/inference/configs/config_671B.json --quant_cfg NVFP4_DEFAULT_CFG --output_path $FP4_QUANT_PATH
 ```
 
-#### Quantize the FP8 hf checkpoint to FP4
+DeepSeek V3.2
+
+```bash
+torchrun --nproc-per-node 8 --master_port=12346 ptq.py --model_path $DS_CKPT --config DeepSeek-V3.2-Exp/inference/config_671B_v3.2.json --quant_cfg NVFP4_DEFAULT_CFG --output_path $FP4_QUANT_PATH
+```
+
+### Quantize the FP8 hf checkpoint to FP4
 
 We provide a one-step-script which will:
