
Commit 7af33d2

Update for 0.31.0 release
1 parent: 3039f76

378 files changed: +10128 additions, -5829 deletions


.dockerignore

Lines changed: 0 additions & 1 deletion
@@ -28,7 +28,6 @@ coverage.xml
 .pytest_cache/
 
 # Sphinx documentation
-docs/_build
 docs/build
 docs/source/reference/generated
 

.gitignore

Lines changed: 0 additions & 1 deletion
@@ -22,7 +22,6 @@ coverage.xml
 .pytest_cache/
 
 # Sphinx documentation
-docs/_build
 docs/build
 docs/source/reference/generated
 

.pre-commit-config.yaml

Lines changed: 15 additions & 2 deletions
@@ -12,6 +12,9 @@ repos:
         args: [--maxkb=500, --enforce-all]
         exclude: >
           (?x)^(
+            internal/experimental/GTC_2024_demo/SDXL_PTQ.ipynb|
+            internal/experimental/vae_training/.*|
+            internal/examples/diffusers/quantization/assets/.*.png|
             examples/diffusers/quantization/assets/.*.png|
             examples/diffusers/cache_diffusion/assets/.*.png|
           )$
@@ -22,6 +25,7 @@ repos:
       - id: check-toml
       - id: check-yaml
         args: [--allow-multiple-documents]
+        exclude: ^internal/.gitlab/ # !references are not supported
       - id: debug-statements
       - id: end-of-file-fixer
       - id: mixed-line-ending
@@ -36,7 +40,7 @@ repos:
         exclude: ^.github/
 
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.11.6
+    rev: v0.11.9
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix]
@@ -110,7 +114,15 @@ repos:
             examples/llm_sparsity/finetune.py|
             examples/speculative_decoding/main.py|
             examples/speculative_decoding/medusa_utils.py|
-            examples/speculative_decoding/vllm_generate.py|
+            examples/speculative_decoding/server_generate.py|
+            internal/examples/diffusers/cache_diffusion/cache_diffusion/module.py|
+            internal/examples/diffusers/cache_diffusion/pipeline/models/sdxl.py|
+            internal/examples/mlperf/infer.py|
+            internal/examples/onnx_ptq/quantize_llama.py|
+            internal/examples/torchvision/modelopt_torchvision.py|
+            internal/examples/vlm_eval/data_utils.py|
+            internal/examples/vlm_eval/eval_utils.py|
+            internal/examples/vlm_eval/mmmu.py|
           )$
 
 # Default hook for Apache 2.0 in core c/c++/cuda files
@@ -155,3 +167,4 @@ repos:
       - id: lychee
         args: ["--no-progress", "--exclude-loopback"]
         stages: [manual] # Only run with `pre-commit run --all-files --hook-stage manual lychee`
+        exclude: internal/

CHANGELOG.rst

Lines changed: 28 additions & 0 deletions
@@ -1,6 +1,34 @@
 Model Optimizer Changelog (Linux)
 =================================
 
+0.31 (2025-06-04)
+^^^^^^^^^^^^^^^^^
+
+**Backward Breaking Changes**
+
+- NeMo and Megatron-LM distributed checkpoints (``torch-dist``) stored with a legacy version can no longer be loaded. The remedy is to load the legacy distributed checkpoint with 0.29, store a ``torch`` checkpoint, and resume with 0.31 to convert to the new format. The following changes only apply to storing and resuming distributed checkpoints.
+  - The ``quantizer_state`` of :class:`TensorQuantizer <modelopt.torch.quantization.nn.modules.TensorQuantizer>` is now stored in the ``extra_state`` of :class:`QuantModule <modelopt.torch.quantization.nn.module.QuantModule>`, where it used to be stored in the sharded ``modelopt_state``.
+  - The dtype and shape of ``amax`` and ``pre_quant_scale`` stored in the distributed checkpoint are now restored. Previously, some dtypes and shapes were changed to give all decoder layers a homogeneous structure in the checkpoint.
+  - Together with megatron.core-0.13, quantized models now store and resume distributed checkpoints in a heterogeneous format.
+- The ``auto_quantize`` API now accepts a list of quantization config dicts as the list of quantization choices.
+  - This API previously accepted a list of quantization format names (strings) and was therefore limited to pre-defined quantization formats, short of workarounds.
+  - With this change, users can now easily use their own custom quantization formats with ``auto_quantize``.
+  - In addition, ``quantization_formats`` now excludes ``None`` (indicating "do not quantize") as a valid format, because ``auto_quantize`` internally always adds "do not quantize" as an option anyway.
+- The model export config is refactored. The quantization config in ``hf_quant_config.json`` is converted and saved to ``config.json``. ``hf_quant_config.json`` will be deprecated soon.
+
+
+**Deprecations**
+
+- Deprecate ``Python 3.9`` support.
+
+**New Features**
+
+- Upgrade LLM examples to use TensorRT-LLM 0.19.
+- Add new model support in the ``llm_ptq`` example: Qwen3 MoE.
+- ModelOpt now supports advanced quantization algorithms such as AWQ, SVDQuant, and SmoothQuant for CPU-offloaded Hugging Face models.
+- Add the AutoCast tool to convert ONNX models to FP16 or BF16.
+- Add a ``--low_memory_mode`` flag in the ``llm_ptq`` example to initialize HF models with compressed weights and reduce the peak memory of PTQ and quantized checkpoint export.
+
 0.29 (2025-05-08)
 ^^^^^^^^^^^^^^^^^
 
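The first backward-breaking change above (``quantizer_state`` moving into ``extra_state``) builds on PyTorch's standard extra-state hooks. As an editor's illustration (a generic sketch, not ModelOpt's actual implementation; the ``QuantLinearSketch`` class and its ``quantizer_state`` dict are hypothetical), any module that defines the two hooks below gets its extra state serialized into the regular ``state_dict``:

    import torch.nn as nn

    class QuantLinearSketch(nn.Linear):
        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.quantizer_state = {"num_bits": 8, "axis": None}  # hypothetical metadata

        def get_extra_state(self):
            # Called by state_dict(); the returned object is stored next to
            # the parameters under the key "<prefix>._extra_state".
            return {"quantizer_state": self.quantizer_state}

        def set_extra_state(self, state):
            # Called by load_state_dict() with the object saved above.
            self.quantizer_state = state["quantizer_state"]

    m = QuantLinearSketch(4, 4)
    sd = m.state_dict()
    print(sorted(sd))  # ['_extra_state', 'bias', 'weight']

    m2 = QuantLinearSketch(4, 4)
    m2.load_state_dict(sd)  # quantizer_state round-trips with the checkpoint

Because the extra state travels inside the module's own ``state_dict``, it can be stored and resumed together with the weights, which is the property the checkpoint change above relies on.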

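Similarly, the ``auto_quantize`` change is easiest to see in code. Below is a minimal sketch of the new-style call with config dicts as the search choices; ``model`` and ``calib_loader`` are placeholders, and the keyword names are assumptions based on the 0.31 API that should be verified against the installed package:

    import modelopt.torch.quantization as mtq

    # Choices are now config dicts (including user-defined ones), not
    # pre-defined format-name strings; None is no longer a valid entry.
    quant_choices = [mtq.FP8_DEFAULT_CFG, mtq.INT4_AWQ_CFG]

    model, search_state = mtq.auto_quantize(
        model,                                # placeholder torch.nn.Module
        constraints={"effective_bits": 4.8},  # placeholder search constraint
        quantization_formats=quant_choices,
        data_loader=calib_loader,             # placeholder calibration loader
        forward_step=lambda mod, batch: mod(**batch),
        loss_func=lambda output, batch: output.loss,
    )

A custom config dict, for example a copy of ``mtq.INT4_AWQ_CFG`` with modified quantizer attributes, can be dropped into ``quant_choices`` the same way, which is the point of the change.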
README.md

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@
 
 ## Latest News
 
+- [2025/05/14] [NVIDIA TensorRT Unlocks FP4 Image Generation for NVIDIA Blackwell GeForce RTX 50 Series GPUs](https://developer.nvidia.com/blog/nvidia-tensorrt-unlocks-fp4-image-generation-for-nvidia-blackwell-geforce-rtx-50-series-gpus/)
 - [2025/04/21] [Adobe optimized deployment using TensorRT-Model-Optimizer + TensorRT leading to a 60% reduction in diffusion latency, a 40% reduction in total cost of ownership](https://developer.nvidia.com/blog/optimizing-transformer-based-diffusion-models-for-video-generation-with-nvidia-tensorrt/)
 - [2025/04/05] [NVIDIA Accelerates Inference on Meta Llama 4 Scout and Maverick](https://developer.nvidia.com/blog/nvidia-accelerates-inference-on-meta-llama-4-scout-and-maverick/). Check out how to quantize Llama4 for deployment acceleration [here](./examples/llm_ptq/README.md#llama-4)
 - [2025/03/18] [World's Fastest DeepSeek-R1 Inference with Blackwell FP4 & Increasing Image Generation Efficiency on Blackwell](https://developer.nvidia.com/blog/nvidia-blackwell-delivers-world-record-deepseek-r1-inference-performance/)

docker/Dockerfile

Lines changed: 44 additions & 35 deletions
@@ -1,47 +1,56 @@
-FROM nvidia/cuda:12.8.1-devel-ubuntu22.04
+FROM nvcr.io/nvidia/pytorch:25.03-py3
+
+ARG PIP_EXTRA_INDEX_URL="https://pypi.nvidia.com"
+ARG TRT_LLM_COMMIT=v0.19.0
+ARG REMOVE_TRT_LLM_SRC=1
+ARG CUDA_ARCH="89-real;90-real;100-real"
+
+ENV PIP_EXTRA_INDEX_URL=$PIP_EXTRA_INDEX_URL \
+    PIP_NO_CACHE_DIR=off \
+    PIP_CONSTRAINT= \
+    TORCH_CUDA_ARCH_LIST="8.0 8.6 8.7 8.9 9.0 10.0+PTX"
 
 WORKDIR /workspace
 
-RUN apt-get update && \
-    apt-get -y install python3.10 python3-pip python-is-python3 openmpi-bin libopenmpi-dev libgl1 libglib2.0-0 wget git git-lfs unzip jq cmake vim && \
-    rm -rf /var/lib/apt/lists/*
+# Install TensorRT-LLM from source
+RUN --mount=type=ssh,id=nvidia git clone https://github.com/NVIDIA/TensorRT-LLM.git tensorrt-llm \
+    && cd tensorrt-llm \
+    && git checkout ${TRT_LLM_COMMIT} \
+    && git submodule update --init --recursive
 
-ARG PIP_EXTRA_INDEX_URL="https://pypi.nvidia.com"
-ENV PIP_EXTRA_INDEX_URL=$PIP_EXTRA_INDEX_URL
-ENV PIP_NO_CACHE_DIR=off
-
-# Install the latest setuptools using pip
-RUN rm -rf /usr/lib/python3/dist-packages/setuptools* && \
-    pip install --upgrade pip setuptools
-
-# Install TensorRT-LLM
-ARG TRT_LLM_VERSION=0.18.1
-RUN pip install "tensorrt-llm~=$TRT_LLM_VERSION" -U
-RUN git clone --depth 1 --branch "v$TRT_LLM_VERSION" https://github.com/NVIDIA/TensorRT-LLM.git && \
-    mkdir tensorrt-llm && \
-    mv TensorRT-LLM/benchmarks/ tensorrt-llm && \
-    rm -rf TensorRT-LLM
-RUN cd /usr/local/lib/python3.10/dist-packages/tensorrt_llm/libs && ln -s libnvinfer_plugin_tensorrt_llm.so libnvinfer_plugin_tensorrt_llm.so.10
-ENV LD_LIBRARY_PATH=/usr/local/lib/python3.10/dist-packages/tensorrt_llm/libs:$LD_LIBRARY_PATH
+# Install required dependencies
+RUN bash tensorrt-llm/docker/common/install_base.sh $(python --version 2>&1 | awk '{print $2}')
+RUN bash tensorrt-llm/docker/common/install_cmake.sh
+RUN bash tensorrt-llm/docker/common/install_mpi4py.sh
+RUN bash tensorrt-llm/docker/common/install_tensorrt.sh
+RUN bash tensorrt-llm/docker/common/install_cuda_toolkit.sh
+
+RUN cd tensorrt-llm && git lfs install && git lfs pull
+
+RUN cd tensorrt-llm \
+    && ./scripts/build_wheel.py --job_count $(nproc) --clean --python_bindings --benchmarks --install --cuda_architecture=${CUDA_ARCH} \
+    && git rev-parse --short HEAD > /workspace/tensorrt-llm.commit \
+    && chmod -R 777 .
+RUN pip install tensorrt-llm/build/tensorrt_llm*.whl
+
+# Remove TensorRT-LLM source code to reduce image size except for benchmarks and examples folders
+RUN if [ "$REMOVE_TRT_LLM_SRC" = "1" ]; then \
+        mkdir -p tensorrt-llm_keep; \
+        mv tensorrt-llm/benchmarks tensorrt-llm_keep/benchmarks; \
+        mv tensorrt-llm/examples tensorrt-llm_keep/examples; \
+        rm -rf tensorrt-llm; \
+        mv tensorrt-llm_keep tensorrt-llm; \
+    fi
+
+# Update PATH and LD_LIBRARY_PATH variables for the TensorRT binaries
+ENV LD_LIBRARY_PATH="/usr/local/tensorrt/targets/x86_64-linux-gnu/lib:${LD_LIBRARY_PATH}" \
+    PATH="/usr/local/tensorrt/targets/x86_64-linux-gnu/bin:${PATH}"
 
 # Export the path to 'libcudnn.so.X' needed by 'libonnxruntime_providers_tensorrt.so'
-ENV LD_LIBRARY_PATH=/usr/local/lib/python3.10/dist-packages/nvidia/cudnn/lib:$LD_LIBRARY_PATH
-
-# Install TensorRT dev environment
-ARG TENSORRT_URL=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.9.0/tars/TensorRT-10.9.0.34.Linux.x86_64-gnu.cuda-12.8.tar.gz
-RUN wget -q -O tensorrt.tar.gz $TENSORRT_URL && \
-    tar -xf tensorrt.tar.gz && \
-    cp TensorRT-*/bin/trtexec /usr/local/bin && \
-    cp TensorRT-*/include/* /usr/include/x86_64-linux-gnu && \
-    python -m pip install TensorRT-*/python/tensorrt-*-cp310-none-linux_x86_64.whl && \
-    cp -a TensorRT-*/targets/x86_64-linux-gnu/lib/* /usr/local/lib/python3.10/dist-packages/tensorrt_libs && \
-    rm -rf TensorRT-*.Linux.x86_64-gnu.cuda-*.tar.gz TensorRT-* tensorrt.tar.gz
-ENV TRT_LIB_PATH=/usr/local/lib/python3.10/dist-packages/tensorrt_libs
-ENV LD_LIBRARY_PATH=$TRT_LIB_PATH:$LD_LIBRARY_PATH
+ENV LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
 
 # Install modelopt with all optional dependencies and pre-compile CUDA extensions otherwise they take several minutes on every docker run
 RUN pip install -U "nvidia-modelopt[all,dev-test]"
-ENV TORCH_CUDA_ARCH_LIST="8.0 8.6 8.7 8.9 9.0 10.0+PTX"
 RUN python -c "import modelopt.torch.quantization.extensions as ext; ext.precompile()"
 
 # Find and install requirements.txt files for all examples excluding windows

docs/source/_ext/modelopt_autodoc_pydantic.py

Lines changed: 3 additions & 2 deletions
@@ -17,8 +17,9 @@
 
 import json
 import types
+from collections.abc import Callable
 from contextlib import contextmanager, nullcontext
-from typing import Any, Callable
+from typing import Any
 
 from sphinx.application import Sphinx
 from sphinxcontrib.autodoc_pydantic import __version__
@@ -112,7 +113,7 @@ def add_default_dict(self) -> None:
         # create valid rst lines from the config
         config_json = json.dumps(config, default=str, indent=3)
         lines = [f" {line}" for line in config_json.split("\n")]
-        lines = [":Default config (JSON):", "", ".. code-block:: json", ""] + lines + [""]
+        lines = [":Default config (JSON):", "", ".. code-block:: json", "", *lines, ""]
 
         # add lines to autodoc
         source_name = self.get_sourcename()

docs/source/_templates/autosummary/module.rst

Lines changed: 3 additions & 2 deletions
@@ -10,8 +10,9 @@
    :toctree:
    :recursive:
 {% for item in modules %}
-{% if '.plugins.' not in item or item == 'modelopt.torch.opt.plugins.huggingface' %}
-   {{ item }}
+{% set full_item = fullname + '.' + item.split('.')[-1] %}
+{% if '.plugins.' not in full_item or full_item == 'modelopt.torch.opt.plugins.huggingface' %}
+   {{ full_item }}
 {% endif %}
 {%- endfor %}
 {% endif %}

docs/source/conf.py

Lines changed: 4 additions & 34 deletions
@@ -32,29 +32,14 @@
 # sys.path.insert(0, os.path.abspath('.'))
 
 import os
-import shutil
 import sys
-import tempfile
 
 import sphinx.application
 from docutils import nodes
 from docutils.nodes import Element
-from pypandoc.pandoc_download import download_pandoc
 from sphinx.writers.html5 import HTML5Translator
 
-from modelopt import __version__  # noqa: E402
-
-if not shutil.which("pandoc"):
-    # Install pandoc if it is not installed.
-    # Default `targetfolder` for Mac (~/Applications/pandoc) is not in `$PATH` so use whatever is in PATH
-    # Pandoc is required by nbconvert but it is not included in the pypandoc pip package
-    with tempfile.TemporaryDirectory() as tmpdir:
-        download_pandoc(
-            version="3.1.13",
-            download_folder=tmpdir,
-            targetfolder=os.environ["PATH"].split(os.pathsep)[0],
-            delete_installer=True,
-        )
+from modelopt import __version__
 
 sys.path.insert(0, os.path.abspath("../../"))
 sys.path.append(os.path.abspath("./_ext"))
@@ -75,13 +60,11 @@
     "sphinx.ext.autodoc",
     "sphinx.ext.autosummary",
     "sphinx.ext.githubpages",
-    "sphinx.ext.napoleon",
-    # "sphinx.ext.viewcode",
+    "sphinx.ext.napoleon",  # Support for NumPy and Google style docstrings
+    "sphinxarg.ext",  # for command-line help documentation
     "sphinx_copybutton",  # line numbers getting copied so cannot use `:linenos:`
     "sphinx_inline_tabs",
-    "nbsphinx",  # rendering jupyter notebooks in docs
     "sphinx_togglebutton",
-    "IPython.sphinxext.ipython_console_highlighting",
     "sphinxcontrib.autodoc_pydantic",
     "modelopt_autodoc_pydantic",
 ]
@@ -132,7 +115,7 @@
 
 
 # Mock imports for autodoc
-autodoc_mock_imports = ["mpi4py", "tensorrt_llm"]
+autodoc_mock_imports = ["mpi4py", "tensorrt_llm", "triton"]
 
 autosummary_generate = True
 autosummary_imported_members = False
@@ -162,19 +145,6 @@
 autodoc_member_order = "alphabetical"  # can also use `bysource` or `groupwise` to sort members
 
 
-# Do not auto-execute notebooks where all outputs are empty
-nbsphinx_execute = "never"
-
-# Add link to download notebook on top of each notebook tutorial!
-nbsphinx_prolog = r"""
-.. raw:: html
-
-    <div class="admonition note">
-        This tutorial is available as a Jupyter Notebook!
-        <a href="{{ env.doc2path(env.docname, base=None).split('/')|last|e }}">Download notebook from here</a>.
-    </div>
-"""
-
 # autodoc_pydantic model settings
 autodoc_pydantic_model_show_config_summary = False
 autodoc_pydantic_model_show_validator_summary = False

docs/source/getting_started/_installation_for_Linux.rst

Lines changed: 2 additions & 2 deletions
@@ -12,15 +12,15 @@ Latest Model Optimizer (``nvidia-modelopt``) currently has the following system
 +-------------------------+-----------------------------+
 | Architecture            | x86_64, aarch64 (SBSA)      |
 +-------------------------+-----------------------------+
-| Python                  | >=3.9,<3.13                 |
+| Python                  | >=3.10,<3.13                |
 +-------------------------+-----------------------------+
 | CUDA                    | >=12.0                      |
 +-------------------------+-----------------------------+
 | PyTorch (Optional)      | >=2.4                       |
 +-------------------------+-----------------------------+
 | TensorRT-LLM (Optional) | 0.18                        |
 +-------------------------+-----------------------------+
-| ONNX Runtime (Optional) | 1.20 (Python>=3.10)         |
+| ONNX Runtime (Optional) | 1.22                        |
 +-------------------------+-----------------------------+
 | TensorRT (Optional)     | >=10.0                      |
 +-------------------------+-----------------------------+
