diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 924861c1..61a416ee 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -30,7 +30,6 @@ modelopt/torch/trace @NVIDIA/modelopt-torch-nas-prune-codeowners
 modelopt/torch/utils @NVIDIA/modelopt-torch-utils-codeowners

 # Examples
-/docker @NVIDIA/modelopt-docker-codeowners
 /README.md @NVIDIA/modelopt-examples-codeowners
 /examples @NVIDIA/modelopt-examples-codeowners
 /examples/chained_optimizations @NVIDIA/modelopt-torch-nas-prune-codeowners
diff --git a/.github/workflows/example_tests.yml b/.github/workflows/example_tests.yml
index 57b9f04c..272b84bf 100644
--- a/.github/workflows/example_tests.yml
+++ b/.github/workflows/example_tests.yml
@@ -68,15 +68,17 @@ jobs:
     container: &example_container
       image: nvcr.io/nvidia/tensorrt-llm/release:1.1.0rc2.post2
       env:
-        LD_LIBRARY_PATH: "/usr/lib/x86_64-linux-gnu:/usr/local/tensorrt/targets/x86_64-linux-gnu/lib:${LD_LIBRARY_PATH}"
-        # PATH: "/usr/local/tensorrt/targets/x86_64-linux-gnu/bin:${PATH}"
         PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages
     steps: &example_steps
       - uses: actions/checkout@v4
       - uses: nv-gha-runners/setup-proxy-cache@main
+      - name: Setup environment variables
+        run: |
+          echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu:/usr/local/tensorrt/targets/x86_64-linux-gnu/lib" >> $GITHUB_ENV
+          echo "PATH=${PATH}:/usr/local/tensorrt/targets/x86_64-linux-gnu/bin" >> $GITHUB_ENV
      - name: Run example tests
        run: |
-          pip install ".[all,dev-test]"
+          pip install ".[hf,dev-test]"
          find examples/${{ matrix.EXAMPLE }} -name "requirements.txt" | while read req_file; do pip install -r "$req_file" || exit 1; done
          pytest -s tests/examples/${{ matrix.EXAMPLE }}
   example-tests-non-pr:
diff --git a/.github/workflows/gpu_tests.yml b/.github/workflows/gpu_tests.yml
index 693c99b1..402191dc 100644
--- a/.github/workflows/gpu_tests.yml
+++ b/.github/workflows/gpu_tests.yml
@@ -66,11 +66,14 @@ jobs:
       image: nvcr.io/nvidia/pytorch:25.06-py3
       env:
         GIT_DEPTH: 1000 # For correct version for tests/gpu/torch/quantization/plugins/test_megatron.py
-        LD_LIBRARY_PATH: "/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" # Add libcudnn*.so and libnv*.so to path.
         PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages
     steps: &gpu_steps
       - uses: actions/checkout@v4
       - uses: nv-gha-runners/setup-proxy-cache@main
+      - name: Setup environment variables
+        run: |
+          echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu:/usr/local/tensorrt/targets/x86_64-linux-gnu/lib" >> $GITHUB_ENV
+          echo "PATH=${PATH}:/usr/local/tensorrt/targets/x86_64-linux-gnu/bin" >> $GITHUB_ENV
      - name: Run gpu tests
        run: pip install tox-current-env && tox -e py312-cuda12-gpu --current-env
   gpu-tests-non-pr:
diff --git a/.gitlab/tests.yml b/.gitlab/tests.yml
index 91640b11..e4b5c90e 100644
--- a/.gitlab/tests.yml
+++ b/.gitlab/tests.yml
@@ -1,11 +1,12 @@
-# NOTE: Make sure this file is consistent with .github/workflows/{unit,gpu}_tests.yml
+# NOTE: Make sure this file is consistent with .github/workflows/{unit,gpu,example}_tests.yml
 .tests-default:
+  variables:
+    PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages
   stage: tests
   rules:
     - if: $CI_PIPELINE_SOURCE == "schedule"
-      when: always
-    - if: $CI_PIPELINE_SOURCE != "schedule"
-      when: manual
+    - if: $CI_COMMIT_TAG =~ /^\d+\.\d+\.\d+$/
+    - when: manual

 ##### Unit Tests #####
 unit:
@@ -24,50 +25,74 @@ unit:
     - tox -e py3$PYTHON-torch$TORCH-tf_$TRANSFORMERS-unit

 ##### GPU Tests #####
-gpu:
+.multi-gpu-tests-default:
   extends: .tests-default
   timeout: 60m
   image: nvcr.io/nvidia/pytorch:25.06-py3
   variables:
     GIT_DEPTH: 1000 # For correct version for tests/gpu/torch/quantization/plugins/test_megatron.py
-    LD_LIBRARY_PATH: "/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" # Add libcudnn*.so and libnv*.so to path.
-    PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages
   tags: [docker, linux, 2-gpu]
+  before_script:
+    # Add libcudnn*.so and libnv*.so to path
+    - export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu:/usr/local/tensorrt/targets/x86_64-linux-gnu/lib"
+    # Add trtexec to path
+    - export PATH="${PATH}:/usr/local/tensorrt/targets/x86_64-linux-gnu/bin"
+    # Install git-lfs for Daring-Anteater dataset
+    - apt-get update && apt-get install -y git-lfs
+    - git lfs install --system
+
+multi-gpu:
+  extends: .multi-gpu-tests-default
   script:
     # Use pre-installed packages without a new venv with tox-current-env
     - pip install tox-current-env
     - tox -e py312-cuda12-gpu --current-env

 ##### Example Tests #####
-example:
-  extends: .tests-default
-  stage: tests
-  timeout: 45m
-  image: gitlab-master.nvidia.com:5005/omniml/modelopt/modelopt_examples:latest
-  variables:
-    TEST_TYPE: pytest
-  tags: [docker, linux, 2-gpu, sm<89]
+example-torch:
+  extends: .multi-gpu-tests-default
+  timeout: 30m
   parallel:
     matrix:
-      - EXAMPLE: [diffusers, llm_distill, llm_qat, llm_sparsity, onnx_ptq, speculative_decoding]
-  allow_failure: true # Allow to continue next stages even if job is canceled (e.g. during release)
-  before_script:
-    - pip install ".[all,dev-test]"
+      - EXAMPLE: [llm_distill, llm_sparsity, speculative_decoding]
   script:
-    # Uninstall apex since T5 Int8 (PixArt) + Apex is not supported as per https://github.com/huggingface/transformers/issues/21391
-    - if [ "$EXAMPLE" = "diffusers" ]; then pip uninstall -y apex; fi
+    - pip install ".[hf,dev-test]"
     - find examples/$EXAMPLE -name "requirements.txt" | while read req_file; do pip install -r "$req_file" || exit 1; done
-    - if [ "$TEST_TYPE" = "pytest" ]; then pytest -s tests/examples/$EXAMPLE; else bash tests/examples/test_$EXAMPLE.sh; fi
+    - pytest -s tests/examples/$EXAMPLE

-example-ada:
-  extends: example
+# TODO: Fix llm_qat test hang in GitLab CI
+example-failing:
+  extends: example-torch
+  allow_failure: true
+  parallel:
+    matrix:
+      - EXAMPLE: [llm_qat]
+
+example-trtllm:
+  extends: example-torch
   timeout: 60m
+  image: nvcr.io/nvidia/tensorrt-llm/release:1.1.0rc2.post2
+  tags: [docker, linux, 2-gpu, sm>=89]
+  parallel:
+    matrix:
+      - EXAMPLE: [llm_autodeploy, llm_eval, llm_ptq, vlm_ptq]
+
+example-onnx:
+  extends: example-torch
+  image: nvcr.io/nvidia/tensorrt:25.08-py3
   tags: [docker, linux, 2-gpu, sm>=89]
   parallel:
     matrix:
-      - EXAMPLE: [llm_eval, llm_ptq, vlm_ptq, llm_autodeploy]
+      - EXAMPLE: [diffusers, onnx_ptq]
+        TEST_TYPE: pytest
       - EXAMPLE: [onnx_ptq]
         TEST_TYPE: bash
+  script:
+    # Uninstall apex since T5 Int8 (PixArt) + Apex is not supported as per https://github.com/huggingface/transformers/issues/21391
+    - if [ "$EXAMPLE" = "diffusers" ]; then pip uninstall -y apex; fi
+    - pip install ".[all,dev-test]"
+    - find examples/$EXAMPLE -name "requirements.txt" | while read req_file; do pip install -r "$req_file" || exit 1; done
+    - if [ "$TEST_TYPE" = "pytest" ]; then pytest -s tests/examples/$EXAMPLE; else bash tests/examples/test_$EXAMPLE.sh; fi

 ##### Megatron / NeMo Integration Tests #####
 megatron-nemo-integration:
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 8dc315c4..38d2a52b 100755
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -6,6 +6,7 @@ Model Optimizer Changelog (Linux)

 **Deprecations**

+- Deprecated ModelOpt's custom docker images. Please use the PyTorch, TensorRT-LLM or TensorRT docker image directly or refer to the `installation guide <https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/2_installation.html>`_ for more details.
 - Deprecated ``quantize_mode`` argument in ``examples/onnx_ptq/evaluate.py`` to support strongly typing. Use ``engine_precision`` instead.
 - Deprecated TRT-LLM's TRT backend in ``examples/llm_ptq`` and ``examples/vlm_ptq``. Tasks ``build`` and ``benchmark`` support are removed and replaced with ``quant``. For performance evaluation, please use ``trtllm-bench`` directly.
 - ``--export_fmt`` flag in ``examples/llm_ptq`` is removed. By default we export to the unified Hugging Face checkpoint format.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 27182ace..52568976 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -11,7 +11,7 @@ pip install -e ".[dev]"
 ```

 If you are working on features that require dependencies like TensorRT-LLM or Megatron-Core, consider using a docker container to simplify the setup process.
-See [docker README](./README.md#installation--docker) for more details.
+Visit our [installation docs](https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/2_installation.html) for more information.

 ## 🧹 Code linting and formatting

diff --git a/README.md b/README.md
index a6c88c78..c19bfdde 100644
--- a/README.md
+++ b/README.md
@@ -61,10 +61,10 @@ Model Optimizer is also integrated with [NVIDIA NeMo](https://github.com/NVIDIA-
 To install stable release packages for Model Optimizer with `pip` from [PyPI](https://pypi.org/project/nvidia-modelopt/):

 ```bash
-pip install nvidia-modelopt[all]
+pip install -U nvidia-modelopt[all]
 ```

-To install from source in editable mode with all development dependencies or to test the latest changes, run:
+To install from source in editable mode with all development dependencies or to use the latest features, run:

 ```bash
 # Clone the Model Optimizer repository
@@ -74,7 +74,11 @@ cd TensorRT-Model-Optimizer
 pip install -e .[dev]
 ```

-Visit our [installation guide](https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/2_installation.html) for more fine-grained control on installed dependencies or view our pre-made [dockerfiles](docker/README.md) for more information.
+You can also directly use the [TensorRT-LLM docker images](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release/tags)
+(e.g., `nvcr.io/nvidia/tensorrt-llm/release:<version>`), which have Model Optimizer pre-installed.
+Make sure to upgrade Model Optimizer to the latest version using ``pip`` as described above.
+Visit our [installation guide](https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/2_installation.html) for
+more fine-grained control over installed dependencies or for alternative docker images and environment variables to set up.

 ## Techniques

diff --git a/docker/Dockerfile b/docker/Dockerfile
deleted file mode 100644
index 8a736d25..00000000
--- a/docker/Dockerfile
+++ /dev/null
@@ -1,27 +0,0 @@
-FROM nvcr.io/nvidia/tensorrt-llm/release:1.1.0rc2.post2
-
-ENV PIP_EXTRA_INDEX_URL="https://pypi.nvidia.com" \
-    PIP_NO_CACHE_DIR=off \
-    PIP_CONSTRAINT= \
-    TORCH_CUDA_ARCH_LIST="8.0 8.6 8.7 8.9 9.0 10.0 12.0+PTX"
-
-RUN apt-get update && \
-    apt-get install -y libgl1 && \
-    rm -rf /var/lib/apt/lists/*
-
-WORKDIR /workspace
-
-RUN ln -s /app/tensorrt_llm /workspace/tensorrt_llm
-
-# Update PATH and LD_LIBRARY_PATH variables for the TensorRT binaries
-ENV LD_LIBRARY_PATH="/usr/lib/x86_64-linux-gnu:/usr/local/tensorrt/targets/x86_64-linux-gnu/lib:${LD_LIBRARY_PATH}" \
-    PATH="/usr/local/tensorrt/targets/x86_64-linux-gnu/bin:${PATH}"
-
-# Install modelopt from source with all optional dependencies and pre-compile CUDA extensions otherwise they take several minutes on every docker run
-COPY . TensorRT-Model-Optimizer
-RUN pip install -e "./TensorRT-Model-Optimizer[all]"
-RUN rm -rf TensorRT-Model-Optimizer/.git
-RUN python -c "import modelopt.torch.quantization.extensions as ext; ext.precompile()"
-
-# Allow users to run without root
-RUN chmod -R 777 /workspace
diff --git a/docker/README.md b/docker/README.md
deleted file mode 100644
index 1b6984b8..00000000
--- a/docker/README.md
+++ /dev/null
@@ -1,16 +0,0 @@
-# ModelOpt Docker
-
-This folder contains the Dockerfile for the ModelOpt docker image.
-
-## Building the Docker Image
-
-To build the docker image, run the following command from the root of the repository:
-
-```bash
-bash docker/build.sh
-```
-
-The docker image will be built and tagged as `docker.io/library/modelopt_examples:latest`.
-
-> [!NOTE]
-> For ONNX PTQ, use the optimized docker image from [onnx_ptq Dockerfile](../examples/onnx_ptq/docker/) instead of this one.
diff --git a/docker/build.sh b/docker/build.sh
deleted file mode 100755
index 7addcdd5..00000000
--- a/docker/build.sh
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/bin/bash
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-set -e
-
-docker build --network=host --progress=plain . -f docker/Dockerfile -t modelopt_examples:latest "$@"
diff --git a/docs/source/getting_started/_installation_for_Linux.rst b/docs/source/getting_started/_installation_for_Linux.rst
index 16afac64..7c214fd3 100644
--- a/docs/source/getting_started/_installation_for_Linux.rst
+++ b/docs/source/getting_started/_installation_for_Linux.rst
@@ -30,39 +30,30 @@ Environment setup

     .. tab:: Docker image (Recommended)

-        **Using ModelOpt's docker image**
+        To use Model Optimizer with full dependencies (e.g. TensorRT/TensorRT-LLM deployment), we recommend using the
+        `TensorRT-LLM docker image <https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release/tags>`_,
+        e.g., ``nvcr.io/nvidia/tensorrt-llm/release:<version>``.

-        To use Model Optimizer with full dependencies (e.g. TensorRT/TensorRT-LLM deployment), we recommend using our provided docker image
-        which is based on the `TensorRT-LLM `_
-        docker image with additional dependencies installed.
+        Make sure to upgrade Model Optimizer to the latest version using ``pip`` as described in the next section.

-        After installing the `NVIDIA Container Toolkit `_,
-        please run the following commands to build the Model Optimizer docker container which has all the base
-        dependencies pre-installed. You may need to install additional dependencies from the examples's `requirements.txt` file.
+        You would also need to set up appropriate environment variables for the TensorRT binaries as follows:

         .. code-block:: shell

-            # Clone the ModelOpt repository
-            git clone git@github.com:NVIDIA/TensorRT-Model-Optimizer.git
-            cd TensorRT-Model-Optimizer
+            export PIP_CONSTRAINT=""
+            export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu:/usr/local/tensorrt/targets/x86_64-linux-gnu/lib"
+            export PATH="${PATH}:/usr/local/tensorrt/targets/x86_64-linux-gnu/bin"

-            # Build the docker (will be tagged `docker.io/library/modelopt_examples:latest`)
-            # You may customize `docker/Dockerfile` to include or exclude certain dependencies you may or may not need.
-            bash docker/build.sh
+        You may need to install additional dependencies from the respective example's `requirements.txt` file.

-            # Run the docker image
-            docker run --gpus all -it --shm-size 20g --rm docker.io/library/modelopt_examples:latest bash
-
-            # Check installation (inside the docker container)
-            python -c "import modelopt; print(modelopt.__version__)"
-
-        **Using alternative NVIDIA docker images**
+        **Alternative NVIDIA docker images**

         For PyTorch, you can also use `NVIDIA NGC PyTorch container `_
         and for NVIDIA NeMo framework, you can use the `NeMo container `_.
         Both of these containers come with Model Optimizer pre-installed. Make sure to update the Model Optimizer to the latest version if not already.

-        For ONNX PTQ, you can use the optimized docker image from [onnx_ptq Dockerfile](https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/main/examples/onnx_ptq/docker).
+        For ONNX / TensorRT use cases, you can also use the `TensorRT container `_
+        which provides superior performance to the PyTorch container.

     .. tab:: Local environment (PIP / Conda)

@@ -86,9 +77,8 @@ Environment setup

         If you wish to use ModelOpt in conjunction with other NVIDIA libraries (e.g. TensorRT, TensorRT-LLM, NeMo, Triton,
         etc.), please make sure to check the ease of installation of these libraries in a local environment. If you face any
-        issues, we recommend using a docker image for a seamless experience. For example, `TensorRT-LLM documentation `_.
-        requires installing in a docker image. You may still choose to use other ModelOpt's features locally for example,
-        quantizing a HuggingFace model and then use a docker image for deployment.
+        issues, we recommend using a docker image for a seamless experience. You may still choose to use ModelOpt's other
+        features locally, for example quantizing a HuggingFace model and then use a docker image for deployment.

 Install Model Optimizer
 =======================
diff --git a/examples/diffusers/README.md b/examples/diffusers/README.md
index d957db75..cb8ec08a 100644
--- a/examples/diffusers/README.md
+++ b/examples/diffusers/README.md
@@ -27,6 +27,14 @@ Cache Diffusion is a technique that reuses cached outputs from previous diffusio

 ## Pre-Requisites

+### Docker
+
+Please use the TensorRT docker image (e.g., `nvcr.io/nvidia/tensorrt:25.08-py3`) or visit our [installation docs](https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/2_installation.html) for more information.
+
+Also follow the installation steps below to upgrade to the latest version of Model Optimizer and install example-specific dependencies.
+
+### Local Installation
+
 Install Model Optimizer with `onnx` and `hf` dependencies using `pip` from [PyPI](https://pypi.org/project/nvidia-modelopt/):

 ```bash
@@ -37,7 +45,7 @@ Each subsection (cache_diffusion, quantization, etc.) have their own `requiremen

 You can find the latest TensorRT [here](https://developer.nvidia.com/tensorrt/download).

-Visit our [installation guide](https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/2_installation.html) or view our pre-made [dockerfiles](../../docker/Dockerfile) for more information.
+Visit our [installation docs](https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/2_installation.html) for more information.

 ## Getting Started

diff --git a/examples/diffusers/cache_diffusion/requirements.txt b/examples/diffusers/cache_diffusion/requirements.txt
index 38c65678..e7c2b3c1 100644
--- a/examples/diffusers/cache_diffusion/requirements.txt
+++ b/examples/diffusers/cache_diffusion/requirements.txt
@@ -1,4 +1,4 @@
-cuda-python
+cuda-python<13
 opencv-python>=4.8.1.78,<4.12.0.88
 peft>=0.10.0
 polygraphy==0.49.9
diff --git a/examples/diffusers/quantization/requirements.txt b/examples/diffusers/quantization/requirements.txt
index 52921fe7..67d9ab99 100644
--- a/examples/diffusers/quantization/requirements.txt
+++ b/examples/diffusers/quantization/requirements.txt
@@ -1,4 +1,4 @@
-cuda-python
+cuda-python<13
 diffusers<=0.34.0
 nvtx
 onnx_graphsurgeon
diff --git a/examples/llm_autodeploy/README.md b/examples/llm_autodeploy/README.md
index f14ecb8b..cd910a28 100644
--- a/examples/llm_autodeploy/README.md
+++ b/examples/llm_autodeploy/README.md
@@ -8,7 +8,7 @@ This guide demonstrates how to deploy mixed-precision models using ModelOpt's Au

 ## Prerequisites

-AutoDeploy is currently available on the main branch of TRT-LLM. Follow the [docker setup instructions](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/installation/build-from-source-linux.md#option-1-build-tensorrt-llm-in-one-step) to get started.
+AutoDeploy is available in TensorRT-LLM docker images. Please refer to our [Installation Guide](../../README.md#installation) for more details.

 ### 1. Quantize and Deploy Model

diff --git a/examples/llm_distill/README.md b/examples/llm_distill/README.md
index 6b97e8f7..3b44b4f7 100644
--- a/examples/llm_distill/README.md
+++ b/examples/llm_distill/README.md
@@ -21,15 +21,23 @@ This section focuses on demonstrating how to apply Model Optimizer to perform kn

 ## Pre-Requisites

+### Docker
+
+For Hugging Face models, please use the PyTorch docker image (e.g., `nvcr.io/nvidia/pytorch:25.06-py3`).
+For NeMo models, use the NeMo container (e.g., `nvcr.io/nvidia/nemo:25.07`) which has all the dependencies installed.
+Visit our [installation docs](https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/2_installation.html) for more information.
+
+Also follow the installation steps below to upgrade to the latest version of Model Optimizer and install example-specific dependencies.
+
+### Local Installation
+
 For Hugging Face models, install Model Optimizer with `hf` dependencies using `pip` from [PyPI](https://pypi.org/project/nvidia-modelopt/) and install the requirements for the example:

 ```bash
-pip install nvidia-modelopt[hf]
+pip install -U nvidia-modelopt[hf]
 pip install -r requirements.txt
 ```

-For NeMo models, use the NeMo container `nvcr.io/nvidia/nemo:25.07` or later which has all the dependencies installed.
-
 ## Getting Started

 ### Set up your base models
diff --git a/examples/llm_ptq/README.md b/examples/llm_ptq/README.md
index 2c95b831..46780b36 100755
--- a/examples/llm_ptq/README.md
+++ b/examples/llm_ptq/README.md
@@ -25,16 +25,25 @@ This section focuses on Post-training quantization, a technique that reduces mod

 ## Pre-Requisites

+### Docker
+
+For Hugging Face models, please use the TensorRT-LLM docker image (e.g., `nvcr.io/nvidia/tensorrt-llm/release:1.1.0rc2.post2`).
+For NeMo models, use the NeMo container (e.g., `nvcr.io/nvidia/nemo:25.07`).
+Visit our [installation docs](https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/2_installation.html) for more information.
+
+Also follow the installation steps below to upgrade to the latest version of Model Optimizer and install example-specific dependencies.
+
+### Local Installation
+
 For Hugging Face models, install Model Optimizer with `hf` dependencies using `pip` from [PyPI](https://pypi.org/project/nvidia-modelopt/) and install the requirements for the example:

 ```bash
-pip install nvidia-modelopt[hf]
+pip install -U nvidia-modelopt[hf]
 pip install -r requirements.txt
 ```

-If you want to deploy the quantized model on TRT-LLM, you will also need to install the TRT-LLM dependencies as per the [TRT-LLM documentation](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html#installation). Alternatively you can use the ModelOpt docker image built from the [ModelOpt docker build step](../../docker/README.md) which has all the dependencies including TRT-LLM installed.
-
-For NeMo models, use the NeMo container `nvcr.io/nvidia/nemo:25.04` or later which has all the dependencies including TRT-LLM installed.
+For TensorRT-LLM deployment, please use the TensorRT-LLM docker image or follow their [installation docs](https://nvidia.github.io/TensorRT-LLM/installation/index.html).
+Similarly, for vLLM or SGLang deployment, please follow their installation docs.

 ## Getting Started

diff --git a/examples/llm_qat/README.md b/examples/llm_qat/README.md
index 3d895694..44e3ceb6 100644
--- a/examples/llm_qat/README.md
+++ b/examples/llm_qat/README.md
@@ -22,16 +22,7 @@ Quantization Aware Training (QAT) helps to improve the model accuracy beyond pos

 ## Pre-Requisites

-For Hugging Face models, install Model Optimizer with `hf` dependencies using `pip` from [PyPI](https://pypi.org/project/nvidia-modelopt/) and install the requirements for the example:
-
-```bash
-pip install nvidia-modelopt[hf]
-pip install -r requirements.txt
-```
-
-If you want to deploy the quantized model on TRT-LLM, you will also need to install the TRT-LLM dependencies as per the [TRT-LLM documentation](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html#installation). Alternatively you can use the ModelOpt docker image built from the [ModelOpt docker build step](../../docker/Dockerfile) which has all the dependencies including TRT-LLM installed.
-
-For NeMo models, use the NeMo container `nvcr.io/nvidia/nemo:25.04` or later which has all the dependencies including TRT-LLM installed.
+Please refer to the [llm_ptq/README.md](../llm_ptq/README.md#pre-requisites) for the pre-requisites.

 ## Getting Started

diff --git a/examples/llm_sparsity/data_prep.py b/examples/llm_sparsity/data_prep.py
index d47ff118..b37212f6 100644
--- a/examples/llm_sparsity/data_prep.py
+++ b/examples/llm_sparsity/data_prep.py
@@ -57,9 +57,9 @@ def main():

     os.makedirs(args.save_path, exist_ok=True)
     with open(os.path.join(args.save_path, "cnn_train.json"), "w") as write_f:
-        json.dump(tokenized_dataset["train"]["text"], write_f, indent=4, ensure_ascii=False)
+        json.dump(list(tokenized_dataset["train"]["text"]), write_f, indent=4, ensure_ascii=False)
     with open(os.path.join(args.save_path, "cnn_eval.json"), "w") as write_f:
-        json.dump(tokenized_dataset["test"]["text"], write_f, indent=4, ensure_ascii=False)
+        json.dump(list(tokenized_dataset["test"]["text"]), write_f, indent=4, ensure_ascii=False)


 if __name__ == "__main__":
diff --git a/examples/onnx_ptq/README.md b/examples/onnx_ptq/README.md
index 93be1f3e..47058d36 100644
--- a/examples/onnx_ptq/README.md
+++ b/examples/onnx_ptq/README.md
@@ -24,26 +24,16 @@ Model Optimizer enables highly performant quantization formats including NVFP4,

 ### Docker

-Build from this [Dockerfile](./docker/Dockerfile) which includes the latest publicly available TensorRT version, providing access to cutting-edge features and superior performance compared to the `modelopt_examples` [Docker image](https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/main/docker/Dockerfile).
+Please use the TensorRT docker image (e.g., `nvcr.io/nvidia/tensorrt:25.08-py3`) or visit our [installation docs](https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/2_installation.html) for more information.

-Build the Docker image (will be tagged `docker.io/library/onnx_ptq_examples:latest`)
-
-```bash
-./docker/build.sh
-```
-
-Run the docker image
-
-```bash
-docker run --user 0:0 -it --gpus all --shm-size=2g -v /path/to/ImageNet/dataset:/workspace/imagenet docker.io/library/onnx_ptq_examples:latest
-```
+Also follow the installation steps below to upgrade to the latest version of Model Optimizer and install example-specific dependencies.

 ### Local Installation

 Install Model Optimizer with `onnx` dependencies using `pip` from [PyPI](https://pypi.org/project/nvidia-modelopt/) and install the requirements for the example:

 ```bash
-pip install nvidia-modelopt[onnx]
+pip install -U nvidia-modelopt[onnx]
 pip install -r requirements.txt
 ```

diff --git a/examples/onnx_ptq/docker/Dockerfile b/examples/onnx_ptq/docker/Dockerfile
deleted file mode 100644
index 0b770b74..00000000
--- a/examples/onnx_ptq/docker/Dockerfile
+++ /dev/null
@@ -1,34 +0,0 @@
-FROM nvcr.io/nvidia/tensorrt:25.08-py3
-
-ARG CMAKE_VERSION=3.28.0
-
-ENV PIP_EXTRA_INDEX_URL="https://pypi.nvidia.com" \
-    PIP_NO_CACHE_DIR=off
-
-RUN python -m pip install --upgrade pip \
-    && pip install cmake==${CMAKE_VERSION} \
-    && mkdir -p -m 0600 ~/.ssh \
-    && ssh-keyscan github.com >> ~/.ssh/known_hosts
-
-WORKDIR /workspace
-
-RUN pip install tensorrt==10.13.2.6
-ENV TRT_PATH=/usr/local/lib/python3.12/dist-packages/tensorrt
-ENV CUDNN_LIB_DIR=/usr/local/lib/python3.12/dist-packages/nvidia/cudnn/lib
-ENV LD_LIBRARY_PATH="${CUDNN_LIB_DIR}:${TRT_PATH}/lib:/usr/include:${LD_LIBRARY_PATH}"
-ENV PATH="${TRT_PATH}/bin:${PATH}"
-
-# Copy application code and install requirements
-COPY modelopt TensorRT-Model-Optimizer/modelopt
-COPY examples/onnx_ptq TensorRT-Model-Optimizer/examples/onnx_ptq
-COPY setup.py TensorRT-Model-Optimizer/setup.py
-COPY pyproject.toml TensorRT-Model-Optimizer/pyproject.toml
-
-# Install onnx_ptq requirements
-RUN pip install -r TensorRT-Model-Optimizer/examples/onnx_ptq/requirements.txt
-
-# Install modelopt
-RUN pip install -e "./TensorRT-Model-Optimizer[hf,onnx]"
-
-# Allow users to run without root
-RUN chmod -R 777 /workspace
diff --git a/examples/onnx_ptq/docker/build.sh b/examples/onnx_ptq/docker/build.sh
deleted file mode 100755
index f1ac572e..00000000
--- a/examples/onnx_ptq/docker/build.sh
+++ /dev/null
@@ -1,131 +0,0 @@
-#!/bin/bash
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-set -euo pipefail # Exit on error, undefined vars, pipe failures
-
-# Default values
-IMAGE_NAME="modelopt_onnx_examples:latest"
-DOCKERFILE_PATH="examples/onnx_ptq/docker/Dockerfile"
-
-# Function to show usage
-usage() {
-    cat << EOF
-Usage: $0 [OPTIONS]
-
-Options:
-    -t, --tag IMAGE_NAME    Docker image name (default: $IMAGE_NAME)
-    -h, --help              Show this help message
-
-This script automatically detects whether you're running from:
-  β€’ modelopt/ root directory
-  β€’ modelopt/examples/onnx_ptq/ directory
-
-and builds the Docker image accordingly.
-EOF
-    exit 1
-}
-
-# Parse arguments
-while [[ $# -gt 0 ]]; do
-    case $1 in
-        -t|--tag)
-            [[ -n "${2:-}" ]] || { echo "Error: --tag requires a value"; exit 1; }
-            IMAGE_NAME="$2"
-            shift 2
-            ;;
-        -h|--help)
-            usage
-            ;;
-        *)
-            echo "Error: Unknown option '$1'"
-            usage
-            ;;
-    esac
-done
-
-# Function to find modelopt root directory
-find_modelopt_root() {
-    local current_dir="$PWD"
-
-    # Check current directory first
-    if [[ -f "setup.py" && -f "pyproject.toml" && -d "modelopt" ]]; then
-        echo "$current_dir"
-        return 0
-    fi
-
-    # Check parent directories (up to 3 levels)
-    for i in {1..3}; do
-        local parent_dir
-        parent_dir=$(dirname "$current_dir")
-        [[ "$parent_dir" == "$current_dir" ]] && break # Reached filesystem root
-
-        if [[ -f "$parent_dir/setup.py" && -f "$parent_dir/pyproject.toml" && -d "$parent_dir/modelopt" ]]; then
-            echo "$parent_dir"
-            return 0
-        fi
-        current_dir="$parent_dir"
-    done
-
-    return 1
-}
-
-# Find modelopt root directory
-echo "πŸ” Locating modelopt root directory..."
-if ROOT_DIR=$(find_modelopt_root); then
-    echo "βœ… Found modelopt root: $ROOT_DIR"
-    cd "$ROOT_DIR"
-else
-    cat << EOF
-❌ Error: Cannot locate modelopt root directory.
-
-Expected structure:
-  modelopt/
-  β”œβ”€β”€ setup.py
-  β”œβ”€β”€ pyproject.toml
-  β”œβ”€β”€ modelopt/
-  └── examples/onnx_ptq/docker/
-
-Please run this script from within the modelopt repository.
-EOF
-    exit 1
-fi
-
-# Validate that Dockerfile exists
-if [[ ! -f "$DOCKERFILE_PATH" ]]; then
-    echo "❌ Error: Dockerfile not found at $DOCKERFILE_PATH"
-    exit 1
-fi
-
-# Build Docker image
-echo "🐳 Building Docker image..."
-echo "  β€’ Image name: $IMAGE_NAME"
-echo "  β€’ Build context: $(pwd)"
-echo "  β€’ Dockerfile: $DOCKERFILE_PATH"
-echo
-
-docker build \
-    --file "$DOCKERFILE_PATH" \
-    --tag "$IMAGE_NAME" \
-    . \
-    "$@"
-
-echo
-echo "βœ… Docker image built successfully: $IMAGE_NAME"
-echo
-echo "πŸš€ To run the container:"
-echo "   docker run --user 0:0 -it --gpus all --shm-size=2g \\"
-echo "     -v /path/to/ImageNet/dataset:/workspace/imagenet \\"
-echo "     $IMAGE_NAME"
diff --git a/examples/pruning/README.md b/examples/pruning/README.md
index 6d0123e3..34ed302b 100644
--- a/examples/pruning/README.md
+++ b/examples/pruning/README.md
@@ -23,7 +23,7 @@ This section focuses on applying Model Optimizer's state-of-the-art complementar

 ## Pre-Requisites

-For Minitron pruning for Megatron-LM / NeMo models, use the NeMo container `nvcr.io/nvidia/nemo:25.07` or later which has all the dependencies installed.
+For Minitron pruning for Megatron-LM / NeMo models, use the NeMo container (e.g., `nvcr.io/nvidia/nemo:25.07`) which has all the dependencies installed.

 For FastNAS pruning for PyTorch Computer Vision models, no additional dependencies are required.

diff --git a/examples/speculative_decoding/README.md b/examples/speculative_decoding/README.md
index 2e8966d2..503cf303 100644
--- a/examples/speculative_decoding/README.md
+++ b/examples/speculative_decoding/README.md
@@ -25,16 +25,26 @@ This example focuses on training with Hugging Face. To train with Megatron‑LM,

 ## Pre-Requisites

+### Docker
+
+Please use the PyTorch docker image (e.g., `nvcr.io/nvidia/pytorch:25.06-py3`) or visit our [installation docs](https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/2_installation.html) for more information.
+
+Also follow the installation steps below to upgrade to the latest version of Model Optimizer and install dataset and example-specific dependencies.
+
+### Local Installation
+
 Install Modelopt with `hf` dependencies and other requirements for this example:

 ```bash
-pip install -e ...
+pip install -U nvidia-modelopt[hf]
 pip install -r requirements.txt
 ```

 We use [Daring-Anteater](https://huggingface.co/datasets/nvidia/Daring-Anteater) dataset in this example. Download by:

 ```bash
+apt-get update && apt-get install -y git-lfs
+git lfs install --system
 git clone https://huggingface.co/datasets/nvidia/Daring-Anteater
 ```

diff --git a/tests/examples/README.md b/tests/examples/README.md
index 869d8311..ed9a32f2 100644
--- a/tests/examples/README.md
+++ b/tests/examples/README.md
@@ -9,22 +9,17 @@ Make sure to use as small models and less data as possible to keep the tests fas

 ## Running the tests

-To run a test, use the [ModelOpt docker image](../../README.md#installation--docker) so all required dependencies are available.
-and mount your local modelopt directory to `/workspace/TensorRT-Model-Optimizer` and run this from the root of the repository.
+To run a test, start from the docker image recommended in our [installation docs](https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/2_installation.html).
+Then mount your local modelopt directory to `/workspace/TensorRT-Model-Optimizer` and run this from the root of the repository.

 ```bash
+cd /workspace/TensorRT-Model-Optimizer
+pip install -e ".[all,dev-test]"
 pytest tests/examples/$TEST
 ```

-NOTE: Some tests (e.g. `llm_ptq`) have an option to disable using a smaller proxy model, and instead use the original model by setting the `MODELOPT_FAST_TESTS` environment variable to `false`. This is useful in nightly tests to ensure the original model is used.
-
-```bash
-MODELOPT_FAST_TESTS=false ROOT_SAVE_PATH=/tmp/test_llm_ptq/ pytest tests/examples/llm_ptq/
-```
-
 ## Environment variables

 The following environment variables can be set to control the behavior of the tests:

-- `MODELOPT_FAST_TESTS`: If set to `false`, the tests will use the original model instead of a smaller proxy model. Default is `true`.
 - `MODELOPT_LOCAL_MODEL_ROOT`: If set, the tests will use the local model directory instead of downloading the model from the internet. Default is not set, which means the model will be downloaded.
diff --git a/tests/examples/llm_sparsity/test_llama_sparsify.py b/tests/examples/llm_sparsity/test_llama_sparsify.py
index 1fc94292..6c815a45 100644
--- a/tests/examples/llm_sparsity/test_llama_sparsify.py
+++ b/tests/examples/llm_sparsity/test_llama_sparsify.py
@@ -39,7 +39,6 @@ def data_path(tmp_path_factory):
 @pytest.mark.parametrize(
     ("sparsity_fmt", "dtype"),
     [
-        ("sparsegpt", "bf16"),
         ("sparse_magnitude", "bf16"),
     ],
 )