Bump TRT-LLM to 1.1.0rc5 + fix failing CICD tests

kevalmorabia97 · kevalmorabia97 · commit 5a6c995fc324 · 2025-10-18T06:08:51.000-07:00
Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
diff --git a/.github/workflows/example_tests.yml b/.github/workflows/example_tests.yml
@@ -66,7 +66,7 @@ jobs:
       matrix:
         EXAMPLE: [llm_ptq]
     container: &example_container
-      image: nvcr.io/nvidia/tensorrt-llm/release:1.1.0rc2.post2
+      image: nvcr.io/nvidia/tensorrt-llm/release:1.1.0rc5
       env:
         PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages
         HF_TOKEN: ${{ secrets.HF_TOKEN }}
diff --git a/.github/workflows/gpu_tests.yml b/.github/workflows/gpu_tests.yml
@@ -73,8 +73,7 @@ jobs:
       - uses: nv-gha-runners/setup-proxy-cache@main
       - name: Setup environment variables
         run: |
-          echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu:/usr/local/tensorrt/targets/x86_64-linux-gnu/lib" >> $GITHUB_ENV
-          echo "PATH=${PATH}:/usr/local/tensorrt/targets/x86_64-linux-gnu/bin" >> $GITHUB_ENV
+          echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu" >> $GITHUB_ENV
       - name: Run gpu tests
         run: pip install tox-current-env && tox -e py312-cuda12-gpu --current-env
   gpu-tests-non-pr:
diff --git a/.gitlab/tests.yml b/.gitlab/tests.yml
@@ -35,9 +35,7 @@ unit:
   tags: [docker, linux, 2-gpu]
   before_script:
     # Add libcudnn*.so and libnv*.so to path
-    - export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu:/usr/local/tensorrt/targets/x86_64-linux-gnu/lib"
-    # Add trtexec to path
-    - export PATH="${PATH}:/usr/local/tensorrt/targets/x86_64-linux-gnu/bin"
+    - export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu"
     # Install git-lfs for Daring-Anteater dataset
     - apt-get update && apt-get install -y git-lfs
     - git lfs install --system
@@ -64,7 +62,7 @@ example-torch:
 example-trtllm:
   extends: example-torch
   timeout: 60m
-  image: nvcr.io/nvidia/tensorrt-llm/release:1.1.0rc2.post2
+  image: nvcr.io/nvidia/tensorrt-llm/release:1.1.0rc5
   tags: [docker, linux, 2-gpu, sm>=89]
   parallel:
     matrix:
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -1,17 +1,16 @@
 Model Optimizer Changelog (Linux)
 =================================
 
-0.39 (2025-11-xx)
+0.39 (2025-11-07)
 ^^^^^^^^^^^^^^^^^
 
-**Deprecations**
-
 **New Features**
 
+- Upgrade TensorRT-LLM requirement to 1.1.0rc5.
 - Add flag ``op_types_to_exclude_fp16`` in ONNX quantization to exclude ops from being converted to FP16/BF16. Alternatively, for custom TensorRT ops, this can also be done by indicating ``'fp32'`` precision in ``trt_plugins_precision``.
 - Add LoRA mode support for MCore in a new peft submodule: ``modelopt.torch.peft.update_model(model, LORA_CFG)``.
 - Support PTQ and fakequant in vLLM for fast evaluation of arbitrary quantization formats. See ``examples/vllm_serve`` for more details.
-- Add support for ``nemotron-post-training-dataset-v2`` and ``nemotron-post-training-dataset-v1`` in ``examples/llm_ptq``. Default to a mix of ``cnn_dailymail`` and ``nemotron-post-training-dataset-v2`` if no dataset is specified.
+- Add support for ``nemotron-post-training-dataset-v2`` and ``nemotron-post-training-dataset-v1`` in ``examples/llm_ptq``. Default to a mix of ``cnn_dailymail`` and ``nemotron-post-training-dataset-v2`` (gated dataset accessed using ``HF_TOKEN`` environment variable) if no dataset is specified.
 - Allow specifying ``calib_seq`` in ``examples/llm_ptq`` to set the maximum sequence length for calibration.
 
 **Documentation**
diff --git a/docs/source/getting_started/_installation_for_Linux.rst b/docs/source/getting_started/_installation_for_Linux.rst
@@ -18,7 +18,7 @@ Latest Model Optimizer (``nvidia-modelopt``) currently has the following system
 +-------------------------+-----------------------------+
 | PyTorch                 |  >=2.6                      |
 +-------------------------+-----------------------------+
-| TensorRT-LLM (Optional) |  1.1.0rc2.post2             |
+| TensorRT-LLM (Optional) |  1.1.0rc5                   |
 +-------------------------+-----------------------------+
 | ONNX Runtime (Optional) |  1.22                       |
 +-------------------------+-----------------------------+
@@ -41,8 +41,7 @@ Environment setup
     .. code-block:: shell
 
         export PIP_CONSTRAINT=""
-        export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu:/usr/local/tensorrt/targets/x86_64-linux-gnu/lib"
-        export PATH="${PATH}:/usr/local/tensorrt/targets/x86_64-linux-gnu/bin"
+        export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu"
 
     You may need to install additional dependencies from the respective examples's `requirements.txt` file.
 
diff --git a/examples/diffusers/quantization/requirements.txt b/examples/diffusers/quantization/requirements.txt
@@ -4,3 +4,6 @@ nvtx
 onnx_graphsurgeon
 opencv-python>=4.8.1.78,<4.12.0.88
 sentencepiece
+# TODO: Fix for torch 2.9
+torch<2.9
+torchvision<0.24.0
diff --git a/examples/llm_ptq/README.md b/examples/llm_ptq/README.md
@@ -27,7 +27,7 @@ This section focuses on Post-training quantization, a technique that reduces mod
 
 ### Docker
 
-For Hugging Face models, please use the TensorRT-LLM docker image (e.g., `nvcr.io/nvidia/tensorrt-llm/release:1.1.0rc2.post2`).
+For Hugging Face models, please use the TensorRT-LLM docker image (e.g., `nvcr.io/nvidia/tensorrt-llm/release:1.1.0rc5`).
 For NeMo models, use the NeMo container (e.g., `nvcr.io/nvidia/nemo:25.07`).
 Visit our [installation docs](https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/2_installation.html) for more information.
 
diff --git a/examples/llm_sparsity/launch_finetune.sh b/examples/llm_sparsity/launch_finetune.sh
@@ -91,7 +91,7 @@ CMD="accelerate launch --multi_gpu --mixed_precision bf16 finetune.py \
     --warmup_ratio 0.0 \
     --lr_scheduler_type cosine \
     --logging_steps 1 \
-    --fsdp full_shard auto_wrap \
+    --fsdp 'full_shard auto_wrap' \
     --fsdp_transformer_layer_cls_to_wrap LlamaDecoderLayer \
     --tf32 True \
     --modelopt_restore_path $MODELOPT_RESTORE_PATH \
diff --git a/examples/llm_sparsity/requirements.txt b/examples/llm_sparsity/requirements.txt
@@ -1,4 +1,3 @@
 flash-attn
 sentencepiece>=0.2.0
 tensorboardX
-transformers>=4.57.0
diff --git a/tests/_test_utils/torch_quantization/onnx_export.py b/tests/_test_utils/torch_quantization/onnx_export.py
@@ -65,6 +65,7 @@ def forward_loop(model):
         input_names=input_names,
         output_names=output_names,
         do_constant_folding=constant_folding,
+        dynamo=False,
         **kwargs,
     )
 
diff --git a/tests/gpu/onnx/test_plugin.py b/tests/gpu/onnx/test_plugin.py
@@ -19,6 +19,13 @@
 import onnx
 import onnx_graphsurgeon as gs
 from _test_utils.import_helper import skip_if_no_libcudnn, skip_if_no_tensorrt
+from _test_utils.onnx_autocast.utils import _assert_tensors_are_fp16
+from _test_utils.onnx_quantization.utils import _assert_nodes_are_quantized
+
+from modelopt.onnx.autocast import convert_to_mixed_precision
+from modelopt.onnx.autocast.graphsanitizer import GraphSanitizer
+from modelopt.onnx.quantization.quantize import quantize
+from modelopt.onnx.trt_utils import load_onnx_model
 
 skip_if_no_libcudnn()
 skip_if_no_tensorrt()
@@ -95,11 +102,6 @@ def _create_test_model_trt():
 
 
 def test_trt_plugin_quantization(tmp_path):
-    from _test_utils.onnx_quantization.utils import _assert_nodes_are_quantized
-
-    from modelopt.onnx.quantization.quantize import quantize
-    from modelopt.onnx.trt_utils import load_onnx_model
-
     model = _create_test_model_trt()
     with open(os.path.join(tmp_path, "model_with_trt_plugin.onnx"), "w") as f:
         onnx.save_model(model, f.name)
@@ -126,11 +128,6 @@ def test_trt_plugin_quantization(tmp_path):
 
 
 def test_trt_plugin_autocast(tmp_path):
-    from _test_utils.onnx_autocast.utils import _assert_tensors_are_fp16
-
-    from modelopt.onnx.autocast import convert_to_mixed_precision
-    from modelopt.onnx.autocast.graphsanitizer import GraphSanitizer
-
     model = _create_test_model_trt()
     with open(os.path.join(tmp_path, "model_with_trt_plugin_autocast.onnx"), "w") as f:
         onnx.save_model(model, f.name)
diff --git a/tests/gpu/onnx/test_quantize_onnx_torch_int4_awq.py b/tests/gpu/onnx/test_quantize_onnx_torch_int4_awq.py
@@ -111,12 +111,11 @@ def _forward_loop(model, dataloader):
 
         wq_onnx_awq_clip = dq_tensor(wq_onnx_awq_clip, scale_awq_clip, block_size)
 
-        assert np.allclose(wq_torch_awq_lite.detach(), wq_onnx_awq_lite.T, atol=1e-3)
-        assert np.allclose(wq_torch_awq_clip.detach(), wq_onnx_awq_clip.T, atol=1e-3)
+        assert np.allclose(wq_torch_awq_lite.detach().cpu(), wq_onnx_awq_lite.T, atol=1e-3)
+        assert np.allclose(wq_torch_awq_clip.detach().cpu(), wq_onnx_awq_clip.T, atol=1e-3)
 
 
 def test_int4_awq_cuda(tmp_path):
-    skip_if_onnx_version_above_1_18()
     skip_if_no_libcudnn()
     block_size = 128
 
diff --git a/tox.ini b/tox.ini
@@ -59,9 +59,9 @@ commands =
     torch_deploy: python -m pytest tests/unit/torch/deploy
 
 
-########################################################
-# GPU test environments (Can be used with --current-env)
-########################################################
+###########################################################
+# GPU test environments (Should be used with --current-env)
+###########################################################
 [testenv:{py310,py311,py312}-cuda12-gpu]
 setenv =
     MAMBA_FORCE_BUILD=TRUE
@@ -71,8 +71,9 @@ commands_pre =
     pip install git+https://github.com/Dao-AILab/fast-hadamard-transform.git
 
     # Install Mamba model dependencies (takes 8-10mins!)
-    # Triton 3.4.0 causes some real quant tests to fail
-    pip install "triton<3.4"
+    # Skip triton because pytorch-triton is installed in the NGC PyTorch containers
+    pip install pip-mark-installed
+    pip-mark-installed triton
     pip install --no-build-isolation git+https://github.com/state-spaces/mamba.git
 
     # Install Eagle-3 test dependencies

Original file line number	Diff line number	Diff line change
`@@ -65,6 +65,7 @@ def forward_loop(model):`
`65`	`65`	`input_names=input_names,`
`66`	`66`	`output_names=output_names,`
`67`	`67`	`do_constant_folding=constant_folding,`
	`68`	`+ dynamo=False,`
`68`	`69`	`**kwargs,`
`69`	`70`	`)`
`70`	`71`