
Commit e03f104

Bump trtllm to 1.2.0rc0.post1 and pytorch to 25.08 for cuda 13
Signed-off-by: Keval Morabia <[email protected]>
1 parent c692074

File tree: 10 files changed, +62 -22 lines

.github/workflows/example_tests.yml

Lines changed: 1 addition & 1 deletion

@@ -66,7 +66,7 @@ jobs:
       matrix:
         EXAMPLE: [llm_ptq]
     container: &example_container
-      image: nvcr.io/nvidia/tensorrt-llm/release:1.1.0rc2.post2
+      image: nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc0.post1
       env:
         PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages
         HF_TOKEN: ${{ secrets.HF_TOKEN }}

.github/workflows/gpu_tests.yml

Lines changed: 2 additions & 2 deletions

@@ -63,7 +63,7 @@ jobs:
     runs-on: linux-amd64-gpu-l4-latest-1
     timeout-minutes: 90
     container: &gpu_container
-      image: nvcr.io/nvidia/pytorch:25.06-py3
+      image: nvcr.io/nvidia/pytorch:25.08-py3
       env:
         GIT_DEPTH: 1000 # For correct version for tests/gpu/torch/quantization/plugins/test_megatron.py
         PIP_CONSTRAINT: "" # Disable pip constraint for upgrading packages
@@ -76,7 +76,7 @@ jobs:
          echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/include:/usr/lib/x86_64-linux-gnu:/usr/local/tensorrt/targets/x86_64-linux-gnu/lib" >> $GITHUB_ENV
          echo "PATH=${PATH}:/usr/local/tensorrt/targets/x86_64-linux-gnu/bin" >> $GITHUB_ENV
       - name: Run gpu tests
-        run: pip install tox-current-env && tox -e py312-cuda12-gpu --current-env
+        run: pip install tox-current-env && tox -e py312-cuda13-gpu --current-env
   gpu-tests-non-pr:
     if: ${{ !startsWith(github.ref, 'refs/heads/pull-request/') }}
     # Runner list at https://github.com/nv-gha-runners/enterprise-runner-configuration/blob/main/docs/runner-groups.md

.gitlab/tests.yml

Lines changed: 3 additions & 3 deletions

@@ -29,7 +29,7 @@ unit:
 .multi-gpu-tests-default:
   extends: .tests-default
   timeout: 90m
-  image: nvcr.io/nvidia/pytorch:25.06-py3
+  image: nvcr.io/nvidia/pytorch:25.08-py3
   variables:
     GIT_DEPTH: 1000 # For correct version for tests/gpu/torch/quantization/plugins/test_megatron.py
   tags: [docker, linux, 2-gpu]
@@ -47,7 +47,7 @@ multi-gpu:
   script:
     # Use pre-installed packages without a new venv with tox-current-env
     - pip install tox-current-env
-    - tox -e py312-cuda12-gpu --current-env
+    - tox -e py312-cuda13-gpu --current-env
 
 ##### Example Tests #####
 example-torch:
@@ -64,7 +64,7 @@ example-torch:
 example-trtllm:
   extends: example-torch
   timeout: 60m
-  image: nvcr.io/nvidia/tensorrt-llm/release:1.1.0rc2.post2
+  image: nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc0.post1
   tags: [docker, linux, 2-gpu, sm>=89]
   parallel:
     matrix:

CHANGELOG.rst

Lines changed: 4 additions & 1 deletion

@@ -4,10 +4,13 @@ Model Optimizer Changelog (Linux)
 0.39 (2025-11-xx)
 ^^^^^^^^^^^^^^^^^
 
-**Deprecations**
+**Backward Breaking Changes**
+
+- Default ``cupy`` package (for INT4 ONNX quantization) is now ``cupy-cuda13x`` for CUDA 13 unless installed from source. If you install from PyPI wheel and have CUDA 12, you need to run ``pip uninstall -y cupy-cuda13x`` and ``pip install cupy-cuda12x`` separately.
 
 **New Features**
 
+- Upgrade TensorRT-LLM requirement to 1.2.0rc0.post1.
 - Add flag ``op_types_to_exclude_fp16`` in ONNX quantization to exclude ops from being converted to FP16/BF16. Alternatively, for custom TensorRT ops, this can also be done by indicating ``'fp32'`` precision in ``trt_plugins_precision``.
 - Add LoRA mode support for MCore in a new peft submodule: ``modelopt.torch.peft.update_model(model, LORA_CFG)``.
 - Support PTQ and fakequant in vLLM for fast evaluation of arbitrary quantization formats. See ``examples/vllm_serve`` for more details.
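
As a quick sanity check after swapping wheels, a minimal sketch (assuming `cupy` imports successfully; `runtimeGetVersion` is CuPy's standard runtime API):

```python
# Sanity check: which CUDA major version is the installed cupy wheel built for?
# runtimeGetVersion() encodes the version as 1000*major + 10*minor,
# e.g. 12040 for CUDA 12.4 or 13000 for CUDA 13.0.
import cupy

major = cupy.cuda.runtime.runtimeGetVersion() // 1000
print(f"cupy targets CUDA {major}.x")
# If this prints 13 on a CUDA 12 system, swap the wheel as described above:
#   pip uninstall -y cupy-cuda13x && pip install cupy-cuda12x
```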

docs/source/getting_started/_installation_for_Linux.rst

Lines changed: 1 addition & 1 deletion

@@ -18,7 +18,7 @@ Latest Model Optimizer (``nvidia-modelopt``) currently has the following system
 +-------------------------+-----------------------------+
 | PyTorch                 | >=2.6                       |
 +-------------------------+-----------------------------+
-| TensorRT-LLM (Optional) | 1.1.0rc2.post2              |
+| TensorRT-LLM (Optional) | 1.2.0rc0.post1              |
 +-------------------------+-----------------------------+
 | ONNX Runtime (Optional) | 1.22                        |
 +-------------------------+-----------------------------+

examples/llm_ptq/README.md

Lines changed: 1 addition & 1 deletion

@@ -27,7 +27,7 @@ This section focuses on Post-training quantization, a technique that reduces mod
 
 ### Docker
 
-For Hugging Face models, please use the TensorRT-LLM docker image (e.g., `nvcr.io/nvidia/tensorrt-llm/release:1.1.0rc2.post2`).
+For Hugging Face models, please use the TensorRT-LLM docker image (e.g., `nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc0.post1`).
 For NeMo models, use the NeMo container (e.g., `nvcr.io/nvidia/nemo:25.07`).
 Visit our [installation docs](https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/2_installation.html) for more information.
 

pyproject.toml

Lines changed: 1 addition & 1 deletion

@@ -154,7 +154,7 @@ exclude_lines = [
 
 
 [tool.bandit]
-exclude_dirs = ["examples/", "tests/"]
+exclude_dirs = ["examples/", "tests/", "setup.py"]
 # Do not change `skips`. It should be consistent with NVIDIA's Wheel-CI-CD bandit.yml config.
 # Use of `# nosec BXXX` requires special approval
 skips = [

setup.py

Lines changed: 44 additions & 9 deletions

@@ -13,14 +13,45 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""The package setup script for modelopt customizing certain aspects of the installation process."""
+"""The package setup script for modelopt customizing certain aspects of the installation process.
+
+If installing from source, the CUDA version is detected and the appropriate cupy package is selected.
+If installing from a wheel, cupy for CUDA 13 is installed by default.
+If you have CUDA 12, you need to run `pip uninstall -y cupy-cuda13x` and `pip install cupy-cuda12x` separately.
+"""
+
+import re
+import subprocess
 
 import setuptools
 from setuptools_scm import get_version
 
-# TODO: Set fallback_version to X.Y.Z release version when creating the release branch
 version = get_version(root=".", fallback_version="0.0.0")
 
+
+def get_cuda_major_version() -> int | None:
+    """Return CUDA major version installed on the system or None if detection fails."""
+    # Check nvcc version
+    try:
+        result = subprocess.run(
+            ["nvcc", "--version"],
+            capture_output=True,
+            text=True,
+            timeout=5,
+        )
+        if result.returncode == 0:
+            # Parse output like "release 12.0, V12.0.140" or "release 13.0, V13.0.0"
+            for line in result.stdout.split("\n"):
+                if "release" in line.lower():
+                    match = re.search(r"release (\d+)\.", line)
+                    if match:
+                        return int(match.group(1))
+    except Exception:
+        pass
+
+    return None
+
+
 # Required and optional dependencies ###############################################################
 required_deps = [
     # Common
@@ -43,7 +74,6 @@
 optional_deps = {
     "onnx": [
         "cppimport",
-        "cupy-cuda12x; platform_machine != 'aarch64' and platform_system != 'Darwin'",
         "ml_dtypes",  # for bfloat16 conversion
         "onnx-graphsurgeon",
         "onnx~=1.19.0",
@@ -93,14 +123,19 @@
         "sphinx-rtd-theme~=3.0.0",  # 3.0 does not show version, which we want as Linux & Windows have separate releases
         "sphinx-togglebutton>=0.3.2",
     ],
-    # build/packaging tools
-    "dev-build": [
-        "cython",
-        "setuptools>=80",
-        "setuptools-scm>=8",
-    ],
 }
 
+# Select the appropriate cupy package based on the detected CUDA version, or fall back to cupy-cuda13x
+cuda_version = get_cuda_major_version()
+
+if cuda_version is None:
+    # Default to CUDA 13 if detection fails
+    cuda_version = 13
+
+optional_deps["onnx"].append(
+    f"cupy-cuda{cuda_version}x ; platform_machine != 'aarch64' and platform_system != 'Darwin'"
+)
+
 # create "compound" optional dependencies
 optional_deps["all"] = [
     deps for k in optional_deps if not k.startswith("dev") for deps in optional_deps[k]
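
For illustration, the version parsing in `get_cuda_major_version()` can be exercised standalone; a minimal sketch (the sample banner string is hypothetical but follows nvcc's "release X.Y" output format):

```python
# Standalone sketch of the regex used by get_cuda_major_version() above.
# The sample banner is hypothetical but mirrors nvcc's real output format.
import re

sample = "Cuda compilation tools, release 13.0, V13.0.48"
match = re.search(r"release (\d+)\.", sample)
major = int(match.group(1)) if match else None
print(major)  # -> 13, so the "onnx" extra resolves to cupy-cuda13x
```

Note that the detection only runs at build time: a published wheel carries whichever cupy requirement the build machine selected, which is why the new docstring tells CUDA 12 wheel users to swap packages manually.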

tests/_test_utils/torch_quantization/onnx_export.py

Lines changed: 1 addition & 0 deletions

@@ -65,6 +65,7 @@ def forward_loop(model):
         input_names=input_names,
         output_names=output_names,
         do_constant_folding=constant_folding,
+        dynamo=False,  # torch 2.9 flips default to True
         **kwargs,
     )
 
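
`dynamo` is an existing keyword of `torch.onnx.export`; a minimal standalone sketch of the same pin (the tiny module and output file name are placeholders):

```python
# Minimal sketch: pin the legacy TorchScript exporter explicitly so the export
# path does not change when torch 2.9 flips the default of `dynamo` to True.
import torch


class TinyModel(torch.nn.Module):
    def forward(self, x):
        return torch.relu(x)


torch.onnx.export(
    TinyModel().eval(),
    (torch.randn(1, 4),),
    "tiny.onnx",
    dynamo=False,  # keep the pre-2.9 exporter regardless of torch version
)
```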

tox.ini

Lines changed: 4 additions & 3 deletions

@@ -17,6 +17,7 @@ deps =
     torch27: torchvision~=0.22.0
     torch28: torchvision~=0.23.0
 
+
     # Build onnxsim from sdists for Python 3.12 until http://github.com/daquexian/onnx-simplifier/pull/353
     py312: onnxsim
 
@@ -62,7 +63,7 @@ commands =
 ########################################################
 # GPU test environments (Can be used with --current-env)
 ########################################################
-[testenv:{py310,py311,py312}-cuda12-gpu]
+[testenv:{py310,py311,py312}-cuda13-gpu]
 setenv =
     MAMBA_FORCE_BUILD=TRUE
 commands_pre =
@@ -71,8 +72,8 @@ commands_pre =
     pip install git+https://github.com/Dao-AILab/fast-hadamard-transform.git
 
     # Install Mamba model dependencies (takes 8-10mins!)
-    # Triton 3.4.0 causes some real quant tests to fail
-    pip install "triton<3.4"
+    # Install same triton as pytorch-triton in the NGC PyTorch 25.08 docker otherwise Mamba may install an incompatible version
+    pip install triton==3.3.1
     pip install --no-build-isolation git+https://github.com/state-spaces/mamba.git
 
     # Install Eagle-3 test dependencies
