Commit 01083b5

EmmaQiaoCh, bobboli, xxi-nvch, and zblych authored
[TRTLLM-9849][infra] Update dependencies to 25.12 (#9818)
Signed-off-by: qqiao <[email protected]>
Signed-off-by: Bo Li <[email protected]>
Signed-off-by: Emma Qiao <[email protected]>
Signed-off-by: xxi <[email protected]>
Signed-off-by: xxi <[email protected]>
Co-authored-by: Bo Li <[email protected]>
Co-authored-by: xxi <[email protected]>
Co-authored-by: xxi <[email protected]>
Co-authored-by: Yanchao Lu <[email protected]>
1 parent 35c2442 commit 01083b5

File tree: 16 files changed (+96, -51 lines)


cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu
10 additions, 2 deletions

@@ -493,7 +493,12 @@ __global__ void moeA2ADispatchKernel(int32_t const* token_selected_experts, // [
 #if !DISABLE_SYNC_FOR_PROFILING
     uint32_t expected_value = *ptrs.flag_val;
 
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
+    // .acquire and .release qualifiers for fence instruction require sm_90 or higher.
     asm volatile("fence.release.sys;");
+#else
+    asm volatile("fence.acq_rel.sys;");
+#endif
 #pragma unroll 1 // No unroll as one iter is typically enough
     for (int target_rank = lane_id; target_rank < ep_size; target_rank += warpSize)
     {
@@ -525,7 +530,6 @@ __global__ void moeA2ADispatchKernel(int32_t const* token_selected_experts, // [
             flag_set = flag_value == expected_value;
         } while (!flag_set);
     }
-    // asm volatile("fence.acquire.sys;");
 #endif
 }
 }
@@ -1018,7 +1022,6 @@ __global__ void moeA2ACombineKernel(
 
     if (blockIdx.x == 0)
     {
-        // asm volatile("fence.release.sys;");
 #pragma unroll 1 // No unroll
         for (int peer_rank = lane_id; peer_rank < ep_size; peer_rank += warpSize)
         {
@@ -1050,7 +1053,12 @@ __global__ void moeA2ACombineKernel(
             flag_set = flag_value == expected_value;
         } while (!flag_set);
     }
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
+    // .acquire and .release qualifiers for fence instruction require sm_90 or higher.
     asm volatile("fence.acquire.sys;");
+#else
+    asm volatile("fence.acq_rel.sys;");
+#endif
 }
 __syncthreads();
 #endif
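The guard above exists because the one-way PTX fences (fence.release.sys and fence.acquire.sys) require sm_90 or newer, while fence.acq_rel.sys works on earlier architectures at the cost of ordering in both directions. Below is a minimal, hypothetical CUDA sketch of the producer/consumer flag handshake this pattern protects; the names publish, consume, payload, and flag are illustrative and not taken from the kernel:

// Hedged sketch, not the kernel's actual code: a system-scope (cross-GPU)
// producer/consumer handshake, gated on __CUDA_ARCH__ like the patch above.
#include <cuda_runtime.h>

__device__ void publish(int* payload, volatile unsigned int* flag, unsigned int value)
{
    *payload = 42; // data that must be visible before the flag flips
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
    // sm_90+: one-way release fence orders the payload store before the flag store.
    asm volatile("fence.release.sys;");
#else
    // Pre-sm_90: the .release/.acquire qualifiers are unavailable,
    // so fall back to the stronger two-way fence.
    asm volatile("fence.acq_rel.sys;");
#endif
    *flag = value; // volatile store publishes the flag to the peer
}

__device__ int consume(int const* payload, volatile unsigned int const* flag, unsigned int expected)
{
    while (*flag != expected) { } // spin until the producer signals
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
    // sm_90+: one-way acquire fence orders the flag load before the payload load.
    asm volatile("fence.acquire.sys;");
#else
    asm volatile("fence.acq_rel.sys;");
#endif
    return *payload;
}

__global__ void producer(int* payload, volatile unsigned int* flag) { publish(payload, flag, 1u); }
__global__ void consumer(int const* payload, volatile unsigned int const* flag, int* out)
{
    *out = consume(payload, flag, 1u);
}

Dispatch plays the producer role (release before raising its flags) and combine plays the consumer role (acquire after its flag wait), which is why the two hunks gate opposite fence directions.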

docker/Dockerfile.multi
3 additions, 2 deletions

@@ -1,8 +1,8 @@
 # Multi-stage Dockerfile
 ARG BASE_IMAGE=nvcr.io/nvidia/pytorch
 ARG TRITON_IMAGE=nvcr.io/nvidia/tritonserver
-ARG BASE_TAG=25.10-py3
-ARG TRITON_BASE_TAG=25.10-py3
+ARG BASE_TAG=25.12-py3
+ARG TRITON_BASE_TAG=25.12-py3
 ARG DEVEL_IMAGE=devel
 
 FROM ${BASE_IMAGE}:${BASE_TAG} AS base
@@ -147,6 +147,7 @@ RUN --mount=type=cache,target=/root/.cache/pip --mount=type=bind,from=wheel,sour
     pip install /tmp/wheel/tensorrt_llm*.whl
 
 COPY README.md ./
+COPY --from=wheel /src/tensorrt_llm/build/tensorrt_llm*.whl ./
 COPY docs docs
 COPY cpp/include include

docker/Makefile
3 additions, 4 deletions

@@ -202,17 +202,16 @@ jenkins-rockylinux8_%: PYTHON_VERSION_TAG_ID = $(if $(findstring 3.12,${PYTHON_V
 jenkins-rockylinux8_%: IMAGE_WITH_TAG = $(shell . ../jenkins/current_image_tags.properties && echo $$LLM_ROCKYLINUX8_${PYTHON_VERSION_TAG_ID}_DOCKER_IMAGE)
 jenkins-rockylinux8_%: STAGE = tritondevel
 jenkins-rockylinux8_%: BASE_IMAGE = nvcr.io/nvidia/cuda
-# [TODO] Update to NVIDIA CUDA 13.0.2 when it's available
-jenkins-rockylinux8_%: BASE_TAG = 13.0.1-devel-rockylinux8
+jenkins-rockylinux8_%: BASE_TAG = 13.1.0-devel-rockylinux8
 
 rockylinux8_%: STAGE = tritondevel
 rockylinux8_%: BASE_IMAGE = nvcr.io/nvidia/cuda
-rockylinux8_%: BASE_TAG = 13.0.1-devel-rockylinux8
+rockylinux8_%: BASE_TAG = 13.1.0-devel-rockylinux8
 
 # For x86_64 and aarch64
 ubuntu22_%: STAGE = tritondevel
 ubuntu22_%: BASE_IMAGE = nvcr.io/nvidia/cuda
-ubuntu22_%: BASE_TAG = 13.0.1-devel-ubuntu22.04
+ubuntu22_%: BASE_TAG = 13.1.0-devel-ubuntu22.04
 
 trtllm_%: STAGE = release
 trtllm_%: PUSH_TO_STAGING := 0

docker/common/install_cuda_toolkit.sh
1 addition, 1 deletion

@@ -5,7 +5,7 @@ set -ex
 # This script is used for reinstalling CUDA on Rocky Linux 8 with the run file.
 # CUDA version is usually aligned with the latest NGC CUDA image tag.
 # Only use when public CUDA image is not ready.
-CUDA_VER="13.0.2_580.95.05"
+CUDA_VER="13.1.0_590.44.01"
 CUDA_VER_SHORT="${CUDA_VER%_*}"
 
 NVCC_VERSION_OUTPUT=$(nvcc --version)

docker/common/install_polygraphy.sh
1 addition, 1 deletion

@@ -5,7 +5,7 @@ set -ex
 if [ -n "${GITHUB_MIRROR}" ]; then
     export PIP_INDEX_URL="https://urm.nvidia.com/artifactory/api/pypi/pypi-remote/simple"
 fi
-pip3 install polygraphy==0.49.9
+pip3 install polygraphy==0.49.26
 
 # Clean up pip cache and temporary files
 pip3 cache purge

docker/common/install_pytorch.sh
2 additions, 2 deletions

@@ -4,8 +4,8 @@ set -ex
 
 # Use latest stable version from https://pypi.org/project/torch/#history
 # and closest to the version specified in
-# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-10.html#rel-25-10
-TORCH_VERSION="2.9.0"
+# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-12.html#rel-25-12
+TORCH_VERSION="2.9.1"
 SYSTEM_ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
 
 prepare_environment() {

docker/common/install_tensorrt.sh
14 additions, 9 deletions

@@ -2,20 +2,20 @@
 
 set -ex
 
-TRT_VER="10.13.3.9"
+TRT_VER="10.14.1.48"
 # Align with the pre-installed cuDNN / cuBLAS / NCCL versions from
-# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-10.html#rel-25-10
-CUDA_VER="13.0" # 13.0.2
+# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-12.html#rel-25-12
+CUDA_VER="13.1" # 13.1.0
 # Keep the installation for cuDNN if users want to install PyTorch with source codes.
 # PyTorch 2.x can compile with cuDNN v9.
-CUDNN_VER="9.14.0.64-1"
-NCCL_VER="2.27.7-1+cuda13.0"
-CUBLAS_VER="13.1.0.3-1"
+CUDNN_VER="9.17.0.29-1"
+NCCL_VER="2.28.9-1+cuda13.0"
+CUBLAS_VER="13.2.0.9-1"
 # Align with the pre-installed CUDA / NVCC / NVRTC versions from
 # https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html
-NVRTC_VER="13.0.88-1"
-CUDA_RUNTIME="13.0.96-1"
-CUDA_DRIVER_VERSION="580.95.05-1.el8"
+NVRTC_VER="13.1.80-1"
+CUDA_RUNTIME="13.1.80-1"
+CUDA_DRIVER_VERSION="590.44.01-1.el8"
 
 for i in "$@"; do
     case $i in
@@ -118,7 +118,12 @@ install_rockylinux_requirements() {
 install_tensorrt() {
     PY_VERSION=$(python3 -c 'import sys; print(".".join(map(str, sys.version_info[0:2])))')
     PARSED_PY_VERSION=$(echo "${PY_VERSION//./}")
+
     TRT_CUDA_VERSION=${CUDA_VER}
+    # No CUDA 13.1 version for TensorRT yet. Use CUDA 13.0 package instead.
+    if [ "$CUDA_VER" = "13.1" ]; then
+        TRT_CUDA_VERSION="13.0"
+    fi
     TRT_VER_SHORT=$(echo $TRT_VER | cut -d. -f1-3)
 
     if [ -z "$RELEASE_URL_TRT" ];then
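The new TRT_CUDA_VERSION indirection separates the container's CUDA version from the CUDA flavor of the TensorRT package being fetched. A standalone bash sketch of the same fallback under the same variable names; the final echo is a stand-in for the script's real download step, not part of it:

#!/bin/bash
# Hedged sketch of the fallback in isolation; the echo replaces the
# script's actual download logic.
set -e

CUDA_VER="13.1"
TRT_VER="10.14.1.48"

TRT_CUDA_VERSION="${CUDA_VER}"
# No TensorRT build is published against CUDA 13.1 yet; reuse the CUDA 13.0
# package, which remains usable under CUDA minor-version compatibility.
if [ "${TRT_CUDA_VERSION}" = "13.1" ]; then
    TRT_CUDA_VERSION="13.0"
fi

TRT_VER_SHORT=$(echo "${TRT_VER}" | cut -d. -f1-3)
echo "would fetch TensorRT ${TRT_VER_SHORT} (cuda-${TRT_CUDA_VERSION}) tarball here"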

jenkins/Build.groovy
3 additions, 3 deletions

@@ -83,19 +83,19 @@ def BUILD_CONFIGS = [
         (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --extra-cmake-vars MOONCAKE_ROOT=/usr/local/Mooncake",
         (TARNAME) : "TensorRT-LLM-GH200.tar.gz",
         (WHEEL_ARCHS): "90-real;100-real;103-real;120-real",
-        (BUILD_JOBS_FOR_CONFIG): "4", // TODO: Remove after fix the build OOM issue on SBSA
+        (BUILD_JOBS_FOR_CONFIG): "8", // TODO: Remove after fix the build OOM issue on SBSA
     ],
     (CONFIG_LINUX_AARCH64_PYBIND): [
         (WHEEL_EXTRA_ARGS) : "--binding_type pybind --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --extra-cmake-vars MOONCAKE_ROOT=/usr/local/Mooncake",
         (TARNAME) : "pybind-TensorRT-LLM-GH200.tar.gz",
         (WHEEL_ARCHS): "90-real;100-real;103-real;120-real",
-        (BUILD_JOBS_FOR_CONFIG): "4", // TODO: Remove after fix the build OOM issue on SBSA
+        (BUILD_JOBS_FOR_CONFIG): "8", // TODO: Remove after fix the build OOM issue on SBSA
     ],
     (CONFIG_LINUX_AARCH64_LLVM) : [
         (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars WARNING_IS_ERROR=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CUDA_HOST_COMPILER=clang -DCMAKE_LINKER_TYPE=LLD",
         (TARNAME) : "llvm-TensorRT-LLM-GH200.tar.gz",
         (WHEEL_ARCHS): "90-real;100-real;103-real;120-real",
-        (BUILD_JOBS_FOR_CONFIG): "4", // TODO: Remove after fix the build OOM issue on SBSA
+        (BUILD_JOBS_FOR_CONFIG): "8", // TODO: Remove after fix the build OOM issue on SBSA
     ],
 ]

jenkins/L0_Test.groovy
14 additions, 8 deletions

@@ -39,7 +39,7 @@ LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE = env.wheelDockerImagePy310
 LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE = env.wheelDockerImagePy312
 
 // DLFW torch image
-DLFW_IMAGE = "urm.nvidia.com/docker/nvidia/pytorch:25.10-py3"
+DLFW_IMAGE = "urm.nvidia.com/docker/nvidia/pytorch:25.12-py3"
 
 //Ubuntu base image
 UBUNTU_22_04_IMAGE = "urm.nvidia.com/docker/ubuntu:22.04"
@@ -316,6 +316,11 @@ def processShardTestList(llmSrc, testDBList, splitId, splits, perfMode=false) {
         foundRunningLine = true
         return false // Don't include the "Running" line itself
     }
+    // Stop collecting when we hit the warnings/errors summary separator
+    if (foundRunningLine && line.contains('======================')) {
+        foundRunningLine = false // Stop collecting
+        return false
+    }
 
     def hasDoubleColon = line.contains('::')
     def shouldInclude = foundRunningLine && hasDoubleColon
@@ -3389,7 +3394,7 @@ def launchTestJobs(pipeline, testFilter)
     // Python version and OS for sanity check
     x86SanityCheckConfigs = [
         "PY312-DLFW": [
-            LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE,
+            LLM_DOCKER_IMAGE, // Workaround ABI incompatibilities between PyTorch 2.9.1 and 2.10.0a0
             "B200_PCIe",
             X86_64_TRIPLE,
             false,
@@ -3418,15 +3423,16 @@ def launchTestJobs(pipeline, testFilter)
     ]
 
     aarch64SanityCheckConfigs = [
+        /* //Disable PY312-UB2404 temporarily since lack of official PyTorch for CUDA 13.1.
         "PY312-UB2404": [
             LLM_DOCKER_IMAGE,
             "GH200",
             AARCH64_TRIPLE,
             false,
             "",
-            UBUNTU_24_04_IMAGE,
-            true, // Extra PyTorch CUDA 13.0 install
-        ],
+            DLFW_IMAGE,
+            false, // Extra PyTorch CUDA 13.0 install
+        ],*/
         "PY312-DLFW": [
             LLM_DOCKER_IMAGE,
             "GH200",
@@ -3524,17 +3530,17 @@ def launchTestJobs(pipeline, testFilter)
     def platform = cpu_arch == X86_64_TRIPLE ? "x86_64" : "sbsa"
     trtllm_utils.llmExecStepWithRetry(pipeline, script: "wget https://developer.download.nvidia.com/compute/cuda/repos/${ubuntu_version}/${platform}/cuda-keyring_1.1-1_all.deb")
     trtllm_utils.llmExecStepWithRetry(pipeline, script: "dpkg -i cuda-keyring_1.1-1_all.deb")
-    trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get update && apt-get install -y cuda-toolkit-13-0")
+    trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get update && apt-get install -y cuda-toolkit-13-1")
 }
 // Extra PyTorch CUDA 13.0 install for all bare-metal environments (Default PyTorch is for CUDA 12.8)
 if (values[6]) {
     echo "###### Extra PyTorch CUDA 13.0 install Start ######"
     // Use internal mirror instead of https://download.pytorch.org/whl/cu130 for better network stability.
     // PyTorch CUDA 13.0 package and torchvision package can be installed as expected.
     if (k8s_arch == "amd64") {
-        trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.9.0+cu130 torchvision==0.24.0+cu130 --extra-index-url https://urm.nvidia.com/artifactory/api/pypi/pytorch-cu128-remote/simple")
+        trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.9.1+cu130 torchvision==0.24.1+cu130 --extra-index-url https://urm.nvidia.com/artifactory/api/pypi/pytorch-cu128-remote/simple")
     } else {
-        trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.9.0+cu130 torchvision==0.24.0 --extra-index-url https://urm.nvidia.com/artifactory/api/pypi/pytorch-cu128-remote/simple")
+        trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.9.1+cu130 torchvision==0.24.1 --extra-index-url https://urm.nvidia.com/artifactory/api/pypi/pytorch-cu128-remote/simple")
     }
 }
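For context on the processShardTestList change above: collection of test names starts after the "Running" banner and now also stops at the ====================== separator, so pytest's warnings/errors summary (whose lines can also contain ::) is no longer swept into the shard list. A self-contained Groovy sketch of that collection window, run on invented log lines rather than real pipeline output:

// Hedged sketch mirroring the foundRunningLine logic on made-up log lines.
def logLines = [
    'collected 3 items',
    'Running tests:',
    'tests/a.py::test_x',
    'tests/b.py::test_y',
    '====================== warnings summary ======================',
    'tests/a.py::test_x emitted a warning (must not be collected)',
]
def collecting = false
def tests = logLines.findAll { line ->
    if (line.contains('Running')) { collecting = true; return false }   // open the window
    if (collecting && line.contains('======================')) {        // close it at the summary
        collecting = false
        return false
    }
    return collecting && line.contains('::')                            // keep test IDs inside the window
}
assert tests == ['tests/a.py::test_x', 'tests/b.py::test_y']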

jenkins/current_image_tags.properties
4 additions, 4 deletions

@@ -13,7 +13,7 @@
 # images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead.
 IMAGE_NAME=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm
 
-LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-x86_64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202512241744-10055
-LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-aarch64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202512241744-10055
-LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py310-trt10.13.3.9-skip-tritondevel-202512241744-10055
-LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py312-trt10.13.3.9-skip-tritondevel-202512241744-10055
+LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.12-py3-x86_64-ubuntu24.04-trt10.14.1.48-skip-tritondevel-202601011103-9818
+LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.12-py3-aarch64-ubuntu24.04-trt10.14.1.48-skip-tritondevel-202601011103-9818
+LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py310-trt10.14.1.48-skip-tritondevel-202601011103-9818
+LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py312-trt10.14.1.48-skip-tritondevel-202601011103-9818
