diff --git a/build.py b/build.py index 716b227c42..56e0be5842 100755 --- a/build.py +++ b/build.py @@ -74,11 +74,11 @@ "release_version": "2.60.0dev", "triton_container_version": "25.08dev", "upstream_container_version": "25.07", - "ort_version": "1.22.0", + "ort_version": "1.23.0", "ort_openvino_version": "2025.2.0", "standalone_openvino_version": "2025.2.0", "dcgm_version": "4.2.3-2", - "vllm_version": "0.9.0.1", + "vllm_version": "0.9.2", "rhel_py_version": "3.12.3", } @@ -1259,7 +1259,7 @@ def create_dockerfile_linux( # stage of the PyTorch backend if not FLAGS.enable_gpu and ("pytorch" in backends): df += """ -RUN patchelf --add-needed /usr/local/cuda/lib64/stubs/libcublasLt.so.12 backends/pytorch/libtorch_cuda.so +RUN patchelf --add-needed /usr/local/cuda/lib64/stubs/libcublasLt.so.13 backends/pytorch/libtorch_cuda.so """ if "tensorrtllm" in backends: df += """ @@ -1494,7 +1494,7 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach cp -r nvpl_slim_24.04/include/* /usr/local/include && \\ rm -rf nvpl_slim_24.04.tar nvpl_slim_24.04; \\ fi \\ - && pip3 install --no-cache-dir --progress-bar on --index-url $VLLM_INDEX_URL -r /run/secrets/requirements \\ + && pip3 install --no-cache-dir --extra-index-url $VLLM_INDEX_URL -r /run/secrets/requirements \\ # Need to install in-house build of pytorch-triton to support triton_key definition used by torch 2.5.1 && cd /tmp \\ && wget $PYTORCH_TRITON_URL \\ @@ -1554,18 +1554,18 @@ def add_cpu_libs_to_linux_dockerfile(backends, target_machine): df += """ RUN mkdir -p /usr/local/cuda/lib64/stubs COPY --from=min_container /usr/local/cuda/lib64/stubs/libcusparse.so /usr/local/cuda/lib64/stubs/libcusparse.so.12 -COPY --from=min_container /usr/local/cuda/lib64/stubs/libcusolver.so /usr/local/cuda/lib64/stubs/libcusolver.so.11 +COPY --from=min_container /usr/local/cuda/lib64/stubs/libcusolver.so /usr/local/cuda/lib64/stubs/libcusolver.so.12 COPY --from=min_container 
/usr/local/cuda/lib64/stubs/libcurand.so /usr/local/cuda/lib64/stubs/libcurand.so.10 -COPY --from=min_container /usr/local/cuda/lib64/stubs/libcufft.so /usr/local/cuda/lib64/stubs/libcufft.so.11 -COPY --from=min_container /usr/local/cuda/lib64/stubs/libcublas.so /usr/local/cuda/lib64/stubs/libcublas.so.12 -COPY --from=min_container /usr/local/cuda/lib64/stubs/libcublasLt.so /usr/local/cuda/lib64/stubs/libcublasLt.so.12 -COPY --from=min_container /usr/local/cuda/lib64/stubs/libcublasLt.so /usr/local/cuda/lib64/stubs/libcublasLt.so.11 +COPY --from=min_container /usr/local/cuda/lib64/stubs/libcufft.so /usr/local/cuda/lib64/stubs/libcufft.so.12 +COPY --from=min_container /usr/local/cuda/lib64/stubs/libcublas.so /usr/local/cuda/lib64/stubs/libcublas.so.13 +COPY --from=min_container /usr/local/cuda/lib64/stubs/libcublasLt.so /usr/local/cuda/lib64/stubs/libcublasLt.so.13 RUN mkdir -p /usr/local/cuda/targets/{cuda_arch}-linux/lib -COPY --from=min_container /usr/local/cuda/lib64/libcudart.so.12 /usr/local/cuda/targets/{cuda_arch}-linux/lib/. -COPY --from=min_container /usr/local/cuda/lib64/libcupti.so.12 /usr/local/cuda/targets/{cuda_arch}-linux/lib/. -COPY --from=min_container /usr/local/cuda/lib64/libnvJitLink.so.12 /usr/local/cuda/targets/{cuda_arch}-linux/lib/. +COPY --from=min_container /usr/local/cuda/lib64/libcudart.so.13 /usr/local/cuda/targets/{cuda_arch}-linux/lib/. +COPY --from=min_container /usr/local/cuda/lib64/libcupti.so.13 /usr/local/cuda/targets/{cuda_arch}-linux/lib/. +COPY --from=min_container /usr/local/cuda/lib64/libnvJitLink.so.13 /usr/local/cuda/targets/{cuda_arch}-linux/lib/. COPY --from=min_container /usr/local/cuda/lib64/libcufile.so.0 /usr/local/cuda/targets/{cuda_arch}-linux/lib/. +COPY --from=min_container /usr/local/cuda/lib64/libnvrtc.so.13 /usr/local/cuda/targets/{cuda_arch}-linux/lib/. 
RUN mkdir -p /opt/hpcx/ucc/lib/ /opt/hpcx/ucx/lib/ COPY --from=min_container /opt/hpcx/ucc/lib/libucc.so.1 /opt/hpcx/ucc/lib/libucc.so.1 diff --git a/qa/common/gen_jetson_trt_models b/qa/common/gen_jetson_trt_models index 5bbfb4c74f..956ba03955 100755 --- a/qa/common/gen_jetson_trt_models +++ b/qa/common/gen_jetson_trt_models @@ -142,6 +142,7 @@ docker pull $TENSORRT_IMAGE docker run $DOCKER_GPU_ARGS \ --rm -v $DOCKER_VOLUME:/mnt \ + -e TRT_VERBOSE \ $TENSORRT_IMAGE bash -xe $VOLUME_SRCDIR/$TRT_MODEL_SCRIPT # Copy generated models to /tmp/ if not running in CI diff --git a/qa/common/gen_qa_dyna_sequence_implicit_models.py b/qa/common/gen_qa_dyna_sequence_implicit_models.py index e07e4cf5ec..a977710c51 100755 --- a/qa/common/gen_qa_dyna_sequence_implicit_models.py +++ b/qa/common/gen_qa_dyna_sequence_implicit_models.py @@ -357,7 +357,11 @@ def create_onnx_modelconfig(models_dir, model_version, max_batch, dtype, shape): def create_plan_modelfile(models_dir, model_version, max_batch, dtype, shape): trt_dtype = np_to_trt_dtype(dtype) - TRT_LOGGER = trt.Logger(trt.Logger.INFO) + TRT_LOGGER = ( + trt.Logger(trt.Logger.INFO) + if os.environ.get("TRT_VERBOSE") != "1" + else trt.Logger(trt.Logger.VERBOSE) + ) builder = trt.Builder(TRT_LOGGER) network = builder.create_network() @@ -492,7 +496,11 @@ def create_plan_rf_modelfile(models_dir, model_version, max_batch, dtype, shape) trt_dtype = np_to_trt_dtype(dtype) trt_memory_format = trt.TensorFormat.LINEAR - TRT_LOGGER = trt.Logger(trt.Logger.INFO) + TRT_LOGGER = ( + trt.Logger(trt.Logger.INFO) + if os.environ.get("TRT_VERBOSE") != "1" + else trt.Logger(trt.Logger.VERBOSE) + ) builder = trt.Builder(TRT_LOGGER) network = builder.create_network() diff --git a/qa/common/gen_qa_dyna_sequence_models.py b/qa/common/gen_qa_dyna_sequence_models.py index 9c21d92b96..e91ce42132 100755 --- a/qa/common/gen_qa_dyna_sequence_models.py +++ b/qa/common/gen_qa_dyna_sequence_models.py @@ -59,7 +59,11 @@ def create_plan_shape_tensor_modelfile( 
trt_shape_dtype = np_to_trt_dtype(shape_tensor_input_dtype) trt_memory_format = trt.TensorFormat.LINEAR - TRT_LOGGER = trt.Logger(trt.Logger.INFO) + TRT_LOGGER = ( + trt.Logger(trt.Logger.INFO) + if os.environ.get("TRT_VERBOSE") != "1" + else trt.Logger(trt.Logger.VERBOSE) + ) builder = trt.Builder(TRT_LOGGER) network = builder.create_network() @@ -202,7 +206,11 @@ def create_plan_modelfile(models_dir, model_version, max_batch, dtype, shape): # Create the model. For now don't implement a proper accumulator # just return 0 if not-ready and 'INPUT'+'START'*('END'*'CORRID') # otherwise... the tests know to expect this. - TRT_LOGGER = trt.Logger(trt.Logger.INFO) + TRT_LOGGER = ( + trt.Logger(trt.Logger.INFO) + if os.environ.get("TRT_VERBOSE") != "1" + else trt.Logger(trt.Logger.VERBOSE) + ) builder = trt.Builder(TRT_LOGGER) network = builder.create_network() @@ -310,7 +318,11 @@ def create_plan_rf_modelfile(models_dir, model_version, max_batch, dtype, shape) # Create the model. For now don't implement a proper accumulator # just return 0 if not-ready and 'INPUT'+'START'*('END'*'CORRID') # otherwise... the tests know to expect this. 
- TRT_LOGGER = trt.Logger(trt.Logger.INFO) + TRT_LOGGER = ( + trt.Logger(trt.Logger.INFO) + if os.environ.get("TRT_VERBOSE") != "1" + else trt.Logger(trt.Logger.VERBOSE) + ) builder = trt.Builder(TRT_LOGGER) network = builder.create_network() diff --git a/qa/common/gen_qa_identity_models.py b/qa/common/gen_qa_identity_models.py index 5fa7b7ab01..7b513d3fbf 100755 --- a/qa/common/gen_qa_identity_models.py +++ b/qa/common/gen_qa_identity_models.py @@ -545,7 +545,11 @@ def create_plan_dynamic_rf_modelfile( models_dir, model_version, io_cnt, max_batch, dtype, shape, profile_max_size ): # Create the model - TRT_LOGGER = trt.Logger(trt.Logger.INFO) + TRT_LOGGER = ( + trt.Logger(trt.Logger.INFO) + if os.environ.get("TRT_VERBOSE") != "1" + else trt.Logger(trt.Logger.VERBOSE) + ) builder = trt.Builder(TRT_LOGGER) network = builder.create_network() @@ -644,7 +648,11 @@ def create_plan_shape_tensor_modelfile( # Note that values of OUTPUT tensor must be identical # to INPUT values - TRT_LOGGER = trt.Logger(trt.Logger.INFO) + TRT_LOGGER = ( + trt.Logger(trt.Logger.INFO) + if os.environ.get("TRT_VERBOSE") != "1" + else trt.Logger(trt.Logger.VERBOSE) + ) builder = trt.Builder(TRT_LOGGER) network = builder.create_network() @@ -748,7 +756,11 @@ def create_plan_dynamic_modelfile( models_dir, model_version, io_cnt, max_batch, dtype, shape, profile_max_size ): # Create the model - TRT_LOGGER = trt.Logger(trt.Logger.INFO) + TRT_LOGGER = ( + trt.Logger(trt.Logger.INFO) + if os.environ.get("TRT_VERBOSE") != "1" + else trt.Logger(trt.Logger.VERBOSE) + ) builder = trt.Builder(TRT_LOGGER) network = builder.create_network() diff --git a/qa/common/gen_qa_implicit_models.py b/qa/common/gen_qa_implicit_models.py index c3429d6012..241c021bdd 100755 --- a/qa/common/gen_qa_implicit_models.py +++ b/qa/common/gen_qa_implicit_models.py @@ -899,7 +899,11 @@ def create_onnx_modelconfig( def create_plan_modelfile(models_dir, model_version, max_batch, dtype, shape): trt_dtype = np_to_trt_dtype(dtype) - 
TRT_LOGGER = trt.Logger(trt.Logger.INFO) + TRT_LOGGER = ( + trt.Logger(trt.Logger.INFO) + if os.environ.get("TRT_VERBOSE") != "1" + else trt.Logger(trt.Logger.VERBOSE) + ) builder = trt.Builder(TRT_LOGGER) network = builder.create_network() @@ -1005,7 +1009,11 @@ def create_plan_rf_modelfile(models_dir, model_version, max_batch, dtype, shape) trt_dtype = np_to_trt_dtype(dtype) trt_memory_format = trt.TensorFormat.LINEAR - TRT_LOGGER = trt.Logger(trt.Logger.INFO) + TRT_LOGGER = ( + trt.Logger(trt.Logger.INFO) + if os.environ.get("TRT_VERBOSE") != "1" + else trt.Logger(trt.Logger.VERBOSE) + ) builder = trt.Builder(TRT_LOGGER) network = builder.create_network() diff --git a/qa/common/gen_qa_model_repository b/qa/common/gen_qa_model_repository index f84c0603d0..6865c1c71e 100755 --- a/qa/common/gen_qa_model_repository +++ b/qa/common/gen_qa_model_repository @@ -430,7 +430,7 @@ python3 $VOLUME_SRCDIR/gen_qa_ragged_models.py --tensorrt --models_dir=$VOLUME_R chmod -R 777 $VOLUME_RAGGEDDESTDIR python3 $VOLUME_SRCDIR/gen_qa_trt_format_models.py --models_dir=$VOLUME_FORMATDESTDIR chmod -R 777 $VOLUME_FORMATDESTDIR -python3 $VOLUME_SRCDIR/gen_qa_trt_data_dependent_shape.py --models_dir=$VOLUME_DATADEPENDENTDIR +nvidia-smi --query-gpu=compute_cap --format=csv,noheader | grep -qF 11.0 && echo -e '\033[33m[WARNING]\033[0m Skipping model generation for data dependent shape' || python3 $VOLUME_SRCDIR/gen_qa_trt_data_dependent_shape.py --models_dir=$VOLUME_DATADEPENDENTDIR chmod -R 777 $VOLUME_DATADEPENDENTDIR # Make shared library for custom Hardmax plugin. 
if [ -d "/usr/src/tensorrt" ]; then @@ -463,6 +463,7 @@ if [ "$MODEL_TYPE" != "igpu" ] ; then --label PROJECT_NAME=$PROJECT_NAME \ $DOCKER_GPU_ARGS \ -v $DOCKER_VOLUME:/mnt \ + -e TRT_VERBOSE \ $TENSORRT_IMAGE \ bash -xe $VOLUME_SRCDIR/$TRTSCRIPT diff --git a/qa/common/gen_qa_models.py b/qa/common/gen_qa_models.py index cd7efea723..cfce75be39 100755 --- a/qa/common/gen_qa_models.py +++ b/qa/common/gen_qa_models.py @@ -66,7 +66,11 @@ def create_plan_dynamic_rf_modelfile( trt_memory_format = trt.TensorFormat.LINEAR # Create the model - TRT_LOGGER = trt.Logger(trt.Logger.INFO) + TRT_LOGGER = ( + trt.Logger(trt.Logger.INFO) + if os.environ.get("TRT_VERBOSE") != "1" + else trt.Logger(trt.Logger.VERBOSE) + ) builder = trt.Builder(TRT_LOGGER) network = builder.create_network() if max_batch == 0: @@ -206,7 +210,11 @@ def create_plan_dynamic_modelfile( trt_output1_dtype = np_to_trt_dtype(output1_dtype) # Create the model - TRT_LOGGER = trt.Logger(trt.Logger.INFO) + TRT_LOGGER = ( + trt.Logger(trt.Logger.INFO) + if os.environ.get("TRT_VERBOSE") != "1" + else trt.Logger(trt.Logger.VERBOSE) + ) builder = trt.Builder(TRT_LOGGER) network = builder.create_network() if max_batch == 0: @@ -372,7 +380,11 @@ def create_plan_fixed_rf_modelfile( trt_memory_format = trt.TensorFormat.LINEAR # Create the model - TRT_LOGGER = trt.Logger(trt.Logger.INFO) + TRT_LOGGER = ( + trt.Logger(trt.Logger.INFO) + if os.environ.get("TRT_VERBOSE") != "1" + else trt.Logger(trt.Logger.VERBOSE) + ) builder = trt.Builder(TRT_LOGGER) network = builder.create_network() if max_batch == 0: @@ -483,7 +495,11 @@ def create_plan_fixed_modelfile( trt_output1_dtype = np_to_trt_dtype(output1_dtype) # Create the model - TRT_LOGGER = trt.Logger(trt.Logger.INFO) + TRT_LOGGER = ( + trt.Logger(trt.Logger.INFO) + if os.environ.get("TRT_VERBOSE") != "1" + else trt.Logger(trt.Logger.VERBOSE) + ) builder = trt.Builder(TRT_LOGGER) network = builder.create_network() if max_batch == 0: diff --git 
a/qa/common/gen_qa_ragged_models.py b/qa/common/gen_qa_ragged_models.py index de8c583d88..5db3dcf6ab 100755 --- a/qa/common/gen_qa_ragged_models.py +++ b/qa/common/gen_qa_ragged_models.py @@ -57,7 +57,11 @@ def create_plan_modelfile(models_dir, model_version, dtype): # - BATCH_MAX_ELEMENT_COUNT_AS_SHAPE # - BATCH_ITEM_SHAPE_FLATTEN - TRT_LOGGER = trt.Logger(trt.Logger.INFO) + TRT_LOGGER = ( + trt.Logger(trt.Logger.INFO) + if os.environ.get("TRT_VERBOSE") != "1" + else trt.Logger(trt.Logger.VERBOSE) + ) builder = trt.Builder(TRT_LOGGER) network = builder.create_network() trt_dtype = np_to_trt_dtype(dtype) @@ -412,7 +416,11 @@ def create_plan_itemshape_modelfile(models_dir, model_version, dtype): # generated to have matching batch dimension, the output can be produced # via identity op and expect Triton will scatter the output properly. - TRT_LOGGER = trt.Logger(trt.Logger.INFO) + TRT_LOGGER = ( + trt.Logger(trt.Logger.INFO) + if os.environ.get("TRT_VERBOSE") != "1" + else trt.Logger(trt.Logger.VERBOSE) + ) builder = trt.Builder(TRT_LOGGER) network = builder.create_network() trt_dtype = np_to_trt_dtype(dtype) diff --git a/qa/common/gen_qa_reshape_models.py b/qa/common/gen_qa_reshape_models.py index 8193b29677..d70333c925 100755 --- a/qa/common/gen_qa_reshape_models.py +++ b/qa/common/gen_qa_reshape_models.py @@ -58,7 +58,11 @@ def create_plan_modelfile( io_cnt = len(input_shapes) # Create the model that copies inputs to corresponding outputs. 
- TRT_LOGGER = trt.Logger(trt.Logger.INFO) + TRT_LOGGER = ( + trt.Logger(trt.Logger.INFO) + if os.environ.get("TRT_VERBOSE") != "1" + else trt.Logger(trt.Logger.VERBOSE) + ) builder = trt.Builder(TRT_LOGGER) network = builder.create_network() diff --git a/qa/common/gen_qa_sequence_models.py b/qa/common/gen_qa_sequence_models.py index ad31bbc0ba..99debede00 100755 --- a/qa/common/gen_qa_sequence_models.py +++ b/qa/common/gen_qa_sequence_models.py @@ -59,7 +59,11 @@ def create_plan_shape_tensor_modelfile( trt_shape_dtype = np_to_trt_dtype(shape_tensor_input_dtype) trt_memory_format = trt.TensorFormat.LINEAR - TRT_LOGGER = trt.Logger(trt.Logger.INFO) + TRT_LOGGER = ( + trt.Logger(trt.Logger.INFO) + if os.environ.get("TRT_VERBOSE") != "1" + else trt.Logger(trt.Logger.VERBOSE) + ) builder = trt.Builder(TRT_LOGGER) network = builder.create_network() @@ -182,7 +186,11 @@ def create_plan_modelfile(models_dir, model_version, max_batch, dtype, shape): # Create the model. For now don't implement a proper accumulator # just return 0 if not-ready and 'INPUT'+'START' otherwise... the # tests know to expect this. - TRT_LOGGER = trt.Logger(trt.Logger.INFO) + TRT_LOGGER = ( + trt.Logger(trt.Logger.INFO) + if os.environ.get("TRT_VERBOSE") != "1" + else trt.Logger(trt.Logger.VERBOSE) + ) builder = trt.Builder(TRT_LOGGER) network = builder.create_network() @@ -271,7 +279,11 @@ def create_plan_rf_modelfile(models_dir, model_version, max_batch, dtype, shape) # Create the model. For now don't implement a proper accumulator # just return 0 if not-ready and 'INPUT'+'START' otherwise... the # tests know to expect this. 
- TRT_LOGGER = trt.Logger(trt.Logger.INFO) + TRT_LOGGER = ( + trt.Logger(trt.Logger.INFO) + if os.environ.get("TRT_VERBOSE") != "1" + else trt.Logger(trt.Logger.VERBOSE) + ) builder = trt.Builder(TRT_LOGGER) network = builder.create_network() diff --git a/qa/common/gen_qa_trt_data_dependent_shape.py b/qa/common/gen_qa_trt_data_dependent_shape.py index c6f4bf2b5e..9ee9b60b68 100755 --- a/qa/common/gen_qa_trt_data_dependent_shape.py +++ b/qa/common/gen_qa_trt_data_dependent_shape.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -45,7 +45,11 @@ def create_data_dependent_modelfile( trt_input_dtype = np_to_trt_dtype(input_dtype) # Create the model - TRT_LOGGER = trt.Logger(trt.Logger.INFO) + TRT_LOGGER = ( + trt.Logger(trt.Logger.INFO) + if os.environ.get("TRT_VERBOSE") != "1" + else trt.Logger(trt.Logger.VERBOSE) + ) builder = trt.Builder(TRT_LOGGER) network = builder.create_network() diff --git a/qa/common/gen_qa_trt_format_models.py b/qa/common/gen_qa_trt_format_models.py index 6419a6e2ab..fee469e6a8 100755 --- a/qa/common/gen_qa_trt_format_models.py +++ b/qa/common/gen_qa_trt_format_models.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2020-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -81,7 +81,11 @@ def create_plan_modelfile( trt_output_memory_format = output_memory_format # Create the model - TRT_LOGGER = trt.Logger(trt.Logger.INFO) + TRT_LOGGER = ( + trt.Logger(trt.Logger.INFO) + if os.environ.get("TRT_VERBOSE") != "1" + else trt.Logger(trt.Logger.VERBOSE) + ) builder = trt.Builder(TRT_LOGGER) network = builder.create_network() if max_batch == 0: