4 changes: 2 additions & 2 deletions Dockerfile.sdk
@@ -1,4 +1,4 @@
# Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -29,7 +29,7 @@
#

# Base image on the minimum Triton container
ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.12-py3-min
ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:25.01-py3-min

ARG TRITON_CLIENT_REPO_SUBDIR=clientrepo
ARG TRITON_PA_REPO_SUBDIR=perfanalyzerrepo
31 changes: 13 additions & 18 deletions README.md
@@ -1,5 +1,5 @@
<!--
# Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2018-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -30,11 +30,6 @@

[![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause)

>[!WARNING]
>You are currently on the `main` branch which tracks under-development progress
>towards the next release. The current release is version [2.53.0](https://github.com/triton-inference-server/server/releases/latest)
>and corresponds to the 24.12 container release on NVIDIA GPU Cloud (NGC).

Triton Inference Server is an open source inference serving software that
streamlines AI inferencing. Triton enables teams to deploy any AI model from
multiple deep learning and machine learning frameworks, including TensorRT,
@@ -62,7 +57,7 @@ Major features include:
- Provides [Backend API](https://github.com/triton-inference-server/backend) that
allows adding custom backends and pre/post processing operations
- Supports writing custom backends in python, a.k.a.
[Python-based backends.](https://github.com/triton-inference-server/backend/blob/main/docs/python_based_backends.md#python-based-backends)
[Python-based backends.](https://github.com/triton-inference-server/backend/blob/r25.01/docs/python_based_backends.md#python-based-backends)
- Model pipelines using
[Ensembling](docs/user_guide/architecture.md#ensemble-models) or [Business
Logic Scripting
@@ -91,16 +86,16 @@ Inference Server with the

```bash
# Step 1: Create the example model repository
git clone -b r24.12 https://github.com/triton-inference-server/server.git
git clone -b r25.01 https://github.com/triton-inference-server/server.git
cd server/docs/examples
./fetch_models.sh

# Step 2: Launch triton from the NGC Triton container
docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:24.12-py3 tritonserver --model-repository=/models
docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:25.01-py3 tritonserver --model-repository=/models

# Step 3: Sending an Inference Request
# In a separate console, launch the image_client example from the NGC Triton SDK container
docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:24.12-py3-sdk
docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:25.01-py3-sdk
/workspace/install/bin/image_client -m densenet_onnx -c 3 -s INCEPTION /workspace/images/mug.jpg

# Inference should return the following
@@ -175,10 +170,10 @@ configuration](docs/user_guide/model_configuration.md) for the model.
[Python](https://github.com/triton-inference-server/python_backend), and more
- Not all the above backends are supported on every platform supported by Triton.
Look at the
[Backend-Platform Support Matrix](https://github.com/triton-inference-server/backend/blob/main/docs/backend_platform_support_matrix.md)
[Backend-Platform Support Matrix](https://github.com/triton-inference-server/backend/blob/r25.01/docs/backend_platform_support_matrix.md)
to learn which backends are supported on your target platform.
- Learn how to [optimize performance](docs/user_guide/optimization.md) using the
[Performance Analyzer](https://github.com/triton-inference-server/perf_analyzer/blob/main/README.md)
[Performance Analyzer](https://github.com/triton-inference-server/perf_analyzer/blob/r25.01/README.md)
and
[Model Analyzer](https://github.com/triton-inference-server/model_analyzer)
- Learn how to [manage loading and unloading models](docs/user_guide/model_management.md) in
@@ -192,14 +187,14 @@ A Triton *client* application sends inference and other requests to Triton. The
[Python and C++ client libraries](https://github.com/triton-inference-server/client)
provide APIs to simplify this communication.

- Review client examples for [C++](https://github.com/triton-inference-server/client/blob/main/src/c%2B%2B/examples),
[Python](https://github.com/triton-inference-server/client/blob/main/src/python/examples),
and [Java](https://github.com/triton-inference-server/client/blob/main/src/java/src/main/java/triton/client/examples)
- Review client examples for [C++](https://github.com/triton-inference-server/client/blob/r25.01/src/c%2B%2B/examples),
[Python](https://github.com/triton-inference-server/client/blob/r25.01/src/python/examples),
and [Java](https://github.com/triton-inference-server/client/blob/r25.01/src/java/src/main/java/triton/client/examples)
- Configure [HTTP](https://github.com/triton-inference-server/client#http-options)
and [gRPC](https://github.com/triton-inference-server/client#grpc-options)
client options
- Send input data (e.g. a jpeg image) directly to Triton in the [body of an HTTP
request without any additional metadata](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_binary_data.md#raw-binary-request)
request without any additional metadata](https://github.com/triton-inference-server/server/blob/r25.01/docs/protocol/extension_binary_data.md#raw-binary-request)

### Extend Triton

@@ -208,7 +203,7 @@ designed for modularity and flexibility

- [Customize Triton Inference Server container](docs/customization_guide/compose.md) for your use case
- [Create custom backends](https://github.com/triton-inference-server/backend)
in either [C/C++](https://github.com/triton-inference-server/backend/blob/main/README.md#triton-backend-api)
in either [C/C++](https://github.com/triton-inference-server/backend/blob/r25.01/README.md#triton-backend-api)
or [Python](https://github.com/triton-inference-server/python_backend)
- Create [decoupled backends and models](docs/user_guide/decoupled_models.md) that can send
multiple responses for a request or not send any responses for a request
@@ -217,7 +212,7 @@ designed for modularity and flexibility
decryption, or conversion
- Deploy Triton on [Jetson and JetPack](docs/user_guide/jetson.md)
- [Use Triton on AWS
Inferentia](https://github.com/triton-inference-server/python_backend/tree/main/inferentia)
Inferentia](https://github.com/triton-inference-server/python_backend/tree/r25.01/inferentia)

### Additional Documentation

2 changes: 1 addition & 1 deletion TRITON_VERSION
@@ -1 +1 @@
2.54.0dev
2.55.0dev
88 changes: 70 additions & 18 deletions build.py
@@ -71,12 +71,12 @@
#

DEFAULT_TRITON_VERSION_MAP = {
"release_version": "2.54.0dev",
"triton_container_version": "25.01dev",
"upstream_container_version": "24.12",
"release_version": "2.55.0dev",
"triton_container_version": "25.02dev",
"upstream_container_version": "25.01",
"ort_version": "1.20.1",
"ort_openvino_version": "2024.4.0",
"standalone_openvino_version": "2024.4.0",
"ort_openvino_version": "2024.5.0",
"standalone_openvino_version": "2024.5.0",
"dcgm_version": "3.3.6",
"vllm_version": "0.6.3.post1",
"rhel_py_version": "3.12.3",
@@ -962,7 +962,6 @@ def create_dockerfile_buildbase_rhel(ddir, dockerfile_name, argmap):
libcurl-devel \\
libb64-devel \\
gperftools-devel \\
patchelf \\
python3-pip \\
python3-setuptools \\
rapidjson-devel \\
@@ -990,7 +989,8 @@ def create_dockerfile_buildbase_rhel(ddir, dockerfile_name, argmap):
wheel \\
setuptools \\
docker \\
virtualenv
virtualenv \\
patchelf==0.17.2

# Install boost version >= 1.78 for boost::span
# Current libboost-dev apt packages are < 1.78, so install from tar.gz
@@ -1089,7 +1089,6 @@ def create_dockerfile_buildbase(ddir, dockerfile_name, argmap):
libcurl4-openssl-dev \\
libb64-dev \\
libgoogle-perftools-dev \\
patchelf \\
python3-dev \\
python3-pip \\
python3-wheel \\
@@ -1110,7 +1109,8 @@ def create_dockerfile_buildbase(ddir, dockerfile_name, argmap):
RUN pip3 install --upgrade \\
build \\
docker \\
virtualenv
virtualenv \\
patchelf==0.17.2

# Install boost version >= 1.78 for boost::span
# Current libboost-dev apt packages are < 1.78, so install from tar.gz
@@ -1354,11 +1354,12 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach
libcurl-devel \\
libb64-devel \\
gperftools-devel \\
patchelf \\
wget \\
python3-pip \\
numactl-devel

RUN pip3 install patchelf==0.17.2

"""
else:
df += """
@@ -1467,12 +1468,31 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach
"""

if "vllm" in backends:
df += """
# vLLM needed for vLLM backend
RUN pip3 install vllm=={}
""".format(
FLAGS.vllm_version
)
df += f"""
ARG BUILD_PUBLIC_VLLM="true"
ARG VLLM_INDEX_URL
ARG PYTORCH_TRITON_URL

RUN --mount=type=secret,id=req,target=/run/secrets/requirements \\
if [ "$BUILD_PUBLIC_VLLM" = "false" ]; then \\
pip3 install --no-cache-dir \\
mkl==2021.1.1 \\
mkl-include==2021.1.1 \\
mkl-devel==2021.1.1 \\
&& pip3 install --no-cache-dir --progress-bar on --index-url $VLLM_INDEX_URL -r /run/secrets/requirements \\
# Need to install in-house build of pytorch-triton to support triton_key definition used by torch 2.5.1
&& cd /tmp \\
&& wget $PYTORCH_TRITON_URL \\
&& pip install --no-cache-dir /tmp/pytorch_triton-*.whl \\
&& rm /tmp/pytorch_triton-*.whl; \\
else \\
# public vLLM needed for vLLM backend
pip3 install vllm=={DEFAULT_TRITON_VERSION_MAP["vllm_version"]}; \\
fi

ARG PYVER=3.12
ENV LD_LIBRARY_PATH /usr/local/lib:/usr/local/lib/python${{PYVER}}/dist-packages/torch/lib:${{LD_LIBRARY_PATH}}
"""

if "dali" in backends:
df += """
@@ -1543,7 +1563,8 @@ def add_cpu_libs_to_linux_dockerfile(backends, target_machine):

# patchelf is needed to add deps of libcublasLt.so.12 to libtorch_cuda.so
RUN apt-get update \\
&& apt-get install -y --no-install-recommends openmpi-bin patchelf
&& apt-get install -y --no-install-recommends openmpi-bin
RUN pip3 install patchelf==0.17.2

ENV LD_LIBRARY_PATH /usr/local/cuda/targets/{cuda_arch}-linux/lib:/usr/local/cuda/lib64/stubs:${{LD_LIBRARY_PATH}}
""".format(
@@ -1840,13 +1861,21 @@ def create_docker_build_script(script_name, container_install_dir, container_ci_
finalargs = [
"docker",
"build",
]
if secrets != "":
finalargs += [
f"--secret id=req,src={requirements}",
f"--build-arg VLLM_INDEX_URL={vllm_index_url}",
f"--build-arg PYTORCH_TRITON_URL={pytorch_triton_url}",
f"--build-arg BUILD_PUBLIC_VLLM={build_public_vllm}",
]
finalargs += [
"-t",
"tritonserver",
"-f",
os.path.join(FLAGS.build_dir, "Dockerfile"),
".",
]

docker_script.cwd(THIS_SCRIPT_DIR)
docker_script.cmd(finalargs, check_exitcode=True)

@@ -2691,6 +2720,19 @@ def enable_all():
default=DEFAULT_TRITON_VERSION_MAP["rhel_py_version"],
help="This flag sets the Python version for RHEL platform of Triton Inference Server to be built. Default: the latest supported version.",
)
parser.add_argument(
"--build-secret",
action="append",
required=False,
nargs=2,
metavar=("key", "value"),
help="Add build secrets in the form of <key> <value>. These secrets are used during the build process for vllm. The secrets are passed to the Docker build step as `--secret id=<key>`. The following keys are expected and their purposes are described below:\n\n"
" - 'req': A file containing a list of dependencies for pip (e.g., requirements.txt).\n"
" - 'vllm_index_url': The index URL for the pip install.\n"
" - 'pytorch_triton_url': The location of the PyTorch wheel to download.\n"
" - 'build_public_vllm': A flag (default is 'true') indicating whether to build the public VLLM version.\n\n"
"Ensure that the required environment variables for these secrets are set before running the build.",
)
FLAGS = parser.parse_args()

if FLAGS.image is None:
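
To illustrate how the new `--build-secret` flag is meant to be used, here is a hypothetical invocation (other build flags omitted; the file path and URLs below are placeholders, not values from this PR):

```bash
# Hypothetical example: each --build-secret takes a <key> <value> pair matching
# the keys documented in the help text above ('req', 'vllm_index_url',
# 'pytorch_triton_url', 'build_public_vllm').
python3 build.py --backend vllm \
    --build-secret req ./requirements.txt \
    --build-secret vllm_index_url https://example.com/simple \
    --build-secret pytorch_triton_url https://example.com/pytorch_triton.whl \
    --build-secret build_public_vllm false
```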
@@ -2717,6 +2759,8 @@ def enable_all():
FLAGS.override_backend_cmake_arg = []
if FLAGS.extra_backend_cmake_arg is None:
FLAGS.extra_backend_cmake_arg = []
if FLAGS.build_secret is None:
FLAGS.build_secret = []

# if --enable-all is specified, then update FLAGS to enable all
# settings, backends, repo-agents, caches, file systems, endpoints, etc.
@@ -2810,6 +2854,14 @@ def enable_all():
)
backends["python"] = backends["vllm"]

secrets = dict(getattr(FLAGS, "build_secret", []))
if secrets is not None:
requirements = secrets.get("req", "")
vllm_index_url = secrets.get("vllm_index_url", "")
pytorch_triton_url = secrets.get("pytorch_triton_url", "")
build_public_vllm = secrets.get("build_public_vllm", "true")
log('Build Arg for BUILD_PUBLIC_VLLM: "{}"'.format(build_public_vllm))

# Initialize map of repo agents to build and repo-tag for each.
repoagents = {}
for be in FLAGS.repoagent:
4 changes: 2 additions & 2 deletions deploy/aws/values.yaml
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2024, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2019-2025, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -27,7 +27,7 @@
replicaCount: 1

image:
imageName: nvcr.io/nvidia/tritonserver:24.12-py3
imageName: nvcr.io/nvidia/tritonserver:25.01-py3
pullPolicy: IfNotPresent
modelRepositoryPath: s3://triton-inference-server-repository/model_repository
numGpus: 1
4 changes: 2 additions & 2 deletions deploy/fleetcommand/Chart.yaml
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2024, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2019-2025, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -26,7 +26,7 @@

apiVersion: v1
# appVersion is the Triton version; update when changing release
appVersion: "2.53.0"
appVersion: "2.54.0"
description: Triton Inference Server (Fleet Command)
name: triton-inference-server
# version is the Chart version; update when changing anything in the chart
8 changes: 4 additions & 4 deletions deploy/fleetcommand/values.yaml
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2024, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2019-2025, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -27,7 +27,7 @@
replicaCount: 1

image:
imageName: nvcr.io/nvidia/tritonserver:24.12-py3
imageName: nvcr.io/nvidia/tritonserver:25.01-py3
pullPolicy: IfNotPresent
numGpus: 1
serverCommand: tritonserver
@@ -47,13 +47,13 @@ image:
#
# To set model control mode, uncomment and configure below
# TODO: Fix the following url, it is invalid
# See https://github.com/triton-inference-server/server/blob/r24.12/docs/model_management.md
# See https://github.com/triton-inference-server/server/blob/r25.01/docs/model_management.md
# for more details
#- --model-control-mode=explicit|poll|none
#
# Additional server args
#
# see https://github.com/triton-inference-server/server/blob/r24.12/README.md
# see https://github.com/triton-inference-server/server/blob/r25.01/README.md
# for more details

service:
Expand Down
4 changes: 2 additions & 2 deletions deploy/gcp/values.yaml
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2024, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2019-2025, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -27,7 +27,7 @@
replicaCount: 1

image:
imageName: nvcr.io/nvidia/tritonserver:24.12-py3
imageName: nvcr.io/nvidia/tritonserver:25.01-py3
pullPolicy: IfNotPresent
modelRepositoryPath: gs://triton-inference-server-repository/model_repository
numGpus: 1
@@ -1,4 +1,4 @@
# Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -33,7 +33,7 @@ metadata:
namespace: default
spec:
containers:
- image: nvcr.io/nvidia/tritonserver:24.12-py3-sdk
- image: nvcr.io/nvidia/tritonserver:25.01-py3-sdk
imagePullPolicy: Always
name: nv-triton-client
securityContext: