
Commit b4cbfa2

mc-nv, krishung5, kthui, and GuanLuo authored

post-25.08: Update default branch (#8366)

Co-authored-by: Kris Hung <[email protected]>
Co-authored-by: Jacky <[email protected]>
Co-authored-by: GuanLuo <[email protected]>
1 parent d3817a1 commit b4cbfa2


59 files changed: +347, -453 lines

CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
@@ -88,7 +88,7 @@ proposed change so that the Triton team can provide feedback.
   documentation for instructions on running these tests.

 - Triton Inference Server's default build assumes recent versions of
-  dependencies (CUDA, TensorFlow, PyTorch, TensorRT,
+  dependencies (CUDA, PyTorch, TensorRT,
   etc.). Contributions that add compatibility with older versions of
   those dependencies will be considered, but NVIDIA cannot guarantee
   that all possible build configurations work, are not broken by

Dockerfile.sdk

Lines changed: 2 additions & 2 deletions
@@ -29,7 +29,7 @@
 #

 # Base image on the minimum Triton container
-ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:25.07-py3-min
+ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:25.08-py3-min

 ARG TRITON_CLIENT_REPO_SUBDIR=clientrepo
 ARG TRITON_PA_REPO_SUBDIR=perfanalyzerrepo
@@ -43,7 +43,7 @@ ARG JAVA_BINDINGS_MAVEN_VERSION=3.8.4
 ARG JAVA_BINDINGS_JAVACPP_PRESETS_TAG=1.5.8
 ARG TRITON_PERF_ANALYZER_BUILD=1
 # DCGM version to install for Model Analyzer
-ARG DCGM_VERSION=4.2.3-2
+ARG DCGM_VERSION=4.4.0-1

 ARG NVIDIA_TRITON_SERVER_SDK_VERSION=unknown
 ARG NVIDIA_BUILD_ID=unknown
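
Both bumped values are plain `ARG`s, so a local SDK build can pin them without editing the file; a minimal sketch, run from the repository root (the output tag is an arbitrary example):

```bash
# Hedged sketch: override the updated build args at build time; the tag
# "tritonserver-sdk:local" is a placeholder, not an official image name.
docker build -f Dockerfile.sdk \
  --build-arg BASE_IMAGE=nvcr.io/nvidia/tritonserver:25.08-py3-min \
  --build-arg DCGM_VERSION=4.4.0-1 \
  -t tritonserver-sdk:local .
```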

README.md

Lines changed: 6 additions & 7 deletions
@@ -29,15 +29,15 @@

 >[!WARNING]
 >You are currently on the `main` branch which tracks under-development progress
->towards the next release. The current release is version [2.59.1](https://github.com/triton-inference-server/server/releases/latest)
->and corresponds to the 25.07 container release on NVIDIA GPU Cloud (NGC).
+>towards the next release. The current release is version [2.60.0](https://github.com/triton-inference-server/server/releases/latest)
+>and corresponds to the 25.08 container release on NVIDIA GPU Cloud (NGC).

 # Triton Inference Server

 Triton Inference Server is an open source inference serving software that
 streamlines AI inferencing. Triton enables teams to deploy any AI model from
 multiple deep learning and machine learning frameworks, including TensorRT,
-TensorFlow, PyTorch, ONNX, OpenVINO, Python, RAPIDS FIL, and more. Triton
+PyTorch, ONNX, OpenVINO, Python, RAPIDS FIL, and more. Triton
 Inference Server supports inference across cloud, data center, edge and embedded
 devices on NVIDIA GPUs, x86 and ARM CPU, or AWS Inferentia. Triton Inference
 Server delivers optimized performance for many query types, including real time,
@@ -90,16 +90,16 @@ Inference Server with the

 ```bash
 # Step 1: Create the example model repository
-git clone -b r25.07 https://github.com/triton-inference-server/server.git
+git clone -b r25.08 https://github.com/triton-inference-server/server.git
 cd server/docs/examples
 ./fetch_models.sh

 # Step 2: Launch triton from the NGC Triton container
-docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:25.07-py3 tritonserver --model-repository=/models --model-control-mode explicit --load-model densenet_onnx
+docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:25.08-py3 tritonserver --model-repository=/models --model-control-mode explicit --load-model densenet_onnx

 # Step 3: Sending an Inference Request
 # In a separate console, launch the image_client example from the NGC Triton SDK container
-docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:25.07-py3-sdk /workspace/install/bin/image_client -m densenet_onnx -c 3 -s INCEPTION /workspace/images/mug.jpg
+docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:25.08-py3-sdk /workspace/install/bin/image_client -m densenet_onnx -c 3 -s INCEPTION /workspace/images/mug.jpg

 # Inference should return the following
 Image '/workspace/images/mug.jpg':
@@ -166,7 +166,6 @@ configuration](docs/user_guide/model_configuration.md) for the model.
 - Triton supports multiple execution engines, called
   [backends](https://github.com/triton-inference-server/backend#where-can-i-find-all-the-backends-that-are-available-for-triton), including
   [TensorRT](https://github.com/triton-inference-server/tensorrt_backend),
-  [TensorFlow](https://github.com/triton-inference-server/tensorflow_backend),
   [PyTorch](https://github.com/triton-inference-server/pytorch_backend),
   [ONNX](https://github.com/triton-inference-server/onnxruntime_backend),
   [OpenVINO](https://github.com/triton-inference-server/openvino_backend),
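
Between Steps 2 and 3 it can be useful to confirm the 25.08 server container is actually serving; this check uses Triton's standard HTTP readiness endpoint on the default port 8000:

```bash
# Returns HTTP 200 once the server is up and ready to accept inference requests.
curl -s -o /dev/null -w "%{http_code}\n" localhost:8000/v2/health/ready
```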

TRITON_VERSION

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-2.60.0dev
+2.61.0dev

build.py

Lines changed: 17 additions & 16 deletions
@@ -71,14 +71,14 @@
 #

 DEFAULT_TRITON_VERSION_MAP = {
-    "release_version": "2.60.0dev",
-    "triton_container_version": "25.08dev",
-    "upstream_container_version": "25.07",
-    "ort_version": "1.22.0",
+    "release_version": "2.61.0dev",
+    "triton_container_version": "25.09dev",
+    "upstream_container_version": "25.08",
+    "ort_version": "1.23.0",
     "ort_openvino_version": "2025.2.0",
     "standalone_openvino_version": "2025.2.0",
-    "dcgm_version": "4.2.3-2",
-    "vllm_version": "0.9.0.1",
+    "dcgm_version": "4.4.0-1",
+    "vllm_version": "0.9.2",
     "rhel_py_version": "3.12.3",
 }

@@ -1259,7 +1259,7 @@ def create_dockerfile_linux(
     # stage of the PyTorch backend
     if not FLAGS.enable_gpu and ("pytorch" in backends):
         df += """
-RUN patchelf --add-needed /usr/local/cuda/lib64/stubs/libcublasLt.so.12 backends/pytorch/libtorch_cuda.so
+RUN patchelf --add-needed /usr/local/cuda/lib64/stubs/libcublasLt.so.13 backends/pytorch/libtorch_cuda.so
 """
     if "tensorrtllm" in backends:
         df += """
@@ -1494,7 +1494,7 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach
 cp -r nvpl_slim_24.04/include/* /usr/local/include && \\
 rm -rf nvpl_slim_24.04.tar nvpl_slim_24.04; \\
 fi \\
-&& pip3 install --no-cache-dir --progress-bar on --index-url $VLLM_INDEX_URL -r /run/secrets/requirements \\
+&& pip3 install --no-cache-dir --extra-index-url $VLLM_INDEX_URL -r /run/secrets/requirements \\
 # Need to install in-house build of pytorch-triton to support triton_key definition used by torch 2.5.1
 && cd /tmp \\
 && wget $PYTORCH_TRITON_URL \\
@@ -1554,18 +1554,19 @@ def add_cpu_libs_to_linux_dockerfile(backends, target_machine):
     df += """
 RUN mkdir -p /usr/local/cuda/lib64/stubs
 COPY --from=min_container /usr/local/cuda/lib64/stubs/libcusparse.so /usr/local/cuda/lib64/stubs/libcusparse.so.12
-COPY --from=min_container /usr/local/cuda/lib64/stubs/libcusolver.so /usr/local/cuda/lib64/stubs/libcusolver.so.11
+COPY --from=min_container /usr/local/cuda/lib64/stubs/libcusolver.so /usr/local/cuda/lib64/stubs/libcusolver.so.12
 COPY --from=min_container /usr/local/cuda/lib64/stubs/libcurand.so /usr/local/cuda/lib64/stubs/libcurand.so.10
-COPY --from=min_container /usr/local/cuda/lib64/stubs/libcufft.so /usr/local/cuda/lib64/stubs/libcufft.so.11
-COPY --from=min_container /usr/local/cuda/lib64/stubs/libcublas.so /usr/local/cuda/lib64/stubs/libcublas.so.12
-COPY --from=min_container /usr/local/cuda/lib64/stubs/libcublasLt.so /usr/local/cuda/lib64/stubs/libcublasLt.so.12
-COPY --from=min_container /usr/local/cuda/lib64/stubs/libcublasLt.so /usr/local/cuda/lib64/stubs/libcublasLt.so.11
+COPY --from=min_container /usr/local/cuda/lib64/stubs/libcufft.so /usr/local/cuda/lib64/stubs/libcufft.so.12
+COPY --from=min_container /usr/local/cuda/lib64/stubs/libcublas.so /usr/local/cuda/lib64/stubs/libcublas.so.13
+COPY --from=min_container /usr/local/cuda/lib64/stubs/libcublasLt.so /usr/local/cuda/lib64/stubs/libcublasLt.so.13

 RUN mkdir -p /usr/local/cuda/targets/{cuda_arch}-linux/lib
-COPY --from=min_container /usr/local/cuda/lib64/libcudart.so.12 /usr/local/cuda/targets/{cuda_arch}-linux/lib/.
-COPY --from=min_container /usr/local/cuda/lib64/libcupti.so.12 /usr/local/cuda/targets/{cuda_arch}-linux/lib/.
-COPY --from=min_container /usr/local/cuda/lib64/libnvJitLink.so.12 /usr/local/cuda/targets/{cuda_arch}-linux/lib/.
+COPY --from=min_container /usr/local/cuda/lib64/libcudart.so.13 /usr/local/cuda/targets/{cuda_arch}-linux/lib/.
+COPY --from=min_container /usr/local/cuda/lib64/libcupti.so.13 /usr/local/cuda/targets/{cuda_arch}-linux/lib/.
+COPY --from=min_container /usr/local/cuda/lib64/libnvJitLink.so.13 /usr/local/cuda/targets/{cuda_arch}-linux/lib/.
 COPY --from=min_container /usr/local/cuda/lib64/libcufile.so.0 /usr/local/cuda/targets/{cuda_arch}-linux/lib/.
+COPY --from=min_container /usr/local/cuda/lib64/libnvrtc.so.13 /usr/local/cuda/targets/{cuda_arch}-linux/lib/.
+COPY --from=min_container /usr/local/cuda/lib64/libcusparseLt.so.0 /usr/local/cuda/targets/{cuda_arch}-linux/lib/.

 RUN mkdir -p /opt/hpcx/ucc/lib/ /opt/hpcx/ucx/lib/
 COPY --from=min_container /opt/hpcx/ucc/lib/libucc.so.1 /opt/hpcx/ucc/lib/libucc.so.1
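
The version map at the top only sets defaults; the later hunks gate on `FLAGS.enable_gpu` and the `backends` list, so a CPU-only PyTorch build is the path that hits the patchelf line updated to `libcublasLt.so.13`. A minimal sketch (flag spellings are assumptions inferred from the FLAGS fields, verify with `./build.py --help`):

```bash
# Hedged sketch: a CPU-only build with the PyTorch backend, i.e. the
# "not FLAGS.enable_gpu and 'pytorch' in backends" branch shown above.
python3 build.py --backend pytorch
```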

compose.py

Lines changed: 1 addition & 1 deletion
@@ -298,7 +298,7 @@ def create_argmap(images, skip_pull):
     dcgm_ver = re.search("DCGM_VERSION=([\S]{4,}) ", vars)
     dcgm_version = ""
     if dcgm_ver is None:
-        dcgm_version = "4.2.3-2"
+        dcgm_version = "4.4.0-1"
         log(
             "WARNING: DCGM version not found from image, installing the earlierst version {}".format(
                 dcgm_version
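
The regex's `NAME=value ` shape suggests `vars` holds the image's environment string, so the fallback to 4.4.0-1 only fires when the env lacks `DCGM_VERSION`. A hedged way to inspect what is being scanned (the image tag is an example):

```bash
# Dump the image's env vars and look for DCGM_VERSION; an empty grep is the
# case where compose.py now falls back to "4.4.0-1".
docker inspect --format '{{range .Config.Env}}{{println .}}{{end}}' \
    nvcr.io/nvidia/tritonserver:25.08-py3 | grep DCGM_VERSION
```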

deploy/aws/values.yaml

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@
 replicaCount: 1

 image:
-  imageName: nvcr.io/nvidia/tritonserver:25.07-py3
+  imageName: nvcr.io/nvidia/tritonserver:25.08-py3
   pullPolicy: IfNotPresent
   modelRepositoryPath: s3://triton-inference-server-repository/model_repository
   numGpus: 1
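
The same one-line image bump recurs in the fleetcommand and gcp charts below. Since `image.imageName` is an ordinary Helm value, the tag can also be pinned per install without editing values.yaml; a minimal sketch, assuming the chart is installed from its directory in this repo ("example-triton" is a placeholder release name):

```bash
# Hedged sketch: install the AWS chart while overriding the image tag from the
# CLI; cluster prerequisites (GPU nodes, S3 credentials) are assumed in place.
helm install example-triton deploy/aws \
  --set image.imageName=nvcr.io/nvidia/tritonserver:25.08-py3
```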

deploy/fleetcommand/Chart.yaml

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@

 apiVersion: v1
 # appVersion is the Triton version; update when changing release
-appVersion: "2.59.1"
+appVersion: "2.60.0"
 description: Triton Inference Server (Fleet Command)
 name: triton-inference-server
 # version is the Chart version; update when changing anything in the chart

deploy/fleetcommand/values.yaml

Lines changed: 3 additions & 3 deletions
@@ -27,7 +27,7 @@
 replicaCount: 1

 image:
-  imageName: nvcr.io/nvidia/tritonserver:25.07-py3
+  imageName: nvcr.io/nvidia/tritonserver:25.08-py3
   pullPolicy: IfNotPresent
   numGpus: 1
   serverCommand: tritonserver
@@ -47,13 +47,13 @@ image:
 #
 # To set model control mode, uncomment and configure below
 # TODO: Fix the following url, it is invalid
-# See https://github.com/triton-inference-server/server/blob/r25.07/docs/user_guide/model_management.md
+# See https://github.com/triton-inference-server/server/blob/r25.08/docs/user_guide/model_management.md
 # for more details
 #- --model-control-mode=explicit|poll|none
 #
 # Additional server args
 #
-# see https://github.com/triton-inference-server/server/blob/r25.07/README.md
+# see https://github.com/triton-inference-server/server/blob/r25.08/README.md
 # for more details

 service:

deploy/gcp/values.yaml

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@
 replicaCount: 1

 image:
-  imageName: nvcr.io/nvidia/tritonserver:25.07-py3
+  imageName: nvcr.io/nvidia/tritonserver:25.08-py3
   pullPolicy: IfNotPresent
   modelRepositoryPath: gs://triton-inference-server-repository/model_repository
   numGpus: 1
