
Commit 1b8c818

Authored by dmitry-tokarev-nv, krishung5, mc-nv, and richardhuo-nv
Prepare r25.05 for merging to main (#8237)
Co-authored-by: Kris Hung <[email protected]>
Co-authored-by: Misha Chornyi <[email protected]>
Co-authored-by: richardhuo-nv <[email protected]>
1 parent b343cc0 commit 1b8c818

File tree: 25 files changed, +47 / -43 lines

Dockerfile.sdk

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@
 #

 # Base image on the minimum Triton container
-ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:25.04-py3-min
+ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:25.05-py3-min

 ARG TRITON_CLIENT_REPO_SUBDIR=clientrepo
 ARG TRITON_PA_REPO_SUBDIR=perfanalyzerrepo
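
To verify the bump locally, the SDK image can be rebuilt against the new base. A minimal sketch, assuming it is run from the repository root with NGC pull access; `tritonserver_sdk` is an arbitrary local tag:

```bash
# Rebuild the client SDK image on top of the 25.05 minimum container;
# BASE_IMAGE is the build argument changed in this diff.
docker build -f Dockerfile.sdk \
  --build-arg BASE_IMAGE=nvcr.io/nvidia/tritonserver:25.05-py3-min \
  -t tritonserver_sdk .
```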

README.md

Lines changed: 5 additions & 6 deletions
@@ -29,8 +29,8 @@

 >[!WARNING]
 >You are currently on the `main` branch which tracks under-development progress
->towards the next release. The current release is version [2.57.0](https://github.com/triton-inference-server/server/releases/latest)
->and corresponds to the 25.04 container release on NVIDIA GPU Cloud (NGC).
+>towards the next release. The current release is version [2.58.0](https://github.com/triton-inference-server/server/releases/latest)
+>and corresponds to the 25.05 container release on NVIDIA GPU Cloud (NGC).

 # Triton Inference Server

@@ -90,16 +90,16 @@ Inference Server with the

 ```bash
 # Step 1: Create the example model repository
-git clone -b r25.02 https://github.com/triton-inference-server/server.git
+git clone -b r25.05 https://github.com/triton-inference-server/server.git
 cd server/docs/examples
 ./fetch_models.sh

 # Step 2: Launch triton from the NGC Triton container
-docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:25.02-py3 tritonserver --model-repository=/models --model-control-mode explicit --load-model densenet_onnx
+docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:25.05-py3 tritonserver --model-repository=/models --model-control-mode explicit --load-model densenet_onnx

 # Step 3: Sending an Inference Request
 # In a separate console, launch the image_client example from the NGC Triton SDK container
-docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:25.02-py3-sdk /workspace/install/bin/image_client -m densenet_onnx -c 3 -s INCEPTION /workspace/images/mug.jpg
+docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:25.05-py3-sdk /workspace/install/bin/image_client -m densenet_onnx -c 3 -s INCEPTION /workspace/images/mug.jpg

 # Inference should return the following
 Image '/workspace/images/mug.jpg':

@@ -260,4 +260,3 @@ For questions, we recommend posting in our community

 Please refer to the [NVIDIA Developer Triton page](https://developer.nvidia.com/nvidia-triton-inference-server)
 for more information.
-
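
The updated quickstart can be smoke-tested beyond image_client by probing the server's HTTP endpoints. A minimal sketch, assuming the Step 2 container is running and exposing the default HTTP port 8000:

```bash
# Readiness returns HTTP 200 once the server and models are up.
curl -f localhost:8000/v2/health/ready
# Server metadata should report version 2.58.0 for the 25.05 container.
curl -s localhost:8000/v2
```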

TRITON_VERSION

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-2.58.0dev
+2.59.0dev

build.py

Lines changed: 1 addition & 1 deletion
@@ -73,7 +73,7 @@
 DEFAULT_TRITON_VERSION_MAP = {
     "release_version": "2.58.0dev",
     "triton_container_version": "25.05dev",
-    "upstream_container_version": "25.04",
+    "upstream_container_version": "25.05",
     "ort_version": "1.22.0",
     "ort_openvino_version": "2025.1.0",
     "standalone_openvino_version": "2025.1.0",

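The map above only supplies defaults; build.py also accepts command-line overrides, so a one-off build need not edit the file. A hedged sketch; the flag names `--container-version` and `--upstream-container-version` are recalled from build.py's argument parser and should be checked against the checkout:

```bash
# Hypothetical one-off build pinning both container versions explicitly
# instead of relying on DEFAULT_TRITON_VERSION_MAP (other flags omitted).
python build.py --container-version=25.05 --upstream-container-version=25.05
```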
deploy/aws/values.yaml

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@
 replicaCount: 1

 image:
-  imageName: nvcr.io/nvidia/tritonserver:25.04-py3
+  imageName: nvcr.io/nvidia/tritonserver:25.05-py3
   pullPolicy: IfNotPresent
   modelRepositoryPath: s3://triton-inference-server-repository/model_repository
   numGpus: 1
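
The same image bump can be applied to a live deployment without editing values.yaml. A sketch assuming Helm 3, run from the deploy/aws chart directory; `triton` is an arbitrary release name:

```bash
# Install the chart with the 25.05 image; the --set path mirrors the
# image.imageName key shown in the diff above.
helm install triton . \
  --set image.imageName=nvcr.io/nvidia/tritonserver:25.05-py3
```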

deploy/fleetcommand/Chart.yaml

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@

 apiVersion: v1
 # appVersion is the Triton version; update when changing release
-appVersion: "2.57.0"
+appVersion: "2.58.0"
 description: Triton Inference Server (Fleet Command)
 name: triton-inference-server
 # version is the Chart version; update when changing anything in the chart
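
Note the two distinct fields: appVersion tracks the Triton release being deployed, while the chart's own version field (not in this hunk) tracks the chart itself. A quick check after the bump, assuming Helm 3 and the deploy/fleetcommand directory:

```bash
# Print the chart metadata and pick out both version fields.
helm show chart . | grep -iE '^(version|appVersion):'
```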

deploy/fleetcommand/values.yaml

Lines changed: 3 additions & 3 deletions
@@ -27,7 +27,7 @@
 replicaCount: 1

 image:
-  imageName: nvcr.io/nvidia/tritonserver:25.04-py3
+  imageName: nvcr.io/nvidia/tritonserver:25.05-py3
   pullPolicy: IfNotPresent
   numGpus: 1
   serverCommand: tritonserver
@@ -47,13 +47,13 @@ image:
   #
   # To set model control mode, uncomment and configure below
   # TODO: Fix the following url, it is invalid
-  # See https://github.com/triton-inference-server/server/blob/r25.04/docs/user_guide/model_management.md
+  # See https://github.com/triton-inference-server/server/blob/r25.05/docs/user_guide/model_management.md
   # for more details
   #- --model-control-mode=explicit|poll|none
   #
   # Additional server args
   #
-  # see https://github.com/triton-inference-server/server/blob/r25.04/README.md
+  # see https://github.com/triton-inference-server/server/blob/r25.05/README.md
   # for more details

 service:
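
An existing release can be rolled onto the new image while keeping every other value intact. A hedged sketch, assuming the chart is managed with Helm 3 and `triton` is the release name:

```bash
# Reuse all current values and override only the image reference.
helm upgrade triton . --reuse-values \
  --set image.imageName=nvcr.io/nvidia/tritonserver:25.05-py3
```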

deploy/gcp/values.yaml

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@
 replicaCount: 1

 image:
-  imageName: nvcr.io/nvidia/tritonserver:25.04-py3
+  imageName: nvcr.io/nvidia/tritonserver:25.05-py3
   pullPolicy: IfNotPresent
   modelRepositoryPath: gs://triton-inference-server-repository/model_repository
   numGpus: 1

deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@ metadata:
   namespace: default
 spec:
   containers:
-  - image: nvcr.io/nvidia/tritonserver:25.04-py3-sdk
+  - image: nvcr.io/nvidia/tritonserver:25.05-py3-sdk
     imagePullPolicy: Always
     name: nv-triton-client
     securityContext:
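
Picking up the new SDK image is then a matter of re-applying the manifest. A sketch assuming kubectl points at the benchmark cluster; the pod name comes from the manifest's metadata.name, which this hunk does not show:

```bash
# Recreate the client pod on the 25.05 SDK image and tail its output.
kubectl apply -f deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml
kubectl logs -f <pod-name> -c nv-triton-client
```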

deploy/gke-marketplace-app/server-deployer/build_and_push.sh

Lines changed: 3 additions & 3 deletions
@@ -27,9 +27,9 @@

 export REGISTRY=gcr.io/$(gcloud config get-value project | tr ':' '/')
 export APP_NAME=tritonserver
-export MAJOR_VERSION=2.57
-export MINOR_VERSION=2.57.0
-export NGC_VERSION=25.04-py3
+export MAJOR_VERSION=2.58
+export MINOR_VERSION=2.58.0
+export NGC_VERSION=25.05-py3

 docker pull nvcr.io/nvidia/$APP_NAME:$NGC_VERSION
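
The remainder of the script is outside this hunk; presumably the exported variables drive a retag-and-push of the pulled NGC image into the project registry. A hypothetical continuation, for illustration only:

```bash
# Illustrative only: retag the pulled image under the release tag,
# then push to the gcr.io registry exported above.
docker tag nvcr.io/nvidia/$APP_NAME:$NGC_VERSION $REGISTRY/$APP_NAME:$MINOR_VERSION
docker push $REGISTRY/$APP_NAME:$MINOR_VERSION
```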
