
Commit 5704238

pskiran1 and nnshah1 authored
refactor: Update the response queue in the server to reuse response slots (#7879)
Co-authored-by: Neelay Shah <[email protected]>
1 parent 440dcde commit 5704238

File tree

10 files changed: +430 -60 lines changed


Dockerfile.QA

Lines changed: 6 additions & 1 deletion
@@ -128,6 +128,7 @@ RUN mkdir -p qa/common && \
     cp bin/multi_server qa/L0_multi_server/. && \
     cp bin/memory_test qa/L0_memory/. && \
     cp bin/pinned_memory_manager_test qa/L0_memory/. && \
+    mkdir -p qa/L0_memory/python_models/repeat_int32/1 && \
     cp bin/repo_agent_test qa/L0_triton_repo_agent/. && \
     cp lib/libtritonrepoagent_relocation.so qa/L0_triton_repo_agent/. && \
     mkdir qa/L0_query/models/query/1 && \
@@ -264,7 +265,11 @@ RUN cp -r qa/L0_decoupled/models qa/L0_decoupled/python_models/ && \
     cp /workspace/tritonbuild/python/examples/decoupled/square_model.py \
        qa/L0_decoupled/python_models/square_int32/1/. && \
     cp /workspace/tritonbuild/python/examples/decoupled/square_config.pbtxt \
-       qa/L0_decoupled/python_models/square_int32/.
+       qa/L0_decoupled/python_models/square_int32/. && \
+    cp /workspace/tritonbuild/python/examples/decoupled/repeat_model.py \
+       qa/L0_memory/python_models/repeat_int32/1/model.py && \
+    cp /workspace/tritonbuild/python/examples/decoupled/repeat_config.pbtxt \
+       qa/L0_memory/python_models/repeat_int32/config.pbtxt

 RUN mkdir -p qa/L0_decoupled_grpc_error && \
     cp -r qa/L0_decoupled/. qa/L0_decoupled_grpc_error

docs/customization_guide/deploy.md

Lines changed: 16 additions & 1 deletion
@@ -1,5 +1,5 @@
 <!--
-# Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2020-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -287,6 +287,21 @@ no untrusted files of same name exist in a location of higher search priority
 (e.g., System32). It is still recommended to add backend-specific dependencies
 to their corresponding backend folder when possible.

+# GRPC server options
+Triton Inference Server's gRPC inference handlers internally use states to manage inference requests and response queues. Each state consists of one inference request and one response queue. The response queue within a state can hold multiple response objects. These states remain allocated for reuse to optimize performance by minimizing dynamic allocations.
+
+You can configure the following parameters to balance memory usage and server performance:
+- The maximum number of states that remain allocated.
+- The maximum number of response objects that can stay allocated in the response queue.
+
+##### `--grpc-infer-allocation-pool-size=<integer>`
+Specifies the maximum number of states (inference request/response queues) that remain allocated for reuse. If the number of in-flight requests does not exceed this value, no allocation or deallocation of request/response queues will occur. By default, this value is set to `8`.
+
+##### `--grpc-max-response-pool-size=<integer>`
+Specifies the maximum number of inference response objects that can remain allocated in each response queue at any given time. This option is particularly useful in decoupled mode, where multiple responses are generated for a single request. By default, this value is set to `INT_MAX`.
+
+> [!Warning]
+> Setting this value too low may negatively impact performance.
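
As a usage sketch of the two options documented above (the model repository path and the pool sizes are illustrative assumptions, not values taken from this commit), a launch that bounds both pools might look like:

    # Keep at most 16 request/response-queue states allocated for reuse, and
    # cap each response queue at 32 reusable response objects.
    tritonserver --model-repository=/path/to/model_repository \
                 --grpc-infer-allocation-pool-size=16 \
                 --grpc-max-response-pool-size=32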

qa/L0_decoupled/test.sh

Lines changed: 40 additions & 1 deletion
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2020-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -127,6 +127,45 @@ for trial in $TRIALS; do

   kill $SERVER_PID
   wait $SERVER_PID
+
+  SERVER_ARGS="--model-repository=$MODELDIR --grpc-max-response-pool-size=1"
+  SERVER_LOG="grpc_max_response_pool_size_1_${trial}_server.log"
+  CLIENT_LOG="grpc_max_response_pool_size_1_${trial}_client.log"
+  run_server
+  if [ "$SERVER_PID" == "0" ]; then
+    echo -e "\n***\n*** Failed to start $SERVER\n***"
+    cat $SERVER_LOG
+    exit 1
+  fi
+
+  for test in \
+    test_one_to_none \
+    test_one_to_one \
+    test_one_to_many \
+    test_no_streaming \
+    test_response_order \
+    test_wrong_shape; do
+
+    echo "Test: $test" >>$CLIENT_LOG
+    set +e
+    python $DECOUPLED_TEST DecoupledTest.$test >>$CLIENT_LOG 2>&1
+    if [ $? -ne 0 ]; then
+      echo -e "\n***\n*** Test grpc-max-response-pool-size=1 ${trial} - $test Failed\n***" >>$CLIENT_LOG
+      echo -e "\n***\n*** Test grpc-max-response-pool-size=1 ${trial} - $test Failed\n***"
+      RET=1
+    else
+      check_test_results $TEST_RESULT_FILE 1
+      if [ $? -ne 0 ]; then
+        cat $CLIENT_LOG
+        echo -e "\n***\n*** Test Result Verification Failed\n***"
+        RET=1
+      fi
+    fi
+    set -e
+  done
+
+  kill $SERVER_PID
+  wait $SERVER_PID
 done

 # Test the server frontend can merge the responses of non-decoupled model that
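
For reference, a minimal manual reproduction of the constrained-pool scenario this hunk adds (the model repository path and the test file name are placeholders; the real script drives this through run_server and $DECOUPLED_TEST):

    # Start the server with each response queue capped at a single reusable slot.
    tritonserver --model-repository=/path/to/models --grpc-max-response-pool-size=1 &
    SERVER_PID=$!
    # Run one of the decoupled streaming tests against it.
    python decoupled_test.py DecoupledTest.test_one_to_many
    kill $SERVER_PID
    wait $SERVER_PID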

qa/L0_memory/client.py

Lines changed: 122 additions & 0 deletions
@@ -0,0 +1,122 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


import os
import queue
import unittest
from functools import partial

import numpy as np
import tritonclient.grpc as grpcclient
from tritonclient.utils import InferenceServerException

OUTPUT_NUM_ELEMENTS = int(os.getenv("OUTPUT_NUM_ELEMENTS", 1))


class UserData:
    def __init__(self):
        self._completed_requests = queue.Queue()


def callback(user_data, result, error):
    if error:
        user_data._completed_requests.put(error, timeout=100)
    else:
        user_data._completed_requests.put(result, timeout=100)


class TestTritonInference(unittest.TestCase):
    def setUp(self):
        self.triton_client = grpcclient.InferenceServerClient(url="localhost:8001")

    def tearDown(self):
        self.triton_client.stop_stream()

    def test_inference(self):
        model_name = "repeat_int32"
        num_responses = 256
        in_data = np.random.randint(0, 1000, num_responses, dtype=np.int32)
        delay_data = np.zeros(num_responses, dtype=np.uint32)
        wait_data = np.zeros(1, dtype=np.uint32)
        user_data = UserData()

        inputs = [
            grpcclient.InferInput("IN", [num_responses], "INT32"),
            grpcclient.InferInput("DELAY", [num_responses], "UINT32"),
            grpcclient.InferInput("WAIT", [1], "UINT32"),
        ]
        outputs = [
            grpcclient.InferRequestedOutput("OUT"),
            grpcclient.InferRequestedOutput("IDX"),
        ]

        inputs[0].set_data_from_numpy(in_data)
        inputs[1].set_data_from_numpy(delay_data)
        inputs[2].set_data_from_numpy(wait_data)

        self.triton_client.start_stream(callback=partial(callback, user_data))
        self.triton_client.async_stream_infer(
            model_name=model_name,
            inputs=inputs,
            outputs=outputs,
        )

        recv_count = 0
        while recv_count < num_responses:
            data_item = user_data._completed_requests.get()

            if isinstance(data_item, InferenceServerException):
                self.fail(f"InferenceServerException: {data_item}")
            try:
                response_idx = data_item.as_numpy("IDX")[0]
                response_data = data_item.as_numpy("OUT")
                expected_data = in_data[response_idx]

                self.assertEqual(
                    response_data[0],
                    expected_data,
                    f"Validation failed at index {response_idx} - response_data[0]: {response_data[0]}, expected_data: {expected_data}",
                )
                self.assertEqual(
                    response_data.size,
                    OUTPUT_NUM_ELEMENTS,
                    f"Validation failed - response_data.size: {response_data.size}, OUTPUT_NUM_ELEMENTS: {OUTPUT_NUM_ELEMENTS}",
                )

            except Exception as e:
                self.fail(f"Error processing response: {str(e)}")
            recv_count += 1

        self.assertEqual(
            user_data._completed_requests.qsize(),
            0,
            "Did not receive the expected number of responses.",
        )


if __name__ == "__main__":
    unittest.main()
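
Assuming a server is already listening on localhost:8001 with the repeat_int32 model loaded, this client can be invoked the way the accompanying test script does, with OUTPUT_NUM_ELEMENTS matching the model's configured output size:

    # 49807360 is the value qa/L0_memory/test.sh exports before running the client.
    OUTPUT_NUM_ELEMENTS=49807360 python3 client.py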

qa/L0_memory/test.sh

Lines changed: 110 additions & 4 deletions
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2020-2025, NVIDIA CORPORATION. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -25,6 +25,8 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+source ../common/util.sh
+
 TEST_LOG="./memory_test.log"
 MEMORY_TEST=./memory_test
 PINNED_MEMORY_MANAGER_TEST=./pinned_memory_manager_test
@@ -39,6 +41,7 @@ rm -f TEST_LOG
 set +e
 $MEMORY_TEST >>$TEST_LOG 2>&1
 if [ $? -ne 0 ]; then
+    cat $TEST_LOG
     echo -e "\n***\n*** Memory Test Failed\n***"
     RET=1
 fi
@@ -47,16 +50,119 @@ set -e
 set +e
 $PINNED_MEMORY_MANAGER_TEST >>$TEST_LOG 2>&1
 if [ $? -ne 0 ]; then
+    cat $TEST_LOG
     echo -e "\n***\n*** Pinned Memory Manager Test Failed\n***"
     RET=1
 fi
 set -e

+
+###### Test --grpc-max-response-pool-size server option #######
+
+monitor_memory() {
+    local SERVER_PID=$1
+    local MAX_MEM_FILE=$(mktemp)
+    echo "0" > "$MAX_MEM_FILE"
+    (
+        local MAX_MEM=0
+        while ps -p "$SERVER_PID" >/dev/null 2>&1; do
+            CURRENT_MEM=$(awk '/Rss:/ {print $2}' /proc/$SERVER_PID/smaps_rollup)
+            CURRENT_MEM=${CURRENT_MEM:-0}
+            if [ "$CURRENT_MEM" -gt "$MAX_MEM" ]; then
+                MAX_MEM=$CURRENT_MEM
+                echo "$MAX_MEM" > "$MAX_MEM_FILE"
+            fi
+            sleep 0.1
+        done
+        echo "$MAX_MEM" > "$MAX_MEM_FILE"
+        exit 0
+    ) &
+
+    MONITOR_PID=$!
+    echo "$MONITOR_PID $MAX_MEM_FILE"
+}
+
+stop_server_and_monitoring_memory() {
+    local MONITOR_PID=$1
+    local SERVER_PID=$2
+    kill "$MONITOR_PID" 2>/dev/null && wait "$MONITOR_PID" 2>/dev/null || true
+    kill "$SERVER_PID" 2>/dev/null && wait "$SERVER_PID" 2>/dev/null || true
+}
+
+MODELDIR="./python_models"
+export OUTPUT_NUM_ELEMENTS=49807360
+sed -i '$a\parameters: [{ key: "output_num_elements" value: { string_value: "'"$OUTPUT_NUM_ELEMENTS"'" }}]' $MODELDIR/repeat_int32/config.pbtxt
+
+SERVER=/opt/tritonserver/bin/tritonserver
+SERVER_BASE_ARGS="--model-repository=${MODELDIR} --log-verbose=2 --allow-metrics=0"
+
+declare -A MEMORY_USAGE=()
+
+for POOL_SIZE in 1 25 50 default; do
+    if [[ "$POOL_SIZE" = "default" ]]; then
+        SERVER_ARGS="${SERVER_BASE_ARGS}"
+    else
+        SERVER_ARGS="${SERVER_BASE_ARGS} --grpc-max-response-pool-size=${POOL_SIZE}"
+    fi
+
+    CLIENT_LOG="./client_pool_size_${POOL_SIZE}.log"
+    SERVER_LOG="./server_pool_size_${POOL_SIZE}.log"
+
+    run_server
+    if [ "$SERVER_PID" == "0" ]; then
+        echo -e "\n***\n*** Failed to start $SERVER\n***"
+        cat $SERVER_LOG
+        stop_server_and_monitoring_memory $MONITOR_PID $SERVER_PID
+        exit 1
+    fi
+    sleep 2

+    # Capture initial memory usage
+    INIT_MEM=$(awk '/Rss:/ {print $2}' /proc/$SERVER_PID/smaps_rollup)
+    read -r MONITOR_PID MAX_MEM_FILE < <(monitor_memory "$SERVER_PID")
+
+    # Run client script
+    set +e
+    python3 client.py >> $CLIENT_LOG 2>&1
+    if [ $? -ne 0 ]; then
+        echo -e "\n***\n*** Running client for grpc-max-response-pool-size=${POOL_SIZE} FAILED\n***" >> $CLIENT_LOG 2>&1
+        echo -e "\n***\n*** Running client for grpc-max-response-pool-size=${POOL_SIZE} FAILED\n***"
+        stop_server_and_monitoring_memory $MONITOR_PID $SERVER_PID
+        exit 1
+    fi
+    set -e
+    sleep 2
+
+    stop_server_and_monitoring_memory $MONITOR_PID $SERVER_PID
+
+    if [[ -s "$MAX_MEM_FILE" ]]; then
+        MAX_MEM=$(tail -n 1 "$MAX_MEM_FILE" 2>/dev/null || echo 0)
+        MEMORY_USAGE["$POOL_SIZE"]=$((MAX_MEM - INIT_MEM))
+        echo "Pool size: $POOL_SIZE | Initial Memory: ${INIT_MEM} KB | Peak Memory: ${MEMORY_USAGE[$POOL_SIZE]} KB" >> "memory.log"
+        rm -f "$MAX_MEM_FILE"
+    else
+        echo "FAILED to collect memory usage for grpc-max-response-pool-size=${POOL_SIZE}"
+        exit 1
+    fi
+done
+
+prev_mem=0
+prev_size=""
+for size in default 50 25 1; do
+    current_mem=${MEMORY_USAGE[$size]}
+    if [[ -n "$prev_size" && "$prev_mem" -ne 0 && "$current_mem" -ge "$prev_mem" ]]; then
+        echo -e "\n***\n*** FAILED - Memory $current_mem KB with pool=$size >= $prev_mem KB (with pool=$prev_size)\n***"
+        RET=1
+    fi
+    prev_mem=$current_mem
+    prev_size=$size
+done
+
+
 if [ $RET -eq 0 ]; then
-    echo -e "\n***\n*** Test Passed\n***"
+    echo -e "\n***\n*** Test Passed\n***"
 else
-    cat $TEST_LOG
-    echo -e "\n***\n*** Test FAILED\n***"
+    echo -e "\n***\n*** Test FAILED\n***"
 fi

 exit $RET
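
The monitor_memory helper above samples the server's resident set size from the kernel's smaps_rollup. As a standalone sketch of the same probe (obtaining the PID via pidof, rather than from run_server, is an assumption):

    # Print the resident set size (Rss, in KB) of a running tritonserver process.
    awk '/Rss:/ {print $2}' /proc/$(pidof tritonserver)/smaps_rollup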
