
Commit 5704238

pskiran1 and nnshah1 authored
refactor: Update the response queue in the server to reuse response slots (#7879)
Co-authored-by: Neelay Shah <[email protected]>
1 parent 440dcde commit 5704238

File tree

10 files changed: +430 -60 lines changed


Dockerfile.QA

Lines changed: 6 additions & 1 deletion
@@ -128,6 +128,7 @@ RUN mkdir -p qa/common && \
     cp bin/multi_server qa/L0_multi_server/. && \
     cp bin/memory_test qa/L0_memory/. && \
     cp bin/pinned_memory_manager_test qa/L0_memory/. && \
+    mkdir -p qa/L0_memory/python_models/repeat_int32/1 && \
     cp bin/repo_agent_test qa/L0_triton_repo_agent/. && \
     cp lib/libtritonrepoagent_relocation.so qa/L0_triton_repo_agent/. && \
     mkdir qa/L0_query/models/query/1 && \
@@ -264,7 +265,11 @@ RUN cp -r qa/L0_decoupled/models qa/L0_decoupled/python_models/ && \
     cp /workspace/tritonbuild/python/examples/decoupled/square_model.py \
        qa/L0_decoupled/python_models/square_int32/1/. && \
     cp /workspace/tritonbuild/python/examples/decoupled/square_config.pbtxt \
-       qa/L0_decoupled/python_models/square_int32/.
+       qa/L0_decoupled/python_models/square_int32/. && \
+    cp /workspace/tritonbuild/python/examples/decoupled/repeat_model.py \
+       qa/L0_memory/python_models/repeat_int32/1/model.py && \
+    cp /workspace/tritonbuild/python/examples/decoupled/repeat_config.pbtxt \
+       qa/L0_memory/python_models/repeat_int32/config.pbtxt

 RUN mkdir -p qa/L0_decoupled_grpc_error && \
     cp -r qa/L0_decoupled/. qa/L0_decoupled_grpc_error

docs/customization_guide/deploy.md

Lines changed: 16 additions & 1 deletion
@@ -1,5 +1,5 @@
 <!--
-# Copyright (c) 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2020-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -287,6 +287,21 @@ no untrusted files of same name exist in a location of higher search priority
 (e.g., System32). It is still recommended to add backend-specific dependencies
 to their corresponding backend folder when possible.

+# GRPC server options
+Triton Inference Server's gRPC inference handlers internally use states to manage inference requests and response queues. Each state consists of one inference request and one response queue. The response queue within a state can hold multiple response objects. These states remain allocated for reuse to optimize performance by minimizing dynamic allocations.
+
+You can configure the following parameters to balance memory usage and server performance:
+- The maximum number of states that remain allocated.
+- The maximum number of response objects that can stay allocated in the response queue.
+
+##### `--grpc-infer-allocation-pool-size=<integer>`
+Specifies the maximum number of states (inference request/response queues) that remain allocated for reuse. If the number of in-flight requests does not exceed this value, no allocation or deallocation of request/response queues will occur. By default, this value is set to `8`.
+
+##### `--grpc-max-response-pool-size=<integer>`
+Specifies the maximum number of inference response objects that can remain allocated in each response queue at any given time. This option is particularly useful in decoupled mode, where multiple responses are generated for a single request. By default, this value is set to `INT_MAX`.
+
+> [!Warning]
+> Setting this value too low may negatively impact performance.
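
As a usage sketch of the two options documented above (the model repository path and the pool sizes are illustrative assumptions, not values taken from this commit), a launch that bounds both pools might look like:

    # Keep at most 16 request/response-queue states allocated for reuse, and
    # cap each response queue at 32 reusable response objects.
    tritonserver --model-repository=/path/to/model_repository \
                 --grpc-infer-allocation-pool-size=16 \
                 --grpc-max-response-pool-size=32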

qa/L0_decoupled/test.sh

Lines changed: 40 additions & 1 deletion
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2020-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -127,6 +127,45 @@ for trial in $TRIALS; do

   kill $SERVER_PID
   wait $SERVER_PID
+
+  SERVER_ARGS="--model-repository=$MODELDIR --grpc-max-response-pool-size=1"
+  SERVER_LOG="grpc_max_response_pool_size_1_${trial}_server.log"
+  CLIENT_LOG="grpc_max_response_pool_size_1_${trial}_client.log"
+  run_server
+  if [ "$SERVER_PID" == "0" ]; then
+    echo -e "\n***\n*** Failed to start $SERVER\n***"
+    cat $SERVER_LOG
+    exit 1
+  fi
+
+  for test in \
+    test_one_to_none \
+    test_one_to_one \
+    test_one_to_many \
+    test_no_streaming \
+    test_response_order \
+    test_wrong_shape; do
+
+    echo "Test: $test" >>$CLIENT_LOG
+    set +e
+    python $DECOUPLED_TEST DecoupledTest.$test >>$CLIENT_LOG 2>&1
+    if [ $? -ne 0 ]; then
+      echo -e "\n***\n*** Test grpc-max-response-pool-size=1 ${trial} - $test Failed\n***" >>$CLIENT_LOG
+      echo -e "\n***\n*** Test grpc-max-response-pool-size=1 ${trial} - $test Failed\n***"
+      RET=1
+    else
+      check_test_results $TEST_RESULT_FILE 1
+      if [ $? -ne 0 ]; then
+        cat $CLIENT_LOG
+        echo -e "\n***\n*** Test Result Verification Failed\n***"
+        RET=1
+      fi
+    fi
+    set -e
+  done
+
+  kill $SERVER_PID
+  wait $SERVER_PID
 done

 # Test the server frontend can merge the responses of non-decoupled model that
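
For reference, a minimal manual reproduction of the constrained-pool scenario this hunk adds (the model repository path and the test file name are placeholders; the real script drives this through run_server and $DECOUPLED_TEST):

    # Start the server with each response queue capped at a single reusable slot.
    tritonserver --model-repository=/path/to/models --grpc-max-response-pool-size=1 &
    SERVER_PID=$!
    # Run one of the decoupled streaming tests against it.
    python decoupled_test.py DecoupledTest.test_one_to_many
    kill $SERVER_PID
    wait $SERVER_PID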

qa/L0_memory/client.py

Lines changed: 122 additions & 0 deletions
@@ -0,0 +1,122 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


import os
import queue
import unittest
from functools import partial

import numpy as np
import tritonclient.grpc as grpcclient
from tritonclient.utils import InferenceServerException

OUTPUT_NUM_ELEMENTS = int(os.getenv("OUTPUT_NUM_ELEMENTS", 1))


class UserData:
    def __init__(self):
        self._completed_requests = queue.Queue()


def callback(user_data, result, error):
    if error:
        user_data._completed_requests.put(error, timeout=100)
    else:
        user_data._completed_requests.put(result, timeout=100)


class TestTritonInference(unittest.TestCase):
    def setUp(self):
        self.triton_client = grpcclient.InferenceServerClient(url="localhost:8001")

    def tearDown(self):
        self.triton_client.stop_stream()

    def test_inference(self):
        model_name = "repeat_int32"
        num_responses = 256
        in_data = np.random.randint(0, 1000, num_responses, dtype=np.int32)
        delay_data = np.zeros(num_responses, dtype=np.uint32)
        wait_data = np.zeros(1, dtype=np.uint32)
        user_data = UserData()

        inputs = [
            grpcclient.InferInput("IN", [num_responses], "INT32"),
            grpcclient.InferInput("DELAY", [num_responses], "UINT32"),
            grpcclient.InferInput("WAIT", [1], "UINT32"),
        ]
        outputs = [
            grpcclient.InferRequestedOutput("OUT"),
            grpcclient.InferRequestedOutput("IDX"),
        ]

        inputs[0].set_data_from_numpy(in_data)
        inputs[1].set_data_from_numpy(delay_data)
        inputs[2].set_data_from_numpy(wait_data)

        self.triton_client.start_stream(callback=partial(callback, user_data))
        self.triton_client.async_stream_infer(
            model_name=model_name,
            inputs=inputs,
            outputs=outputs,
        )

        recv_count = 0
        while recv_count < num_responses:
            data_item = user_data._completed_requests.get()

            if isinstance(data_item, InferenceServerException):
                self.fail(f"InferenceServerException: {data_item}")
            try:
                response_idx = data_item.as_numpy("IDX")[0]
                response_data = data_item.as_numpy("OUT")
                expected_data = in_data[response_idx]

                self.assertEqual(
                    response_data[0],
                    expected_data,
                    f"Validation failed at index {response_idx} - response_data[0]: {response_data[0]}, expected_data: {expected_data}",
                )
                self.assertEqual(
                    response_data.size,
                    OUTPUT_NUM_ELEMENTS,
                    f"Validation failed - response_data.size: {response_data.size}, OUTPUT_NUM_ELEMENTS: {OUTPUT_NUM_ELEMENTS}",
                )

            except Exception as e:
                self.fail(f"Error processing response: {str(e)}")
            recv_count += 1

        self.assertEqual(
            user_data._completed_requests.qsize(),
            0,
            "Did not receive the expected number of responses.",
        )


if __name__ == "__main__":
    unittest.main()
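
Assuming a server is already listening on localhost:8001 with the repeat_int32 model loaded, this client can be invoked the way the accompanying test script does, with OUTPUT_NUM_ELEMENTS matching the model's configured output size:

    # 49807360 is the value qa/L0_memory/test.sh exports before running the client.
    OUTPUT_NUM_ELEMENTS=49807360 python3 client.py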

qa/L0_memory/test.sh

Lines changed: 110 additions & 4 deletions
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2020-2025, NVIDIA CORPORATION. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -25,6 +25,8 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+source ../common/util.sh
+
 TEST_LOG="./memory_test.log"
 MEMORY_TEST=./memory_test
 PINNED_MEMORY_MANAGER_TEST=./pinned_memory_manager_test
@@ -39,6 +41,7 @@ rm -f TEST_LOG
 set +e
 $MEMORY_TEST >>$TEST_LOG 2>&1
 if [ $? -ne 0 ]; then
+    cat $TEST_LOG
     echo -e "\n***\n*** Memory Test Failed\n***"
     RET=1
 fi
@@ -47,16 +50,119 @@ set -e
 set +e
 $PINNED_MEMORY_MANAGER_TEST >>$TEST_LOG 2>&1
 if [ $? -ne 0 ]; then
+    cat $TEST_LOG
     echo -e "\n***\n*** Pinned Memory Manager Test Failed\n***"
     RET=1
 fi
 set -e

+
+###### Test --grpc-max-response-pool-size server option #######
+
+monitor_memory() {
+    local SERVER_PID=$1
+    local MAX_MEM_FILE=$(mktemp)
+    echo "0" > "$MAX_MEM_FILE"
+    (
+        local MAX_MEM=0
+        while ps -p "$SERVER_PID" >/dev/null 2>&1; do
+            CURRENT_MEM=$(awk '/Rss:/ {print $2}' /proc/$SERVER_PID/smaps_rollup)
+            CURRENT_MEM=${CURRENT_MEM:-0}
+            if [ "$CURRENT_MEM" -gt "$MAX_MEM" ]; then
+                MAX_MEM=$CURRENT_MEM
+                echo "$MAX_MEM" > "$MAX_MEM_FILE"
+            fi
+            sleep 0.1
+        done
+        echo "$MAX_MEM" > "$MAX_MEM_FILE"
+        exit 0
+    ) &
+
+    MONITOR_PID=$!
+    echo "$MONITOR_PID $MAX_MEM_FILE"
+}
+
+stop_server_and_monitoring_memory() {
+    local MONITOR_PID=$1
+    local SERVER_PID=$2
+    kill "$MONITOR_PID" 2>/dev/null && wait "$MONITOR_PID" 2>/dev/null || true
+    kill "$SERVER_PID" 2>/dev/null && wait "$SERVER_PID" 2>/dev/null || true
+}
+
+MODELDIR="./python_models"
+export OUTPUT_NUM_ELEMENTS=49807360
+sed -i '$a\parameters: [{ key: "output_num_elements" value: { string_value: "'"$OUTPUT_NUM_ELEMENTS"'" }}]' $MODELDIR/repeat_int32/config.pbtxt
+
+SERVER=/opt/tritonserver/bin/tritonserver
+SERVER_BASE_ARGS="--model-repository=${MODELDIR} --log-verbose=2 --allow-metrics=0"
+
+declare -A MEMORY_USAGE=()
+
+for POOL_SIZE in 1 25 50 default; do
+    if [[ "$POOL_SIZE" = "default" ]]; then
+        SERVER_ARGS="${SERVER_BASE_ARGS}"
+    else
+        SERVER_ARGS="${SERVER_BASE_ARGS} --grpc-max-response-pool-size=${POOL_SIZE}"
+    fi
+
+    CLIENT_LOG="./client_pool_size_${POOL_SIZE}.log"
+    SERVER_LOG="./server_pool_size_${POOL_SIZE}.log"
+
+    run_server
+    if [ "$SERVER_PID" == "0" ]; then
+        echo -e "\n***\n*** Failed to start $SERVER\n***"
+        cat $SERVER_LOG
+        stop_server_and_monitoring_memory $MONITOR_PID $SERVER_PID
+        exit 1
+    fi
+    sleep 2

+    # Capture initial memory usage
+    INIT_MEM=$(awk '/Rss:/ {print $2}' /proc/$SERVER_PID/smaps_rollup)
+    read -r MONITOR_PID MAX_MEM_FILE < <(monitor_memory "$SERVER_PID")
+
+    # Run client script
+    set +e
+    python3 client.py >> $CLIENT_LOG 2>&1
+    if [ $? -ne 0 ]; then
+        echo -e "\n***\n*** Running client for grpc-max-response-pool-size=${POOL_SIZE} FAILED\n***" >> $CLIENT_LOG 2>&1
+        echo -e "\n***\n*** Running client for grpc-max-response-pool-size=${POOL_SIZE} FAILED\n***"
+        stop_server_and_monitoring_memory $MONITOR_PID $SERVER_PID
+        exit 1
+    fi
+    set -e
+    sleep 2
+
+    stop_server_and_monitoring_memory $MONITOR_PID $SERVER_PID
+
+    if [[ -s "$MAX_MEM_FILE" ]]; then
+        MAX_MEM=$(tail -n 1 "$MAX_MEM_FILE" 2>/dev/null || echo 0)
+        MEMORY_USAGE["$POOL_SIZE"]=$((MAX_MEM - INIT_MEM))
+        echo "Pool size: $POOL_SIZE | Initial Memory: ${INIT_MEM} KB | Peak Memory: ${MEMORY_USAGE[$POOL_SIZE]} KB" >> "memory.log"
+        rm -f "$MAX_MEM_FILE"
+    else
+        echo "FAILED to collect memory usage for grpc-max-response-pool-size=${POOL_SIZE}"
+        exit 1
+    fi
+done
+
+prev_mem=0
+prev_size=""
+for size in default 50 25 1; do
+    current_mem=${MEMORY_USAGE[$size]}
+    if [[ -n "$prev_size" && "$prev_mem" -ne 0 && "$current_mem" -ge "$prev_mem" ]]; then
+        echo -e "\n***\n*** FAILED - Memory $current_mem KB with pool=$size >= $prev_mem KB (with pool=$prev_size)\n***"
+        RET=1
+    fi
+    prev_mem=$current_mem
+    prev_size=$size
+done
+
+
 if [ $RET -eq 0 ]; then
-    echo -e "\n***\n*** Test Passed\n***"
+    echo -e "\n***\n*** Test Passed\n***"
 else
-    cat $TEST_LOG
-    echo -e "\n***\n*** Test FAILED\n***"
+    echo -e "\n***\n*** Test FAILED\n***"
 fi

 exit $RET
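
The monitor_memory helper above samples the server's resident set size from the kernel's smaps_rollup. As a standalone sketch of the same probe (obtaining the PID via pidof, rather than from run_server, is an assumption):

    # Print the resident set size (Rss, in KB) of a running tritonserver process.
    awk '/Rss:/ {print $2}' /proc/$(pidof tritonserver)/smaps_rollup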
