
Commit 552db92

Merge branch 'main' into mesharma-ci
2 parents 7c97d9f + b71088a commit 552db92

File tree

13 files changed: +1101 −38 lines


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -50,6 +50,7 @@ coverage.xml
 .hypothesis/
 .pytest_cache/
 cover/
+*.out

 # Translations
 *.mo

README.md

Lines changed: 112 additions & 2 deletions
@@ -114,10 +114,12 @@ cd server
 --upstream-container-version=${TRITON_CONTAINER_VERSION}
 --backend=python:r${TRITON_CONTAINER_VERSION}
 --backend=vllm:r${TRITON_CONTAINER_VERSION}
+--backend=ensemble
 --vllm-version=${VLLM_VERSION}
 # Build Triton Server
 cd build
 bash -x ./docker_build
+
 ```

 ### Option 3. Add the vLLM Backend to the Default Triton Container
@@ -129,7 +131,8 @@ container with the following commands:

 ```
 mkdir -p /opt/tritonserver/backends/vllm
-wget -P /opt/tritonserver/backends/vllm https://raw.githubusercontent.com/triton-inference-server/vllm_backend/main/src/model.py
+git clone https://github.com/triton-inference-server/vllm_backend.git /tmp/vllm_backend
+cp -r /tmp/vllm_backend/src/* /opt/tritonserver/backends/vllm
 ```

 ## Using the vLLM Backend
@@ -212,14 +215,121 @@ starting from 23.10 release.

 You can use `pip install ...` within the container to upgrade vLLM version.

-
 ## Running Multiple Instances of Triton Server

 If you are running multiple instances of Triton server with a Python-based backend,
 you need to specify a different `shm-region-prefix-name` for each server. See
 [here](https://github.com/triton-inference-server/python_backend#running-multiple-instances-of-triton-server)
 for more information.

+## Triton Metrics
+Starting with the 24.08 release of Triton, users can obtain specific
+vLLM metrics by querying the Triton metrics endpoint (see the complete list of vLLM metrics
+[here](https://docs.vllm.ai/en/latest/serving/metrics.html)). This can be
+accomplished by launching a Triton server in any of the ways described above
+(ensuring the build code / container is 24.08 or later). Once the server is
+running, you can query the metrics endpoint by entering the following:
+```bash
+curl localhost:8002/metrics
+```
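
As an aside to the documentation added above, the same check can be scripted. The following is a minimal sketch (not part of the commit) that fetches the Prometheus-format text from the default metrics port 8002 and keeps only the `vllm:`-prefixed series; the URL and the helper name `vllm_metric_lines` are illustrative.

```python
# Minimal sketch: fetch Triton's Prometheus-format metrics text and keep only
# the vLLM-specific series. Assumes a locally running server with the default
# metrics port 8002; the helper name is illustrative.
from urllib.request import urlopen


def vllm_metric_lines(url="http://localhost:8002/metrics"):
    text = urlopen(url).read().decode("utf-8")
    # vLLM stats are exposed with a "vllm:" prefix; HELP/TYPE comments start with "#".
    return [
        line
        for line in text.splitlines()
        if line.startswith("vllm:") or line.startswith("# HELP vllm:")
    ]


if __name__ == "__main__":
    for line in vllm_metric_lines():
        print(line)
```

Run while a vLLM model is loaded and it should print the same metric families listed in the added README text that follows.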
+vLLM stats are reported by the metrics endpoint in fields that are prefixed with
+`vllm:`. Triton currently supports reporting of the following metrics from vLLM.
+```bash
+# Number of prefill tokens processed.
+counter_prompt_tokens
+# Number of generation tokens processed.
+counter_generation_tokens
+# Histogram of time to first token in seconds.
+histogram_time_to_first_token
+# Histogram of time per output token in seconds.
+histogram_time_per_output_token
+# Histogram of end to end request latency in seconds.
+histogram_e2e_time_request
+# Number of prefill tokens processed.
+histogram_num_prompt_tokens_request
+# Number of generation tokens processed.
+histogram_num_generation_tokens_request
+# Histogram of the best_of request parameter.
+histogram_best_of_request
+# Histogram of the n request parameter.
+histogram_n_request
+```
+Your output for these fields should look similar to the following:
+```bash
+# HELP vllm:prompt_tokens_total Number of prefill tokens processed.
+# TYPE vllm:prompt_tokens_total counter
+vllm:prompt_tokens_total{model="vllm_model",version="1"} 10
+# HELP vllm:generation_tokens_total Number of generation tokens processed.
+# TYPE vllm:generation_tokens_total counter
+vllm:generation_tokens_total{model="vllm_model",version="1"} 16
+# HELP vllm:time_to_first_token_seconds Histogram of time to first token in seconds.
+# TYPE vllm:time_to_first_token_seconds histogram
+vllm:time_to_first_token_seconds_count{model="vllm_model",version="1"} 1
+vllm:time_to_first_token_seconds_sum{model="vllm_model",version="1"} 0.03233122825622559
+vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="0.001"} 0
+...
+vllm:time_to_first_token_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 1
+# HELP vllm:time_per_output_token_seconds Histogram of time per output token in seconds.
+# TYPE vllm:time_per_output_token_seconds histogram
+vllm:time_per_output_token_seconds_count{model="vllm_model",version="1"} 15
+vllm:time_per_output_token_seconds_sum{model="vllm_model",version="1"} 0.04501533508300781
+vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="0.01"} 14
+...
+vllm:time_per_output_token_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 15
+# HELP vllm:e2e_request_latency_seconds Histogram of end to end request latency in seconds.
+# TYPE vllm:e2e_request_latency_seconds histogram
+vllm:e2e_request_latency_seconds_count{model="vllm_model",version="1"} 1
+vllm:e2e_request_latency_seconds_sum{model="vllm_model",version="1"} 0.08686184883117676
+vllm:e2e_request_latency_seconds_bucket{model="vllm_model",version="1",le="1"} 1
+...
+vllm:e2e_request_latency_seconds_bucket{model="vllm_model",version="1",le="+Inf"} 1
+# HELP vllm:request_prompt_tokens Number of prefill tokens processed.
+# TYPE vllm:request_prompt_tokens histogram
+vllm:request_prompt_tokens_count{model="vllm_model",version="1"} 1
+vllm:request_prompt_tokens_sum{model="vllm_model",version="1"} 10
+vllm:request_prompt_tokens_bucket{model="vllm_model",version="1",le="1"} 0
+...
+vllm:request_prompt_tokens_bucket{model="vllm_model",version="1",le="+Inf"} 1
+# HELP vllm:request_generation_tokens Number of generation tokens processed.
+# TYPE vllm:request_generation_tokens histogram
+vllm:request_generation_tokens_count{model="vllm_model",version="1"} 1
+vllm:request_generation_tokens_sum{model="vllm_model",version="1"} 16
+vllm:request_generation_tokens_bucket{model="vllm_model",version="1",le="1"} 0
+...
+vllm:request_generation_tokens_bucket{model="vllm_model",version="1",le="+Inf"} 1
+# HELP vllm:request_params_best_of Histogram of the best_of request parameter.
+# TYPE vllm:request_params_best_of histogram
+vllm:request_params_best_of_count{model="vllm_model",version="1"} 1
+vllm:request_params_best_of_sum{model="vllm_model",version="1"} 1
+vllm:request_params_best_of_bucket{model="vllm_model",version="1",le="1"} 1
+...
+vllm:request_params_best_of_bucket{model="vllm_model",version="1",le="+Inf"} 1
+# HELP vllm:request_params_n Histogram of the n request parameter.
+# TYPE vllm:request_params_n histogram
+vllm:request_params_n_count{model="vllm_model",version="1"} 1
+vllm:request_params_n_sum{model="vllm_model",version="1"} 1
+vllm:request_params_n_bucket{model="vllm_model",version="1",le="1"} 1
+...
+vllm:request_params_n_bucket{model="vllm_model",version="1",le="+Inf"} 1
+```
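
The histogram families in the sample output above expose paired `_sum` and `_count` series, so average per-request values (for example, average time to first token) can be derived directly. The sketch below is illustrative and not part of the commit; `histogram_averages` is a made-up helper and label sets are ignored for brevity.

```python
# Sketch: derive average values from the paired *_sum / *_count histogram series
# shown above (e.g. average time to first token). Illustrative only.
import re
from urllib.request import urlopen

_SAMPLE = re.compile(r"^(vllm:\w+)_(sum|count)\{[^}]*\}\s+([0-9.eE+-]+)$")


def histogram_averages(url="http://localhost:8002/metrics"):
    text = urlopen(url).read().decode("utf-8")
    sums, counts = {}, {}
    for line in text.splitlines():
        match = _SAMPLE.match(line)
        if not match:
            continue
        name, kind, value = match.group(1), match.group(2), float(match.group(3))
        (sums if kind == "sum" else counts)[name] = value
    # Skip families whose count is zero to avoid dividing by zero.
    return {name: sums[name] / counts[name] for name in sums if counts.get(name)}


if __name__ == "__main__":
    for name, average in histogram_averages().items():
        print(f"{name}: {average:.6f}")
```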
+To enable vLLM engine metrics collection, the "disable_log_stats" option needs to be either false
+or left empty (false by default) in [model.json](https://github.com/triton-inference-server/vllm_backend/blob/main/samples/model_repository/vllm_model/1/model.json).
+```bash
+"disable_log_stats": false
+```
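
The setting can also be enforced programmatically. A small sketch, assuming the sample repository layout; the `model_repository/vllm_model/1/model.json` path is illustrative and should be pointed at your own model repository.

```python
# Sketch: keep "disable_log_stats" set to false in a model.json so the vLLM
# engine keeps collecting stats. The path below is illustrative.
import json
from pathlib import Path

model_json = Path("model_repository/vllm_model/1/model.json")

config = json.loads(model_json.read_text())
config["disable_log_stats"] = False  # false (or omitted) leaves engine stats enabled
model_json.write_text(json.dumps(config, indent=4) + "\n")
```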
+*Note:* vLLM metrics are not reported to the Triton metrics server by default
+due to potential performance slowdowns. To enable a vLLM model's metrics
+reporting, please add the following lines to its config.pbtxt as well.
+```bash
+parameters: {
+  key: "REPORT_CUSTOM_METRICS"
+  value: {
+    string_value:"yes"
+  }
+}
+```
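
Appending that parameter block can itself be scripted, which is essentially what the CI script later in this commit does with `echo -e`. A hedged Python equivalent follows; the config.pbtxt path is illustrative.

```python
# Sketch: append the REPORT_CUSTOM_METRICS parameter block to a model's
# config.pbtxt if it is not already there. The path is illustrative.
from pathlib import Path

PARAMS_BLOCK = """
parameters: {
  key: "REPORT_CUSTOM_METRICS"
  value: {
    string_value: "yes"
  }
}
"""

config_pbtxt = Path("model_repository/vllm_model/config.pbtxt")
if "REPORT_CUSTOM_METRICS" not in config_pbtxt.read_text():
    with config_pbtxt.open("a") as f:
        f.write(PARAMS_BLOCK)
```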
+
 ## Referencing the Tutorial

 You can read further in the
Lines changed: 167 additions & 0 deletions
@@ -0,0 +1,167 @@
#!/bin/bash
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

source ../../common/util.sh

TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"}
SERVER=${TRITON_DIR}/bin/tritonserver
BACKEND_DIR=${TRITON_DIR}/backends
SERVER_ARGS="--model-repository=$(pwd)/models --backend-directory=${BACKEND_DIR} --model-control-mode=explicit --load-model=vllm_opt --log-verbose=1"
SERVER_LOG="./vllm_metrics_server.log"
CLIENT_LOG="./vllm_metrics_client.log"
TEST_RESULT_FILE='test_results.txt'
CLIENT_PY="./vllm_metrics_test.py"
SAMPLE_MODELS_REPO="../../../samples/model_repository"
EXPECTED_NUM_TESTS=1

# Helpers =======================================
function copy_model_repository {
    rm -rf models && mkdir -p models
    cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_opt
    # The `vllm_opt` model will be loaded on server start and stay loaded throughout
    # unit testing. To ensure that vLLM's memory profiler will not error out
    # on `vllm_load_test` load, we reduce "gpu_memory_utilization" for `vllm_opt`,
    # so that at least 60% of GPU memory is available for other models.
    sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.4/' models/vllm_opt/1/model.json
}

run_test() {
    local TEST_CASE=$1

    run_server
    if [ "$SERVER_PID" == "0" ]; then
        cat $SERVER_LOG
        echo -e "\n***\n*** Failed to start $SERVER\n***"
        exit 1
    fi

    set +e
    python3 $CLIENT_PY $TEST_CASE -v > $CLIENT_LOG 2>&1

    if [ $? -ne 0 ]; then
        cat $CLIENT_LOG
        echo -e "\n***\n*** Running $CLIENT_PY $TEST_CASE FAILED. \n***"
        RET=1
    else
        check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS
        if [ $? -ne 0 ]; then
            cat $CLIENT_LOG
            echo -e "\n***\n*** Test Result Verification FAILED.\n***"
            RET=1
        fi
    fi
    set -e

    kill $SERVER_PID
    wait $SERVER_PID
}

RET=0

# Test disabling vLLM metrics reporting without parameter "REPORT_CUSTOM_METRICS" in config.pbtxt
copy_model_repository
run_test VLLMTritonMetricsTest.test_vllm_metrics_disabled

# Test disabling vLLM metrics reporting with parameter "REPORT_CUSTOM_METRICS" set to "no" in config.pbtxt
copy_model_repository
echo -e "
parameters: {
  key: \"REPORT_CUSTOM_METRICS\"
  value: {
    string_value:\"no\"
  }
}
" >> models/vllm_opt/config.pbtxt
run_test VLLMTritonMetricsTest.test_vllm_metrics_disabled

# Test vLLM metrics reporting with parameter "REPORT_CUSTOM_METRICS" set to "yes" in config.pbtxt
copy_model_repository
cp ${SAMPLE_MODELS_REPO}/vllm_model/config.pbtxt models/vllm_opt
echo -e "
parameters: {
  key: \"REPORT_CUSTOM_METRICS\"
  value: {
    string_value:\"yes\"
  }
}
" >> models/vllm_opt/config.pbtxt
run_test VLLMTritonMetricsTest.test_vllm_metrics

# Test vLLM metrics custom sampling parameters
# Custom sampling parameters may result in different vLLM output depending
# on the platform. Therefore, these metrics are tested separately.
copy_model_repository
cp ${SAMPLE_MODELS_REPO}/vllm_model/config.pbtxt models/vllm_opt
echo -e "
parameters: {
  key: \"REPORT_CUSTOM_METRICS\"
  value: {
    string_value:\"yes\"
  }
}
" >> models/vllm_opt/config.pbtxt
run_test VLLMTritonMetricsTest.test_custom_sampling_params

# Test enabling vLLM metrics reporting in config.pbtxt but disabling it in model.json
copy_model_repository
jq '. += {"disable_log_stats" : true}' models/vllm_opt/1/model.json > "temp.json"
mv temp.json models/vllm_opt/1/model.json
echo -e "
parameters: {
  key: \"REPORT_CUSTOM_METRICS\"
  value: {
    string_value:\"yes\"
  }
}
" >> models/vllm_opt/config.pbtxt
run_test VLLMTritonMetricsTest.test_vllm_metrics_disabled

# Test enabling vLLM metrics reporting in config.pbtxt while disabling it in the server options
copy_model_repository
echo -e "
parameters: {
  key: \"REPORT_CUSTOM_METRICS\"
  value: {
    string_value:\"yes\"
  }
}
" >> models/vllm_opt/config.pbtxt
SERVER_ARGS="${SERVER_ARGS} --allow-metrics=false"
run_test VLLMTritonMetricsTest.test_vllm_metrics_refused

rm -rf "./models" "temp.json"

if [ $RET -eq 1 ]; then
    cat $CLIENT_LOG
    cat $SERVER_LOG
    echo -e "\n***\n*** vLLM test FAILED. \n***"
else
    echo -e "\n***\n*** vLLM test PASSED. \n***"
fi

collect_artifacts_from_subdir
exit $RET
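
The Python test module `vllm_metrics_test.py` referenced by `CLIENT_PY` is not part of this diff. As a rough, hypothetical illustration of the enabled/disabled/refused states the script above exercises (names and behavior are assumptions, not the actual test), a check along these lines could be used:

```python
# Rough, hypothetical illustration (NOT the actual vllm_metrics_test.py) of the
# three states exercised above: metrics enabled, disabled, or the metrics
# endpoint refused entirely via --allow-metrics=false.
import sys
from urllib.error import URLError
from urllib.request import urlopen


def vllm_metrics_state(url="http://localhost:8002/metrics"):
    try:
        text = urlopen(url).read().decode("utf-8")
    except (URLError, OSError):
        return "refused"  # e.g. the server was started with --allow-metrics=false
    exposed = any(line.startswith("vllm:") for line in text.splitlines())
    return "enabled" if exposed else "disabled"


if __name__ == "__main__":
    expected = sys.argv[1] if len(sys.argv) > 1 else "enabled"
    observed = vllm_metrics_state()
    print(f"expected={expected} observed={observed}")
    sys.exit(0 if observed == expected else 1)
```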
