
Commit 321faa0

Add metrics test
1 parent d95bb2c commit 321faa0

File tree

6 files changed: +274 -3 lines changed


README.md

Lines changed: 3 additions & 1 deletion
@@ -111,7 +111,9 @@ container with the following commands:

 ```
 mkdir -p /opt/tritonserver/backends/vllm
-wget -P /opt/tritonserver/backends/vllm https://raw.githubusercontent.com/triton-inference-server/vllm_backend/main/src/model.py
+git clone https://github.com/triton-inference-server/vllm_backend.git /opt/tritonserver/backends/vllm/vllm_backend
+cp -r /opt/tritonserver/backends/vllm/vllm_backend/src/* /opt/tritonserver/backends/vllm
+rm -rf /opt/tritonserver/backends/vllm/vllm_backend
 ```

 ## Using the vLLM Backend
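The replaced wget fetched only model.py; the clone-and-copy sequence installs everything under src/, which matters because model.py now imports from a utils package (see the src/model.py change below). A quick sanity check inside the container, as a sketch:

```
# Sketch: confirm the backend files landed where Triton looks for them;
# expect model.py plus a utils/ directory containing metrics.py.
ls -R /opt/tritonserver/backends/vllm
```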
Lines changed: 98 additions & 0 deletions
@@ -0,0 +1,98 @@
```
#!/bin/bash
# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
#   notice, this list of conditions and the following disclaimer in the
#   documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
#   contributors may be used to endorse or promote products derived
#   from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

source ../../common/util.sh

TRITON_DIR=${TRITON_DIR:="/opt/tritonserver"}
SERVER=${TRITON_DIR}/bin/tritonserver
BACKEND_DIR=${TRITON_DIR}/backends
SERVER_ARGS="--model-repository=$(pwd)/models --backend-directory=${BACKEND_DIR} --model-control-mode=explicit --load-model=vllm_opt --log-verbose=1"
SERVER_LOG="./vllm_metrics_server.log"
CLIENT_LOG="./vllm_metrics_client.log"
TEST_RESULT_FILE='test_results.txt'
CLIENT_PY="./vllm_metrics_test.py"
SAMPLE_MODELS_REPO="../../../samples/model_repository"
EXPECTED_NUM_TESTS=1

# Helpers =======================================
# Expects $code to have been set by a preceding curl invocation.
function assert_curl_success {
    message="${1}"
    if [ "$code" != "200" ]; then
        cat ./curl.out
        echo -e "\n***\n*** ${message} : line ${BASH_LINENO}\n***"
        RET=1
    fi
}

rm -rf models && mkdir -p models
cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_opt
# The `vllm_opt` model will be loaded on server start and stay loaded throughout
# unit testing. To ensure that vLLM's memory profiler will not error out
# on `vllm_load_test` load, we reduce "gpu_memory_utilization" for `vllm_opt`,
# so that at least 60% of GPU memory is available for other models.
sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.4/' models/vllm_opt/1/model.json

RET=0

run_server
if [ "$SERVER_PID" == "0" ]; then
    cat $SERVER_LOG
    echo -e "\n***\n*** Failed to start $SERVER\n***"
    exit 1
fi

set +e
python3 $CLIENT_PY -v > $CLIENT_LOG 2>&1

if [ $? -ne 0 ]; then
    cat $CLIENT_LOG
    echo -e "\n***\n*** Running $CLIENT_PY FAILED. \n***"
    RET=1
else
    check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS
    if [ $? -ne 0 ]; then
        cat $CLIENT_LOG
        echo -e "\n***\n*** Test Result Verification FAILED.\n***"
        RET=1
    fi
fi
set -e

kill $SERVER_PID
wait $SERVER_PID
rm -rf "./models"

if [ $RET -eq 1 ]; then
    cat $CLIENT_LOG
    cat $SERVER_LOG
    echo -e "\n***\n*** vLLM test FAILED. \n***"
else
    echo -e "\n***\n*** vLLM test PASSED. \n***"
fi

collect_artifacts_from_subdir
exit $RET
```
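The script leans on helpers sourced from ../../common/util.sh (run_server, check_test_results, collect_artifacts_from_subdir). A hedged invocation sketch; the subdirectory name is not shown in this view and is assumed from the SUBTESTS change further down:

```
# Hypothetical invocation (directory name assumed, not confirmed by this view):
cd ci/L0_backend_vllm/metrics_test
bash test.sh
# On success the script prints a "vLLM test PASSED" banner and exits 0.
```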
Lines changed: 171 additions & 0 deletions
@@ -0,0 +1,171 @@
```
# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
#   notice, this list of conditions and the following disclaimer in the
#   documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
#   contributors may be used to endorse or promote products derived
#   from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import os
import re
import sys
import unittest
from functools import partial

import requests
import tritonclient.grpc as grpcclient
from tritonclient.utils import InferenceServerException

sys.path.append("../../common")
from test_util import TestResultCollector, UserData, callback, create_vllm_request

_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost")

PROMPTS = [
    "The most dangerous animal is",
    "The capital of France is",
    "The future of AI is",
]
SAMPLING_PARAMETERS = {"temperature": "0", "top_p": "1"}


def get_metrics():
    """
    Scrape Triton's metrics endpoint and collect the vLLM metrics into a
    dictionary mapping metric name to integer value.
    """
    r = requests.get(f"http://{_tritonserver_ipaddr}:8002/metrics")
    r.raise_for_status()

    # Match exported lines of the form: vllm:<name>{<labels>} <integer>
    pattern = r"^(vllm:.*){.*} (\d+)$"
    vllm_dict = {}

    # Find all matches in the exported metrics text
    matches = re.findall(pattern, r.text, re.MULTILINE)

    for match in matches:
        key, value = match
        vllm_dict[key] = int(value)

    return vllm_dict


class VLLMTritonMetricsTest(TestResultCollector):
    def setUp(self):
        self.triton_client = grpcclient.InferenceServerClient(url="localhost:8001")
        self.vllm_model_name = "vllm_opt"

    def test_vllm_metrics(self):
        # Supported vLLM metrics, all expected to start at zero
        expected_metrics_dict = {
            "vllm:num_requests_running": 0,
            "vllm:num_requests_waiting": 0,
            "vllm:num_requests_swapped": 0,
            "vllm:gpu_cache_usage_perc": 0,
            "vllm:cpu_cache_usage_perc": 0,
            "vllm:num_preemptions_total": 0,
            "vllm:prompt_tokens_total": 0,
            "vllm:generation_tokens_total": 0,
        }

        # Test vLLM metrics
        self._test_vllm_model(
            prompts=PROMPTS,
            sampling_parameters=SAMPLING_PARAMETERS,
            stream=False,
            send_parameters_as_tensor=True,
            model_name=self.vllm_model_name,
        )
        # Totals after three requests: 18 prompt tokens and 48 generated
        # tokens (3 requests x 16, vLLM's default max_tokens).
        expected_metrics_dict["vllm:prompt_tokens_total"] = 18
        expected_metrics_dict["vllm:generation_tokens_total"] = 48
        print(get_metrics())
        print(expected_metrics_dict)
        self.assertEqual(get_metrics(), expected_metrics_dict)

        self._test_vllm_model(
            prompts=PROMPTS,
            sampling_parameters=SAMPLING_PARAMETERS,
            stream=False,
            send_parameters_as_tensor=False,
            model_name=self.vllm_model_name,
        )
        # The token counters are cumulative, so a second pass doubles them.
        expected_metrics_dict["vllm:prompt_tokens_total"] = 36
        expected_metrics_dict["vllm:generation_tokens_total"] = 96
        self.assertEqual(get_metrics(), expected_metrics_dict)

    def _test_vllm_model(
        self,
        prompts,
        sampling_parameters,
        stream,
        send_parameters_as_tensor,
        exclude_input_in_output=None,
        expected_output=None,
        model_name="vllm_opt",
    ):
        user_data = UserData()
        number_of_vllm_reqs = len(prompts)

        self.triton_client.start_stream(callback=partial(callback, user_data))
        for i in range(number_of_vllm_reqs):
            request_data = create_vllm_request(
                prompts[i],
                i,
                stream,
                sampling_parameters,
                model_name,
                send_parameters_as_tensor,
                exclude_input_in_output=exclude_input_in_output,
            )
            self.triton_client.async_stream_infer(
                model_name=model_name,
                request_id=request_data["request_id"],
                inputs=request_data["inputs"],
                outputs=request_data["outputs"],
                parameters=sampling_parameters,
            )

        for i in range(number_of_vllm_reqs):
            result = user_data._completed_requests.get()
            if type(result) is InferenceServerException:
                print(result.message())
            self.assertIsNot(type(result), InferenceServerException, str(result))

            output = result.as_numpy("text_output")
            self.assertIsNotNone(output, "`text_output` should not be None")
            if expected_output is not None:
                self.assertEqual(
                    output,
                    expected_output[i],
                    'Actual and expected outputs do not match.\n'
                    'Expected: "{}"\nActual: "{}"'.format(
                        expected_output[i], output
                    ),
                )

        self.triton_client.stop_stream()

    def tearDown(self):
        self.triton_client.close()


if __name__ == "__main__":
    unittest.main()
```
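get_metrics() above assumes Triton serves Prometheus-format metrics on port 8002 (as the code does) and parses the `name{labels} value` exposition format, keeping only integer-valued vllm: lines. The scrape can be reproduced by hand as a sketch; the label set in the sample output is hypothetical, not taken from this commit:

```
# Sketch: scrape the same counters the test asserts on.
curl -s http://localhost:8002/metrics | grep "^vllm:"
# Illustrative output shape (labels hypothetical):
#   vllm:prompt_tokens_total{model="vllm_opt",version="1"} 18
#   vllm:generation_tokens_total{model="vllm_opt",version="1"} 48
```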

ci/L0_backend_vllm/test.sh

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 RET=0
-SUBTESTS="accuracy_test request_cancellation enabled_stream vllm_backend"
+SUBTESTS="accuracy_test request_cancellation enabled_stream vllm_backend metrics_test"

 python3 -m pip install --upgrade pip && pip3 install tritonclient[grpc]
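Appending metrics_test to SUBTESTS is what wires the new test into this CI driver. The loop that consumes SUBTESTS sits outside the hunk; purely for illustration, a hypothetical dispatcher might look like:

```
# Hypothetical driver loop (not from this commit): enter each subtest
# directory, run its test.sh, and accumulate failures in RET.
for TEST in ${SUBTESTS}; do
    (cd "${TEST}" && bash test.sh) || RET=1
done
```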
src/model.py

Lines changed: 1 addition & 1 deletion
@@ -39,7 +39,7 @@
 from vllm.sampling_params import SamplingParams
 from vllm.utils import random_uuid

-from metrics import VllmStatLogger
+from utils.metrics import VllmStatLogger

 _VLLM_ENGINE_ARGS_FILENAME = "model.json"
 _MULTI_LORA_ARGS_FILENAME = "multi_lora.json"
File renamed without changes.
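The one-line import change pairs with the rename recorded above: the stat logger now lives in a utils package alongside model.py. A minimal import check, assuming the module moved to src/utils/metrics.py and that vllm is installed in the environment:

```
# Sketch: verify the relocated module resolves (path and environment assumed).
cd src && python3 -c "from utils.metrics import VllmStatLogger; print('import ok')"
```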
