
Commit 65821ed

Updating vLLM tests to handle changes between vLLM version 0.2.3 and 0.3.0 (#30)
1 parent 52c1c3c commit 65821ed

4 files changed: +112 -36 lines changed

ci/L0_backend_vllm/accuracy_test/accuracy_test.py

Lines changed: 74 additions & 29 deletions

@@ -1,4 +1,4 @@
-# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -24,7 +24,9 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+import argparse
 import asyncio
+import pickle
 import sys
 import unittest
 from functools import partial
@@ -39,12 +41,24 @@
 sys.path.append("../../common")
 from test_util import TestResultCollector, UserData, callback, create_vllm_request

+VLLM_ENGINE_CONFIG = {
+    "model": "facebook/opt-125m",
+    "gpu_memory_utilization": 0.3,
+}
+
+
+PROMPTS = [
+    "The most dangerous animal is",
+    "The capital of France is",
+    "The future of AI is",
+]
+
+SAMPLING_PARAMETERS = {"temperature": 0, "top_p": 1}
+

 async def generate_python_vllm_output(prompt, llm_engine):
     request_id = random_uuid()
-    sampling_parameters = {"temperature": 0, "top_p": 1}
-    sampling_params = SamplingParams(**sampling_parameters)
-
+    sampling_params = SamplingParams(**SAMPLING_PARAMETERS)
     python_vllm_output = None
     last_output = None

@@ -59,50 +73,68 @@ async def generate_python_vllm_output(prompt, llm_engine):
     return python_vllm_output


+def prepare_vllm_baseline_outputs():
+    """
+    Helper function that starts async vLLM engine and generates output for each
+    prompt in `PROMPTS`. Saves resulted baselines in `vllm_baseline_output.pkl`
+    for further use.
+    """
+    llm_engine = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(**VLLM_ENGINE_CONFIG))
+    python_vllm_output = []
+    for i in range(len(PROMPTS)):
+        python_vllm_output.extend(
+            asyncio.run(generate_python_vllm_output(PROMPTS[i], llm_engine))
+        )
+
+    with open("vllm_baseline_output.pkl", "wb") as f:
+        pickle.dump(python_vllm_output, f)
+
+    return
+
+
 class VLLMTritonAccuracyTest(TestResultCollector):
     def setUp(self):
         self.triton_client = grpcclient.InferenceServerClient(url="localhost:8001")
-        vllm_engine_config = {
-            "model": "facebook/opt-125m",
-            "gpu_memory_utilization": 0.3,
-        }
-
-        self.llm_engine = AsyncLLMEngine.from_engine_args(
-            AsyncEngineArgs(**vllm_engine_config)
-        )
         self.vllm_model_name = "vllm_opt"
+        self.python_vllm_output = []
+        with open("vllm_baseline_output.pkl", "rb") as f:
+            self.python_vllm_output = pickle.load(f)
+
+        self.assertNotEqual(
+            self.python_vllm_output,
+            [],
+            "Loaded baseline outputs' list should not be empty",
+        )
+        self.assertIsNotNone(
+            self.python_vllm_output, "Loaded baseline outputs' list should not be None"
+        )
+        self.assertEqual(
+            len(self.python_vllm_output),
+            len(PROMPTS),
+            "Unexpected number of baseline outputs loaded, expected {}, but got {}".format(
+                len(PROMPTS), len(self.python_vllm_output)
+            ),
+        )

     def test_vllm_model(self):
         user_data = UserData()
         stream = False
-        prompts = [
-            "The most dangerous animal is",
-            "The capital of France is",
-            "The future of AI is",
-        ]
-        number_of_vllm_reqs = len(prompts)
-        sampling_parameters = {"temperature": "0", "top_p": "1"}
-        python_vllm_output = []
         triton_vllm_output = []

         self.triton_client.start_stream(callback=partial(callback, user_data))
-        for i in range(number_of_vllm_reqs):
+        for i in range(len(PROMPTS)):
             request_data = create_vllm_request(
-                prompts[i], i, stream, sampling_parameters, self.vllm_model_name
+                PROMPTS[i], i, stream, SAMPLING_PARAMETERS, self.vllm_model_name
             )
             self.triton_client.async_stream_infer(
                 model_name=self.vllm_model_name,
                 request_id=request_data["request_id"],
                 inputs=request_data["inputs"],
                 outputs=request_data["outputs"],
-                parameters=sampling_parameters,
-            )
-
-            python_vllm_output.extend(
-                asyncio.run(generate_python_vllm_output(prompts[i], self.llm_engine))
+                parameters=SAMPLING_PARAMETERS,
             )

-        for i in range(number_of_vllm_reqs):
+        for i in range(len(PROMPTS)):
             result = user_data._completed_requests.get()
             self.assertIsNot(type(result), InferenceServerException, str(result))

@@ -112,11 +144,24 @@ def test_vllm_model(self):
             triton_vllm_output.extend(output)

         self.triton_client.stop_stream()
-        self.assertEqual(python_vllm_output, triton_vllm_output)
+        self.assertEqual(self.python_vllm_output.sort(), triton_vllm_output.sort())

     def tearDown(self):
         self.triton_client.close()


 if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--generate-baseline",
+        action="store_true",
+        required=False,
+        default=False,
+        help="Generates baseline output for accuracy tests",
+    )
+    FLAGS = parser.parse_args()
+    if FLAGS.generate_baseline:
+        prepare_vllm_baseline_outputs()
+        exit(0)
+
     unittest.main()
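
With this change the baseline is produced once, outside the unittest run: the updated test.sh below invokes `python3 accuracy_test.py --generate-baseline` before starting Triton, and `setUp` only reloads `vllm_baseline_output.pkl`. As a reference, here is a minimal sketch of that consumer side (the helper names are illustrative, not part of the commit). Note that `list.sort()` sorts in place and returns None, so an order-insensitive comparison is usually written with `sorted()`, as in this sketch:

import pickle

def load_baseline(path="vllm_baseline_output.pkl"):
    # The pickle written by prepare_vllm_baseline_outputs() is a plain list of
    # generated strings, one entry per prompt in PROMPTS.
    with open(path, "rb") as f:
        return pickle.load(f)

def outputs_match(baseline, triton_outputs):
    # sorted() builds new lists, so the comparison ignores response ordering;
    # list.sort() would return None on both sides and always compare equal.
    return sorted(baseline) == sorted(triton_outputs)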

ci/L0_backend_vllm/accuracy_test/test.sh

Lines changed: 11 additions & 3 deletions

@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -36,14 +36,22 @@ CLIENT_LOG="./accuracy_test_client.log"
 TEST_RESULT_FILE='test_results.txt'
 CLIENT_PY="./accuracy_test.py"
 SAMPLE_MODELS_REPO="../../../samples/model_repository"
+VLLM_ENGINE_LOG="vllm_engine.log"
 EXPECTED_NUM_TESTS=1

 rm -rf models && mkdir -p models
 cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_opt
 sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.3/' models/vllm_opt/1/model.json
-
+[ -f vllm_baseline_output.pkl ] && rm vllm_baseline_output.pkl
 RET=0

+set +e
+# Need to generate baseline first, since running 2 vLLM engines causes
+# memory issues: https://github.com/vllm-project/vllm/issues/2248
+python3 $CLIENT_PY --generate-baseline >> $VLLM_ENGINE_LOG 2>&1 & BASELINE_PID=$!
+wait $BASELINE_PID
+set -e
+
 run_server
 if [ "$SERVER_PID" == "0" ]; then
     cat $SERVER_LOG
@@ -52,7 +60,7 @@ if [ "$SERVER_PID" == "0" ]; then
 fi

 set +e
-python3 $CLIENT_PY -v > $CLIENT_LOG 2>&1
+python3 $CLIENT_PY > $CLIENT_LOG 2>&1

 if [ $? -ne 0 ]; then
     cat $CLIENT_LOG

ci/L0_backend_vllm/vllm_backend/test.sh

Lines changed: 25 additions & 3 deletions

@@ -38,6 +38,16 @@ CLIENT_PY="./vllm_backend_test.py"
 SAMPLE_MODELS_REPO="../../../samples/model_repository"
 EXPECTED_NUM_TESTS=3

+# Helpers =======================================
+function assert_curl_success {
+  message="${1}"
+  if [ "$code" != "200" ]; then
+    cat ./curl.out
+    echo -e "\n***\n*** ${message} : line ${BASH_LINENO}\n***"
+    RET=1
+  fi
+}
+
 rm -rf models && mkdir -p models
 cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_opt

@@ -105,17 +115,22 @@ if [[ "$COUNT" -ne 2 ]]; then
 fi

 # Test loading multiple vllm models at the same time
-SERVER_ARGS="--model-repository=$(pwd)/models --backend-directory=${BACKEND_DIR}"
+SERVER_ARGS="--model-repository=$(pwd)/models --backend-directory=${BACKEND_DIR} --model-control-mode=explicit --load-model=vllm_one"
 SERVER_LOG="./vllm_test_multi_model.log"

 # Create two models, one is just a copy of the other, and make sure gpu
-# utilization is low enough for multiple models to avoid OOM
+# utilization is low enough for multiple models to avoid OOM.
+# vLLM changed behavior of their GPU profiler from total to free memory,
+# so to load two small models at the same time, we need to start
+# triton server in explicit mode, load first model with
+# `gpu_memory_utilization` 0.4 and second should be 0.9.
 MODEL1="vllm_one"
 MODEL2="vllm_two"
 mkdir -p models
 cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/${MODEL1}/
-sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.3/' models/${MODEL1}/1/model.json
 cp -r models/${MODEL1} models/${MODEL2}
+sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.4/' models/${MODEL1}/1/model.json
+sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.9/' models/${MODEL2}/1/model.json

 run_server
 if [ "$SERVER_PID" == "0" ]; then
@@ -124,6 +139,13 @@ if [ "$SERVER_PID" == "0" ]; then
     exit 1
 fi

+# Explicitly load model
+rm -f ./curl.out
+set +e
+code=`curl -s -w %{http_code} -o ./curl.out -X POST localhost:8000/v2/repository/models/vllm_two/load`
+set -e
+assert_curl_success "Failed to load 'vllm_two' model"
+
 kill $SERVER_PID
 wait $SERVER_PID
 rm -rf "./models"
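
The explicit-load step above drives Triton's model repository HTTP endpoint with curl; the same request can be issued from Python through tritonclient, which these tests already use for inference. A minimal sketch under that assumption (server started with --model-control-mode=explicit and the default HTTP port 8000):

import tritonclient.http as httpclient

# Load the second model into the explicitly managed repository, mirroring
# the curl POST to /v2/repository/models/vllm_two/load in test.sh.
client = httpclient.InferenceServerClient(url="localhost:8000")
client.load_model("vllm_two")

# Rough equivalent of the 200-status check done by assert_curl_success.
assert client.is_model_ready("vllm_two")
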
samples/model_repository/vllm_model/1/model.json

Lines changed: 2 additions & 1 deletion

@@ -1,5 +1,6 @@
 {
     "model":"facebook/opt-125m",
     "disable_log_requests": "true",
-    "gpu_memory_utilization": 0.5
+    "gpu_memory_utilization": 0.5,
+    "enforce_eager": "true"
 }
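
For context, the keys in this model.json are handed to vLLM as engine arguments, so the updated config corresponds roughly to the following vLLM 0.3.0 call. This is a sketch, not the backend's actual code: the JSON's string values are rendered as Python booleans, and enforce_eager=True disables CUDA graph capture, trading some decode speed for a smaller GPU memory footprint.

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

# Rough Python equivalent of the updated model.json fields.
engine_args = AsyncEngineArgs(
    model="facebook/opt-125m",
    disable_log_requests=True,
    gpu_memory_utilization=0.5,
    enforce_eager=True,  # skip CUDA graph capture to keep GPU memory overhead low
)
engine = AsyncLLMEngine.from_engine_args(engine_args)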
