Commit b594d07

feat: Auto unload model if vLLM health check failed (#73)
1 parent 366e668 commit b594d07

7 files changed (+399, -12 lines)

README.md

Lines changed: 8 additions & 2 deletions
````diff
@@ -307,15 +307,21 @@ or left empty (false by default) in [model.json](https://github.com/triton-infer
 *Note:* vLLM metrics are not reported to Triton metrics server by default
 due to potential performance slowdowns. To enable vLLM model's metrics
 reporting, please add the following lines to its config.pbtxt as well.
-```bash
+```
 parameters: {
   key: "REPORT_CUSTOM_METRICS"
   value: {
-    string_value:"yes"
+    string_value: "true"
   }
 }
 ```
 
+## vLLM Engine Health Check (BETA)
+
+The vLLM Engine Health Check can optionally be enabled for more accurate model
+state reporting by the server. See [this doc](docs/health_check.md) for more
+information.
+
 ## Referencing the Tutorial
 
 You can read further in the
````
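The health check is opt-in per model. Judging from the `enable_health_check` helper in `ci/L0_check_health_vllm/test.sh` below, enabling it presumably amounts to appending a parameter like the following to the model's config.pbtxt (a sketch; see docs/health_check.md for the authoritative steps):

```
parameters: {
  key: "ENABLE_VLLM_HEALTH_CHECK"
  value: { string_value: "true" }
}
```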

ci/L0_backend_vllm/metrics_test/test.sh

Lines changed: 7 additions & 7 deletions
```diff
@@ -86,26 +86,26 @@ RET=0
 copy_model_repository
 run_test VLLMTritonMetricsTest.test_vllm_metrics_disabled
 
-# Test disabling vLLM metrics reporting with parameter "REPORT_CUSTOM_METRICS" set to "no" in config.pbtxt
+# Test disabling vLLM metrics reporting with parameter "REPORT_CUSTOM_METRICS" set to "false" in config.pbtxt
 copy_model_repository
 echo -e "
 parameters: {
   key: \"REPORT_CUSTOM_METRICS\"
   value: {
-    string_value:\"no\"
+    string_value: \"false\"
   }
 }
 " >> models/vllm_opt/config.pbtxt
 run_test VLLMTritonMetricsTest.test_vllm_metrics_disabled
 
-# Test vLLM metrics reporting with parameter "REPORT_CUSTOM_METRICS" set to "yes" in config.pbtxt
+# Test vLLM metrics reporting with parameter "REPORT_CUSTOM_METRICS" set to "true" in config.pbtxt
 copy_model_repository
 cp ${SAMPLE_MODELS_REPO}/vllm_model/config.pbtxt models/vllm_opt
 echo -e "
 parameters: {
   key: \"REPORT_CUSTOM_METRICS\"
   value: {
-    string_value:\"yes\"
+    string_value: \"true\"
   }
 }
 " >> models/vllm_opt/config.pbtxt
@@ -120,7 +120,7 @@ echo -e "
 parameters: {
   key: \"REPORT_CUSTOM_METRICS\"
   value: {
-    string_value:\"yes\"
+    string_value: \"true\"
   }
 }
 " >> models/vllm_opt/config.pbtxt
@@ -134,7 +134,7 @@ echo -e "
 parameters: {
   key: \"REPORT_CUSTOM_METRICS\"
   value: {
-    string_value:\"yes\"
+    string_value: \"true\"
   }
 }
 " >> models/vllm_opt/config.pbtxt
@@ -146,7 +146,7 @@ echo -e "
 parameters: {
   key: \"REPORT_CUSTOM_METRICS\"
   value: {
-    string_value:\"yes\"
+    string_value: \"true\"
   }
 }
 " >> models/vllm_opt/config.pbtxt
```
ci/L0_check_health_vllm/check_health_test.py

Lines changed: 128 additions & 0 deletions
```python
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import json

import numpy as np
import tritonclient.grpc as grpcclient


class TestCheckHealth:
    _grpc_url = "localhost:8001"
    _model_name = "vllm_opt"
    _sampling_parameters = {"temperature": "0", "top_p": "1"}
    _prompt = "In this example,"

    def _get_inputs(self, prompt, stream=True, sampling_parameters=None):
        inputs = []

        inputs.append(grpcclient.InferInput("text_input", [1], "BYTES"))
        inputs[-1].set_data_from_numpy(
            np.array([prompt.encode("utf-8")], dtype=np.object_)
        )

        inputs.append(grpcclient.InferInput("stream", [1], "BOOL"))
        inputs[-1].set_data_from_numpy(np.array([stream], dtype=bool))

        if sampling_parameters is not None:
            inputs.append(grpcclient.InferInput("sampling_parameters", [1], "BYTES"))
            inputs[-1].set_data_from_numpy(
                np.array(
                    [json.dumps(sampling_parameters).encode("utf-8")], dtype=np.object_
                )
            )

        return inputs

    def _callback(self, result, error):
        self._responses.append({"result": result, "error": error})

    def _llm_infer(self):
        inputs = self._get_inputs(
            self._prompt, stream=True, sampling_parameters=self._sampling_parameters
        )
        self._responses = []
        with grpcclient.InferenceServerClient(self._grpc_url) as client:
            client.start_stream(self._callback)
            client.async_stream_infer(
                self._model_name, inputs=inputs, parameters=self._sampling_parameters
            )
            client.stop_stream()

    def _assert_text_output_valid(self):
        text_output = ""
        for response in self._responses:
            result, error = response["result"], response["error"]
            assert error is None
            text_output += result.as_numpy(name="text_output")[0].decode("utf-8")
        assert len(text_output) > 0, "output is empty"
        assert text_output.count(" ") > 4, "output is not a sentence"

    def _assert_infer_exception(self, expected_exception_message):
        assert len(self._responses) == 1
        for response in self._responses:
            result, error = response["result"], response["error"]
            assert result is None
            assert str(error) == expected_exception_message

    def _assert_model_ready(self, expected_readiness):
        with grpcclient.InferenceServerClient(self._grpc_url) as client:
            # is_model_ready API
            assert client.is_model_ready(self._model_name) == expected_readiness
            # get_model_repository_index API
            model_state = None
            for model_index in client.get_model_repository_index().models:
                if model_index.name == self._model_name:
                    assert model_state is None, "duplicate model index found"
                    model_state = model_index.state == "READY"
            assert model_state == expected_readiness

    def test_vllm_is_healthy(self):
        num_repeats = 3
        for i in range(num_repeats):
            self._assert_model_ready(True)
            self._llm_infer()
            self._assert_text_output_valid()
        self._assert_model_ready(True)

    def test_vllm_not_healthy(self):
        self._assert_model_ready(True)
        # The 1st infer should complete successfully
        self._llm_infer()
        self._assert_text_output_valid()
        self._assert_model_ready(True)
        # The 2nd infer should begin with health check failed
        self._llm_infer()
        self._assert_infer_exception(
            "Model is unavailable due to unhealthy vLLM engine"
        )
        self._assert_model_ready(False)
        # The 3rd infer should have model not found
        self._llm_infer()
        self._assert_infer_exception(
            "Request for unknown model: 'vllm_opt' has no available versions"
        )
        self._assert_model_ready(False)
```
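For context on what `test_vllm_not_healthy` exercises: with the health check enabled, the backend verifies the vLLM engine's health around inference and, once a check fails, fails the in-flight request and has the model unloaded, so subsequent requests see "unknown model". The following is only a sketch of that pattern, not the backend's actual model.py; `request_model_unload` is a hypothetical callback standing in for whatever mechanism the Python backend uses to ask Triton to unload the model.

```python
# Sketch of the auto-unload pattern exercised by test_vllm_not_healthy above.
# NOT the backend's actual model.py; "request_model_unload" is hypothetical.
class HealthCheckedEngineWrapper:
    def __init__(self, engine, request_model_unload):
        self._engine = engine                            # vLLM AsyncLLMEngine
        self._request_model_unload = request_model_unload
        self._is_healthy = True

    async def _verify_engine_health(self) -> bool:
        if not self._is_healthy:
            return False
        try:
            # vLLM's AsyncLLMEngine.check_health() raises if the engine has died.
            await self._engine.check_health()
        except Exception:
            self._is_healthy = False
            # Ask Triton to unload the model so it is reported NOT READY and
            # later requests fail with "unknown model".
            self._request_model_unload()
        return self._is_healthy

    async def generate(self, request):
        if not await self._verify_engine_health():
            raise RuntimeError("Model is unavailable due to unhealthy vLLM engine")
        # ...normal vLLM generate path...
```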
ci/L0_check_health_vllm/mock_async_llm_engine.py

Lines changed: 36 additions & 0 deletions
```python
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

from vllm.engine.async_llm_engine import AsyncLLMEngine as real_AsyncLLMEngine


class mock_AsyncLLMEngine(real_AsyncLLMEngine):
    _mock_check_health_count = 0

    async def check_health(self) -> None:
        self._mock_check_health_count += 1
        if self._mock_check_health_count > 1:
            raise RuntimeError("Simulated vLLM check_health() failure")
```
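A design note on the mock: the counter lets the first `check_health()` call succeed, so the server can load the model and complete one good inference before every later check raises; that ordering is exactly what `test_vllm_not_healthy` relies on. Below is a small stand-alone illustration of just that counter behavior (hypothetical, not part of the commit, and deliberately not subclassing vLLM's engine):

```python
import asyncio


class CounterOnlyMock:
    """Mimics only the mock's failure pattern: pass once, then always raise."""

    _count = 0

    async def check_health(self) -> None:
        self._count += 1
        if self._count > 1:
            raise RuntimeError("Simulated vLLM check_health() failure")


async def main():
    mock = CounterOnlyMock()
    await mock.check_health()      # 1st call: healthy
    try:
        await mock.check_health()  # 2nd call: raises
    except RuntimeError as err:
        print(err)


asyncio.run(main())
```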

ci/L0_check_health_vllm/test.sh

Lines changed: 126 additions & 0 deletions
```bash
#!/bin/bash
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

export CUDA_VISIBLE_DEVICES=0
source ../common/util.sh

pip3 install pytest==8.1.1
pip3 install tritonclient[grpc]

RET=0

function setup_model_repository {
    local sample_model_repo_path=${1:-"../../samples/model_repository"}
    rm -rf models vllm_baseline_output.pkl && mkdir -p models
    cp -r $sample_model_repo_path/vllm_model models/vllm_opt
}

function enable_health_check {
    local enable_vllm_health_check="$1"
    echo -e "parameters: {" >> models/vllm_opt/config.pbtxt
    echo -e "  key: \"ENABLE_VLLM_HEALTH_CHECK\"" >> models/vllm_opt/config.pbtxt
    echo -e "  value: { string_value: \"$enable_vllm_health_check\" }" >> models/vllm_opt/config.pbtxt
    echo -e "}" >> models/vllm_opt/config.pbtxt
}

function mock_vllm_async_llm_engine {
    mv /opt/tritonserver/backends/vllm/model.py /opt/tritonserver/backends/vllm/.model.py.backup
    cp /opt/tritonserver/backends/vllm/.model.py.backup /opt/tritonserver/backends/vllm/model.py
    sed -i 's/from vllm.engine.async_llm_engine import AsyncLLMEngine/from mock_async_llm_engine import mock_AsyncLLMEngine as AsyncLLMEngine/' /opt/tritonserver/backends/vllm/model.py
    cp mock_async_llm_engine.py /opt/tritonserver/backends/vllm
}

function unmock_vllm_async_llm_engine {
    rm -f /opt/tritonserver/backends/vllm/mock_async_llm_engine.py /opt/tritonserver/backends/vllm/model.py
    mv /opt/tritonserver/backends/vllm/.model.py.backup /opt/tritonserver/backends/vllm/model.py
}

function test_check_health {
    local test_name="$1"
    local unit_test_name="$2"

    SERVER_LOG="$test_name.server.log"
    SERVER_ARGS="--model-repository=models --model-control-mode=explicit --load-model=*"
    run_server
    if [ "$SERVER_PID" == "0" ]; then
        echo -e "\n***\n*** Failed to start $SERVER\n***"
        cat $SERVER_LOG
        exit 1
    fi

    set +e
    python3 -m pytest --junitxml=$test_name.report.xml -s -v check_health_test.py::TestCheckHealth::$unit_test_name > $test_name.log
    if [ $? -ne 0 ]; then
        echo -e "\n***\n*** $test_name FAILED. \n***"
        RET=1
    fi
    set -e

    kill $SERVER_PID
    wait $SERVER_PID
}

# Test health check unspecified
setup_model_repository
test_check_health "health_check_unspecified" "test_vllm_is_healthy"

# Test health check disabled
setup_model_repository
enable_health_check "false"
test_check_health "health_check_disabled" "test_vllm_is_healthy"

# Test health check enabled
setup_model_repository
enable_health_check "true"
test_check_health "health_check_enabled" "test_vllm_is_healthy"

# Mock check_health() from vLLM
mock_vllm_async_llm_engine

# Test health check unspecified with mocked vLLM check_health() failure
setup_model_repository
test_check_health "health_check_unspecified_mocked_failure" "test_vllm_is_healthy"

# Test health check disabled with mocked vLLM check_health() failure
setup_model_repository
enable_health_check "false"
test_check_health "health_check_disabled_mocked_failure" "test_vllm_is_healthy"

# Test health check enabled with mocked vLLM check_health() failure
setup_model_repository
enable_health_check "true"
test_check_health "health_check_enabled_mocked_failure" "test_vllm_not_healthy"

# Unmock check_health()
unmock_vllm_async_llm_engine

if [ $RET -eq 0 ]; then
    echo -e "\n***\n*** Test Passed\n***"
else
    echo -e "\n***\n*** Test FAILED\n***"
fi
exit $RET
```
