feat: Auto unload model if vLLM health check failed #73
Merged
18 commits:
- ca13f02 [WIP] Add vLLM health check (kthui)
- ed62ba4 [WIP] Add L0_check_health_vllm (kthui)
- 4803ee0 [chore] Define server for tests (kthui)
- 33367f6 Allow health check to be enableable for all instance counts (kthui)
- 6946b89 Update unhealthy unload message (kthui)
- 3427a3a Update unhealthy unload message content (kthui)
- 14b66a0 Update unhealthy unload message content (kthui)
- 1d58300 Update unhealthy unload message content (kthui)
- b986aea [WIP] Enable backend specifc API for model load/unload (kthui)
- 7c9baf5 Revert "[WIP] Enable backend specifc API for model load/unload" (kthui)
- a0822ff [docs] Add vLLM health check docs (kthui)
- 8360cfc Merge branch 'main' of github.com:triton-inference-server/vllm_backen… (kthui)
- eb838cd Include model_repository_index API into _assert_model_ready() (kthui)
- 7188485 [docs] Enhance vLLM health check docs (kthui)
- 8376651 Minor fixes (kthui)
- b6bd649 Update docs and messages (kthui)
- a26a083 [chore] Set bool config param to accept only true/false - compat brea… (kthui)
- d4ac650 [chore] More metrics flag value update (kthui)
check_health_test.py (added, +128 lines):

```python
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import json
import os

import numpy as np
import pytest
import tritonclient.grpc as grpcclient


class TestCheckHealth:
    _grpc_url = "localhost:8001"
    _model_name = "vllm_opt"
    _sampling_parameters = {"temperature": "0", "top_p": "1"}
    _prompt = "In this example,"

    def _get_inputs(self, prompt, stream=True, sampling_parameters=None):
        inputs = []

        inputs.append(grpcclient.InferInput("text_input", [1], "BYTES"))
        inputs[-1].set_data_from_numpy(
            np.array([prompt.encode("utf-8")], dtype=np.object_)
        )

        inputs.append(grpcclient.InferInput("stream", [1], "BOOL"))
        inputs[-1].set_data_from_numpy(np.array([stream], dtype=bool))

        if sampling_parameters is not None:
            inputs.append(grpcclient.InferInput("sampling_parameters", [1], "BYTES"))
            inputs[-1].set_data_from_numpy(
                np.array(
                    [json.dumps(sampling_parameters).encode("utf-8")], dtype=np.object_
                )
            )

        return inputs

    def _callback(self, result, error):
        self._responses.append({"result": result, "error": error})

    def _llm_infer(self):
        inputs = self._get_inputs(
            self._prompt, stream=True, sampling_parameters=self._sampling_parameters
        )
        self._responses = []
        with grpcclient.InferenceServerClient(self._grpc_url) as client:
            client.start_stream(self._callback)
            client.async_stream_infer(
                self._model_name, inputs=inputs, parameters=self._sampling_parameters
            )
            client.stop_stream()

    def _assert_text_output_valid(self):
        text_output = ""
        for response in self._responses:
            result, error = response["result"], response["error"]
            assert error is None
            text_output += result.as_numpy(name="text_output")[0].decode("utf-8")
        assert len(text_output) > 0, "output is empty"
        assert text_output.count(" ") > 4, "output is not a sentence"

    def _assert_infer_exception(self, expected_exception_message):
        assert len(self._responses) == 1
        for response in self._responses:
            result, error = response["result"], response["error"]
            assert result is None
            assert str(error) == expected_exception_message

    def _assert_model_ready(self, expected_readiness):
        with grpcclient.InferenceServerClient(self._grpc_url) as client:
            assert client.is_model_ready(self._model_name) == expected_readiness

    def test_vllm_is_healthy(self):
        num_repeats = 3
        for i in range(num_repeats):
            self._assert_model_ready(True)
            self._llm_infer()
            self._assert_text_output_valid()
        self._assert_model_ready(True)

    def test_vllm_not_healthy(self):
        self._assert_model_ready(True)
        # The 1st infer should complete successfully
        self._llm_infer()
        self._assert_text_output_valid()
        self._assert_model_ready(True)
        # The 2nd infer should begin with health check failed
        self._llm_infer()
        self._assert_infer_exception("vLLM engine is not healthy")
        self._assert_model_ready(False)
        # The 3rd infer should have model not found
        self._llm_infer()
        self._assert_infer_exception(
            "Request for unknown model: 'vllm_opt' has no available versions"
        )
        self._assert_model_ready(False)

    def test_vllm_enable_health_check_multi_instance(self):
        with open(os.environ["SERVER_LOG"]) as f:
            server_log = f.read()
        expected_vllm_warning = "[vllm] Health check may only be enabled when the model has exactly 1 instance but 2 are found"
        assert expected_vllm_warning in server_log
        # Health check should be disabled
        self.test_vllm_is_healthy()
```
mock_async_llm_engine.py (added, +36 lines):
```python
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# (same BSD license header as check_health_test.py)

from vllm.engine.async_llm_engine import AsyncLLMEngine as real_AsyncLLMEngine


class mock_AsyncLLMEngine(real_AsyncLLMEngine):
    _mock_check_health_count = 0

    async def check_health(self) -> None:
        self._mock_check_health_count += 1
        if self._mock_check_health_count > 1:
            raise RuntimeError("Simulated vLLM check_health() failure")
```
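The test script installs this mock by rewriting the backend's import line with `sed` inside the container. The same substitution can be made in-process by registering a synthetic module in `sys.modules` before the import runs; a hedged sketch that does not require `vllm` (the stand-in class below only mimics the mock's failure, it does not subclass the real engine):

```python
import sys
import types


# Illustrative stand-in; the real mock subclasses vLLM's AsyncLLMEngine.
class mock_AsyncLLMEngine:
    async def check_health(self) -> None:
        raise RuntimeError("Simulated vLLM check_health() failure")


# Register a synthetic module so downstream code can import it by name.
module = types.ModuleType("mock_async_llm_engine")
module.mock_AsyncLLMEngine = mock_AsyncLLMEngine
sys.modules["mock_async_llm_engine"] = module

# This import matches the line that sed writes into model.py.
from mock_async_llm_engine import mock_AsyncLLMEngine as AsyncLLMEngine

print(AsyncLLMEngine is mock_AsyncLLMEngine)  # prints True
```

The file-level `sed` approach used in CI has the advantage of exercising the backend exactly as shipped, whereas `sys.modules` injection only works when you control the interpreter that imports the backend.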
Test runner script (added, +141 lines):
```shell
#!/bin/bash
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# (same BSD license header as check_health_test.py)

export CUDA_VISIBLE_DEVICES=0
source ../common/util.sh

pip3 install pytest==8.1.1
pip3 install tritonclient[grpc]

RET=0

function setup_model_repository {
    local sample_model_repo_path=${1:-"../../samples/model_repository"}
    rm -rf models vllm_baseline_output.pkl && mkdir -p models
    cp -r $sample_model_repo_path/vllm_model models/vllm_opt
}

function setup_model_repository_with_multi_instances {
    setup_model_repository
    echo -e "backend: \"vllm\"" > models/vllm_opt/config.pbtxt
    echo -e "instance_group [" >> models/vllm_opt/config.pbtxt
    echo -e " { kind: KIND_MODEL }," >> models/vllm_opt/config.pbtxt
    echo -e " { kind: KIND_MODEL \n count: 1 }" >> models/vllm_opt/config.pbtxt
    echo -e "]" >> models/vllm_opt/config.pbtxt
}

function enable_health_check {
    local enable_vllm_health_check="$1"
    echo -e "parameters: {" >> models/vllm_opt/config.pbtxt
    echo -e " key: \"ENABLE_VLLM_HEALTH_CHECK\"" >> models/vllm_opt/config.pbtxt
    echo -e " value: { string_value: \"$enable_vllm_health_check\" }" >> models/vllm_opt/config.pbtxt
    echo -e "}" >> models/vllm_opt/config.pbtxt
}

function mock_vllm_async_llm_engine {
    mv /opt/tritonserver/backends/vllm/model.py /opt/tritonserver/backends/vllm/.model.py.backup
    cp /opt/tritonserver/backends/vllm/.model.py.backup /opt/tritonserver/backends/vllm/model.py
    sed -i 's/from vllm.engine.async_llm_engine import AsyncLLMEngine/from mock_async_llm_engine import mock_AsyncLLMEngine as AsyncLLMEngine/' /opt/tritonserver/backends/vllm/model.py
    cp mock_async_llm_engine.py /opt/tritonserver/backends/vllm
}

function unmock_vllm_async_llm_engine {
    rm -f /opt/tritonserver/backends/vllm/mock_async_llm_engine.py /opt/tritonserver/backends/vllm/model.py
    mv /opt/tritonserver/backends/vllm/.model.py.backup /opt/tritonserver/backends/vllm/model.py
}

function test_check_health {
    local test_name="$1"
    local unit_test_name="$2"

    SERVER_LOG="$test_name.server.log"
    SERVER_ARGS="--model-repository=models --model-control-mode=explicit --load-model=*"
    run_server
    if [ "$SERVER_PID" == "0" ]; then
        echo -e "\n***\n*** Failed to start $SERVER\n***"
        cat $SERVER_LOG
        exit 1
    fi

    set +e
    SERVER_LOG=$SERVER_LOG python3 -m pytest --junitxml=$test_name.report.xml -s -v check_health_test.py::TestCheckHealth::$unit_test_name > $test_name.log
    if [ $? -ne 0 ]; then
        echo -e "\n***\n*** $test_name FAILED. \n***"
        RET=1
    fi
    set -e

    kill $SERVER_PID
    wait $SERVER_PID
}

# Test health check unspecified
setup_model_repository
test_check_health "health_check_unspecified" "test_vllm_is_healthy"

# Test health check disabled
setup_model_repository
enable_health_check "false"
test_check_health "health_check_disabled" "test_vllm_is_healthy"

# Test health check enabled
setup_model_repository
enable_health_check "true"
test_check_health "health_check_enabled" "test_vllm_is_healthy"

# Mock check_health() from vLLM
mock_vllm_async_llm_engine

# Test health check unspecified with mocked vLLM check_health() failure
setup_model_repository
test_check_health "health_check_unspecified_mocked_failure" "test_vllm_is_healthy"

# Test health check disabled with mocked vLLM check_health() failure
setup_model_repository
enable_health_check "false"
test_check_health "health_check_disabled_mocked_failure" "test_vllm_is_healthy"

# Test health check enabled with mocked vLLM check_health() failure
setup_model_repository
enable_health_check "true"
test_check_health "health_check_enabled_mocked_failure" "test_vllm_not_healthy"

# Test health check enabled with mocked vLLM check_health() failure when there
# are multiple instances
setup_model_repository_with_multi_instances
enable_health_check "true"
test_check_health "health_check_enabled_multi_instance_mocked_failure" "test_vllm_enable_health_check_multi_instance"

# Unmock check_health()
unmock_vllm_async_llm_engine

if [ $RET -eq 0 ]; then
    echo -e "\n***\n*** Test Passed\n***"
else
    echo -e "\n***\n*** Test FAILED\n***"
fi
exit $RET
```
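For reference, this is the `models/vllm_opt/config.pbtxt` that `setup_model_repository_with_multi_instances` followed by `enable_health_check "true"` produces, reconstructed directly from the `echo -e` lines above (the `\n` inside the echoed instance_group string expands to a newline):

```proto
backend: "vllm"
instance_group [
 { kind: KIND_MODEL },
 { kind: KIND_MODEL
 count: 1 }
]
parameters: {
 key: "ENABLE_VLLM_HEALTH_CHECK"
 value: { string_value: "true" }
}
```

With two KIND_MODEL instances configured, the backend logs the "Health check may only be enabled when the model has exactly 1 instance but 2 are found" warning asserted by `test_vllm_enable_health_check_multi_instance` and falls back to running without the health check.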