Allow health check to be enabled for any instance count

kthui · kthui · commit 33367f64b7c0 · 2024-11-20T16:16:29.000-08:00
diff --git a/ci/L0_check_health_vllm/check_health_test.py b/ci/L0_check_health_vllm/check_health_test.py
@@ -25,10 +25,8 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import json
-import os
 
 import numpy as np
-import pytest
 import tritonclient.grpc as grpcclient
 
 
@@ -118,11 +116,3 @@ def test_vllm_not_healthy(self):
             "Request for unknown model: 'vllm_opt' has no available versions"
         )
         self._assert_model_ready(False)
-
-    def test_vllm_enable_health_check_multi_instance(self):
-        with open(os.environ["SERVER_LOG"]) as f:
-            server_log = f.read()
-        expected_vllm_warning = "[vllm] Health check may only be enabled when the model has exactly 1 instance but 2 are found"
-        assert expected_vllm_warning in server_log
-        # Health check should be disabled
-        self.test_vllm_is_healthy()
diff --git a/ci/L0_check_health_vllm/test.sh b/ci/L0_check_health_vllm/test.sh
@@ -39,15 +39,6 @@ function setup_model_repository {
     cp -r $sample_model_repo_path/vllm_model models/vllm_opt
 }
 
-function setup_model_repository_with_multi_instances {
-    setup_model_repository
-    echo -e "backend: \"vllm\"" > models/vllm_opt/config.pbtxt
-    echo -e "instance_group [" >> models/vllm_opt/config.pbtxt
-    echo -e "  { kind: KIND_MODEL }," >> models/vllm_opt/config.pbtxt
-    echo -e "  { kind: KIND_MODEL \n count: 1 }" >> models/vllm_opt/config.pbtxt
-    echo -e "]" >> models/vllm_opt/config.pbtxt
-}
-
 function enable_health_check {
     local enable_vllm_health_check="$1"
     echo -e "parameters: {" >> models/vllm_opt/config.pbtxt
@@ -82,7 +73,7 @@ function test_check_health {
     fi
 
     set +e
-    SERVER_LOG=$SERVER_LOG python3 -m pytest --junitxml=$test_name.report.xml -s -v check_health_test.py::TestCheckHealth::$unit_test_name > $test_name.log
+    python3 -m pytest --junitxml=$test_name.report.xml -s -v check_health_test.py::TestCheckHealth::$unit_test_name > $test_name.log
     if [ $? -ne 0 ]; then
         echo -e "\n***\n*** $test_name FAILED. \n***"
         RET=1
@@ -124,12 +115,6 @@ setup_model_repository
 enable_health_check "true"
 test_check_health "health_check_enabled_mocked_failure" "test_vllm_not_healthy"
 
-# Test health check enabled with mocked vLLM check_health() failure when there
-# are multiple instances
-setup_model_repository_with_multi_instances
-enable_health_check "true"
-test_check_health "health_check_enabled_multi_instance_mocked_failure" "test_vllm_enable_health_check_multi_instance"
-
 # Unmock check_health()
 unmock_vllm_async_llm_engine
 
diff --git a/src/model.py b/src/model.py
@@ -112,7 +112,10 @@ def initialize(self, args):
         self.output_dtype = pb_utils.triton_string_to_numpy(output_config["data_type"])
 
         # Setup vLLM engine health check
-        self._setup_health_check()
+        self._enable_health_check = self._get_bool_config_param(
+            "ENABLE_VLLM_HEALTH_CHECK"
+        )
+        self._is_healthy = True
 
         # Prepare vLLM engine
         self.init_engine()
@@ -134,31 +137,6 @@ def initialize(self, args):
         self._shutdown_event = asyncio.Event()
         self._event_thread.start()
 
-    def _setup_health_check(self):
-        # Check if health check should be enabled
-        self._enable_health_check = (
-            "ENABLE_VLLM_HEALTH_CHECK" in self.model_config["parameters"]
-        ) and (
-            self.model_config["parameters"]["ENABLE_VLLM_HEALTH_CHECK"][
-                "string_value"
-            ].lower()
-            in ["yes", "true"]
-        )
-        # Setup health check if enabled
-        if self._enable_health_check:
-            # Only enable health check if there is exactly 1 instance
-            num_instances = 0
-            for group in self.model_config["instance_group"]:
-                num_instances += group["count"]
-            if num_instances != 1:
-                self.logger.log_warn(
-                    f"[vllm] Health check may only be enabled when the model has exactly 1 instance but {num_instances} are found"
-                )
-                self._enable_health_check = False
-                return
-            # Set is healthy flag
-            self._is_healthy = True
-
     def init_engine(self):
         # Currently, Triton needs to use decoupled policy for asynchronously
         # forwarding requests to vLLM engine, so assert it.
@@ -191,9 +169,7 @@ def init_engine(self):
         # Create vLLM custom metrics
         self.vllm_metrics = None
         if (
-            "REPORT_CUSTOM_METRICS" in self.model_config["parameters"]
-            and self.model_config["parameters"]["REPORT_CUSTOM_METRICS"]["string_value"]
-            == "yes"
+            self._get_bool_config_param("REPORT_CUSTOM_METRICS")
             and not aync_engine_args.disable_log_stats
         ):
             try:
@@ -214,6 +190,12 @@ def init_engine(self):
                 else:
                     raise e
 
+    def _get_bool_config_param(self, param_name: str) -> bool:
+        return (param_name in self.model_config["parameters"]) and (
+            self.model_config["parameters"][param_name]["string_value"].lower()
+            in ["yes", "true"]
+        )
+
     def setup_lora(self):
         self.enable_lora = False