Skip to content

Commit ca13f02

Browse files
committed
[WIP] Add vLLM health check
* [WIP] vLLM check_health() is async
* [WIP] Fix model name query
* [WIP] Health check may only be enabled when instance count is 1
1 parent b71088a commit ca13f02

File tree

1 file changed

+52
-0
lines changed

1 file changed

+52
-0
lines changed

src/model.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,9 @@ def initialize(self, args):
111111
)
112112
self.output_dtype = pb_utils.triton_string_to_numpy(output_config["data_type"])
113113

114+
# Setup vLLM engine health check
115+
self._setup_health_check()
116+
114117
# Prepare vLLM engine
115118
self.init_engine()
116119

@@ -131,6 +134,31 @@ def initialize(self, args):
131134
self._shutdown_event = asyncio.Event()
132135
self._event_thread.start()
133136

137+
def _setup_health_check(self):
    """Decide whether the vLLM engine health check is active for this model.

    Reads the optional ``ENABLE_VLLM_HEALTH_CHECK`` entry from
    ``self.model_config["parameters"]``. The check is requested when the
    entry's ``string_value`` is "yes" or "true" (case-insensitive). The
    request is honoured only when the ``count`` values of all
    ``instance_group`` entries add up to exactly 1; otherwise a warning is
    logged and the check is switched off.

    Sets:
        self._enable_health_check: True only when the check is requested
            and the model has exactly one instance.
        self._is_healthy: initialised to True on every path.
    """
    # Define the flag unconditionally so the attribute exists even when the
    # health check ends up disabled.
    self._is_healthy = True

    # Check if health check should be enabled
    parameter = self.model_config["parameters"].get("ENABLE_VLLM_HEALTH_CHECK")
    self._enable_health_check = parameter is not None and parameter[
        "string_value"
    ].lower() in ("yes", "true")
    if not self._enable_health_check:
        return

    # Only enable health check if there is exactly 1 instance
    num_instances = sum(
        group["count"] for group in self.model_config["instance_group"]
    )
    if num_instances != 1:
        self.logger.log_warn(
            f"[vllm] Health check may only be enabled when the model has exactly 1 instance but {num_instances} are found"
        )
        self._enable_health_check = False
161+
134162
def init_engine(self):
135163
# Currently, Triton needs to use decoupled policy for asynchronously
136164
# forwarding requests to vLLM engine, so assert it.
@@ -542,6 +570,28 @@ def verify_loras(self, request):
542570
verified_request = request
543571
return verified_request
544572

573+
def _check_health(self, requests):
    """Probe the vLLM engine and reject ``requests`` if it is unhealthy.

    While the model is still marked healthy, ``llm_engine.check_health()``
    is scheduled on ``self._loop`` and this call blocks until it finishes.
    If the probe raises, the error is logged, an unload of this model is
    requested, and the model is marked unhealthy. The unhealthy state is
    never reset here, so later calls skip the probe and the unload request.

    Whenever the model is unhealthy, every request in ``requests`` receives
    a final error response with code UNAVAILABLE.

    Args:
        requests: iterable of inference requests for the current execute call.

    Returns:
        True if the engine is healthy, False otherwise.
    """
    # Only probe while still healthy: once the flag is False it stays False,
    # so probing again would only block on the loop and repeat the unload.
    if self._is_healthy:
        coro = self.llm_engine.check_health()
        future = asyncio.run_coroutine_threadsafe(coro, self._loop)
        try:
            future.result()
        except Exception as e:
            self.logger.log_error(f"[vllm] Engine is not healthy: {e}")
            pb_utils.unload_model(self.model_config["name"])  # non-blocking
            self._is_healthy = False
    if not self._is_healthy:
        for request in requests:
            request.get_response_sender().send(
                pb_utils.InferenceResponse(
                    error=pb_utils.TritonError(
                        message="vLLM engine is not healthy",
                        code=pb_utils.TritonError.UNAVAILABLE,
                    )
                ),
                flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL,
            )
    return self._is_healthy
594+
545595
def execute(self, requests):
546596
"""
547597
Triton core issues requests to the backend via this method.
@@ -552,6 +602,8 @@ def execute(self, requests):
552602
is too loaded.
553603
We are pushing all the requests on vllm and let it handle the full traffic.
554604
"""
605+
if self._enable_health_check and not self._check_health(requests):
606+
return None
555607
for request in requests:
556608
request = self.verify_loras(request)
557609
if request is not None:

0 commit comments

Comments
 (0)