@@ -111,6 +111,9 @@ def initialize(self, args):
111111 )
112112 self .output_dtype = pb_utils .triton_string_to_numpy (output_config ["data_type" ])
113113
114+ # Setup vLLM engine health check
115+ self ._setup_health_check ()
116+
114117 # Prepare vLLM engine
115118 self .init_engine ()
116119
@@ -131,6 +134,31 @@ def initialize(self, args):
131134 self ._shutdown_event = asyncio .Event ()
132135 self ._event_thread .start ()
133136
137+ def _setup_health_check (self ):
138+ # Check if health check should be enabled
139+ self ._enable_health_check = (
140+ "ENABLE_VLLM_HEALTH_CHECK" in self .model_config ["parameters" ]
141+ ) and (
142+ self .model_config ["parameters" ]["ENABLE_VLLM_HEALTH_CHECK" ][
143+ "string_value"
144+ ].lower ()
145+ in ["yes" , "true" ]
146+ )
147+ # Setup health check if enabled
148+ if self ._enable_health_check :
149+ # Only enable health check if there is exactly 1 instance
150+ num_instances = 0
151+ for group in self .model_config ["instance_group" ]:
152+ num_instances += group ["count" ]
153+ if num_instances != 1 :
154+ self .logger .log_warn (
155+ f"[vllm] Health check may only be enabled when the model has exactly 1 instance but { num_instances } are found"
156+ )
157+ self ._enable_health_check = False
158+ return
159+ # Set is healthy flag
160+ self ._is_healthy = True
161+
134162 def init_engine (self ):
135163 # Currently, Triton needs to use decoupled policy for asynchronously
136164 # forwarding requests to vLLM engine, so assert it.
@@ -542,6 +570,28 @@ def verify_loras(self, request):
542570 verified_request = request
543571 return verified_request
544572
def _check_health(self, requests):
    """Probe the vLLM engine's health from the request-handling thread.

    Schedules ``llm_engine.check_health()`` on the engine's event loop and
    blocks on the result. If the probe raises, the engine is considered
    permanently unhealthy: the model is asked to unload itself and every
    pending request in ``requests`` receives an UNAVAILABLE error response.

    Returns True when the engine is healthy, False otherwise.
    """
    health_future = asyncio.run_coroutine_threadsafe(
        self.llm_engine.check_health(), self._loop
    )
    try:
        health_future.result()
    except Exception as e:
        self.logger.log_error(f"[vllm] Engine is not healthy: {e}")
        # Request Triton to unload this model; the call is non-blocking.
        pb_utils.unload_model(self.model_config["name"])
        self._is_healthy = False
    if self._is_healthy:
        return True
    # Fail every queued request so callers are not left hanging while the
    # model unloads.
    for request in requests:
        response = pb_utils.InferenceResponse(
            error=pb_utils.TritonError(
                message="vLLM engine is not healthy",
                code=pb_utils.TritonError.UNAVAILABLE,
            )
        )
        request.get_response_sender().send(
            response,
            flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL,
        )
    return False
594+
545595 def execute (self , requests ):
546596 """
547597 Triton core issues requests to the backend via this method.
@@ -552,6 +602,8 @@ def execute(self, requests):
552602 is too loaded.
553603 We are pushing all the requests on vllm and let it handle the full traffic.
554604 """
605+ if self ._enable_health_check and not self ._check_health (requests ):
606+ return None
555607 for request in requests :
556608 request = self .verify_loras (request )
557609 if request is not None :
0 commit comments