File tree: 1 file changed, +14 -4 lines changed
Original file line number | Diff line number | Diff line change
@@ -264,12 +264,21 @@ async def generate(self, request):
264264 self .logger .log_info ("[vllm] Successfully cancelled the request" )
265265 break
266266 if stream :
267- response_sender .send (self .create_response (output ))
267+ if output .finished :
268+ response_sender .send (
269+ self .create_response (output ),
270+ flags = pb_utils .TRITONSERVER_RESPONSE_COMPLETE_FINAL ,
271+ )
272+ else :
273+ response_sender .send (self .create_response (output ))
268274 else :
269275 last_output = output
270276
271277 if not stream :
272- response_sender .send (self .create_response (last_output ))
278+ response_sender .send (
279+ self .create_response (last_output ),
280+ flags = pb_utils .TRITONSERVER_RESPONSE_COMPLETE_FINAL ,
281+ )
273282
274283 except Exception as e :
275284 self .logger .log_info (f"[vllm] Error generating stream: { e } " )
@@ -280,10 +289,11 @@ async def generate(self, request):
280289 response = pb_utils .InferenceResponse (
281290 output_tensors = [triton_output_tensor ], error = error
282291 )
283- response_sender .send (response )
292+ response_sender .send (
293+ response , flags = pb_utils .TRITONSERVER_RESPONSE_COMPLETE_FINAL
294+ )
284295 raise e
285296 finally :
286- response_sender .send (flags = pb_utils .TRITONSERVER_RESPONSE_COMPLETE_FINAL )
287297 self .ongoing_request_count -= 1
288298
289299 def execute (self , requests ):
You can’t perform that action at this time.
0 commit comments