@@ -288,7 +288,6 @@ def response_loop(self):
288288 if item is None :
289289 break
290290 response_sender , response , response_flag = item
291- del item
292291 try :
293292 response_sender .send (response , response_flag )
294293 except Exception as e :
@@ -298,9 +297,6 @@ def response_loop(self):
298297 finally :
299298 if response_flag == pb_utils .TRITONSERVER_RESPONSE_COMPLETE_FINAL :
300299 self .ongoing_request_count -= 1
301- del response_sender
302- if self .ongoing_request_count == 0 :
303- gc .collect ()
304300
305301 def create_response (self , vllm_output , prepend_input ):
306302 """
@@ -447,9 +443,6 @@ async def generate(self, request):
447443 finally :
448444 if decrement_ongoing_request_count :
449445 self .ongoing_request_count -= 1
450- del response_sender
451- if self .ongoing_request_count == 0 :
452- gc .collect ()
453446
454447 def verify_loras (self , request ):
455448 # We will check if the requested lora exists here, if not we will send a
@@ -527,3 +520,9 @@ def finalize(self):
527520 if self ._response_thread is not None :
528521 self ._response_thread .join ()
529522 self ._response_thread = None
523+
524+ # When using parallel tensors, the stub process may not shut down due to
525+ # unreleased references, so manually run the garbage collector once.
526+ self .logger .log_info ("[vllm] Running Garbage Collector on finalize..." )
527+ gc .collect ()
528+ self .logger .log_info ("[vllm] Garbage Collector on finalize... done" )
0 commit comments