1 file changed: +10 -5 lines changed
lightllm/server/router/model_infer/mode_backend/pd_nixl/decode_node_impl
@@ -199,10 +199,15 @@ def accept_peer_task_loop(
199199
200200 if local_trans_task is None :
201201 remote_trans_task .error_info = "peer not find"
202- self .transporter .send_notify_to_prefill_node (
203- prefill_agent_name = remote_agent_name ,
204- notify = pickle .dumps (remote_trans_task .createRetObj ()),
205- )
202+ try :
203+ self .transporter .send_notify_to_prefill_node (
204+ prefill_agent_name = remote_agent_name ,
205+ notify = pickle .dumps (remote_trans_task .createRetObj ()),
206+ )
207+ except BaseException as e :
208+ logger .error (f"send notify to prefill node failed: { str (e )} " )
209+ logger .exception (str (e ))
210+ self .transporter .remove_remote_agent (peer_name = remote_agent_name )
206211 else :
207212 local_trans_task .nixl_src_page_index = remote_trans_task .nixl_src_page_index
208213
@@ -257,7 +262,7 @@ def read_peer_kv_loop(self):
257262 except BaseException as e :
258263 logger .error (f"read_blocks_paged node failed: { local_trans_task .to_str ()} " )
259264 logger .exception (str (e ))
260- self .transporter .remove_remote_agent (peer_name = local_trans_task .decode_agent_name )
265+ self .transporter .remove_remote_agent (peer_name = local_trans_task .prefill_agent_name )
261266 local_trans_task .error_info = f"read_blocks_paged failed: { str (e )} "
262267 self .failed_queue .put (local_trans_task )
263268 continue
You can’t perform that action at this time.
0 commit comments