Skip to content

Commit 5af52bf

Browse files
author
wangzaijun
committed
fix
1 parent 5b53069 commit 5af52bf

File tree

1 file changed

+10
-5
lines changed

1 file changed

+10
-5
lines changed

lightllm/server/router/model_infer/mode_backend/pd_nixl/decode_node_impl/decode_trans_process.py

Lines changed: 10 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -199,10 +199,15 @@ def accept_peer_task_loop(
199199

200200
if local_trans_task is None:
201201
remote_trans_task.error_info = "peer not find"
202-
self.transporter.send_notify_to_prefill_node(
203-
prefill_agent_name=remote_agent_name,
204-
notify=pickle.dumps(remote_trans_task.createRetObj()),
205-
)
202+
try:
203+
self.transporter.send_notify_to_prefill_node(
204+
prefill_agent_name=remote_agent_name,
205+
notify=pickle.dumps(remote_trans_task.createRetObj()),
206+
)
207+
except BaseException as e:
208+
logger.error(f"send notify to prefill node failed: {str(e)}")
209+
logger.exception(str(e))
210+
self.transporter.remove_remote_agent(peer_name=remote_agent_name)
206211
else:
207212
local_trans_task.nixl_src_page_index = remote_trans_task.nixl_src_page_index
208213

@@ -257,7 +262,7 @@ def read_peer_kv_loop(self):
257262
except BaseException as e:
258263
logger.error(f"read_blocks_paged node failed: {local_trans_task.to_str()}")
259264
logger.exception(str(e))
260-
self.transporter.remove_remote_agent(peer_name=local_trans_task.decode_agent_name)
265+
self.transporter.remove_remote_agent(peer_name=local_trans_task.prefill_agent_name)
261266
local_trans_task.error_info = f"read_blocks_paged failed: {str(e)}"
262267
self.failed_queue.put(local_trans_task)
263268
continue

0 commit comments

Comments
 (0)