Skip to content

Commit a9d231c

Browse files
authored
Fix bug caused by concurrent access to zmq (#3233)
1 parent b20ffe3 commit a9d231c

File tree

1 file changed

+7
-3
lines changed

1 file changed

+7
-3
lines changed

fastdeploy/splitwise/internal_adapter_utils.py

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -43,6 +43,7 @@ def __init__(self, cfg, engine, dp_rank):
4343
target=self._response_external_module_control_instruct, daemon=True
4444
)
4545
self.response_external_instruct_thread.start()
46+
self.response_lock = threading.Lock() # prevent concurrent calls to send_multipart in zmq
4647

4748
def _get_current_server_info(self):
4849
"""
@@ -76,7 +77,8 @@ def _recv_external_module_control_instruct(self):
7677
payload_info = self._get_current_server_info()
7778
result = {"task_id": task_id_str, "result": payload_info}
7879
logger.info(f"Response for task: {task_id_str}")
79-
self.recv_control_cmd_server.response_for_control_cmd(task_id_str, result)
80+
with self.response_lock:
81+
self.recv_control_cmd_server.response_for_control_cmd(task_id_str, result)
8082

8183
elif task["cmd"] == "get_metrics":
8284
metrics_text = get_filtered_metrics(
@@ -85,7 +87,8 @@ def _recv_external_module_control_instruct(self):
8587
)
8688
result = {"task_id": task_id_str, "result": metrics_text}
8789
logger.info(f"Response for task: {task_id_str}")
88-
self.recv_control_cmd_server.response_for_control_cmd(task_id_str, result)
90+
with self.response_lock:
91+
self.recv_control_cmd_server.response_for_control_cmd(task_id_str, result)
8992
elif task["cmd"] == "connect_rdma":
9093
self.engine.engine_worker_queue.put_connect_rdma_task(task)
9194

@@ -100,7 +103,8 @@ def _response_external_module_control_instruct(self):
100103
task_id_str = result_data["task_id"]
101104
result = {"task_id": task_id_str, "result": result_data}
102105
logger.info(f"Response for task: {task_id_str}")
103-
self.recv_control_cmd_server.response_for_control_cmd(task_id_str, result)
106+
with self.response_lock:
107+
self.recv_control_cmd_server.response_for_control_cmd(task_id_str, result)
104108
else:
105109
time.sleep(0.001)
106110
except Exception as e:

0 commit comments

Comments
 (0)