@@ -34,6 +34,7 @@ def __init__(self, cfg, engine, dp_rank):
34
34
self .engine = engine
35
35
self .dp_rank = dp_rank
36
36
recv_control_cmd_ports = envs .FD_ZMQ_CONTROL_CMD_SERVER_PORTS .split ("," )
37
+ self .response_lock = threading .Lock () # prevent concurrent calls to zmq send_multipart
37
38
self .recv_control_cmd_server = ZmqTcpServer (port = recv_control_cmd_ports [dp_rank ], mode = zmq .ROUTER )
38
39
self .recv_external_instruct_thread = threading .Thread (
39
40
target = self ._recv_external_module_control_instruct , daemon = True
@@ -43,7 +44,6 @@ def __init__(self, cfg, engine, dp_rank):
43
44
target = self ._response_external_module_control_instruct , daemon = True
44
45
)
45
46
self .response_external_instruct_thread .start ()
46
- self .response_lock = threading .Lock () # prevent to call send_multipart in zmq concurrently
47
47
48
48
def _get_current_server_info (self ):
49
49
"""
@@ -71,13 +71,17 @@ def _recv_external_module_control_instruct(self):
71
71
"""
72
72
while True :
73
73
try :
74
- task = self .recv_control_cmd_server .recv_control_cmd ()
74
+ with self .response_lock :
75
+ task = self .recv_control_cmd_server .recv_control_cmd ()
76
+ if task is None :
77
+ time .sleep (0.001 )
78
+ continue
75
79
logger .info (f"Recieve control task: { task } " )
76
80
task_id_str = task ["task_id" ]
77
81
if task ["cmd" ] == "get_payload" :
78
82
payload_info = self ._get_current_server_info ()
79
83
result = {"task_id" : task_id_str , "result" : payload_info }
80
- logger .info (f"Response for task: { task_id_str } " )
84
+ logger .debug (f"Response for task: { task_id_str } " )
81
85
with self .response_lock :
82
86
self .recv_control_cmd_server .response_for_control_cmd (task_id_str , result )
83
87
@@ -87,7 +91,7 @@ def _recv_external_module_control_instruct(self):
87
91
extra_register_func = lambda reg : main_process_metrics .register_all (reg , workers = 1 ),
88
92
)
89
93
result = {"task_id" : task_id_str , "result" : metrics_text }
90
- logger .info (f"Response for task: { task_id_str } " )
94
+ logger .debug (f"Response for task: { task_id_str } " )
91
95
with self .response_lock :
92
96
self .recv_control_cmd_server .response_for_control_cmd (task_id_str , result )
93
97
elif task ["cmd" ] == "connect_rdma" :
0 commit comments