@@ -66,10 +66,10 @@ def __init__(self, args, router_port, detokenization_port, model_rpc_ports, metr
6666
6767 context = zmq .asyncio .Context (2 )
6868 self .recv_from_httpserver = context .socket (zmq .PULL )
69- self .recv_from_httpserver .bind (f"tcp:// 127.0.0.1:{ router_port } " )
69+ self .recv_from_httpserver .bind (f"{ args . zmq_mode } 127.0.0.1:{ router_port } " )
7070
7171 self .send_to_detokenization = context .socket (zmq .PUSH )
72- self .send_to_detokenization .connect (f"tcp:// 127.0.0.1:{ detokenization_port } " )
72+ self .send_to_detokenization .connect (f"{ args . zmq_mode } 127.0.0.1:{ detokenization_port } " )
7373 self .model_rpc_ports = model_rpc_ports
7474
7575 self .is_splitfuse_mode = args .splitfuse_mode
@@ -283,14 +283,15 @@ async def _step(self):
283283 self .running_batch = new_batch
284284 await self ._prefill_batch (self .running_batch )
285285 self ._filter_runing_batch ()
286- self .has_wait_tokens = 0
286+ self .has_wait_tokens = self . max_wait_tokens
287287 return
288288
289289 # 有运行请求,但是已经到了可以调度新的请求合并推理的时机
290290 if self .has_wait_tokens >= self .max_wait_tokens :
291291 new_mini_batch = self .req_queue .generate_new_batch (self .running_batch )
292292 self .has_wait_tokens = 0
293293 if new_mini_batch is not None :
294+ self .has_wait_tokens = self .max_wait_tokens
294295 self .stats_tool .count_prompt_tokens (new_mini_batch )
295296 await self ._prefill_batch (new_mini_batch )
296297 if not new_mini_batch .is_clear ():
@@ -426,6 +427,9 @@ def _update_init_status_to_batch(self, batch: Batch, req_to_req_status):
426427
427428 def _update_out_status_to_batch (self , batch : Batch , req_to_out_status ):
428429 new_batch_decode_need_tokens = [0 for _ in range (self .dp_size )] # 只有在 splitfuse 模式下有意义
430+
431+ start_time = 0
432+ # extral_info 字段如果推理后端输入时间标记, 则用来评估序列化所占用的时间, 主要用于调试时使用
429433 for req_id , (
430434 req_status ,
431435 cur_kv_len ,
@@ -434,6 +438,8 @@ def _update_out_status_to_batch(self, batch: Batch, req_to_out_status):
434438 finish_status_value ,
435439 extral_info ,
436440 ) in req_to_out_status .items ():
441+ if extral_info is not None :
442+ start_time = max (start_time , extral_info )
437443 req : Req = batch .id_to_reqs [req_id ]
438444 req .req_status = req_status
439445 req .cur_kv_len = cur_kv_len
@@ -446,6 +452,9 @@ def _update_out_status_to_batch(self, batch: Batch, req_to_out_status):
446452 new_batch_decode_need_tokens [req_dp_index ] += req .get_decode_need_tokens ()
447453
448454 batch .batch_decode_need_tokens = new_batch_decode_need_tokens
455+ rpyc_cost_time = (time .time () - start_time ) * 1000
456+ if 8 <= rpyc_cost_time <= 1000 :
457+ logger .warning (f"rpyc use too much time { rpyc_cost_time } ms, batch_size { len (req_to_out_status )} " )
449458 return
450459
451460 def _can_decode (self , batch : Batch ):
0 commit comments