@@ -244,17 +244,19 @@ async def loop_for_fwd(
244244 estimated_peak_token_count = self .shared_token_load .get_estimated_peak_token_count (d_i )
245245 logger .debug (
246246 f"dp_i { d_i } current batch size: { len (self .running_batch .reqs )} \n "
247- f"dp_i { d_i } paused req num: { self .req_queue .get_paused_req_num ()} \n "
247+ f"dp_i { d_i } paused req num: { self .req_queue .get_paused_req_num (d_i )} \n "
248248 f"dp_i { d_i } frozen token num: { frozen_token_num } \n "
249249 f"dp_i { d_i } estimated_peak_token_count: { estimated_peak_token_count } \n "
250250 f"dp_i { d_i } token used ratio: { token_ratio1 } not contain prompt cache tree unrefed token\n "
251251 f"dp_i { d_i } token used ratio: { token_ratio2 } contain prompt cache tree unrefed token"
252252 )
253+ self .metric_client .gauge_set (
254+ "lightllm_batch_pause_size" , self .req_queue .get_paused_req_num (d_i )
255+ )
253256 # pd decode mode need to update token_load more frequently
254257 self .req_queue .update_token_load (self .running_batch , force_update = self .is_pd_decode_mode )
255258 self .stats_tool .print_stats ()
256259 self .metric_client .gauge_set ("lightllm_batch_current_size" , len (self .running_batch .reqs ))
257- self .metric_client .gauge_set ("lightllm_batch_pause_size" , self .req_queue .get_paused_req_num ())
258260 self .metric_client .gauge_set ("lightllm_queue_size" , self .req_queue .get_wait_req_num ())
259261 self .metric_client .gauge_set (
260262 "lightllm_batch_current_max_tokens" ,
@@ -358,15 +360,13 @@ async def _step(self):
358360
359361 # Check if need pause some requests for decode.
360362 for dp_index in range (self .dp_size_in_node ):
361- if self ._can_decode (self .running_batch , dp_index = dp_index ):
362- continue
363- else :
363+ while not self ._can_decode (self .running_batch , dp_index = dp_index ):
364364 # pause strategy
365365 paused_reqs = select_paused_reqs (
366366 self .running_batch , self .pause_strategy , self .req_queue , self .max_total_token_num , dp_index = dp_index
367367 )
368368 await self ._pause_reqs (paused_reqs )
369- logger .debug (f"DP index { dp_index } pasues req num: { self .req_queue .get_paused_req_num ()} " )
369+ logger .debug (f"DP index { dp_index } pasues req num: { self .req_queue .get_paused_req_num (dp_index )} " )
370370 self .has_wait_tokens = 0
371371
372372 # Decode
0 commit comments