@@ -47,17 +47,19 @@ set_bucketing(){
4747 input_max=${input_max:- 1024}
4848 output_max=${output_max:- 2048}
4949 block_size=${block_size:- 128}
50+ BUCKET_PADDING_RATIO=${BUCKET_PADDING_RATIO:- " 0.25" } # tune this to balance warmup time and runtime performance
5051
5152 prompt_bs_step=1
5253 prompt_bs_min=1
5354 prompt_bs_max=$(( $max_num_batched_tokens / $input_min ))
5455 # prompt_bs_max = min(prompt_bs_max, max_num_seqs)
5556 prompt_bs_max=$(( $prompt_bs_max > $max_num_seqs ? $max_num_seqs : $prompt_bs_max ))
5657 # prompt_bs_max = CEILING.MATH(prompt_bs_max, prompt_bs_step)
57- prompt_bs_max=$(( ($prompt_bs_max + $prompt_bs_step - 1 ) / $prompt_bs_step * $prompt_bs_step ))
58+ prompt_bs_max=$(( ($prompt_bs_max + $prompt_bs_step - 1 ) / $prompt_bs_step * $prompt_bs_step ))
5859 export VLLM_PROMPT_BS_BUCKET_MIN=${VLLM_PROMPT_BS_BUCKET_MIN:- $prompt_bs_min }
5960 export VLLM_PROMPT_BS_BUCKET_STEP=${VLLM_PROMPT_BS_BUCKET_STEP:- $prompt_bs_step }
6061 export VLLM_PROMPT_BS_BUCKET_MAX=${VLLM_PROMPT_BS_BUCKET_MAX:- $prompt_bs_max }
62+ export VLLM_PROMPT_BS_BUCKET_LIMIT=${BUCKET_PADDING_RATIO}
6163
6264 prompt_seq_step=128
6365 # prompt_seq_min = CEILING.MATH(input_min, prompt_seq_step)
@@ -67,6 +69,7 @@ set_bucketing(){
6769 export VLLM_PROMPT_SEQ_BUCKET_MIN=${VLLM_PROMPT_SEQ_BUCKET_MIN:- $prompt_seq_min }
6870 export VLLM_PROMPT_SEQ_BUCKET_STEP=${VLLM_PROMPT_SEQ_BUCKET_STEP:- $prompt_seq_step }
6971 export VLLM_PROMPT_SEQ_BUCKET_MAX=${VLLM_PROMPT_SEQ_BUCKET_MAX:- $prompt_seq_max }
72+ export VLLM_PROMPT_SEQ_BUCKET_LIMIT=${BUCKET_PADDING_RATIO}
7073
7174 # decode_bs_step = ROUNDUP(max_num_seqs / 16, 0)
7275 decode_bs_step=$(( ($max_num_seqs + 15 ) / 16 ))
@@ -77,6 +80,7 @@ set_bucketing(){
7780 export VLLM_DECODE_BS_BUCKET_MIN=${VLLM_DECODE_BS_BUCKET_MIN:- $decode_bs_min }
7881 export VLLM_DECODE_BS_BUCKET_STEP=${VLLM_DECODE_BS_BUCKET_STEP:- $decode_bs_step }
7982 export VLLM_DECODE_BS_BUCKET_MAX=${VLLM_DECODE_BS_BUCKET_MAX:- $decode_bs_max }
83+ export VLLM_DECODE_BS_BUCKET_LIMIT=${BUCKET_PADDING_RATIO}
8084
8185 decode_block_step=$decode_bs_max
8286 # decode_block_min = ROUNDUP(input_min / block_size, 0)
@@ -88,6 +92,7 @@ set_bucketing(){
8892 export VLLM_DECODE_BLOCK_BUCKET_MIN=${VLLM_DECODE_BLOCK_BUCKET_MIN:- $decode_block_min }
8993 export VLLM_DECODE_BLOCK_BUCKET_STEP=${VLLM_DECODE_BLOCK_BUCKET_STEP:- $decode_block_step }
9094 export VLLM_DECODE_BLOCK_BUCKET_MAX=${VLLM_DECODE_BLOCK_BUCKET_MAX:- $decode_block_max }
95+ export VLLM_DECODE_BLOCK_BUCKET_LIMIT=${BUCKET_PADDING_RATIO}
9196}
9297
9398# clean existing INC scale
0 commit comments