Skip to content

Commit 7408e3a

Browse files
authored
Add BUCKET_PADDING_RATIO to set the bucket padding ratio limit (#2029)
Works with HabanaAI/vllm-hpu-extension#379
1 parent e0c8be6 commit 7408e3a

File tree

3 files changed: 19 additions (+19) and 9 deletions (-9)

3 files changed

+19
-9
lines changed

scripts/quickstart/set_worker_node.sh

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -65,10 +65,10 @@ export VLLM_GRAPH_PROMPT_RATIO=0
6565

6666
#export VLLM_SKIP_WARMUP=true
6767

68-
unset VLLM_PROMPT_BS_BUCKET_MIN VLLM_PROMPT_BS_BUCKET_STEP VLLM_PROMPT_BS_BUCKET_MAX
69-
unset VLLM_PROMPT_SEQ_BUCKET_MIN VLLM_PROMPT_SEQ_BUCKET_STEP VLLM_PROMPT_SEQ_BUCKET_MAX
70-
unset VLLM_DECODE_BS_BUCKET_MIN VLLM_DECODE_BS_BUCKET_STEP VLLM_DECODE_BS_BUCKET_MAX
71-
unset VLLM_DECODE_BLOCK_BUCKET_MIN VLLM_DECODE_BLOCK_BUCKET_STEP VLLM_DECODE_BLOCK_BUCKET_MAX
68+
unset VLLM_PROMPT_BS_BUCKET_MIN VLLM_PROMPT_BS_BUCKET_STEP VLLM_PROMPT_BS_BUCKET_MAX VLLM_PROMPT_BS_BUCKET_LIMIT
69+
unset VLLM_PROMPT_SEQ_BUCKET_MIN VLLM_PROMPT_SEQ_BUCKET_STEP VLLM_PROMPT_SEQ_BUCKET_MAX VLLM_PROMPT_SEQ_BUCKET_LIMIT
70+
unset VLLM_DECODE_BS_BUCKET_MIN VLLM_DECODE_BS_BUCKET_STEP VLLM_DECODE_BS_BUCKET_MAX VLLM_DECODE_BS_BUCKET_LIMIT
71+
unset VLLM_DECODE_BLOCK_BUCKET_MIN VLLM_DECODE_BLOCK_BUCKET_STEP VLLM_DECODE_BLOCK_BUCKET_MAX VLLM_DECODE_BLOCK_BUCKET_LIMIT
7272

7373
set_bucketing
7474

scripts/quickstart/start_vllm.sh

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -141,36 +141,40 @@ input_max=$max_model_len
141141
output_max=$max_model_len
142142

143143

144-
unset VLLM_PROMPT_BS_BUCKET_MIN VLLM_PROMPT_BS_BUCKET_STEP VLLM_PROMPT_BS_BUCKET_MAX
145-
unset VLLM_PROMPT_SEQ_BUCKET_MIN VLLM_PROMPT_SEQ_BUCKET_STEP VLLM_PROMPT_SEQ_BUCKET_MAX
146-
unset VLLM_DECODE_BS_BUCKET_MIN VLLM_DECODE_BS_BUCKET_STEP VLLM_DECODE_BS_BUCKET_MAX
147-
unset VLLM_DECODE_BLOCK_BUCKET_MIN VLLM_DECODE_BLOCK_BUCKET_STEP VLLM_DECODE_BLOCK_BUCKET_MAX
144+
unset VLLM_PROMPT_BS_BUCKET_MIN VLLM_PROMPT_BS_BUCKET_STEP VLLM_PROMPT_BS_BUCKET_MAX VLLM_PROMPT_BS_BUCKET_LIMIT
145+
unset VLLM_PROMPT_SEQ_BUCKET_MIN VLLM_PROMPT_SEQ_BUCKET_STEP VLLM_PROMPT_SEQ_BUCKET_MAX VLLM_PROMPT_SEQ_BUCKET_LIMIT
146+
unset VLLM_DECODE_BS_BUCKET_MIN VLLM_DECODE_BS_BUCKET_STEP VLLM_DECODE_BS_BUCKET_MAX VLLM_DECODE_BS_BUCKET_LIMIT
147+
unset VLLM_DECODE_BLOCK_BUCKET_MIN VLLM_DECODE_BLOCK_BUCKET_STEP VLLM_DECODE_BLOCK_BUCKET_MAX VLLM_DECODE_BLOCK_BUCKET_LIMIT
148148

149149
#export VLLM_SKIP_WARMUP=True
150150

151151

152152

153153
# !!!!!!!!!!!!!!!!!!!! set bucketing !!!!!!!!!!!!!
154+
BUCKET_PADDING_RATIO=${BUCKET_PADDING_RATIO:-"0.25"} # tune this to balance warmup time and runtime performance
154155
prompt_bs_min=1
155156
prompt_bs_step=$(( $max_num_seqs > 32 ? 32 : $max_num_seqs ))
156157
prompt_bs_max=$(( $max_num_seqs > 64 ? 64 : $max_num_seqs ))
157158
export VLLM_PROMPT_BS_BUCKET_MIN=${VLLM_PROMPT_BS_BUCKET_MIN:-$prompt_bs_min}
158159
export VLLM_PROMPT_BS_BUCKET_STEP=${VLLM_PROMPT_BS_BUCKET_STEP:-$prompt_bs_step}
159160
export VLLM_PROMPT_BS_BUCKET_MAX=${VLLM_PROMPT_BS_BUCKET_MAX:-$prompt_bs_max}
161+
export VLLM_PROMPT_BS_BUCKET_LIMIT=${BUCKET_PADDING_RATIO}
160162

161163
prompt_seq_step=128
162164
prompt_seq_min=128
163165
prompt_seq_max=$max_num_batched_tokens
164166
export VLLM_PROMPT_SEQ_BUCKET_MIN=${VLLM_PROMPT_SEQ_BUCKET_MIN:-$prompt_seq_min}
165167
export VLLM_PROMPT_SEQ_BUCKET_STEP=${VLLM_PROMPT_SEQ_BUCKET_STEP:-$prompt_seq_step}
166168
export VLLM_PROMPT_SEQ_BUCKET_MAX=${VLLM_PROMPT_SEQ_BUCKET_MAX:-$prompt_seq_max}
169+
export VLLM_PROMPT_SEQ_BUCKET_LIMIT=${BUCKET_PADDING_RATIO}
167170

168171
decode_bs_min=1
169172
decode_bs_step=$(( $max_num_seqs > $default_decode_bs_step ? $default_decode_bs_step : $max_num_seqs ))
170173
decode_bs_max=$max_num_seqs
171174
export VLLM_DECODE_BS_BUCKET_MIN=${VLLM_DECODE_BS_BUCKET_MIN:-$decode_bs_min}
172175
export VLLM_DECODE_BS_BUCKET_STEP=${VLLM_DECODE_BS_BUCKET_STEP:-$decode_bs_step}
173176
export VLLM_DECODE_BS_BUCKET_MAX=${VLLM_DECODE_BS_BUCKET_MAX:-$decode_bs_max}
177+
export VLLM_DECODE_BS_BUCKET_LIMIT=${BUCKET_PADDING_RATIO}
174178

175179
decode_block_min=128
176180
decode_block_step=128
@@ -179,6 +183,7 @@ decode_block_max=$(( ((max_num_seqs * max_model_len / block_size) > 128) ? (max_
179183
export VLLM_DECODE_BLOCK_BUCKET_MIN=${VLLM_DECODE_BLOCK_BUCKET_MIN:-$decode_block_min}
180184
export VLLM_DECODE_BLOCK_BUCKET_STEP=${VLLM_DECODE_BLOCK_BUCKET_STEP:-$decode_block_step}
181185
export VLLM_DECODE_BLOCK_BUCKET_MAX=${VLLM_DECODE_BLOCK_BUCKET_MAX:-$decode_block_max}
186+
export VLLM_DECODE_BLOCK_BUCKET_LIMIT=${BUCKET_PADDING_RATIO}
182187

183188

184189
echo " environments are reseted "

scripts/quickstart/utils.sh

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,17 +47,19 @@ set_bucketing(){
4747
input_max=${input_max:-1024}
4848
output_max=${output_max:-2048}
4949
block_size=${block_size:-128}
50+
BUCKET_PADDING_RATIO=${BUCKET_PADDING_RATIO:-"0.25"} # tune this to balance warmup time and runtime performance
5051

5152
prompt_bs_step=1
5253
prompt_bs_min=1
5354
prompt_bs_max=$(( $max_num_batched_tokens / $input_min ))
5455
# prompt_bs_max = min(prompt_bs_max, max_num_seqs)
5556
prompt_bs_max=$(( $prompt_bs_max > $max_num_seqs ? $max_num_seqs : $prompt_bs_max ))
5657
# prompt_bs_max = CEILING.MATH(prompt_bs_max, prompt_bs_step)
57-
prompt_bs_max=$(( ($prompt_bs_max + $prompt_bs_step - 1) / $prompt_bs_step * $prompt_bs_step ))
58+
prompt_bs_max=$(( ($prompt_bs_max + $prompt_bs_step - 1) / $prompt_bs_step * $prompt_bs_step ))
5859
export VLLM_PROMPT_BS_BUCKET_MIN=${VLLM_PROMPT_BS_BUCKET_MIN:-$prompt_bs_min}
5960
export VLLM_PROMPT_BS_BUCKET_STEP=${VLLM_PROMPT_BS_BUCKET_STEP:-$prompt_bs_step}
6061
export VLLM_PROMPT_BS_BUCKET_MAX=${VLLM_PROMPT_BS_BUCKET_MAX:-$prompt_bs_max}
62+
export VLLM_PROMPT_BS_BUCKET_LIMIT=${BUCKET_PADDING_RATIO}
6163

6264
prompt_seq_step=128
6365
# prompt_seq_min = CEILING.MATH(input_min, prompt_seq_step)
@@ -67,6 +69,7 @@ set_bucketing(){
6769
export VLLM_PROMPT_SEQ_BUCKET_MIN=${VLLM_PROMPT_SEQ_BUCKET_MIN:-$prompt_seq_min}
6870
export VLLM_PROMPT_SEQ_BUCKET_STEP=${VLLM_PROMPT_SEQ_BUCKET_STEP:-$prompt_seq_step}
6971
export VLLM_PROMPT_SEQ_BUCKET_MAX=${VLLM_PROMPT_SEQ_BUCKET_MAX:-$prompt_seq_max}
72+
export VLLM_PROMPT_SEQ_BUCKET_LIMIT=${BUCKET_PADDING_RATIO}
7073

7174
# decode_bs_step = ROUNDUP(max_num_seqs / 16, 0)
7275
decode_bs_step=$(( ($max_num_seqs + 15) / 16 ))
@@ -77,6 +80,7 @@ set_bucketing(){
7780
export VLLM_DECODE_BS_BUCKET_MIN=${VLLM_DECODE_BS_BUCKET_MIN:-$decode_bs_min}
7881
export VLLM_DECODE_BS_BUCKET_STEP=${VLLM_DECODE_BS_BUCKET_STEP:-$decode_bs_step}
7982
export VLLM_DECODE_BS_BUCKET_MAX=${VLLM_DECODE_BS_BUCKET_MAX:-$decode_bs_max}
83+
export VLLM_DECODE_BS_BUCKET_LIMIT=${BUCKET_PADDING_RATIO}
8084

8185
decode_block_step=$decode_bs_max
8286
# decode_block_min = ROUNDUP(input_min / block_size, 0)
@@ -88,6 +92,7 @@ set_bucketing(){
8892
export VLLM_DECODE_BLOCK_BUCKET_MIN=${VLLM_DECODE_BLOCK_BUCKET_MIN:-$decode_block_min}
8993
export VLLM_DECODE_BLOCK_BUCKET_STEP=${VLLM_DECODE_BLOCK_BUCKET_STEP:-$decode_block_step}
9094
export VLLM_DECODE_BLOCK_BUCKET_MAX=${VLLM_DECODE_BLOCK_BUCKET_MAX:-$decode_block_max}
95+
export VLLM_DECODE_BLOCK_BUCKET_LIMIT=${BUCKET_PADDING_RATIO}
9196
}
9297

9398
# clean existing INC scale

0 commit comments

Comments
 (0)