
Commit 6a7796e

[Bug]: Limit num_reqs in dummy_run when max_num_seqs is small (vllm-project#26144)
Signed-off-by: Benjamin Chislett <[email protected]>
1 parent 47b9339 commit 6a7796e

File tree

1 file changed: +2, -2 lines


vllm/v1/worker/gpu_model_runner.py

Lines changed: 2 additions & 2 deletions
@@ -3060,7 +3060,7 @@ def _dummy_run(
             assert not uniform_decode
             # Create mixed batch:
             # first half decode tokens, second half one prefill
-            num_decode_tokens = num_tokens // 2
+            num_decode_tokens = min(max_num_reqs - 1, num_tokens // 2)
             num_prefill_tokens = num_tokens - num_decode_tokens
             num_reqs = num_decode_tokens + 1

@@ -3072,7 +3072,7 @@ def _dummy_run(
             max_query_len = num_prefill_tokens
         elif uniform_decode:
             assert not create_mixed_batch
-            num_reqs = cdiv(num_tokens, max_query_len)
+            num_reqs = min(max_num_reqs, cdiv(num_tokens, max_query_len))
             num_scheduled_tokens_list = [max_query_len] * num_reqs
             if num_tokens % max_query_len != 0:
                 num_scheduled_tokens_list[-1] = num_tokens % max_query_len
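
The change clamps the number of dummy requests so it never exceeds the runner's configured maximum. Below is a minimal, self-contained sketch of the effect, not the vLLM implementation: the values of max_num_reqs, num_tokens, and max_query_len are hypothetical, and cdiv is re-implemented here rather than imported from vLLM.

# Minimal sketch, assuming a small max_num_seqs (hence small max_num_reqs).
def cdiv(a: int, b: int) -> int:
    # Ceiling division, mirroring the cdiv helper used in the diff above.
    return -(-a // b)

max_num_reqs = 4      # assumed small request limit
num_tokens = 512      # assumed dummy-run token budget
max_query_len = 8     # assumed per-request query length (uniform-decode path)

# Uniform-decode path before the fix: may exceed max_num_reqs.
num_reqs_old = cdiv(num_tokens, max_query_len)                     # 64
# After the fix: capped at the configured maximum.
num_reqs_new = min(max_num_reqs, cdiv(num_tokens, max_query_len))  # 4

# Mixed-batch path after the fix: decode requests are capped so that
# decode requests plus the single prefill request stay within max_num_reqs.
num_decode_tokens = min(max_num_reqs - 1, num_tokens // 2)  # 3 instead of 256
num_reqs_mixed = num_decode_tokens + 1                      # 4 <= max_num_reqs

print(num_reqs_old, num_reqs_new, num_reqs_mixed)  # 64 4 4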
