Skip to content

Commit 43548ce

Browse files
varun-sundar-rabindranath (Varun Sundar Rabindranath)
authored and committed
[BugFix] Pad input buffers in _dummy_run (vllm-project#26209)
Signed-off-by: Varun Sundar Rabindranath <[email protected]> Co-authored-by: Varun Sundar Rabindranath <[email protected]>
1 parent 4f1a652 commit 43548ce

File tree

1 file changed

+10
-8
lines changed

1 file changed

+10
-8
lines changed

vllm/v1/worker/gpu_model_runner.py

Lines changed: 10 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -3436,26 +3436,28 @@ def _dummy_run(
34363436
with self.maybe_dummy_run_with_lora(
34373437
self.lora_config, num_scheduled_tokens, remove_lora
34383438
):
3439-
model_kwargs = self._init_model_kwargs(num_tokens)
3439+
# Make sure padding doesn't exceed max_num_tokens
3440+
assert num_tokens_after_padding <= self.max_num_tokens
3441+
model_kwargs = self._init_model_kwargs(num_tokens_after_padding)
34403442
if self.supports_mm_inputs and not self.model_config.is_encoder_decoder:
34413443
input_ids = None
3442-
inputs_embeds = self.inputs_embeds.gpu[:num_tokens]
3444+
inputs_embeds = self.inputs_embeds.gpu[:num_tokens_after_padding]
34433445
model_kwargs = {
34443446
**model_kwargs,
34453447
**self._dummy_mm_kwargs(num_reqs),
34463448
}
34473449
elif self.enable_prompt_embeds:
34483450
input_ids = None
3449-
inputs_embeds = self.inputs_embeds.gpu[:num_tokens]
3450-
model_kwargs = self._init_model_kwargs(num_tokens)
3451+
inputs_embeds = self.inputs_embeds.gpu[:num_tokens_after_padding]
3452+
model_kwargs = self._init_model_kwargs(num_tokens_after_padding)
34513453
else:
3452-
input_ids = self.input_ids.gpu[:num_tokens]
3454+
input_ids = self.input_ids.gpu[:num_tokens_after_padding]
34533455
inputs_embeds = None
34543456

34553457
if self.uses_mrope:
3456-
positions = self.mrope_positions.gpu[:, :num_tokens]
3458+
positions = self.mrope_positions.gpu[:, :num_tokens_after_padding]
34573459
else:
3458-
positions = self.positions.gpu[:num_tokens]
3460+
positions = self.positions.gpu[:num_tokens_after_padding]
34593461

34603462
if get_pp_group().is_first_rank:
34613463
intermediate_tensors = None
@@ -3470,7 +3472,7 @@ def _dummy_run(
34703472
)
34713473

34723474
intermediate_tensors = self.sync_and_slice_intermediate_tensors(
3473-
num_tokens, None, False
3475+
num_tokens_after_padding, None, False
34743476
)
34753477

34763478
# filter out the valid batch descriptor

0 commit comments

Comments
 (0)