@@ -3434,26 +3434,28 @@ def _dummy_run(
3434
3434
with self .maybe_dummy_run_with_lora (
3435
3435
self .lora_config , num_scheduled_tokens , remove_lora
3436
3436
):
3437
- model_kwargs = self ._init_model_kwargs (num_tokens )
3437
+ # Make sure padding doesn't exceed max_num_tokens
3438
+ assert num_tokens_after_padding <= self .max_num_tokens
3439
+ model_kwargs = self ._init_model_kwargs (num_tokens_after_padding )
3438
3440
if self .supports_mm_inputs and not self .model_config .is_encoder_decoder :
3439
3441
input_ids = None
3440
- inputs_embeds = self .inputs_embeds .gpu [:num_tokens ]
3442
+ inputs_embeds = self .inputs_embeds .gpu [:num_tokens_after_padding ]
3441
3443
model_kwargs = {
3442
3444
** model_kwargs ,
3443
3445
** self ._dummy_mm_kwargs (num_reqs ),
3444
3446
}
3445
3447
elif self .enable_prompt_embeds :
3446
3448
input_ids = None
3447
- inputs_embeds = self .inputs_embeds .gpu [:num_tokens ]
3448
- model_kwargs = self ._init_model_kwargs (num_tokens )
3449
+ inputs_embeds = self .inputs_embeds .gpu [:num_tokens_after_padding ]
3450
+ model_kwargs = self ._init_model_kwargs (num_tokens_after_padding )
3449
3451
else :
3450
- input_ids = self .input_ids .gpu [:num_tokens ]
3452
+ input_ids = self .input_ids .gpu [:num_tokens_after_padding ]
3451
3453
inputs_embeds = None
3452
3454
3453
3455
if self .uses_mrope :
3454
- positions = self .mrope_positions .gpu [:, :num_tokens ]
3456
+ positions = self .mrope_positions .gpu [:, :num_tokens_after_padding ]
3455
3457
else :
3456
- positions = self .positions .gpu [:num_tokens ]
3458
+ positions = self .positions .gpu [:num_tokens_after_padding ]
3457
3459
3458
3460
if get_pp_group ().is_first_rank :
3459
3461
intermediate_tensors = None
@@ -3468,7 +3470,7 @@ def _dummy_run(
3468
3470
)
3469
3471
3470
3472
intermediate_tensors = self .sync_and_slice_intermediate_tensors (
3471
- num_tokens , None , False
3473
+ num_tokens_after_padding , None , False
3472
3474
)
3473
3475
3474
3476
# filter out the valid batch descriptor
0 commit comments