@@ -3436,26 +3436,28 @@ def _dummy_run(
3436
3436
with self .maybe_dummy_run_with_lora (
3437
3437
self .lora_config , num_scheduled_tokens , remove_lora
3438
3438
):
3439
- model_kwargs = self ._init_model_kwargs (num_tokens )
3439
+ # Make sure padding doesn't exceed max_num_tokens
3440
+ assert num_tokens_after_padding <= self .max_num_tokens
3441
+ model_kwargs = self ._init_model_kwargs (num_tokens_after_padding )
3440
3442
if self .supports_mm_inputs and not self .model_config .is_encoder_decoder :
3441
3443
input_ids = None
3442
- inputs_embeds = self .inputs_embeds .gpu [:num_tokens ]
3444
+ inputs_embeds = self .inputs_embeds .gpu [:num_tokens_after_padding ]
3443
3445
model_kwargs = {
3444
3446
** model_kwargs ,
3445
3447
** self ._dummy_mm_kwargs (num_reqs ),
3446
3448
}
3447
3449
elif self .enable_prompt_embeds :
3448
3450
input_ids = None
3449
- inputs_embeds = self .inputs_embeds .gpu [:num_tokens ]
3450
- model_kwargs = self ._init_model_kwargs (num_tokens )
3451
+ inputs_embeds = self .inputs_embeds .gpu [:num_tokens_after_padding ]
3452
+ model_kwargs = self ._init_model_kwargs (num_tokens_after_padding )
3451
3453
else :
3452
- input_ids = self .input_ids .gpu [:num_tokens ]
3454
+ input_ids = self .input_ids .gpu [:num_tokens_after_padding ]
3453
3455
inputs_embeds = None
3454
3456
3455
3457
if self .uses_mrope :
3456
- positions = self .mrope_positions .gpu [:, :num_tokens ]
3458
+ positions = self .mrope_positions .gpu [:, :num_tokens_after_padding ]
3457
3459
else :
3458
- positions = self .positions .gpu [:num_tokens ]
3460
+ positions = self .positions .gpu [:num_tokens_after_padding ]
3459
3461
3460
3462
if get_pp_group ().is_first_rank :
3461
3463
intermediate_tensors = None
@@ -3470,7 +3472,7 @@ def _dummy_run(
3470
3472
)
3471
3473
3472
3474
intermediate_tensors = self .sync_and_slice_intermediate_tensors (
3473
- num_tokens , None , False
3475
+ num_tokens_after_padding , None , False
3474
3476
)
3475
3477
3476
3478
# filter out the valid batch descriptor
0 commit comments