Commit a4ce74c

[VLM] Use shared field to pass token ids to model
1 parent: 3b2005e

2 files changed (+235, -46 lines)

vllm/model_executor/models/internvl.py

Lines changed: 3 additions & 3 deletions
@@ -564,8 +564,7 @@ def _call_hf_processor(
         # Since there may be extra tokens in the feature placeholders,
         # we need to pass the image token ID to the model to select the
         # tokens to merge from the vision encoder outputs
-        processed_outputs["image_token_id"] = [image_token_id
-                                               ] * len(image_data)
+        processed_outputs["image_token_id"] = torch.tensor(image_token_id)
 
         return processed_outputs
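
The comment preserved in this hunk explains the intent: the image token ID is handed to the model so it can pick out which placeholder positions in the prompt should be merged with the vision encoder outputs. As a rough sketch of that selection step (the helper name and token IDs below are made up for illustration, not vLLM's actual model code), a 0-dim tensor broadcasts directly against the prompt token IDs:

```python
import torch

# Hypothetical helper, not vLLM's implementation: find the prompt positions
# whose embeddings should be replaced by vision encoder outputs.
def select_image_positions(input_ids: torch.Tensor,
                           image_token_id: torch.Tensor) -> torch.Tensor:
    # After this commit, image_token_id arrives as a 0-dim tensor rather than
    # a per-image Python list, so a plain comparison broadcasts over the ids.
    return input_ids == image_token_id

input_ids = torch.tensor([1, 92546, 92546, 92546, 2])  # made-up token ids
image_token_id = torch.tensor(92546)                   # made-up image token id
print(select_image_positions(input_ids, image_token_id))
# tensor([False,  True,  True,  True, False])
```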

@@ -575,13 +574,14 @@ def _get_mm_fields_config(
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> Mapping[str, MultiModalFieldConfig]:
         image_num_patches = hf_inputs.get("image_num_patches", torch.empty(0))
+        num_images = len(image_num_patches)
 
         return dict(
             pixel_values_flat=MultiModalFieldConfig.flat_from_sizes(
                 "image", image_num_patches),
             image_num_patches=MultiModalFieldConfig.batched("image"),
             image_embeds=MultiModalFieldConfig.batched("image"),
-            image_token_id=MultiModalFieldConfig.batched("image"),
+            image_token_id=MultiModalFieldConfig.shared("image", num_images),
         )
 
     def _get_prompt_replacements(
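
The second hunk switches `image_token_id` from `MultiModalFieldConfig.batched` to `MultiModalFieldConfig.shared`. A batched field expects one entry per image, which is why the old processor code replicated the token ID `len(image_data)` times, whereas a shared field stores a single value that all `num_images` items refer to. A toy sketch of the distinction in plain Python (illustrative only, not vLLM's field machinery):

```python
import torch

num_images = 3
image_token_id = 92546  # made-up token id

# Batched semantics (old): one entry per image, so the same scalar is
# duplicated for every image in the request.
batched_value = [image_token_id] * num_images   # [92546, 92546, 92546]

# Shared semantics (new): the scalar is stored once as a 0-dim tensor and
# every image conceptually refers to that same value.
shared_value = torch.tensor(image_token_id)     # tensor(92546)
per_image_view = [shared_value] * num_images    # same object, no copies
assert all(view is shared_value for view in per_image_view)
```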
