Commit bd51f78

[V0 Deprecation][Models] Remove all V0 condition for mm embeddings merge (vllm-project#25331)
Signed-off-by: Isotr0py <[email protected]>
Signed-off-by: isotr0py <[email protected]>
1 parent 65ecb4f commit bd51f78


42 files changed: +13 −809 lines
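Every file in this commit deletes the same dead branch from the model's forward(): under the V1 engine the model runner computes the multimodal embeddings and inputs_embeds before calling the model (as the removed NOTE comments state), so the in-model fallback never executed. A minimal sketch of the shared before/after shape, using the generic names from the diffs below rather than any one model's exact code:

def forward(self, input_ids, positions, intermediate_tensors=None,
            inputs_embeds=None, **kwargs):
    if intermediate_tensors is not None:
        inputs_embeds = None

    # Removed V0-only fallback: in V1 the model runner has already called
    # get_multimodal_embeddings() / get_input_embeddings() and passes
    # inputs_embeds in, so this branch was dead code.
    #
    # elif inputs_embeds is None:
    #     vision_embeddings = self.get_multimodal_embeddings(**kwargs)
    #     inputs_embeds = self.get_input_embeddings(
    #         input_ids,
    #         vision_embeddings,
    #         is_multimodal=input_ids == self.config.image_token_index,
    #     )
    #     input_ids = None

    return self.language_model.model(input_ids, positions,
                                     intermediate_tensors, inputs_embeds)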

vllm/model_executor/models/aya_vision.py

Lines changed: 0 additions & 11 deletions
@@ -427,17 +427,6 @@ def forward(
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility.
-        elif inputs_embeds is None:
-            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(
-                input_ids,
-                vision_embeddings,
-                is_multimodal=input_ids == self.config.image_token_index,
-            )
-            input_ids = None
-
         hidden_states = self.language_model.model(
             input_ids=input_ids,
             positions=positions,

vllm/model_executor/models/blip2.py

Lines changed: 0 additions & 11 deletions
@@ -672,17 +672,6 @@ def forward(
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility.
-        elif inputs_embeds is None:
-            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(
-                input_ids,
-                vision_embeddings,
-                is_multimodal=input_ids == _IMAGE_TOKEN_ID,
-            )
-            input_ids = None
-
         hidden_states = self.language_model.model(input_ids,
                                                   positions,
                                                   intermediate_tensors,

vllm/model_executor/models/chameleon.py

Lines changed: 0 additions & 12 deletions
@@ -1014,18 +1014,6 @@ def forward(
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility.
-        elif inputs_embeds is None:
-            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-            image_token_id = self.model.vocabulary_mapping.image_token_id
-            inputs_embeds = self.get_input_embeddings(
-                input_ids,
-                vision_embeddings,
-                is_multimodal=input_ids == image_token_id,
-            )
-            input_ids = None
-
         hidden_states = self.model(input_ids,
                                    positions,
                                    intermediate_tensors,

vllm/model_executor/models/cohere2_vision.py

Lines changed: 0 additions & 11 deletions
@@ -440,17 +440,6 @@ def forward(
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility.
-        elif inputs_embeds is None:
-            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(
-                input_ids,
-                vision_embeddings,
-                is_multimodal=input_ids == self.config.image_token_id,
-            )
-            input_ids = None
-
         hidden_states = self.language_model.model(
             input_ids=input_ids,
             positions=positions,

vllm/model_executor/models/deepseek_vl2.py

Lines changed: 0 additions & 11 deletions
@@ -614,17 +614,6 @@ def forward(self,
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility
-        elif inputs_embeds is None:
-            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(
-                input_ids,
-                vision_embeddings,
-                is_multimodal=input_ids == self.image_token_id,
-            )
-            input_ids = None
-
         hidden_states = self.language_model(input_ids,
                                             positions,
                                             intermediate_tensors,

vllm/model_executor/models/fuyu.py

Lines changed: 0 additions & 11 deletions
@@ -352,17 +352,6 @@ def forward(
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility.
-        elif inputs_embeds is None:
-            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(
-                input_ids,
-                vision_embeddings,
-                is_multimodal=input_ids == _IMAGE_TOKEN_ID,
-            )
-            input_ids = None
-
         hidden_states = self.language_model(
             input_ids=input_ids,
             positions=positions,

vllm/model_executor/models/gemma3_mm.py

Lines changed: 0 additions & 19 deletions
@@ -596,25 +596,6 @@ def forward(self,
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility.
-        elif inputs_embeds is None:
-            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-
-            inputs_embeds = self.get_input_embeddings(
-                input_ids,
-                vision_embeddings,
-                is_multimodal=input_ids == self.config.image_token_index,
-            )
-            if (vision_embeddings is not None) and len(vision_embeddings) != 0:
-                kwargs = self.prepare_attn_masks(
-                    input_ids,
-                    positions,
-                    mask_dtype=self.dtype,
-                    **kwargs,
-                )
-            input_ids = None
-
         hidden_states = self.language_model.model(input_ids,
                                                   positions,
                                                   intermediate_tensors,

vllm/model_executor/models/glm4_1v.py

Lines changed: 1 addition & 49 deletions
@@ -71,7 +71,6 @@
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.platforms import _Backend
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.config import uses_mrope
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from ..layers.activation import SiluAndMul
@@ -80,8 +79,7 @@
 from .qwen2_vl import (_create_qwen2vl_field_factory,
                        apply_rotary_pos_emb_vision)
 from .utils import (AutoWeightsLoader, WeightsMapper,
-                    init_vllm_registered_model, maybe_prefix,
-                    merge_multimodal_embeddings)
+                    init_vllm_registered_model, maybe_prefix)
 from .vision import get_vit_attn_backend, run_dp_sharded_mrope_vision_model
 
 logger = init_logger(__name__)
@@ -1552,32 +1550,6 @@ def get_multimodal_embeddings(
             multimodal_embeddings += video_embeddings
         return multimodal_embeddings
 
-    def get_input_embeddings_v0(
-        self,
-        input_ids: torch.Tensor,
-        image_input: Optional[Glm4vImageInputs] = None,
-        video_input: Optional[Glm4vVideoInputs] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.get_input_embeddings(input_ids)
-        if image_input is not None:
-            image_embeds = self._process_image_input(image_input)
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                image_embeds,
-                placeholder_token_id=self.config.image_token_id,
-            )
-
-        if video_input is not None:
-            video_embeds = self._process_video_input(video_input)
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                video_embeds,
-                placeholder_token_id=self.config.video_token_id,
-            )
-        return inputs_embeds
-
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -1604,26 +1576,6 @@ def forward(
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner from
-        # `get_multimodal_embeddings` and `get_input_embeddings`, this
-        # condition is only for v0 compatibility.
-        elif inputs_embeds is None:
-            image_input = self._parse_and_validate_image_input(**kwargs)
-            video_input = self._parse_and_validate_video_input(**kwargs)
-
-            if image_input is None and video_input is None:
-                inputs_embeds = None
-            else:
-                if uses_mrope(self.config):
-                    assert positions.ndim == 2 and positions.size(0) == 3, (
-                        "multimodal section rotary embedding requires "
-                        f"(3, seq_len) positions, but got {positions.size()}")
-                inputs_embeds = self.get_input_embeddings_v0(
-                    input_ids,
-                    image_input=image_input,
-                    video_input=video_input)
-            input_ids = None
-
         hidden_states = self.language_model.model(
             input_ids=input_ids,
             positions=positions,
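The glm4_1v.py diff goes further than the other files: it also deletes the get_input_embeddings_v0 helper, and with it and the V0 forward branch gone, merge_multimodal_embeddings and uses_mrope lose their last users in this module, which is why both imports are dropped. For context, that helper scattered the precomputed image/video embeddings into the placeholder-token rows of the text embeddings. A minimal sketch of the idea, assuming only the call-site semantics visible above (vLLM's real merge_multimodal_embeddings additionally validates that placeholder and embedding counts match):

import torch

def merge_sketch(input_ids: torch.Tensor,      # (seq_len,)
                 inputs_embeds: torch.Tensor,  # (seq_len, hidden)
                 mm_embeds: torch.Tensor,      # (num_mm_tokens, hidden)
                 placeholder_token_id: int) -> torch.Tensor:
    # Rows whose token is the placeholder are overwritten, in order,
    # by the precomputed multimodal embeddings.
    mask = input_ids == placeholder_token_id
    inputs_embeds[mask] = mm_embeds.to(inputs_embeds.dtype)
    return inputs_embeds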

vllm/model_executor/models/glm4v.py

Lines changed: 1 addition & 16 deletions
@@ -43,7 +43,7 @@
 from .chatglm import ChatGLMBaseModel, ChatGLMModel
 from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
                          SupportsMultiModal, SupportsPP)
-from .utils import flatten_bn, isin_list
+from .utils import flatten_bn
 
 
 class GLMVImagePixelInputs(TensorSchema):
@@ -618,21 +618,6 @@ def forward(
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility.
-        elif inputs_embeds is None:
-            vision_embeddings = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(
-                input_ids,
-                vision_embeddings,
-                is_multimodal=isin_list(input_ids, [
-                    self.config.boi_token_id,
-                    self.config.pad_token_id,
-                    self.config.eoi_token_id,
-                ]),
-            )
-            input_ids = None
-
         hidden_states = self.transformer(input_ids, positions,
                                          intermediate_tensors, inputs_embeds)
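glm4v.py is the one model in this batch whose placeholder mask spans several token IDs (boi/pad/eoi) rather than a single one, hence the isin_list helper in the removed branch; with its last user gone, the import goes too. A rough stock-PyTorch equivalent of that membership test (the token-ID values below are hypothetical stand-ins for the config fields):

import torch

input_ids = torch.tensor([5, 101, 102, 102, 103, 7])      # toy sequence
boi_token_id, pad_token_id, eoi_token_id = 101, 102, 103  # stand-ins

special_ids = torch.tensor([boi_token_id, pad_token_id, eoi_token_id])
is_multimodal = torch.isin(input_ids, special_ids)
# -> tensor([False,  True,  True,  True,  True, False])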

vllm/model_executor/models/granite_speech.py

Lines changed: 0 additions & 11 deletions
@@ -765,17 +765,6 @@ def forward(
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner, this
-        # condition is for v0 compatibility.
-        elif inputs_embeds is None:
-            audio_embeds = self.get_multimodal_embeddings(**kwargs)
-            inputs_embeds = self.get_input_embeddings(
-                input_ids,
-                audio_embeds,
-                is_multimodal=input_ids == self.config.audio_token_index,
-            )
-            input_ids = None
-
         model_output = self.language_model(input_ids, positions,
                                            intermediate_tensors, inputs_embeds)
         return model_output
