 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.platforms import _Backend
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.config import uses_mrope
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from ..layers.activation import SiluAndMul
 from .qwen2_vl import (_create_qwen2vl_field_factory,
                        apply_rotary_pos_emb_vision)
 from .utils import (AutoWeightsLoader, WeightsMapper,
-                    init_vllm_registered_model, maybe_prefix,
-                    merge_multimodal_embeddings)
+                    init_vllm_registered_model, maybe_prefix)
 from .vision import get_vit_attn_backend, run_dp_sharded_mrope_vision_model
 
 logger = init_logger(__name__)
@@ -1552,32 +1550,6 @@ def get_multimodal_embeddings(
                 multimodal_embeddings += video_embeddings
         return multimodal_embeddings
 
-    def get_input_embeddings_v0(
-        self,
-        input_ids: torch.Tensor,
-        image_input: Optional[Glm4vImageInputs] = None,
-        video_input: Optional[Glm4vVideoInputs] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.get_input_embeddings(input_ids)
-        if image_input is not None:
-            image_embeds = self._process_image_input(image_input)
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                image_embeds,
-                placeholder_token_id=self.config.image_token_id,
-            )
-
-        if video_input is not None:
-            video_embeds = self._process_video_input(video_input)
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids,
-                inputs_embeds,
-                video_embeds,
-                placeholder_token_id=self.config.video_token_id,
-            )
-        return inputs_embeds
-
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -1604,26 +1576,6 @@ def forward(
         if intermediate_tensors is not None:
             inputs_embeds = None
 
-        # NOTE: In v1, inputs_embeds is always generated at model runner from
-        # `get_multimodal_embeddings` and `get_input_embeddings`, this
-        # condition is only for v0 compatibility.
-        elif inputs_embeds is None:
-            image_input = self._parse_and_validate_image_input(**kwargs)
-            video_input = self._parse_and_validate_video_input(**kwargs)
-
-            if image_input is None and video_input is None:
-                inputs_embeds = None
-            else:
-                if uses_mrope(self.config):
-                    assert positions.ndim == 2 and positions.size(0) == 3, (
-                        "multimodal section rotary embedding requires "
-                        f"(3, seq_len) positions, but got {positions.size()}")
-                inputs_embeds = self.get_input_embeddings_v0(
-                    input_ids,
-                    image_input=image_input,
-                    video_input=video_input)
-                input_ids = None
-
         hidden_states = self.language_model.model(
             input_ids=input_ids,
             positions=positions,
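
Context (not part of the diff above): the deleted get_input_embeddings_v0 path is superseded by the v1 flow the removed NOTE describes, where the model runner builds inputs_embeds before calling forward. Below is a minimal sketch of that flow, assuming the standard SupportsMultiModal interface; build_inputs_embeds is an illustrative helper name, not a function added by this PR.

import torch

def build_inputs_embeds(model, input_ids: torch.Tensor,
                        **mm_kwargs) -> torch.Tensor:
    # Encode all multimodal items (images/videos) into embedding tensors.
    mm_embeddings = model.get_multimodal_embeddings(**mm_kwargs)
    # Embed the token ids and scatter the multimodal embeddings into their
    # placeholder positions; the generic get_input_embeddings performs the
    # merge, so the per-model merge_multimodal_embeddings call is no longer
    # needed inside forward.
    return model.get_input_embeddings(input_ids,
                                      multimodal_embeddings=mm_embeddings)

forward then receives inputs_embeds directly and never needs the removed elif branch.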