
Commit f92d952

[V0 Deprecation] Remove MultiModalPlaceholderMap (#25366)

Signed-off-by: DarkLight1337 <[email protected]>
Parent: 6d0b827

File tree: 7 files changed, +2 −128 lines

tests/kernels/utils.py

Lines changed: 0 additions & 2 deletions

```diff
@@ -959,7 +959,6 @@ def make_test_metadata(
         return attn_backend_obj.make_metadata(
             num_prefills=num_prefills,
             slot_mapping=(None if kv_mmap is None else kv_mmap.slot_mapping),
-            multi_modal_placeholder_index_maps=None,
             enable_kv_scales_calculation=True,
             num_prefill_tokens=num_prefill_tokens,
             num_decode_tokens=num_decode_tokens,
@@ -1009,7 +1008,6 @@ def make_test_metadata(
         return attn_backend_obj.make_metadata(
             num_prefills=num_prefills,
             slot_mapping=kv_mmap.slot_mapping,
-            multi_modal_placeholder_index_maps=None,
             enable_kv_scales_calculation=True,
             num_prefill_tokens=num_prefill_tokens,
             num_decode_tokens=num_decode_tokens,
```

vllm/attention/backends/abstract.py

Lines changed: 0 additions & 10 deletions

```diff
@@ -10,7 +10,6 @@
 import torch
 
 from vllm.model_executor.layers.quantization.utils.quant_utils import QuantKey
-from vllm.multimodal import MultiModalPlaceholderMap
 
 
 class AttentionType:
@@ -116,15 +115,6 @@ class AttentionMetadata:
     # in block 0, and 1st slot in block 1, respectively.
     slot_mapping: torch.Tensor
 
-    # The index maps that relate multi-modal embeddings to the corresponding
-    # placeholders.
-    #
-    # N.B. These aren't really related to attention and don't belong on this
-    # type -- this is just a temporary solution to make them available to
-    # `model_executable`.
-    multi_modal_placeholder_index_maps: Optional[Dict[
-        str, MultiModalPlaceholderMap.IndexMap]]
-
     # Enable/disable KV scales calculation. This is so that we can disable the
     # calculation until after prefill and cuda graph capture.
     enable_kv_scales_calculation: bool
```
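The deleted comment above is candid that these index maps never belonged on `AttentionMetadata`: they related flattened multi-modal embeddings to placeholder token positions, and rode along only so `model_executable` could reach them. For context, here is a minimal sketch of what a consumer of such an `IndexMap` does; the function name and tensor arguments are illustrative assumptions, not vLLM's actual V0 code:

```python
import torch


def merge_multimodal_embeddings(
    inputs_embeds: torch.Tensor,  # [num_tokens, hidden_size]
    mm_embeds: torch.Tensor,      # [num_mm_embeddings, hidden_size]
    src: list[int],               # IndexMap.src: rows to read from mm_embeds
    dest: list[int],              # IndexMap.dest: placeholder rows to overwrite
) -> torch.Tensor:
    # Scatter the flattened multi-modal embeddings into the placeholder
    # positions of the token-embedding tensor; src and dest are parallel
    # index lists of equal length.
    inputs_embeds[dest] = mm_embeds[src]
    return inputs_embeds
```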

vllm/attention/backends/placeholder_attn.py

Lines changed: 1 addition & 22 deletions

```diff
@@ -1,18 +1,16 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-from collections import defaultdict
 from dataclasses import dataclass
 from itertools import accumulate
-from typing import Dict, List, Optional, Tuple, Type
+from typing import List, Optional, Tuple, Type
 
 import torch
 
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionMetadata,
                                               AttentionMetadataBuilder)
 from vllm.attention.backends.utils import CommonAttentionState
-from vllm.multimodal import MultiModalPlaceholderMap
 from vllm.utils import async_tensor_h2d
 
 # Placeholder attention backend for models like Mamba and pooling models that
@@ -141,8 +139,6 @@ def prefill_metadata(self) -> Optional["PlaceholderAttentionMetadata"]:
             num_prefill_tokens=self.num_prefill_tokens,
             num_decode_tokens=0,
             slot_mapping=slot_mapping,
-            multi_modal_placeholder_index_maps=self.
-            multi_modal_placeholder_index_maps,
             enable_kv_scales_calculation=self.enable_kv_scales_calculation,
             seq_lens=seq_lens,
             seq_lens_tensor=seq_lens_tensor,
@@ -178,7 +174,6 @@ def decode_metadata(self) -> Optional["PlaceholderAttentionMetadata"]:
             num_prefill_tokens=0,
             num_decode_tokens=self.num_decode_tokens,
             slot_mapping=slot_mapping,
-            multi_modal_placeholder_index_maps=None,
             enable_kv_scales_calculation=True,
             seq_lens=None,
             seq_lens_tensor=seq_lens_tensor,
@@ -210,9 +205,6 @@ def prepare(self):
         self.prefill_seq_lens: List[int] = []
         self.context_lens: List[int] = []
         self.curr_seq_lens: List[int] = []
-        self.multimodal_placeholder_maps: Dict[
-            str,
-            MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap)
         self.num_prefills = 0
         self.num_prefill_tokens = 0
         self.num_decode_tokens = 0
@@ -232,12 +224,6 @@ def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool):
         self.context_lens.append(context_len)
 
         if is_prompt:
-            mm_maps = inter_data.multi_modal_placeholder_maps
-            if mm_maps:
-                for modality, placeholders in mm_maps.items():
-                    self.multimodal_placeholder_maps[modality].extend(
-                        placeholders)
-
             self.num_prefills += 1
             self.num_prefill_tokens += token_len
             self.prefill_seq_lens.append(seq_len)
@@ -295,20 +281,13 @@ def build(self, seq_lens: List[int], query_lens: List[int],
         seq_start_loc_tensor = async_tensor_h2d(seq_start_loc, torch.int32,
                                                 device, self.runner.pin_memory)
 
-        placeholder_index_maps = {
-            modality: placeholder_map.index_map()
-            for modality, placeholder_map in
-            self.multimodal_placeholder_maps.items()
-        }
-
         # Placeholders
         slot_mapping_tensor = torch.empty(0)
         block_tables = torch.empty(0)
 
         return PlaceholderAttentionMetadata(
             num_prefills=self.num_prefills,
             slot_mapping=slot_mapping_tensor,
-            multi_modal_placeholder_index_maps=placeholder_index_maps,
             enable_kv_scales_calculation=True,
             num_prefill_tokens=self.num_prefill_tokens,
             num_decode_tokens=num_decode_tokens,
```
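The builder-side half of the feature, removed above, followed a simple accumulate-then-finalize pattern. A condensed sketch of the deleted logic (`prefill_groups` is a hypothetical stand-in for the builder's `inter_data` sequence groups, and `MultiModalPlaceholderMap` is the class deleted from `vllm/multimodal/base.py` below):

```python
from collections import defaultdict


def build_placeholder_index_maps(prefill_groups):
    # Merge each prefill sequence group's per-modality placeholder maps...
    placeholder_maps = defaultdict(MultiModalPlaceholderMap)  # noqa: F821
    for inter_data in prefill_groups:
        mm_maps = inter_data.multi_modal_placeholder_maps
        for modality, placeholders in (mm_maps or {}).items():
            placeholder_maps[modality].extend(placeholders)

    # ...then flatten each merged map into an IndexMap at build() time.
    return {
        modality: placeholder_map.index_map()
        for modality, placeholder_map in placeholder_maps.items()
    }
```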

vllm/attention/backends/utils.py

Lines changed: 0 additions & 18 deletions

```diff
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Attention backend utils"""
-from collections import defaultdict
 from contextlib import contextmanager
 from dataclasses import dataclass
 from itertools import accumulate
@@ -15,7 +14,6 @@
 from vllm.attention.backends.abstract import AttentionType
 from vllm.config import ModelConfig
 from vllm.logger import init_logger
-from vllm.multimodal import MultiModalPlaceholderMap
 from vllm.utils import async_tensor_h2d, make_tensor_with_pad
 
 logger = init_logger(__name__)
@@ -135,9 +133,6 @@ def prepare(self):
         self.context_lens: List[int] = []
         self.block_tables: List[List[int]] = []
         self.curr_seq_lens: List[int] = []
-        self.multimodal_placeholder_maps: Dict[
-            str,
-            MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap)
         self.num_prefills = 0
         self.num_prefill_tokens = 0
         self.num_decode_tokens = 0
@@ -154,12 +149,6 @@ def _add_seq_group(self, inter_data, chunked_prefill_enabled: bool):
                 inter_data.curr_sliding_window_blocks):
             self.context_lens.append(context_len)
             if is_prompt:
-                mm_maps = inter_data.multi_modal_placeholder_maps
-                if mm_maps:
-                    for modality, placeholders in mm_maps.items():
-                        self.multimodal_placeholder_maps[modality].extend(
-                            placeholders)
-
                 self.num_prefills += 1
                 self.num_prefill_tokens += token_len
                 self.prefill_seq_lens.append(seq_len)
@@ -254,16 +243,10 @@ def build(self, seq_lens: List[int], query_lens: List[int],
                                                  self.runner.pin_memory)
         seq_start_loc_tensor = async_tensor_h2d(seq_start_loc, torch.int32,
                                                 device, self.runner.pin_memory)
-        placeholder_index_maps = {
-            modality: placeholder_map.index_map()
-            for modality, placeholder_map in
-            self.multimodal_placeholder_maps.items()
-        }
 
         return self._metadata_cls(  # type: ignore
             num_prefills=self.num_prefills,
             slot_mapping=slot_mapping_tensor,
-            multi_modal_placeholder_index_maps=placeholder_index_maps,
             enable_kv_scales_calculation=True,
             num_prefill_tokens=self.num_prefill_tokens,
             num_decode_tokens=num_decode_tokens,
@@ -320,7 +303,6 @@ def graph_capture_get_metadata_for_batch(
             num_prefill_tokens=0,
             num_decode_tokens=batch_size,
             slot_mapping=self._graph_slot_mapping[:batch_size],
-            multi_modal_placeholder_index_maps=None,
             enable_kv_scales_calculation=True,
             seq_lens=None,
             seq_lens_tensor=self._graph_seq_lens[:batch_size],
```

vllm/multimodal/__init__.py

Lines changed: 0 additions & 2 deletions

```diff
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from .base import MultiModalPlaceholderMap
 from .hasher import MultiModalHasher
 from .inputs import (BatchedTensorInputs, ModalityData, MultiModalDataBuiltins,
                      MultiModalDataDict, MultiModalKwargs,
@@ -27,7 +26,6 @@
     "MultiModalKwargs",
     "MultiModalKwargsItems",
     "MultiModalPlaceholderDict",
-    "MultiModalPlaceholderMap",
     "MultiModalUUIDDict",
     "NestedTensors",
     "MULTIMODAL_REGISTRY",
```

vllm/multimodal/base.py

Lines changed: 1 addition & 73 deletions

```diff
@@ -3,83 +3,11 @@
 
 from abc import ABC, abstractmethod
 from pathlib import Path
-from typing import Generic, NamedTuple, TypeVar
+from typing import Generic, TypeVar
 
 _T = TypeVar("_T")
 
 
-class MultiModalPlaceholderMap:
-    """
-    Relates multi-modal embeddings to their corresponding placeholders.
-
-    Note: This is only used in V0.
-    """
-
-    class IndexMap(NamedTuple):
-        src: list[int]
-        dest: list[int]
-
-    src_ranges: list[range]
-    """
-    The indices of the multi-modal embeddings that will replace the
-    corresponding placeholder embeddings pointed to by ``dest_ranges``.
-    """
-
-    src_len: int
-    """
-    The total number of flattened multi-modal embeddings.
-    """
-
-    dest_ranges: list[range]
-    """
-    The indices of the placeholder embeddings that will be replaced by the
-    multimodal embeddings.
-    """
-
-    dest_len: int
-    """
-    The total number of embeddings in the destination tensor.
-    """
-
-    def __init__(self):
-        self.src_ranges = []
-        self.src_len = 0
-        self.dest_ranges = []
-        self.dest_len = 0
-
-    def extend(self, other: "MultiModalPlaceholderMap"):
-        """
-        Adds the placeholders from another ``MultiModalPlaceholderMap`` to this
-        instance based on the source and destination tensors being
-        concatenated.
-        """
-
-        self.src_ranges.extend(
-            range(self.src_len + r.start, self.src_len + r.stop)
-            for r in other.src_ranges)
-        self.src_len += other.src_len
-        self.dest_ranges.extend(
-            range(self.dest_len + r.start, self.dest_len + r.stop)
-            for r in other.dest_ranges)
-        self.dest_len += other.dest_len
-
-    def index_map(self) -> "IndexMap":
-        """
-        Finalizes the placeholder map into lists of indices that can be used to
-        index the source and destination tensors.
-        """
-
-        src_indices = [i for r in self.src_ranges for i in r]
-        dest_indices = [i for r in self.dest_ranges for i in r]
-
-        if len(src_indices) != len(dest_indices):
-            raise ValueError(
-                f"The number of source ({len(src_indices)}) and destination "
-                f"indices ({len(dest_indices)}) must be the same.")
-
-        return self.IndexMap(src=src_indices, dest=dest_indices)
-
-
 class MediaIO(ABC, Generic[_T]):
 
     @abstractmethod
```
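Since the class body disappears from the tree here, a short worked example of its old semantics may help anyone auditing remaining call sites; the ranges below are invented, but the assertions follow directly from the deleted `extend()` and `index_map()` code above:

```python
# Behavior of the deleted MultiModalPlaceholderMap, with invented ranges.
a = MultiModalPlaceholderMap()  # noqa: F821 (class removed above)
a.src_ranges, a.src_len = [range(0, 4)], 4    # 4 mm embeddings
a.dest_ranges, a.dest_len = [range(2, 6)], 8  # placeholders at tokens 2..5

b = MultiModalPlaceholderMap()  # noqa: F821
b.src_ranges, b.src_len = [range(0, 2)], 2
b.dest_ranges, b.dest_len = [range(1, 3)], 5

# extend() shifts the other map's ranges by the current src/dest lengths,
# modelling concatenation of the underlying tensors.
a.extend(b)
assert a.src_ranges == [range(0, 4), range(4, 6)]
assert a.dest_ranges == [range(2, 6), range(9, 11)]

# index_map() flattens the ranges into parallel src/dest index lists.
idx = a.index_map()
assert idx.src == [0, 1, 2, 3, 4, 5]
assert idx.dest == [2, 3, 4, 5, 9, 10]
```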

vllm/v1/attention/backends/cpu_attn.py

Lines changed: 0 additions & 1 deletion

```diff
@@ -425,7 +425,6 @@ def build(self,
                 num_prompt_req],  # prefill
             query_start_loc=query_start_loc_cpu[:num_reqs +
                                                 1],  # for logits index
-            multi_modal_placeholder_index_maps=None,
             enable_kv_scales_calculation=False,
         )
 
```
