 12 |  12 |                                                cached_tokenizer_from_config)
 13 |  13 | from vllm.utils import ClassRegistry
 14 |  14 |
 15 |     | -from .cache import (BaseMultiModalProcessorCache,
 16 |     | -                     processor_only_cache_from_config)
    |  15 | +from .cache import BaseMultiModalProcessorCache
 17 |  16 | from .processing import BaseMultiModalProcessor, BaseProcessingInfo
 18 |  17 | from .profiling import (BaseDummyInputsBuilder, DummyDecoderData,
 19 |  18 |                          DummyEncoderData, MultiModalProfiler)
@@ -176,35 +175,6 @@ def get_max_tokens_per_item_by_nonzero_modality(
176 | 175 |             if mm_limits[key] > 0
177 | 176 |         }
178 | 177 |
179 |     | -    # TODO: Remove once V0 is gone
180 |     | -    def get_max_tokens_by_modality(
181 |     | -        self,
182 |     | -        model_config: "ModelConfig",
183 |     | -    ) -> Mapping[str, int]:
184 |     | -        """
185 |     | -        Get the maximum number of tokens from each modality
186 |     | -        for profiling the memory usage of a model.
187 |     | -        """
188 |     | -        cache = processor_only_cache_from_config(model_config, self)
189 |     | -        mm_limits = self.get_mm_limits_per_prompt(model_config, cache=cache)
190 |     | -        max_tokens_per_item = self.get_max_tokens_per_item_by_modality(
191 |     | -            model_config,
192 |     | -            cache=cache,
193 |     | -        )
194 |     | -
195 |     | -        return {
196 |     | -            key: mm_limits[key] * max_tokens_per_mm_item
197 |     | -            for key, max_tokens_per_mm_item in max_tokens_per_item.items()
198 |     | -        }
199 |     | -
200 |     | -    # TODO: Remove once V0 is gone
201 |     | -    def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int:
202 |     | -        """
203 |     | -        Get the maximum number of multi-modal tokens
204 |     | -        for profiling the memory usage of a model.
205 |     | -        """
206 |     | -        return sum(self.get_max_tokens_by_modality(model_config).values())
207 |     | -
208 | 178 |     def get_mm_limits_per_prompt(
209 | 179 |         self,
210 | 180 |         model_config: "ModelConfig",
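For callers that still need the aggregate figures, the values the removed helpers computed can be reproduced from the methods that remain on the registry. A minimal sketch, assuming the `get_mm_limits_per_prompt` and `get_max_tokens_per_item_by_modality` signatures shown in the removed code, and that `processor_only_cache_from_config` is still importable from `vllm.multimodal.cache`:

```python
from vllm.multimodal.cache import processor_only_cache_from_config


def max_tokens_by_modality(registry, model_config):
    # Mirrors the removed get_max_tokens_by_modality: per-modality budget =
    # per-item maximum tokens * number of items allowed per prompt.
    cache = processor_only_cache_from_config(model_config, registry)
    mm_limits = registry.get_mm_limits_per_prompt(model_config, cache=cache)
    per_item = registry.get_max_tokens_per_item_by_modality(model_config,
                                                            cache=cache)
    return {key: mm_limits[key] * tokens for key, tokens in per_item.items()}


def max_multimodal_tokens(registry, model_config):
    # Mirrors the removed get_max_multimodal_tokens: sum over all modalities.
    return sum(max_tokens_by_modality(registry, model_config).values())
```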