    cached_tokenizer_from_config)
 from vllm.utils import ClassRegistry
 
-from .cache import (BaseMultiModalProcessorCache,
-                     processor_only_cache_from_config)
+from .cache import BaseMultiModalProcessorCache
 from .processing import BaseMultiModalProcessor, BaseProcessingInfo
 from .profiling import (BaseDummyInputsBuilder, DummyDecoderData,
                          DummyEncoderData, MultiModalProfiler)
@@ -176,35 +175,6 @@ def get_max_tokens_per_item_by_nonzero_modality(
             if mm_limits[key] > 0
         }
 
-    # TODO: Remove once V0 is gone
-    def get_max_tokens_by_modality(
-        self,
-        model_config: "ModelConfig",
-    ) -> Mapping[str, int]:
-        """
-        Get the maximum number of tokens from each modality
-        for profiling the memory usage of a model.
-        """
-        cache = processor_only_cache_from_config(model_config, self)
-        mm_limits = self.get_mm_limits_per_prompt(model_config, cache=cache)
-        max_tokens_per_item = self.get_max_tokens_per_item_by_modality(
-            model_config,
-            cache=cache,
-        )
-
-        return {
-            key: mm_limits[key] * max_tokens_per_mm_item
-            for key, max_tokens_per_mm_item in max_tokens_per_item.items()
-        }
-
-    # TODO: Remove once V0 is gone
-    def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int:
-        """
-        Get the maximum number of multi-modal tokens
-        for profiling the memory usage of a model.
-        """
-        return sum(self.get_max_tokens_by_modality(model_config).values())
-
     def get_mm_limits_per_prompt(
         self,
         model_config: "ModelConfig",
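
For reference, callers that relied on the two deleted V0 helpers can recompute the same values from the registry methods that remain. A minimal sketch, assuming processor_only_cache_from_config is still importable from vllm.multimodal.cache and that the remaining registry signatures are unchanged; the helper names below are hypothetical, not part of the vLLM API:

from collections.abc import Mapping

# Assumption: only the registry-local import was dropped; the helper is still
# exported from vllm.multimodal.cache.
from vllm.multimodal.cache import processor_only_cache_from_config


def max_tokens_by_modality(registry, model_config) -> Mapping[str, int]:
    # Hypothetical helper mirroring the deleted get_max_tokens_by_modality().
    cache = processor_only_cache_from_config(model_config, registry)
    mm_limits = registry.get_mm_limits_per_prompt(model_config, cache=cache)
    max_tokens_per_item = registry.get_max_tokens_per_item_by_modality(
        model_config,
        cache=cache,
    )
    # Per-modality budget = per-item token count * allowed items per prompt.
    return {
        key: mm_limits[key] * per_item
        for key, per_item in max_tokens_per_item.items()
    }


def max_multimodal_tokens(registry, model_config) -> int:
    # Hypothetical helper mirroring the deleted get_max_multimodal_tokens().
    return sum(max_tokens_by_modality(registry, model_config).values())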