|
23 | 23 | from vllm.logger import init_logger
|
24 | 24 | from vllm.logging_utils.dump_input import dump_engine_exception
|
25 | 25 | from vllm.lora.request import LoRARequest
|
| 26 | +from vllm.multimodal.inputs import MultiModalKwargs |
26 | 27 | from vllm.transformers_utils.config import (
|
27 | 28 | maybe_register_config_serialize_by_value)
|
28 |
| -from vllm.utils import make_zmq_socket, resolve_obj_by_qualname |
| 29 | +from vllm.utils import is_list_of, make_zmq_socket, resolve_obj_by_qualname |
29 | 30 | from vllm.v1.core.kv_cache_utils import (get_kv_cache_config,
|
30 | 31 | unify_kv_cache_configs)
|
31 | 32 | from vllm.v1.core.sched.interface import SchedulerInterface
|
@@ -203,6 +204,20 @@ def add_request(self, request: Request):
|
203 | 204 | raise ValueError(f"Unsupported task: {pooling_params.task!r} "
|
204 | 205 | f"Supported tasks: {supported_pooling_tasks}")
|
205 | 206 |
|
| 207 | + if request.mm_hashes is not None: |
| 208 | + # Here, if hash exists for a multimodal input, then it will be |
| 209 | + # fetched from the cache, else it will be added to the cache. |
| 210 | + # Note that the cache here is mirrored with the client cache, so |
| 211 | + # anything that has a hash must have a HIT cache entry here |
| 212 | + # as well. |
| 213 | + assert request.mm_inputs is not None |
| 214 | + updated_mm_inputs = self.mm_input_cache_server.get_and_update_p1( |
| 215 | + request.mm_inputs, request.mm_hashes) |
| 216 | + assert isinstance(updated_mm_inputs, list) |
| 217 | + assert is_list_of(updated_mm_inputs, MultiModalKwargs), ( |
| 218 | + "mm_inputs was not updated in EngineCore.add_request") |
| 219 | + request.mm_inputs = updated_mm_inputs |
| 220 | + |
206 | 221 | if request.use_structured_output:
|
207 | 222 | # Start grammar compilation asynchronously
|
208 | 223 | self.structured_output_manager.grammar_init(request)
|
@@ -830,16 +845,6 @@ def _post_process_add_request(self, request: EngineCoreRequest) -> Request:
|
830 | 845 |
|
831 | 846 | This call would be executed in parallel with Model forward which
|
832 | 847 | relaxes request preparation works out from critical path."""
|
833 |
| - if request.mm_hashes is not None: |
834 |
| - # Here, if hash exists for a multimodal input, then it will be |
835 |
| - # fetched from the cache, else it will be added to the cache. |
836 |
| - # Note that the cache here is mirrored with the client cache, so |
837 |
| - # anything that has a hash must have a HIT cache entry here |
838 |
| - # as well. |
839 |
| - assert request.mm_inputs is not None |
840 |
| - request.mm_inputs = self.mm_input_cache_server.get_and_update_p1( |
841 |
| - request.mm_inputs, request.mm_hashes) |
842 |
| - |
843 | 848 | return Request.from_engine_core_request(request)
|
844 | 849 |
|
845 | 850 |
|
|
0 commit comments