
Commit bccab82

Convert EngineCoreRequest to Request before reaching the engine core thread
Signed-off-by: Jialin Ouyang <[email protected]>
1 parent: e7b2042

File tree

2 files changed: +31 -21 lines

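Note: to make the intent of the commit concrete before the diffs, here is a minimal, illustrative sketch of the producer/consumer split it relies on. The input socket thread now decodes and converts each EngineCoreRequest into a Request before enqueuing it, so the conversion overlaps with whatever the core busy loop is doing (scheduling, model forward). Everything below is a toy stand-in except the names EngineCoreRequest, Request, and from_engine_core_request, which come from the diff; the sleep is an assumed stand-in for nontrivial preparation cost.

# Illustrative sketch only: why moving the EngineCoreRequest -> Request
# conversion onto the input socket thread takes it off the critical path.
import queue
import threading
import time

class EngineCoreRequest:  # toy stand-in for the serialized wire request
    def __init__(self, request_id: str) -> None:
        self.request_id = request_id

class Request:  # toy stand-in for the scheduler-ready request
    def __init__(self, request_id: str) -> None:
        self.request_id = request_id

    @classmethod
    def from_engine_core_request(cls, req: EngineCoreRequest) -> "Request":
        time.sleep(0.01)  # stands in for nontrivial preparation work
        return cls(req.request_id)

input_queue: queue.Queue = queue.Queue()

def input_socket_thread(raw_requests) -> None:
    # After this commit: decode AND convert here, in parallel with the
    # busy loop, instead of on the busy loop itself.
    for raw in raw_requests:
        input_queue.put(Request.from_engine_core_request(raw))
    input_queue.put(None)  # sentinel: no more requests

def core_busy_loop() -> None:
    # The busy loop only dequeues ready Request objects; the time it
    # previously spent converting is freed for scheduling and forward steps.
    while (req := input_queue.get()) is not None:
        print(f"scheduling {req.request_id}")

t = threading.Thread(target=input_socket_thread,
                     args=([EngineCoreRequest(f"req-{i}") for i in range(3)],))
t.start()
core_busy_loop()
t.join()

The design point is simply that the two threads pipeline: while the core loop runs one forward step, the input thread prepares the next requests.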

vllm/v1/engine/core.py

Lines changed: 28 additions & 21 deletions
@@ -194,7 +194,7 @@ def _initialize_kv_caches(
                      "warmup model) took %.2f seconds"), elapsed)
         return num_gpu_blocks, num_cpu_blocks, scheduler_kv_cache_config

-    def add_request(self, request: EngineCoreRequest):
+    def add_request(self, request: Request):
         """Add request to the scheduler."""
         if pooling_params := request.pooling_params:
             supported_pooling_tasks = (
@@ -203,27 +203,16 @@ def add_request(self, request: EngineCoreRequest):
             raise ValueError(f"Unsupported task: {pooling_params.task!r} "
                              f"Supported tasks: {supported_pooling_tasks}")

-        if request.mm_hashes is not None:
-            # Here, if hash exists for a multimodal input, then it will be
-            # fetched from the cache, else it will be added to the cache.
-            # Note that the cache here is mirrored with the client cache, so
-            # anything that has a hash must have a HIT cache entry here
-            # as well.
-            assert request.mm_inputs is not None
-            request.mm_inputs = self.mm_input_cache_server.get_and_update_p1(
-                request.mm_inputs, request.mm_hashes)
-
-        req = Request.from_engine_core_request(request)
-        if req.use_structured_output:
+        if request.use_structured_output:
             # Start grammar compilation asynchronously
-            self.structured_output_manager.grammar_init(req)
+            self.structured_output_manager.grammar_init(request)

-        if req.kv_transfer_params is not None and (
+        if request.kv_transfer_params is not None and (
                 not self.scheduler.get_kv_connector()):
             logger.warning("Got kv_transfer_params, but no KVConnector found. "
                            "Disabling KVTransfer for this request.")

-        self.scheduler.add_request(req)
+        self.scheduler.add_request(request)

     def abort_requests(self, request_ids: list[str]):
         """Abort requests from the scheduler."""
@@ -766,10 +755,11 @@ def process_input_sockets(self, input_addresses: list[str],
                         bytes(type_frame.buffer))

                     # Deserialize the request data.
-                    decoder = add_request_decoder if (
-                        request_type
-                        == EngineCoreRequestType.ADD) else generic_decoder
-                    request = decoder.decode(data_frames)
+                    if request_type == EngineCoreRequestType.ADD:
+                        request = add_request_decoder.decode(data_frames)
+                        request = self._post_process_add_request(request)
+                    else:
+                        request = generic_decoder.decode(data_frames)

                     # Push to input queue for core busy loop.
                     self.input_queue.put_nowait((request_type, request))
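The rewritten branch above replaces the one-shot decoder selection with an explicit dispatch, because ADD requests now need a post-processing step after decoding. A hedged sketch of that shape follows; the enum values, JSON wire format, and helper names are toy assumptions standing in for vLLM's actual decoders.

# Illustrative sketch of type-dispatched decoding: ADD requests get their
# own decoder plus post-processing; everything else stays generic.
import json
from enum import Enum

class EngineCoreRequestType(Enum):
    ADD = "add"
    ABORT = "abort"

def decode_add(data: bytes) -> dict:
    return json.loads(data)  # stand-in for add_request_decoder.decode(...)

def decode_generic(data: bytes) -> dict:
    return json.loads(data)  # stand-in for generic_decoder.decode(...)

def post_process_add(request: dict) -> dict:
    # Stand-in for _post_process_add_request: resolve caches and build the
    # scheduler-ready object before the busy loop ever sees it.
    request["prepared"] = True
    return request

def handle_frames(request_type: EngineCoreRequestType, data: bytes) -> dict:
    # Mirrors the diff: ADD decodes then post-processes; other request
    # types keep the generic decode path unchanged.
    if request_type == EngineCoreRequestType.ADD:
        request = decode_add(data)
        request = post_process_add(request)
    else:
        request = decode_generic(data)
    return request

print(handle_frames(EngineCoreRequestType.ADD, b'{"request_id": "r1"}'))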
@@ -835,6 +825,23 @@ def process_output_sockets(self, output_paths: list[str],
                     # Limit the number of buffers to reuse.
                     reuse_buffers.append(buffer)

+    def _post_process_add_request(self, request: EngineCoreRequest) -> Request:
+        """Post-process the request before it reaches the EngineCore.
+
+        This call runs in parallel with the model forward pass, moving
+        request preparation work off the critical path."""
+        if request.mm_hashes is not None:
+            # Here, if hash exists for a multimodal input, then it will be
+            # fetched from the cache, else it will be added to the cache.
+            # Note that the cache here is mirrored with the client cache, so
+            # anything that has a hash must have a HIT cache entry here
+            # as well.
+            assert request.mm_inputs is not None
+            request.mm_inputs = self.mm_input_cache_server.get_and_update_p1(
+                request.mm_inputs, request.mm_hashes)
+
+        return Request.from_engine_core_request(request)
+

 class DPEngineCoreProc(EngineCoreProc):
     """ZMQ-wrapper for running EngineCore in background process
@@ -915,7 +922,7 @@ def shutdown(self):
         if dp_group := getattr(self, "dp_group", None):
             stateless_destroy_torch_distributed_process_group(dp_group)

-    def add_request(self, request: EngineCoreRequest):
+    def add_request(self, request: Request):
         if self.has_coordinator and request.current_wave != self.current_wave:
             if request.current_wave > self.current_wave:
                 self.current_wave = request.current_wave
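The _post_process_add_request hunk moves the multimodal cache lookup onto the input thread but keeps its invariant: the server-side cache mirrors the client-side cache, so any input arriving as a bare hash (payload omitted) must already be a hit on the server. A toy sketch of that contract follows, assuming None marks an omitted payload; the class and method name get_and_update are stand-ins for vLLM's get_and_update_p1 machinery, not its implementation.

# Illustrative sketch of a mirrored multimodal-input cache. None entries in
# mm_inputs mark cache hits whose payload was omitted on the wire; the
# server cache must already hold them and fills them back in.
class ToyMMInputCacheServer:
    """Toy model of the server-side half of a mirrored cache."""

    def __init__(self) -> None:
        self._cache: dict[str, object] = {}

    def get_and_update(self, mm_inputs: list, mm_hashes: list) -> list:
        full_inputs = []
        for inp, mm_hash in zip(mm_inputs, mm_hashes):
            if inp is None:
                # Payload omitted: the client saw a cache hit, so the
                # mirrored server cache must hold the entry as well.
                assert mm_hash in self._cache, "mirrored caches out of sync"
                full_inputs.append(self._cache[mm_hash])
            else:
                # New payload: store it so later hits can be resolved here.
                self._cache[mm_hash] = inp
                full_inputs.append(inp)
        return full_inputs

cache = ToyMMInputCacheServer()
print(cache.get_and_update([{"pixels": [1, 2, 3]}], ["h1"]))  # miss: stored
print(cache.get_and_update([None], ["h1"]))                   # hit: filled in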

vllm/v1/request.py

Lines changed: 3 additions & 0 deletions
@@ -35,10 +35,12 @@ def __init__(
         lora_request: Optional["LoRARequest"] = None,
         structured_output_request: Optional["StructuredOutputRequest"] = None,
         cache_salt: Optional[str] = None,
+        current_wave: int = 0,
         priority: int = 0,
     ) -> None:
         self.request_id = request_id
         self.client_index = client_index
+        self.current_wave = current_wave
         self.priority = priority
         self.sampling_params = sampling_params
         self.pooling_params = pooling_params
@@ -131,6 +133,7 @@ def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request":
                 sampling_params=request.sampling_params) \
                 if request.sampling_params else None,
             cache_salt=request.cache_salt,
+            current_wave=request.current_wave,
             priority=request.priority,
         )
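This request.py change exists because DPEngineCoreProc.add_request (last hunk of core.py above) now receives a Request rather than an EngineCoreRequest, yet still reads request.current_wave for its data-parallel wave bookkeeping; the counter therefore has to live on Request and be copied over in from_engine_core_request. A minimal sketch of that bookkeeping, with toy classes reduced to the comparison shown in the diff:

# Illustrative sketch: with add_request now taking Request, the wave
# counter must ride on Request itself.
class Request:  # toy stand-in
    def __init__(self, request_id: str, current_wave: int = 0) -> None:
        self.request_id = request_id
        self.current_wave = current_wave

class ToyDPEngineCore:  # toy stand-in for DPEngineCoreProc
    def __init__(self) -> None:
        self.current_wave = 0

    def add_request(self, request: Request) -> None:
        if request.current_wave > self.current_wave:
            # A newer wave started elsewhere; catch up locally.
            self.current_wave = request.current_wave
        print(f"{request.request_id}: engine wave={self.current_wave}")

core = ToyDPEngineCore()
core.add_request(Request("r1", current_wave=0))
core.add_request(Request("r2", current_wave=2))  # advances local wave to 2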
