@@ -195,7 +195,9 @@ def _initialize_kv_caches(
                      "warmup model) took %.2f seconds"), elapsed)
         return num_gpu_blocks, num_cpu_blocks, scheduler_kv_cache_config
 
-    def add_request(self, request: Request):
+    def add_request(self, request: Union[EngineCoreRequest, Request]):
         """Add request to the scheduler."""
+        if type(request) is EngineCoreRequest:
+            request = self._preprocess_add_request(request)
         if pooling_params := request.pooling_params:
             supported_pooling_tasks = (
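The hunk above widens the public entry point: callers may now pass either a raw EngineCoreRequest or an already-built Request, and the exact `type(...)` check routes only the former through conversion. A minimal sketch of this normalize-at-entry pattern, using stand-in classes rather than the real vLLM types:

    from dataclasses import dataclass
    from typing import Union

    @dataclass
    class WireRequest:  # stand-in for EngineCoreRequest
        request_id: str

    @dataclass
    class SchedRequest:  # stand-in for Request
        request_id: str

        @classmethod
        def from_wire(cls, wire: "WireRequest") -> "SchedRequest":
            return cls(request_id=wire.request_id)

    def add_request(request: Union[WireRequest, SchedRequest]) -> None:
        # Exact type check, as in the diff: only the wire format is converted.
        if type(request) is WireRequest:
            request = SchedRequest.from_wire(request)
        print(f"scheduling {request.request_id}")

    add_request(WireRequest("r1"))   # converted on entry
    add_request(SchedRequest("r2"))  # already preprocessed upstream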
@@ -204,13 +206,13 @@ def add_request(self, request: Request):
                 raise ValueError(f"Unsupported task: {pooling_params.task!r} "
                                  f"Supported tasks: {supported_pooling_tasks}")
 
-        if request.mm_hashes is not None:
+        if request.mm_hashes:
             # Here, if hash exists for a multimodal input, then it will be
             # fetched from the cache, else it will be added to the cache.
             # Note that the cache here is mirrored with the client cache, so
             # anything that has a hash must have a HIT cache entry here
             # as well.
-            assert request.mm_inputs is not None
+            assert request.mm_inputs
             updated_mm_inputs = self.mm_input_cache_server.get_and_update_p1(
                 request.mm_inputs, request.mm_hashes)
             assert isinstance(updated_mm_inputs, list)
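Note the semantic change in this hunk: the old guard required only non-None, while the truthiness test also skips an empty list of hashes (and the relaxed assert likewise rejects an empty mm_inputs). Plain Python, nothing vLLM-specific:

    for mm_hashes in (None, [], ["hash-a"]):
        old = mm_hashes is not None  # True for [] too
        new = bool(mm_hashes)        # False for both None and []
        print(repr(mm_hashes), old, new)
    # None False False
    # [] True False      <- only the new guard skips an empty list
    # ['hash-a'] True True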
@@ -389,6 +391,13 @@ def save_tensorized_model(
         self.model_executor.save_tensorized_model(
             tensorizer_config=tensorizer_config, )
 
+    def _preprocess_add_request(self, request: EngineCoreRequest) -> Request:
+        """Preprocess the request.
+
+        This function can be called directly from the input processing thread
+        so that request initialization runs in parallel with the model forward pass."""
+        return Request.from_engine_core_request(request)
+
 
 class EngineCoreProc(EngineCore):
     """ZMQ-wrapper for running EngineCore in background process."""
@@ -772,7 +781,7 @@ def process_input_sockets(self, input_addresses: list[str],
                 # Deserialize the request data.
                 if request_type == EngineCoreRequestType.ADD:
                     request = add_request_decoder.decode(data_frames)
-                    request = self._post_process_add_request(request)
+                    request = self._preprocess_add_request(request)
                 else:
                     request = generic_decoder.decode(data_frames)
 
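This is the call site that realizes the parallelism: `process_input_sockets` runs on a dedicated input thread, so the request is fully built before the engine's busy loop ever sees it. The general shape, reduced to a self-contained sketch (`expensive_init` is a stand-in for decoding plus Request.from_engine_core_request):

    import queue
    import threading

    ready = queue.Queue()

    def expensive_init(raw: bytes) -> str:
        # Stand-in for deserialization + Request construction.
        return raw.decode().upper()

    def input_thread(frames: list[bytes]) -> None:
        # Build requests off the critical path and enqueue ready objects.
        for raw in frames:
            ready.put(expensive_init(raw))

    t = threading.Thread(target=input_thread, args=([b"req-1", b"req-2"],))
    t.start()

    # The core loop (here, the main thread) pops fully built requests,
    # analogous to EngineCoreProc pulling from its input queue.
    for _ in range(2):
        print("scheduling", ready.get())
    t.join()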
@@ -840,13 +849,6 @@ def process_output_sockets(self, output_paths: list[str],
                     # Limit the number of buffers to reuse.
                     reuse_buffers.append(buffer)
 
-    def _post_process_add_request(self, request: EngineCoreRequest) -> Request:
-        """Post-processes the request before reaching to EngineCore.
-
-        This call would be executed in parallel with Model forward which
-        relaxes request preparation works out from critical path."""
-        return Request.from_engine_core_request(request)
-
 
 class DPEngineCoreProc(EngineCoreProc):
     """ZMQ-wrapper for running EngineCore in background process
@@ -927,7 +929,7 @@ def shutdown(self):
         if dp_group := getattr(self, "dp_group", None):
             stateless_destroy_torch_distributed_process_group(dp_group)
 
-    def add_request(self, request: Request):
+    def add_request(self, request: Union[EngineCoreRequest, Request]):
         if self.has_coordinator and request.current_wave != self.current_wave:
             if request.current_wave > self.current_wave:
                 self.current_wave = request.current_wave
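The DP override only widens the accepted type; its wave bookkeeping is untouched. The bump rule is small enough to state standalone (WaveTracker is a hypothetical stand-in, not the actual DPEngineCoreProc):

    class WaveTracker:
        def __init__(self) -> None:
            self.current_wave = 0

        def observe(self, request_wave: int) -> None:
            # A request from a newer wave advances the engine's wave;
            # requests from older waves fall through to the existing
            # coordinator handling.
            if request_wave != self.current_wave:
                if request_wave > self.current_wave:
                    self.current_wave = request_wave

    tracker = WaveTracker()
    tracker.observe(2)
    print(tracker.current_wave)  # 2
    tracker.observe(1)
    print(tracker.current_wave)  # still 2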
0 commit comments