vllm-project · p88h · Feb 24, 2025 · Feb 25, 2025 · Mar 1, 2025 · Mar 15, 2025
diff --git a/tests/v1/test_serial_utils.py b/tests/v1/test_serial_utils.py
@@ -0,0 +1,108 @@
+# SPDX-License-Identifier: Apache-2.0
+from collections import UserDict
+from dataclasses import dataclass
+from typing import Optional
+
+import msgspec
+import numpy as np
+import torch
+
+from vllm.multimodal.inputs import MultiModalKwargs
+from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder
+
+
+class UnrecognizedType(UserDict):
+    """UserDict subclass that also stores a plain ``an_int`` attribute."""
+    def __init__(self, an_int: int):
+        super().__init__()
+        self.an_int = an_int
+
+
+@dataclass
+class MyType:  # round-trip payload; fields are compared in assert_equal
+    tensor1: torch.Tensor
+    a_string: str
+    list_of_tensors: list[torch.Tensor]
+    numpy_array: np.ndarray
+    unrecognized: UnrecognizedType  # only its an_int is checked after decode
+
+
+def test_encode_decode():
+    """Round-trip a MyType through MsgpackEncoder and MsgpackDecoder.
+
+    Exercises both ``encode`` and ``encode_into``; each result must hold
+    five buffers and decode to an object that passes ``assert_equal``.
+    """
+
+    int_tensor = torch.randint(low=0, high=100, size=(10, ), dtype=torch.int32)
+    float_tensors = [
+        torch.rand((1, 10), dtype=torch.float32),
+        torch.rand((3, 5, 4), dtype=torch.float64)
+    ]
+
+    original = MyType(tensor1=int_tensor,
+                      a_string="hello",
+                      list_of_tensors=float_tensors,
+                      numpy_array=np.arange(20),
+                      unrecognized=UnrecognizedType(33))
+
+    encoder = MsgpackEncoder()
+    decoder = MsgpackDecoder(MyType)
+
+    # encode(): main buffer + 3 tensor buffers + one ndarray buffer
+    frames = encoder.encode(original)
+    assert len(frames) == 5
+
+    restored: MyType = decoder.decode(frames)
+    assert_equal(restored, original)
+
+    # encode_into(): same number of buffers, and the caller-supplied
+    # bytearray must come back as the first buffer
+    scratch = bytearray()
+    frames_into = encoder.encode_into(original, scratch)
+    assert len(frames_into) == 5
+    assert frames_into[0] is scratch
+
+    restored_into: MyType = decoder.decode(frames_into)
+    assert_equal(restored_into, original)
+
+
+class MyRequest(msgspec.Struct):  # mock request that carries the mm kwargs
+    mm: Optional[list[MultiModalKwargs]]
+
+
+def test_multimodal_kwargs():
+    """Round-trip MultiModalKwargs via MyRequest; only "foo" is compared."""
+    tensors = {
+        "foo": torch.zeros(1000, dtype=torch.float16),
+        "bar": [torch.zeros(i * 1000, dtype=torch.int8) for i in range(3)],
+        "baz": (torch.zeros(256, dtype=torch.int64), "i'm a tuple")
+    }
+
+    # pack mm kwargs into a mock request so that it can be decoded properly
+    request = MyRequest(mm=[MultiModalKwargs(tensors)])
+    encoder = MsgpackEncoder()
+    decoder = MsgpackDecoder(MyRequest)
+
+    buffers = encoder.encode(request)
+
+    # 5 total tensors + top level buffer
+    assert len(buffers) == 6
+
+    # expected total encoding length, should be 4384, +-20 for minor changes
+    # NOTE(review): len() may count items, not bytes, per buffer - confirm
+    encoded_size = sum(len(buf) for buf in buffers)
+    assert 4364 <= encoded_size <= 4404
+
+    roundtripped: MultiModalKwargs = decoder.decode(buffers).mm[0]
+    assert torch.equal(tensors["foo"], roundtripped["foo"])
+
+
+def assert_equal(obj1: MyType, obj2: MyType):
+    """Assert field-by-field equality; tensor lists must match in length."""
+    assert torch.equal(obj1.tensor1, obj2.tensor1)
+    assert obj1.a_string == obj2.a_string
+    assert len(obj1.list_of_tensors) == len(obj2.list_of_tensors)
+    assert all(map(torch.equal, obj1.list_of_tensors, obj2.list_of_tensors))
+    assert np.array_equal(obj1.numpy_array, obj2.numpy_array)
+    assert obj1.unrecognized.an_int == obj2.unrecognized.an_int
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
@@ -490,14 +490,14 @@ def process_input_socket(self, input_path: str, engine_index: int):
 
             while True:
                 # (RequestType, RequestData)
-                type_frame, data_frame = socket.recv_multipart(copy=False)
+                type_frame, *data_frames = socket.recv_multipart(copy=False)
                 request_type = EngineCoreRequestType(bytes(type_frame.buffer))
 
                 # Deserialize the request data.
                 decoder = add_request_decoder if (
                     request_type
                     == EngineCoreRequestType.ADD) else generic_decoder
-                request = decoder.decode(data_frame.buffer)
+                request = decoder.decode(data_frames)
 
                 # Push to input queue for core busy loop.
                 self.input_queue.put_nowait((request_type, request))
@@ -514,8 +514,8 @@ def process_output_socket(self, output_path: str, engine_index: int):
             while True:
                 outputs = self.output_queue.get()
                 outputs.engine_index = engine_index
-                encoder.encode_into(outputs, buffer)
-                socket.send(buffer, copy=False)
+                buffers = encoder.encode_into(outputs, buffer)
+                socket.send_multipart(buffers, copy=False)
 
 
 ENGINE_PAUSED_OUTPUTS = EngineCoreOutputs(engine_paused=True)

diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
@@ -26,7 +26,7 @@
                             EngineCoreRequestType, UtilityOutput)
 from vllm.v1.engine.core import EngineCore, EngineCoreProc
 from vllm.v1.executor.abstract import Executor
-from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder
+from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder, bytestr
 from vllm.v1.utils import BackgroundProcHandle
 
 logger = init_logger(__name__)
@@ -505,8 +505,8 @@ def process_outputs_socket():
                         # shutdown signal, exit thread.
                         break
 
-                    frame = out_socket.recv(copy=False)
-                    outputs = decoder.decode(frame.buffer)
+                    frames = out_socket.recv_multipart(copy=False)
+                    outputs = decoder.decode(frames)
                     if outputs.utility_output:
                         _process_utility_output(outputs.utility_output,
                                                 utility_results)
@@ -529,7 +529,7 @@ def get_output(self) -> EngineCoreOutputs:
     def _send_input(self, request_type: EngineCoreRequestType, request: Any):
         # (Identity, RequestType, SerializedRequest)
         msg = (self.core_engine.identity, request_type.value,
-               self.encoder.encode(request))
+               *self.encoder.encode(request))
         self.input_socket.send_multipart(msg, copy=False)
 
     def call_utility(self, method: str, *args) -> Any:
@@ -633,8 +633,8 @@ def _ensure_output_queue_task(self):
 
         async def process_outputs_socket():
             while True:
-                (frame, ) = await output_socket.recv_multipart(copy=False)
-                outputs: EngineCoreOutputs = decoder.decode(frame.buffer)
+                frames = await output_socket.recv_multipart(copy=False)
+                outputs: EngineCoreOutputs = decoder.decode(frames)
                 if outputs.utility_output:
                     _process_utility_output(outputs.utility_output,
                                             utility_results)
@@ -666,12 +666,12 @@ def _send_input(self,
         if engine is None:
             engine = self.core_engine
 
-        message = (request_type.value, self.encoder.encode(request))
+        message = (request_type.value, *self.encoder.encode(request))
         return self._send_input_message(message, engine)
 
-    def _send_input_message(self, message: tuple[bytes, bytes],
+    def _send_input_message(self, message: tuple[bytestr, ...],
                             engine: CoreEngine) -> Awaitable[None]:
-        message = (engine.identity, ) + message  # type: ignore[assignment]
+        message = (engine.identity, ) + message
         return self.input_socket.send_multipart(message, copy=False)
 
     async def call_utility_async(self, method: str, *args) -> Any:
@@ -684,8 +684,8 @@ async def _call_utility_async(self, method: str, *args,
         call_id = uuid.uuid1().int >> 64
         future = asyncio.get_running_loop().create_future()
         self.utility_results[call_id] = future
-        message = (EngineCoreRequestType.UTILITY.value,
-                   self.encoder.encode((call_id, method, args)))
+        message = (EngineCoreRequestType.UTILITY.value, *self.encoder.encode(
+            (call_id, method, args)))
         await self._send_input_message(message, engine)
         self._ensure_output_queue_task()
         return await future
@@ -760,7 +760,7 @@ def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor],
 
         # Control message used for triggering dp idle mode loop.
         self.start_dp_msg = (EngineCoreRequestType.START_DP.value,
-                             self.encoder.encode(None))
+                             *self.encoder.encode(None))
 
         self.num_engines_running = 0
         self.reqs_in_flight: dict[str, CoreEngine] = {}
@@ -794,7 +794,7 @@ async def add_request_async(self, request: EngineCoreRequest) -> None:
         # tokenized.
         request.prompt = None
 
-        msg = (EngineCoreRequestType.ADD.value, self.encoder.encode(request))
+        msg = (EngineCoreRequestType.ADD.value, *self.encoder.encode(request))
 
         chosen_engine = self.get_core_engine_for_request()
         self.reqs_in_flight[request.request_id] = chosen_engine