Bring back zero-copy, plus more review updates

p88h · p88h · commit 7cf549205fa6 · 2025-04-15T22:29:35.000+02:00
Signed-off-by: Staszek Pasko &lt;staszek@gmail.com&gt;
diff --git a/tests/v1/test_serial_utils.py b/tests/v1/test_serial_utils.py
@@ -70,14 +70,12 @@ def test_encode_decode():
 
     assert_equal(decoded, obj)
 
-    # Test encode_into case
+    # Test whether MsgpackEncoder properly reuses the buffers.
 
-    preallocated = bytearray()
-
-    encoded2 = encoder.encode_into(obj, preallocated)
+    encoded2 = encoder.encode(obj)
 
     assert len(encoded2) == 6
-    assert encoded2[0] is preallocated
+    assert encoded2[0] is encoded[0]
 
     decoded2: MyType = decoder.decode(encoded2)
 
@@ -112,7 +110,7 @@ def test_multimodal_kwargs():
 
     assert len(encoded) == 6
 
-    total_len = sum(len(x) for x in encoded)
+    total_len = sum(memoryview(x).cast("B").nbytes for x in encoded)
 
     # expected total encoding length, should be 44536, +-20 for minor changes
     assert total_len >= 44516 and total_len <= 44556
@@ -151,7 +149,7 @@ def test_multimodal_items_by_modality():
 
     assert len(encoded) == 8
 
-    total_len = sum([len(x) for x in encoded])
+    total_len = sum(memoryview(x).cast("B").nbytes for x in encoded)
 
     # expected total encoding length, should be 14255, +-20 for minor changes
     assert total_len >= 14235 and total_len <= 14275
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
@@ -507,15 +507,15 @@ def process_output_socket(self, output_path: str, engine_index: int):
         """Output socket IO thread."""
 
         # Msgpack serialization encoding.
+        # The wrapper keeps an internal encoding buffer that avoids
+        # creating a new buffer for each encode call.
         encoder = MsgpackEncoder()
-        # Reuse send buffer.
-        buffer = bytearray()
 
         with zmq_socket_ctx(output_path, zmq.constants.PUSH) as socket:
             while True:
                 outputs = self.output_queue.get()
                 outputs.engine_index = engine_index
-                buffers = encoder.encode_into(outputs, buffer)
+                buffers = encoder.encode(outputs)
                 socket.send_multipart(buffers, copy=False)
 
 
diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 
+import dataclasses
 import pickle
 from collections.abc import Sequence
-from dataclasses import asdict
 from inspect import isclass
 from itertools import chain
 from types import FunctionType
@@ -15,14 +15,26 @@
 from msgspec import msgpack
 
 from vllm import envs
-from vllm.multimodal.inputs import (BaseMultiModalField, MultiModalFieldConfig,
-                                    MultiModalFieldElem, MultiModalKwargs,
-                                    MultiModalKwargsItem, NestedTensors)
+from vllm.multimodal.inputs import (BaseMultiModalField,
+                                    MultiModalBatchedField,
+                                    MultiModalFieldConfig, MultiModalFieldElem,
+                                    MultiModalFlatField, MultiModalKwargs,
+                                    MultiModalKwargsItem,
+                                    MultiModalSharedField, NestedTensors)
 
 CUSTOM_TYPE_PICKLE = 1
 CUSTOM_TYPE_CLOUDPICKLE = 2
 CUSTOM_TYPE_RAW_VIEW = 3
 
+# MultiModalField class serialization type map.
+# These need to list all possible field types and match them
+# to factory methods in `MultiModalFieldConfig`.
+MMF_CLASS_TO_FACTORY = {
+    MultiModalFlatField: "flat",
+    MultiModalSharedField: "shared",
+    MultiModalBatchedField: "batched",
+}
+
 bytestr = Union[bytes, bytearray, memoryview, zmq.Frame]
 
 
@@ -51,20 +63,15 @@ def __init__(self, size_threshold=None):
         self.aux_buffers: Optional[list[bytestr]] = None
         self.size_threshold = size_threshold
 
-    # TODO - merge these constructors and remove the need for externally managed
-    # serialization buffers.
     def encode(self, obj: Any) -> Sequence[bytestr]:
-        return self.encode_into(obj, self.msg_buffer)
-
-    def encode_into(self, obj: Any, buf: bytearray) -> Sequence[bytestr]:
         try:
             # This `bufs` list allows us to collect direct pointers to backing
             # buffers of tensors and np arrays, and return them along with the
             # top-level encoded buffer instead of copying their data into the
             # new buffer.
-            self.aux_buffers = [buf]
+            self.aux_buffers = [self.msg_buffer]
             bufs = self.aux_buffers
-            self.encoder.encode_into(obj, buf)
+            self.encoder.encode_into(obj, self.msg_buffer)
             return bufs
         finally:
             self.aux_buffers = None
@@ -111,11 +118,8 @@ def _encode_ndarray(
         self, obj: np.ndarray
     ) -> tuple[str, tuple[int, ...], Union[int, memoryview]]:
         assert self.aux_buffers is not None
-        # Either copy the memoryview directly or flatten the array to bytes.
-        # Sending memoryviews is theoretically faster, but in this particular
-        # case, it triggers some unnecessary copies anyway.
-        # With this, the tensors can still be zero-copy read.
-        arr_data = obj.tobytes()
+        # If the array is non-contiguous, we need to copy it first
+        arr_data = obj.data if obj.data.c_contiguous else obj.tobytes()
         if not obj.shape or obj.nbytes < self.size_threshold:
             # Encode small arrays and scalars inline. Using this extension type
             # ensures we can avoid copying when decoding.
@@ -136,11 +140,15 @@ def _encode_nested_tensors(self, obj: Any) -> NestedTensors:
         return [self._encode_nested_tensors(x) for x in obj]
 
     def _encode_field(self, field: BaseMultiModalField):
-        # Encode the field as a dictionary + special handling for .field
-        d = asdict(field)
-        # Strip first 10 characters and last 5 characters from the class name
-        # to get the field type name that matches the factory function name.
-        return (field.__class__.__name__[10:-5].lower(), *d.values())
+        # Figure out the factory name for the field type.
+        name = MMF_CLASS_TO_FACTORY.get(field.__class__)
+        if not name:
+            raise TypeError(f"Unsupported field type: {field.__class__}")
+        # We just need to copy all of the field values in order,
+        # which will then be used to reconstruct the field.
+        field_values = (getattr(field, f.name)
+                        for f in dataclasses.fields(field))
+        return (name, *field_values)
 
 
 class MsgpackDecoder: