Commit 176ba06

Copy memory when sending, zero copy when receiving

This helps reduce memory usage and keeps very good performance.

Signed-off-by: Staszek Pasko <[email protected]>
1 parent 2c0e9a8 commit 176ba06
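
As a rough, standalone illustration of the pattern in the commit title (this is not code from the repository; the array and variable names are invented): the sender pays for one explicit copy when it flattens an array into bytes, while the receiver can view the delivered buffer as an ndarray without copying.

import numpy as np

# Send side: one explicit copy of the array's memory into an immutable
# bytes object that is safe to hand off to the transport.
arr = np.arange(16, dtype=np.float32)
payload = arr.tobytes()

# Receive side: wrap the delivered buffer directly; np.frombuffer shares
# memory with `payload` instead of copying it.
view = np.frombuffer(payload, dtype=np.float32)
assert view.flags["OWNDATA"] is False  # no copy was made
assert not view.flags.writeable        # read-only view, hence the copy on decode for torch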

File tree

  tests/v1/test_serial_utils.py
  vllm/v1/serial_utils.py

2 files changed, +17 -14 lines

tests/v1/test_serial_utils.py

Lines changed: 7 additions & 9 deletions
@@ -105,18 +105,17 @@ def test_multimodal_kwargs():
     # pack mm kwargs into a mock request so that it can be decoded properly
     req = MyRequest(mm=[MultiModalKwargs(d)])
 
-    encoder = MsgpackEncoder(size_threshold=16 * 1024)
+    encoder = MsgpackEncoder()
     decoder = MsgpackDecoder(MyRequest)
 
     encoded = encoder.encode(req)
 
-    # Only "foo" is larger than threshold
-    assert len(encoded) == 2
+    assert len(encoded) == 6
 
     total_len = sum(len(x) for x in encoded)
 
-    # expected total encoding length, should be 24541, +-20 for minor changes
-    assert total_len >= 24521 and total_len <= 24561
+    # expected total encoding length, should be 44536, +-20 for minor changes
+    assert total_len >= 44516 and total_len <= 44556
     decoded: MultiModalKwargs = decoder.decode(encoded).mm[0]
     assert all(nested_equal(d[k], decoded[k]) for k in d)
 
@@ -150,13 +149,12 @@ def test_multimodal_items_by_modality():
 
     encoded = encoder.encode(req)
 
-    # All messages are 'small', i.e. below 256MB default
-    assert len(encoded) == 1
+    assert len(encoded) == 8
 
     total_len = sum([len(x) for x in encoded])
 
-    # expected total encoding length, should be 14287, +-20 for minor changes
-    assert total_len >= 14267 and total_len <= 14307
+    # expected total encoding length, should be 14255, +-20 for minor changes
+    assert total_len >= 14235 and total_len <= 14275
     decoded: MultiModalKwargs = decoder.decode(encoded).mm[0]
 
     # check all modalities were recovered and do some basic sanity checks
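
The new assertion counts follow from the threshold change in vllm/v1/serial_utils.py below: with the default size_threshold of 256 bytes, every tensor of at least 256 bytes is shipped as its own buffer instead of being inlined, so encode() returns the main msgpack message plus one frame per large array. A hypothetical sketch of that relationship (the arrays and sizes are invented, and it assumes the encoder accepts any msgspec-encodable object):

import numpy as np

from vllm.v1.serial_utils import MsgpackEncoder

small = np.zeros(8, dtype=np.uint8)     # 8 bytes, below the threshold -> inlined
large = np.zeros(1024, dtype=np.uint8)  # 1 KiB, above the threshold -> dedicated buffer

encoder = MsgpackEncoder()              # default size_threshold=256 after this change
encoded = encoder.encode({"small": small, "large": large})

# Under these assumptions: one main msgpack frame plus one aux buffer for `large`.
assert len(encoded) == 2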

vllm/v1/serial_utils.py

Lines changed: 10 additions & 5 deletions
@@ -31,15 +31,15 @@ class MsgpackEncoder:
     Note that unlike vanilla `msgspec` Encoders, this interface is generally
     not thread-safe when encoding tensors / numpy arrays.
 
-    By default, arrays below 256MB are serialized inline.
+    By default, arrays below 256B are serialized inline.
     Larger will get sent via dedicated messages.
     Note that this is a per-tensor limit.
 
     Sending multiple large messages via zeromq saturates memory very quickly.
     See: https://github.com/vllm-project/vllm/issues/16185
     """
 
-    def __init__(self, size_threshold=256 * 1024 * 1024):
+    def __init__(self, size_threshold=256):
         self.encoder = msgpack.Encoder(enc_hook=self.enc_hook)
         # This is used as a local stash of buffers that we can then access from
         # our custom `msgspec` hook, `enc_hook`. We don't have a way to
@@ -102,7 +102,12 @@ def _encode_ndarray(
         self, obj: np.ndarray
     ) -> tuple[str, tuple[int, ...], Union[int, memoryview]]:
         assert self.aux_buffers is not None
-        arr_data = obj.data if obj.data.c_contiguous else obj.tobytes()
+        # Either copy the memoryview directly or flatten the array to bytes.
+        # Sending memoryviews is theoretically faster, but in this particular
+        # case, it triggers some unnecessary copies anyway.
+        # With this, the tensors can still be zero-copy read.
+        arr_data = obj.data.tobytes() if obj.data.c_contiguous \
+            else obj.tobytes()
         if not obj.shape or obj.nbytes < self.size_threshold:
             # Encode small arrays and scalars inline. Using this extension type
             # ensures we can avoid copying when decoding.
@@ -165,8 +170,8 @@ def _decode_ndarray(self, arr: Any) -> np.ndarray:
         dtype, shape, data = arr
         # Copy from inline representation, otherwise Torch is unhappy since
         # the returned memory is non-writeable.
-        buffer = self.aux_buffers[data] if isinstance(
-            data, int) else bytearray(data).copy()
+        buffer = self.aux_buffers[data] if isinstance(data, int) \
+            else bytearray(data)
         return np.ndarray(buffer=buffer, dtype=np.dtype(dtype), shape=shape)
 
     def _decode_mm_items(self, obj: list) -> list[MultiModalKwargsItem]:
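
For the "zero copy when receiving" half, a sketch of how a multi-frame encoding like the one produced above might travel over zeromq; this uses plain pyzmq for illustration and is not the actual vLLM transport code:

import numpy as np
import zmq

ctx = zmq.Context.instance()
push = ctx.socket(zmq.PUSH)
pull = ctx.socket(zmq.PULL)
push.bind("inproc://frames")
pull.connect("inproc://frames")

# Send side: the frames are plain bytes objects (already copied out of the
# arrays), so letting zmq copy them once more keeps lifetime management simple.
frames = [b"header", np.arange(1024, dtype=np.uint8).tobytes()]
push.send_multipart(frames)

# Receive side: copy=False yields zmq.Frame objects whose .buffer is a
# memoryview over the received data, so the ndarray is built without a copy.
received = pull.recv_multipart(copy=False)
tensor_view = np.frombuffer(received[1].buffer, dtype=np.uint8)
assert tensor_view.nbytes == 1024

push.close()
pull.close()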
