Add threshold env var, re-do field serialization, cleanup

p88h · p88h · commit 578aab87a346 · 2025-04-15T09:41:15.000+02:00
addresses review comments

Signed-off-by: Staszek Pasko <staszek@gmail.com>
diff --git a/vllm/envs.py b/vllm/envs.py
@@ -107,6 +107,7 @@
     VLLM_TPU_BUCKET_PADDING_GAP: int = 0
     VLLM_USE_DEEP_GEMM: bool = False
     VLLM_XGRAMMAR_CACHE_MB: int = 0
+    VLLM_MSGPACK_ZERO_COPY_THRESHOLD: int = 256
 
 
 def get_default_cache_root():
@@ -704,6 +705,16 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:
     # It can be changed with this variable if needed for some reason.
     "VLLM_XGRAMMAR_CACHE_MB":
     lambda: int(os.getenv("VLLM_XGRAMMAR_CACHE_MB", "512")),
+
+    # Size threshold (in bytes) at which msgspec switches to 'zero
+    # copy' serialization/deserialization of tensors. Tensors smaller
+    # than this limit are encoded inline into the msgpack buffer,
+    # and tensors at or above it are sent via a separate message.
+    # While the sending side still actually copies the tensor
+    # in all cases, on the receiving side, tensors at or above this
+    # limit will actually be zero-copy decoded.
+    "VLLM_MSGPACK_ZERO_COPY_THRESHOLD":
+    lambda: int(os.getenv("VLLM_MSGPACK_ZERO_COPY_THRESHOLD", "256")),
 }
 
 # end-env-vars-definition
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
@@ -282,15 +282,6 @@ def reduce_data(self, elems: list[MultiModalFieldElem]) -> NestedTensors:
 
         return self._reduce_data([item.data for item in elems])
 
-    @abstractmethod
-    def field_type(self) -> tuple[Any, ...]:
-        """
-        Return the type of this field instance and constructor args.
-
-        Required for serialization.
-        """
-        raise NotImplementedError
-
 
 @dataclass(frozen=True)
 class MultiModalBatchedField(BaseMultiModalField):
@@ -321,9 +312,6 @@ def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors:
 
         return batch
 
-    def field_type(self) -> tuple[Any, ...]:
-        return ("batched", )
-
 
 @dataclass(frozen=True)
 class MultiModalFlatField(BaseMultiModalField):
@@ -356,9 +344,6 @@ def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors:
 
         return [e for elem in batch for e in elem]
 
-    def field_type(self) -> tuple[Any, ...]:
-        return ("flat", self.slices)
-
 
 @dataclass(frozen=True)
 class MultiModalSharedField(BaseMultiModalField):
@@ -380,9 +365,6 @@ def build_elems(
     def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors:
         return batch[0]
 
-    def field_type(self) -> tuple[Any, ...]:
-        return ("shared", self.batch_size)
-
 
 class MultiModalFieldConfig:
 
diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py
@@ -14,9 +14,10 @@
 import zmq
 from msgspec import msgpack
 
-from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalFieldElem,
-                                    MultiModalKwargs, MultiModalKwargsItem,
-                                    NestedTensors)
+from vllm import envs
+from vllm.multimodal.inputs import (BaseMultiModalField, MultiModalFieldConfig,
+                                    MultiModalFieldElem, MultiModalKwargs,
+                                    MultiModalKwargsItem, NestedTensors)
 
 CUSTOM_TYPE_PICKLE = 1
 CUSTOM_TYPE_CLOUDPICKLE = 2
@@ -39,16 +40,21 @@ class MsgpackEncoder:
     See: https://github.com/vllm-project/vllm/issues/16185
     """
 
-    def __init__(self, size_threshold=256):
+    def __init__(self, size_threshold=None):
+        if (size_threshold is None):
+            size_threshold = envs.VLLM_MSGPACK_ZERO_COPY_THRESHOLD
         self.encoder = msgpack.Encoder(enc_hook=self.enc_hook)
         # This is used as a local stash of buffers that we can then access from
         # our custom `msgspec` hook, `enc_hook`. We don't have a way to
         # pass custom data to the hook otherwise.
+        self.msg_buffer = bytearray()
         self.aux_buffers: Optional[list[bytestr]] = None
         self.size_threshold = size_threshold
 
+    # TODO - merge encode() and encode_into() and remove the need for
+    # externally managed serialization buffers.
     def encode(self, obj: Any) -> Sequence[bytestr]:
-        return self.encode_into(obj, bytearray())
+        return self.encode_into(obj, self.msg_buffer)
 
     def encode_into(self, obj: Any, buf: bytearray) -> Sequence[bytestr]:
         try:
@@ -85,9 +91,8 @@ def enc_hook(self, obj: Any) -> Any:
             ret = []
             for elem in obj.values():
                 # Encode as plain dictionary + special handling for .field
-                d = asdict(elem)
-                d["field"] = elem.field.field_type()
-                ret.append(d)
+                ret.append(
+                    asdict(elem) | {"field": self._encode_field(elem.field)})
             return ret
 
         if isinstance(obj, FunctionType):
@@ -106,8 +111,7 @@ def _encode_ndarray(
         # Sending memoryviews is theoretically faster, but in this particular
         # case, it triggers some unnecessary copies anyway.
         # With this, the tensors can still be zero-copy read.
-        arr_data = obj.data.tobytes() if obj.data.c_contiguous \
-            else obj.tobytes()
+        arr_data = obj.tobytes()
         if not obj.shape or obj.nbytes < self.size_threshold:
             # Encode small arrays and scalars inline. Using this extension type
             # ensures we can avoid copying when decoding.
@@ -122,6 +126,13 @@ def _encode_ndarray(
         # backing buffers that we've stashed in `aux_buffers`.
         return obj.dtype.str, obj.shape, data
 
+    def _encode_field(self, field: BaseMultiModalField):
+        # Encode the field as a tuple: (factory name, *dataclass field values).
+        d = asdict(field)
+        # Strip first 10 characters and last 5 characters from the class name
+        # to get the field type name that matches the factory function name.
+        return (field.__class__.__name__[10:-5].lower(), *d.values())
+
 
 class MsgpackDecoder:
     """Decoder with custom torch tensor and numpy array serialization.