diff --git a/images/01.jpg b/images/01.jpg
new file mode 100755
index 000000000000..9c10070b1e73
Binary files /dev/null and b/images/01.jpg differ
diff --git a/images/02.jpg b/images/02.jpg
new file mode 100755
index 000000000000..4c4a8eef1d6a
Binary files /dev/null and b/images/02.jpg differ
diff --git a/images/03.jpg b/images/03.jpg
new file mode 100755
index 000000000000..0e125a2c5aab
Binary files /dev/null and b/images/03.jpg differ
diff --git a/images/04.jpg b/images/04.jpg
new file mode 100755
index 000000000000..3db3034c5e84
Binary files /dev/null and b/images/04.jpg differ
diff --git a/images/05.jpg b/images/05.jpg
new file mode 100755
index 000000000000..e8ba056241b4
Binary files /dev/null and b/images/05.jpg differ
diff --git a/images/06.jpg b/images/06.jpg
new file mode 100755
index 000000000000..3a545d3ec892
Binary files /dev/null and b/images/06.jpg differ
diff --git a/images/07.jpg b/images/07.jpg
new file mode 100755
index 000000000000..08dd709d9056
Binary files /dev/null and b/images/07.jpg differ
diff --git a/images/08.jpg b/images/08.jpg
new file mode 100755
index 000000000000..f1cf8ff0cc72
Binary files /dev/null and b/images/08.jpg differ
diff --git a/images/09.jpg b/images/09.jpg
new file mode 100755
index 000000000000..d0c807fb8b66
Binary files /dev/null and b/images/09.jpg differ
diff --git a/images/10.jpg b/images/10.jpg
new file mode 100755
index 000000000000..09401554a771
Binary files /dev/null and b/images/10.jpg differ
diff --git a/images/11.jpg b/images/11.jpg
new file mode 100755
index 000000000000..e5328402a251
Binary files /dev/null and b/images/11.jpg differ
diff --git a/images/12.jpg b/images/12.jpg
new file mode 100755
index 000000000000..3dc87333c030
Binary files /dev/null and b/images/12.jpg differ
diff --git a/images/13.jpg b/images/13.jpg
new file mode 100755
index 000000000000..8cf0572d6fdc
Binary files /dev/null and b/images/13.jpg differ
diff --git a/images/14.jpg b/images/14.jpg
new file mode 100755
index 000000000000..2c3bface8727
Binary files /dev/null and b/images/14.jpg differ
diff --git a/images/r01.jpg b/images/r01.jpg
new file mode 100644
index 000000000000..b1992966a74d
Binary files /dev/null and b/images/r01.jpg differ
diff --git a/images/r02.jpg b/images/r02.jpg
new file mode 100644
index 000000000000..791f76601ad0
Binary files /dev/null and b/images/r02.jpg differ
diff --git a/images/r03.jpg b/images/r03.jpg
new file mode 100644
index 000000000000..4283d8b79875
Binary files /dev/null and b/images/r03.jpg differ
diff --git a/images/r04.jpg b/images/r04.jpg
new file mode 100644
index 000000000000..c69c07bf20f7
Binary files /dev/null and b/images/r04.jpg differ
diff --git a/images/r05.jpg b/images/r05.jpg
new file mode 100644
index 000000000000..9e75e3d126d7
Binary files /dev/null and b/images/r05.jpg differ
diff --git a/images/r06.jpg b/images/r06.jpg
new file mode 100644
index 000000000000..2f99dbb87af4
Binary files /dev/null and b/images/r06.jpg differ
diff --git a/images/r07.jpg b/images/r07.jpg
new file mode 100644
index 000000000000..f13b4d66e551
Binary files /dev/null and b/images/r07.jpg differ
diff --git a/images/r08.jpg b/images/r08.jpg
new file mode 100644
index 000000000000..8620f5413a76
Binary files /dev/null and b/images/r08.jpg differ
diff --git a/images/r09.jpg b/images/r09.jpg
new file mode 100644
index 000000000000..d60af7639001
Binary files /dev/null and b/images/r09.jpg differ
diff --git a/images/r10.jpg b/images/r10.jpg
new file mode 100644
index 000000000000..6a4d2229fdbd
Binary files /dev/null and b/images/r10.jpg differ
diff --git a/images/r11.jpg b/images/r11.jpg
new file mode 100644
index 000000000000..e25b30c264b1
Binary files /dev/null and b/images/r11.jpg differ
diff --git a/images/r12.jpg b/images/r12.jpg
new file mode 100644
index 000000000000..3ea859a16042
Binary files /dev/null and b/images/r12.jpg differ
diff --git a/images/r13.jpg b/images/r13.jpg
new file mode 100644
index 000000000000..c9c523fb669b
Binary files /dev/null and b/images/r13.jpg differ
diff --git a/images/r14.jpg b/images/r14.jpg
new file mode 100644
index 000000000000..945e068e89cb
Binary files /dev/null and b/images/r14.jpg differ
diff --git a/images/r15.jpg b/images/r15.jpg
new file mode 100644
index 000000000000..4283d8b79875
Binary files /dev/null and b/images/r15.jpg differ
diff --git a/images/r16.jpg b/images/r16.jpg
new file mode 100644
index 000000000000..f13b4d66e551
Binary files /dev/null and b/images/r16.jpg differ
diff --git a/images/result.jpg b/images/result.jpg
new file mode 100644
index 000000000000..904bb872fe22
Binary files /dev/null and b/images/result.jpg differ
diff --git a/tests/v1/test_serial_utils.py b/tests/v1/test_serial_utils.py
index 0fc3b074533d..9172a946d484 100644
--- a/tests/v1/test_serial_utils.py
+++ b/tests/v1/test_serial_utils.py
@@ -1,10 +1,15 @@
 # SPDX-License-Identifier: Apache-2.0
 from collections import UserDict
 from dataclasses import dataclass
+from typing import Optional
 
+import msgspec
 import numpy as np
 import torch
 
+from vllm.multimodal.inputs import (MultiModalBatchedField,
+                                    MultiModalFieldElem, MultiModalKwargs,
+                                    MultiModalKwargsItem, NestedTensors)
 from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder
 
 
@@ -36,7 +41,7 @@ def test_encode_decode():
         list_of_tensors=[
             torch.rand((1, 10), dtype=torch.float32),
             torch.rand((3, 5, 4000), dtype=torch.float64),
-            torch.tensor(1984),  # test scalar too
+            torch.tensor(1984),  # test scalar too,
         ],
         numpy_array=np.arange(512),
         unrecognized=UnrecognizedType(33),
@@ -70,6 +75,99 @@ def test_encode_decode():
     assert_equal(decoded2, obj)
 
 
+class MyRequest(msgspec.Struct):
+    mm: Optional[list[MultiModalKwargs]]
+
+
+def test_multimodal_kwargs():
+    d = {
+        "foo":
+        torch.zeros(1000, dtype=torch.float16),
+        "bar": [torch.zeros(i * 1000, dtype=torch.int8) for i in range(3)],
+        "baz": [
+            torch.rand((256), dtype=torch.float16),
+            [
+                torch.rand((1, 12), dtype=torch.float32),
+                torch.rand((3, 5, 7), dtype=torch.float64),
+            ], [torch.rand((4, 4), dtype=torch.float16)]
+        ],
+    }
+
+    # pack mm kwargs into a mock request so that it can be decoded properly
+    req = MyRequest(mm=[MultiModalKwargs(d)])
+
+    encoder = MsgpackEncoder()
+    decoder = MsgpackDecoder(MyRequest)
+
+    encoded = encoder.encode(req)
+
+    # 8 total tensors + top level buffer
+    assert len(encoded) == 9
+
+    total_len = sum([len(x) for x in encoded])
+
+    # expected total encoding length, should be 4362, +-20 for minor changes
+    assert total_len >= 4342 and total_len <= 4382
+    decoded: MultiModalKwargs = decoder.decode(encoded).mm[0]
+    assert all(nested_equal(d[k], decoded[k]) for k in d)
+
+
+def test_multimodal_items_by_modality():
+    e1 = MultiModalFieldElem("audio", "a0", torch.zeros(1000,
+                                                        dtype=torch.int16),
+                             MultiModalBatchedField())
+    e2 = MultiModalFieldElem(
+        "video",
+        "v0",
+        [torch.zeros(1000, dtype=torch.int8) for _ in range(4)],
+        MultiModalBatchedField(),
+    )
+    e3 = MultiModalFieldElem("image", "i0", torch.zeros(1000,
+                                                        dtype=torch.int32),
+                             MultiModalBatchedField())
+    e4 = MultiModalFieldElem("image", "i1", torch.zeros(1000,
+                                                        dtype=torch.int32),
+                             MultiModalBatchedField())
+    audio = MultiModalKwargsItem.from_elems([e1])
+    video = MultiModalKwargsItem.from_elems([e2])
+    image = MultiModalKwargsItem.from_elems([e3, e4])
+    mm = MultiModalKwargs.from_items([audio, video, image])
+
+    # pack mm kwargs into a mock request so that it can be decoded properly
+    req = MyRequest([mm])
+
+    encoder = MsgpackEncoder()
+    decoder = MsgpackDecoder(MyRequest)
+
+    encoded = encoder.encode(req)
+
+    # 7 total tensors + top level buffer
+    assert len(encoded) == 8
+
+    total_len = sum([len(x) for x in encoded])
+
+    # expected total encoding length, should be 7507, +-20 for minor changes
+    assert total_len >= 7487 and total_len <= 7527
+    decoded: MultiModalKwargs = decoder.decode(encoded).mm[0]
+
+    # check all modalities were recovered and do some basic sanity checks
+    assert len(decoded.modalities) == 3
+    images = decoded.get_items("image")
+    assert len(images) == 1
+    assert len(images[0].items()) == 2
+    assert list(images[0].keys()) == ["i0", "i1"]
+
+    # check the tensor contents and layout in the main dict
+    assert all(nested_equal(mm[k], decoded[k]) for k in mm)
+
+
+def nested_equal(a: NestedTensors, b: NestedTensors):
+    if isinstance(a, torch.Tensor):
+        return torch.equal(a, b)
+    else:
+        return all([nested_equal(x, y) for (x, y) in zip(a, b)])
+
+
 def assert_equal(obj1: MyType, obj2: MyType):
     assert torch.equal(obj1.tensor1, obj2.tensor1)
     assert obj1.a_string == obj2.a_string
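
Illustration (not part of the patch): the "N total tensors + top level buffer" counts asserted above come from the encoder returning one buffer for the msgpack'd message plus backing buffers for the tensor data. A minimal sketch, assuming a vLLM build that already contains the vllm/v1/serial_utils.py changes below:

import torch

from vllm.multimodal.inputs import MultiModalKwargs
from vllm.v1.serial_utils import MsgpackEncoder

encoder = MsgpackEncoder()

# A single 4000-byte float32 tensor is well above INLINE_BUF_SIZE_THRESHOLD,
# so it gets its own backing buffer alongside the top-level msgpack message.
frames = encoder.encode(MultiModalKwargs({"foo": torch.zeros(1000)}))
assert len(frames) == 2
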
diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py
index 99b352fdef80..2651540021fc 100644
--- a/vllm/v1/serial_utils.py
+++ b/vllm/v1/serial_utils.py
@@ -2,7 +2,9 @@
 
 import pickle
 from collections.abc import Sequence
+from dataclasses import asdict
 from inspect import isclass
+from itertools import chain
 from types import FunctionType
 from typing import Any, Optional, Union
@@ -12,12 +14,17 @@
 import zmq
 from msgspec import msgpack
 
+from vllm.logger import init_logger
+from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargs,
+                                    MultiModalKwargsItem, NestedTensors)
+
 CUSTOM_TYPE_PICKLE = 1
 CUSTOM_TYPE_CLOUDPICKLE = 2
 
 # TODO calibrate this size
 INLINE_BUF_SIZE_THRESHOLD = 256
 
+logger = init_logger(__name__)
 bytestr = Union[bytes, bytearray, memoryview, zmq.Frame]
 
 
@@ -64,6 +71,25 @@ def enc_hook(self, obj: Any) -> Any:
         if isinstance(obj, np.ndarray) and obj.dtype.kind not in ('O', 'V'):
             return self._encode_ndarray(obj)
 
+        if isinstance(obj, MultiModalKwargs):
+            mm: MultiModalKwargs = obj
+            if mm.modalities:
+                # Ignore the main dict; it is rebuilt from the list of
+                # MultiModalKwargsItem on decode (see MsgpackDecoder below).
+                # Any tensors *not* indexed by modality will be ignored.
+                return [mm.get_items(m) for m in mm.modalities]
+            # just return the main dict if there are no modalities
+            return {k: v for k, v in obj.items()}
+
+        if isinstance(obj, MultiModalKwargsItem):
+            rd = {}
+            for k, v in obj.items():
+                vv = asdict(v)
+                vv['field'] = pickle.dumps(v.field,
+                                           protocol=pickle.HIGHEST_PROTOCOL)
+                rd[k] = vv
+            return rd
+
         if isinstance(obj, FunctionType):
             # `pickle` is generally faster than cloudpickle, but can have
             # problems serializing methods.
@@ -87,7 +113,7 @@ def _encode_ndarray(
         # We serialize the ndarray as a tuple of native types.
         # The data is either inlined if small, or an index into a list of
         # backing buffers that we've stashed in `aux_buffers`.
-        return obj.dtype.str, obj.shape, data
+        return (obj.dtype.str, obj.shape, data)
 
 
 class MsgpackDecoder:
@@ -121,15 +147,39 @@ def dec_hook(self, t: type, obj: Any) -> Any:
         if isclass(t):
             if issubclass(t, np.ndarray):
                 return self._decode_ndarray(obj)
+            if issubclass(t, MultiModalKwargs) and isinstance(obj, dict):
+                return MultiModalKwargs(
+                    {k: self._decode_nested(obj[k])
+                     for k in obj})
+            if issubclass(t, MultiModalKwargs) and isinstance(obj, list):
+                return MultiModalKwargs.from_items(self._decode_items(obj))
             if issubclass(t, torch.Tensor):
                 return torch.from_numpy(self._decode_ndarray(obj))
         return obj
 
-    def _decode_ndarray(self, arr: Any) -> np.ndarray:
-        dtype, shape, data = arr
+    def _decode_ndarray(self, obj: Any) -> np.ndarray:
+        (dtype, shape, data) = obj
         buffer = self.aux_buffers[data] if isinstance(data, int) else data
         return np.ndarray(buffer=buffer, dtype=np.dtype(dtype), shape=shape)
 
+    def _decode_items(self, obj: list) -> list[MultiModalKwargsItem]:
+        all_items = []
+        for item in chain.from_iterable(obj):
+            elems = []
+            for v in item.values():
+                v['data'] = self._decode_nested(v['data'])
+                v['field'] = pickle.loads(v['field'])
+                elems.append(MultiModalFieldElem(**v))
+            all_items.append(MultiModalKwargsItem.from_elems(elems))
+        return all_items
+
+    def _decode_nested(self, obj: Any) -> NestedTensors:
+        if isinstance(obj, list) and isinstance(obj[0], str):
+            return torch.from_numpy(self._decode_ndarray(obj))
+        if isinstance(obj, list):
+            return [self._decode_nested(x) for x in obj]
+        raise TypeError(f"Unexpected NestedTensors contents: {obj}")
+
     def ext_hook(self, code: int, data: memoryview) -> Any:
         if code == CUSTOM_TYPE_PICKLE:
             return pickle.loads(data)
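
To tie the two sides together, here is a minimal end-to-end sketch of the intended usage (illustration only, not part of the patch; it assumes a vLLM build containing the changes above, the MyRequest wrapper mirrors the one defined in the tests, and the "pixel_values" key name is arbitrary):

from typing import Optional

import msgspec
import torch

from vllm.multimodal.inputs import (MultiModalBatchedField,
                                    MultiModalFieldElem, MultiModalKwargs,
                                    MultiModalKwargsItem)
from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder


class MyRequest(msgspec.Struct):
    mm: Optional[list[MultiModalKwargs]]


# One image item indexed by modality, so enc_hook takes the list-of-items
# path and dec_hook rebuilds the kwargs via MultiModalKwargs.from_items().
elem = MultiModalFieldElem("image", "pixel_values", torch.rand(3, 224, 224),
                           MultiModalBatchedField())
mm = MultiModalKwargs.from_items([MultiModalKwargsItem.from_elems([elem])])

encoder = MsgpackEncoder()
decoder = MsgpackDecoder(MyRequest)

# encode() produces the top-level msgpack message plus backing buffers for
# the tensor data; decode() reassembles an equivalent MultiModalKwargs.
frames = encoder.encode(MyRequest(mm=[mm]))
decoded = decoder.decode(frames).mm[0]

assert len(decoded.modalities) == 1
assert torch.equal(mm["pixel_values"], decoded["pixel_values"])
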