From 245c8698d2615d411c725bf9cd636513569cfa33 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Thu, 22 Aug 2024 15:51:30 -0700 Subject: [PATCH 01/46] First commit on DRIVERS-2926-BSON-Binary-Vectors --- bson/__init__.py | 25 +++++++- bson/binary.py | 7 +++ bson/vector.py | 156 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 186 insertions(+), 2 deletions(-) create mode 100644 bson/vector.py diff --git a/bson/__init__.py b/bson/__init__.py index e8ac7c4441..934618cb91 100644 --- a/bson/__init__.py +++ b/bson/__init__.py @@ -137,6 +137,7 @@ from bson.son import RE_TYPE, SON from bson.timestamp import Timestamp from bson.tz_util import utc +from bson.vector import DTYPE_CODES, INV_DTYPE_CODES, BinaryVector # Import some modules for type-checking only. if TYPE_CHECKING: @@ -150,6 +151,8 @@ except ImportError: _USE_C = False +_USE_C = False # TODO Replace + __all__ = [ "ALL_UUID_SUBTYPES", "CSHARP_LEGACY", @@ -158,6 +161,7 @@ "STANDARD", "UUID_SUBTYPE", "Binary", + "BinaryVector", "UuidRepresentation", "Code", "DEFAULT_CODEC_OPTIONS", @@ -379,7 +383,7 @@ def _get_binary( """Decode a BSON binary to bson.binary.Binary or python UUID.""" length, subtype = _UNPACK_LENGTH_SUBTYPE_FROM(data, position) position += 5 - if subtype == 2: + if subtype == 2: # length2 = _UNPACK_INT_FROM(data, position)[0] position += 4 if length2 != length - 4: @@ -404,6 +408,14 @@ def _get_binary( # Decode subtype 0 to 'bytes'. if subtype == 0: value = data[position:end] + elif subtype == 9: + dtype_int, padding = struct.unpack_from( + " _DocumentType try: if _raw_document_class(opts.document_class): return opts.document_class(data, opts) # type:ignore[call-arg] - _, end = _get_object_size(data, 0, len(data)) + _, end = _get_object_size(data, 0, len(data)) # todo - how does this work return cast("_DocumentType", _elements_to_dict(data, view, 4, end, opts)) except InvalidBSON: raise @@ -758,6 +770,14 @@ def _encode_binary(name: bytes, value: Binary, dummy0: Any, dummy1: Any) -> byte return b"\x05" + name + _PACK_LENGTH_SUBTYPE(len(value), subtype) + value +def _encode_vector(name: bytes, value: Any, dummy0: Any, dummy1: Any) -> bytes: + """Encode bson.binary.BinaryVector, a subtype of Binary.""" + metadata = struct.pack( + " bytes: """Encode uuid.UUID.""" uuid_representation = opts.uuid_representation @@ -887,6 +907,7 @@ def _encode_maxkey(name: bytes, dummy0: Any, dummy1: Any, dummy2: Any) -> bytes: type(None): _encode_none, uuid.UUID: _encode_uuid, Binary: _encode_binary, + BinaryVector: _encode_vector, Int64: _encode_long, Code: _encode_code, DBRef: _encode_dbref, diff --git a/bson/binary.py b/bson/binary.py index 5fe1bacd16..747a4b803f 100644 --- a/bson/binary.py +++ b/bson/binary.py @@ -191,6 +191,13 @@ class UuidRepresentation: """ +VECTOR_SUBTYPE = 9 +"""BSON binary subtype for densely packed vector data. + +.. versionadded:: 4.9 +""" + + USER_DEFINED_SUBTYPE = 128 """BSON binary subtype for any user defined structure. """ diff --git a/bson/vector.py b/bson/vector.py new file mode 100644 index 0000000000..69fd7fb183 --- /dev/null +++ b/bson/vector.py @@ -0,0 +1,156 @@ +from __future__ import annotations + +import logging +import struct +from typing import Any, List, Optional, Type + +import bson +from bson.binary import Binary + +logger = logging.getLogger(__name__) + +DTYPE_CODES = { + "int8": b"\x03", + "float32": b"\x27", + "bool": b"\x10", +} + +INV_DTYPE_CODES = {ord(v): k for k, v in DTYPE_CODES.items()} + +DTYPES_SUPPORTED = { + "int8": 8, # signed integers in [128, 127]. 8 bits + "bool": 1, # vector of bits received as ints in [0, 255] + "float32": 4, # +} + + +def int_to_bin(value, bit_width=8): + """Twos-complement representation of binary values. + Uses value of most-significant-bit (msb) to denote sign. + 0 is positive. 1 is negative + """ + if value >= 0: + binary_representation = bin(value)[2:].zfill(bit_width) + else: + # Compute two's complement + binary_representation = bin((1 << bit_width) + value)[2:] + return binary_representation + + +class BinaryVector(Binary): + """ + + TODO: + 1. Add bool_ + a. from_list + b. handle padding + c. as_list (first version as list of ints) + 2. Take dtype and padding out of binary payload. Move logic to __init__.py + 3. Turn dtype into enum + 4. Add docs + 5. Add a few simple tests. e.g. For empty and non-sensible inputs + 6. Get to BSON Specs + + + TODO: + - CLASS STRUCTURE + - Do we want to have a separate class or just add class methods? + - If we roll into Binary, we bake the dtype into the bytes. + - We just have to unroll this value when decoding + - What benefit do we get from subclassing this? ==> Not having to massively change BSON Spec! + - __eq__? not __hash__ though.. + -[DECODE] bson._get_binary + - because of _ELEMENT_GETTER[bson_type] -> bson._get_binary, the first line will get called. + - we then add dtype as 1 byte following that + - if subtype == 9: dtype = _UNPACK_DTYPE_FROM(data, position)[0]; position += 1 etc; _UNPACK_DTYPE_FROM = struct.Struct(" BinaryVector: + self = Binary.__new__(cls, data, bson.binary.VECTOR_SUBTYPE) + assert dtype in DTYPES_SUPPORTED + self.dtype = dtype # todo - decide if we wish to make private and expose via property + self.padding = padding + return self + + def __repr__(self) -> str: + return f"BinaryVector({bytes.__repr__(self)}, dtype={self.dtype}, padding={self.padding})" + + @classmethod + def from_list( + cls: Type[BinaryVector], num_list: List, dtype: str, padding: int = 0 + ) -> BinaryVector: + """Create a BSON Binary Vector subtype from a list of python objects. + + :param num_list: List of values + :param dtype: Data type of the values + :param padding: For fractional bytes, number of bits to ignore at end of vector. + :return: Binary packed data identified by dtype and padding. + """ + if dtype == "int8": # pack ints in [-128, 127] as signed int8 + format_str = "b" + elif dtype == "bool": # pack ints in [0, 255] as unsigned uint8 + format_str = "B" + elif dtype == "float32": # pack floats as float32 + format_str = "f" + else: + raise NotImplementedError("%s not yet supported" % dtype) + + data = struct.pack(f"{len(num_list)}{format_str}", *num_list) + return cls(data, dtype, padding) + + def as_list(self, dtype: Optional[str] = None, padding: Optional[int] = None) -> List[Any]: + """Create a list of python objects. + + BinaryVector was created with a specific dtype and padding. + The optional kwargs allow one to view data in other formats. + + :param dtype: Optional dtype to use instead of self.dtype + :param padding: Optional number of bytes to discard instead of self.padding + :return: List of numbers. + """ + dtype = dtype or self.dtype + padding = padding or self.padding + + if dtype == "bool": + n_values = len(self) # data packed as uint8 + unpacked_uint8s = struct.unpack(f"{n_values}B", self) + bits = [] + for uint8 in unpacked_uint8s: + bits.extend([int(bit) for bit in f"{uint8:08b}"]) + return bits[:-padding] + + elif dtype == "int8": + n_values = len(self) + dtype_format = "b" + format_string = f"{n_values}{dtype_format}" + unpacked_data = struct.unpack(format_string, self) + return list(unpacked_data) + + elif dtype == "float32": + n_bytes = len(self) + n_values = n_bytes // 4 + assert n_bytes % 4 == 0 + unpacked_data = struct.unpack(f"{n_values}f", self) + return list(unpacked_data) + + else: + raise NotImplementedError("BinaryVector dtype %i not yet supported" % dtype) From 031cd8cc2c49bf60e57a520f20c1886286bf9076 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Thu, 22 Aug 2024 17:54:49 -0700 Subject: [PATCH 02/46] Turns dtype into enum. Adds handling of padding, __eq__. Removal of notes. --- bson/__init__.py | 13 ++---- bson/vector.py | 119 ++++++++++++++++++----------------------------- 2 files changed, 48 insertions(+), 84 deletions(-) diff --git a/bson/__init__.py b/bson/__init__.py index 934618cb91..3b09dc2abd 100644 --- a/bson/__init__.py +++ b/bson/__init__.py @@ -137,7 +137,7 @@ from bson.son import RE_TYPE, SON from bson.timestamp import Timestamp from bson.tz_util import utc -from bson.vector import DTYPE_CODES, INV_DTYPE_CODES, BinaryVector +from bson.vector import BinaryVector # Import some modules for type-checking only. if TYPE_CHECKING: @@ -409,13 +409,10 @@ def _get_binary( if subtype == 0: value = data[position:end] elif subtype == 9: - dtype_int, padding = struct.unpack_from( - " byte def _encode_vector(name: bytes, value: Any, dummy0: Any, dummy1: Any) -> bytes: """Encode bson.binary.BinaryVector, a subtype of Binary.""" - metadata = struct.pack( - "= 0: - binary_representation = bin(value)[2:].zfill(bit_width) - else: - # Compute two's complement - binary_representation = bin((1 << bit_width) + value)[2:] - return binary_representation +DTYPE_FROM_HEX = {key.value: key for key in DTYPES} class BinaryVector(Binary): """ TODO: - 1. Add bool_ - a. from_list - b. handle padding - c. as_list (first version as list of ints) - 2. Take dtype and padding out of binary payload. Move logic to __init__.py - 3. Turn dtype into enum 4. Add docs - 5. Add a few simple tests. e.g. For empty and non-sensible inputs + 5. Add simple tests?. + - those in bson_vector.py + - empty and non-sensible inputs 6. Get to BSON Specs - - - TODO: - - CLASS STRUCTURE - - Do we want to have a separate class or just add class methods? - - If we roll into Binary, we bake the dtype into the bytes. - - We just have to unroll this value when decoding - - What benefit do we get from subclassing this? ==> Not having to massively change BSON Spec! - - __eq__? not __hash__ though.. - -[DECODE] bson._get_binary - - because of _ELEMENT_GETTER[bson_type] -> bson._get_binary, the first line will get called. - - we then add dtype as 1 byte following that - - if subtype == 9: dtype = _UNPACK_DTYPE_FROM(data, position)[0]; position += 1 etc; _UNPACK_DTYPE_FROM = struct.Struct(" BinaryVector: + def __new__(cls, data: Any, dtype: Union[DTYPES, bytes], padding: int = 0) -> BinaryVector: self = Binary.__new__(cls, data, bson.binary.VECTOR_SUBTYPE) - assert dtype in DTYPES_SUPPORTED - self.dtype = dtype # todo - decide if we wish to make private and expose via property + if isinstance(dtype, bytes): + dtype = DTYPE_FROM_HEX[dtype] + assert dtype in DTYPES + self.dtype = dtype # TODO - decide if we wish to make private and expose via property self.padding = padding return self - def __repr__(self) -> str: - return f"BinaryVector({bytes.__repr__(self)}, dtype={self.dtype}, padding={self.padding})" - @classmethod def from_list( - cls: Type[BinaryVector], num_list: List, dtype: str, padding: int = 0 + cls: Type[BinaryVector], num_list: List, dtype: DTYPES, padding: int = 0 ) -> BinaryVector: """Create a BSON Binary Vector subtype from a list of python objects. @@ -105,11 +55,11 @@ def from_list( :param padding: For fractional bytes, number of bits to ignore at end of vector. :return: Binary packed data identified by dtype and padding. """ - if dtype == "int8": # pack ints in [-128, 127] as signed int8 + if dtype == DTYPES.INT8: # pack ints in [-128, 127] as signed int8 format_str = "b" - elif dtype == "bool": # pack ints in [0, 255] as unsigned uint8 + elif dtype == DTYPES.BOOL: # pack ints in [0, 255] as unsigned uint8 format_str = "B" - elif dtype == "float32": # pack floats as float32 + elif dtype == DTYPES.FLOAT32: # pack floats as float32 format_str = "f" else: raise NotImplementedError("%s not yet supported" % dtype) @@ -117,7 +67,7 @@ def from_list( data = struct.pack(f"{len(num_list)}{format_str}", *num_list) return cls(data, dtype, padding) - def as_list(self, dtype: Optional[str] = None, padding: Optional[int] = None) -> List[Any]: + def as_list(self, dtype: Optional[DTYPES] = None, padding: Optional[int] = None) -> List[Any]: """Create a list of python objects. BinaryVector was created with a specific dtype and padding. @@ -130,7 +80,7 @@ def as_list(self, dtype: Optional[str] = None, padding: Optional[int] = None) -> dtype = dtype or self.dtype padding = padding or self.padding - if dtype == "bool": + if dtype == DTYPES.BOOL: n_values = len(self) # data packed as uint8 unpacked_uint8s = struct.unpack(f"{n_values}B", self) bits = [] @@ -138,14 +88,14 @@ def as_list(self, dtype: Optional[str] = None, padding: Optional[int] = None) -> bits.extend([int(bit) for bit in f"{uint8:08b}"]) return bits[:-padding] - elif dtype == "int8": + elif dtype == DTYPES.INT8: n_values = len(self) dtype_format = "b" format_string = f"{n_values}{dtype_format}" unpacked_data = struct.unpack(format_string, self) return list(unpacked_data) - elif dtype == "float32": + elif dtype == DTYPES.FLOAT32: n_bytes = len(self) n_values = n_bytes // 4 assert n_bytes % 4 == 0 @@ -153,4 +103,23 @@ def as_list(self, dtype: Optional[str] = None, padding: Optional[int] = None) -> return list(unpacked_data) else: - raise NotImplementedError("BinaryVector dtype %i not yet supported" % dtype) + raise NotImplementedError("BinaryVector dtype %s not yet supported" % dtype.name) + + def __repr__(self) -> str: + return f"BinaryVector({bytes.__repr__(self)}, dtype={self.dtype}, padding={self.padding})" + + def __eq__(self, other: Any) -> bool: + if isinstance(other, BinaryVector): + return ( + self.__subtype == other.subtype + and self.dtype == other.dtype + and self.padding == other.padding + and bytes(self) == bytes(other) + ) + return False + + def __ne__(self, other: Any) -> bool: + return not self == other + + def __hash__(self) -> int: + return super().__hash__() ^ hash(self.dtype) ^ hash(self.padding) From 8d4e8a2e031638b1ac5e52e200f419106dd6acd9 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Fri, 23 Aug 2024 09:41:24 -0700 Subject: [PATCH 03/46] Added docstring and comments --- bson/__init__.py | 1 + bson/vector.py | 26 ++++++++++++++++---------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/bson/__init__.py b/bson/__init__.py index 3b09dc2abd..8c2989ec18 100644 --- a/bson/__init__.py +++ b/bson/__init__.py @@ -408,6 +408,7 @@ def _get_binary( # Decode subtype 0 to 'bytes'. if subtype == 0: value = data[position:end] + # Decode subtype 9 to vector. elif subtype == 9: dtype, padding = struct.unpack_from(" BinaryVector: self = Binary.__new__(cls, data, bson.binary.VECTOR_SUBTYPE) if isinstance(dtype, bytes): dtype = DTYPE_FROM_HEX[dtype] assert dtype in DTYPES - self.dtype = dtype # TODO - decide if we wish to make private and expose via property + self.dtype = dtype self.padding = padding return self From 2df0d6b431c6a5f9832b6427513b91a66589574d Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Fri, 23 Aug 2024 10:18:57 -0700 Subject: [PATCH 04/46] Changed order of BinaryVector and Binary in bson._ENCODERS to get test.test_bson.TestBSON.test_encode_type_marker to pass --- bson/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bson/__init__.py b/bson/__init__.py index 8c2989ec18..7f26c5862a 100644 --- a/bson/__init__.py +++ b/bson/__init__.py @@ -902,8 +902,8 @@ def _encode_maxkey(name: bytes, dummy0: Any, dummy1: Any, dummy2: Any) -> bytes: tuple: _encode_list, type(None): _encode_none, uuid.UUID: _encode_uuid, - Binary: _encode_binary, BinaryVector: _encode_vector, + Binary: _encode_binary, Int64: _encode_long, Code: _encode_code, DBRef: _encode_dbref, From 315a11533ed1ae1c6c8b137784f4e949ce497419 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Fri, 23 Aug 2024 11:51:14 -0700 Subject: [PATCH 05/46] Changed order of BinaryVector and Binary in bson._ENCODERS to get test.test_bson.TestBSON.test_encode_type_marker to pass --- bson/vector.py | 2 +- test/test_bson.py | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/bson/vector.py b/bson/vector.py index cc42e68b76..9c2cf96a9e 100644 --- a/bson/vector.py +++ b/bson/vector.py @@ -117,7 +117,7 @@ def __repr__(self) -> str: def __eq__(self, other: Any) -> bool: if isinstance(other, BinaryVector): return ( - self.__subtype == other.subtype + self.subtype == other.subtype and self.dtype == other.dtype and self.padding == other.padding and bytes(self) == bytes(other) diff --git a/test/test_bson.py b/test/test_bson.py index a0190ef2d8..3664145e6b 100644 --- a/test/test_bson.py +++ b/test/test_bson.py @@ -63,6 +63,7 @@ from bson.son import SON from bson.timestamp import Timestamp from bson.tz_util import FixedOffset, utc +from bson.vector import DTYPES, BinaryVector class NotADict(abc.MutableMapping): @@ -148,6 +149,11 @@ def helper(doc): helper({"a binary": Binary(b"test", 128)}) helper({"a binary": Binary(b"test", 254)}) helper({"another binary": Binary(b"test", 2)}) + helper({"a binary vector": BinaryVector(b"\x01\x0f\xff", b"\x10", 3)}) + helper({"a binary vector": BinaryVector(b"\x01\x0f\xff", DTYPES.BOOL, 3)}) + helper({"a binary vector": BinaryVector(b"\x01\x0f\xff", b"\x03")}) + helper({"a binary vector": BinaryVector(b"\x01\x0f\xff", DTYPES.INT8)}) + helper({"a binary vector": BinaryVector(b"\xcd\xcc\x8c\xbf\xc7H7P", DTYPES.FLOAT32)}) helper(SON([("test dst", datetime.datetime(1993, 4, 4, 2))])) helper(SON([("test negative dst", datetime.datetime(1, 1, 1, 1, 1, 1))])) helper({"big float": float(10000000000)}) @@ -447,6 +453,18 @@ def test_basic_encode(self): encode({"test": Binary(b"test", 128)}), b"\x14\x00\x00\x00\x05\x74\x65\x73\x74\x00\x04\x00\x00\x00\x80\x74\x65\x73\x74\x00", ) + self.assertEqual( + encode({"vector_int8": BinaryVector.from_list([-128, -1, 127], DTYPES.INT8)}), + b"\x1c\x00\x00\x00\x05vector_int8\x00\x03\x00\x00\x00\t\x03\x00\x80\xff\x7f\x00", + ) + self.assertEqual( + encode({"vector_bool": BinaryVector.from_list([1, 127], DTYPES.BOOL)}), + b"\x1b\x00\x00\x00\x05vector_bool\x00\x02\x00\x00\x00\t\x10\x00\x01\x7f\x00", + ) + self.assertEqual( + encode({"vector_float32": BinaryVector.from_list([-1.1, 1.1e10], DTYPES.FLOAT32)}), + b"$\x00\x00\x00\x05vector_float32\x00\x08\x00\x00\x00\t'\x00\xcd\xcc\x8c\xbf\xac\xe9#P\x00", + ) self.assertEqual(encode({"test": None}), b"\x0B\x00\x00\x00\x0A\x74\x65\x73\x74\x00\x00") self.assertEqual( encode({"date": datetime.datetime(2007, 1, 8, 0, 30, 11)}), From d74314d44f41f3828dcdced9a5d13fd9d3665b38 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Fri, 23 Aug 2024 16:20:29 -0700 Subject: [PATCH 06/46] json_util dumps/loads of BinaryVector --- bson/json_util.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/bson/json_util.py b/bson/json_util.py index 4269ba9858..52ecdfc430 100644 --- a/bson/json_util.py +++ b/bson/json_util.py @@ -141,6 +141,7 @@ from bson.son import RE_TYPE from bson.timestamp import Timestamp from bson.tz_util import utc +from bson.vector import DTYPES, BinaryVector _RE_OPT_TABLE = { "i": re.I, @@ -608,6 +609,26 @@ def _parse_canonical_binary(doc: Any, json_options: JSONOptions) -> Union[Binary return _binary_or_uuid(data, int(subtype, 16), json_options) +def _parse_canonical_binary_vector(doc: Any, dummy0: Any) -> BinaryVector: + binary = doc["$binaryVector"] + b64 = binary["base64"] + dtype = getattr(DTYPES, binary["dtype"]) + padding = binary["padding"] + if not isinstance(b64, str): + raise TypeError(f"$binaryVector base64 must be a string: {doc}") + if not isinstance(dtype, DTYPES): + raise TypeError(f"$binaryVector dtype must a member of bson.vector.DTYPES: {doc}") + if not isinstance(padding, str) or len(padding) > 2: + raise TypeError(f"$binaryVector padding must be a string at most 2 characters: {doc}") + if len(binary) != 3: + raise TypeError( + f'$binaryVector must include only "base64", "dtype", and "padding" components: {doc}' + ) + + data = base64.b64decode(b64.encode()) + return BinaryVector(data, dtype, int(padding)) + + def _parse_canonical_datetime( doc: Any, json_options: JSONOptions ) -> Union[datetime.datetime, DatetimeMS]: @@ -820,6 +841,7 @@ def _parse_timestamp(doc: Any, dummy0: Any) -> Timestamp: "$minKey": _parse_canonical_minkey, "$maxKey": _parse_canonical_maxkey, "$binary": _parse_binary, + "$binaryVector": _parse_canonical_binary_vector, "$code": _parse_canonical_code, "$uuid": _parse_legacy_uuid, "$undefined": lambda _, _1: None, @@ -841,6 +863,22 @@ def _encode_binary(data: bytes, subtype: int, json_options: JSONOptions) -> Any: return {"$binary": {"base64": base64.b64encode(data).decode(), "subType": "%02x" % subtype}} +def _encode_binary_vector(obj: Any, json_options: JSONOptions) -> Any: + if json_options.json_mode == JSONMode.LEGACY: + return { + "$binaryVector": base64.b64encode(obj).decode(), + "dtype": "%s" % obj.dtype.name, + "padding": "%02x" % obj.padding, + } + return { + "$binaryVector": { + "base64": base64.b64encode(obj).decode(), + "dtype": "%s" % obj.dtype.name, + "padding": "%02x" % obj.padding, + } + } + + def _encode_datetimems(obj: Any, json_options: JSONOptions) -> dict: if ( json_options.datetime_representation == DatetimeRepresentation.ISO8601 @@ -992,6 +1030,7 @@ def _encode_maxkey(dummy0: Any, dummy1: Any) -> dict: str: _encode_noop, type(None): _encode_noop, uuid.UUID: _encode_uuid, + BinaryVector: _encode_binary_vector, Binary: _encode_binary_obj, Int64: _encode_int64, Code: _encode_code, From 27f13c85c1ba4d1bfbd9585797ed141f02ea267a Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Fri, 23 Aug 2024 18:18:17 -0700 Subject: [PATCH 07/46] Added bson_corpus tests. Needs more, and review of json_util --- bson/json_util.py | 13 ++++++++----- test/bson_corpus/vector.json | 12 ++++++++++++ 2 files changed, 20 insertions(+), 5 deletions(-) create mode 100644 test/bson_corpus/vector.json diff --git a/bson/json_util.py b/bson/json_util.py index 52ecdfc430..e768ca4fad 100644 --- a/bson/json_util.py +++ b/bson/json_util.py @@ -610,17 +610,20 @@ def _parse_canonical_binary(doc: Any, json_options: JSONOptions) -> Union[Binary def _parse_canonical_binary_vector(doc: Any, dummy0: Any) -> BinaryVector: - binary = doc["$binaryVector"] - b64 = binary["base64"] - dtype = getattr(DTYPES, binary["dtype"]) - padding = binary["padding"] + if "dtype" in doc: + b64 = doc["$binaryVector"] + else: + doc = doc["$binaryVector"] + b64 = doc["base64"] + dtype = getattr(DTYPES, doc["dtype"]) + padding = doc["padding"] if not isinstance(b64, str): raise TypeError(f"$binaryVector base64 must be a string: {doc}") if not isinstance(dtype, DTYPES): raise TypeError(f"$binaryVector dtype must a member of bson.vector.DTYPES: {doc}") if not isinstance(padding, str) or len(padding) > 2: raise TypeError(f"$binaryVector padding must be a string at most 2 characters: {doc}") - if len(binary) != 3: + if len(doc) != 3: raise TypeError( f'$binaryVector must include only "base64", "dtype", and "padding" components: {doc}' ) diff --git a/test/bson_corpus/vector.json b/test/bson_corpus/vector.json new file mode 100644 index 0000000000..f247c2ec47 --- /dev/null +++ b/test/bson_corpus/vector.json @@ -0,0 +1,12 @@ +{ + "description": "BinaryVector type [!! MORE TESTS REQUIRED. json_utils may need tweaking", + "bson_type": "0x05", + "test_key": "x", + "valid": [ + { + "description": "DTYPES.INT8 (Zero-length)", + "canonical_bson": "0f0000000578000000000009030000", + "canonical_extjson": "{\"x\": {\"$binaryVector\": {\"base64\": \"\", \"dtype\": \"INT8\", \"padding\": \"00\"}}}" + } + ] +} \ No newline at end of file From 263f8c7e1d19da6ec04515bc43a3ef0679aab4b3 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Thu, 12 Sep 2024 16:48:31 -0400 Subject: [PATCH 08/46] Removed BinaryVector as separate class. Instead, Binary includes as_vector, from_vector --- bson/__init__.py | 19 +------ bson/binary.py | 127 ++++++++++++++++++++++++++++++++++++++++++-- bson/json_util.py | 42 --------------- bson/vector.py | 131 ---------------------------------------------- test/test_bson.py | 60 ++++++++++++++++----- 5 files changed, 173 insertions(+), 206 deletions(-) delete mode 100644 bson/vector.py diff --git a/bson/__init__.py b/bson/__init__.py index 7f26c5862a..7ab64c788e 100644 --- a/bson/__init__.py +++ b/bson/__init__.py @@ -137,7 +137,6 @@ from bson.son import RE_TYPE, SON from bson.timestamp import Timestamp from bson.tz_util import utc -from bson.vector import BinaryVector # Import some modules for type-checking only. if TYPE_CHECKING: @@ -161,7 +160,6 @@ "STANDARD", "UUID_SUBTYPE", "Binary", - "BinaryVector", "UuidRepresentation", "Code", "DEFAULT_CODEC_OPTIONS", @@ -383,7 +381,7 @@ def _get_binary( """Decode a BSON binary to bson.binary.Binary or python UUID.""" length, subtype = _UNPACK_LENGTH_SUBTYPE_FROM(data, position) position += 5 - if subtype == 2: # + if subtype == 2: length2 = _UNPACK_INT_FROM(data, position)[0] position += 4 if length2 != length - 4: @@ -408,12 +406,6 @@ def _get_binary( # Decode subtype 0 to 'bytes'. if subtype == 0: value = data[position:end] - # Decode subtype 9 to vector. - elif subtype == 9: - dtype, padding = struct.unpack_from(" _DocumentType try: if _raw_document_class(opts.document_class): return opts.document_class(data, opts) # type:ignore[call-arg] - _, end = _get_object_size(data, 0, len(data)) # todo - how does this work + _, end = _get_object_size(data, 0, len(data)) return cast("_DocumentType", _elements_to_dict(data, view, 4, end, opts)) except InvalidBSON: raise @@ -768,12 +760,6 @@ def _encode_binary(name: bytes, value: Binary, dummy0: Any, dummy1: Any) -> byte return b"\x05" + name + _PACK_LENGTH_SUBTYPE(len(value), subtype) + value -def _encode_vector(name: bytes, value: Any, dummy0: Any, dummy1: Any) -> bytes: - """Encode bson.binary.BinaryVector, a subtype of Binary.""" - metadata = struct.pack(" bytes: """Encode uuid.UUID.""" uuid_representation = opts.uuid_representation @@ -902,7 +888,6 @@ def _encode_maxkey(name: bytes, dummy0: Any, dummy1: Any, dummy2: Any) -> bytes: tuple: _encode_list, type(None): _encode_none, uuid.UUID: _encode_uuid, - BinaryVector: _encode_vector, Binary: _encode_binary, Int64: _encode_long, Code: _encode_code, diff --git a/bson/binary.py b/bson/binary.py index 747a4b803f..478f8b3fe5 100644 --- a/bson/binary.py +++ b/bson/binary.py @@ -13,7 +13,10 @@ # limitations under the License. from __future__ import annotations -from typing import TYPE_CHECKING, Any, Tuple, Type, Union +import struct +from dataclasses import dataclass +from enum import Enum +from typing import TYPE_CHECKING, Any, Optional, Tuple, Type, Union from uuid import UUID """Tools for representing BSON binary data. @@ -203,16 +206,48 @@ class UuidRepresentation: """ +class BinaryVectorDtype(Enum): + """Datatypes of vector subtype. + + The PACKED_BIT value represents a special case where vector values themselves + can only hold two values (0 or 1) but these are packed together into groups of 8, + a byte. In Python, these are displayed as ints in range(0,128). + """ + + INT8 = b"\x03" + FLOAT32 = b"\x27" + PACKED_BIT = b"\x10" + + +# Map from bytes to enum value, for decoding. +DTYPE_FROM_HEX = {key.value: key for key in BinaryVectorDtype} + + +@dataclass +class BinaryVector: + """Vector of numbers along with metadata for binary interoperability. + + dtype specifies the data type stored in binary. + padding specifies the number of bits in the final byte that are to be ignored + when a vector element's size is less than a byte + and the length of the vector is not a multiple of 8.""" + + data: list[float | int] + dtype: BinaryVectorDtype + padding: Optional[int] = 0 + + class Binary(bytes): """Representation of BSON binary data. + # TODO Add Vector subtype description + This is necessary because we want to represent Python strings as the BSON string type. We need to wrap binary data so we can tell the difference between what should be considered binary data and what should be considered a string when we encode to BSON. - Raises TypeError if `data` is not an instance of :class:`bytes` - or `subtype` is not an instance of :class:`int`. + Raises TypeError if subtype` is not an instance of :class:`int`. Raises ValueError if `subtype` is not in [0, 256). .. note:: @@ -344,6 +379,92 @@ def as_uuid(self, uuid_representation: int = UuidRepresentation.STANDARD) -> UUI f"cannot decode subtype {self.subtype} to {UUID_REPRESENTATION_NAMES[uuid_representation]}" ) + @classmethod + def from_vector( + cls: Type[Binary], + vector: list[int, float], + dtype: BinaryVectorDtype, + padding: Optional[int] = 0, + ) -> Binary: + """Create a BSON Binary Vector subtype from a list of python objects. + + The data type and byte padding are prepended to the vector itself. + + :param vector: List of values + :param dtype: Data type of the values + :param padding: For fractional bytes, number of bits to ignore at end of vector. + :return: Binary packed data identified by dtype and padding. + """ + if dtype == BinaryVectorDtype.INT8: # pack ints in [-128, 127] as signed int8 + format_str = "b" + elif dtype == BinaryVectorDtype.PACKED_BIT: # pack ints in [0, 255] as unsigned uint8 + format_str = "B" + elif dtype == BinaryVectorDtype.FLOAT32: # pack floats as float32 + format_str = "f" + else: + raise NotImplementedError("%s not yet supported" % dtype) + + metadata = struct.pack(" BinaryVector: + """Create a list of python objects. + + The binary representation was created with a specific dtype and padding. + The optional kwargs allow one to view data in other formats, + which is particularly useful when one wishes to see binary/bit vectors + as INT8, hence with their proper lengths. + + :param dtype: Optional dtype to use instead of self.dtype + :param padding: Optional number of bytes to discard instead of self.padding + :return: List of numbers. + """ + + position = 0 + orig_dtype, orig_padding = struct.unpack_from(" 0: + unpacked_data = bits[:-padding] + else: + unpacked_data = bits + return BinaryVector(list(unpacked_data), dtype, padding) + + else: + dtype_format = "b" + format_string = f"{n_values}{dtype_format}" + unpacked_data = struct.unpack_from(format_string, self, position) + return BinaryVector(list(unpacked_data), dtype, padding) + + elif dtype == BinaryVectorDtype.FLOAT32: + n_bytes = len(self) - position + n_values = n_bytes // 4 + assert n_bytes % 4 == 0 + unpacked_data = struct.unpack_from(f"{n_values}f", self, position) + return BinaryVector(list(unpacked_data), dtype, padding) + else: + raise NotImplementedError("Binary Vector dtype %s not yet supported" % dtype.name) + @property def subtype(self) -> int: """Subtype of this binary data.""" diff --git a/bson/json_util.py b/bson/json_util.py index e768ca4fad..4269ba9858 100644 --- a/bson/json_util.py +++ b/bson/json_util.py @@ -141,7 +141,6 @@ from bson.son import RE_TYPE from bson.timestamp import Timestamp from bson.tz_util import utc -from bson.vector import DTYPES, BinaryVector _RE_OPT_TABLE = { "i": re.I, @@ -609,29 +608,6 @@ def _parse_canonical_binary(doc: Any, json_options: JSONOptions) -> Union[Binary return _binary_or_uuid(data, int(subtype, 16), json_options) -def _parse_canonical_binary_vector(doc: Any, dummy0: Any) -> BinaryVector: - if "dtype" in doc: - b64 = doc["$binaryVector"] - else: - doc = doc["$binaryVector"] - b64 = doc["base64"] - dtype = getattr(DTYPES, doc["dtype"]) - padding = doc["padding"] - if not isinstance(b64, str): - raise TypeError(f"$binaryVector base64 must be a string: {doc}") - if not isinstance(dtype, DTYPES): - raise TypeError(f"$binaryVector dtype must a member of bson.vector.DTYPES: {doc}") - if not isinstance(padding, str) or len(padding) > 2: - raise TypeError(f"$binaryVector padding must be a string at most 2 characters: {doc}") - if len(doc) != 3: - raise TypeError( - f'$binaryVector must include only "base64", "dtype", and "padding" components: {doc}' - ) - - data = base64.b64decode(b64.encode()) - return BinaryVector(data, dtype, int(padding)) - - def _parse_canonical_datetime( doc: Any, json_options: JSONOptions ) -> Union[datetime.datetime, DatetimeMS]: @@ -844,7 +820,6 @@ def _parse_timestamp(doc: Any, dummy0: Any) -> Timestamp: "$minKey": _parse_canonical_minkey, "$maxKey": _parse_canonical_maxkey, "$binary": _parse_binary, - "$binaryVector": _parse_canonical_binary_vector, "$code": _parse_canonical_code, "$uuid": _parse_legacy_uuid, "$undefined": lambda _, _1: None, @@ -866,22 +841,6 @@ def _encode_binary(data: bytes, subtype: int, json_options: JSONOptions) -> Any: return {"$binary": {"base64": base64.b64encode(data).decode(), "subType": "%02x" % subtype}} -def _encode_binary_vector(obj: Any, json_options: JSONOptions) -> Any: - if json_options.json_mode == JSONMode.LEGACY: - return { - "$binaryVector": base64.b64encode(obj).decode(), - "dtype": "%s" % obj.dtype.name, - "padding": "%02x" % obj.padding, - } - return { - "$binaryVector": { - "base64": base64.b64encode(obj).decode(), - "dtype": "%s" % obj.dtype.name, - "padding": "%02x" % obj.padding, - } - } - - def _encode_datetimems(obj: Any, json_options: JSONOptions) -> dict: if ( json_options.datetime_representation == DatetimeRepresentation.ISO8601 @@ -1033,7 +992,6 @@ def _encode_maxkey(dummy0: Any, dummy1: Any) -> dict: str: _encode_noop, type(None): _encode_noop, uuid.UUID: _encode_uuid, - BinaryVector: _encode_binary_vector, Binary: _encode_binary_obj, Int64: _encode_int64, Code: _encode_code, diff --git a/bson/vector.py b/bson/vector.py deleted file mode 100644 index 9c2cf96a9e..0000000000 --- a/bson/vector.py +++ /dev/null @@ -1,131 +0,0 @@ -from __future__ import annotations - -import logging -import struct -from enum import Enum -from typing import Any, List, Optional, Type, Union - -import bson -from bson.binary import Binary - -logger = logging.getLogger(__name__) - - -class DTYPES(Enum): - """Datatypes of vector.""" - - INT8 = b"\x03" - FLOAT32 = b"\x27" - BOOL = b"\x10" - - -# Map from bytes to enum value, for decoding. -DTYPE_FROM_HEX = {key.value: key for key in DTYPES} - - -class BinaryVector(Binary): - """Binary subtype for efficient storage and retrieval of vectors. - - Vectors here refer to densely packed one dimensional arrays of numbers, - all of the same data type (dtype). - These types loosely match those of PyArrow and Numpy. - - BinaryVector includes two additional bytes of metadata to the length and subtype - already prepended in the Binary class. - One byte defines the data type, described by the bson.binary.vector.DTYPE Enum. - Another byte declares the number of bits at the end that should be ignored, - in the case that a vector's length and type do not require a whole number of bytes. - This number is referred to as padding. - """ - - dtype: str - padding: int = 0 - - def __new__(cls, data: Any, dtype: Union[DTYPES, bytes], padding: int = 0) -> BinaryVector: - self = Binary.__new__(cls, data, bson.binary.VECTOR_SUBTYPE) - if isinstance(dtype, bytes): - dtype = DTYPE_FROM_HEX[dtype] - assert dtype in DTYPES - self.dtype = dtype - self.padding = padding - return self - - @classmethod - def from_list( - cls: Type[BinaryVector], num_list: List, dtype: DTYPES, padding: int = 0 - ) -> BinaryVector: - """Create a BSON Binary Vector subtype from a list of python objects. - - :param num_list: List of values - :param dtype: Data type of the values - :param padding: For fractional bytes, number of bits to ignore at end of vector. - :return: Binary packed data identified by dtype and padding. - """ - if dtype == DTYPES.INT8: # pack ints in [-128, 127] as signed int8 - format_str = "b" - elif dtype == DTYPES.BOOL: # pack ints in [0, 255] as unsigned uint8 - format_str = "B" - elif dtype == DTYPES.FLOAT32: # pack floats as float32 - format_str = "f" - else: - raise NotImplementedError("%s not yet supported" % dtype) - - data = struct.pack(f"{len(num_list)}{format_str}", *num_list) - return cls(data, dtype, padding) - - def as_list(self, dtype: Optional[DTYPES] = None, padding: Optional[int] = None) -> List[Any]: - """Create a list of python objects. - - BinaryVector was created with a specific dtype and padding. - The optional kwargs allow one to view data in other formats. - - :param dtype: Optional dtype to use instead of self.dtype - :param padding: Optional number of bytes to discard instead of self.padding - :return: List of numbers. - """ - dtype = dtype or self.dtype - padding = padding or self.padding - - if dtype == DTYPES.BOOL: - n_values = len(self) # data packed as uint8 - unpacked_uint8s = struct.unpack(f"{n_values}B", self) - bits = [] - for uint8 in unpacked_uint8s: - bits.extend([int(bit) for bit in f"{uint8:08b}"]) - return bits[:-padding] - - elif dtype == DTYPES.INT8: - n_values = len(self) - dtype_format = "b" - format_string = f"{n_values}{dtype_format}" - unpacked_data = struct.unpack(format_string, self) - return list(unpacked_data) - - elif dtype == DTYPES.FLOAT32: - n_bytes = len(self) - n_values = n_bytes // 4 - assert n_bytes % 4 == 0 - unpacked_data = struct.unpack(f"{n_values}f", self) - return list(unpacked_data) - - else: - raise NotImplementedError("BinaryVector dtype %s not yet supported" % dtype.name) - - def __repr__(self) -> str: - return f"BinaryVector({bytes.__repr__(self)}, dtype={self.dtype}, padding={self.padding})" - - def __eq__(self, other: Any) -> bool: - if isinstance(other, BinaryVector): - return ( - self.subtype == other.subtype - and self.dtype == other.dtype - and self.padding == other.padding - and bytes(self) == bytes(other) - ) - return False - - def __ne__(self, other: Any) -> bool: - return not self == other - - def __hash__(self) -> int: - return super().__hash__() ^ hash(self.dtype) ^ hash(self.padding) diff --git a/test/test_bson.py b/test/test_bson.py index 3664145e6b..abd7080e10 100644 --- a/test/test_bson.py +++ b/test/test_bson.py @@ -49,8 +49,9 @@ decode_iter, encode, is_valid, + json_util, ) -from bson.binary import USER_DEFINED_SUBTYPE, Binary, UuidRepresentation +from bson.binary import USER_DEFINED_SUBTYPE, Binary, BinaryVectorDtype, UuidRepresentation from bson.code import Code from bson.codec_options import CodecOptions, DatetimeConversion from bson.datetime_ms import _DATETIME_ERROR_SUGGESTION @@ -63,7 +64,6 @@ from bson.son import SON from bson.timestamp import Timestamp from bson.tz_util import FixedOffset, utc -from bson.vector import DTYPES, BinaryVector class NotADict(abc.MutableMapping): @@ -149,11 +149,9 @@ def helper(doc): helper({"a binary": Binary(b"test", 128)}) helper({"a binary": Binary(b"test", 254)}) helper({"another binary": Binary(b"test", 2)}) - helper({"a binary vector": BinaryVector(b"\x01\x0f\xff", b"\x10", 3)}) - helper({"a binary vector": BinaryVector(b"\x01\x0f\xff", DTYPES.BOOL, 3)}) - helper({"a binary vector": BinaryVector(b"\x01\x0f\xff", b"\x03")}) - helper({"a binary vector": BinaryVector(b"\x01\x0f\xff", DTYPES.INT8)}) - helper({"a binary vector": BinaryVector(b"\xcd\xcc\x8c\xbf\xc7H7P", DTYPES.FLOAT32)}) + helper({"binary packed bit vector": Binary(b"\x10\x00\x7f\x07", 9)}) + helper({"binary int8 vector": Binary(b"\x03\x00\x7f\x07", 9)}) + helper({"binary float32 vector": Binary(b"'\x00\x00\x00\xfeB\x00\x00\xe0@", 9)}) helper(SON([("test dst", datetime.datetime(1993, 4, 4, 2))])) helper(SON([("test negative dst", datetime.datetime(1, 1, 1, 1, 1, 1))])) helper({"big float": float(10000000000)}) @@ -454,16 +452,18 @@ def test_basic_encode(self): b"\x14\x00\x00\x00\x05\x74\x65\x73\x74\x00\x04\x00\x00\x00\x80\x74\x65\x73\x74\x00", ) self.assertEqual( - encode({"vector_int8": BinaryVector.from_list([-128, -1, 127], DTYPES.INT8)}), - b"\x1c\x00\x00\x00\x05vector_int8\x00\x03\x00\x00\x00\t\x03\x00\x80\xff\x7f\x00", + encode({"vector_int8": Binary.from_vector([-128, -1, 127], BinaryVectorDtype.INT8)}), + b"\x1c\x00\x00\x00\x05vector_int8\x00\x05\x00\x00\x00\t\x03\x00\x80\xff\x7f\x00", ) self.assertEqual( - encode({"vector_bool": BinaryVector.from_list([1, 127], DTYPES.BOOL)}), - b"\x1b\x00\x00\x00\x05vector_bool\x00\x02\x00\x00\x00\t\x10\x00\x01\x7f\x00", + encode({"vector_bool": Binary.from_vector([1, 127], BinaryVectorDtype.PACKED_BIT)}), + b"\x1b\x00\x00\x00\x05vector_bool\x00\x04\x00\x00\x00\t\x10\x00\x01\x7f\x00", ) self.assertEqual( - encode({"vector_float32": BinaryVector.from_list([-1.1, 1.1e10], DTYPES.FLOAT32)}), - b"$\x00\x00\x00\x05vector_float32\x00\x08\x00\x00\x00\t'\x00\xcd\xcc\x8c\xbf\xac\xe9#P\x00", + encode( + {"vector_float32": Binary.from_vector([-1.1, 1.1e10], BinaryVectorDtype.FLOAT32)} + ), + b"$\x00\x00\x00\x05vector_float32\x00\n\x00\x00\x00\t'\x00\xcd\xcc\x8c\xbf\xac\xe9#P\x00", ) self.assertEqual(encode({"test": None}), b"\x0B\x00\x00\x00\x0A\x74\x65\x73\x74\x00\x00") self.assertEqual( @@ -729,6 +729,40 @@ def test_uuid_legacy(self): transformed = bin.as_uuid(UuidRepresentation.PYTHON_LEGACY) self.assertEqual(id, transformed) + def test_vector(self): + list_vector = [127, 7] + # As INT8, vector has length 2 + binary_vector = Binary.from_vector(list_vector, BinaryVectorDtype.INT8) + vector = binary_vector.as_vector() + assert vector.data == list_vector + # test encoding roundtrip + assert {"vector": binary_vector} == decode(encode({"vector": binary_vector})) + # test json roundtrip # TODO - Is this the wrong place? + assert binary_vector == json_util.loads(json_util.dumps(binary_vector)) + + # For vectors of bits, aka PACKED_BIT type, vector has length 8 * 2 + packed_bit_binary = Binary.from_vector(list_vector, BinaryVectorDtype.PACKED_BIT) + packed_bit_vec = packed_bit_binary.as_vector() + assert packed_bit_vec.data == list_vector + # If we wish to see the bit vector unpacked to its true length, we can + unpacked_vec = packed_bit_binary.as_vector(BinaryVectorDtype.INT8) + assert len(unpacked_vec.data) == 8 * len(list_vector) + assert set(unpacked_vec.data) == {0, 1} + + # A padding parameter permits vectors of length that aren't divisible by 8 + # The following ignores the last 3 bits in list_vector, + # hence it's length is 8 * len(list_vector) - padding + padding = 3 + padded_vec = Binary.from_vector(list_vector, BinaryVectorDtype.PACKED_BIT, padding=padding) + assert padded_vec.as_vector().data == list_vector + assert ( + len(padded_vec.as_vector(BinaryVectorDtype.INT8).data) == 8 * len(list_vector) - padding + ) + + # FLOAT32 is also implemented + float_binary = Binary.from_vector(list_vector, BinaryVectorDtype.FLOAT32) + assert all(isinstance(d, float) for d in float_binary.as_vector().data) + # The C extension was segfaulting on unicode RegExs, so we have this test # that doesn't really test anything but the lack of a segfault. def test_unicode_regex(self): From f8bcdef693002e298c433bfe7ae39182633b1c67 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Fri, 13 Sep 2024 11:25:51 -0400 Subject: [PATCH 09/46] Stop setting _USD_C to False --- bson/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/bson/__init__.py b/bson/__init__.py index 7ab64c788e..e8ac7c4441 100644 --- a/bson/__init__.py +++ b/bson/__init__.py @@ -150,8 +150,6 @@ except ImportError: _USE_C = False -_USE_C = False # TODO Replace - __all__ = [ "ALL_UUID_SUBTYPES", "CSHARP_LEGACY", From 5435785ae2a6678681522ba2eadc6291c991ce94 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Fri, 13 Sep 2024 12:14:13 -0400 Subject: [PATCH 10/46] mypy fixes --- bson/binary.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/bson/binary.py b/bson/binary.py index 478f8b3fe5..9502ecf4ce 100644 --- a/bson/binary.py +++ b/bson/binary.py @@ -16,7 +16,7 @@ import struct from dataclasses import dataclass from enum import Enum -from typing import TYPE_CHECKING, Any, Optional, Tuple, Type, Union +from typing import TYPE_CHECKING, Any, Optional, Sequence, Tuple, Type, Union from uuid import UUID """Tools for representing BSON binary data. @@ -232,7 +232,7 @@ class BinaryVector: when a vector element's size is less than a byte and the length of the vector is not a multiple of 8.""" - data: list[float | int] + data: Sequence[float | int] dtype: BinaryVectorDtype padding: Optional[int] = 0 @@ -434,34 +434,34 @@ def as_vector( if dtype == BinaryVectorDtype.PACKED_BIT: # data packed as uint8 dtype_format = "B" - unpacked_uint8s = struct.unpack_from(f"{n_values}{dtype_format}", self, position) - return BinaryVector(list(unpacked_uint8s), dtype, padding) + unpacked_uint8s = list(struct.unpack_from(f"{n_values}{dtype_format}", self, position)) + return BinaryVector(unpacked_uint8s, dtype, padding) elif dtype == BinaryVectorDtype.INT8: if orig_dtype == BinaryVectorDtype.PACKED_BIT: # Special case for when wishes to see UNPACKED embedding - unpacked_uint8s = struct.unpack_from(f"{n_values}B", self, position) + unpacked_uint8s = list(struct.unpack_from(f"{n_values}B", self, position)) bits = [] for uint8 in unpacked_uint8s: bits.extend([int(bit) for bit in f"{uint8:08b}"]) if padding and padding > 0: - unpacked_data = bits[:-padding] + unpacked_data: list[int] = bits[:-padding] else: unpacked_data = bits - return BinaryVector(list(unpacked_data), dtype, padding) + return BinaryVector(unpacked_data, dtype, padding) else: dtype_format = "b" format_string = f"{n_values}{dtype_format}" - unpacked_data = struct.unpack_from(format_string, self, position) - return BinaryVector(list(unpacked_data), dtype, padding) + unpacked_data = list(struct.unpack_from(format_string, self, position)) + return BinaryVector(unpacked_data, dtype, padding) elif dtype == BinaryVectorDtype.FLOAT32: n_bytes = len(self) - position n_values = n_bytes // 4 assert n_bytes % 4 == 0 - unpacked_data = struct.unpack_from(f"{n_values}f", self, position) - return BinaryVector(list(unpacked_data), dtype, padding) + unpacked_data = list(struct.unpack_from(f"{n_values}f", self, position)) + return BinaryVector(unpacked_data, dtype, padding) else: raise NotImplementedError("Binary Vector dtype %s not yet supported" % dtype.name) From 5c4d152983b114f6357188c5a0dd7ce4c797ac73 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Fri, 13 Sep 2024 13:17:05 -0400 Subject: [PATCH 11/46] Removed stub vector.json for bson_corpus tests --- test/bson_corpus/vector.json | 12 ------------ 1 file changed, 12 deletions(-) delete mode 100644 test/bson_corpus/vector.json diff --git a/test/bson_corpus/vector.json b/test/bson_corpus/vector.json deleted file mode 100644 index f247c2ec47..0000000000 --- a/test/bson_corpus/vector.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "description": "BinaryVector type [!! MORE TESTS REQUIRED. json_utils may need tweaking", - "bson_type": "0x05", - "test_key": "x", - "valid": [ - { - "description": "DTYPES.INT8 (Zero-length)", - "canonical_bson": "0f0000000578000000000009030000", - "canonical_extjson": "{\"x\": {\"$binaryVector\": {\"base64\": \"\", \"dtype\": \"INT8\", \"padding\": \"00\"}}}" - } - ] -} \ No newline at end of file From f86d04082087e82fec28337e603a82707a52837b Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Fri, 13 Sep 2024 17:05:42 -0400 Subject: [PATCH 12/46] More tests --- test/bson_corpus/binary.json | 30 ++++++++++++++++++++++++++++++ test/test_bson.py | 29 ++++++++++++++++++++++++++--- 2 files changed, 56 insertions(+), 3 deletions(-) diff --git a/test/bson_corpus/binary.json b/test/bson_corpus/binary.json index 20aaef743b..0e0056f3a2 100644 --- a/test/bson_corpus/binary.json +++ b/test/bson_corpus/binary.json @@ -74,6 +74,36 @@ "description": "$type query operator (conflicts with legacy $binary form with $type field)", "canonical_bson": "180000000378001000000010247479706500020000000000", "canonical_extjson": "{\"x\" : { \"$type\" : {\"$numberInt\": \"2\"}}}" + }, + { + "description": "subtype 0x09 Vector FLOAT32", + "canonical_bson": "170000000578000A0000000927000000FE420000E04000", + "canonical_extjson": "{\"x\": {\"$binary\": {\"base64\": \"JwAAAP5CAADgQA==\", \"subType\": \"09\"}}}" + }, + { + "description": "subtype 0x09 Vector INT8", + "canonical_bson": "11000000057800040000000903007F0700", + "canonical_extjson": "{\"x\": {\"$binary\": {\"base64\": \"AwB/Bw==\", \"subType\": \"09\"}}}" + }, + { + "description": "subtype 0x09 Vector PACKED_BIT", + "canonical_bson": "11000000057800040000000910007F0700", + "canonical_extjson": "{\"x\": {\"$binary\": {\"base64\": \"EAB/Bw==\", \"subType\": \"09\"}}}" + }, + { + "description": "subtype 0x09 Vector (Zero-length) FLOAT32", + "canonical_bson": "0F0000000578000200000009270000", + "canonical_extjson": "{\"x\": {\"$binary\": {\"base64\": \"JwA=\", \"subType\": \"09\"}}}" + }, + { + "description": "subtype 0x09 Vector (Zero-length) INT8", + "canonical_bson": "0F0000000578000200000009030000", + "canonical_extjson": "{\"x\": {\"$binary\": {\"base64\": \"AwA=\", \"subType\": \"09\"}}}" + }, + { + "description": "subtype 0x09 Vector (Zero-length) PACKED_BIT", + "canonical_bson": "0F0000000578000200000009100000", + "canonical_extjson": "{\"x\": {\"$binary\": {\"base64\": \"EAA=\", \"subType\": \"09\"}}}" } ], "decodeErrors": [ diff --git a/test/test_bson.py b/test/test_bson.py index abd7080e10..67400e357a 100644 --- a/test/test_bson.py +++ b/test/test_bson.py @@ -730,6 +730,9 @@ def test_uuid_legacy(self): self.assertEqual(id, transformed) def test_vector(self): + """Tests of subtype 9""" + # We start with valid cases, across the 3 dtypes implemented. + # Work with a simple vector that can be interpreted as int8, float32, or ubyte list_vector = [127, 7] # As INT8, vector has length 2 binary_vector = Binary.from_vector(list_vector, BinaryVectorDtype.INT8) @@ -737,7 +740,7 @@ def test_vector(self): assert vector.data == list_vector # test encoding roundtrip assert {"vector": binary_vector} == decode(encode({"vector": binary_vector})) - # test json roundtrip # TODO - Is this the wrong place? + # test json roundtrip assert binary_vector == json_util.loads(json_util.dumps(binary_vector)) # For vectors of bits, aka PACKED_BIT type, vector has length 8 * 2 @@ -759,13 +762,33 @@ def test_vector(self): len(padded_vec.as_vector(BinaryVectorDtype.INT8).data) == 8 * len(list_vector) - padding ) + # It is worthwhile explicitly showing the values encoded to BSON + padded_doc = {"padded_vec": padded_vec} + assert ( + encode(padded_doc) + == b"\x1a\x00\x00\x00\x05padded_vec\x00\x04\x00\x00\x00\t\x10\x03\x7f\x07\x00" + ) + # and dumped to json + assert ( + json_util.dumps(padded_doc) + == '{"padded_vec": {"$binary": {"base64": "EAN/Bw==", "subType": "09"}}}' + ) + # FLOAT32 is also implemented float_binary = Binary.from_vector(list_vector, BinaryVectorDtype.FLOAT32) assert all(isinstance(d, float) for d in float_binary.as_vector().data) - # The C extension was segfaulting on unicode RegExs, so we have this test - # that doesn't really test anything but the lack of a segfault. + # Now some invalid cases + for x in [-1, 257]: + try: + Binary.from_vector([x], BinaryVectorDtype.PACKED_BIT) + except struct.error as e: + assert str(e) == "ubyte format requires 0 <= number <= 255" + def test_unicode_regex(self): + """Tests we do not get a segfault for C extension on unicode RegExs. + This had been happening. + """ regex = re.compile("revisi\xf3n") decode(encode({"regex": regex})) From adcb9450e6099c118ef4245cdfb214077e159efe Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Sat, 14 Sep 2024 16:41:57 -0400 Subject: [PATCH 13/46] Added description of subtype 9 to bson.Binary docstring --- bson/binary.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/bson/binary.py b/bson/binary.py index 9502ecf4ce..f0e63b8639 100644 --- a/bson/binary.py +++ b/bson/binary.py @@ -240,13 +240,17 @@ class BinaryVector: class Binary(bytes): """Representation of BSON binary data. - # TODO Add Vector subtype description - - This is necessary because we want to represent Python strings as - the BSON string type. We need to wrap binary data so we can tell + We want to represent Python strings as the BSON string type. + We need to wrap binary data so that we can tell the difference between what should be considered binary data and what should be considered a string when we encode to BSON. + Subtype 9 provides a space-efficient representation of 1-dimensional vector data. + Its data is prepended with two bytes of metadata. + The first (dtype) describes its data type, such as float32 or int8. + The second (padding) prescribes the number of bits to ignore in the final byte. + This is relevant when the element size of the dtype is not a multiple of 8. + Raises TypeError if subtype` is not an instance of :class:`int`. Raises ValueError if `subtype` is not in [0, 256). @@ -259,8 +263,9 @@ class Binary(bytes): `_ to use - .. versionchanged:: 3.9 - Support any bytes-like type that implements the buffer protocol. + .. versionchanged:: + 3.9 Support any bytes-like type that implements the buffer protocol. + 4.9 Addition of vector subtype. """ _type_marker = 5 From 7986cc531fd7dca02e75302396b73f71ad9048d6 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Mon, 16 Sep 2024 16:59:26 -0400 Subject: [PATCH 14/46] Addressed comments in docstrings. --- bson/binary.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/bson/binary.py b/bson/binary.py index f0e63b8639..ff310bd7db 100644 --- a/bson/binary.py +++ b/bson/binary.py @@ -209,9 +209,13 @@ class UuidRepresentation: class BinaryVectorDtype(Enum): """Datatypes of vector subtype. + FLOAT32: Pack floats as float32 + INT8: Pack ints in [-128, 127] as signed int8 + PACKED_BIT: Pack ints in [0, 255] as unsigned uint8 + The PACKED_BIT value represents a special case where vector values themselves can only hold two values (0 or 1) but these are packed together into groups of 8, - a byte. In Python, these are displayed as ints in range(0,128). + a byte. In Python, these are displayed as ints in range [0, 255] """ INT8 = b"\x03" @@ -263,9 +267,11 @@ class Binary(bytes): `_ to use - .. versionchanged:: - 3.9 Support any bytes-like type that implements the buffer protocol. - 4.9 Addition of vector subtype. + .. versionchanged:: 3.9 + Support any bytes-like type that implements the buffer protocol. + + .. versionchanged:: 4.9 + Addition of vector subtype. """ _type_marker = 5 From 26b8398530b3f7d5bf1672c6ba78427a52ae4102 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Mon, 16 Sep 2024 18:38:21 -0400 Subject: [PATCH 15/46] Eased string comparison of exception in xfail in test_bson --- test/test_bson.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_bson.py b/test/test_bson.py index 67400e357a..a1c0be4e91 100644 --- a/test/test_bson.py +++ b/test/test_bson.py @@ -783,7 +783,7 @@ def test_vector(self): try: Binary.from_vector([x], BinaryVectorDtype.PACKED_BIT) except struct.error as e: - assert str(e) == "ubyte format requires 0 <= number <= 255" + assert "format requires 0 <= number <= 255" in str(e) def test_unicode_regex(self): """Tests we do not get a segfault for C extension on unicode RegExs. From 28de28a0e4b9f39b1056744cec938a9bdfb3bf6c Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Tue, 17 Sep 2024 15:48:51 -0400 Subject: [PATCH 16/46] Updates to docstrings of BinaryVector and BinaryVectorDtype --- bson/binary.py | 37 ++++++++++++++++++++++++++----------- doc/api/bson/binary.rst | 8 ++++++++ 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/bson/binary.py b/bson/binary.py index ff310bd7db..780e271e28 100644 --- a/bson/binary.py +++ b/bson/binary.py @@ -209,13 +209,17 @@ class UuidRepresentation: class BinaryVectorDtype(Enum): """Datatypes of vector subtype. - FLOAT32: Pack floats as float32 - INT8: Pack ints in [-128, 127] as signed int8 - PACKED_BIT: Pack ints in [0, 255] as unsigned uint8 + :param FLOAT32: (0x27) Pack list of :class:`float` as float32 + :param INT8: (0x03) Pack list of :class:`int` in [-128, 127] as signed int8 + :param PACKED_BIT: (0x10) Pack list of :class:`int` in [0, 255] as unsigned uint8 - The PACKED_BIT value represents a special case where vector values themselves - can only hold two values (0 or 1) but these are packed together into groups of 8, + The `PACKED_BIT` value represents a special case where vector values themselves + can only be of two values (0 or 1) but these are packed together into groups of 8, a byte. In Python, these are displayed as ints in range [0, 255] + + Each value is of type bytes with a length of one. + + .. versionadded:: 4.9 """ INT8 = b"\x03" @@ -231,10 +235,14 @@ class BinaryVectorDtype(Enum): class BinaryVector: """Vector of numbers along with metadata for binary interoperability. - dtype specifies the data type stored in binary. - padding specifies the number of bits in the final byte that are to be ignored - when a vector element's size is less than a byte - and the length of the vector is not a multiple of 8.""" + :param data: Sequence of numbers representing the mathematical vector. + :param dtype: The data type stored in binary + :param padding: The number of bits in the final byte that are to be ignored + when a vector element's size is less than a byte + and the length of the vector is not a multiple of 8. + + .. versionadded:: 4.9 + """ data: Sequence[float | int] dtype: BinaryVectorDtype @@ -397,14 +405,19 @@ def from_vector( dtype: BinaryVectorDtype, padding: Optional[int] = 0, ) -> Binary: - """Create a BSON Binary Vector subtype from a list of python objects. + """Create a BSON :class:`~bson.binary.Binary` of Vector subtype from a list of Numbers. - The data type and byte padding are prepended to the vector itself. + To interpret the representation of the numbers, a data type must be included. + See :class:`~bson.binary.BinaryVectorDtype` for available types and descriptions. + + The dtype and padding are prepended to the binary data's value. :param vector: List of values :param dtype: Data type of the values :param padding: For fractional bytes, number of bits to ignore at end of vector. :return: Binary packed data identified by dtype and padding. + + .. versionadded:: 4.9 """ if dtype == BinaryVectorDtype.INT8: # pack ints in [-128, 127] as signed int8 format_str = "b" @@ -432,6 +445,8 @@ def as_vector( :param dtype: Optional dtype to use instead of self.dtype :param padding: Optional number of bytes to discard instead of self.padding :return: List of numbers. + + .. versionadded:: 4.9 """ position = 0 diff --git a/doc/api/bson/binary.rst b/doc/api/bson/binary.rst index c933a687b9..084fd02d50 100644 --- a/doc/api/bson/binary.rst +++ b/doc/api/bson/binary.rst @@ -21,6 +21,14 @@ .. autoclass:: UuidRepresentation :members: + .. autoclass:: BinaryVectorDtype + :members: + :show-inheritance: + + .. autoclass:: BinaryVector + :members: + + .. autoclass:: Binary(data, subtype=BINARY_SUBTYPE) :members: :show-inheritance: From 68235b801389bcad18d72ca53238175fae6dcfa0 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Tue, 17 Sep 2024 17:19:48 -0400 Subject: [PATCH 17/46] Simplified expected exeption case. Will be refactored with yaml anyway.. --- test/test_bson.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/test_bson.py b/test/test_bson.py index a1c0be4e91..33811468c0 100644 --- a/test/test_bson.py +++ b/test/test_bson.py @@ -782,8 +782,10 @@ def test_vector(self): for x in [-1, 257]: try: Binary.from_vector([x], BinaryVectorDtype.PACKED_BIT) - except struct.error as e: - assert "format requires 0 <= number <= 255" in str(e) + except Exception as exc: + self.assertTrue(isinstance(exc, struct.error)) + else: + self.fail("Failed to raise an exception.") def test_unicode_regex(self): """Tests we do not get a segfault for C extension on unicode RegExs. From e2a1a3c0be25258ca77ad049423ecaae316f329c Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Wed, 18 Sep 2024 18:26:31 -0400 Subject: [PATCH 18/46] Added draft of test runner --- .../bson_binary_vector/vector-test-cases.json | 87 ++++++++++++++ test/test_bson_binary_vector.py | 113 ++++++++++++++++++ 2 files changed, 200 insertions(+) create mode 100644 test/bson_binary_vector/vector-test-cases.json create mode 100644 test/test_bson_binary_vector.py diff --git a/test/bson_binary_vector/vector-test-cases.json b/test/bson_binary_vector/vector-test-cases.json new file mode 100644 index 0000000000..6c86b43300 --- /dev/null +++ b/test/bson_binary_vector/vector-test-cases.json @@ -0,0 +1,87 @@ +{ + "description": "Basic Tests of Binary Vectors, subtype 9", + "test_key": "vector", + "tests": [ + { + "description": "Simple Vector INT8", + "valid": true, + "vector": [127, 7], + "dtype_hex": "0x03", + "dtype_alias": "INT8", + "padding": 0, + "canonical_bson": "1600000005766563746F7200040000000903007F0700", + "canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"AwB/Bw==\", \"subType\": \"09\"}}}" + }, + { + "description": "Simple Vector FLOAT32", + "valid": true, + "vector": [127.0, 7.0], + "dtype_hex": "0x27", + "dtype_alias": "FLOAT32", + "padding": 0, + "canonical_bson": "1C00000005766563746F72000A0000000927000000FE420000E04000", + "canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"JwAAAP5CAADgQA==\", \"subType\": \"09\"}}}" + }, + { + "description": "Simple Vector PACKED_BIT", + "valid": true, + "vector": [127, 7], + "dtype_hex": "0x10", + "dtype_alias": "PACKED_BIT", + "padding": 0, + "canonical_bson": "1600000005766563746F7200040000000910007F0700", + "canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"EAB/Bw==\", \"subType\": \"09\"}}}" + }, + { + "description": "Empty Vector INT8", + "valid": true, + "vector": [], + "dtype_hex": "0x03", + "dtype_alias": "INT8", + "padding": 0, + "canonical_bson": "1400000005766563746F72000200000009030000", + "canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"AwA=\", \"subType\": \"09\"}}}" + }, + { + "description": "Empty Vector FLOAT32", + "valid": true, + "vector": [], + "dtype_hex": "0x27", + "dtype_alias": "FLOAT32", + "padding": 0, + "canonical_bson": "1400000005766563746F72000200000009270000", + "canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"JwA=\", \"subType\": \"09\"}}}" + }, + { + "description": "Empty Vector PACKED_BIT", + "valid": true, + "vector": [], + "dtype_hex": "0x10", + "dtype_alias": "PACKED_BIT", + "padding": 0, + "canonical_bson": "1400000005766563746F72000200000009100000", + "canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"EAA=\", \"subType\": \"09\"}}}" + }, + { + "description": "Infinity Vector FLOAT32", + "valid": true, + "vector": ["-inf", 0.0, "inf"], + "dtype_hex": "0x27", + "dtype_alias": "FLOAT32", + "padding": 0, + "canonical_bson": "2000000005766563746F72000E000000092700000080FF000000000000807F00", + "canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"JwAAAID/AAAAAAAAgH8=\", \"subType\": \"09\"}}}" + } + ], + "invalid": [ + { + "description": "Overflow Vector INT8", + "valid": false, + "vector": [256], + "dtype_hex": "0x03", + "dtype_alias": "INT8", + "padding": 0 + } + ] +} + diff --git a/test/test_bson_binary_vector.py b/test/test_bson_binary_vector.py new file mode 100644 index 0000000000..03700d862e --- /dev/null +++ b/test/test_bson_binary_vector.py @@ -0,0 +1,113 @@ +# Copyright 2024-present MongoDB, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import binascii +import codecs +import functools +import glob +import json +import os +import struct +import sys +from decimal import DecimalException +from pathlib import Path +from test import unittest + +from bson import decode, encode, json_util +from bson.binary import Binary, BinaryVectorDtype + +_TEST_PATH = Path(__file__).parent / "bson_binary_vector" + + +class TestBSONBinaryVector(unittest.TestCase): + """Runs Binary Vector subtype tests. + + Follows the style of the BSON corpus specification tests. + Tests are automatically generated on import + from json files in _TEST_PATH via `create_tests`. + The actual tests are defined in the inner function `run_test` + of the test generator `create_test`.""" + + +def create_test(case_spec): + """Create standard test given specification in json. + + We use the naming convention expected (exp) and observed (obj) + to differentiate what is in the json (expected or suffix _exp) + from what is produced by the API (observed or suffix _obs) + """ + test_key = case_spec.get("test_key") + + def run_test(self): + for test_case in case_spec.get("tests", []): + description = test_case["description"] + vector_exp = test_case["vector"] + dtype_hex_exp = test_case["dtype_hex"] + dtype_alias_exp = test_case.get("dtype_alias") + padding_exp = test_case.get("padding") + canonical_bson_exp = test_case["canonical_bson"] + canonical_extjson_exp = test_case["canonical_extjson"] + # Convert dtype hex string into bytes + dtype_exp = BinaryVectorDtype(int(dtype_hex_exp, 16).to_bytes(1, byteorder="little")) + + if test_case["valid"]: + # Convert bson string to bytes + cB_exp = binascii.unhexlify(canonical_bson_exp.encode("utf8")) + decoded_doc = decode(cB_exp) + binary_obs = decoded_doc[test_key] + # Handle special float cases like '-inf' + if dtype_exp in [BinaryVectorDtype.FLOAT32]: + vector_exp = [float(x) for x in vector_exp] + + # Test round-tripping canonical bson. + self.assertEqual(encode(decoded_doc), cB_exp, description) + + # Test BSON to Binary Vector + vector_obs = binary_obs.as_vector() + self.assertEqual(vector_obs.dtype, dtype_exp) + if dtype_alias_exp: + self.assertEqual(vector_obs.dtype, BinaryVectorDtype[dtype_alias_exp]) + self.assertEqual(vector_obs.data, vector_exp) + self.assertEqual(vector_obs.padding, padding_exp) + + # Test Binary Vector to BSON + vector_exp = Binary.from_vector(vector_exp, dtype_exp, padding_exp) + cB_obs = binascii.hexlify(encode({test_key: vector_exp})).decode().upper() + self.assertEqual(cB_obs, canonical_bson_exp) + + # Test JSON + self.assertEqual(json_util.loads(canonical_extjson_exp), decoded_doc) + self.assertEqual(json_util.dumps(decoded_doc), canonical_extjson_exp) + + else: + with self.assertRaises(struct.error): + Binary.from_vector(vector_exp, dtype_exp) + + return run_test + + +def create_tests(): + for filename in _TEST_PATH.glob("*.json"): + with codecs.open(filename, encoding="utf-8") as test_file: + test_method = create_test(json.load(test_file)) + setattr(TestBSONBinaryVector, "test_" + filename.stem, test_method) + + +create_tests() + + +if __name__ == "__main__": + unittest.main() From bf9758a08978051f4ff2ba0ddaac93bec7dca219 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Thu, 19 Sep 2024 12:30:57 -0400 Subject: [PATCH 19/46] Added test cases: padding, and overflow --- bson/binary.py | 2 + .../bson_binary_vector/vector-test-cases.json | 50 +++++++++++++++++++ test/test_bson_binary_vector.py | 18 ++++--- 3 files changed, 62 insertions(+), 8 deletions(-) diff --git a/bson/binary.py b/bson/binary.py index 780e271e28..0a27a5b541 100644 --- a/bson/binary.py +++ b/bson/binary.py @@ -421,10 +421,12 @@ def from_vector( """ if dtype == BinaryVectorDtype.INT8: # pack ints in [-128, 127] as signed int8 format_str = "b" + assert not padding, f"padding does not apply to {dtype=}" elif dtype == BinaryVectorDtype.PACKED_BIT: # pack ints in [0, 255] as unsigned uint8 format_str = "B" elif dtype == BinaryVectorDtype.FLOAT32: # pack floats as float32 format_str = "f" + assert not padding, f"padding does not apply to {dtype=}" else: raise NotImplementedError("%s not yet supported" % dtype) diff --git a/test/bson_binary_vector/vector-test-cases.json b/test/bson_binary_vector/vector-test-cases.json index 6c86b43300..f0326dd14e 100644 --- a/test/bson_binary_vector/vector-test-cases.json +++ b/test/bson_binary_vector/vector-test-cases.json @@ -71,6 +71,16 @@ "padding": 0, "canonical_bson": "2000000005766563746F72000E000000092700000080FF000000000000807F00", "canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"JwAAAID/AAAAAAAAgH8=\", \"subType\": \"09\"}}}" + }, + { + "description": "PACKED_BIT with padding", + "valid": true, + "vector": [127, 7], + "dtype_hex": "0x10", + "dtype_alias": "PACKED_BIT", + "padding": 3, + "canonical_bson": "1600000005766563746F7200040000000910037F0700", + "canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"EAN/Bw==\", \"subType\": \"09\"}}}" } ], "invalid": [ @@ -81,6 +91,46 @@ "dtype_hex": "0x03", "dtype_alias": "INT8", "padding": 0 + }, + { + "description": "Overflow Vector PACKED_BIT", + "valid": false, + "vector": [256], + "dtype_hex": "0x10", + "dtype_alias": "PACKED_BIT", + "padding": 0 + }, + { + "description": "Underflow Vector INT8", + "valid": false, + "vector": [-1], + "dtype_hex": "0x03", + "dtype_alias": "INT8", + "padding": 0 + }, + { + "description": "Underflow Vector PACKED_BIT", + "valid": false, + "vector": [-1], + "dtype_hex": "0x10", + "dtype_alias": "PACKED_BIT", + "padding": 0 + }, + { + "description": "INT8 with padding", + "valid": false, + "vector": [127, 7], + "dtype_hex": "0x03", + "dtype_alias": "INT8", + "padding": 3 + }, + { + "description": "FLOAT32 with padding", + "valid": false, + "vector": [127.0, 7.0], + "dtype_hex": "0x27", + "dtype_alias": "FLOAT32", + "padding": 3 } ] } diff --git a/test/test_bson_binary_vector.py b/test/test_bson_binary_vector.py index 03700d862e..b59e95642b 100644 --- a/test/test_bson_binary_vector.py +++ b/test/test_bson_binary_vector.py @@ -77,23 +77,25 @@ def run_test(self): # Test BSON to Binary Vector vector_obs = binary_obs.as_vector() - self.assertEqual(vector_obs.dtype, dtype_exp) + self.assertEqual(vector_obs.dtype, dtype_exp, description) if dtype_alias_exp: - self.assertEqual(vector_obs.dtype, BinaryVectorDtype[dtype_alias_exp]) - self.assertEqual(vector_obs.data, vector_exp) - self.assertEqual(vector_obs.padding, padding_exp) + self.assertEqual( + vector_obs.dtype, BinaryVectorDtype[dtype_alias_exp], description + ) + self.assertEqual(vector_obs.data, vector_exp, description) + self.assertEqual(vector_obs.padding, padding_exp, description) # Test Binary Vector to BSON vector_exp = Binary.from_vector(vector_exp, dtype_exp, padding_exp) cB_obs = binascii.hexlify(encode({test_key: vector_exp})).decode().upper() - self.assertEqual(cB_obs, canonical_bson_exp) + self.assertEqual(cB_obs, canonical_bson_exp, description) # Test JSON - self.assertEqual(json_util.loads(canonical_extjson_exp), decoded_doc) - self.assertEqual(json_util.dumps(decoded_doc), canonical_extjson_exp) + self.assertEqual(json_util.loads(canonical_extjson_exp), decoded_doc, description) + self.assertEqual(json_util.dumps(decoded_doc), canonical_extjson_exp, description) else: - with self.assertRaises(struct.error): + with self.assertRaises(struct.error, msg=description): Binary.from_vector(vector_exp, dtype_exp) return run_test From c4c7af7e77d13c67de13957c803dbc333b7af193 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Thu, 19 Sep 2024 14:15:14 -0400 Subject: [PATCH 20/46] Cast Path to str --- test/test_bson_binary_vector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_bson_binary_vector.py b/test/test_bson_binary_vector.py index b59e95642b..4302bc2829 100644 --- a/test/test_bson_binary_vector.py +++ b/test/test_bson_binary_vector.py @@ -103,7 +103,7 @@ def run_test(self): def create_tests(): for filename in _TEST_PATH.glob("*.json"): - with codecs.open(filename, encoding="utf-8") as test_file: + with codecs.open(str(filename), encoding="utf-8") as test_file: test_method = create_test(json.load(test_file)) setattr(TestBSONBinaryVector, "test_" + filename.stem, test_method) From de5a24532f9b33601ef3d29b562274df7690aa7b Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Thu, 19 Sep 2024 22:22:47 -0400 Subject: [PATCH 21/46] Simplified as_vector API --- bson/binary.py | 67 +++++++++++++++++++++----------------------------- 1 file changed, 28 insertions(+), 39 deletions(-) diff --git a/bson/binary.py b/bson/binary.py index 0a27a5b541..3f795157ee 100644 --- a/bson/binary.py +++ b/bson/binary.py @@ -434,62 +434,51 @@ def from_vector( data = struct.pack(f"{len(vector)}{format_str}", *vector) return cls(metadata + data, subtype=VECTOR_SUBTYPE) - def as_vector( - self, dtype: Optional[BinaryVectorDtype] = None, padding: Optional[int] = 0 - ) -> BinaryVector: - """Create a list of python objects. + def as_vector(self, uncompressed: Optional[bool] = False) -> BinaryVector: + """From the Binary, create a list of numbers, along with dtype and padding. - The binary representation was created with a specific dtype and padding. - The optional kwargs allow one to view data in other formats, - which is particularly useful when one wishes to see binary/bit vectors - as INT8, hence with their proper lengths. - :param dtype: Optional dtype to use instead of self.dtype - :param padding: Optional number of bytes to discard instead of self.padding - :return: List of numbers. + :param uncompressed: If true, return the true mathematical vector. + This is only necessary for datatypes where padding is applicable. + For example, setting this to True for a PACKED_BIT vector will result + in a List[int] of zeros and ones. + :return: List of numbers, along with dtype and padding. .. versionadded:: 4.9 """ position = 0 - orig_dtype, orig_padding = struct.unpack_from(" 0: - unpacked_data: list[int] = bits[:-padding] - else: - unpacked_data = bits - return BinaryVector(unpacked_data, dtype, padding) - - else: - dtype_format = "b" - format_string = f"{n_values}{dtype_format}" - unpacked_data = list(struct.unpack_from(format_string, self, position)) - return BinaryVector(unpacked_data, dtype, padding) + vector = bits[:-padding] if padding else bits + return BinaryVector(vector, dtype, padding) - elif dtype == BinaryVectorDtype.FLOAT32: - n_bytes = len(self) - position - n_values = n_bytes // 4 - assert n_bytes % 4 == 0 - unpacked_data = list(struct.unpack_from(f"{n_values}f", self, position)) - return BinaryVector(unpacked_data, dtype, padding) else: raise NotImplementedError("Binary Vector dtype %s not yet supported" % dtype.name) From 43bcce49779240ac307916827aa4920c707b0df8 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Fri, 20 Sep 2024 08:42:49 -0400 Subject: [PATCH 22/46] Added test case: list of floats with dtype int8 raises exception --- test/bson_binary_vector/vector-test-cases.json | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/bson_binary_vector/vector-test-cases.json b/test/bson_binary_vector/vector-test-cases.json index f0326dd14e..ffd322a9ab 100644 --- a/test/bson_binary_vector/vector-test-cases.json +++ b/test/bson_binary_vector/vector-test-cases.json @@ -131,6 +131,14 @@ "dtype_hex": "0x27", "dtype_alias": "FLOAT32", "padding": 3 + }, + { + "description": "INT8 with float inputs", + "valid": false, + "vector": [127.77, 7.77], + "dtype_hex": "0x27", + "dtype_alias": "INT8", + "padding": 0 } ] } From 41ee0bb0cd3d8a608b5f98d2f2238a155d92580a Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Fri, 20 Sep 2024 10:24:46 -0400 Subject: [PATCH 23/46] Set default padding to 0 in test runner --- test/test_bson_binary_vector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_bson_binary_vector.py b/test/test_bson_binary_vector.py index 4302bc2829..c0e8241804 100644 --- a/test/test_bson_binary_vector.py +++ b/test/test_bson_binary_vector.py @@ -57,7 +57,7 @@ def run_test(self): vector_exp = test_case["vector"] dtype_hex_exp = test_case["dtype_hex"] dtype_alias_exp = test_case.get("dtype_alias") - padding_exp = test_case.get("padding") + padding_exp = test_case.get("padding", 0) canonical_bson_exp = test_case["canonical_bson"] canonical_extjson_exp = test_case["canonical_extjson"] # Convert dtype hex string into bytes From 9d52aeb001c3ec4ef329f356ea6bcc20c24d7d12 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Fri, 20 Sep 2024 10:59:48 -0400 Subject: [PATCH 24/46] Updated test_bson for new as_vector API --- test/test_bson.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/test/test_bson.py b/test/test_bson.py index 33811468c0..072c5a1191 100644 --- a/test/test_bson.py +++ b/test/test_bson.py @@ -748,7 +748,7 @@ def test_vector(self): packed_bit_vec = packed_bit_binary.as_vector() assert packed_bit_vec.data == list_vector # If we wish to see the bit vector unpacked to its true length, we can - unpacked_vec = packed_bit_binary.as_vector(BinaryVectorDtype.INT8) + unpacked_vec = packed_bit_binary.as_vector(uncompressed=True) assert len(unpacked_vec.data) == 8 * len(list_vector) assert set(unpacked_vec.data) == {0, 1} @@ -758,9 +758,7 @@ def test_vector(self): padding = 3 padded_vec = Binary.from_vector(list_vector, BinaryVectorDtype.PACKED_BIT, padding=padding) assert padded_vec.as_vector().data == list_vector - assert ( - len(padded_vec.as_vector(BinaryVectorDtype.INT8).data) == 8 * len(list_vector) - padding - ) + assert len(padded_vec.as_vector(uncompressed=True).data) == 8 * len(list_vector) - padding # It is worthwhile explicitly showing the values encoded to BSON padded_doc = {"padded_vec": padded_vec} From 0d344641103d834c48ef7aba5e850cedef5f4c48 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Fri, 20 Sep 2024 11:55:56 -0400 Subject: [PATCH 25/46] Updated resync-specs.sh to include bson-binary-vector --- .evergreen/resync-specs.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.evergreen/resync-specs.sh b/.evergreen/resync-specs.sh index ac69449729..a3f4204069 100755 --- a/.evergreen/resync-specs.sh +++ b/.evergreen/resync-specs.sh @@ -76,6 +76,9 @@ do atlas-data-lake-testing|data_lake) cpjson atlas-data-lake-testing/tests/ data_lake ;; + bson-binary-vector|bson_binary_vector) + cpjson bson-binary-vector/tests/ bson_binary_vector/test + ;; bson-corpus|bson_corpus) cpjson bson-corpus/tests/ bson_corpus ;; From 1d496560b92fbd1bf9d7108d929ea486ab8d3ba1 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Fri, 20 Sep 2024 12:29:58 -0400 Subject: [PATCH 26/46] Updated resync-specs.sh and test cases --- .evergreen/resync-specs.sh | 2 +- test/bson_binary_vector/vector-test-cases.json | 10 ++++------ test/test_bson_binary_vector.py | 8 ++++---- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/.evergreen/resync-specs.sh b/.evergreen/resync-specs.sh index a3f4204069..dca116c2d3 100755 --- a/.evergreen/resync-specs.sh +++ b/.evergreen/resync-specs.sh @@ -77,7 +77,7 @@ do cpjson atlas-data-lake-testing/tests/ data_lake ;; bson-binary-vector|bson_binary_vector) - cpjson bson-binary-vector/tests/ bson_binary_vector/test + cpjson bson-binary-vector/tests/ bson_binary_vector ;; bson-corpus|bson_corpus) cpjson bson-corpus/tests/ bson_corpus diff --git a/test/bson_binary_vector/vector-test-cases.json b/test/bson_binary_vector/vector-test-cases.json index ffd322a9ab..32f3e441ba 100644 --- a/test/bson_binary_vector/vector-test-cases.json +++ b/test/bson_binary_vector/vector-test-cases.json @@ -81,13 +81,11 @@ "padding": 3, "canonical_bson": "1600000005766563746F7200040000000910037F0700", "canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"EAN/Bw==\", \"subType\": \"09\"}}}" - } - ], - "invalid": [ + }, { "description": "Overflow Vector INT8", "valid": false, - "vector": [256], + "vector": [128], "dtype_hex": "0x03", "dtype_alias": "INT8", "padding": 0 @@ -103,7 +101,7 @@ { "description": "Underflow Vector INT8", "valid": false, - "vector": [-1], + "vector": [-129], "dtype_hex": "0x03", "dtype_alias": "INT8", "padding": 0 @@ -136,7 +134,7 @@ "description": "INT8 with float inputs", "valid": false, "vector": [127.77, 7.77], - "dtype_hex": "0x27", + "dtype_hex": "0x03", "dtype_alias": "INT8", "padding": 0 } diff --git a/test/test_bson_binary_vector.py b/test/test_bson_binary_vector.py index c0e8241804..3a6d45e8d4 100644 --- a/test/test_bson_binary_vector.py +++ b/test/test_bson_binary_vector.py @@ -58,8 +58,8 @@ def run_test(self): dtype_hex_exp = test_case["dtype_hex"] dtype_alias_exp = test_case.get("dtype_alias") padding_exp = test_case.get("padding", 0) - canonical_bson_exp = test_case["canonical_bson"] - canonical_extjson_exp = test_case["canonical_extjson"] + canonical_bson_exp = test_case.get("canonical_bson") + canonical_extjson_exp = test_case.get("canonical_extjson") # Convert dtype hex string into bytes dtype_exp = BinaryVectorDtype(int(dtype_hex_exp, 16).to_bytes(1, byteorder="little")) @@ -95,8 +95,8 @@ def run_test(self): self.assertEqual(json_util.dumps(decoded_doc), canonical_extjson_exp, description) else: - with self.assertRaises(struct.error, msg=description): - Binary.from_vector(vector_exp, dtype_exp) + with self.assertRaises((struct.error, AssertionError), msg=description): + Binary.from_vector(vector_exp, dtype_exp, padding_exp) return run_test From 2af0ca4f4d158506b45fca19b2a4a6318b635007 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Fri, 20 Sep 2024 18:25:37 -0400 Subject: [PATCH 27/46] Broke tests into 3 files by dtype --- test/bson_binary_vector/float32.json | 45 ++++++ test/bson_binary_vector/int8.json | 59 ++++++++ test/bson_binary_vector/packed_bit.json | 53 +++++++ .../bson_binary_vector/vector-test-cases.json | 143 ------------------ 4 files changed, 157 insertions(+), 143 deletions(-) create mode 100644 test/bson_binary_vector/float32.json create mode 100644 test/bson_binary_vector/int8.json create mode 100644 test/bson_binary_vector/packed_bit.json delete mode 100644 test/bson_binary_vector/vector-test-cases.json diff --git a/test/bson_binary_vector/float32.json b/test/bson_binary_vector/float32.json new file mode 100644 index 0000000000..9ec72861d4 --- /dev/null +++ b/test/bson_binary_vector/float32.json @@ -0,0 +1,45 @@ +{ + "description": "Tests of Binary subtype 9, Vectors, with dtype FLOAT32", + "test_key": "vector", + "tests": [ + { + "description": "Simple Vector FLOAT32", + "valid": true, + "vector": [127.0, 7.0], + "dtype_hex": "0x27", + "dtype_alias": "FLOAT32", + "padding": 0, + "canonical_bson": "1C00000005766563746F72000A0000000927000000FE420000E04000", + "canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"JwAAAP5CAADgQA==\", \"subType\": \"09\"}}}" + }, + { + "description": "Empty Vector FLOAT32", + "valid": true, + "vector": [], + "dtype_hex": "0x27", + "dtype_alias": "FLOAT32", + "padding": 0, + "canonical_bson": "1400000005766563746F72000200000009270000", + "canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"JwA=\", \"subType\": \"09\"}}}" + }, + { + "description": "Infinity Vector FLOAT32", + "valid": true, + "vector": ["-inf", 0.0, "inf"], + "dtype_hex": "0x27", + "dtype_alias": "FLOAT32", + "padding": 0, + "canonical_bson": "2000000005766563746F72000E000000092700000080FF000000000000807F00", + "canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"JwAAAID/AAAAAAAAgH8=\", \"subType\": \"09\"}}}" + }, + { + "description": "FLOAT32 with padding", + "valid": false, + "vector": [127.0, 7.0], + "dtype_hex": "0x27", + "dtype_alias": "FLOAT32", + "padding": 3 + } + ] +} + diff --git a/test/bson_binary_vector/int8.json b/test/bson_binary_vector/int8.json new file mode 100644 index 0000000000..92eab609e8 --- /dev/null +++ b/test/bson_binary_vector/int8.json @@ -0,0 +1,59 @@ +{ + "description": "Tests of Binary subtype 9, Vectors, with dtype INT8", + "test_key": "vector", + "tests": [ + { + "description": "Simple Vector INT8", + "valid": true, + "vector": [127, 7], + "dtype_hex": "0x03", + "dtype_alias": "INT8", + "padding": 0, + "canonical_bson": "1600000005766563746F7200040000000903007F0700", + "canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"AwB/Bw==\", \"subType\": \"09\"}}}" + }, + { + "description": "Empty Vector INT8", + "valid": true, + "vector": [], + "dtype_hex": "0x03", + "dtype_alias": "INT8", + "padding": 0, + "canonical_bson": "1400000005766563746F72000200000009030000", + "canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"AwA=\", \"subType\": \"09\"}}}" + }, + { + "description": "Overflow Vector INT8", + "valid": false, + "vector": [128], + "dtype_hex": "0x03", + "dtype_alias": "INT8", + "padding": 0 + }, + { + "description": "Underflow Vector INT8", + "valid": false, + "vector": [-129], + "dtype_hex": "0x03", + "dtype_alias": "INT8", + "padding": 0 + }, + { + "description": "INT8 with padding", + "valid": false, + "vector": [127, 7], + "dtype_hex": "0x03", + "dtype_alias": "INT8", + "padding": 3 + }, + { + "description": "INT8 with float inputs", + "valid": false, + "vector": [127.77, 7.77], + "dtype_hex": "0x03", + "dtype_alias": "INT8", + "padding": 0 + } + ] +} + diff --git a/test/bson_binary_vector/packed_bit.json b/test/bson_binary_vector/packed_bit.json new file mode 100644 index 0000000000..de108876a9 --- /dev/null +++ b/test/bson_binary_vector/packed_bit.json @@ -0,0 +1,53 @@ +{ + "description": "Tests of Binary subtype 9, Vectors, with dtype PACKED_BIT", + "test_key": "vector", + "tests": [ + { + "description": "Simple Vector PACKED_BIT", + "valid": true, + "vector": [127, 7], + "dtype_hex": "0x10", + "dtype_alias": "PACKED_BIT", + "padding": 0, + "canonical_bson": "1600000005766563746F7200040000000910007F0700", + "canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"EAB/Bw==\", \"subType\": \"09\"}}}" + }, + { + "description": "Empty Vector PACKED_BIT", + "valid": true, + "vector": [], + "dtype_hex": "0x10", + "dtype_alias": "PACKED_BIT", + "padding": 0, + "canonical_bson": "1400000005766563746F72000200000009100000", + "canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"EAA=\", \"subType\": \"09\"}}}" + }, + { + "description": "PACKED_BIT with padding", + "valid": true, + "vector": [127, 7], + "dtype_hex": "0x10", + "dtype_alias": "PACKED_BIT", + "padding": 3, + "canonical_bson": "1600000005766563746F7200040000000910037F0700", + "canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"EAN/Bw==\", \"subType\": \"09\"}}}" + }, + { + "description": "Overflow Vector PACKED_BIT", + "valid": false, + "vector": [256], + "dtype_hex": "0x10", + "dtype_alias": "PACKED_BIT", + "padding": 0 + }, + { + "description": "Underflow Vector PACKED_BIT", + "valid": false, + "vector": [-1], + "dtype_hex": "0x10", + "dtype_alias": "PACKED_BIT", + "padding": 0 + } + ] +} + diff --git a/test/bson_binary_vector/vector-test-cases.json b/test/bson_binary_vector/vector-test-cases.json deleted file mode 100644 index 32f3e441ba..0000000000 --- a/test/bson_binary_vector/vector-test-cases.json +++ /dev/null @@ -1,143 +0,0 @@ -{ - "description": "Basic Tests of Binary Vectors, subtype 9", - "test_key": "vector", - "tests": [ - { - "description": "Simple Vector INT8", - "valid": true, - "vector": [127, 7], - "dtype_hex": "0x03", - "dtype_alias": "INT8", - "padding": 0, - "canonical_bson": "1600000005766563746F7200040000000903007F0700", - "canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"AwB/Bw==\", \"subType\": \"09\"}}}" - }, - { - "description": "Simple Vector FLOAT32", - "valid": true, - "vector": [127.0, 7.0], - "dtype_hex": "0x27", - "dtype_alias": "FLOAT32", - "padding": 0, - "canonical_bson": "1C00000005766563746F72000A0000000927000000FE420000E04000", - "canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"JwAAAP5CAADgQA==\", \"subType\": \"09\"}}}" - }, - { - "description": "Simple Vector PACKED_BIT", - "valid": true, - "vector": [127, 7], - "dtype_hex": "0x10", - "dtype_alias": "PACKED_BIT", - "padding": 0, - "canonical_bson": "1600000005766563746F7200040000000910007F0700", - "canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"EAB/Bw==\", \"subType\": \"09\"}}}" - }, - { - "description": "Empty Vector INT8", - "valid": true, - "vector": [], - "dtype_hex": "0x03", - "dtype_alias": "INT8", - "padding": 0, - "canonical_bson": "1400000005766563746F72000200000009030000", - "canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"AwA=\", \"subType\": \"09\"}}}" - }, - { - "description": "Empty Vector FLOAT32", - "valid": true, - "vector": [], - "dtype_hex": "0x27", - "dtype_alias": "FLOAT32", - "padding": 0, - "canonical_bson": "1400000005766563746F72000200000009270000", - "canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"JwA=\", \"subType\": \"09\"}}}" - }, - { - "description": "Empty Vector PACKED_BIT", - "valid": true, - "vector": [], - "dtype_hex": "0x10", - "dtype_alias": "PACKED_BIT", - "padding": 0, - "canonical_bson": "1400000005766563746F72000200000009100000", - "canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"EAA=\", \"subType\": \"09\"}}}" - }, - { - "description": "Infinity Vector FLOAT32", - "valid": true, - "vector": ["-inf", 0.0, "inf"], - "dtype_hex": "0x27", - "dtype_alias": "FLOAT32", - "padding": 0, - "canonical_bson": "2000000005766563746F72000E000000092700000080FF000000000000807F00", - "canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"JwAAAID/AAAAAAAAgH8=\", \"subType\": \"09\"}}}" - }, - { - "description": "PACKED_BIT with padding", - "valid": true, - "vector": [127, 7], - "dtype_hex": "0x10", - "dtype_alias": "PACKED_BIT", - "padding": 3, - "canonical_bson": "1600000005766563746F7200040000000910037F0700", - "canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"EAN/Bw==\", \"subType\": \"09\"}}}" - }, - { - "description": "Overflow Vector INT8", - "valid": false, - "vector": [128], - "dtype_hex": "0x03", - "dtype_alias": "INT8", - "padding": 0 - }, - { - "description": "Overflow Vector PACKED_BIT", - "valid": false, - "vector": [256], - "dtype_hex": "0x10", - "dtype_alias": "PACKED_BIT", - "padding": 0 - }, - { - "description": "Underflow Vector INT8", - "valid": false, - "vector": [-129], - "dtype_hex": "0x03", - "dtype_alias": "INT8", - "padding": 0 - }, - { - "description": "Underflow Vector PACKED_BIT", - "valid": false, - "vector": [-1], - "dtype_hex": "0x10", - "dtype_alias": "PACKED_BIT", - "padding": 0 - }, - { - "description": "INT8 with padding", - "valid": false, - "vector": [127, 7], - "dtype_hex": "0x03", - "dtype_alias": "INT8", - "padding": 3 - }, - { - "description": "FLOAT32 with padding", - "valid": false, - "vector": [127.0, 7.0], - "dtype_hex": "0x27", - "dtype_alias": "FLOAT32", - "padding": 3 - }, - { - "description": "INT8 with float inputs", - "valid": false, - "vector": [127.77, 7.77], - "dtype_hex": "0x03", - "dtype_alias": "INT8", - "padding": 0 - } - ] -} - From c93bae19089567a2cd61b640fbf7da235112e9c3 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Fri, 27 Sep 2024 08:30:55 -0400 Subject: [PATCH 28/46] Update bson/binary.py Fix typo. Co-authored-by: Steven Silvester --- bson/binary.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bson/binary.py b/bson/binary.py index 3f795157ee..ddd323069d 100644 --- a/bson/binary.py +++ b/bson/binary.py @@ -263,7 +263,7 @@ class Binary(bytes): The second (padding) prescribes the number of bits to ignore in the final byte. This is relevant when the element size of the dtype is not a multiple of 8. - Raises TypeError if subtype` is not an instance of :class:`int`. + Raises TypeError if `subtype` is not an instance of :class:`int`. Raises ValueError if `subtype` is not in [0, 256). .. note:: From f374b5aa6405650ee40d1958c13d71ef7ddf5a81 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Fri, 27 Sep 2024 09:14:46 -0400 Subject: [PATCH 29/46] Removed json from test_bson_binary_vector and its jsons --- test/bson_binary_vector/float32.json | 9 +++------ test/bson_binary_vector/int8.json | 6 ++---- test/bson_binary_vector/packed_bit.json | 9 +++------ test/test_bson_binary_vector.py | 12 +----------- 4 files changed, 9 insertions(+), 27 deletions(-) diff --git a/test/bson_binary_vector/float32.json b/test/bson_binary_vector/float32.json index 9ec72861d4..bbbe00b758 100644 --- a/test/bson_binary_vector/float32.json +++ b/test/bson_binary_vector/float32.json @@ -9,8 +9,7 @@ "dtype_hex": "0x27", "dtype_alias": "FLOAT32", "padding": 0, - "canonical_bson": "1C00000005766563746F72000A0000000927000000FE420000E04000", - "canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"JwAAAP5CAADgQA==\", \"subType\": \"09\"}}}" + "canonical_bson": "1C00000005766563746F72000A0000000927000000FE420000E04000" }, { "description": "Empty Vector FLOAT32", @@ -19,8 +18,7 @@ "dtype_hex": "0x27", "dtype_alias": "FLOAT32", "padding": 0, - "canonical_bson": "1400000005766563746F72000200000009270000", - "canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"JwA=\", \"subType\": \"09\"}}}" + "canonical_bson": "1400000005766563746F72000200000009270000" }, { "description": "Infinity Vector FLOAT32", @@ -29,8 +27,7 @@ "dtype_hex": "0x27", "dtype_alias": "FLOAT32", "padding": 0, - "canonical_bson": "2000000005766563746F72000E000000092700000080FF000000000000807F00", - "canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"JwAAAID/AAAAAAAAgH8=\", \"subType\": \"09\"}}}" + "canonical_bson": "2000000005766563746F72000E000000092700000080FF000000000000807F00" }, { "description": "FLOAT32 with padding", diff --git a/test/bson_binary_vector/int8.json b/test/bson_binary_vector/int8.json index 92eab609e8..7529721e5e 100644 --- a/test/bson_binary_vector/int8.json +++ b/test/bson_binary_vector/int8.json @@ -9,8 +9,7 @@ "dtype_hex": "0x03", "dtype_alias": "INT8", "padding": 0, - "canonical_bson": "1600000005766563746F7200040000000903007F0700", - "canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"AwB/Bw==\", \"subType\": \"09\"}}}" + "canonical_bson": "1600000005766563746F7200040000000903007F0700" }, { "description": "Empty Vector INT8", @@ -19,8 +18,7 @@ "dtype_hex": "0x03", "dtype_alias": "INT8", "padding": 0, - "canonical_bson": "1400000005766563746F72000200000009030000", - "canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"AwA=\", \"subType\": \"09\"}}}" + "canonical_bson": "1400000005766563746F72000200000009030000" }, { "description": "Overflow Vector INT8", diff --git a/test/bson_binary_vector/packed_bit.json b/test/bson_binary_vector/packed_bit.json index de108876a9..a41cd593f5 100644 --- a/test/bson_binary_vector/packed_bit.json +++ b/test/bson_binary_vector/packed_bit.json @@ -9,8 +9,7 @@ "dtype_hex": "0x10", "dtype_alias": "PACKED_BIT", "padding": 0, - "canonical_bson": "1600000005766563746F7200040000000910007F0700", - "canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"EAB/Bw==\", \"subType\": \"09\"}}}" + "canonical_bson": "1600000005766563746F7200040000000910007F0700" }, { "description": "Empty Vector PACKED_BIT", @@ -19,8 +18,7 @@ "dtype_hex": "0x10", "dtype_alias": "PACKED_BIT", "padding": 0, - "canonical_bson": "1400000005766563746F72000200000009100000", - "canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"EAA=\", \"subType\": \"09\"}}}" + "canonical_bson": "1400000005766563746F72000200000009100000" }, { "description": "PACKED_BIT with padding", @@ -29,8 +27,7 @@ "dtype_hex": "0x10", "dtype_alias": "PACKED_BIT", "padding": 3, - "canonical_bson": "1600000005766563746F7200040000000910037F0700", - "canonical_extjson": "{\"vector\": {\"$binary\": {\"base64\": \"EAN/Bw==\", \"subType\": \"09\"}}}" + "canonical_bson": "1600000005766563746F7200040000000910037F0700" }, { "description": "Overflow Vector PACKED_BIT", diff --git a/test/test_bson_binary_vector.py b/test/test_bson_binary_vector.py index 3a6d45e8d4..0aec32199c 100644 --- a/test/test_bson_binary_vector.py +++ b/test/test_bson_binary_vector.py @@ -16,17 +16,12 @@ import binascii import codecs -import functools -import glob import json -import os import struct -import sys -from decimal import DecimalException from pathlib import Path from test import unittest -from bson import decode, encode, json_util +from bson import decode, encode from bson.binary import Binary, BinaryVectorDtype _TEST_PATH = Path(__file__).parent / "bson_binary_vector" @@ -59,7 +54,6 @@ def run_test(self): dtype_alias_exp = test_case.get("dtype_alias") padding_exp = test_case.get("padding", 0) canonical_bson_exp = test_case.get("canonical_bson") - canonical_extjson_exp = test_case.get("canonical_extjson") # Convert dtype hex string into bytes dtype_exp = BinaryVectorDtype(int(dtype_hex_exp, 16).to_bytes(1, byteorder="little")) @@ -90,10 +84,6 @@ def run_test(self): cB_obs = binascii.hexlify(encode({test_key: vector_exp})).decode().upper() self.assertEqual(cB_obs, canonical_bson_exp, description) - # Test JSON - self.assertEqual(json_util.loads(canonical_extjson_exp), decoded_doc, description) - self.assertEqual(json_util.dumps(decoded_doc), canonical_extjson_exp, description) - else: with self.assertRaises((struct.error, AssertionError), msg=description): Binary.from_vector(vector_exp, dtype_exp, padding_exp) From 0db98667ed273a05db1b104142967d3210e1038b Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Mon, 30 Sep 2024 17:49:52 -0400 Subject: [PATCH 30/46] Addition of Provision (BETA) specifiers change references to 4.10 --- bson/binary.py | 18 +++++++++--------- doc/changelog.rst | 5 +++++ 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/bson/binary.py b/bson/binary.py index ddd323069d..b7902e7dc2 100644 --- a/bson/binary.py +++ b/bson/binary.py @@ -195,9 +195,9 @@ class UuidRepresentation: VECTOR_SUBTYPE = 9 -"""BSON binary subtype for densely packed vector data. +"""**(BETA)** BSON binary subtype for densely packed vector data. -.. versionadded:: 4.9 +.. versionadded:: 4.10 """ @@ -207,7 +207,7 @@ class UuidRepresentation: class BinaryVectorDtype(Enum): - """Datatypes of vector subtype. + """**(BETA)** Datatypes of vector subtype. :param FLOAT32: (0x27) Pack list of :class:`float` as float32 :param INT8: (0x03) Pack list of :class:`int` in [-128, 127] as signed int8 @@ -233,7 +233,7 @@ class BinaryVectorDtype(Enum): @dataclass class BinaryVector: - """Vector of numbers along with metadata for binary interoperability. + """**(BETA)** Vector of numbers along with metadata for binary interoperability. :param data: Sequence of numbers representing the mathematical vector. :param dtype: The data type stored in binary @@ -257,7 +257,7 @@ class Binary(bytes): the difference between what should be considered binary data and what should be considered a string when we encode to BSON. - Subtype 9 provides a space-efficient representation of 1-dimensional vector data. + **(BETA)** Subtype 9 provides a space-efficient representation of 1-dimensional vector data. Its data is prepended with two bytes of metadata. The first (dtype) describes its data type, such as float32 or int8. The second (padding) prescribes the number of bits to ignore in the final byte. @@ -278,8 +278,8 @@ class Binary(bytes): .. versionchanged:: 3.9 Support any bytes-like type that implements the buffer protocol. - .. versionchanged:: 4.9 - Addition of vector subtype. + .. versionchanged:: 4.10 + **(BETA)** Addition of vector subtype. """ _type_marker = 5 @@ -405,7 +405,7 @@ def from_vector( dtype: BinaryVectorDtype, padding: Optional[int] = 0, ) -> Binary: - """Create a BSON :class:`~bson.binary.Binary` of Vector subtype from a list of Numbers. + """**(BETA)** Create a BSON :class:`~bson.binary.Binary` of Vector subtype from a list of Numbers. To interpret the representation of the numbers, a data type must be included. See :class:`~bson.binary.BinaryVectorDtype` for available types and descriptions. @@ -435,7 +435,7 @@ def from_vector( return cls(metadata + data, subtype=VECTOR_SUBTYPE) def as_vector(self, uncompressed: Optional[bool] = False) -> BinaryVector: - """From the Binary, create a list of numbers, along with dtype and padding. + """**(BETA)** From the Binary, create a list of numbers, along with dtype and padding. :param uncompressed: If true, return the true mathematical vector. diff --git a/doc/changelog.rst b/doc/changelog.rst index dfb3c79827..a2191994cc 100644 --- a/doc/changelog.rst +++ b/doc/changelog.rst @@ -1,6 +1,11 @@ Changelog ========= +Changes in Version 4.10.0 +------------------------- + +- Provisional **(BETA)** support for a new Binary BSON subtype used for efficient storage and retrieval of vectors: densely packed arrays of numbers, all of the same type. + Changes in Version 4.9.0 ------------------------- From 0532803b2a8835616ce35ff0a290a6d5ae7806bc Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Mon, 30 Sep 2024 19:26:57 -0400 Subject: [PATCH 31/46] Add references to from_vector() and as_vector() --- doc/changelog.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/changelog.rst b/doc/changelog.rst index a2191994cc..f3a73aac0f 100644 --- a/doc/changelog.rst +++ b/doc/changelog.rst @@ -4,7 +4,9 @@ Changelog Changes in Version 4.10.0 ------------------------- -- Provisional **(BETA)** support for a new Binary BSON subtype used for efficient storage and retrieval of vectors: densely packed arrays of numbers, all of the same type. +- Provisional **(BETA)** support for a new Binary BSON subtype used for efficient storage and retrieval of vectors: + densely packed arrays of numbers, all of the same type. + This includes new methods :meth:`~bson.binary.Binary.from_vector` and :meth:`~bson.binary.Binary.as_vector`. Changes in Version 4.9.0 ------------------------- From 3edeef61e6bec211a34ae234ae79f1c2f73ff60b Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Mon, 30 Sep 2024 19:30:01 -0400 Subject: [PATCH 32/46] Add subtype number in changelog --- doc/changelog.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/changelog.rst b/doc/changelog.rst index f3a73aac0f..8cf92136a4 100644 --- a/doc/changelog.rst +++ b/doc/changelog.rst @@ -4,7 +4,7 @@ Changelog Changes in Version 4.10.0 ------------------------- -- Provisional **(BETA)** support for a new Binary BSON subtype used for efficient storage and retrieval of vectors: +- Provisional **(BETA)** support for a new Binary BSON subtype (9) used for efficient storage and retrieval of vectors: densely packed arrays of numbers, all of the same type. This includes new methods :meth:`~bson.binary.Binary.from_vector` and :meth:`~bson.binary.Binary.as_vector`. From d199597a574baf805536793bf33c17850e405286 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Mon, 30 Sep 2024 19:42:56 -0400 Subject: [PATCH 33/46] Raise ValueErrors not AssertionErrors. Bumped from 4.9 to 4.10 --- bson/binary.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/bson/binary.py b/bson/binary.py index b7902e7dc2..a5550206d4 100644 --- a/bson/binary.py +++ b/bson/binary.py @@ -219,7 +219,7 @@ class BinaryVectorDtype(Enum): Each value is of type bytes with a length of one. - .. versionadded:: 4.9 + .. versionadded:: 4.10 """ INT8 = b"\x03" @@ -241,7 +241,7 @@ class BinaryVector: when a vector element's size is less than a byte and the length of the vector is not a multiple of 8. - .. versionadded:: 4.9 + .. versionadded:: 4.10 """ data: Sequence[float | int] @@ -417,16 +417,18 @@ def from_vector( :param padding: For fractional bytes, number of bits to ignore at end of vector. :return: Binary packed data identified by dtype and padding. - .. versionadded:: 4.9 + .. versionadded:: 4.10 """ if dtype == BinaryVectorDtype.INT8: # pack ints in [-128, 127] as signed int8 format_str = "b" - assert not padding, f"padding does not apply to {dtype=}" + if padding: + raise ValueError(f"padding does not apply to {dtype=}") elif dtype == BinaryVectorDtype.PACKED_BIT: # pack ints in [0, 255] as unsigned uint8 format_str = "B" elif dtype == BinaryVectorDtype.FLOAT32: # pack floats as float32 format_str = "f" - assert not padding, f"padding does not apply to {dtype=}" + if padding: + raise ValueError(f"padding does not apply to {dtype=}") else: raise NotImplementedError("%s not yet supported" % dtype) @@ -444,7 +446,7 @@ def as_vector(self, uncompressed: Optional[bool] = False) -> BinaryVector: in a List[int] of zeros and ones. :return: List of numbers, along with dtype and padding. - .. versionadded:: 4.9 + .. versionadded:: 4.10 """ position = 0 @@ -462,7 +464,10 @@ def as_vector(self, uncompressed: Optional[bool] = False) -> BinaryVector: elif dtype == BinaryVectorDtype.FLOAT32: n_bytes = len(self) - position n_values = n_bytes // 4 - assert n_bytes % 4 == 0 + if n_bytes % 4: + raise ValueError( + "Corrupt data. N bytes for a float32 vector must be a multiple of 4." + ) vector = list(struct.unpack_from(f"{n_values}f", self, position)) return BinaryVector(vector, dtype, padding) From abc7cd32db6e1840cd8d6da135fc2d4c4b8a1d11 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Mon, 30 Sep 2024 19:50:52 -0400 Subject: [PATCH 34/46] Docstring for as_vector --- bson/binary.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bson/binary.py b/bson/binary.py index a5550206d4..50634ca89f 100644 --- a/bson/binary.py +++ b/bson/binary.py @@ -444,7 +444,7 @@ def as_vector(self, uncompressed: Optional[bool] = False) -> BinaryVector: This is only necessary for datatypes where padding is applicable. For example, setting this to True for a PACKED_BIT vector will result in a List[int] of zeros and ones. - :return: List of numbers, along with dtype and padding. + :return: BinaryVector - a list of numbers, along with dtype and padding. .. versionadded:: 4.10 """ From 4550c2068774d80e567653ee7a789cac1dc40709 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Mon, 30 Sep 2024 19:56:32 -0400 Subject: [PATCH 35/46] Add slots for BinaryVector --- bson/binary.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bson/binary.py b/bson/binary.py index 50634ca89f..0b5a7323f0 100644 --- a/bson/binary.py +++ b/bson/binary.py @@ -231,7 +231,7 @@ class BinaryVectorDtype(Enum): DTYPE_FROM_HEX = {key.value: key for key in BinaryVectorDtype} -@dataclass +@dataclass(slots=True) class BinaryVector: """**(BETA)** Vector of numbers along with metadata for binary interoperability. From 99d44e1a9b1c1bb25dc877db6744c0038b8fd872 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Mon, 30 Sep 2024 20:01:09 -0400 Subject: [PATCH 36/46] Check subtype before decoding --- bson/binary.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bson/binary.py b/bson/binary.py index 0b5a7323f0..1609fa7a1a 100644 --- a/bson/binary.py +++ b/bson/binary.py @@ -449,6 +449,9 @@ def as_vector(self, uncompressed: Optional[bool] = False) -> BinaryVector: .. versionadded:: 4.10 """ + if self.subtype != VECTOR_SUBTYPE: + raise TypeError("Binary object does not have vector subtype 9.") + position = 0 dtype, padding = struct.unpack_from(" Date: Mon, 30 Sep 2024 20:43:27 -0400 Subject: [PATCH 37/46] Try slots with default padding --- bson/binary.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/bson/binary.py b/bson/binary.py index 1609fa7a1a..7d310b17a4 100644 --- a/bson/binary.py +++ b/bson/binary.py @@ -16,7 +16,7 @@ import struct from dataclasses import dataclass from enum import Enum -from typing import TYPE_CHECKING, Any, Optional, Sequence, Tuple, Type, Union +from typing import TYPE_CHECKING, Any, Sequence, Tuple, Type, Union from uuid import UUID """Tools for representing BSON binary data. @@ -244,9 +244,10 @@ class BinaryVector: .. versionadded:: 4.10 """ + __slots__ = ("data", "dtype", "padding") data: Sequence[float | int] dtype: BinaryVectorDtype - padding: Optional[int] = 0 + padding: int = 0 class Binary(bytes): @@ -403,7 +404,7 @@ def from_vector( cls: Type[Binary], vector: list[int, float], dtype: BinaryVectorDtype, - padding: Optional[int] = 0, + padding: int = 0, ) -> Binary: """**(BETA)** Create a BSON :class:`~bson.binary.Binary` of Vector subtype from a list of Numbers. @@ -436,7 +437,7 @@ def from_vector( data = struct.pack(f"{len(vector)}{format_str}", *vector) return cls(metadata + data, subtype=VECTOR_SUBTYPE) - def as_vector(self, uncompressed: Optional[bool] = False) -> BinaryVector: + def as_vector(self, uncompressed: bool = False) -> BinaryVector: """**(BETA)** From the Binary, create a list of numbers, along with dtype and padding. From 637c47496e1cd9d32ceb0b3168b74e6d5e79b038 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Mon, 30 Sep 2024 20:47:38 -0400 Subject: [PATCH 38/46] Removed slots arg --- bson/binary.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bson/binary.py b/bson/binary.py index 7d310b17a4..c25d6bd136 100644 --- a/bson/binary.py +++ b/bson/binary.py @@ -231,7 +231,7 @@ class BinaryVectorDtype(Enum): DTYPE_FROM_HEX = {key.value: key for key in BinaryVectorDtype} -@dataclass(slots=True) +@dataclass class BinaryVector: """**(BETA)** Vector of numbers along with metadata for binary interoperability. From 2d511f60608f40db245e9d5e12857d3315553e81 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Mon, 30 Sep 2024 20:55:43 -0400 Subject: [PATCH 39/46] Update dataclass --- bson/binary.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/bson/binary.py b/bson/binary.py index c25d6bd136..b20db77a55 100644 --- a/bson/binary.py +++ b/bson/binary.py @@ -234,20 +234,22 @@ class BinaryVectorDtype(Enum): @dataclass class BinaryVector: """**(BETA)** Vector of numbers along with metadata for binary interoperability. - - :param data: Sequence of numbers representing the mathematical vector. - :param dtype: The data type stored in binary - :param padding: The number of bits in the final byte that are to be ignored - when a vector element's size is less than a byte - and the length of the vector is not a multiple of 8. - .. versionadded:: 4.10 """ __slots__ = ("data", "dtype", "padding") - data: Sequence[float | int] - dtype: BinaryVectorDtype - padding: int = 0 + + def __init__(self, data: Sequence[float | int], dtype: BinaryVectorDtype, padding: int = 0): + """ + :param data: Sequence of numbers representing the mathematical vector. + :param dtype: The data type stored in binary + :param padding: The number of bits in the final byte that are to be ignored + when a vector element's size is less than a byte + and the length of the vector is not a multiple of 8. + """ + self.data = data + self.dtype = dtype + self.padding = padding class Binary(bytes): From 17e1d331854678a42c085ee91bf95be089b2ca0b Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Mon, 30 Sep 2024 20:59:03 -0400 Subject: [PATCH 40/46] Remove unompressed kwarg from as_vector --- bson/binary.py | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/bson/binary.py b/bson/binary.py index b20db77a55..31ae57fec6 100644 --- a/bson/binary.py +++ b/bson/binary.py @@ -439,15 +439,10 @@ def from_vector( data = struct.pack(f"{len(vector)}{format_str}", *vector) return cls(metadata + data, subtype=VECTOR_SUBTYPE) - def as_vector(self, uncompressed: bool = False) -> BinaryVector: + def as_vector(self) -> BinaryVector: """**(BETA)** From the Binary, create a list of numbers, along with dtype and padding. - - :param uncompressed: If true, return the true mathematical vector. - This is only necessary for datatypes where padding is applicable. - For example, setting this to True for a PACKED_BIT vector will result - in a List[int] of zeros and ones. - :return: BinaryVector - a list of numbers, along with dtype and padding. + :return: BinaryVector .. versionadded:: 4.10 """ @@ -481,14 +476,7 @@ def as_vector(self, uncompressed: bool = False) -> BinaryVector: # data packed as uint8 dtype_format = "B" unpacked_uint8s = list(struct.unpack_from(f"{n_values}{dtype_format}", self, position)) - if not uncompressed: - return BinaryVector(unpacked_uint8s, dtype, padding) - else: - bits = [] - for uint8 in unpacked_uint8s: - bits.extend([int(bit) for bit in f"{uint8:08b}"]) - vector = bits[:-padding] if padding else bits - return BinaryVector(vector, dtype, padding) + return BinaryVector(unpacked_uint8s, dtype, padding) else: raise NotImplementedError("Binary Vector dtype %s not yet supported" % dtype.name) From ce5f3e3fd658767d7d723c566d30609141a92549 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Mon, 30 Sep 2024 21:03:58 -0400 Subject: [PATCH 41/46] Changed TypeError to ValueError --- bson/binary.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bson/binary.py b/bson/binary.py index 31ae57fec6..0860fe14a0 100644 --- a/bson/binary.py +++ b/bson/binary.py @@ -448,7 +448,7 @@ def as_vector(self) -> BinaryVector: """ if self.subtype != VECTOR_SUBTYPE: - raise TypeError("Binary object does not have vector subtype 9.") + raise ValueError(f"Cannot decode subtype {self.subtype} as a vector.") position = 0 dtype, padding = struct.unpack_from(" Date: Mon, 30 Sep 2024 21:25:31 -0400 Subject: [PATCH 42/46] Updates after removing uncompressed --- test/test_bson.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/test_bson.py b/test/test_bson.py index 072c5a1191..96aa897d19 100644 --- a/test/test_bson.py +++ b/test/test_bson.py @@ -747,10 +747,6 @@ def test_vector(self): packed_bit_binary = Binary.from_vector(list_vector, BinaryVectorDtype.PACKED_BIT) packed_bit_vec = packed_bit_binary.as_vector() assert packed_bit_vec.data == list_vector - # If we wish to see the bit vector unpacked to its true length, we can - unpacked_vec = packed_bit_binary.as_vector(uncompressed=True) - assert len(unpacked_vec.data) == 8 * len(list_vector) - assert set(unpacked_vec.data) == {0, 1} # A padding parameter permits vectors of length that aren't divisible by 8 # The following ignores the last 3 bits in list_vector, @@ -758,7 +754,11 @@ def test_vector(self): padding = 3 padded_vec = Binary.from_vector(list_vector, BinaryVectorDtype.PACKED_BIT, padding=padding) assert padded_vec.as_vector().data == list_vector - assert len(padded_vec.as_vector(uncompressed=True).data) == 8 * len(list_vector) - padding + # To visualize how this looks as a binary vector.. + uncompressed = "" + for val in list_vector: + uncompressed += format(val, "08b") + assert uncompressed[:-padding] == "0111111100000" # It is worthwhile explicitly showing the values encoded to BSON padded_doc = {"padded_vec": padded_vec} From 8aaa2f677319845f2d262f9112a4533270aa55ca Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Mon, 30 Sep 2024 21:39:51 -0400 Subject: [PATCH 43/46] Fixed expected exceptions in invalid test cases --- test/test_bson_binary_vector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_bson_binary_vector.py b/test/test_bson_binary_vector.py index 0aec32199c..00c82bbb65 100644 --- a/test/test_bson_binary_vector.py +++ b/test/test_bson_binary_vector.py @@ -85,7 +85,7 @@ def run_test(self): self.assertEqual(cB_obs, canonical_bson_exp, description) else: - with self.assertRaises((struct.error, AssertionError), msg=description): + with self.assertRaises((struct.error, ValueError), msg=description): Binary.from_vector(vector_exp, dtype_exp, padding_exp) return run_test From 8946daf3cfddf611ccaccbc130de6d421aee6b49 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Mon, 30 Sep 2024 21:46:59 -0400 Subject: [PATCH 44/46] padding in now Optional[int] = None --- bson/binary.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bson/binary.py b/bson/binary.py index 0860fe14a0..3582e70a7a 100644 --- a/bson/binary.py +++ b/bson/binary.py @@ -16,7 +16,7 @@ import struct from dataclasses import dataclass from enum import Enum -from typing import TYPE_CHECKING, Any, Sequence, Tuple, Type, Union +from typing import TYPE_CHECKING, Any, Optional, Sequence, Tuple, Type, Union from uuid import UUID """Tools for representing BSON binary data. @@ -406,7 +406,7 @@ def from_vector( cls: Type[Binary], vector: list[int, float], dtype: BinaryVectorDtype, - padding: int = 0, + padding: Optional[int] = None, ) -> Binary: """**(BETA)** Create a BSON :class:`~bson.binary.Binary` of Vector subtype from a list of Numbers. @@ -428,6 +428,7 @@ def from_vector( raise ValueError(f"padding does not apply to {dtype=}") elif dtype == BinaryVectorDtype.PACKED_BIT: # pack ints in [0, 255] as unsigned uint8 format_str = "B" + padding = 0 if padding is None else padding elif dtype == BinaryVectorDtype.FLOAT32: # pack floats as float32 format_str = "f" if padding: From 9397129440a849d9f91036747a1ff8ac2f45e679 Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Mon, 30 Sep 2024 21:52:57 -0400 Subject: [PATCH 45/46] padding does need to be an integer --- bson/binary.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/bson/binary.py b/bson/binary.py index 3582e70a7a..0860fe14a0 100644 --- a/bson/binary.py +++ b/bson/binary.py @@ -16,7 +16,7 @@ import struct from dataclasses import dataclass from enum import Enum -from typing import TYPE_CHECKING, Any, Optional, Sequence, Tuple, Type, Union +from typing import TYPE_CHECKING, Any, Sequence, Tuple, Type, Union from uuid import UUID """Tools for representing BSON binary data. @@ -406,7 +406,7 @@ def from_vector( cls: Type[Binary], vector: list[int, float], dtype: BinaryVectorDtype, - padding: Optional[int] = None, + padding: int = 0, ) -> Binary: """**(BETA)** Create a BSON :class:`~bson.binary.Binary` of Vector subtype from a list of Numbers. @@ -428,7 +428,6 @@ def from_vector( raise ValueError(f"padding does not apply to {dtype=}") elif dtype == BinaryVectorDtype.PACKED_BIT: # pack ints in [0, 255] as unsigned uint8 format_str = "B" - padding = 0 if padding is None else padding elif dtype == BinaryVectorDtype.FLOAT32: # pack floats as float32 format_str = "f" if padding: From 913403bf8827bd56cff70709eb7261da029c6aaf Mon Sep 17 00:00:00 2001 From: Casey Clements Date: Mon, 30 Sep 2024 22:03:20 -0400 Subject: [PATCH 46/46] Removed unneeded ugly TYPE_FROM_HEX = {key.value: key for key in BinaryVectorDtype} --- bson/binary.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/bson/binary.py b/bson/binary.py index 0860fe14a0..47c52d4892 100644 --- a/bson/binary.py +++ b/bson/binary.py @@ -227,10 +227,6 @@ class BinaryVectorDtype(Enum): PACKED_BIT = b"\x10" -# Map from bytes to enum value, for decoding. -DTYPE_FROM_HEX = {key.value: key for key in BinaryVectorDtype} - - @dataclass class BinaryVector: """**(BETA)** Vector of numbers along with metadata for binary interoperability.