1313# limitations under the License.
1414from __future__ import annotations
1515
16- from typing import TYPE_CHECKING , Any , Tuple , Type , Union
16+ import struct
17+ from dataclasses import dataclass
18+ from enum import Enum
19+ from typing import TYPE_CHECKING , Any , Sequence , Tuple , Type , Union
1720from uuid import UUID
1821
1922"""Tools for representing BSON binary data.
@@ -191,21 +194,75 @@ class UuidRepresentation:
191194"""
192195
193196
# Subtype byte identifying vector payloads; Binary.from_vector() tags its
# output with it and Binary.as_vector() refuses any other subtype.
VECTOR_SUBTYPE = 9
"""**(BETA)** BSON binary subtype for densely packed vector data.

.. versionadded:: 4.10
"""


# Subtype byte for application-defined binary structures.
USER_DEFINED_SUBTYPE = 128
"""BSON binary subtype for any user defined structure.
"""
198208
class BinaryVectorDtype(Enum):
    """**(BETA)** Datatypes of vector subtype.

    :param FLOAT32: (0x27) Pack list of :class:`float` as float32
    :param INT8: (0x03) Pack list of :class:`int` in [-128, 127] as signed int8
    :param PACKED_BIT: (0x10) Pack list of :class:`int` in [0, 255] as unsigned uint8

    The `PACKED_BIT` value represents a special case where vector values themselves
    can only be of two values (0 or 1) but these are packed together into groups of 8,
    a byte. In Python, these are displayed as ints in range [0, 255].

    Each value is of type bytes with a length of one.

    .. versionadded:: 4.10
    """

    # Every member must be exactly one byte long: the value is written as the
    # first metadata byte with struct format "<sB" and looked up again from
    # that single byte when decoding. A longer literal could never round-trip.
    INT8 = b"\x03"
    FLOAT32 = b"\x27"
    PACKED_BIT = b"\x10"
228+
229+
@dataclass
class BinaryVector:
    """**(BETA)** Vector of numbers along with metadata for binary interoperability.

    Instances compare equal when their ``dtype``, ``padding`` and ``data`` are
    all equal, and their ``repr`` shows those three attributes.

    .. versionadded:: 4.10
    """

    __slots__ = ("data", "dtype", "padding")

    def __init__(self, data: Sequence[float | int], dtype: BinaryVectorDtype, padding: int = 0):
        """
        :param data: Sequence of numbers representing the mathematical vector.
        :param dtype: The data type stored in binary
        :param padding: The number of bits in the final byte that are to be ignored
          when a vector element's size is less than a byte
          and the length of the vector is not a multiple of 8.
        """
        self.data = data
        self.dtype = dtype
        self.padding = padding

    # The class declares no annotated fields, so the methods that @dataclass
    # would generate operate on zero fields: repr() would print "BinaryVector()"
    # and any two instances would compare equal. Explicit definitions take
    # precedence over the generated ones.
    def __repr__(self) -> str:
        """Return a string showing the dtype, padding and data of the vector."""
        return f"BinaryVector(dtype={self.dtype}, padding={self.padding}, data={self.data})"

    def __eq__(self, other: Any) -> bool:
        """Compare two vectors attribute by attribute.

        :param other: Object to compare against.
        :return: True when `other` is a BinaryVector with equal dtype, padding
          and data; ``NotImplemented`` for any other type.
        """
        if not isinstance(other, BinaryVector):
            return NotImplemented
        return (
            self.dtype == other.dtype
            and self.padding == other.padding
            and self.data == other.data
        )
249+
250+
199251class Binary (bytes ):
200252 """Representation of BSON binary data.
201253
202- This is necessary because we want to represent Python strings as
203- the BSON string type. We need to wrap binary data so we can tell
254+ We want to represent Python strings as the BSON string type.
255+ We need to wrap binary data so that we can tell
204256 the difference between what should be considered binary data and
205257 what should be considered a string when we encode to BSON.
206258
207- Raises TypeError if `data` is not an instance of :class:`bytes`
208- or `subtype` is not an instance of :class:`int`.
259+ **(BETA)** Subtype 9 provides a space-efficient representation of 1-dimensional vector data.
260+ Its data is prepended with two bytes of metadata.
261+ The first (dtype) describes its data type, such as float32 or int8.
262+ The second (padding) prescribes the number of bits to ignore in the final byte.
263+ This is relevant when the element size of the dtype is not a multiple of 8.
264+
265+ Raises TypeError if `subtype` is not an instance of :class:`int`.
209266 Raises ValueError if `subtype` is not in [0, 256).
210267
211268 .. note::
@@ -218,7 +275,10 @@ class Binary(bytes):
218275 to use
219276
220277 .. versionchanged:: 3.9
221- Support any bytes-like type that implements the buffer protocol.
278+ Support any bytes-like type that implements the buffer protocol.
279+
280+ .. versionchanged:: 4.10
281+ **(BETA)** Addition of vector subtype.
222282 """
223283
224284 _type_marker = 5
@@ -337,6 +397,86 @@ def as_uuid(self, uuid_representation: int = UuidRepresentation.STANDARD) -> UUI
337397 f"cannot decode subtype { self .subtype } to { UUID_REPRESENTATION_NAMES [uuid_representation ]} "
338398 )
339399
@classmethod
def from_vector(
    cls: Type[Binary],
    vector: Sequence[float | int],
    dtype: BinaryVectorDtype,
    padding: int = 0,
) -> Binary:
    """**(BETA)** Create a BSON :class:`~bson.binary.Binary` of Vector subtype from a list of Numbers.

    To interpret the representation of the numbers, a data type must be included.
    See :class:`~bson.binary.BinaryVectorDtype` for available types and descriptions.

    The dtype and padding are prepended to the binary data's value.
    All values are written in little-endian byte order, independent of the host.

    :param vector: List of values
    :param dtype: Data type of the values
    :param padding: For fractional bytes, number of bits to ignore at end of vector.
    :return: Binary packed data identified by dtype and padding.
    :raises ValueError: If `padding` is non-zero for a dtype whose elements fill
      whole bytes, or is outside [0, 7] for ``PACKED_BIT``.
    :raises NotImplementedError: If `dtype` is not a supported vector dtype.

    .. versionadded:: 4.10
    """
    if dtype == BinaryVectorDtype.INT8:  # pack ints in [-128, 127] as signed int8
        format_str = "b"
    elif dtype == BinaryVectorDtype.PACKED_BIT:  # pack ints in [0, 255] as unsigned uint8
        format_str = "B"
    elif dtype == BinaryVectorDtype.FLOAT32:  # pack floats as float32
        format_str = "f"
    else:
        raise NotImplementedError(f"{dtype} not yet supported")

    if dtype == BinaryVectorDtype.PACKED_BIT:
        # Padding counts ignored bits of the final byte, so only 0..7 is meaningful.
        if not 0 <= padding <= 7:
            raise ValueError(f"{padding=}. It must be in [0, 7].")
    elif padding:
        raise ValueError(f"padding does not apply to {dtype=}")

    # "<" forces little-endian, standard-size packing; a bare format would use
    # the host's native byte order and produce different bytes on big-endian
    # machines.
    metadata = struct.pack("<sB", dtype.value, padding)
    data = struct.pack(f"<{len(vector)}{format_str}", *vector)
    return cls(metadata + data, subtype=VECTOR_SUBTYPE)
437+
def as_vector(self) -> BinaryVector:
    """**(BETA)** From the Binary, create a list of numbers, along with dtype and padding.

    The first two bytes are read as metadata (dtype, padding); the remaining
    bytes are decoded as little-endian values of that dtype.

    :return: BinaryVector
    :raises ValueError: If the subtype is not the vector subtype, the dtype byte
      is not a known :class:`~bson.binary.BinaryVectorDtype`, or the payload
      is too short or has a length that does not fit the dtype.
    :raises NotImplementedError: If the dtype is recognised but has no decoder.

    .. versionadded:: 4.10
    """
    if self.subtype != VECTOR_SUBTYPE:
        raise ValueError(f"Cannot decode subtype {self.subtype} as a vector.")

    header_size = 2  # one dtype byte followed by one padding byte
    if len(self) < header_size:
        raise ValueError(
            "Corrupt data. A vector must start with two bytes of metadata (dtype and padding)."
        )

    dtype_byte, padding = struct.unpack_from("<sB", self, 0)
    dtype = BinaryVectorDtype(dtype_byte)
    n_bytes = len(self) - header_size

    # "<" pins little-endian, standard-size decoding regardless of the host.
    if dtype == BinaryVectorDtype.INT8:
        values = struct.unpack_from(f"<{n_bytes}b", self, header_size)
    elif dtype == BinaryVectorDtype.FLOAT32:
        if n_bytes % 4:
            raise ValueError(
                "Corrupt data. N bytes for a float32 vector must be a multiple of 4."
            )
        values = struct.unpack_from(f"<{n_bytes // 4}f", self, header_size)
    elif dtype == BinaryVectorDtype.PACKED_BIT:
        # data packed as uint8
        values = struct.unpack_from(f"<{n_bytes}B", self, header_size)
    else:
        raise NotImplementedError("Binary Vector dtype %s not yet supported" % dtype.name)

    return BinaryVector(list(values), dtype, padding)
479+
340480 @property
341481 def subtype (self ) -> int :
342482 """Subtype of this binary data."""
0 commit comments