Skip to content

Commit 17388d1

Browse files
committed
Vastly improve performance
2 parents 6feec44 + b111cbf commit 17388d1

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

70 files changed

+3797
-475
lines changed

.evergreen/resync-specs.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,9 @@ do
7676
atlas-data-lake-testing|data_lake)
7777
cpjson atlas-data-lake-testing/tests/ data_lake
7878
;;
79+
bson-binary-vector|bson_binary_vector)
80+
cpjson bson-binary-vector/tests/ bson_binary_vector
81+
;;
7982
bson-corpus|bson_corpus)
8083
cpjson bson-corpus/tests/ bson_corpus
8184
;;

bson/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1324,7 +1324,7 @@ def decode_iter(
13241324
elements = data[position : position + obj_size]
13251325
position += obj_size
13261326

1327-
yield _bson_to_dict(elements, opts) # type:ignore[misc, type-var]
1327+
yield _bson_to_dict(elements, opts) # type:ignore[misc]
13281328

13291329

13301330
@overload
@@ -1370,7 +1370,7 @@ def decode_file_iter(
13701370
raise InvalidBSON("cut off in middle of objsize")
13711371
obj_size = _UNPACK_INT_FROM(size_data, 0)[0] - 4
13721372
elements = size_data + file_obj.read(max(0, obj_size))
1373-
yield _bson_to_dict(elements, opts) # type:ignore[type-var, arg-type, misc]
1373+
yield _bson_to_dict(elements, opts) # type:ignore[arg-type, misc]
13741374

13751375

13761376
def is_valid(bson: bytes) -> bool:

bson/_cbsonmodule.c

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,7 @@ static PyObject* _test_long_long_to_str(PyObject* self, PyObject* args) {
207207
*
208208
* Returns a new ref */
209209
static PyObject* _error(char* name) {
210-
PyObject* error;
210+
PyObject* error = NULL;
211211
PyObject* errors = PyImport_ImportModule("bson.errors");
212212
if (!errors) {
213213
return NULL;
@@ -279,7 +279,7 @@ static PyObject* datetime_from_millis(long long millis) {
279279
* micros = diff * 1000 = 111000
280280
* Resulting in datetime(1, 1, 1, 1, 1, 1, 111000) -- the expected result
281281
*/
282-
PyObject* datetime;
282+
PyObject* datetime = NULL;
283283
int diff = (int)(((millis % 1000) + 1000) % 1000);
284284
int microseconds = diff * 1000;
285285
Time64_T seconds = (millis - diff) / 1000;
@@ -294,7 +294,7 @@ static PyObject* datetime_from_millis(long long millis) {
294294
timeinfo.tm_sec,
295295
microseconds);
296296
if(!datetime) {
297-
PyObject *etype, *evalue, *etrace;
297+
PyObject *etype = NULL, *evalue = NULL, *etrace = NULL;
298298

299299
/*
300300
* Calling _error clears the error state, so fetch it first.
@@ -350,8 +350,8 @@ static PyObject* datetime_ms_from_millis(PyObject* self, long long millis){
350350
return NULL;
351351
}
352352

353-
PyObject* dt;
354-
PyObject* ll_millis;
353+
PyObject* dt = NULL;
354+
PyObject* ll_millis = NULL;
355355

356356
if (!(ll_millis = PyLong_FromLongLong(millis))){
357357
return NULL;
@@ -1790,7 +1790,7 @@ static PyObject* _cbson_dict_to_bson(PyObject* self, PyObject* args) {
17901790
PyObject* result;
17911791
unsigned char check_keys;
17921792
unsigned char top_level = 1;
1793-
PyObject* options_obj;
1793+
PyObject* options_obj = NULL;
17941794
codec_options_t options;
17951795
buffer_t buffer;
17961796
PyObject* raw_bson_document_bytes_obj;
@@ -2512,8 +2512,8 @@ static PyObject* get_value(PyObject* self, PyObject* name, const char* buffer,
25122512
* Wrap any non-InvalidBSON errors in InvalidBSON.
25132513
*/
25142514
if (PyErr_Occurred()) {
2515-
PyObject *etype, *evalue, *etrace;
2516-
PyObject *InvalidBSON;
2515+
PyObject *etype = NULL, *evalue = NULL, *etrace = NULL;
2516+
PyObject *InvalidBSON = NULL;
25172517

25182518
/*
25192519
* Calling _error clears the error state, so fetch it first.
@@ -2585,8 +2585,8 @@ static int _element_to_dict(PyObject* self, const char* string,
25852585
if (!*name) {
25862586
/* If NULL is returned then wrap the UnicodeDecodeError
25872587
in an InvalidBSON error */
2588-
PyObject *etype, *evalue, *etrace;
2589-
PyObject *InvalidBSON;
2588+
PyObject *etype = NULL, *evalue = NULL, *etrace = NULL;
2589+
PyObject *InvalidBSON = NULL;
25902590

25912591
PyErr_Fetch(&etype, &evalue, &etrace);
25922592
if (PyErr_GivenExceptionMatches(etype, PyExc_Exception)) {
@@ -2620,7 +2620,7 @@ static PyObject* _cbson_element_to_dict(PyObject* self, PyObject* args) {
26202620
/* TODO: Support buffer protocol */
26212621
char* string;
26222622
PyObject* bson;
2623-
PyObject* options_obj;
2623+
PyObject* options_obj = NULL;
26242624
codec_options_t options;
26252625
unsigned position;
26262626
unsigned max;
@@ -2732,7 +2732,7 @@ static PyObject* _cbson_bson_to_dict(PyObject* self, PyObject* args) {
27322732
int32_t size;
27332733
Py_ssize_t total_size;
27342734
const char* string;
2735-
PyObject* bson;
2735+
PyObject* bson = NULL;
27362736
codec_options_t options;
27372737
PyObject* result = NULL;
27382738
PyObject* options_obj;

bson/binary.py

Lines changed: 149 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,10 @@
1313
# limitations under the License.
1414
from __future__ import annotations
1515

16-
from typing import TYPE_CHECKING, Any, Tuple, Type, Union
16+
import struct
17+
from dataclasses import dataclass
18+
from enum import Enum
19+
from typing import TYPE_CHECKING, Any, Sequence, Tuple, Type, Union
1720
from uuid import UUID
1821

1922
"""Tools for representing BSON binary data.
@@ -191,21 +194,75 @@ class UuidRepresentation:
191194
"""
192195

193196

197+
VECTOR_SUBTYPE = 9
198+
"""**(BETA)** BSON binary subtype for densely packed vector data.
199+
200+
.. versionadded:: 4.10
201+
"""
202+
203+
194204
USER_DEFINED_SUBTYPE = 128
195205
"""BSON binary subtype for any user defined structure.
196206
"""
197207

198208

209+
class BinaryVectorDtype(Enum):
210+
"""**(BETA)** Datatypes of vector subtype.
211+
212+
:param FLOAT32: (0x27) Pack list of :class:`float` as float32
213+
:param INT8: (0x03) Pack list of :class:`int` in [-128, 127] as signed int8
214+
:param PACKED_BIT: (0x10) Pack list of :class:`int` in [0, 255] as unsigned 8-bit integers (uint8)
215+
216+
The `PACKED_BIT` value represents a special case where vector values themselves
217+
can only be of two values (0 or 1) but these are packed together into groups of 8,
218+
a byte. In Python, these are displayed as ints in the range [0, 255].
219+
220+
Each value is of type bytes with a length of one.
221+
222+
.. versionadded:: 4.10
223+
"""
224+
225+
INT8 = b"\x03"
226+
FLOAT32 = b"\x27"
227+
PACKED_BIT = b"\x10"
228+
229+
230+
@dataclass
231+
class BinaryVector:
232+
"""**(BETA)** Vector of numbers along with metadata for binary interoperability.
233+
.. versionadded:: 4.10
234+
"""
235+
236+
__slots__ = ("data", "dtype", "padding")
237+
238+
def __init__(self, data: Sequence[float | int], dtype: BinaryVectorDtype, padding: int = 0):
239+
"""
240+
:param data: Sequence of numbers representing the mathematical vector.
241+
:param dtype: The data type stored in binary
242+
:param padding: The number of bits in the final byte that are to be ignored
243+
when a vector element's size is less than a byte
244+
and the length of the vector is not a multiple of 8.
245+
"""
246+
self.data = data
247+
self.dtype = dtype
248+
self.padding = padding
249+
250+
199251
class Binary(bytes):
200252
"""Representation of BSON binary data.
201253
202-
This is necessary because we want to represent Python strings as
203-
the BSON string type. We need to wrap binary data so we can tell
254+
We want to represent Python strings as the BSON string type.
255+
We need to wrap binary data so that we can tell
204256
the difference between what should be considered binary data and
205257
what should be considered a string when we encode to BSON.
206258
207-
Raises TypeError if `data` is not an instance of :class:`bytes`
208-
or `subtype` is not an instance of :class:`int`.
259+
**(BETA)** Subtype 9 provides a space-efficient representation of 1-dimensional vector data.
260+
Its data is prepended with two bytes of metadata.
261+
The first (dtype) describes its data type, such as float32 or int8.
262+
The second (padding) prescribes the number of bits to ignore in the final byte.
263+
This is relevant when the element size of the dtype, in bits, is not a multiple of 8.
264+
265+
Raises TypeError if `subtype` is not an instance of :class:`int`.
209266
Raises ValueError if `subtype` is not in [0, 256).
210267
211268
.. note::
@@ -218,7 +275,10 @@ class Binary(bytes):
218275
to use
219276
220277
.. versionchanged:: 3.9
221-
Support any bytes-like type that implements the buffer protocol.
278+
Support any bytes-like type that implements the buffer protocol.
279+
280+
.. versionchanged:: 4.10
281+
**(BETA)** Addition of vector subtype.
222282
"""
223283

224284
_type_marker = 5
@@ -337,6 +397,89 @@ def as_uuid(self, uuid_representation: int = UuidRepresentation.STANDARD) -> UUI
337397
f"cannot decode subtype {self.subtype} to {UUID_REPRESENTATION_NAMES[uuid_representation]}"
338398
)
339399

400+
@classmethod
401+
def from_vector(
402+
cls: Type[Binary],
403+
vector: list[int, float],
404+
dtype: BinaryVectorDtype,
405+
padding: int = 0,
406+
) -> Binary:
407+
"""**(BETA)** Create a BSON :class:`~bson.binary.Binary` of Vector subtype from a list of Numbers.
408+
409+
To interpret the representation of the numbers, a data type must be included.
410+
See :class:`~bson.binary.BinaryVectorDtype` for available types and descriptions.
411+
412+
The dtype and padding are prepended to the binary data's value.
413+
414+
:param vector: List of values
415+
:param dtype: Data type of the values
416+
:param padding: For fractional bytes, number of bits to ignore at end of vector.
417+
:return: Binary packed data identified by dtype and padding.
418+
419+
.. versionadded:: 4.10
420+
"""
421+
if dtype == BinaryVectorDtype.INT8: # pack ints in [-128, 127] as signed int8
422+
format_str = "b"
423+
if padding:
424+
raise ValueError(f"padding does not apply to {dtype=}")
425+
elif dtype == BinaryVectorDtype.PACKED_BIT: # pack ints in [0, 255] as unsigned uint8
426+
format_str = "B"
427+
elif dtype == BinaryVectorDtype.FLOAT32: # pack floats as float32
428+
format_str = "f"
429+
if padding:
430+
raise ValueError(f"padding does not apply to {dtype=}")
431+
else:
432+
raise NotImplementedError("%s not yet supported" % dtype)
433+
434+
metadata = struct.pack("<sB", dtype.value, padding)
435+
data = struct.pack(f"<{len(vector)}{format_str}", *vector)
436+
return cls(metadata + data, subtype=VECTOR_SUBTYPE)
437+
438+
def as_vector(self) -> BinaryVector:
439+
"""**(BETA)** Decode this Binary into a :class:`BinaryVector`: a list of numbers along with its dtype and padding.
440+
441+
:return: BinaryVector
442+
443+
.. versionadded:: 4.10
444+
"""
445+
446+
if self.subtype != VECTOR_SUBTYPE:
447+
raise ValueError(f"Cannot decode subtype {self.subtype} as a vector.")
448+
449+
position = 0
450+
dtype, padding = struct.unpack_from("<sB", self, position)
451+
position += 2
452+
dtype = BinaryVectorDtype(dtype)
453+
n_values = len(self) - position
454+
455+
if dtype == BinaryVectorDtype.INT8:
456+
dtype_format = "b"
457+
format_string = f"<{n_values}{dtype_format}"
458+
vector = list(struct.unpack_from(format_string, self, position))
459+
return BinaryVector(vector, dtype, padding)
460+
461+
elif dtype == BinaryVectorDtype.FLOAT32:
462+
n_bytes = len(self) - position
463+
n_values = n_bytes // 4
464+
if n_bytes % 4:
465+
raise ValueError(
466+
"Corrupt data. N bytes for a float32 vector must be a multiple of 4."
467+
)
468+
dtype_format = "f"
469+
format_string = f"<{n_values}{dtype_format}"
470+
vector = list(struct.unpack_from(format_string, self, position))
471+
return BinaryVector(vector, dtype, padding)
472+
473+
elif dtype == BinaryVectorDtype.PACKED_BIT:
474+
# data packed as uint8
475+
dtype_format = "B"
476+
format_string = f"<{n_values}{dtype_format}"
477+
unpacked_uint8s = list(struct.unpack_from(format_string, self, position))
478+
return BinaryVector(unpacked_uint8s, dtype, padding)
479+
480+
else:
481+
raise NotImplementedError("Binary Vector dtype %s not yet supported" % dtype.name)
482+
340483
@property
341484
def subtype(self) -> int:
342485
"""Subtype of this binary data."""

bson/decimal128.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,7 @@ def __init__(self, value: _VALUE_OPTIONS) -> None:
223223
"from list or tuple. Must have exactly 2 "
224224
"elements."
225225
)
226-
self.__high, self.__low = value # type: ignore
226+
self.__high, self.__low = value
227227
else:
228228
raise TypeError(f"Cannot convert {value!r} to Decimal128")
229229

bson/json_util.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -324,7 +324,7 @@ def __new__(
324324
"JSONOptions.datetime_representation must be one of LEGACY, "
325325
"NUMBERLONG, or ISO8601 from DatetimeRepresentation."
326326
)
327-
self = cast(JSONOptions, super().__new__(cls, *args, **kwargs)) # type:ignore[arg-type]
327+
self = cast(JSONOptions, super().__new__(cls, *args, **kwargs))
328328
if json_mode not in (JSONMode.LEGACY, JSONMode.RELAXED, JSONMode.CANONICAL):
329329
raise ValueError(
330330
"JSONOptions.json_mode must be one of LEGACY, RELAXED, "

bson/son.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ def __init__(
6868
self.update(kwargs)
6969

7070
def __new__(cls: Type[SON[_Key, _Value]], *args: Any, **kwargs: Any) -> SON[_Key, _Value]:
71-
instance = super().__new__(cls, *args, **kwargs) # type: ignore[type-var]
71+
instance = super().__new__(cls, *args, **kwargs)
7272
instance.__keys = []
7373
return instance
7474

doc/api/bson/binary.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,14 @@
2121
.. autoclass:: UuidRepresentation
2222
:members:
2323

24+
.. autoclass:: BinaryVectorDtype
25+
:members:
26+
:show-inheritance:
27+
28+
.. autoclass:: BinaryVector
29+
:members:
30+
31+
2432
.. autoclass:: Binary(data, subtype=BINARY_SUBTYPE)
2533
:members:
2634
:show-inheritance:

doc/async-tutorial.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
11
Async Tutorial
22
==============
33

4+
.. warning:: This API is currently in beta, meaning the classes, methods,
5+
and behaviors described within may change before the full release.
6+
If you come across any bugs during your use of this API,
7+
please file a Jira ticket in the "Python Driver" project at https://jira.mongodb.org/browse/PYTHON.
8+
49
.. code-block:: pycon
510
611
from pymongo import AsyncMongoClient

0 commit comments

Comments
 (0)