Skip to content

Commit e2fa03b

Browse files
authored
ARROW-52 Add support for BSON binary type (#128)
1 parent e92dd2f commit e2fa03b

File tree

14 files changed

+628
-37
lines changed

14 files changed

+628
-37
lines changed

bindings/python/pymongoarrow/api.py

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -18,20 +18,10 @@
1818
from bson import encode
1919
from bson.codec_options import TypeEncoder, TypeRegistry
2020
from bson.raw_bson import RawBSONDocument
21+
from numpy import ndarray
22+
from pandas import NA, DataFrame
2123
from pyarrow import Schema as ArrowSchema
2224
from pyarrow import Table
23-
24-
try:
25-
from numpy import ndarray
26-
except ImportError:
27-
ndarray = None
28-
29-
try:
30-
from pandas import NA, DataFrame
31-
except ImportError:
32-
DataFrame = None
33-
NA = None
34-
3525
from pymongo.bulk import BulkWriteError
3626
from pymongo.common import MAX_WRITE_BATCH_SIZE
3727
from pymongoarrow.context import PyMongoArrowContext

bindings/python/pymongoarrow/context.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from bson.codec_options import DEFAULT_CODEC_OPTIONS
1515
from pyarrow import Table, timestamp
1616
from pymongoarrow.lib import (
17+
BinaryBuilder,
1718
BoolBuilder,
1819
DatetimeBuilder,
1920
DocumentBuilder,
@@ -37,6 +38,7 @@
3738
_BsonArrowTypes.bool: BoolBuilder,
3839
_BsonArrowTypes.document: DocumentBuilder,
3940
_BsonArrowTypes.array: ListBuilder,
41+
_BsonArrowTypes.binary: BinaryBuilder,
4042
}
4143

4244

@@ -90,7 +92,9 @@ def from_schema(cls, schema, codec_options=DEFAULT_CODEC_OPTIONS):
9092
elif builder_cls == ListBuilder:
9193
arrow_type = schema.typemap[fname]
9294
builder_map[encoded_fname] = ListBuilder(arrow_type, tzinfo)
93-
95+
elif builder_cls == BinaryBuilder:
96+
subtype = schema.typemap[fname].subtype
97+
builder_map[encoded_fname] = BinaryBuilder(subtype)
9498
else:
9599
builder_map[encoded_fname] = builder_cls()
96100
return cls(schema, builder_map)

bindings/python/pymongoarrow/lib.pyx

Lines changed: 67 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ from pyarrow.lib import (
3131

3232
from pymongoarrow.errors import InvalidBSON, PyMongoArrowError
3333
from pymongoarrow.context import PyMongoArrowContext
34-
from pymongoarrow.types import _BsonArrowTypes, _atypes, ObjectIdType, Decimal128StringType
34+
from pymongoarrow.types import _BsonArrowTypes, _atypes, ObjectIdType, Decimal128StringType, BinaryType
3535

3636
# Cython imports
3737
from cpython cimport PyBytes_Size, object
@@ -68,6 +68,7 @@ _builder_type_map = {
6868
BSON_TYPE_DOCUMENT: DocumentBuilder,
6969
BSON_TYPE_DECIMAL128: StringBuilder,
7070
BSON_TYPE_ARRAY: ListBuilder,
71+
BSON_TYPE_BINARY: BinaryBuilder
7172
}
7273

7374
_field_type_map = {
@@ -82,6 +83,10 @@ _field_type_map = {
8283

8384
cdef extract_field_dtype(bson_iter_t * doc_iter, bson_iter_t * child_iter, bson_type_t value_t, context):
8485
"""Get the appropriate data type for a specific field"""
86+
cdef const uint8_t *val_buf = NULL
87+
cdef uint32_t val_buf_len = 0
88+
cdef bson_subtype_t subtype
89+
8590
if value_t in _field_type_map:
8691
field_type = _field_type_map[value_t]
8792
elif value_t == BSON_TYPE_ARRAY:
@@ -93,6 +98,9 @@ cdef extract_field_dtype(bson_iter_t * doc_iter, bson_iter_t * child_iter, bson_
9398
field_type = extract_document_dtype(child_iter, context)
9499
elif value_t == BSON_TYPE_DATE_TIME:
95100
field_type = timestamp('ms', tz=context.tzinfo)
101+
elif value_t == BSON_TYPE_BINARY:
102+
bson_iter_binary (doc_iter, &subtype, &val_buf_len, &val_buf)
103+
field_type = BinaryType(subtype)
96104
else:
97105
raise PyMongoArrowError('unknown value type {}'.format(value_t))
98106
return field_type
@@ -129,10 +137,8 @@ def process_bson_stream(bson_stream, context, arr_value_builder=None):
129137
cdef char *decimal128_str = <char *> malloc(
130138
BSON_DECIMAL128_STRING * sizeof(char))
131139
cdef uint32_t str_len
132-
cdef const uint8_t *doc_buf = NULL
133-
cdef uint32_t doc_buf_len = 0;
134-
cdef const uint8_t *arr_buf = NULL
135-
cdef uint32_t arr_buf_len = 0;
140+
cdef const uint8_t *val_buf = NULL
141+
cdef uint32_t val_buf_len = 0
136142
cdef bson_decimal128_t dec128
137143
cdef bson_type_t value_t
138144
cdef const char * bson_str
@@ -142,6 +148,7 @@ def process_bson_stream(bson_stream, context, arr_value_builder=None):
142148
cdef bson_iter_t child_iter
143149
cdef const char* key
144150
cdef Py_ssize_t count = 0
151+
cdef bson_subtype_t subtype
145152

146153
builder_map = context.builder_map
147154

@@ -155,6 +162,7 @@ def process_bson_stream(bson_stream, context, arr_value_builder=None):
155162
t_bool = _BsonArrowTypes.bool
156163
t_document = _BsonArrowTypes.document
157164
t_array = _BsonArrowTypes.array
165+
t_binary = _BsonArrowTypes.binary
158166

159167

160168
# initialize count to current length of builders
@@ -197,6 +205,10 @@ def process_bson_stream(bson_stream, context, arr_value_builder=None):
197205
list_dtype = extract_array_dtype(&child_iter, context)
198206
list_dtype = list_(list_dtype)
199207
builder = ListBuilder(list_dtype, context.tzinfo, value_builder=arr_value_builder)
208+
elif builder_type == BinaryBuilder:
209+
bson_iter_binary (&doc_iter, &subtype,
210+
&val_buf_len, &val_buf)
211+
builder = BinaryBuilder(subtype)
200212
else:
201213
builder = builder_type()
202214
if arr_value_builder is None:
@@ -257,20 +269,28 @@ def process_bson_stream(bson_stream, context, arr_value_builder=None):
257269
builder.append_null()
258270
elif ftype == t_document:
259271
if value_t == BSON_TYPE_DOCUMENT:
260-
bson_iter_document(&doc_iter, &doc_buf_len, &doc_buf)
261-
if doc_buf_len <= 0:
272+
bson_iter_document(&doc_iter, &val_buf_len, &val_buf)
273+
if val_buf_len <= 0:
262274
raise ValueError("Subdocument is invalid")
263-
builder.append(<bytes>doc_buf[:doc_buf_len])
275+
builder.append(<bytes>val_buf[:val_buf_len])
264276
else:
265277
builder.append_null()
266278
elif ftype == t_array:
267279
if value_t == BSON_TYPE_ARRAY:
268-
bson_iter_array(&doc_iter, &doc_buf_len, &doc_buf)
269-
if doc_buf_len <= 0:
280+
bson_iter_array(&doc_iter, &val_buf_len, &val_buf)
281+
if val_buf_len <= 0:
270282
raise ValueError("Subarray is invalid")
271-
builder.append(<bytes>doc_buf[:doc_buf_len])
283+
builder.append(<bytes>val_buf[:val_buf_len])
272284
else:
273285
builder.append_null()
286+
elif ftype == t_binary:
287+
if value_t == BSON_TYPE_BINARY:
288+
bson_iter_binary (&doc_iter, &subtype,
289+
&val_buf_len, &val_buf)
290+
if subtype != builder.subtype:
291+
builder.append_null()
292+
else:
293+
builder.append(<bytes>val_buf[:val_buf_len])
274294
else:
275295
raise PyMongoArrowError('unknown ftype {}'.format(ftype))
276296
count += 1
@@ -534,6 +554,8 @@ cdef object get_field_builder(field, tzinfo):
534554
field_builder = ObjectIdBuilder()
535555
elif getattr(field_type, '_type_marker') == _BsonArrowTypes.decimal128_str:
536556
field_builder = StringBuilder()
557+
elif getattr(field_type, '_type_marker') == _BsonArrowTypes.binary:
558+
field_builder = BinaryBuilder(field_type.subtype)
537559
else:
538560
field_builder = StringBuilder()
539561
return field_builder
@@ -596,6 +618,7 @@ cdef class DocumentBuilder(_ArrayBuilderBase):
596618
cdef shared_ptr[CStructBuilder] unwrap(self):
597619
return self.builder
598620

621+
599622
cdef class ListBuilder(_ArrayBuilderBase):
600623
type_marker = _BsonArrowTypes.array
601624

@@ -647,3 +670,36 @@ cdef class ListBuilder(_ArrayBuilderBase):
647670

648671
cdef shared_ptr[CListBuilder] unwrap(self):
649672
return self.builder
673+
674+
675+
# Array builder for BSON binary values: wraps an Arrow CBinaryBuilder and
# remembers the binary subtype so that finish() can cast the result to
# BinaryType(subtype).
cdef class BinaryBuilder(_ArrayBuilderBase):
676+
type_marker = _BsonArrowTypes.binary
677+
cdef:
678+
# Underlying Arrow C++ builder that accumulates the appended values.
shared_ptr[CBinaryBuilder] builder
679+
# Subtype passed at construction; exposed read-only via the `subtype` property.
uint8_t _subtype
680+
681+
# Store the subtype and allocate a fresh CBinaryBuilder.
def __cinit__(self, uint8_t subtype):
682+
self._subtype = subtype
683+
self.builder.reset(new CBinaryBuilder())
684+
685+
# Append a null entry to the underlying builder.
cpdef append_null(self):
686+
self.builder.get().AppendNull()
687+
688+
# The subtype this builder was constructed with.
@property
689+
def subtype(self):
690+
return self._subtype
691+
692+
# Current length reported by the underlying builder.
def __len__(self):
693+
return self.builder.get().length()
694+
695+
# Append `value` (cast to bytes), passing len(value) as the byte count.
cpdef append(self, value):
696+
self.builder.get().Append(<bytes>value, len(value))
697+
698+
# Finish the underlying builder and return the result as a pyarrow array
# cast to BinaryType(self._subtype).
cpdef finish(self):
699+
cdef shared_ptr[CArray] out
700+
# Finish() is invoked with the GIL released.
with nogil:
701+
self.builder.get().Finish(&out)
702+
return pyarrow_wrap_array(out).cast(BinaryType(self._subtype))
703+
704+
# Return the shared pointer to the underlying CBinaryBuilder.
cdef shared_ptr[CBinaryBuilder] unwrap(self):
705+
return self.builder

bindings/python/pymongoarrow/libbson.pxd

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,9 @@ cdef extern from "<bson/bson.h>":
4848
ctypedef struct bson_iter_t:
4949
pass
5050

51+
ctypedef enum bson_subtype_t:
52+
pass
53+
5154
ctypedef struct bson_oid_t:
5255
uint8_t bytes[12]
5356

@@ -137,6 +140,10 @@ cdef extern from "<bson/bson.h>":
137140
uint32_t *array_len, # OUT
138141
const uint8_t **array)
139142

143+
void bson_iter_binary (const bson_iter_t *iter, # IN
144+
bson_subtype_t *subtype, # OUT
145+
uint32_t *binary_len, # OUT
146+
const uint8_t **binary) # OUT
140147

141148
# bson_reader_t API
142149
cdef extern from "<bson/bson.h>":

0 commit comments

Comments
 (0)