Skip to content

Commit 1998d74

Browse files
authored
ARROW-12 Support ObjectId datatype (#44)
1 parent 800fc55 commit 1998d74

File tree

11 files changed

+149
-28
lines changed

11 files changed

+149
-28
lines changed

bindings/python/docs/source/changelog.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
Changelog
22
=========
33

4+
Changes in Version 0.3.0
5+
------------------------
6+
- Support for `ObjectId` `bson` type.
7+
48
Changes in Version 0.2.0
59
------------------------
610

bindings/python/docs/source/supported_types.rst

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ Support for additional types will be added in subsequent releases.
1515

1616
* - BSON Type
1717
- Type Identifiers
18+
* - ObjectId
19+
- :class:`py.bytes`, :class:`bson.ObjectId`, an instance of :class:`pyarrow.FixedSizeBinaryScalar`
1820
* - 64-bit binary floating point
1921
- :class:`py.float`, an instance of :meth:`pyarrow.float64`
2022
* - 32-bit integer
@@ -27,6 +29,10 @@ Support for additional types will be added in subsequent releases.
2729
Type identifiers can be used to specify that a field is of a certain type
2830
during :class:`pymongoarrow.api.Schema` declaration. For example, if your data
2931
has fields 'f1' and 'f2' bearing types 32-bit integer and UTC datetime
30-
respectively, your schema can be defined as::
32+
respectively, and '_id' that is an `ObjectId`, your schema can be defined as::
3133

32-
schema = Schema({'f1': pyarrow.int32(), 'f2': pyarrow.timestamp('ms')})
34+
schema = Schema({
35+
'_id': ObjectId,
36+
'f1': pyarrow.int32(),
37+
'f2': pyarrow.timestamp('ms')
38+
})

bindings/python/pymongoarrow/bson.pyi

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ def process_bson_stream(bson_stream, context):
3636
t_int64 = _BsonArrowTypes.int64
3737
t_double = _BsonArrowTypes.double
3838
t_datetime = _BsonArrowTypes.datetime
39+
t_oid = _BsonArrowTypes.objectid
3940
builder_map = context.builder_map
4041

4142
# initialize count to current length of builders
@@ -69,6 +70,11 @@ def process_bson_stream(bson_stream, context):
6970
builder.append(bson_iter_as_int64(&doc_iter))
7071
else:
7172
builder.append_null()
73+
elif ftype == t_oid:
74+
if value_t == BSON_TYPE_OID:
75+
builder.append(<bytes>(<uint8_t*>bson_iter_oid(&doc_iter))[:12])
76+
else:
77+
builder.append_null()
7278
elif ftype == t_double:
7379
if (value_t == BSON_TYPE_DOUBLE or
7480
value_t == BSON_TYPE_BOOL or

bindings/python/pymongoarrow/builders.pyi

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,40 @@ cdef class _ArrayBuilderBase:
1919
self.append(value)
2020

2121

22+
cdef class ObjectIdBuilder(_ArrayBuilderBase):
23+
type_marker = _BsonArrowTypes.objectid
24+
cdef:
25+
shared_ptr[CFixedSizeBinaryBuilder] builder
26+
27+
def __cinit__(self, MemoryPool memory_pool=None):
28+
cdef shared_ptr[CDataType] dtype = fixed_size_binary(12)
29+
cdef CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool)
30+
self.builder.reset(new CFixedSizeBinaryBuilder(dtype, pool))
31+
32+
cpdef append_null(self):
33+
self.builder.get().AppendNull()
34+
35+
def __len__(self):
36+
return self.builder.get().length()
37+
38+
cpdef append(self, value):
39+
if value is None or value is np.nan:
40+
self.builder.get().AppendNull()
41+
elif isinstance(value, bytes):
42+
self.builder.get().Append(value)
43+
else:
44+
raise TypeError('ObjectIdBuilder only accepts bytes objects')
45+
46+
cpdef finish(self):
47+
cdef shared_ptr[CArray] out
48+
with nogil:
49+
self.builder.get().Finish(&out)
50+
return pyarrow_wrap_array(out)
51+
52+
cdef shared_ptr[CFixedSizeBinaryBuilder] unwrap(self):
53+
return self.builder
54+
55+
2256
cdef class Int32Builder(_ArrayBuilderBase):
2357
type_marker = _BsonArrowTypes.int32
2458
cdef:

bindings/python/pymongoarrow/context.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,15 +14,16 @@
1414
from bson.codec_options import DEFAULT_CODEC_OPTIONS
1515

1616
from pyarrow import timestamp, Table
17-
from pymongoarrow.lib import Int32Builder, Int64Builder, DoubleBuilder, DatetimeBuilder
17+
from pymongoarrow.lib import Int32Builder, Int64Builder, DoubleBuilder, DatetimeBuilder, ObjectIdBuilder
1818
from pymongoarrow.types import _get_internal_typemap, _BsonArrowTypes
1919

2020

2121
_TYPE_TO_BUILDER_CLS = {
2222
_BsonArrowTypes.int32: Int32Builder,
2323
_BsonArrowTypes.int64: Int64Builder,
2424
_BsonArrowTypes.double: DoubleBuilder,
25-
_BsonArrowTypes.datetime: DatetimeBuilder
25+
_BsonArrowTypes.datetime: DatetimeBuilder,
26+
_BsonArrowTypes.objectid: ObjectIdBuilder
2627
}
2728

2829

bindings/python/pymongoarrow/lib.pyx

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ from libcpp cimport bool as cbool
3434
from libcpp.map cimport map
3535
from libcpp.string cimport string
3636
from pyarrow.lib cimport *
37+
from pymongoarrow.libarrow cimport *
3738
from pymongoarrow.libbson cimport *
3839
from pymongoarrow.types import _BsonArrowTypes
3940

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# Copyright 2021-present MongoDB, Inc.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# Cython compiler directives
16+
# cython: language_level=3
17+
# distutils: language=c
18+
from libc.stdint cimport int32_t, uint8_t
19+
from pyarrow.lib cimport *
20+
from pyarrow.includes.libarrow cimport (CStatus, CMemoryPool)
21+
22+
23+
# libarrow type wrappings
24+
25+
cdef extern from "arrow/builder.h" namespace "arrow" nogil:
26+
cdef cppclass CFixedSizeBinaryBuilder" arrow::FixedSizeBinaryBuilder"(CArrayBuilder):
27+
CFixedSizeBinaryBuilder(shared_ptr[CDataType], CMemoryPool* pool)
28+
CStatus Append(const uint8_t* value)
29+
30+
31+
cdef extern from "arrow/type_fwd.h" namespace "arrow" nogil:
32+
shared_ptr[CDataType] fixed_size_binary(int32_t byte_width);

bindings/python/pymongoarrow/libbson.pxd

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,8 @@ cdef extern from "<bson/bson.h>":
108108

109109
int64_t bson_iter_date_time(const bson_iter_t *iter)
110110

111+
const bson_oid_t * bson_iter_oid (const bson_iter_t *iter)
112+
111113
# TODO: add decimal128
112114

113115
double bson_iter_double(const bson_iter_t *iter)

bindings/python/pymongoarrow/types.py

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from datetime import datetime
1515
import enum
1616

17-
from bson import Int64
17+
from bson import Int64, ObjectId
1818

1919
from pyarrow import timestamp, float64, int64, int32
2020
from pyarrow import DataType as _ArrowDataType
@@ -26,21 +26,28 @@ class _BsonArrowTypes(enum.Enum):
2626
double = 2
2727
int32 = 3
2828
int64 = 4
29+
objectid = 5
2930

3031

3132
_TYPE_NORMALIZER_FACTORY = {
3233
Int64: lambda _: int64(),
3334
float: lambda _: float64(),
3435
int: lambda _: int64(),
35-
datetime: lambda _: timestamp('ms') # TODO: add tzinfo support
36+
datetime: lambda _: timestamp('ms'), # TODO: add tzinfo support
37+
ObjectId: lambda _: ObjectId
3638
}
3739

3840

41+
def _is_objectid(obj):
42+
return obj == ObjectId
43+
44+
3945
_TYPE_CHECKER_TO_INTERNAL_TYPE = {
4046
_atypes.is_int32: _BsonArrowTypes.int32,
4147
_atypes.is_int64: _BsonArrowTypes.int64,
4248
_atypes.is_float64: _BsonArrowTypes.double,
43-
_atypes.is_timestamp: _BsonArrowTypes.datetime
49+
_atypes.is_timestamp: _BsonArrowTypes.datetime,
50+
_is_objectid: _BsonArrowTypes.objectid
4451
}
4552

4653

@@ -64,8 +71,14 @@ def _get_internal_typemap(typemap):
6471
internal_typemap = {}
6572
for fname, ftype in typemap.items():
6673
for checker, internal_id in _TYPE_CHECKER_TO_INTERNAL_TYPE.items():
67-
if checker(ftype):
68-
internal_typemap[fname] = internal_id
69-
continue
74+
# Catch error where the pyarrow checkers are looking for an `id`
75+
# attribute that might not exist on non-pyarrow types
76+
# (like ObjectId). For example, `is_int32()` checks for
77+
# `t.id == lib.Type_INT32`.
78+
try:
79+
if checker(ftype):
80+
internal_typemap[fname] = internal_id
81+
except AttributeError:
82+
pass
7083
assert len(internal_typemap) == len(typemap)
7184
return internal_typemap

bindings/python/test/test_bson.py

Lines changed: 21 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,12 @@
1818
from pymongoarrow.context import PyMongoArrowContext
1919
from pymongoarrow.lib import process_bson_stream
2020
from pymongoarrow.schema import Schema
21-
from pymongoarrow.types import int32, int64
21+
from pymongoarrow.types import int32, int64, ObjectId
2222

2323

2424
class TestBsonToArrowConversionBase(TestCase):
2525
def setUp(self):
26-
self.schema = Schema({'_id': int32(),
26+
self.schema = Schema({'_id': ObjectId,
2727
'data': int64()})
2828
self.context = PyMongoArrowContext.from_schema(
2929
self.schema)
@@ -47,26 +47,29 @@ def _run_test(self, doclist, as_dict):
4747

4848

4949
class TestValidBsonToArrowConversion(TestBsonToArrowConversionBase):
50+
5051
def test_simple(self):
51-
docs = [{'_id': 1, 'data': 10},
52-
{'_id': 2, 'data': 20},
53-
{'_id': 3, 'data': 30},
54-
{'_id': 4, 'data': 40}]
52+
ids = [ObjectId() for i in range(4)]
53+
docs = [{'_id': ids[0], 'data': 10},
54+
{'_id': ids[1], 'data': 20},
55+
{'_id': ids[2], 'data': 30},
56+
{'_id': ids[3], 'data': 40}]
5557
as_dict = {
56-
'_id': [1, 2, 3, 4],
58+
'_id': [oid.binary for oid in ids] ,
5759
'data': [10, 20, 30, 40]}
5860

5961
self._run_test(docs, as_dict)
6062

6163
def test_with_nulls(self):
62-
docs = [{'_id': 1, 'data': 10},
63-
{'_id': 2, 'data': 20},
64-
{'_id': 3},
65-
{'_id': 4, 'data': 40},
64+
ids = [ObjectId() for i in range(4)]
65+
docs = [{'_id': ids[0], 'data': 10},
66+
{'_id': ids[1], 'data': 20},
67+
{'_id': ids[2]},
68+
{'_id': ids[3], 'data': 40},
6669
{'foo': 1},
6770
{}]
6871
as_dict = {
69-
'_id': [1, 2, 3, 4, None, None],
72+
'_id': [oid.binary for oid in ids] + [None, None],
7073
'data': [10, 20, None, 40, None, None]}
7174

7275
self._run_test(docs, as_dict)
@@ -79,12 +82,13 @@ def _generate_payload(doclist):
7982
doclist)[:-2]
8083

8184
def test_simple(self):
82-
docs = [{'_id': 1, 'data': 10},
83-
{'_id': 2, 'data': 20},
84-
{'_id': 3, 'data': 30},
85-
{'_id': 4, 'data': 40}]
85+
ids = [ObjectId() for i in range(4)]
86+
docs = [{'_id': ids[0], 'data': 10},
87+
{'_id': ids[1], 'data': 20},
88+
{'_id': ids[2], 'data': 30},
89+
{'_id': ids[3], 'data': 40}]
8690
as_dict = {
87-
'_id': [1, 2, 3, 4],
91+
'_id': [oid.binary for oid in ids],
8892
'data': [10, 20, 30, 40]}
8993

9094
with self.assertRaisesRegex(

0 commit comments

Comments
 (0)