Skip to content

Commit b67be9c

Browse files
authored
ARROW-15 Add support for BSON Decimal128 type (#132)
* ARROW-15 Add support for BSON Decimal128 type * add builder test * ARROW-15 Add support for BSON Decimal128 type * Add PandasDecimal128 and update tests * fix typo * wip remove test workarounds * fix auto schema * remove unused import * clean up struct * address review * test on big endian linux * debug * remove other builds * run checkout first * fix build-libbson * install cmake * make apt non-interactive * fix syntax * use python3 * install pip * install pip * fix syntax * remove support for big-endian decimal128 * add note about big-endian systems * add comment
1 parent 5304537 commit b67be9c

File tree

16 files changed

+252
-83
lines changed

16 files changed

+252
-83
lines changed

bindings/python/README.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,9 @@ installed::
6161

6262
$ python -m pip install pandas
6363

64+
Note: ``pymongoarrow`` is not supported or tested on big-endian systems
65+
(e.g. Linux s390x).
66+
6467
Development Install
6568
===================
6669

bindings/python/docs/source/supported_types.rst

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,13 @@ Supported Types
66
PyMongoArrow currently supports a small subset of all BSON types.
77
Support for additional types will be added in subsequent releases.
88

9-
.. note:: PyMongoArrow does not currently fully support extension types with Pandas/NumPy or Arrow.
10-
However, they can be used in schemas.
11-
This means that ObjectId and Decimal128 are not fully supported in Pandas DataFrames or Arrow Tables.
12-
Instead, the schema type will be converted to a string or object representation of the type.
13-
For more information see :doc:`extension_types`.
149

1510
.. note:: For more information about BSON types, see the
1611
`BSON specification <http://bsonspec.org/spec.html>`_.
1712

13+
.. note:: ``Decimal128`` types are only supported on little-endian systems.
14+
On big-endian systems, ``Decimal128`` values are read as ``null``.
15+
1816
.. list-table::
1917
:widths: auto
2018
:header-rows: 1
@@ -28,7 +26,9 @@ Support for additional types will be added in subsequent releases.
2826
* - Embedded array
2927
- :class:`py.list`, an instance of :class:`pyarrow.list_`
3028
* - ObjectId
31-
- :class:`py.bytes`, :class:`bson.ObjectId`, an instance of :class:`pymongoarrow.types.ObjectIdType`, an instance of :class:`pyarrow.FixedSizeBinaryScalar`
29+
- :class:`py.bytes`, :class:`bson.ObjectId`, an instance of :class:`pymongoarrow.types.ObjectIdType`, an instance of :class:`pymongoarrow.pandas_types.PandasObjectId`
30+
* - Decimal128
31+
- :class:`bson.Decimal128`, an instance of :class:`pymongoarrow.types.Decimal128Type`, an instance of :class:`pymongoarrow.pandas_types.PandasDecimal128`
3232
* - Boolean
3333
- an instance of :class:`~pyarrow.bool_`, :class:`~py.bool`
3434
* - 64-bit binary floating point

bindings/python/pymongoarrow/api.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -338,7 +338,7 @@ def write(collection, tabular):
338338
tab_size = len(tabular)
339339
if isinstance(tabular, Table):
340340
_validate_schema(tabular.schema.types)
341-
elif DataFrame is not None and isinstance(tabular, DataFrame):
341+
elif isinstance(tabular, DataFrame):
342342
_validate_schema(ArrowSchema.from_pandas(tabular).types)
343343
elif (
344344
isinstance(tabular, dict)
@@ -359,9 +359,8 @@ def write(collection, tabular):
359359

360360
# Handle Pandas NA objects.
361361
codec_options = collection.codec_options
362-
if DataFrame is not None:
363-
type_registry = TypeRegistry([_PandasNACodec()])
364-
codec_options = codec_options.with_options(type_registry=type_registry)
362+
type_registry = TypeRegistry([_PandasNACodec()])
363+
codec_options = codec_options.with_options(type_registry=type_registry)
365364

366365
while cur_offset < tab_size:
367366
cur_size = 0

bindings/python/pymongoarrow/context.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
BinaryBuilder,
1818
BoolBuilder,
1919
DatetimeBuilder,
20+
Decimal128Builder,
2021
DocumentBuilder,
2122
DoubleBuilder,
2223
Int32Builder,
@@ -33,7 +34,7 @@
3334
_BsonArrowTypes.double: DoubleBuilder,
3435
_BsonArrowTypes.datetime: DatetimeBuilder,
3536
_BsonArrowTypes.objectid: ObjectIdBuilder,
36-
_BsonArrowTypes.decimal128_str: StringBuilder,
37+
_BsonArrowTypes.decimal128: Decimal128Builder,
3738
_BsonArrowTypes.string: StringBuilder,
3839
_BsonArrowTypes.bool: BoolBuilder,
3940
_BsonArrowTypes.document: DocumentBuilder,

bindings/python/pymongoarrow/lib.pyx

Lines changed: 47 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
import copy
2121
import datetime
2222
import enum
23+
import struct as pystruct
24+
import sys
2325

2426
# Python imports
2527
import bson
@@ -31,15 +33,14 @@ from pyarrow.lib import (
3133

3234
from pymongoarrow.errors import InvalidBSON, PyMongoArrowError
3335
from pymongoarrow.context import PyMongoArrowContext
34-
from pymongoarrow.types import _BsonArrowTypes, _atypes, ObjectIdType, Decimal128StringType, BinaryType
36+
from pymongoarrow.types import _BsonArrowTypes, _atypes, ObjectIdType, Decimal128Type as Decimal128Type_, BinaryType
3537

3638
# Cython imports
3739
from cpython cimport PyBytes_Size, object
3840
from cython.operator cimport dereference
3941
from libcpp cimport bool as cbool
4042
from libcpp.map cimport map
4143
from libcpp.vector cimport vector
42-
from libc.stdlib cimport malloc, free
4344
from pyarrow.lib cimport *
4445
from pymongoarrow.libarrow cimport *
4546
from pymongoarrow.libbson cimport *
@@ -66,7 +67,7 @@ _builder_type_map = {
6667
BSON_TYPE_UTF8: StringBuilder,
6768
BSON_TYPE_BOOL: BoolBuilder,
6869
BSON_TYPE_DOCUMENT: DocumentBuilder,
69-
BSON_TYPE_DECIMAL128: StringBuilder,
70+
BSON_TYPE_DECIMAL128: Decimal128Builder,
7071
BSON_TYPE_ARRAY: ListBuilder,
7172
BSON_TYPE_BINARY: BinaryBuilder
7273
}
@@ -78,7 +79,7 @@ _field_type_map = {
7879
BSON_TYPE_OID: ObjectIdType(),
7980
BSON_TYPE_UTF8: string(),
8081
BSON_TYPE_BOOL: bool_(),
81-
BSON_TYPE_DECIMAL128: Decimal128StringType(),
82+
BSON_TYPE_DECIMAL128: Decimal128Type_(),
8283
}
8384

8485
cdef extract_field_dtype(bson_iter_t * doc_iter, bson_iter_t * child_iter, bson_type_t value_t, context):
@@ -134,8 +135,6 @@ def process_bson_stream(bson_stream, context, arr_value_builder=None):
134135
cdef const uint8_t* docstream = <const uint8_t *>bson_stream
135136
cdef size_t length = <size_t>PyBytes_Size(bson_stream)
136137
cdef bson_reader_t* stream_reader = bson_reader_new_from_data(docstream, length)
137-
cdef char *decimal128_str = <char *> malloc(
138-
BSON_DECIMAL128_STRING * sizeof(char))
139138
cdef uint32_t str_len
140139
cdef const uint8_t *val_buf = NULL
141140
cdef uint32_t val_buf_len = 0
@@ -163,7 +162,7 @@ def process_bson_stream(bson_stream, context, arr_value_builder=None):
163162
t_document = _BsonArrowTypes.document
164163
t_array = _BsonArrowTypes.array
165164
t_binary = _BsonArrowTypes.binary
166-
165+
t_decimal128 = _BsonArrowTypes.decimal128
167166

168167
# initialize count to current length of builders
169168
for _, builder in builder_map.items():
@@ -243,10 +242,17 @@ def process_bson_stream(bson_stream, context, arr_value_builder=None):
243242
if value_t == BSON_TYPE_UTF8:
244243
bson_str = bson_iter_utf8(&doc_iter, &str_len)
245244
builder.append(<bytes>(bson_str)[:str_len])
246-
elif value_t == BSON_TYPE_DECIMAL128:
245+
else:
246+
builder.append_null()
247+
elif ftype == t_decimal128:
248+
if value_t == BSON_TYPE_DECIMAL128:
247249
bson_iter_decimal128(&doc_iter, &dec128)
248-
bson_decimal128_to_string(&dec128, decimal128_str)
249-
builder.append(<bytes>(decimal128_str))
250+
if sys.byteorder == 'little':
251+
val = pystruct.pack('<QQ', dec128.low, dec128.high)
252+
builder.append(val)
253+
else:
254+
# We do not support big-endian systems.
255+
builder.append_null()
250256
else:
251257
builder.append_null()
252258
elif ftype == t_double:
@@ -300,7 +306,6 @@ def process_bson_stream(bson_stream, context, arr_value_builder=None):
300306
builder.append_null()
301307
finally:
302308
bson_reader_destroy(stream_reader)
303-
free(decimal128_str)
304309

305310

306311
# Builders
@@ -523,6 +528,35 @@ cdef class BoolBuilder(_ArrayBuilderBase):
523528
return self.builder
524529

525530

531+
cdef class Decimal128Builder(_ArrayBuilderBase):
532+
type_marker = _BsonArrowTypes.decimal128
533+
534+
cdef:
535+
shared_ptr[CFixedSizeBinaryBuilder] builder
536+
537+
def __cinit__(self, MemoryPool memory_pool=None):
538+
cdef shared_ptr[CDataType] dtype = fixed_size_binary(16)
539+
cdef CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool)
540+
self.builder.reset(new CFixedSizeBinaryBuilder(dtype, pool))
541+
542+
cpdef append_null(self):
543+
self.builder.get().AppendNull()
544+
545+
def __len__(self):
546+
return self.builder.get().length()
547+
548+
cpdef append(self, value):
549+
self.builder.get().Append(value)
550+
551+
cpdef finish(self):
552+
cdef shared_ptr[CArray] out
553+
with nogil:
554+
self.builder.get().Finish(&out)
555+
return pyarrow_wrap_array(out).cast(Decimal128Type_())
556+
557+
cdef shared_ptr[CFixedSizeBinaryBuilder] unwrap(self):
558+
return self.builder
559+
526560

527561
cdef object get_field_builder(field, tzinfo):
528562
"""Find the appropriate field builder given a pyarrow field."""
@@ -552,8 +586,8 @@ cdef object get_field_builder(field, tzinfo):
552586
field_builder = ListBuilder(field_type, tzinfo)
553587
elif getattr(field_type, '_type_marker') == _BsonArrowTypes.objectid:
554588
field_builder = ObjectIdBuilder()
555-
elif getattr(field_type, '_type_marker') == _BsonArrowTypes.decimal128_str:
556-
field_builder = StringBuilder()
589+
elif getattr(field_type, '_type_marker') == _BsonArrowTypes.decimal128:
590+
field_builder = Decimal128Builder()
557591
elif getattr(field_type, '_type_marker') == _BsonArrowTypes.binary:
558592
field_builder = BinaryBuilder(field_type.subtype)
559593
else:

bindings/python/pymongoarrow/libbson.pxd

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,8 @@ cdef extern from "<bson/bson.h>":
3838
pass
3939

4040
ctypedef struct bson_decimal128_t:
41-
pass
41+
uint64_t high
42+
uint64_t low
4243

4344
ctypedef struct bson_error_t:
4445
uint32_t domain

bindings/python/pymongoarrow/pandas_types.py

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
import numpy as np
2222
import pandas as pd
2323
import pyarrow as pa
24-
from bson import Binary, ObjectId
24+
from bson import Binary, Decimal128, ObjectId
2525
from pandas.api.extensions import (
2626
ExtensionArray,
2727
ExtensionDtype,
@@ -56,7 +56,10 @@ def __from_arrow__(self, array: Union[pa.Array, pa.ChunkedArray]) -> ExtensionAr
5656
typ = self.type
5757
for val in np.array(arr):
5858
if not pd.isna(val) and not isinstance(val, typ):
59-
val = typ(val)
59+
if typ == Decimal128:
60+
val = Decimal128.from_bid(val)
61+
else:
62+
val = typ(val)
6063
vals.append(val)
6164
arr = np.array(vals, dtype=object)
6265
# using _from_sequence to ensure None is converted to NA
@@ -249,3 +252,27 @@ def __arrow_array__(self, type=None):
249252
from pymongoarrow.types import ObjectIdType
250253

251254
return pa.array(self.data, type=ObjectIdType())
255+
256+
257+
@register_extension_dtype
258+
class PandasDecimal128(PandasBSONDtype):
259+
"""A pandas extension type for BSON Decimal128 data type."""
260+
261+
type = Decimal128
262+
263+
@classmethod
264+
def construct_array_type(cls) -> Type["PandasDecimal128Array"]:
265+
return PandasDecimal128Array
266+
267+
268+
class PandasDecimal128Array(PandasBSONExtensionArray):
269+
"""A pandas extension type for BSON Decimal128 data arrays."""
270+
271+
@property
272+
def _default_dtype(self):
273+
return PandasDecimal128()
274+
275+
def __arrow_array__(self, type=None):
276+
from pymongoarrow.types import Decimal128Type
277+
278+
return pa.array(self.data, type=Decimal128Type())

bindings/python/pymongoarrow/types.py

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
struct,
3232
timestamp,
3333
)
34-
from pymongoarrow.pandas_types import PandasBinary, PandasObjectId
34+
from pymongoarrow.pandas_types import PandasBinary, PandasDecimal128, PandasObjectId
3535

3636

3737
class _BsonArrowTypes(enum.Enum):
@@ -42,7 +42,7 @@ class _BsonArrowTypes(enum.Enum):
4242
objectid = 5
4343
string = 6
4444
bool = 7
45-
decimal128_str = 8
45+
decimal128 = 8
4646
document = 9
4747
array = 10
4848
binary = 11
@@ -78,21 +78,26 @@ def to_pandas_dtype(self):
7878

7979
class Decimal128Scalar(ExtensionScalar):
8080
def as_py(self):
81-
return Decimal128(self.value.as_py())
81+
if self.value is None:
82+
return None
83+
return Decimal128.from_bid(self.value.as_py())
8284

8385

84-
class Decimal128StringType(PyExtensionType):
85-
_type_marker = _BsonArrowTypes.decimal128_str
86+
class Decimal128Type(PyExtensionType):
87+
_type_marker = _BsonArrowTypes.decimal128
8688

8789
def __init__(self):
88-
super().__init__(string())
90+
super().__init__(binary(16))
8991

9092
def __reduce__(self):
91-
return Decimal128StringType, ()
93+
return Decimal128Type, ()
9294

9395
def __arrow_ext_scalar_class__(self):
9496
return Decimal128Scalar
9597

98+
def to_pandas_dtype(self):
99+
return PandasDecimal128()
100+
96101

97102
class BinaryScalar(ExtensionScalar):
98103
def as_py(self):
@@ -131,9 +136,9 @@ def _is_objectid(obj):
131136
return type_marker == ObjectIdType._type_marker
132137

133138

134-
def _is_decimal128_str(obj):
139+
def _is_decimal128(obj):
135140
type_marker = getattr(obj, "_type_marker", "")
136-
return type_marker == Decimal128StringType._type_marker
141+
return type_marker == Decimal128Type._type_marker
137142

138143

139144
def _is_binary(obj):
@@ -150,7 +155,7 @@ def _is_binary(obj):
150155
# must be used directly.
151156
datetime: lambda _: timestamp("ms"),
152157
ObjectId: lambda _: ObjectIdType(),
153-
Decimal128: lambda _: Decimal128StringType(),
158+
Decimal128: lambda _: Decimal128Type(),
154159
str: lambda _: string(),
155160
bool: lambda _: bool_(),
156161
Binary: lambda subtype: BinaryType(subtype),
@@ -181,7 +186,7 @@ def get_numpy_type(type):
181186
_atypes.is_float64: _BsonArrowTypes.double,
182187
_atypes.is_timestamp: _BsonArrowTypes.datetime,
183188
_is_objectid: _BsonArrowTypes.objectid,
184-
_is_decimal128_str: _BsonArrowTypes.decimal128_str,
189+
_is_decimal128: _BsonArrowTypes.decimal128,
185190
_is_binary: _BsonArrowTypes.binary,
186191
_atypes.is_string: _BsonArrowTypes.string,
187192
_atypes.is_boolean: _BsonArrowTypes.bool,

0 commit comments

Comments
 (0)