Skip to content

Commit 2e65cb4

Browse files
authored
ARROW-15 Add support for BSON Decimal128 type (#72)
1 parent 3d39c2c commit 2e65cb4

File tree

12 files changed

+234
-23
lines changed

12 files changed

+234
-23
lines changed

bindings/python/docs/source/changelog.rst

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,19 @@
11
Changelog
22
=========
33

4+
Changes in Version 0.4.0
5+
------------------------
6+
- Support for :class:`~bson.decimal128.Decimal128` type.
7+
- Support for macOS arm64 architecture on Python 3.9+.
8+
49
Changes in Version 0.3.0
510
------------------------
611
- Support for `PyArrow` 7.0.
7-
- Support for `ObjectId` `bson` type.
12+
- Support for :class:`~bson.objectid.ObjectId` type.
813
- Improve error message when schema contains an unsupported type.
914
- Add support for BSON string type.
1015
- Add support for BSON boolean type.
11-
- Upgraded to bundle ``libbson`` 1.21.1. If installing from source, the minimum supported ``libbson`` version is now 1.21.0.
16+
- Upgraded to bundle `libbson <http://mongoc.org/libbson/current/index.html>`_ 1.21.1. If installing from source, the minimum supported ``libbson`` version is now 1.21.0.
1217
- Dropped Python 3.6 support (it was dropped in `PyArrow` 7.0).
1318

1419
Changes in Version 0.2.0
@@ -17,8 +22,8 @@ Changes in Version 0.2.0
1722
- Support for PyMongo 4.0.
1823
- Support for Python 3.10.
1924
- Support for Windows.
20-
- ``find_arrow_all`` now accepts a user-provided ``projection``.
21-
- ``find_raw_batches`` now accepts a ``session`` object.
25+
- :meth:`~pymongoarrow.api.find_arrow_all` now accepts a user-provided ``projection``.
26+
- :meth:`~pymongoarrow.api.find_arrow_all` now accepts a ``session`` object.
2227
- Note: The supported version of ``pyarrow`` is now ``>=6,<6.1``.
2328

2429
Changes in Version 0.1.1

bindings/python/docs/source/conf.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,4 +74,5 @@
7474
"pandas": ("https://pandas.pydata.org/docs/", None),
7575
"numpy": ("https://numpy.org/doc/1.20/", None),
7676
"pymongo": ("https://pymongo.readthedocs.io/en/stable/", None),
77+
"bson": ("https://pymongo.readthedocs.io/en/stable/", None),
7778
}

bindings/python/pymongoarrow/api.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
"find_pandas_all",
3737
"aggregate_numpy_all",
3838
"find_numpy_all",
39+
"write",
3940
"Schema",
4041
]
4142

bindings/python/pymongoarrow/context.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
_BsonArrowTypes.double: DoubleBuilder,
3131
_BsonArrowTypes.datetime: DatetimeBuilder,
3232
_BsonArrowTypes.objectid: ObjectIdBuilder,
33+
_BsonArrowTypes.decimal128_str: StringBuilder,
3334
_BsonArrowTypes.string: StringBuilder,
3435
_BsonArrowTypes.bool: BoolBuilder,
3536
}

bindings/python/pymongoarrow/lib.pyx

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ from cython.operator cimport dereference
3333
from libcpp cimport bool as cbool
3434
from libcpp.map cimport map
3535
from libcpp.string cimport string
36+
from libc.stdlib cimport malloc, free
3637
from pyarrow.lib cimport *
3738
from pyarrow.lib import tobytes
3839
from pymongoarrow.libarrow cimport *
@@ -59,11 +60,14 @@ def process_bson_stream(bson_stream, context):
5960
cdef bson_reader_t* stream_reader = bson_reader_new_from_data(docstream, length)
6061
cdef const bson_t * doc = NULL
6162
cdef bson_iter_t doc_iter
63+
cdef bson_decimal128_t dec128
6264
cdef const char* key
6365
cdef bson_type_t value_t
6466
cdef Py_ssize_t count = 0
6567
cdef const char * bson_str
6668
cdef uint32_t str_len
69+
cdef char *decimal128_str = <char *> malloc(
70+
BSON_DECIMAL128_STRING * sizeof(char))
6771

6872
# Localize types for better performance.
6973
t_int32 = _BsonArrowTypes.int32
@@ -115,6 +119,10 @@ def process_bson_stream(bson_stream, context):
115119
if value_t == BSON_TYPE_UTF8:
116120
bson_str = bson_iter_utf8(&doc_iter, &str_len)
117121
builder.append(<bytes>(bson_str)[:str_len])
122+
elif value_t == BSON_TYPE_DECIMAL128:
123+
bson_iter_decimal128(&doc_iter, &dec128)
124+
bson_decimal128_to_string(&dec128, decimal128_str)
125+
builder.append(<bytes>(decimal128_str))
118126
else:
119127
builder.append_null()
120128
elif ftype == t_double:
@@ -144,6 +152,7 @@ def process_bson_stream(bson_stream, context):
144152
builder.append_null()
145153
finally:
146154
bson_reader_destroy(stream_reader)
155+
free(decimal128_str)
147156

148157

149158
# Builders

bindings/python/pymongoarrow/libbson.pxd

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,7 @@ cdef extern from "<bson/bson.h>":
124124

125125
const char * bson_iter_utf8(const bson_iter_t *iter, uint32_t *length)
126126

127+
bint bson_iter_decimal128 (const bson_iter_t *iter, bson_decimal128_t *dec)
127128

128129
# bson_reader_t API
129130
cdef extern from "<bson/bson.h>":
@@ -133,6 +134,10 @@ cdef extern from "<bson/bson.h>":
133134

134135
void bson_reader_destroy(bson_reader_t *reader)
135136

137+
# bson_decimal128 API
138+
cdef extern from "<bson/bson.h>":
139+
cdef int32_t BSON_DECIMAL128_STRING
140+
void bson_decimal128_to_string (const bson_decimal128_t *dec, char *str)
136141

137142
# runtime version checking API
138143
cdef extern from "<bson/bson.h>":

bindings/python/pymongoarrow/types.py

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
import numpy as np
1818
import pyarrow as pa
1919
import pyarrow.types as _atypes
20-
from bson import Int64, ObjectId
20+
from bson import Decimal128, Int64, ObjectId
2121
from pyarrow import DataType as _ArrowDataType
2222
from pyarrow import PyExtensionType, binary, bool_, float64, int64, string, timestamp
2323

@@ -30,6 +30,7 @@ class _BsonArrowTypes(enum.Enum):
3030
objectid = 5
3131
string = 6
3232
bool = 7
33+
decimal128_str = 8
3334

3435

3536
# Custom Extension Types.
@@ -47,6 +48,16 @@ def __reduce__(self):
4748
return ObjectIdType, ()
4849

4950

51+
class Decimal128StringType(PyExtensionType):
52+
_type_marker = _BsonArrowTypes.decimal128_str
53+
54+
def __init__(self):
55+
super().__init__(string())
56+
57+
def __reduce__(self):
58+
return Decimal128StringType, ()
59+
60+
5061
# Internal Type Handling.
5162

5263

@@ -55,14 +66,20 @@ def _is_objectid(obj):
5566
return type_marker == ObjectIdType._type_marker
5667

5768

69+
def _is_decimal128_str(obj):
70+
type_marker = getattr(obj, "_type_marker", "")
71+
return type_marker == Decimal128StringType._type_marker
72+
73+
5874
_TYPE_NORMALIZER_FACTORY = {
5975
Int64: lambda _: int64(),
6076
float: lambda _: float64(),
6177
int: lambda _: int64(),
6278
datetime: lambda _: timestamp("ms"), # TODO: add tzinfo support
6379
ObjectId: lambda _: ObjectIdType(),
64-
str: lambda: string(),
65-
bool: lambda: bool_(),
80+
Decimal128: lambda _: Decimal128StringType(),
81+
str: lambda _: string(),
82+
bool: lambda _: bool_(),
6683
}
6784

6885

@@ -90,6 +107,7 @@ def get_numpy_type(type):
90107
_atypes.is_float64: _BsonArrowTypes.double,
91108
_atypes.is_timestamp: _BsonArrowTypes.datetime,
92109
_is_objectid: _BsonArrowTypes.objectid,
110+
_is_decimal128_str: _BsonArrowTypes.decimal128_str,
93111
_atypes.is_string: _BsonArrowTypes.string,
94112
_atypes.is_boolean: _BsonArrowTypes.bool,
95113
}

bindings/python/test/test_arrow.py

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@
1818
from test.utils import AllowListEventListener
1919

2020
import pymongo
21-
from pyarrow import Table, bool_, decimal256, float64, int32, int64
21+
from bson import Decimal128, ObjectId
22+
from pyarrow import Table, binary, bool_, decimal256, float64, int32, int64
2223
from pyarrow import schema as ArrowSchema
2324
from pyarrow import string, timestamp
2425
from pyarrow.parquet import read_table, write_table
@@ -27,6 +28,7 @@
2728
from pymongoarrow.api import Schema, aggregate_arrow_all, find_arrow_all, write
2829
from pymongoarrow.errors import ArrowWriteError
2930
from pymongoarrow.monkey import patch_all
31+
from pymongoarrow.types import Decimal128StringType, ObjectIdType
3032

3133

3234
class TestArrowApiMixin:
@@ -292,3 +294,42 @@ def run_find(self, *args, **kwargs):
292294

293295
def run_aggregate(self, *args, **kwargs):
294296
return self.coll.aggregate_arrow_all(*args, **kwargs)
297+
298+
299+
class TestBSONTypes(unittest.TestCase):
300+
@classmethod
301+
def setUpClass(cls):
302+
if not client_context.connected:
303+
raise unittest.SkipTest("cannot connect to MongoDB")
304+
cls.cmd_listener = AllowListEventListener("find", "aggregate")
305+
cls.getmore_listener = AllowListEventListener("getMore")
306+
cls.client = client_context.get_client(
307+
event_listeners=[cls.getmore_listener, cls.cmd_listener]
308+
)
309+
310+
def test_find_decimal128(self):
311+
oids = list(ObjectId() for i in range(4))
312+
decs = [Decimal128(i) for i in ["0.1", "1.0", "1e-5"]]
313+
schema = Schema({"_id": ObjectIdType(), "data": Decimal128StringType()})
314+
expected = Table.from_pydict(
315+
{
316+
"_id": [i.binary for i in oids],
317+
"data": [str(decs[0]), str(decs[1]), str(decs[2]), None],
318+
},
319+
ArrowSchema([("_id", binary(12)), ("data", string())]),
320+
)
321+
coll = self.client.pymongoarrow_test.get_collection(
322+
"test", write_concern=WriteConcern(w="majority")
323+
)
324+
325+
coll.drop()
326+
coll.insert_many(
327+
[
328+
{"_id": oids[0], "data": decs[0]},
329+
{"_id": oids[1], "data": decs[1]},
330+
{"_id": oids[2], "data": decs[2]},
331+
{"_id": oids[3]},
332+
]
333+
)
334+
table = find_arrow_all(coll, {}, schema=schema)
335+
self.assertEqual(table, expected)

bindings/python/test/test_bson.py

Lines changed: 47 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,11 @@
1414
from unittest import TestCase
1515

1616
import pyarrow as pa
17-
from bson import InvalidBSON, encode
17+
from bson import Decimal128, Int64, InvalidBSON, encode
1818
from pymongoarrow.context import PyMongoArrowContext
1919
from pymongoarrow.lib import process_bson_stream
2020
from pymongoarrow.schema import Schema
21-
from pymongoarrow.types import ObjectId, ObjectIdType, bool_, int64, string
21+
from pymongoarrow.types import ObjectId, ObjectIdType, int64, string
2222

2323

2424
class TestBsonToArrowConversionBase(TestCase):
@@ -147,9 +147,24 @@ def test_object_id_type(self):
147147
assert result.type._type_marker == ObjectIdType._type_marker
148148

149149

150+
class TestInt64Type(TestBsonToArrowConversionBase):
151+
def setUp(self):
152+
self.schema = Schema({"data": Int64})
153+
self.context = PyMongoArrowContext.from_schema(self.schema)
154+
155+
def test_simple(self):
156+
docs = [
157+
{"data": Int64(1e15)},
158+
{"data": Int64(-1e5)},
159+
{"data": Int64(10)},
160+
]
161+
as_dict = {"data": [1e15, -1e5, 10]}
162+
self._run_test(docs, as_dict)
163+
164+
150165
class TestBooleanType(TestBsonToArrowConversionBase):
151166
def setUp(self):
152-
self.schema = Schema({"data": bool_()})
167+
self.schema = Schema({"data": bool})
153168
self.context = PyMongoArrowContext.from_schema(self.schema)
154169

155170
def test_simple(self):
@@ -163,3 +178,32 @@ def test_simple(self):
163178
]
164179
as_dict = {"data": [True, False, None, None, False, True]}
165180
self._run_test(docs, as_dict)
181+
182+
183+
class TestStringType(TestBsonToArrowConversionBase):
184+
def setUp(self):
185+
self.schema = Schema({"data": str})
186+
self.context = PyMongoArrowContext.from_schema(self.schema)
187+
188+
def test_simple(self):
189+
docs = [
190+
{"data": "a"},
191+
{"data": "dätá"},
192+
]
193+
as_dict = {"data": ["a", "dätá"]}
194+
self._run_test(docs, as_dict)
195+
196+
197+
class TestDecimal128StringType(TestBsonToArrowConversionBase):
198+
def setUp(self):
199+
self.schema = Schema({"data": Decimal128})
200+
self.context = PyMongoArrowContext.from_schema(self.schema)
201+
202+
def test_simple(self):
203+
docs = [
204+
{"data": Decimal128("0.01")},
205+
{"data": Decimal128("1e-5")},
206+
{"data": Decimal128("1.0e+5")},
207+
]
208+
as_dict = {"data": ["0.01", "0.00001", "1.0E+5"]}
209+
self._run_test(docs, as_dict)

bindings/python/test/test_monkey.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818

1919
class TestMonkey(TestCase):
2020
def test_all_methods_listed_for_patching(self):
21-
unpatched_apis = {"Schema"}
21+
unpatched_apis = {"Schema", "write"}
2222
all_methods = set(api.__all__) - unpatched_apis
2323
methods_to_patch = set(api._PATCH_METHODS)
2424
self.assertEqual(all_methods, methods_to_patch)

0 commit comments

Comments
 (0)