Skip to content

Commit c63dd94

Browse files
authored
ARROW-167 int32 should coerce BSON types the same as int64 (#153)
1 parent 9b27065 commit c63dd94

File tree

4 files changed

+74
-6
lines changed

4 files changed

+74
-6
lines changed

bindings/python/pymongoarrow/lib.pyx

Lines changed: 20 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -172,6 +172,8 @@ cdef void process_raw_bson_stream(const uint8_t * docstream, size_t length, obje
172172
cdef map[cstring, void *] builder_map
173173
cdef map[cstring, void*].iterator it
174174
cdef bson_subtype_t subtype
175+
cdef int32_t val32
176+
cdef int64_t val64
175177

176178
cdef _ArrayBuilderBase builder = None
177179
cdef Int32Builder int32_builder
@@ -262,8 +264,24 @@ cdef void process_raw_bson_stream(const uint8_t * docstream, size_t length, obje
262264
value_t = bson_iter_type(&doc_iter)
263265
if ftype == BSON_TYPE_INT32:
264266
int32_builder = builder
265-
if value_t == BSON_TYPE_INT32:
266-
int32_builder.append_raw(bson_iter_int32(&doc_iter))
267+
if (value_t == BSON_TYPE_INT32 or value_t == BSON_TYPE_BOOL):
268+
int32_builder.append_raw(bson_iter_as_int64(&doc_iter))
269+
elif value_t == BSON_TYPE_INT64:
270+
val64 = bson_iter_as_int64(&doc_iter)
271+
val32 = <int32_t> val64
272+
if val64 == val32:
273+
int32_builder.append_raw(val32)
274+
else:
275+
# Use append (not append_raw) to surface overflow errors.
276+
int32_builder.append(val64)
277+
elif value_t == BSON_TYPE_DOUBLE:
278+
# Treat nan as null.
279+
val = bson_iter_as_double(&doc_iter)
280+
if isnan(val):
281+
int32_builder.append_null()
282+
else:
283+
# Use append (not append_raw) to surface overflow errors.
284+
int32_builder.append(bson_iter_as_int64(&doc_iter))
267285
else:
268286
int32_builder.append_null()
269287
elif ftype == BSON_TYPE_INT64:

bindings/python/test/test_arrow.py

Lines changed: 52 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -487,7 +487,7 @@ def test_auto_schema_heterogeneous(self):
487487
self.coll.insert_many(data)
488488
for func in [find_arrow_all, aggregate_arrow_all]:
489489
out = func(self.coll, {} if func == find_arrow_all else []).drop(["_id"])
490-
self.assertEqual(out["a"].to_pylist(), [1, None, None, 4])
490+
self.assertEqual(out["a"].to_pylist(), [1, None, 1, 4])
491491

492492
def test_auto_schema_tz(self):
493493
# Create table with random data of various types.
@@ -564,7 +564,7 @@ def test_malformed_embedded_documents(self):
564564
dict(data=dict(a=1, b=True)),
565565
dict(data=dict(a=1, b=True, c="bar")),
566566
dict(data=dict(a=1)),
567-
dict(data=dict(a=True, b=False)),
567+
dict(data=dict(a="str", b=False)),
568568
]
569569
self.coll.drop()
570570
self.coll.insert_many(data)
@@ -590,6 +590,56 @@ def test_mixed_subtype(self):
590590
res = find_arrow_all(coll, {}, schema=schema)
591591
self.assertEqual(res["data"].to_pylist(), [Binary(b"1", 10), None])
592592

593+
def _test_mixed_types_int(self, inttype):
594+
docs = [
595+
{"a": 1},
596+
{"a": 2.9}, # float should be truncated.
597+
{"a": True}, # True should be 1.
598+
{"a": False}, # False should be 0.
599+
{"a": float("nan")}, # Should be null.
600+
{"a": None}, # Should be null.
601+
{}, # Should be null.
602+
{"a": "string"}, # Should be null.
603+
]
604+
self.coll.delete_many({})
605+
self.coll.insert_many(docs)
606+
table = find_arrow_all(self.coll, {}, projection={"_id": 0}, schema=Schema({"a": inttype}))
607+
expected = Table.from_pylist(
608+
[
609+
{"a": 1},
610+
{"a": 2},
611+
{"a": 1},
612+
{"a": 0},
613+
{"a": None},
614+
{"a": None},
615+
{},
616+
{"a": None},
617+
],
618+
schema=ArrowSchema([field("a", inttype)]),
619+
)
620+
self.assertEqual(table, expected)
621+
622+
def test_mixed_types_int32(self):
623+
self._test_mixed_types_int(int32())
624+
# Value too large to fit in int32 should cause an overflow error.
625+
self.coll.delete_many({})
626+
self.coll.insert_one({"a": 2 << 34})
627+
with self.assertRaises(OverflowError):
628+
find_arrow_all(self.coll, {}, projection={"_id": 0}, schema=Schema({"a": int32()}))
629+
# Test double overflowing int32
630+
self.coll.delete_many({})
631+
self.coll.insert_one({"a": float(2 << 34)})
632+
with self.assertRaises(OverflowError):
633+
find_arrow_all(self.coll, {}, projection={"_id": 0}, schema=Schema({"a": int32()}))
634+
635+
def test_mixed_types_int64(self):
636+
self._test_mixed_types_int(int64())
637+
# Test double overflowing int64
638+
self.coll.delete_many({})
639+
self.coll.insert_one({"a": float(2 << 65)})
640+
with self.assertRaises(OverflowError):
641+
find_arrow_all(self.coll, {}, projection={"_id": 0}, schema=Schema({"a": int32()}))
642+
593643

594644
class TestArrowExplicitApi(ArrowApiTestMixin, unittest.TestCase):
595645
def run_find(self, *args, **kwargs):

bindings/python/test/test_numpy.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -235,7 +235,7 @@ def test_auto_schema_heterogeneous(self):
235235
with self.subTest(func.__name__):
236236
out = func(self.coll, {} if func == find_numpy_all else [])
237237
del out["_id"]
238-
np.equal(out, [1.0, np.nan, np.nan, 4.0])
238+
np.equal(out, [1.0, np.nan, 1.0, 4.0])
239239

240240
def test_auto_schema_tz(self):
241241
schema = {

bindings/python/test/test_pandas.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -269,7 +269,7 @@ def test_auto_schema_heterogeneous(self):
269269
self.coll.insert_many(data)
270270
for func in [find_pandas_all, aggregate_pandas_all]:
271271
out = func(self.coll, {} if func == find_pandas_all else []).drop(columns=["_id"])
272-
np.equal(out["a"], [1.0, np.nan, np.nan, 4.0])
272+
np.equal(out["a"], [1.0, np.nan, 1, 4.0])
273273

274274
def test_auto_schema_tz(self):
275275
schema = {

0 commit comments

Comments
 (0)