ARROW-117 "process_bson_stream" Does Not Construct Projection With Auto-Discovered Schema (#92)

juliusgeo · web-flow · commit 9ec5ca8c7995 · 2022-08-29T11:26:56.000-07:00
diff --git a/bindings/python/pymongoarrow/api.py b/bindings/python/pymongoarrow/api.py
@@ -129,8 +129,9 @@ def aggregate_arrow_all(collection, pipeline, *, schema=None, **kwargs):
                 UserWarning,
                 stacklevel=2,
             )
+    if schema:
+        pipeline.append({"$project": schema._get_projection()})
 
-    pipeline.append({"$project": schema._get_projection()})
     raw_batch_cursor = collection.aggregate_raw_batches(pipeline, **kwargs)
     for batch in raw_batch_cursor:
         process_bson_stream(batch, context)
@@ -201,10 +202,12 @@ def _arrow_to_numpy(arrow_table, schema=None):
     """
     container = {}
     if not schema:
-        schema = arrow_table.schema
+        schema = {i.name: i.type for i in arrow_table.schema}
+    else:
+        schema = schema.typemap
 
     for fname in schema:
-        dtype = get_numpy_type(schema.typemap[fname])
+        dtype = get_numpy_type(schema[fname])
         if dtype == np.str_:
             container[fname] = arrow_table[fname].to_pandas().to_numpy(dtype=dtype)
         else:
diff --git a/bindings/python/test/test_arrow.py b/bindings/python/test/test_arrow.py
@@ -348,17 +348,19 @@ def test_auto_schema(self):
         self.coll.drop()
         res = write(self.coll, data)
         self.assertEqual(len(data), res.raw_result["insertedCount"])
-        out = find_arrow_all(self.coll, {}).drop(["_id"])
-        self.assertEqual(data, out)
+        for func in [find_arrow_all, aggregate_arrow_all]:
+            out = func(self.coll, {} if func == find_arrow_all else []).drop(["_id"])
+            self.assertEqual(data, out)
 
     def test_auto_schema_heterogeneous(self):
         vals = [1, "2", True, 4]
         data = [{"a": v} for v in vals]
 
         self.coll.drop()
         self.coll.insert_many(data)
-        out = find_arrow_all(self.coll, {}).drop(["_id"])
-        self.assertEqual(out["a"].to_pylist(), [1, None, None, 4])
+        for func in [find_arrow_all, aggregate_arrow_all]:
+            out = func(self.coll, {} if func == find_arrow_all else []).drop(["_id"])
+            self.assertEqual(out["a"].to_pylist(), [1, None, None, 4])
 
     def test_auto_schema_tz(self):
         # Create table with random data of various types.
@@ -381,8 +383,12 @@ def test_auto_schema_tz(self):
         codec_options = CodecOptions(tzinfo=timezone("US/Eastern"), tz_aware=True)
         res = write(self.coll.with_options(codec_options=codec_options), data)
         self.assertEqual(len(data), res.raw_result["insertedCount"])
-        out = find_arrow_all(self.coll.with_options(codec_options=codec_options), {}).drop(["_id"])
-        self.assertEqual(data, out)
+        for func in [find_arrow_all, aggregate_arrow_all]:
+            out = func(
+                self.coll.with_options(codec_options=codec_options),
+                {} if func == find_arrow_all else [],
+            ).drop(["_id"])
+            self.assertEqual(data, out)
 
 
 class TestArrowExplicitApi(TestArrowApiMixin, unittest.TestCase):
diff --git a/bindings/python/test/test_numpy.py b/bindings/python/test/test_numpy.py
@@ -19,7 +19,7 @@
 from unittest import mock
 
 import numpy as np
-from bson import Decimal128, ObjectId
+from bson import CodecOptions, Decimal128, ObjectId
 from pyarrow import int32, int64
 from pymongo import DESCENDING, WriteConcern
 from pymongo.collection import Collection
@@ -30,6 +30,7 @@
     Decimal128StringType,
     ObjectIdType,
 )
+from pytz import timezone
 
 
 class NumpyTestBase(unittest.TestCase):
@@ -210,6 +211,67 @@ def test_string_bool(self):
             ),
         )
 
+    def test_auto_schema(self):
+        schema = {
+            "bool": "bool",
+            "dt": "datetime64[ms]",
+            "string": "str",
+        }
+        data = {
+            "string": [None] + [str(i) for i in range(2)],
+            "bool": [True for _ in range(3)],
+            "dt": [datetime.datetime(1970 + i, 1, 1) for i in range(3)],
+        }
+        data = self.schemafied_ndarray_dict(data, schema)
+
+        self.coll.drop()
+        res = write(self.coll, data)
+        self.assertEqual(len(data), res.raw_result["insertedCount"])
+        for func in [find_numpy_all, aggregate_numpy_all]:
+            with self.subTest(func.__name__):
+                out = func(self.coll, {} if func == find_numpy_all else [])
+                del out["_id"]
+                self.assert_numpy_equal(data, out)
+
+    def test_auto_schema_heterogeneous(self):
+        """Check the arrays both readers return for a mixed-type field read without a schema."""
+        data = [{"a": v} for v in [1, "2", True, 4]]
+
+        self.coll.drop()
+        self.coll.insert_many(data)
+        for func in [find_numpy_all, aggregate_numpy_all]:
+            with self.subTest(func.__name__):
+                out = func(self.coll, {} if func == find_numpy_all else [])
+                del out["_id"]
+                np.testing.assert_array_equal(out["a"], [1.0, np.nan, np.nan, 4.0])
+
+    def test_auto_schema_tz(self):
+        schema = {
+            "bool": "bool",
+            "dt": "datetime64[ms]",
+            "string": "str",
+        }
+        data = {
+            "string": [str(i) for i in range(3)],
+            "bool": [True for _ in range(3)],
+            "dt": [
+                datetime.datetime(1970 + i, 1, 1, tzinfo=timezone("US/Eastern")) for i in range(3)
+            ],
+        }
+        data = self.schemafied_ndarray_dict(data, schema)
+        self.coll.drop()
+        codec_options = CodecOptions(tzinfo=timezone("US/Eastern"), tz_aware=True)
+        res = write(self.coll.with_options(codec_options=codec_options), data)
+        self.assertEqual(len(data), res.raw_result["insertedCount"])
+        for func in [find_numpy_all, aggregate_numpy_all]:
+            with self.subTest(func.__name__):
+                out = func(
+                    self.coll.with_options(codec_options=codec_options),
+                    {} if func == find_numpy_all else [],
+                )
+                del out["_id"]
+                self.assert_numpy_equal(data, out)
+
 
 class TestBSONTypes(NumpyTestBase):
     @classmethod
diff --git a/bindings/python/test/test_pandas.py b/bindings/python/test/test_pandas.py
@@ -22,7 +22,7 @@
 import pandas as pd
 import pandas.testing
 import pyarrow
-from bson import Decimal128, ObjectId
+from bson import CodecOptions, Decimal128, ObjectId
 from pyarrow import decimal256, int32, int64
 from pymongo import DESCENDING, WriteConcern
 from pymongo.collection import Collection
@@ -33,6 +33,7 @@
     Decimal128StringType,
     ObjectIdType,
 )
+from pytz import timezone
 
 
 class PandasTestBase(unittest.TestCase):
@@ -188,6 +189,64 @@ def test_string_bool(self):
             ),
         )
 
+    def test_auto_schema(self):
+        schema = {
+            "bool": "bool",
+            "dt": "datetime64[ns]",
+            "string": "str",
+        }
+        data = pd.DataFrame(
+            data={
+                "string": [None] + [str(i) for i in range(2)],
+                "bool": [True for _ in range(3)],
+                "dt": [datetime.datetime(1970 + i, 1, 1) for i in range(3)],
+            },
+        ).astype(schema)
+
+        self.coll.drop()
+        res = write(self.coll, data)
+        self.assertEqual(len(data), res.raw_result["insertedCount"])
+        for func in [find_pandas_all, aggregate_pandas_all]:
+            out = func(self.coll, {} if func == find_pandas_all else []).drop(columns=["_id"])
+            pd.testing.assert_frame_equal(data, out)
+
+    def test_auto_schema_heterogeneous(self):
+        """Check the column both readers return for a mixed-type field read without a schema."""
+        data = [{"a": v} for v in [1, "2", True, 4]]
+
+        self.coll.drop()
+        self.coll.insert_many(data)
+        for func in [find_pandas_all, aggregate_pandas_all]:
+            out = func(self.coll, {} if func == find_pandas_all else []).drop(columns=["_id"])
+            np.testing.assert_array_equal(out["a"], [1.0, np.nan, np.nan, 4.0])
+
+    def test_auto_schema_tz(self):
+        schema = {
+            "bool": "bool",
+            "dt": "datetime64[ns]",
+            "string": "str",
+        }
+        data = pd.DataFrame(
+            data={
+                "string": [None] + [str(i) for i in range(2)],
+                "bool": [True for _ in range(3)],
+                "dt": [
+                    datetime.datetime(1970 + i, 1, 1, tzinfo=timezone("US/Eastern"))
+                    for i in range(3)
+                ],
+            },
+        ).astype(schema)
+        self.coll.drop()
+        codec_options = CodecOptions(tzinfo=timezone("US/Eastern"), tz_aware=True)
+        res = write(self.coll.with_options(codec_options=codec_options), data)
+        self.assertEqual(len(data), res.raw_result["insertedCount"])
+        for func in [find_pandas_all, aggregate_pandas_all]:
+            out = func(
+                self.coll.with_options(codec_options=codec_options),
+                {} if func == find_pandas_all else [],
+            ).drop(columns=["_id"])
+            pd.testing.assert_frame_equal(data, out)
+
 
 class TestBSONTypes(PandasTestBase):
     @classmethod