ARROW-73 Add support and testing for Pandas (#69)

juliusgeo · web-flow · commit ddc172f496c6 · 2022-04-07T19:25:13.000-07:00
diff --git a/bindings/python/benchmark.py b/bindings/python/benchmark.py
@@ -31,7 +31,9 @@
 dtypes = {}
 schemas = {}
 raw_bsons = {}
+
 arrow_tables = {}
+pandas_tables = {}
 
 
 def _setup():
@@ -91,6 +93,8 @@ def _setup():
     raw_bsons[LARGE] = raw_bson_large
     arrow_tables[SMALL] = find_arrow_all(db[collection_names[SMALL]], {}, schema=schemas[SMALL])
     arrow_tables[LARGE] = find_arrow_all(db[collection_names[LARGE]], {}, schema=schemas[LARGE])
+    pandas_tables[SMALL] = find_pandas_all(db[collection_names[SMALL]], {}, schema=schemas[SMALL])
+    pandas_tables[LARGE] = find_pandas_all(db[collection_names[LARGE]], {}, schema=schemas[LARGE])
 
 
 def _teardown():
@@ -163,6 +167,11 @@ def insert_conventional(use_large):
     db[collection_names[use_large]].insert_many(tab)
 
 
+@bench("insert_pandas")
+def insert_pandas(use_large):  # benchmark write() using the DataFrame cached by _setup()
+    write(db[collection_names[use_large]], pandas_tables[use_large])
+
+
 parser = argparse.ArgumentParser(
     formatter_class=argparse.RawTextHelpFormatter,
     epilog="""
diff --git a/bindings/python/pymongoarrow/api.py b/bindings/python/pymongoarrow/api.py
@@ -15,6 +15,9 @@
 
 from bson import encode
 from bson.raw_bson import RawBSONDocument
+from pandas import DataFrame
+from pyarrow import Schema as ArrowSchema
+from pyarrow import Table
 from pymongo.bulk import BulkWriteError
 from pymongo.common import MAX_WRITE_BATCH_SIZE
 from pymongoarrow.context import PyMongoArrowContext
@@ -256,9 +259,13 @@ def _transform_bwe(bwe, offset):
 
 
 def _tabular_generator(tabular):
-    for i in tabular.to_batches():
-        for row in i.to_pylist():
-            yield row
+    if isinstance(tabular, Table):  # Arrow table: emit one dict per row, batch by batch
+        for batch in tabular.to_batches():
+            yield from batch.to_pylist()
+    elif isinstance(tabular, DataFrame):  # pandas frame: one dict per row
+        yield from tabular.to_dict("records")
+    else:  # fail loudly instead of silently producing no rows
+        raise ValueError(f"Unsupported tabular data type: {type(tabular)!r}")
 
 
 def write(collection, tabular):
@@ -273,7 +280,10 @@ def write(collection, tabular):
       An instance of :class:`result.ArrowWriteResult`.
     """
 
-    _validate_schema(tabular.schema)
+    if isinstance(tabular, Table):
+        _validate_schema(tabular.schema.types)
+    elif isinstance(tabular, DataFrame):
+        _validate_schema(ArrowSchema.from_pandas(tabular).types)
     cur_offset = 0
     results = {
         "insertedCount": 0,
diff --git a/bindings/python/pymongoarrow/types.py b/bindings/python/pymongoarrow/types.py
@@ -112,6 +112,6 @@ def _in_type_map(t):
 
 
 def _validate_schema(schema):
-    for i in schema.types:
+    for i in schema:  # ``schema`` is now an iterable of Arrow types (callers pass ``.types``)
         if not _in_type_map(i):
             raise ValueError(f'Unsupported data type "{i}" in schema')
diff --git a/bindings/python/test/test_pandas.py b/bindings/python/test/test_pandas.py
@@ -13,13 +13,17 @@
 # limitations under the License.
 # from datetime import datetime, timedelta
 import unittest
+import unittest.mock as mock
 from test import client_context
 from test.utils import AllowListEventListener
 
+import numpy as np
 import pandas as pd
-from pyarrow import int32, int64
+from pyarrow import bool_, decimal128, float64, int32, int64, string, timestamp
 from pymongo import DESCENDING, WriteConcern
-from pymongoarrow.api import Schema, aggregate_pandas_all, find_pandas_all
+from pymongo.collection import Collection
+from pymongoarrow.api import Schema, aggregate_pandas_all, find_pandas_all, write
+from pymongoarrow.errors import ArrowWriteError
 
 
 class TestExplicitPandasApi(unittest.TestCase):
@@ -76,3 +80,81 @@ def test_aggregate_simple(self):
         assert len(agg_cmd.command["pipeline"]) == 2
         self.assertEqual(agg_cmd.command["pipeline"][0]["$project"], projection)
         self.assertEqual(agg_cmd.command["pipeline"][1]["$project"], {"_id": True, "data": True})
+
+    def round_trip(self, data, schema, coll=None):
+        """Write ``data`` to ``coll`` (default ``self.coll``), read it back, and compare."""
+        coll = self.coll if coll is None else coll
+        coll.drop()  # start empty so insertedCount must equal len(data)
+        res = write(coll, data)  # same collection that is dropped above and read below
+        self.assertEqual(len(data), res.raw_result["insertedCount"])
+        pd.testing.assert_frame_equal(data, find_pandas_all(coll, {}, schema=schema))
+        return res
+
+    def test_write_error(self):
+        dtypes = {"_id": "int32", "data": "int64"}
+
+        # Each _id appears twice, so row 10001 is the first duplicate key.
+        ids = list(range(10001)) * 2
+        frame = pd.DataFrame(data={"_id": ids, "data": [2 * i for i in ids]}).astype(dtypes)
+        arrow_schema = Schema({"_id": int32(), "data": int64()})
+
+        with self.assertRaises(ArrowWriteError) as raised:
+            self.round_trip(frame, arrow_schema)
+
+        # Inspect the captured error; nInserted only serves as the failure message.
+        details = raised.exception.details
+        self.assertEqual(10001, details["writeErrors"][0]["index"], details["nInserted"])
+
+    def test_write_schema_validation(self):
+        supported_dtypes = {
+            "data": "int64",
+            "float": "float64",
+            "datetime": "datetime64[ms]",
+            "string": "object",
+            "bool": "bool",
+        }
+        supported_frame = pd.DataFrame(
+            data={
+                "data": list(range(2)),
+                "float": list(range(2)),
+                "datetime": list(range(2)),
+                "string": ["0", "1"],
+                "bool": [True, True],
+            }
+        ).astype(supported_dtypes)
+        supported_schema = Schema(
+            {
+                "data": int64(),
+                "float": float64(),
+                "datetime": timestamp("ms"),
+                "string": string(),
+                "bool": bool_(),
+            }
+        )
+        # A frame with these column types is expected to round-trip unchanged.
+        self.round_trip(supported_frame, supported_schema)
+
+        # A column cast with np.ubyte is expected to be rejected with ValueError.
+        unsupported_dtypes = {"_id": "int32", "data": np.ubyte()}
+        unsupported_frame = pd.DataFrame(
+            data={"_id": list(range(2)), "data": list(range(2))}
+        ).astype(unsupported_dtypes)
+        with self.assertRaises(ValueError):
+            self.round_trip(unsupported_frame, Schema({"_id": int32(), "data": decimal128(2)}))
+
+    @mock.patch.object(Collection, "insert_many", side_effect=Collection.insert_many, autospec=True)
+    def test_write_batching(self, insert_many_mock):
+        """Writing 100040 rows must reach ``Collection.insert_many`` exactly twice."""
+        # The parameter is not named ``mock`` so the ``unittest.mock`` alias stays usable.
+        # side_effect forwards to the real insert_many, so the rows are still stored.
+        schema = {"_id": "int64"}
+        data = pd.DataFrame(
+            data={"_id": list(range(100040))},
+        ).astype(schema)
+        self.round_trip(
+            data,
+            Schema({"_id": int64()}),
+            coll=self.coll,
+        )
+        # Presumably one full write batch plus the remainder - TODO confirm batch size in api.
+        self.assertEqual(insert_many_mock.call_count, 2)