
Commit 3d39c2c

ARROW-74 Add support and testing for NumPy (#71)
1 parent 5bf829a commit 3d39c2c

6 files changed: +168 -19 lines changed


bindings/python/benchmark.py

Lines changed: 8 additions & 0 deletions
@@ -34,6 +34,7 @@

 arrow_tables = {}
 pandas_tables = {}
+numpy_arrays = {}


 def _setup():
@@ -95,6 +96,8 @@ def _setup():
     arrow_tables[LARGE] = find_arrow_all(db[collection_names[LARGE]], {}, schema=schemas[LARGE])
     pandas_tables[SMALL] = find_pandas_all(db[collection_names[SMALL]], {}, schema=schemas[SMALL])
     pandas_tables[LARGE] = find_pandas_all(db[collection_names[LARGE]], {}, schema=schemas[LARGE])
+    numpy_arrays[SMALL] = find_numpy_all(db[collection_names[SMALL]], {}, schema=schemas[SMALL])
+    numpy_arrays[LARGE] = find_numpy_all(db[collection_names[LARGE]], {}, schema=schemas[LARGE])


 def _teardown():
@@ -172,6 +175,11 @@ def insert_pandas(use_large):
     write(db[collection_names[use_large]], pandas_tables[use_large])


+@bench("insert_numpy")
+def insert_numpy(use_large):
+    write(db[collection_names[use_large]], numpy_arrays[use_large])
+
+
 parser = argparse.ArgumentParser(
     formatter_class=argparse.RawTextHelpFormatter,
     epilog="""

bindings/python/pymongoarrow/api.py

Lines changed: 36 additions & 10 deletions
@@ -13,8 +13,10 @@
 # limitations under the License.
 import warnings

+import numpy as np
 from bson import encode
 from bson.raw_bson import RawBSONDocument
+from numpy import ndarray
 from pandas import DataFrame
 from pyarrow import Schema as ArrowSchema
 from pyarrow import Table
@@ -25,7 +27,7 @@
 from pymongoarrow.lib import process_bson_stream
 from pymongoarrow.result import ArrowWriteResult
 from pymongoarrow.schema import Schema
-from pymongoarrow.types import _validate_schema
+from pymongoarrow.types import _validate_schema, get_numpy_type

 __all__ = [
     "aggregate_arrow_all",
@@ -187,7 +189,11 @@ def _arrow_to_numpy(arrow_table, schema):
     """
     container = {}
     for fname in schema:
-        container[fname] = arrow_table[fname].to_numpy()
+        dtype = get_numpy_type(schema.typemap[fname])
+        if dtype == np.str_:
+            container[fname] = arrow_table[fname].to_pandas().to_numpy(dtype=dtype)
+        else:
+            container[fname] = arrow_table[fname].to_numpy()
     return container


@@ -264,8 +270,15 @@ def _tabular_generator(tabular):
             for row in i.to_pylist():
                 yield row
     elif isinstance(tabular, DataFrame):
-        for i in tabular.to_dict("records"):
-            yield i
+        for row in tabular.to_dict("records"):
+            yield row
+    elif isinstance(tabular, dict):
+        iter_dict = {k: np.nditer(v) for k, v in tabular.items()}
+        try:
+            while True:
+                yield {k: next(i).item() for k, i in iter_dict.items()}
+        except StopIteration:
+            return


 def write(collection, tabular):
@@ -279,17 +292,30 @@ def write(collection, tabular):
     :Returns:
       An instance of :class:`result.ArrowWriteResult`.
     """
-
-    if isinstance(tabular, Table):
-        _validate_schema(tabular.schema.types)
-    elif isinstance(tabular, DataFrame):
-        _validate_schema(ArrowSchema.from_pandas(tabular).types)
     cur_offset = 0
     results = {
         "insertedCount": 0,
     }
-    tabular_gen = _tabular_generator(tabular)
     tab_size = len(tabular)
+    if isinstance(tabular, Table):
+        _validate_schema(tabular.schema.types)
+    elif isinstance(tabular, DataFrame):
+        _validate_schema(ArrowSchema.from_pandas(tabular).types)
+    elif (
+        isinstance(tabular, dict)
+        and len(tabular.values()) >= 1
+        and all([isinstance(i, ndarray) for i in tabular.values()])
+    ):
+        _validate_schema([i.dtype for i in tabular.values()])
+        tab_size = len(next(iter(tabular.values())))
+    else:
+        raise ValueError(
+            f"Invalid tabular data object of type {type(tabular)} \n"
+            "Please ensure that it is one of the supported types: "
+            "DataFrame, Table, or a dictionary containing NumPy arrays."
+        )
+
+    tabular_gen = _tabular_generator(tabular)
     while cur_offset < tab_size:
         cur_size = 0
         cur_batch = []
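
Taken together, the `write()` changes above mean a plain `dict` of NumPy `ndarray`s is now accepted alongside Arrow `Table`s and pandas `DataFrame`s. Below is a minimal sketch of how that new path might be exercised; it is not part of the commit, the database and collection names are placeholders, and a running local `mongod` is assumed.

```python
# Sketch only: exercises the new dict-of-ndarrays branch of write().
# "test_db" / "numpy_demo" and the default MongoClient() URI are placeholders.
import numpy as np
from pymongo import MongoClient
from pyarrow import float64, int64
from pymongoarrow.api import Schema, find_numpy_all, write

coll = MongoClient()["test_db"]["numpy_demo"]
coll.drop()

# Every value must be an ndarray; keys become field names, and the arrays
# are iterated row by row via np.nditer in _tabular_generator().
arrays = {
    "_id": np.arange(4, dtype=np.int64),
    "data": np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float64),
}
result = write(coll, arrays)
print(result.raw_result["insertedCount"])  # 4, if all inserts succeed

# find_numpy_all() returns a dict of field name -> ndarray (see _arrow_to_numpy).
out = find_numpy_all(coll, {}, schema=Schema({"_id": int64(), "data": float64()}))
print(out["data"].dtype)  # float64
```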

bindings/python/pymongoarrow/types.py

Lines changed: 25 additions & 0 deletions
@@ -14,6 +14,8 @@
 import enum
 from datetime import datetime

+import numpy as np
+import pyarrow as pa
 import pyarrow.types as _atypes
 from bson import Int64, ObjectId
 from pyarrow import DataType as _ArrowDataType
@@ -64,6 +66,24 @@ def _is_objectid(obj):
 }


+_TYPE_CHECKER_TO_NUMPY = {
+    _atypes.is_int32: np.int32,
+    _atypes.is_int64: np.int64,
+    _atypes.is_float64: np.float64,
+    _atypes.is_timestamp: "datetime64[ms]",
+    _is_objectid: np.object,
+    _atypes.is_string: np.str_,
+    _atypes.is_boolean: np.bool_,
+}
+
+
+def get_numpy_type(type):
+    for checker, comp_type in _TYPE_CHECKER_TO_NUMPY.items():
+        if checker(type):
+            return comp_type
+    return None
+
+
 _TYPE_CHECKER_TO_INTERNAL_TYPE = {
     _atypes.is_int32: _BsonArrowTypes.int32,
     _atypes.is_int64: _BsonArrowTypes.int64,
@@ -105,6 +125,11 @@ def _get_internal_typemap(typemap):


 def _in_type_map(t):
+    if isinstance(t, np.dtype):
+        try:
+            t = pa.from_numpy_dtype(t)
+        except pa.lib.ArrowNotImplementedError:
+            return False
 for checker in _TYPE_CHECKER_TO_INTERNAL_TYPE.keys():
     if checker(t):
         return True
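
For reference, a small sketch of how the new `get_numpy_type()` helper maps pyarrow types onto NumPy dtypes, assuming the `_TYPE_CHECKER_TO_NUMPY` table above; this example is illustrative only and not part of the commit.

```python
# Sketch: get_numpy_type() walks _TYPE_CHECKER_TO_NUMPY and returns the first
# NumPy dtype whose pyarrow type checker matches, or None if nothing matches.
import numpy as np
import pyarrow as pa
from pymongoarrow.types import get_numpy_type

assert get_numpy_type(pa.int64()) == np.int64
assert get_numpy_type(pa.string()) == np.str_
assert get_numpy_type(pa.timestamp("ms")) == "datetime64[ms]"
# Types with no registered mapping come back as None.
```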

bindings/python/test/test_arrow.py

Lines changed: 2 additions & 2 deletions
@@ -18,7 +18,7 @@
 from test.utils import AllowListEventListener

 import pymongo
-from pyarrow import Table, bool_, decimal128, float64, int32, int64
+from pyarrow import Table, bool_, decimal256, float64, int32, int64
 from pyarrow import schema as ArrowSchema
 from pyarrow import string, timestamp
 from pyarrow.parquet import read_table, write_table
@@ -229,7 +229,7 @@ def test_write_schema_validation(self):
         )
         self.round_trip(data, Schema(schema))

-        schema = {"_id": int32(), "data": decimal128(2)}
+        schema = {"_id": int32(), "data": decimal256(2)}
         data = Table.from_pydict(
             {"_id": [i for i in range(2)], "data": [i for i in range(2)]},
             ArrowSchema(schema),

bindings/python/test/test_numpy.py

Lines changed: 95 additions & 5 deletions
@@ -15,11 +15,14 @@
 import unittest
 from test import client_context
 from test.utils import AllowListEventListener
+from unittest import mock

 import numpy as np
-from pyarrow import int32, int64
+from pyarrow import bool_, float64, int32, int64, string, timestamp
 from pymongo import DESCENDING, WriteConcern
-from pymongoarrow.api import Schema, aggregate_numpy_all, find_numpy_all
+from pymongo.collection import Collection
+from pymongoarrow.api import Schema, aggregate_numpy_all, find_numpy_all, write
+from pymongoarrow.errors import ArrowWriteError


 class TestExplicitNumPyApi(unittest.TestCase):
@@ -47,11 +50,11 @@ def setUp(self):

     def assert_numpy_equal(self, actual, expected):
         self.assertIsInstance(actual, dict)
-        for field in self.schema:
+        for field in expected:
             # workaround np.nan == np.nan evaluating to False
             a = np.nan_to_num(actual[field])
             e = np.nan_to_num(expected[field])
-            self.assertTrue(np.all(a == e))
+            np.testing.assert_array_equal(a, e)
             self.assertEqual(actual[field].dtype, expected[field].dtype)

     def test_find_simple(self):
@@ -80,7 +83,7 @@ def test_find_simple(self):
     def test_aggregate_simple(self):
         expected = {
             "_id": np.array([1, 2, 3, 4], dtype=np.int32),
-            "data": np.array([20, 40, 60, np.nan], dtype=np.float64),
+            "data": np.array([20, 40, 60, None], dtype=np.float64),
         }
         projection = {"_id": True, "data": {"$multiply": [2, "$data"]}}
         actual = aggregate_numpy_all(self.coll, [{"$project": projection}], schema=self.schema)
@@ -91,3 +94,90 @@
         assert len(agg_cmd.command["pipeline"]) == 2
         self.assertEqual(agg_cmd.command["pipeline"][0]["$project"], projection)
         self.assertEqual(agg_cmd.command["pipeline"][1]["$project"], {"_id": True, "data": True})
+
+    def round_trip(self, data, schema, coll=None):
+        if coll is None:
+            coll = self.coll
+        coll.drop()
+        res = write(self.coll, data)
+        self.assertEqual(len(list(data.values())[0]), res.raw_result["insertedCount"])
+        self.assert_numpy_equal(find_numpy_all(coll, {}, schema=schema), data)
+        return res
+
+    def schemafied_ndarray_dict(self, dict, schema):
+        ret = {}
+        for k, v in dict.items():
+            ret[k] = np.array(v, dtype=schema[k])
+        return ret
+
+    def test_write_error(self):
+        schema = {"_id": "int32", "data": "int64"}
+        length = 10001
+        data = {"_id": [i for i in range(length)] * 2, "data": [i * 2 for i in range(length)] * 2}
+        data = self.schemafied_ndarray_dict(data, schema)
+        with self.assertRaises(ArrowWriteError):
+            try:
+                self.round_trip(data, Schema({"_id": int32(), "data": int64()}))
+            except ArrowWriteError as awe:
+                self.assertEqual(
+                    10001, awe.details["writeErrors"][0]["index"], awe.details["nInserted"]
+                )
+                raise awe
+
+    def test_write_schema_validation(self):
+        schema = {
+            "data": "int64",
+            "float": "float64",
+            "datetime": "datetime64[ms]",
+            "string": "str",
+            "bool": "bool",
+        }
+        data = {
+            "data": [i for i in range(2)],
+            "float": [i for i in range(2)],
+            "datetime": [i for i in range(2)],
+            "string": [str(i) for i in range(2)],
+            "bool": [True for _ in range(2)],
+        }
+        data = self.schemafied_ndarray_dict(data, schema)
+        self.round_trip(
+            data,
+            Schema(
+                {
+                    "data": int64(),
+                    "float": float64(),
+                    "datetime": timestamp("ms"),
+                    "string": string(),
+                    "bool": bool_(),
+                }
+            ),
+        )
+
+        schema = {"_id": "int32", "data": np.ubyte()}
+        data = {"_id": [i for i in range(2)], "data": [i for i in range(2)]}
+        data = self.schemafied_ndarray_dict(data, schema)
+        with self.assertRaises(ValueError):
+            self.round_trip(data, Schema({"_id": int32(), "data": np.ubyte()}))
+
+    @mock.patch.object(Collection, "insert_many", side_effect=Collection.insert_many, autospec=True)
+    def test_write_batching(self, mock):
+        schema = {"_id": "int64"}
+        data = {"_id": [i for i in range(100040)]}
+        data = self.schemafied_ndarray_dict(data, schema)
+
+        self.round_trip(
+            data,
+            Schema(
+                {
+                    "_id": int64(),
+                }
+            ),
+            coll=self.coll,
+        )
+        self.assertEqual(mock.call_count, 2)
+
+    def test_write_dictionaries(self):
+        with self.assertRaisesRegex(
+            ValueError, "Invalid tabular data object of type <class 'dict'>"
+        ):
+            write(self.coll, {"foo": 1})

bindings/python/test/test_pandas.py

Lines changed: 2 additions & 2 deletions
@@ -19,7 +19,7 @@

 import numpy as np
 import pandas as pd
-from pyarrow import bool_, decimal128, float64, int32, int64, string, timestamp
+from pyarrow import bool_, decimal256, float64, int32, int64, string, timestamp
 from pymongo import DESCENDING, WriteConcern
 from pymongo.collection import Collection
 from pymongoarrow.api import Schema, aggregate_pandas_all, find_pandas_all, write
@@ -140,7 +140,7 @@ def test_write_schema_validation(self):
             data={"_id": [i for i in range(2)], "data": [i for i in range(2)]}
         ).astype(schema)
         with self.assertRaises(ValueError):
-            self.round_trip(data, Schema({"_id": int32(), "data": decimal128(2)}))
+            self.round_trip(data, Schema({"_id": int32(), "data": decimal256(2)}))

     @mock.patch.object(Collection, "insert_many", side_effect=Collection.insert_many, autospec=True)
     def test_write_batching(self, mock):
