Commit e243c67

ARROW-71 Add basic support for writing to MongoDB from PyArrow (#64)
1 parent ca97bb9 commit e243c67

File tree

5 files changed: +210 −2 lines

bindings/python/pymongoarrow/api.py

Lines changed: 70 additions & 0 deletions
@@ -13,9 +13,16 @@
 # limitations under the License.
 import warnings
 
+from bson import encode
+from bson.raw_bson import RawBSONDocument
+from pymongo.bulk import BulkWriteError
+from pymongo.common import MAX_WRITE_BATCH_SIZE
 from pymongoarrow.context import PyMongoArrowContext
+from pymongoarrow.errors import ArrowWriteError
 from pymongoarrow.lib import process_bson_stream
+from pymongoarrow.result import ArrowWriteResult
 from pymongoarrow.schema import Schema
+from pymongoarrow.types import _validate_schema
 
 __all__ = [
     "aggregate_arrow_all",
@@ -37,6 +44,12 @@
     "find_numpy_all",
 ]
 
+# MongoDB 3.6's maxMessageSizeBytes minus some overhead to account
+# for the command plus OP_MSG.
+_MAX_MESSAGE_SIZE = 48000000 - 16 * 1024
+# The maximum number of bulk write operations in one batch.
+_MAX_WRITE_BATCH_SIZE = max(100000, MAX_WRITE_BATCH_SIZE)
+
 
 def find_arrow_all(collection, query, *, schema, **kwargs):
     """Method that returns the results of a find query as a
@@ -233,3 +246,60 @@ def aggregate_numpy_all(collection, pipeline, *, schema, **kwargs):
     return _arrow_to_numpy(
         aggregate_arrow_all(collection, pipeline, schema=schema, **kwargs), schema
     )
+
+
+def _transform_bwe(bwe, offset):
+    bwe["nInserted"] += offset
+    for i in bwe["writeErrors"]:
+        i["index"] += offset
+    return bwe
+
+
+def _tabular_generator(tabular):
+    for i in tabular.to_batches():
+        for row in i.to_pylist():
+            yield row
+
+
+def write(collection, tabular):
+    """Write data from `tabular` into the given MongoDB `collection`.
+
+    :Parameters:
+      - `collection`: Instance of :class:`~pymongo.collection.Collection`.
+        against which to run the operation.
+      - `tabular`: A tabular data store to use for the write operation.
+
+    :Returns:
+      An instance of :class:`result.ArrowWriteResult`.
+    """
+
+    _validate_schema(tabular.schema)
+    cur_offset = 0
+    results = {
+        "insertedCount": 0,
+    }
+    tabular_gen = _tabular_generator(tabular)
+    while cur_offset < len(tabular):
+        cur_size = 0
+        cur_batch = []
+        i = 0
+        while (
+            cur_size <= _MAX_MESSAGE_SIZE
+            and len(cur_batch) <= _MAX_WRITE_BATCH_SIZE
+            and cur_offset + i < len(tabular)
+        ):
+            enc_tab = RawBSONDocument(
+                encode(next(tabular_gen), codec_options=collection.codec_options)
+            )
+            cur_batch.append(enc_tab)
+            cur_size += len(enc_tab)
+            i += 1
+        try:
+            collection.insert_many(cur_batch)
+        except BulkWriteError as bwe:
+            raise ArrowWriteError(_transform_bwe(dict(bwe.details), cur_offset)) from bwe
+
+        results["insertedCount"] += i
+        cur_offset += i
+
+    return ArrowWriteResult(results)
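
A quick usage sketch of the new API (not part of the commit): the connection string, database, collection, and sample values below are illustrative. write() validates the table's schema, encodes each row to raw BSON, and inserts the rows in batches capped at roughly the 48 MB message limit or _MAX_WRITE_BATCH_SIZE operations, whichever comes first.

# Hypothetical example of the write() helper added in this commit.
from pyarrow import Table, int64
from pyarrow import schema as ArrowSchema
from pymongo import MongoClient
from pymongoarrow.api import Schema, find_arrow_all, write

client = MongoClient("mongodb://localhost:27017")  # assumed local server
coll = client.test_db.test_coll                    # assumed namespace

arrow_schema = {"_id": int64(), "qty": int64()}
data = Table.from_pydict(
    {"_id": [1, 2, 3], "qty": [10, 20, 30]},
    ArrowSchema(arrow_schema),
)

result = write(coll, data)  # returns an ArrowWriteResult
print(result.raw_result)    # {'insertedCount': 3}

# Round-trip the rows back out as an Arrow Table.
table = find_arrow_all(coll, {}, schema=Schema(arrow_schema))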

bindings/python/pymongoarrow/errors.py

Lines changed: 27 additions & 0 deletions
@@ -21,3 +21,30 @@ class PyMongoArrowError(Exception):
     """Base class for all PyMongoArrow exceptions."""
 
     pass
+
+
+class ArrowWriteError(PyMongoArrowError):
+    """Error raised when we encounter an exception writing into MongoDB"""
+
+    def __init__(self, details):
+        self._details = details
+
+    @property
+    def details(self):
+        """Details for the error.
+
+        It is a dictionary of key-value pairs giving diagnostic information about what went wrong. To see the entire dictionary simply use `print(awe.details)`.
+
+        Details will have the following format:
+        {
+            'writeErrors': [...],
+            'writeConcernErrors': [...],
+            'nInserted': ...,
+            'nUpserted': ...,
+            'nMatched': ...,
+            'nModified': ...,
+            'nRemoved': ...,
+            'upserted': [...]
+        }
+        """
+        return self._details
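
When insert_many fails part-way through, write() converts PyMongo's BulkWriteError into this ArrowWriteError, first passing the details through _transform_bwe so that nInserted and each writeErrors[...]["index"] account for rows already written in earlier batches. A hedged handling sketch, with coll and data assumed to exist as in the earlier example:

# Hypothetical example of inspecting ArrowWriteError.details.
from pymongoarrow.api import write
from pymongoarrow.errors import ArrowWriteError

try:
    write(coll, data)
except ArrowWriteError as awe:
    # Indexes refer to rows of the whole input table, not just the failed batch.
    print(awe.details["nInserted"])
    for err in awe.details["writeErrors"]:
        print(err["index"], err.get("errmsg"))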

bindings/python/pymongoarrow/result.py

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+# Copyright 2022-present MongoDB, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Results returned by PyMongoArrow."""
+
+
+class ArrowWriteResult:
+    def __init__(self, result_dict):
+        self._result = result_dict
+
+    def __repr__(self):
+        return repr(self._result)
+
+    @property
+    def inserted_count(self):
+        return self._result.get("insertedCount", 0)
+
+    @property
+    def raw_result(self):
+        return self._result

bindings/python/pymongoarrow/types.py

Lines changed: 13 additions & 0 deletions
@@ -102,3 +102,16 @@ def _get_internal_typemap(typemap):
     )
 
     return internal_typemap
+
+
+def _in_type_map(t):
+    for checker in _TYPE_CHECKER_TO_INTERNAL_TYPE.keys():
+        if checker(t):
+            return True
+    return False
+
+
+def _validate_schema(schema):
+    for i in schema.types:
+        if not _in_type_map(i):
+            raise ValueError(f'Unsupported data type "{i}" in schema')
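
_validate_schema() walks schema.types and raises for any Arrow type that has no registered checker in _TYPE_CHECKER_TO_INTERNAL_TYPE; per the tests below, int64, float64, timestamp("ms"), string, and bool pass while decimal128 does not. A small sketch calling the private helper directly, for illustration only:

# Hypothetical example; _validate_schema is an internal helper.
import pyarrow as pa
from pymongoarrow.types import _validate_schema

_validate_schema(pa.schema({"qty": pa.int64(), "price": pa.float64()}))  # accepted

try:
    _validate_schema(pa.schema({"price": pa.decimal128(2)}))
except ValueError as exc:
    print(exc)  # Unsupported data type "decimal128(2, 0)" in schema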

bindings/python/test/test_arrow.py

Lines changed: 69 additions & 2 deletions
@@ -17,10 +17,13 @@
 from test.utils import AllowListEventListener
 
 import pymongo
-from pyarrow import Table, int32, int64
+from pyarrow import Table, bool_, decimal128, float64, int32, int64
 from pyarrow import schema as ArrowSchema
+from pyarrow import string, timestamp
 from pymongo import DESCENDING, WriteConcern
-from pymongoarrow.api import Schema, aggregate_arrow_all, find_arrow_all
+from pymongo.collection import Collection
+from pymongoarrow.api import Schema, aggregate_arrow_all, find_arrow_all, write
+from pymongoarrow.errors import ArrowWriteError
 from pymongoarrow.monkey import patch_all
 
 
@@ -180,6 +183,70 @@ def test_aggregate_omits_id_if_not_in_schema(self):
                 self.assertFalse(stage[op_name]["_id"])
                 break
 
+    def round_trip(self, data, schema, coll=None):
+        if coll is None:
+            coll = self.coll
+        self.coll.drop()
+        res = write(self.coll, data)
+        self.assertEqual(len(data), res.raw_result["insertedCount"])
+        self.assertEqual(data, find_arrow_all(coll, {}, schema=schema))
+        return res
+
+    def test_write_error(self):
+        schema = {"_id": int32(), "data": int64()}
+        data = Table.from_pydict(
+            {"_id": [i for i in range(10001)] * 2, "data": [i * 2 for i in range(10001)] * 2},
+            ArrowSchema(schema),
+        )
+        with self.assertRaises(ArrowWriteError):
+            try:
+                self.round_trip(data, Schema(schema))
+            except ArrowWriteError as awe:
+                self.assertEqual(
+                    10001, awe.details["writeErrors"][0]["index"], awe.details["nInserted"]
+                )
+                raise awe
+
+    def test_write_schema_validation(self):
+        schema = {
+            "data": int64(),
+            "float": float64(),
+            "datetime": timestamp("ms"),
+            "string": string(),
+            "bool": bool_(),
+        }
+        data = Table.from_pydict(
+            {
+                "data": [i for i in range(2)],
+                "float": [i for i in range(2)],
+                "datetime": [i for i in range(2)],
+                "string": [str(i) for i in range(2)],
+                "bool": [True for _ in range(2)],
+            },
+            ArrowSchema(schema),
+        )
+        self.round_trip(data, Schema(schema))
+
+        schema = {"_id": int32(), "data": decimal128(2)}
+        data = Table.from_pydict(
+            {"_id": [i for i in range(2)], "data": [i for i in range(2)]},
+            ArrowSchema(schema),
+        )
+        with self.assertRaises(ValueError):
+            self.round_trip(data, Schema(schema))
+
+    @mock.patch.object(Collection, "insert_many", side_effect=Collection.insert_many, autospec=True)
+    def test_write_batching(self, mock):
+        schema = {
+            "_id": int64(),
+        }
+        data = Table.from_pydict(
+            {"_id": [i for i in range(100040)]},
+            ArrowSchema(schema),
+        )
+        self.round_trip(data, Schema(schema), coll=self.coll)
+        self.assertEqual(mock.call_count, 2)
+
 
 class TestArrowExplicitApi(TestArrowApiMixin, unittest.TestCase):
     def run_find(self, *args, **kwargs):
