ARROW-86 Basic auto-discovery of schemas (#88)

bcwarner · web-flow · commit a5b6291e9b09 · 2022-08-11T16:58:19.000-07:00
diff --git a/bindings/python/pymongoarrow/api.py b/bindings/python/pymongoarrow/api.py
@@ -58,15 +58,15 @@
 _MAX_WRITE_BATCH_SIZE = max(100000, MAX_WRITE_BATCH_SIZE)
 
 
-def find_arrow_all(collection, query, *, schema, **kwargs):
+def find_arrow_all(collection, query, *, schema=None, **kwargs):
     """Method that returns the results of a find query as a
     :class:`pyarrow.Table` instance.
 
     :Parameters:
       - `collection`: Instance of :class:`~pymongo.collection.Collection`.
         against which to run the ``find`` operation.
       - `query`: A mapping containing the query to use for the find operation.
-      - `schema`: Instance of :class:`~pymongoarrow.schema.Schema`.
+      - `schema` (optional): Instance of :class:`~pymongoarrow.schema.Schema`.
 
     Additional keyword-arguments passed to this method will be passed
     directly to the underlying ``find`` operation.
@@ -84,23 +84,25 @@ def find_arrow_all(collection, query, *, schema, **kwargs):
                 stacklevel=2,
             )
 
-    kwargs.setdefault("projection", schema._get_projection())
+    if schema:
+        kwargs.setdefault("projection", schema._get_projection())
+
     raw_batch_cursor = collection.find_raw_batches(query, **kwargs)
     for batch in raw_batch_cursor:
         process_bson_stream(batch, context)
 
     return context.finish()
 
 
-def aggregate_arrow_all(collection, pipeline, *, schema, **kwargs):
+def aggregate_arrow_all(collection, pipeline, *, schema=None, **kwargs):
     """Method that returns the results of an aggregation pipeline as a
     :class:`pyarrow.Table` instance.
 
     :Parameters:
       - `collection`: Instance of :class:`~pymongo.collection.Collection`.
         against which to run the ``aggregate`` operation.
       - `pipeline`: A list of aggregation pipeline stages.
-      - `schema`: Instance of :class:`~pymongoarrow.schema.Schema`.
+      - `schema` (optional): Instance of :class:`~pymongoarrow.schema.Schema`.
 
     Additional keyword-arguments passed to this method will be passed
     directly to the underlying ``aggregate`` operation.
@@ -143,15 +145,15 @@ def _arrow_to_pandas(arrow_table):
     return arrow_table.to_pandas(split_blocks=True, self_destruct=True)
 
 
-def find_pandas_all(collection, query, *, schema, **kwargs):
+def find_pandas_all(collection, query, *, schema=None, **kwargs):
     """Method that returns the results of a find query as a
     :class:`pandas.DataFrame` instance.
 
     :Parameters:
       - `collection`: Instance of :class:`~pymongo.collection.Collection`.
         against which to run the ``find`` operation.
       - `query`: A mapping containing the query to use for the find operation.
-      - `schema`: Instance of :class:`~pymongoarrow.schema.Schema`.
+      - `schema` (optional): Instance of :class:`~pymongoarrow.schema.Schema`.
 
     Additional keyword-arguments passed to this method will be passed
     directly to the underlying ``find`` operation.
@@ -162,15 +164,15 @@ def find_pandas_all(collection, query, *, schema, **kwargs):
     return _arrow_to_pandas(find_arrow_all(collection, query, schema=schema, **kwargs))
 
 
-def aggregate_pandas_all(collection, pipeline, *, schema, **kwargs):
+def aggregate_pandas_all(collection, pipeline, *, schema=None, **kwargs):
     """Method that returns the results of an aggregation pipeline as a
     :class:`pandas.DataFrame` instance.
 
     :Parameters:
       - `collection`: Instance of :class:`~pymongo.collection.Collection`.
         against which to run the ``find`` operation.
       - `pipeline`: A list of aggregation pipeline stages.
-      - `schema`: Instance of :class:`~pymongoarrow.schema.Schema`.
+      - `schema` (optional): Instance of :class:`~pymongoarrow.schema.Schema`.
 
     Additional keyword-arguments passed to this method will be passed
     directly to the underlying ``aggregate`` operation.
@@ -181,7 +183,7 @@ def aggregate_pandas_all(collection, pipeline, *, schema, **kwargs):
     return _arrow_to_pandas(aggregate_arrow_all(collection, pipeline, schema=schema, **kwargs))
 
 
-def _arrow_to_numpy(arrow_table, schema):
+def _arrow_to_numpy(arrow_table, schema=None):
     """Helper function that converts an Arrow Table to a dictionary
     containing NumPy arrays. The memory buffers backing the given Arrow Table
     may be destroyed after conversion if the resulting Numpy array(s) is not a
@@ -190,6 +192,9 @@ def _arrow_to_numpy(arrow_table, schema):
     See https://arrow.apache.org/docs/python/numpy.html for details.
     """
     container = {}
+    if not schema:
+        schema = arrow_table.schema
+
     for fname in schema:
         dtype = get_numpy_type(schema.typemap[fname])
         if dtype == np.str_:
@@ -199,7 +204,7 @@ def _arrow_to_numpy(arrow_table, schema):
     return container
 
 
-def find_numpy_all(collection, query, *, schema, **kwargs):
+def find_numpy_all(collection, query, *, schema=None, **kwargs):
     """Method that returns the results of a find query as a
     :class:`dict` instance whose keys are field names and values are
     :class:`~numpy.ndarray` instances bearing the appropriate dtype.
@@ -208,7 +213,7 @@ def find_numpy_all(collection, query, *, schema, **kwargs):
       - `collection`: Instance of :class:`~pymongo.collection.Collection`.
         against which to run the ``find`` operation.
       - `query`: A mapping containing the query to use for the find operation.
-      - `schema`: Instance of :class:`~pymongoarrow.schema.Schema`.
+      - `schema` (optional): Instance of :class:`~pymongoarrow.schema.Schema`.
 
     Additional keyword-arguments passed to this method will be passed
     directly to the underlying ``find`` operation.
@@ -228,7 +233,7 @@ def find_numpy_all(collection, query, *, schema, **kwargs):
     return _arrow_to_numpy(find_arrow_all(collection, query, schema=schema, **kwargs), schema)
 
 
-def aggregate_numpy_all(collection, pipeline, *, schema, **kwargs):
+def aggregate_numpy_all(collection, pipeline, *, schema=None, **kwargs):
     """Method that returns the results of an aggregation pipeline as a
     :class:`dict` instance whose keys are field names and values are
     :class:`~numpy.ndarray` instances bearing the appropriate dtype.
@@ -237,7 +242,7 @@ def aggregate_numpy_all(collection, pipeline, *, schema, **kwargs):
       - `collection`: Instance of :class:`~pymongo.collection.Collection`.
         against which to run the ``find`` operation.
       - `query`: A mapping containing the query to use for the find operation.
-      - `schema`: Instance of :class:`~pymongoarrow.schema.Schema`.
+      - `schema` (optional): Instance of :class:`~pymongoarrow.schema.Schema`.
 
     Additional keyword-arguments passed to this method will be passed
     directly to the underlying ``aggregate`` operation.
diff --git a/bindings/python/pymongoarrow/context.py b/bindings/python/pymongoarrow/context.py
@@ -39,7 +39,7 @@
 class PyMongoArrowContext:
     """A context for converting BSON-formatted data to an Arrow Table."""
 
-    def __init__(self, schema, builder_map):
+    def __init__(self, schema, builder_map, codec_options=None):
         """Initialize the context.
 
         :Parameters:
@@ -49,6 +49,10 @@ def __init__(self, schema, builder_map):
         """
         self.schema = schema
         self.builder_map = builder_map
+        if self.schema is None and codec_options is not None:
+            self.tzinfo = codec_options.tzinfo
+        else:
+            self.tzinfo = None
 
     @classmethod
     def from_schema(cls, schema, codec_options=DEFAULT_CODEC_OPTIONS):
@@ -60,6 +64,9 @@ def from_schema(cls, schema, codec_options=DEFAULT_CODEC_OPTIONS):
           - `codec_options` (optional): An instance of
             :class:`~bson.codec_options.CodecOptions`.
         """
+        if schema is None:
+            return cls(schema, {})
+
         builder_map = {}
         str_type_map = _get_internal_typemap(schema.typemap)
         for fname, ftype in str_type_map.items():
diff --git a/bindings/python/pymongoarrow/lib.pyx b/bindings/python/pymongoarrow/lib.pyx
@@ -53,6 +53,15 @@ cdef const bson_t* bson_reader_read_safe(bson_reader_t* stream_reader) except? N
         raise InvalidBSON("Could not read BSON document stream")
     return doc
 
+_builder_type_map = {  # BSON element type code -> builder class used for schema-less reads
+    0x10: Int32Builder,  # 32-bit integer
+    0x12: Int64Builder,  # 64-bit integer
+    0x01: DoubleBuilder,  # 64-bit floating point
+    0x09: DatetimeBuilder,  # UTC datetime
+    0x07: ObjectIdBuilder,  # ObjectId
+    0x02: StringBuilder,  # UTF-8 string
+    0x08: BoolBuilder  # boolean
+}
 
 def process_bson_stream(bson_stream, context):
     cdef const uint8_t* docstream = <const uint8_t *>bson_stream
@@ -94,6 +103,21 @@ def process_bson_stream(bson_stream, context):
             while bson_iter_next(&doc_iter):
                 key = bson_iter_key(&doc_iter)
                 builder = builder_map.get(key)
+                if builder is None and context.schema is None:
+                    # No schema given: pick a builder from the BSON type of the first value seen for this key.
+                    ftype = bson_iter_type(&doc_iter)
+                    if ftype not in _builder_type_map:
+                        continue  # BSON types without a registered builder are skipped
+
+                    builder_type = _builder_type_map[ftype]
+                    if builder_type == DatetimeBuilder and context.tzinfo is not None:
+                        arrow_type = timestamp("ms", tz=context.tzinfo)  # was arrow_type.unit: read before assignment
+                        builder_map[key] = builder_type(dtype=arrow_type)
+                    else:
+                        builder_map[key] = builder_type()
+                    builder = builder_map[key]
+                    for _ in range(count):  # presumably `count` = documents already processed; back-fill nulls
+                        builder.append_null()
                 if builder is not None:
                     ftype = builder.type_marker
                     value_t = bson_iter_type(&doc_iter)
diff --git a/bindings/python/test/test_arrow.py b/bindings/python/test/test_arrow.py
@@ -14,12 +14,13 @@
 import os
 import unittest
 import unittest.mock as mock
+from datetime import datetime
 from test import client_context
 from test.utils import AllowListEventListener, TestNullsBase
 
 import pyarrow
 import pymongo
-from bson import Decimal128, ObjectId
+from bson import CodecOptions, Decimal128, ObjectId
 from pyarrow import Table, binary, bool_, decimal256, float64, int32, int64
 from pyarrow import schema as ArrowSchema
 from pyarrow import string, timestamp
@@ -34,6 +35,7 @@
     Decimal128StringType,
     ObjectIdType,
 )
+from pytz import timezone
 
 
 class TestArrowApiMixin:
@@ -326,6 +328,62 @@ def test_string_bool(self):
             ),
         )
 
+    def test_auto_schema(self):
+        # Round-trip a fixed table (nullable string, bool, datetime) with no schema supplied.
+        data = Table.from_pydict(
+            {
+                "string": [None] + [str(i) for i in range(2)],
+                "bool": [True for _ in range(3)],
+                "dt": [datetime(1970 + i, 1, 1) for i in range(3)],
+            },
+            ArrowSchema(
+                {
+                    "bool": bool_(),
+                    "dt": timestamp("ms"),
+                    "string": string(),
+                }
+            ),
+        )
+
+        self.coll.drop()
+        res = write(self.coll, data)
+        self.assertEqual(len(data), res.raw_result["insertedCount"])
+        out = find_arrow_all(self.coll, {}).drop(["_id"])  # no schema= argument is passed
+        self.assertEqual(data, out)
+
+    def test_auto_schema_heterogeneous(self):
+        vals = [1, "2", True, 4]  # one field, mixed value types
+        data = [{"a": v} for v in vals]
+
+        self.coll.drop()
+        self.coll.insert_many(data)
+        out = find_arrow_all(self.coll, {}).drop(["_id"])
+        self.assertEqual(out["a"].to_pylist(), [1, None, None, 4])  # str/bool come back as None
+
+    def test_auto_schema_tz(self):
+        # Round-trip tz-aware datetimes through a collection opened with tz-aware CodecOptions.
+        data = Table.from_pydict(
+            {
+                "bool": [True for _ in range(3)],
+                "dt": [datetime(1970 + i, 1, 1, tzinfo=timezone("US/Eastern")) for i in range(3)],
+                "string": [None] + [str(i) for i in range(2)],
+            },
+            ArrowSchema(
+                {
+                    "bool": bool_(),
+                    "dt": timestamp("ms"),  # NOTE(review): expected type carries no tz; confirm intended
+                    "string": string(),
+                }
+            ),
+        )
+
+        self.coll.drop()
+        codec_options = CodecOptions(tzinfo=timezone("US/Eastern"), tz_aware=True)
+        res = write(self.coll.with_options(codec_options=codec_options), data)
+        self.assertEqual(len(data), res.raw_result["insertedCount"])
+        out = find_arrow_all(self.coll.with_options(codec_options=codec_options), {}).drop(["_id"])
+        self.assertEqual(data, out)
+
 
 class TestArrowExplicitApi(TestArrowApiMixin, unittest.TestCase):
     def run_find(self, *args, **kwargs):