ARROW-160 Consolidate benchmarks (#141)

juliusgeo · web-flow · commit 60c2d64691aa · 2023-03-28T14:51:42.000-07:00
diff --git a/bindings/python/benchmark.py b/bindings/python/benchmark.py
diff --git a/bindings/python/benchmarks/benchmarks.py b/bindings/python/benchmarks/benchmarks.py
@@ -17,6 +17,8 @@
 import os
 import string
 
+import numpy as np
+import pandas as pd
 import pyarrow
 import pymongo
 from bson import BSON
@@ -54,6 +56,8 @@
 
 large_doc_keys = [c * i for c in string.ascii_lowercase for i in range(1, 101)]
 schemas[SMALL] = Schema({"x": pyarrow.int64(), "y": pyarrow.float64()})
+dtypes[SMALL] = np.dtype([("x", np.int64), ("y", np.float64)])
+dtypes[LARGE] = np.dtype([(k, np.float64) for k in large_doc_keys])
 schemas[LARGE] = Schema({k: pyarrow.float64() for k in large_doc_keys})
 large = db[collection_names[LARGE]]
 large.drop()
@@ -95,3 +99,44 @@ def time_insert_pandas(self):
 
     def time_insert_numpy(self):
         write(db[collection_names[CUR_SIZE]], numpy_arrays[CUR_SIZE])
+
+
+class ProfileRead:
+    """
+    Benchmarks that time several ways of reading the current-size
+    collection from MongoDB; every ``time_*`` method discards its result.
+    """
+
+    def setup(self):
+        # NOTE(review): this drops the very collection the read benchmarks query;
+        # confirm it is repopulated elsewhere, otherwise the reads time no data.
+        db[collection_names[CUR_SIZE]].drop()
+
+    def time_conventional_ndarray(self):
+        """Build a structured ndarray row by row from a plain ``find()`` cursor."""
+        collection = db[collection_names[CUR_SIZE]]
+        cursor = collection.find()
+        dtype = dtypes[CUR_SIZE]
+
+        if CUR_SIZE == LARGE:
+            np.array([tuple(doc[k] for k in large_doc_keys) for doc in cursor], dtype=dtype)
+        else:
+            np.array([(doc["x"], doc["y"]) for doc in cursor], dtype=dtype)
+
+    def time_to_numpy(self):
+        """Call ``find_numpy_all`` with an empty filter and the current schema."""
+        find_numpy_all(db[collection_names[CUR_SIZE]], {}, schema=schemas[CUR_SIZE])
+
+    def time_conventional_pandas(self):
+        """Build a DataFrame from ``list(cursor)``, with ``_id`` projected out."""
+        collection = db[collection_names[CUR_SIZE]]
+        cursor = collection.find(projection={"_id": 0})
+        pd.DataFrame(list(cursor))
+
+    def time_to_pandas(self):
+        """Call ``find_pandas_all`` with an empty filter and the current schema."""
+        find_pandas_all(db[collection_names[CUR_SIZE]], {}, schema=schemas[CUR_SIZE])
+
+    def time_to_arrow(self):
+        """Call ``find_arrow_all`` with an empty filter and the current schema."""
+        find_arrow_all(db[collection_names[CUR_SIZE]], {}, schema=schemas[CUR_SIZE])
diff --git a/bindings/python/docs/source/developer/benchmarks.rst b/bindings/python/docs/source/developer/benchmarks.rst
@@ -0,0 +1,12 @@
+Running Benchmarks
+==================
+
+.. highlight:: bash
+
+System Requirements
+-------------------
+
+To run the benchmarks, first install the `asv <https://pypi.org/project/asv/>`_ package.
+You can then invoke it like so::
+
+  $ asv run --strict -E existing
diff --git a/bindings/python/docs/source/developer/index.rst b/bindings/python/docs/source/developer/index.rst
@@ -7,3 +7,4 @@ Technical guide for contributors to PyMongoArrow.
    :maxdepth: 1
 
    installation
+   benchmarks

Original file line number	Diff line number	Diff line change
`@@ -7,3 +7,4 @@ Technical guide for contributors to PyMongoArrow.`
`7`	`7`	`:maxdepth: 1`
`8`	`8`
`9`	`9`	`installation`
	`10`	`+ benchmarks`