Skip to content

Commit 8ed061b

Browse files
authored
ARROW-76 Add performance testing for NumPy and optimize (#74)
1 parent 2e65cb4 commit 8ed061b

File tree

5 files changed

+205
-0
lines changed

5 files changed

+205
-0
lines changed

.github/workflows/benchmark.yml

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
name: Python Benchmark

on:
  pull_request:

# Only one benchmark run per ref at a time; a new push cancels the old run.
concurrency:
  group: benchmark-${{ github.ref }}
  cancel-in-progress: true

defaults:
  run:
    working-directory: ./bindings/python
    shell: bash

jobs:
  build:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: ["macos-latest"]
        python-version: ["3.10"]
      fail-fast: false
    steps:
      - uses: actions/checkout@v2
      - name: Setup Python
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: '**/setup.cfg'
      - name: Start MongoDB on MacOS
        run: |
          mkdir data
          mongod --fork --dbpath=$(pwd)/data --logpath=$PWD/mongo.log
      - name: Install libbson
        run: |
          LIBBSON_INSTALL_DIR=$(pwd)/libbson ./build-libbson.sh
      - name: Install Python dependencies
        run: |
          python -m pip install -U pip
      - name: Install pymongoarrow
        run: |
          # Install the library
          LIBBSON_INSTALL_DIR=$(pwd)/libbson python -m pip install -vvv -e ".[test]"
      - name: Run tests
        run: |
          set -eu
          # Run asv against the currently checked-out tree. The benchmark
          # harness files are taken from the PR head (refs/bm/pr) so that both
          # sides of the comparison use identical benchmark definitions.
          run_asv () {
            if [ ! -e "asv.conf.json" ] ; then
              git checkout refs/bm/pr asv.conf.json
              git checkout refs/bm/pr benchmarks/__init__.py
              git checkout refs/bm/pr benchmarks/benchmarks.py
            fi
            git show --no-patch --format="%H (%s)"
            asv run --strict --verbose --show-stderr -E existing --set-commit-hash $(git rev-parse HEAD)
          }

          pip install asv virtualenv
          asv machine --yes
          git fetch origin main:main
          git update-ref refs/bm/pr HEAD
          # We know this is a PR run. The branch is a GitHub refs/pull/*/merge ref, so
          # the current target that this PR will be merged into is HEAD^1.
          git update-ref refs/bm/merge-target $(git log -n 1 --pretty=format:"%H" main --)
          git checkout --force refs/bm/pr --
          run_asv

          git checkout --force refs/bm/merge-target --
          run_asv

          asv compare refs/bm/merge-target refs/bm/pr --
      - name: Fail if any benchmarks have slowed down too much
        run: |
          ! asv compare --factor 1.2 --split refs/bm/merge-target refs/bm/pr | grep -q "got worse"

bindings/python/MANIFEST.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ exclude addtags.py
99
exclude benchmark.py
1010
exclude .flake8
1111
exclude RELEASE.rst
12+
exclude asv.conf.json
1213

1314
graft pymongoarrow
1415

bindings/python/asv.conf.json

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
{
    "version": 1,
    "project": "pymongoarrow",
    "project_url": "https://mongo-arrow.readthedocs.io/en/latest/",
    "repo": "../../",
    "repo_subdir": "bindings/python",
    "branches": ["main"],
    "matrix": {
        "req": {
            "pyarrow": ["7.0.0"],
            "pymongo": ["3.11", "4.1.1"],
            "pandas": [],
            "Cython": [],
            "numpy": []
        },
        "env": {"BENCHMARK_SIZE": ["LARGE", "SMALL"]}
    },
    "environment_type": "virtualenv"
}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright 2022-present MongoDB, Inc.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
# Copyright 2022-present MongoDB, Inc.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import collections
import math
import os
import string

import pyarrow
import pymongo
from bson import BSON
from pymongoarrow.api import (
    Schema,
    find_arrow_all,
    find_numpy_all,
    find_pandas_all,
    write,
)

# Module-level fixture setup: populate a local MongoDB with one "small" and one
# "large" collection, then pre-build Arrow/pandas/NumPy views of each so the
# ProfileInsert benchmarks below time only the insert path, not the fetch.

# Which collection size this asv environment benchmarks; driven by the
# BENCHMARK_SIZE env matrix in asv.conf.json ("LARGE" or "SMALL").
CUR_SIZE = os.environ.get("BENCHMARK_SIZE") == "LARGE"
N_LARGE_DOCS = 1000
N_SMALL_DOCS = 100000
# Benchmark numbers are meaningless without pymongo's C extensions, so fail
# loudly. A plain `assert` would be stripped under `python -O`.
if not pymongo.has_c():
    raise RuntimeError("pymongo must be installed with its C extensions")
SMALL = False
LARGE = True
collection_names = {LARGE: "large", SMALL: "small"}
dtypes = {}  # reserved for dtype-keyed fixtures; currently unused
schemas = {}

# Pre-computed tabular inputs for the insert benchmarks, keyed by size flag.
arrow_tables = {}
pandas_tables = {}
numpy_arrays = {}

large_doc_keys = None

db = pymongo.MongoClient().pymongoarrow_test
small = db[collection_names[SMALL]]
small.drop()

small.insert_many(
    [collections.OrderedDict([("x", 1), ("y", math.pi)]) for _ in range(N_SMALL_DOCS)]
)

# 2600 keys: 'a', 'aa', 'aaa', .., 'zz..z' (26 letters x run lengths 1-100).
large_doc_keys = [c * i for c in string.ascii_lowercase for i in range(1, 101)]
schemas[SMALL] = Schema({"x": pyarrow.int64(), "y": pyarrow.float64()})
schemas[LARGE] = Schema({k: pyarrow.float64() for k in large_doc_keys})
large = db[collection_names[LARGE]]
large.drop()
large_doc = collections.OrderedDict([(k, math.pi) for k in large_doc_keys])
print(
    "%d large docs, %dk each with %d keys"
    % (N_LARGE_DOCS, len(BSON.encode(large_doc)) // 1024, len(large_doc_keys))
)

large.insert_many([large_doc.copy() for _ in range(N_LARGE_DOCS)])

arrow_tables[SMALL] = find_arrow_all(db[collection_names[SMALL]], {}, schema=schemas[SMALL])
arrow_tables[LARGE] = find_arrow_all(db[collection_names[LARGE]], {}, schema=schemas[LARGE])
pandas_tables[SMALL] = find_pandas_all(db[collection_names[SMALL]], {}, schema=schemas[SMALL])
pandas_tables[LARGE] = find_pandas_all(db[collection_names[LARGE]], {}, schema=schemas[LARGE])
numpy_arrays[SMALL] = find_numpy_all(db[collection_names[SMALL]], {}, schema=schemas[SMALL])
numpy_arrays[LARGE] = find_numpy_all(db[collection_names[LARGE]], {}, schema=schemas[LARGE])
75+
76+
77+
class ProfileInsert:
    """Time inserting the fixture data through several tabular APIs.

    Each benchmark writes the pre-built table/frame/array for the size
    selected by ``CUR_SIZE`` into a freshly-emptied collection.
    """

    def setup(self):
        # asv calls this before every timing run; start from an empty target.
        db[collection_names[CUR_SIZE]].drop()

    def time_insert_arrow(self):
        target = db[collection_names[CUR_SIZE]]
        write(target, arrow_tables[CUR_SIZE])

    def time_insert_conventional(self):
        # Baseline: plain pymongo insert_many of the same rows as dicts.
        target = db[collection_names[CUR_SIZE]]
        docs = arrow_tables[CUR_SIZE].to_pylist()
        target.insert_many(docs)

    def time_insert_pandas(self):
        target = db[collection_names[CUR_SIZE]]
        write(target, pandas_tables[CUR_SIZE])

    def time_insert_numpy(self):
        target = db[collection_names[CUR_SIZE]]
        write(target, numpy_arrays[CUR_SIZE])

0 commit comments

Comments
 (0)