Skip to content

Commit 8ed061b

Browse files
authored
ARROW-76 Add performance testing for NumPy and optimize (#74)
1 parent 2e65cb4 commit 8ed061b

File tree

5 files changed

+205
-0
lines changed

5 files changed

+205
-0
lines changed

.github/workflows/benchmark.yml

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
name: Python Benchmark

on:
  pull_request:

# Only one benchmark run per ref at a time; a new push cancels the old run.
concurrency:
  group: benchmark-${{ github.ref }}
  cancel-in-progress: true

defaults:
  run:
    working-directory: ./bindings/python
    shell: bash

jobs:
  build:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: ["macos-latest"]
        python-version: ["3.10"]
      fail-fast: false
    steps:
      - uses: actions/checkout@v2
      - name: Setup Python
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: '**/setup.cfg'
      - name: Start MongoDB on MacOS
        run: |
          mkdir data
          mongod --fork --dbpath=$(pwd)/data --logpath=$PWD/mongo.log
      - name: Install libbson
        run: |
          LIBBSON_INSTALL_DIR=$(pwd)/libbson ./build-libbson.sh
      - name: Install Python dependencies
        run: |
          python -m pip install -U pip
      - name: Install pymongoarrow
        run: |
          # Install the library
          LIBBSON_INSTALL_DIR=$(pwd)/libbson python -m pip install -vvv -e ".[test]"
      - name: Run tests
        run: |
          set -eu
          # Run asv against the currently checked-out tree. The benchmark
          # harness files are taken from the PR head (refs/bm/pr) so that both
          # sides of the comparison use identical benchmark definitions.
          run_asv () {
            if [ ! -e "asv.conf.json" ] ; then
              git checkout refs/bm/pr asv.conf.json
              git checkout refs/bm/pr benchmarks/__init__.py
              git checkout refs/bm/pr benchmarks/benchmarks.py
            fi
            git show --no-patch --format="%H (%s)"
            asv run --strict --verbose --show-stderr -E existing --set-commit-hash $(git rev-parse HEAD)
          }

          pip install asv virtualenv
          asv machine --yes
          git fetch origin main:main
          git update-ref refs/bm/pr HEAD
          # We know this is a PR run. The branch is a GitHub refs/pull/*/merge ref, so
          # the current target that this PR will be merged into is HEAD^1.
          git update-ref refs/bm/merge-target $(git log -n 1 --pretty=format:"%H" main --)
          git checkout --force refs/bm/pr --
          run_asv

          git checkout --force refs/bm/merge-target --
          run_asv

          asv compare refs/bm/merge-target refs/bm/pr --
      - name: Fail if any benchmarks have slowed down too much
        run: |
          ! asv compare --factor 1.2 --split refs/bm/merge-target refs/bm/pr | grep -q "got worse"

bindings/python/MANIFEST.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ exclude addtags.py
99
exclude benchmark.py
1010
exclude .flake8
1111
exclude RELEASE.rst
12+
exclude asv.conf.json
1213

1314
graft pymongoarrow
1415

bindings/python/asv.conf.json

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
{
    "version": 1,
    "project": "pymongoarrow",
    "project_url": "https://mongo-arrow.readthedocs.io/en/latest/",
    "repo": "../../",
    "repo_subdir": "bindings/python",
    "branches": ["main"],
    "matrix": {
        "req": {
            "pyarrow": ["7.0.0"],
            "pymongo": ["3.11", "4.1.1"],
            "pandas": [],
            "Cython": [],
            "numpy": []
        },
        "env": {"BENCHMARK_SIZE": ["LARGE", "SMALL"]}
    },
    "environment_type": "virtualenv"
}
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright 2022-present MongoDB, Inc.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
# Copyright 2022-present MongoDB, Inc.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import collections
import math
import os
import string

import pyarrow
import pymongo
from bson import BSON
from pymongoarrow.api import (
    Schema,
    find_arrow_all,
    find_numpy_all,
    find_pandas_all,
    write,
)

# Module-level fixture setup: populate a local MongoDB with one "small" and one
# "large" collection, then pre-build Arrow/pandas/NumPy views of each so the
# ProfileInsert benchmarks below time only the insert path, not the fetch.

# Which collection size this asv environment benchmarks; driven by the
# BENCHMARK_SIZE env matrix in asv.conf.json ("LARGE" or "SMALL").
CUR_SIZE = os.environ.get("BENCHMARK_SIZE") == "LARGE"
N_LARGE_DOCS = 1000
N_SMALL_DOCS = 100000
# Benchmark numbers are meaningless without pymongo's C extensions, so fail
# loudly. A plain `assert` would be stripped under `python -O`.
if not pymongo.has_c():
    raise RuntimeError("pymongo must be installed with its C extensions")
SMALL = False
LARGE = True
collection_names = {LARGE: "large", SMALL: "small"}
dtypes = {}  # reserved for dtype-keyed fixtures; currently unused
schemas = {}

# Pre-computed tabular inputs for the insert benchmarks, keyed by size flag.
arrow_tables = {}
pandas_tables = {}
numpy_arrays = {}

large_doc_keys = None

db = pymongo.MongoClient().pymongoarrow_test
small = db[collection_names[SMALL]]
small.drop()

small.insert_many(
    [collections.OrderedDict([("x", 1), ("y", math.pi)]) for _ in range(N_SMALL_DOCS)]
)

# 2600 keys: 'a', 'aa', 'aaa', .., 'zz..z' (26 letters x run lengths 1-100).
large_doc_keys = [c * i for c in string.ascii_lowercase for i in range(1, 101)]
schemas[SMALL] = Schema({"x": pyarrow.int64(), "y": pyarrow.float64()})
schemas[LARGE] = Schema({k: pyarrow.float64() for k in large_doc_keys})
large = db[collection_names[LARGE]]
large.drop()
large_doc = collections.OrderedDict([(k, math.pi) for k in large_doc_keys])
print(
    "%d large docs, %dk each with %d keys"
    % (N_LARGE_DOCS, len(BSON.encode(large_doc)) // 1024, len(large_doc_keys))
)

large.insert_many([large_doc.copy() for _ in range(N_LARGE_DOCS)])

arrow_tables[SMALL] = find_arrow_all(db[collection_names[SMALL]], {}, schema=schemas[SMALL])
arrow_tables[LARGE] = find_arrow_all(db[collection_names[LARGE]], {}, schema=schemas[LARGE])
pandas_tables[SMALL] = find_pandas_all(db[collection_names[SMALL]], {}, schema=schemas[SMALL])
pandas_tables[LARGE] = find_pandas_all(db[collection_names[LARGE]], {}, schema=schemas[LARGE])
numpy_arrays[SMALL] = find_numpy_all(db[collection_names[SMALL]], {}, schema=schemas[SMALL])
numpy_arrays[LARGE] = find_numpy_all(db[collection_names[LARGE]], {}, schema=schemas[LARGE])
75+
76+
77+
class ProfileInsert:
    """Time inserting the fixture data through several tabular APIs.

    Each benchmark writes the pre-built table/frame/array for the size
    selected by ``CUR_SIZE`` into a freshly-emptied collection.
    """

    def setup(self):
        # asv calls this before every timing run; start from an empty target.
        db[collection_names[CUR_SIZE]].drop()

    def time_insert_arrow(self):
        target = db[collection_names[CUR_SIZE]]
        write(target, arrow_tables[CUR_SIZE])

    def time_insert_conventional(self):
        # Baseline: plain pymongo insert_many of the same rows as dicts.
        target = db[collection_names[CUR_SIZE]]
        docs = arrow_tables[CUR_SIZE].to_pylist()
        target.insert_many(docs)

    def time_insert_pandas(self):
        target = db[collection_names[CUR_SIZE]]
        write(target, pandas_tables[CUR_SIZE])

    def time_insert_numpy(self):
        target = db[collection_names[CUR_SIZE]]
        write(target, numpy_arrays[CUR_SIZE])

0 commit comments

Comments
 (0)