Commit edaa054

ARROW-146 Update benchmark tests to include BSON Arrays (#140)
1 parent 2853cb0 commit edaa054

4 files changed: +183 -72 lines changed

.github/workflows/benchmark.yml

Lines changed: 1 addition & 1 deletion
@@ -50,7 +50,7 @@ jobs:
             git checkout refs/bm/pr benchmarks/benchmarks.py
           fi
           git show --no-patch --format="%H (%s)"
-          asv run --strict --verbose --show-stderr -E existing --set-commit-hash $(git rev-parse HEAD)
+          asv run --python=`which python` --set-commit-hash $(git rev-parse HEAD) -vvv
         }
 
         asv machine --yes

bindings/python/.gitignore

Lines changed: 4 additions & 0 deletions
@@ -41,3 +41,7 @@ docs/_build/
 # libbson files
 libbson
 mongo-c-driver-*
+
+
+# Benchmark output files
+results/*

bindings/python/asv.conf.json

Lines changed: 3 additions & 1 deletion
@@ -13,7 +13,9 @@
             "Cython": [],
             "numpy": []
         },
-        "env": {"BENCHMARK_SIZE": ["LARGE", "SMALL"]},
+        "env": {
+            "N_DOCS": ["50000", "1000"],
+        },
     },
     "environment_type": "virtualenv",
 }
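
Note: ASV's environment matrix runs the suite once per listed value and exports the variable into each benchmark environment, so the benchmarks are now parameterized by document count rather than a LARGE/SMALL flag. A minimal sketch of how benchmarks.py consumes it (the "1000" fallback is only an assumption for running the module outside of ASV, not part of this commit):

    import os

    # ASV sets N_DOCS to "50000" or "1000" per the matrix above; the default
    # below is a hypothetical convenience for ad-hoc local runs.
    N_DOCS = int(os.environ.get("N_DOCS", "1000"))
    print("benchmarking with %d documents per collection" % N_DOCS)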

bindings/python/benchmarks/benchmarks.py

Lines changed: 175 additions & 70 deletions
@@ -11,11 +11,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import abc
 import collections
 import math
 import os
-import string
+from abc import ABC
 
 import numpy as np
 import pandas as pd
@@ -30,113 +30,218 @@
     write,
 )
 
-CUR_SIZE = True if os.environ.get("BENCHMARK_SIZE") == "LARGE" else False
-N_LARGE_DOCS = 1000
-N_SMALL_DOCS = 100000
+N_DOCS = int(os.environ.get("N_DOCS"))
+name_to_obj = {"list": list, "dict": dict}
 assert pymongo.has_c()
-SMALL = False
-LARGE = True
-collection_names = {LARGE: "large", SMALL: "small"}
-dtypes = {}
-schemas = {}
-
-arrow_tables = {}
-pandas_tables = {}
-numpy_arrays = {}
-
-large_doc_keys = None
-
 db = pymongo.MongoClient().pymongoarrow_test
-small = db[collection_names[SMALL]]
-small.drop()
-
-small.insert_many(
-    [collections.OrderedDict([("x", 1), ("y", math.pi)]) for _ in range(N_SMALL_DOCS)]
-)
-
-large_doc_keys = [c * i for c in string.ascii_lowercase for i in range(1, 101)]
-schemas[SMALL] = Schema({"x": pyarrow.int64(), "y": pyarrow.float64()})
-dtypes[SMALL] = np.dtype([("x", np.int64), ("y", np.float64)])
-dtypes[LARGE] = np.dtype([(k, np.float64) for k in large_doc_keys])
-schemas[LARGE] = Schema({k: pyarrow.float64() for k in large_doc_keys})
-large = db[collection_names[LARGE]]
-large.drop()
-# 2600 keys: 'a', 'aa', 'aaa', .., 'zz..z'
-large_doc = collections.OrderedDict([(k, math.pi) for k in large_doc_keys])
-print(
-    "%d large docs, %dk each with %d keys"
-    % (N_LARGE_DOCS, len(BSON.encode(large_doc)) // 1024, len(large_doc_keys))
-)
-
-large.insert_many([large_doc.copy() for _ in range(N_LARGE_DOCS)])
 
-arrow_tables[SMALL] = find_arrow_all(db[collection_names[SMALL]], {}, schema=schemas[SMALL])
-arrow_tables[LARGE] = find_arrow_all(db[collection_names[LARGE]], {}, schema=schemas[LARGE])
-pandas_tables[SMALL] = find_pandas_all(db[collection_names[SMALL]], {}, schema=schemas[SMALL])
-pandas_tables[LARGE] = find_pandas_all(db[collection_names[LARGE]], {}, schema=schemas[LARGE])
-numpy_arrays[SMALL] = find_numpy_all(db[collection_names[SMALL]], {}, schema=schemas[SMALL])
-numpy_arrays[LARGE] = find_numpy_all(db[collection_names[LARGE]], {}, schema=schemas[LARGE])
+LARGE_DOC_SIZE = 50
 
 
-class ProfileInsert:
+# We have to use ABCs because ASV doesn't support any other way of skipping tests.
+class Insert(ABC):
     """
     A benchmark that times the performance of various kinds
     of inserting tabular data.
     """
 
+    timeout = 100000
+
+    @abc.abstractmethod
     def setup(self):
-        db[collection_names[CUR_SIZE]].drop()
+        raise NotImplementedError
 
     def time_insert_arrow(self):
-        write(db[collection_names[CUR_SIZE]], arrow_tables[CUR_SIZE])
+        write(db.benchmark, self.arrow_table)
 
     def time_insert_conventional(self):
-        tab = arrow_tables[CUR_SIZE].to_pylist()
-        db[collection_names[CUR_SIZE]].insert_many(tab)
+        tab = self.arrow_table.to_pylist()
+        db.benchmark.insert_many(tab)
 
     def time_insert_pandas(self):
-        write(db[collection_names[CUR_SIZE]], pandas_tables[CUR_SIZE])
+        write(db.benchmark, self.pandas_table)
 
     def time_insert_numpy(self):
-        write(db[collection_names[CUR_SIZE]], numpy_arrays[CUR_SIZE])
+        write(db.benchmark, self.numpy_arrays)
 
 
-class ProfileRead:
+class Read(ABC):
     """
     A benchmark that times the performance of various kinds
     of reading MongoDB data.
     """
 
+    timeout = 100000
+
+    @abc.abstractmethod
     def setup(self):
-        db[collection_names[CUR_SIZE]].drop()
+        raise NotImplementedError
+
+    # We need this because the naive methods don't always convert nested objects.
+    @staticmethod
+    def exercise_table(table):
+        pass
 
     def time_conventional_ndarray(self):
-        collection = db[collection_names[CUR_SIZE]]
+        collection = db.benchmark
         cursor = collection.find()
-        dtype = dtypes[CUR_SIZE]
-
-        if CUR_SIZE == LARGE:
-            np.array([tuple(doc[k] for k in large_doc_keys) for doc in cursor], dtype=dtype)
+        dtype = self.dtypes
+        if "Large" in type(self).__name__:
+            np.array([tuple(doc[k] for k in self.large_doc_keys) for doc in cursor], dtype=dtype)
         else:
             np.array([(doc["x"], doc["y"]) for doc in cursor], dtype=dtype)
 
     def time_to_numpy(self):
-        c = db[collection_names[CUR_SIZE]]
-        schema = schemas[CUR_SIZE]
-        find_numpy_all(c, {}, schema=schema)
+        c = db.benchmark
+        find_numpy_all(c, {}, schema=self.schema)
 
     def time_conventional_pandas(self):
-        collection = db[collection_names[CUR_SIZE]]
-        _ = dtypes[CUR_SIZE]
+        collection = db.benchmark
         cursor = collection.find(projection={"_id": 0})
         _ = pd.DataFrame(list(cursor))
 
     def time_to_pandas(self):
-        c = db[collection_names[CUR_SIZE]]
-        schema = schemas[CUR_SIZE]
-        find_pandas_all(c, {}, schema=schema)
+        c = db.benchmark
+        find_pandas_all(c, {}, schema=self.schema, projection={"_id": 0})
 
     def time_to_arrow(self):
-        c = db[collection_names[CUR_SIZE]]
-        schema = schemas[CUR_SIZE]
-        find_arrow_all(c, {}, schema=schema)
+        c = db.benchmark
+        table = find_arrow_all(c, {}, schema=self.schema)
+        self.exercise_table(table)
+
+    def time_conventional_arrow(self):
+        c = db.benchmark
+        f = list(c.find({}, projection={"_id": 0}))
+        table = pyarrow.Table.from_pylist(f)
+        self.exercise_table(table)
+
+
+class ProfileReadArray(Read):
+    def setup(self):
+        coll = db.benchmark
+        coll.drop()
+        base_dict = collections.OrderedDict(
+            [("x", 1), ("y", math.pi), ("emb", [math.pi for _ in range(64)])]
+        )
+        schema_dict = {"x": pyarrow.int64(), "y": pyarrow.float64()}
+        dtypes_list = np.dtype([("x", np.int64), ("y", np.float64)])
+        self.schema = Schema(schema_dict)
+        self.dtypes = np.dtype(dtypes_list)
+        coll.insert_many([base_dict.copy() for _ in range(N_DOCS)])
+        print(
+            "%d docs, %dk each with %d keys"
+            % (N_DOCS, len(BSON.encode(base_dict)) // 1024, len(base_dict))
+        )
+
+    # We need this because the naive methods don't always convert nested objects.
+    @staticmethod
+    def exercise_table(table):
+        [
+            [[n for n in i.values] if isinstance(i, pyarrow.ListScalar) else i for i in column]
+            for column in table.columns
+        ]
+
+    # All of the following tests are being skipped because NumPy/Pandas do not work with nested arrays.
+    def time_to_numpy(self):
+        pass
+
+    def time_to_pandas(self):
+        pass
+
+    def time_conventional_ndarray(self):
+        pass
+
+    def time_conventional_pandas(self):
+        pass
+
+
+class ProfileReadSmall(Read):
+    schema = None
+    dtypes = None
+
+    def setup(self):
+        coll = db.benchmark
+        coll.drop()
+        base_dict = collections.OrderedDict(
+            [
+                ("x", 1),
+                ("y", math.pi),
+            ]
+        )
+        schema_dict = {"x": pyarrow.int64(), "y": pyarrow.float64()}
+        dtypes_list = np.dtype([("x", np.int64), ("y", np.float64)])
+        self.schema = Schema(schema_dict)
+        self.dtypes = np.dtype(dtypes_list)
+        coll.insert_many([base_dict.copy() for _ in range(N_DOCS)])
+        print(
+            "%d docs, %dk each with %d keys"
+            % (N_DOCS, len(BSON.encode(base_dict)) // 1024, len(base_dict))
+        )
+
+
+class ProfileReadLarge(Read):
+    schema = None
+    dtypes = None
+
+    def setup(self):
+        coll = db.benchmark
+        coll.drop()
+        large_doc_keys = self.large_doc_keys = [f"a{i}" for i in range(LARGE_DOC_SIZE)]
+        base_dict = collections.OrderedDict([(k, math.pi) for k in large_doc_keys])
+        dtypes_list = np.dtype([(k, np.float64) for k in large_doc_keys])
+        schema_dict = {k: pyarrow.float64() for k in large_doc_keys}
+        self.schema = Schema(schema_dict)
+        self.dtypes = np.dtype(dtypes_list)
+        coll.insert_many([base_dict.copy() for _ in range(N_DOCS)])
+        print(
+            "%d docs, %dk each with %d keys"
+            % (N_DOCS, len(BSON.encode(base_dict)) // 1024, len(base_dict))
+        )
+
+
+class ProfileInsertSmall(Insert):
+    arrow_table = None
+    pandas_table = None
+    numpy_arrays = None
+    dtypes = None
+
+    def setup(self):
+        coll = db.benchmark
+        coll.drop()
+        base_dict = collections.OrderedDict([("x", 1), ("y", math.pi)])
+        dtypes_list = np.dtype([("x", np.int64), ("y", np.float64)])
+        self.dtypes = np.dtype(dtypes_list)
+        coll.insert_many([base_dict.copy() for _ in range(N_DOCS)])
+        print(
+            "%d docs, %dk each with %d keys"
+            % (N_DOCS, len(BSON.encode(base_dict)) // 1024, len(base_dict))
+        )
+        schema = Schema({"x": pyarrow.int64(), "y": pyarrow.float64()})
+
+        self.arrow_table = find_arrow_all(db.benchmark, {}, schema=schema)
+        self.pandas_table = find_pandas_all(db.benchmark, {}, schema=schema)
+        self.numpy_arrays = find_numpy_all(db.benchmark, {}, schema=schema)
+
+
+class ProfileInsertLarge(Insert):
+    arrow_table = None
+    pandas_table = None
+    numpy_arrays = None
+    dtypes = None
+
+    def setup(self):
+        coll = db.benchmark
+        coll.drop()
+        large_doc_keys = [f"a{i}" for i in range(LARGE_DOC_SIZE)]
+        base_dict = collections.OrderedDict([(k, math.pi) for k in large_doc_keys])
+        dtypes_list = np.dtype([(k, np.float64) for k in large_doc_keys])
+        self.dtypes = np.dtype(dtypes_list)
+        coll.insert_many([base_dict.copy() for _ in range(N_DOCS)])
+        print(
+            "%d docs, %dk each with %d keys"
+            % (N_DOCS, len(BSON.encode(base_dict)) // 1024, len(base_dict))
+        )
+        schema = Schema({k: pyarrow.float64() for k in large_doc_keys})
+        self.arrow_table = find_arrow_all(db.benchmark, {}, schema=schema)
+        self.pandas_table = find_pandas_all(db.benchmark, {}, schema=schema)
+        self.numpy_arrays = find_numpy_all(db.benchmark, {}, schema=schema)
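
The new benchmark classes can also be exercised as a quick smoke test without ASV. A minimal sketch, assuming a local mongod on the default port, PyMongoArrow installed, and N_DOCS exported before the import (the module reads it at import time):

    import os

    os.environ["N_DOCS"] = "1000"  # must be set before importing benchmarks

    from benchmarks import ProfileReadArray  # assumes benchmarks/ is on sys.path

    bench = ProfileReadArray()
    bench.setup()                    # drops and repopulates db.benchmark with nested "emb" arrays
    bench.time_to_arrow()            # find_arrow_all plus exercise_table on the list column
    bench.time_conventional_arrow()  # naive find() + pyarrow.Table.from_pylist for comparison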
