
Commit ed400df

ARROW-147 Update benchmark tests to include memory usage (#146)
1 parent 4fb6fcc commit ed400df

File tree

3 files changed: +87 -68 lines changed


.github/workflows/benchmark.yml

Lines changed: 1 addition & 1 deletion
@@ -50,7 +50,7 @@ jobs:
             git checkout refs/bm/pr benchmarks/benchmarks.py
           fi
           git show --no-patch --format="%H (%s)"
-          asv run --python=`which python` --set-commit-hash $(git rev-parse HEAD) -vvv
+          asv run --python=`which python` --set-commit-hash $(git rev-parse HEAD)
         }

         asv machine --yes

bindings/python/asv.conf.json

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@
             "numpy": []
         },
         "env": {
-            "N_DOCS": ["50000", "1000"],
+            "N_DOCS": ["20000", "1000"],
         },
     },
     "environment_type": "virtualenv",

bindings/python/benchmarks/benchmarks.py

Lines changed: 85 additions & 66 deletions
@@ -31,13 +31,12 @@
 )
 
 N_DOCS = int(os.environ.get("N_DOCS"))
-name_to_obj = {"list": list, "dict": dict}
 assert pymongo.has_c()
 db = pymongo.MongoClient().pymongoarrow_test
 
-LARGE_DOC_SIZE = 50
+LARGE_DOC_SIZE = 20
 EMBEDDED_OBJECT_SIZE = (
-    64  # The number of values or key/value pairs in the embedded object (array or document).
+    20  # The number of values or key/value pairs in the embedded object (array or document).
 )
 
 
@@ -48,10 +47,13 @@ class Insert(ABC):
     of inserting tabular data.
     """
 
-    timeout = 100000
+    timeout = 100000  # The setup sometimes times out.
+    number = 1
+    repeat = (1, 10, 30.0)  # Min repeat, max repeat, time limit (will stop sampling after this)
+    rounds = 1
 
     @abc.abstractmethod
-    def setup(self):
+    def setup_cache(self):
         raise NotImplementedError
 
     def time_insert_arrow(self):
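For context, timeout, number, repeat, and rounds are standard asv benchmark attributes rather than project-specific settings. A minimal hypothetical benchmark showing their documented meaning:

    # Hypothetical asv benchmark illustrating the attributes set above:
    # each sample calls the timed method `number` times; asv collects between
    # repeat[0] and repeat[1] samples, stopping once repeat[2] seconds have
    # elapsed, and performs `rounds` sampling rounds in total.
    class ExampleTimingBench:
        timeout = 100000        # seconds before asv aborts a stuck benchmark
        number = 1              # calls per sample
        repeat = (1, 10, 30.0)  # (min samples, max samples, max seconds)
        rounds = 1              # sampling rounds

        def time_noop(self):
            pass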
@@ -67,17 +69,32 @@ def time_insert_pandas(self):
     def time_insert_numpy(self):
         write(db.benchmark, self.numpy_arrays)
 
+    def peakmem_insert_arrow(self):
+        self.time_insert_arrow()
+
+    def peakmem_insert_conventional(self):
+        self.time_insert_conventional()
+
+    def peakmem_insert_pandas(self):
+        self.time_insert_pandas()
+
+    def peakmem_insert_numpy(self):
+        self.time_insert_numpy()
+
 
 class Read(ABC):
     """
     A benchmark that times the performance of various kinds
     of reading MongoDB data.
     """
 
-    timeout = 100000
+    timeout = 100000  # The setup sometimes times out.
+    number = 3
+    repeat = (1, 10, 30.0)  # Min repeat, max repeat, time limit (will stop sampling after this)
+    rounds = 1
 
     @abc.abstractmethod
-    def setup(self):
+    def setup_cache(self):
         raise NotImplementedError
 
     # We need this because the naive methods don't always convert nested objects.
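A note on the peakmem_* methods added here: asv treats any method whose name starts with peakmem_ as a peak-memory benchmark and reports the peak resident set size of the process while the method runs, so delegating to the existing time_* bodies gives a memory counterpart for every timing benchmark. A minimal hypothetical example:

    # Hypothetical asv benchmark pair: the time_ variant is measured for wall-clock
    # time, while the peakmem_ variant reports the peak RSS reached during the call.
    class ExampleMemoryBench:
        def time_build_list(self):
            self.data = [0.0] * 1_000_000

        def peakmem_build_list(self):
            self.time_build_list()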
@@ -118,20 +135,37 @@ def time_conventional_arrow(self):
         table = pyarrow.Table.from_pylist(f)
         self.exercise_table(table)
 
+    def peakmem_to_numpy(self):
+        self.time_to_numpy()
+
+    def peakmem_conventional_pandas(self):
+        self.time_conventional_pandas()
+
+    def peakmem_to_pandas(self):
+        self.time_to_pandas()
+
+    def peakmem_to_arrow(self):
+        self.time_to_arrow()
+
+    def peakmem_conventional_arrow(self):
+        self.time_conventional_arrow()
+
 
 class ProfileReadArray(Read):
-    def setup(self):
+    schema = Schema(
+        {
+            "x": pyarrow.int64(),
+            "y": pyarrow.float64(),
+            "emb": pyarrow.list_(pyarrow.float64()),
+        }
+    )
+
+    def setup_cache(self):
         coll = db.benchmark
         coll.drop()
         base_dict = collections.OrderedDict(
             [("x", 1), ("y", math.pi), ("emb", [math.pi for _ in range(EMBEDDED_OBJECT_SIZE)])]
         )
-        schema_dict = {
-            "x": pyarrow.int64(),
-            "y": pyarrow.float64(),
-            "emb": pyarrow.list_(pyarrow.float64()),
-        }
-        self.schema = Schema(schema_dict)
         coll.insert_many([base_dict.copy() for _ in range(N_DOCS)])
         print(
             "%d docs, %dk each with %d keys"
@@ -161,7 +195,17 @@ def time_conventional_pandas(self):
 
 
 class ProfileReadDocument(Read):
-    def setup(self):
+    schema = Schema(
+        {
+            "x": pyarrow.int64(),
+            "y": pyarrow.float64(),
+            "emb": pyarrow.struct(
+                [pyarrow.field(f"a{i}", pyarrow.float64()) for i in range(EMBEDDED_OBJECT_SIZE)]
+            ),
+        }
+    )
+
+    def setup_cache(self):
         coll = db.benchmark
         coll.drop()
         base_dict = collections.OrderedDict(
@@ -171,14 +215,6 @@ def setup(self):
                 ("emb", {f"a{i}": math.pi for i in range(EMBEDDED_OBJECT_SIZE)}),
             ]
         )
-        schema_dict = {
-            "x": pyarrow.int64(),
-            "y": pyarrow.float64(),
-            "emb": pyarrow.struct(
-                [pyarrow.field(f"a{i}", pyarrow.float64()) for i in range(EMBEDDED_OBJECT_SIZE)]
-            ),
-        }
-        self.schema = Schema(schema_dict)
         coll.insert_many([base_dict.copy() for _ in range(N_DOCS)])
         print(
             "%d docs, %dk each with %d keys"
@@ -208,10 +244,10 @@ def time_conventional_pandas(self):
 
 
 class ProfileReadSmall(Read):
-    schema = None
-    dtypes = None
+    schema = Schema({"x": pyarrow.int64(), "y": pyarrow.float64()})
+    dtypes = np.dtype(np.dtype([("x", np.int64), ("y", np.float64)]))
 
-    def setup(self):
+    def setup_cache(self):
         coll = db.benchmark
         coll.drop()
         base_dict = collections.OrderedDict(
@@ -220,10 +256,6 @@ def setup(self):
                 ("y", math.pi),
             ]
         )
-        schema_dict = {"x": pyarrow.int64(), "y": pyarrow.float64()}
-        dtypes_list = np.dtype([("x", np.int64), ("y", np.float64)])
-        self.schema = Schema(schema_dict)
-        self.dtypes = np.dtype(dtypes_list)
         coll.insert_many([base_dict.copy() for _ in range(N_DOCS)])
         print(
             "%d docs, %dk each with %d keys"
@@ -232,18 +264,15 @@ def setup(self):
 
 
 class ProfileReadLarge(Read):
-    schema = None
-    dtypes = None
+    large_doc_keys = [f"a{i}" for i in range(LARGE_DOC_SIZE)]
+    schema = Schema({k: pyarrow.float64() for k in large_doc_keys})
+    dtypes = np.dtype([(k, np.float64) for k in large_doc_keys])
 
-    def setup(self):
+    def setup_cache(self):
         coll = db.benchmark
         coll.drop()
-        large_doc_keys = self.large_doc_keys = [f"a{i}" for i in range(LARGE_DOC_SIZE)]
-        base_dict = collections.OrderedDict([(k, math.pi) for k in large_doc_keys])
-        dtypes_list = np.dtype([(k, np.float64) for k in large_doc_keys])
-        schema_dict = {k: pyarrow.float64() for k in large_doc_keys}
-        self.schema = Schema(schema_dict)
-        self.dtypes = np.dtype(dtypes_list)
+
+        base_dict = collections.OrderedDict([(k, math.pi) for k in self.large_doc_keys])
         coll.insert_many([base_dict.copy() for _ in range(N_DOCS)])
         print(
             "%d docs, %dk each with %d keys"
@@ -252,48 +281,38 @@ def setup(self):
 
 
 class ProfileInsertSmall(Insert):
-    arrow_table = None
-    pandas_table = None
-    numpy_arrays = None
-    dtypes = None
-
-    def setup(self):
+    large_doc_keys = [f"a{i}" for i in range(LARGE_DOC_SIZE)]
+    schema = Schema({"x": pyarrow.int64(), "y": pyarrow.float64()})
+    arrow_table = find_arrow_all(db.benchmark, {}, schema=schema)
+    pandas_table = find_pandas_all(db.benchmark, {}, schema=schema)
+    numpy_arrays = find_numpy_all(db.benchmark, {}, schema=schema)
+    dtypes = np.dtype([("x", np.int64), ("y", np.float64)])
+
+    def setup_cache(self):
         coll = db.benchmark
         coll.drop()
         base_dict = collections.OrderedDict([("x", 1), ("y", math.pi)])
-        dtypes_list = np.dtype([("x", np.int64), ("y", np.float64)])
-        self.dtypes = np.dtype(dtypes_list)
         coll.insert_many([base_dict.copy() for _ in range(N_DOCS)])
         print(
             "%d docs, %dk each with %d keys"
             % (N_DOCS, len(BSON.encode(base_dict)) // 1024, len(base_dict))
         )
-        schema = Schema({"x": pyarrow.int64(), "y": pyarrow.float64()})
-
-        self.arrow_table = find_arrow_all(db.benchmark, {}, schema=schema)
-        self.pandas_table = find_pandas_all(db.benchmark, {}, schema=schema)
-        self.numpy_arrays = find_numpy_all(db.benchmark, {}, schema=schema)
 
 
 class ProfileInsertLarge(Insert):
-    arrow_table = None
-    pandas_table = None
-    numpy_arrays = None
-    dtypes = None
-
-    def setup(self):
+    large_doc_keys = [f"a{i}" for i in range(LARGE_DOC_SIZE)]
+    schema = Schema({k: pyarrow.float64() for k in large_doc_keys})
+    arrow_table = find_arrow_all(db.benchmark, {}, schema=schema)
+    pandas_table = find_pandas_all(db.benchmark, {}, schema=schema)
+    numpy_arrays = find_numpy_all(db.benchmark, {}, schema=schema)
+    dtypes = np.dtype([(k, np.float64) for k in large_doc_keys])
+
+    def setup_cache(self):
         coll = db.benchmark
         coll.drop()
-        large_doc_keys = [f"a{i}" for i in range(LARGE_DOC_SIZE)]
-        base_dict = collections.OrderedDict([(k, math.pi) for k in large_doc_keys])
-        dtypes_list = np.dtype([(k, np.float64) for k in large_doc_keys])
-        self.dtypes = np.dtype(dtypes_list)
+        base_dict = collections.OrderedDict([(k, math.pi) for k in self.large_doc_keys])
        coll.insert_many([base_dict.copy() for _ in range(N_DOCS)])
         print(
             "%d docs, %dk each with %d keys"
             % (N_DOCS, len(BSON.encode(base_dict)) // 1024, len(base_dict))
         )
-        schema = Schema({k: pyarrow.float64() for k in large_doc_keys})
-        self.arrow_table = find_arrow_all(db.benchmark, {}, schema=schema)
-        self.pandas_table = find_pandas_all(db.benchmark, {}, schema=schema)
-        self.numpy_arrays = find_numpy_all(db.benchmark, {}, schema=schema)
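Similarly, the Insert profiles now build their Arrow, pandas, and NumPy payloads once as class attributes and pass them to pymongoarrow's write() inside the timed methods. A hedged sketch of that write path; the collection and data are illustrative only:

    # Illustrative pymongoarrow write path; assumes a local mongod and pymongoarrow installed.
    import pymongo
    import pyarrow
    from pymongoarrow.api import write

    coll = pymongo.MongoClient().pymongoarrow_test.example  # illustrative collection
    table = pyarrow.Table.from_pydict({"x": [1, 2, 3], "y": [0.1, 0.2, 0.3]})
    write(coll, table)  # inserts one document per row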
