 )

 N_DOCS = int(os.environ.get("N_DOCS"))
-name_to_obj = {"list": list, "dict": dict}
 assert pymongo.has_c()
 db = pymongo.MongoClient().pymongoarrow_test

-LARGE_DOC_SIZE = 50
+LARGE_DOC_SIZE = 20
 EMBEDDED_OBJECT_SIZE = (
-    64  # The number of values or key/value pairs in the embedded object (array or document).
+    20  # The number of values or key/value pairs in the embedded object (array or document).
 )

@@ -48,10 +47,13 @@ class Insert(ABC):
     of inserting tabular data.
     """

-    timeout = 100000
+    timeout = 100000  # The setup sometimes times out.
+    number = 1
+    repeat = (1, 10, 30.0)  # Min repeat, max repeat, time limit (will stop sampling after this)
+    rounds = 1

     @abc.abstractmethod
-    def setup(self):
+    def setup_cache(self):
         raise NotImplementedError

     def time_insert_arrow(self):
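The attributes added above are standard asv (airspeed velocity) sampling knobs: `number` is how many times each sample invokes the benchmark body, `repeat` is a `(min_repeat, max_repeat, time_limit_seconds)` tuple bounding how many samples are collected, `rounds` is the number of sampling rounds, and renaming `setup` to `setup_cache` makes the expensive data population run once per environment rather than before every sample. A minimal sketch of how these knobs fit together, assuming asv's documented semantics (the class and method names are illustrative, not part of this suite):

```python
# Illustrative asv benchmark class; not part of the pymongoarrow suite above.
class ExampleBenchmark:
    timeout = 100000  # Seconds before asv gives up on the benchmark.
    number = 1  # Calls to the body per sample (1 => each sample is one call).
    repeat = (1, 10, 30.0)  # (min samples, max samples, time limit in seconds).
    rounds = 1  # Rounds of sampling per benchmark.

    def setup_cache(self):
        # Runs once per environment; the return value is pickled and passed
        # as the first argument to each benchmark method.
        return list(range(100_000))

    def time_sum(self, data):
        # asv reports wall-clock statistics over the collected samples.
        sum(data)
```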
@@ -67,17 +69,32 @@ def time_insert_pandas(self):
     def time_insert_numpy(self):
         write(db.benchmark, self.numpy_arrays)

+    def peakmem_insert_arrow(self):
+        self.time_insert_arrow()
+
+    def peakmem_insert_conventional(self):
+        self.time_insert_conventional()
+
+    def peakmem_insert_pandas(self):
+        self.time_insert_pandas()
+
+    def peakmem_insert_numpy(self):
+        self.time_insert_numpy()
+

 class Read(ABC):
     """
     A benchmark that times the performance of various kinds
     of reading MongoDB data.
     """

-    timeout = 100000
+    timeout = 100000  # The setup sometimes times out.
+    number = 3
+    repeat = (1, 10, 30.0)  # Min repeat, max repeat, time limit (will stop sampling after this)
+    rounds = 1

     @abc.abstractmethod
-    def setup(self):
+    def setup_cache(self):
         raise NotImplementedError

     # We need this because the naive methods don't always convert nested objects.
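asv treats methods prefixed with `peakmem_` as peak-memory benchmarks: it runs the body once and reports the peak resident set size the process reached. Delegating to the existing `time_*` bodies, as the additions in this diff do, keeps the memory measurement aligned with exactly the code being timed. A hedged sketch of the pattern in isolation (names are illustrative):

```python
# Illustrative pairing of a timing benchmark with a peak-memory benchmark;
# asv discovers both by their name prefixes.
class ExampleRead:
    def time_build_rows(self):
        rows = [{"x": i, "y": float(i)} for i in range(100_000)]
        len(rows)  # Keep the result live so the allocation is observed.

    def peakmem_build_rows(self):
        # Reuses the timed body, so asv reports the peak RSS of that code.
        self.time_build_rows()
```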
@@ -118,20 +135,37 @@ def time_conventional_arrow(self):
         table = pyarrow.Table.from_pylist(f)
         self.exercise_table(table)

+    def peakmem_to_numpy(self):
+        self.time_to_numpy()
+
+    def peakmem_conventional_pandas(self):
+        self.time_conventional_pandas()
+
+    def peakmem_to_pandas(self):
+        self.time_to_pandas()
+
+    def peakmem_to_arrow(self):
+        self.time_to_arrow()
+
+    def peakmem_conventional_arrow(self):
+        self.time_conventional_arrow()
+

 class ProfileReadArray(Read):
-    def setup(self):
+    schema = Schema(
+        {
+            "x": pyarrow.int64(),
+            "y": pyarrow.float64(),
+            "emb": pyarrow.list_(pyarrow.float64()),
+        }
+    )
+
+    def setup_cache(self):
         coll = db.benchmark
         coll.drop()
         base_dict = collections.OrderedDict(
             [("x", 1), ("y", math.pi), ("emb", [math.pi for _ in range(EMBEDDED_OBJECT_SIZE)])]
         )
-        schema_dict = {
-            "x": pyarrow.int64(),
-            "y": pyarrow.float64(),
-            "emb": pyarrow.list_(pyarrow.float64()),
-        }
-        self.schema = Schema(schema_dict)
         coll.insert_many([base_dict.copy() for _ in range(N_DOCS)])
         print(
             "%d docs, %dk each with %d keys"
@@ -161,7 +195,17 @@ def time_conventional_pandas(self):


 class ProfileReadDocument(Read):
-    def setup(self):
+    schema = Schema(
+        {
+            "x": pyarrow.int64(),
+            "y": pyarrow.float64(),
+            "emb": pyarrow.struct(
+                [pyarrow.field(f"a{i}", pyarrow.float64()) for i in range(EMBEDDED_OBJECT_SIZE)]
+            ),
+        }
+    )
+
+    def setup_cache(self):
         coll = db.benchmark
         coll.drop()
         base_dict = collections.OrderedDict(
@@ -171,14 +215,6 @@ def setup(self):
                 ("emb", {f"a{i}": math.pi for i in range(EMBEDDED_OBJECT_SIZE)}),
             ]
         )
-        schema_dict = {
-            "x": pyarrow.int64(),
-            "y": pyarrow.float64(),
-            "emb": pyarrow.struct(
-                [pyarrow.field(f"a{i}", pyarrow.float64()) for i in range(EMBEDDED_OBJECT_SIZE)]
-            ),
-        }
-        self.schema = Schema(schema_dict)
         coll.insert_many([base_dict.copy() for _ in range(N_DOCS)])
         print(
             "%d docs, %dk each with %d keys"
@@ -208,10 +244,10 @@ def time_conventional_pandas(self):


 class ProfileReadSmall(Read):
-    schema = None
-    dtypes = None
+    schema = Schema({"x": pyarrow.int64(), "y": pyarrow.float64()})
+    dtypes = np.dtype([("x", np.int64), ("y", np.float64)])

-    def setup(self):
+    def setup_cache(self):
         coll = db.benchmark
         coll.drop()
         base_dict = collections.OrderedDict(
@@ -220,10 +256,6 @@ def setup(self):
                 ("y", math.pi),
             ]
         )
-        schema_dict = {"x": pyarrow.int64(), "y": pyarrow.float64()}
-        dtypes_list = np.dtype([("x", np.int64), ("y", np.float64)])
-        self.schema = Schema(schema_dict)
-        self.dtypes = np.dtype(dtypes_list)
         coll.insert_many([base_dict.copy() for _ in range(N_DOCS)])
         print(
             "%d docs, %dk each with %d keys"
@@ -232,18 +264,15 @@ def setup(self):


 class ProfileReadLarge(Read):
-    schema = None
-    dtypes = None
+    large_doc_keys = [f"a{i}" for i in range(LARGE_DOC_SIZE)]
+    schema = Schema({k: pyarrow.float64() for k in large_doc_keys})
+    dtypes = np.dtype([(k, np.float64) for k in large_doc_keys])

-    def setup(self):
+    def setup_cache(self):
         coll = db.benchmark
         coll.drop()
-        large_doc_keys = self.large_doc_keys = [f"a{i}" for i in range(LARGE_DOC_SIZE)]
-        base_dict = collections.OrderedDict([(k, math.pi) for k in large_doc_keys])
-        dtypes_list = np.dtype([(k, np.float64) for k in large_doc_keys])
-        schema_dict = {k: pyarrow.float64() for k in large_doc_keys}
-        self.schema = Schema(schema_dict)
-        self.dtypes = np.dtype(dtypes_list)
+
+        base_dict = collections.OrderedDict([(k, math.pi) for k in self.large_doc_keys])
         coll.insert_many([base_dict.copy() for _ in range(N_DOCS)])
         print(
             "%d docs, %dk each with %d keys"
@@ -252,48 +281,38 @@ def setup(self):


 class ProfileInsertSmall(Insert):
-    arrow_table = None
-    pandas_table = None
-    numpy_arrays = None
-    dtypes = None
-
-    def setup(self):
+    large_doc_keys = [f"a{i}" for i in range(LARGE_DOC_SIZE)]
+    schema = Schema({"x": pyarrow.int64(), "y": pyarrow.float64()})
+    arrow_table = find_arrow_all(db.benchmark, {}, schema=schema)
+    pandas_table = find_pandas_all(db.benchmark, {}, schema=schema)
+    numpy_arrays = find_numpy_all(db.benchmark, {}, schema=schema)
+    dtypes = np.dtype([("x", np.int64), ("y", np.float64)])
+
+    def setup_cache(self):
         coll = db.benchmark
         coll.drop()
         base_dict = collections.OrderedDict([("x", 1), ("y", math.pi)])
-        dtypes_list = np.dtype([("x", np.int64), ("y", np.float64)])
-        self.dtypes = np.dtype(dtypes_list)
         coll.insert_many([base_dict.copy() for _ in range(N_DOCS)])
         print(
             "%d docs, %dk each with %d keys"
             % (N_DOCS, len(BSON.encode(base_dict)) // 1024, len(base_dict))
         )
-        schema = Schema({"x": pyarrow.int64(), "y": pyarrow.float64()})
-
-        self.arrow_table = find_arrow_all(db.benchmark, {}, schema=schema)
-        self.pandas_table = find_pandas_all(db.benchmark, {}, schema=schema)
-        self.numpy_arrays = find_numpy_all(db.benchmark, {}, schema=schema)


 class ProfileInsertLarge(Insert):
-    arrow_table = None
-    pandas_table = None
-    numpy_arrays = None
-    dtypes = None
-
-    def setup(self):
+    large_doc_keys = [f"a{i}" for i in range(LARGE_DOC_SIZE)]
+    schema = Schema({k: pyarrow.float64() for k in large_doc_keys})
+    arrow_table = find_arrow_all(db.benchmark, {}, schema=schema)
+    pandas_table = find_pandas_all(db.benchmark, {}, schema=schema)
+    numpy_arrays = find_numpy_all(db.benchmark, {}, schema=schema)
+    dtypes = np.dtype([(k, np.float64) for k in large_doc_keys])
+
+    def setup_cache(self):
         coll = db.benchmark
         coll.drop()
-        large_doc_keys = [f"a{i}" for i in range(LARGE_DOC_SIZE)]
-        base_dict = collections.OrderedDict([(k, math.pi) for k in large_doc_keys])
-        dtypes_list = np.dtype([(k, np.float64) for k in large_doc_keys])
-        self.dtypes = np.dtype(dtypes_list)
+        base_dict = collections.OrderedDict([(k, math.pi) for k in self.large_doc_keys])
         coll.insert_many([base_dict.copy() for _ in range(N_DOCS)])
         print(
             "%d docs, %dk each with %d keys"
             % (N_DOCS, len(BSON.encode(base_dict)) // 1024, len(base_dict))
         )
-        schema = Schema({k: pyarrow.float64() for k in large_doc_keys})
-        self.arrow_table = find_arrow_all(db.benchmark, {}, schema=schema)
-        self.pandas_table = find_pandas_all(db.benchmark, {}, schema=schema)
-        self.numpy_arrays = find_numpy_all(db.benchmark, {}, schema=schema)