# Copyright 2022-present MongoDB, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import collections
import math
import os
import string

import pyarrow
import pymongo
from bson import BSON
from pymongoarrow.api import (
    Schema,
    find_arrow_all,
    find_numpy_all,
    find_pandas_all,
    write,
)

# Select the dataset size via the BENCHMARK_SIZE environment variable; any
# value other than "LARGE" (or an unset variable) selects the small dataset.
CUR_SIZE = os.environ.get("BENCHMARK_SIZE") == "LARGE"
N_LARGE_DOCS = 1000
N_SMALL_DOCS = 100000
# Require the PyMongo C extensions so the conventional-path timings are
# representative.
assert pymongo.has_c()
SMALL = False
LARGE = True
collection_names = {LARGE: "large", SMALL: "small"}
dtypes = {}
schemas = {}

arrow_tables = {}
pandas_tables = {}
numpy_arrays = {}

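# Note (editorial addition): SMALL and LARGE are plain booleans used as
# dictionary keys, so each per-size lookup table above can be indexed
# directly with CUR_SIZE; e.g. collection_names[CUR_SIZE] is "large" when
# BENCHMARK_SIZE=LARGE.
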
db = pymongo.MongoClient().pymongoarrow_test
small = db[collection_names[SMALL]]
small.drop()

small.insert_many(
    [collections.OrderedDict([("x", 1), ("y", math.pi)]) for _ in range(N_SMALL_DOCS)]
)

# 2600 keys: 'a', 'aa', 'aaa', ..., 'zz...z' (26 letters x 100 lengths)
large_doc_keys = [c * i for c in string.ascii_lowercase for i in range(1, 101)]
schemas[SMALL] = Schema({"x": pyarrow.int64(), "y": pyarrow.float64()})
schemas[LARGE] = Schema({k: pyarrow.float64() for k in large_doc_keys})
large = db[collection_names[LARGE]]
large.drop()
large_doc = collections.OrderedDict([(k, math.pi) for k in large_doc_keys])
print(
    "%d large docs, %dk each with %d keys"
    % (N_LARGE_DOCS, len(BSON.encode(large_doc)) // 1024, len(large_doc_keys))
)

large.insert_many([large_doc.copy() for _ in range(N_LARGE_DOCS)])
arrow_tables[SMALL] = find_arrow_all(db[collection_names[SMALL]], {}, schema=schemas[SMALL])
arrow_tables[LARGE] = find_arrow_all(db[collection_names[LARGE]], {}, schema=schemas[LARGE])
pandas_tables[SMALL] = find_pandas_all(db[collection_names[SMALL]], {}, schema=schemas[SMALL])
pandas_tables[LARGE] = find_pandas_all(db[collection_names[LARGE]], {}, schema=schemas[LARGE])
numpy_arrays[SMALL] = find_numpy_all(db[collection_names[SMALL]], {}, schema=schemas[SMALL])
numpy_arrays[LARGE] = find_numpy_all(db[collection_names[LARGE]], {}, schema=schemas[LARGE])
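
# Sanity check (editorial addition, not part of the original file): each
# find_*_all helper materializes the same documents into a different
# container -- a pyarrow.Table, a pandas.DataFrame, and a dict mapping
# field names to numpy arrays, respectively.
assert arrow_tables[SMALL].num_rows == N_SMALL_DOCS
assert len(pandas_tables[SMALL]) == N_SMALL_DOCS
assert len(numpy_arrays[SMALL]["x"]) == N_SMALL_DOCS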


class ProfileInsert:
    """
    A benchmark that times the performance of various methods
    of inserting tabular data.
    """

    def setup(self):
        db[collection_names[CUR_SIZE]].drop()

    def time_insert_arrow(self):
        write(db[collection_names[CUR_SIZE]], arrow_tables[CUR_SIZE])

    def time_insert_conventional(self):
        # Baseline: convert the Arrow table back to a list of dicts and
        # insert it through plain PyMongo.
        tab = arrow_tables[CUR_SIZE].to_pylist()
        db[collection_names[CUR_SIZE]].insert_many(tab)

    def time_insert_pandas(self):
        write(db[collection_names[CUR_SIZE]], pandas_tables[CUR_SIZE])

    def time_insert_numpy(self):
        write(db[collection_names[CUR_SIZE]], numpy_arrays[CUR_SIZE])
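

# Illustrative sketch (editorial addition, not part of the original suite):
# a companion read benchmark following the same asv-style time_* convention,
# exercising the find_*_all helpers imported above against the collections
# populated at module import. The class and method names are hypothetical.
class ProfileRead:
    """
    A sketch of a benchmark that times the performance of various
    methods of reading tabular data.
    """

    def time_find_arrow(self):
        find_arrow_all(db[collection_names[CUR_SIZE]], {}, schema=schemas[CUR_SIZE])

    def time_find_pandas(self):
        find_pandas_all(db[collection_names[CUR_SIZE]], {}, schema=schemas[CUR_SIZE])

    def time_find_numpy(self):
        find_numpy_all(db[collection_names[CUR_SIZE]], {}, schema=schemas[CUR_SIZE])

    def time_find_conventional(self):
        # Baseline: materialize the cursor into a list of dicts with PyMongo.
        list(db[collection_names[CUR_SIZE]].find({}))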