11 | 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | 12 | # See the License for the specific language governing permissions and
13 | 13 | # limitations under the License.
14 | | -
| 14 | +import abc
15 | 15 | import collections
16 | 16 | import math
17 | 17 | import os
18 | | -import string
| 18 | +from abc import ABC
19 | 19 |
20 | 20 | import numpy as np
21 | 21 | import pandas as pd

30 | 30 |     write,
31 | 31 | )
32 | 32 |
33 | | -CUR_SIZE = True if os.environ.get("BENCHMARK_SIZE") == "LARGE" else False
34 | | -N_LARGE_DOCS = 1000
35 | | -N_SMALL_DOCS = 100000
| 33 | +N_DOCS = int(os.environ.get("N_DOCS"))
| 34 | +name_to_obj = {"list": list, "dict": dict}
36 | 35 | assert pymongo.has_c()
37 | | -SMALL = False
38 | | -LARGE = True
39 | | -collection_names = {LARGE: "large", SMALL: "small"}
40 | | -dtypes = {}
41 | | -schemas = {}
42 | | -
43 | | -arrow_tables = {}
44 | | -pandas_tables = {}
45 | | -numpy_arrays = {}
46 | | -
47 | | -large_doc_keys = None
48 | | -
49 | 36 | db = pymongo.MongoClient().pymongoarrow_test
50 | | -small = db[collection_names[SMALL]]
51 | | -small.drop()
52 | | -
53 | | -small.insert_many(
54 | | -    [collections.OrderedDict([("x", 1), ("y", math.pi)]) for _ in range(N_SMALL_DOCS)]
55 | | -)
56 | | -
57 | | -large_doc_keys = [c * i for c in string.ascii_lowercase for i in range(1, 101)]
58 | | -schemas[SMALL] = Schema({"x": pyarrow.int64(), "y": pyarrow.float64()})
59 | | -dtypes[SMALL] = np.dtype([("x", np.int64), ("y", np.float64)])
60 | | -dtypes[LARGE] = np.dtype([(k, np.float64) for k in large_doc_keys])
61 | | -schemas[LARGE] = Schema({k: pyarrow.float64() for k in large_doc_keys})
62 | | -large = db[collection_names[LARGE]]
63 | | -large.drop()
64 | | -# 2600 keys: 'a', 'aa', 'aaa', .., 'zz..z'
65 | | -large_doc = collections.OrderedDict([(k, math.pi) for k in large_doc_keys])
66 | | -print(
67 | | -    "%d large docs, %dk each with %d keys"
68 | | -    % (N_LARGE_DOCS, len(BSON.encode(large_doc)) // 1024, len(large_doc_keys))
69 | | -)
70 | | -
71 | | -large.insert_many([large_doc.copy() for _ in range(N_LARGE_DOCS)])
72 | 37 |
73 | | -arrow_tables[SMALL] = find_arrow_all(db[collection_names[SMALL]], {}, schema=schemas[SMALL])
74 | | -arrow_tables[LARGE] = find_arrow_all(db[collection_names[LARGE]], {}, schema=schemas[LARGE])
75 | | -pandas_tables[SMALL] = find_pandas_all(db[collection_names[SMALL]], {}, schema=schemas[SMALL])
76 | | -pandas_tables[LARGE] = find_pandas_all(db[collection_names[LARGE]], {}, schema=schemas[LARGE])
77 | | -numpy_arrays[SMALL] = find_numpy_all(db[collection_names[SMALL]], {}, schema=schemas[SMALL])
78 | | -numpy_arrays[LARGE] = find_numpy_all(db[collection_names[LARGE]], {}, schema=schemas[LARGE])
| 38 | +LARGE_DOC_SIZE = 50
79 | 39 |
80 | 40 |
81 | | -class ProfileInsert:
| 41 | +# We have to use ABCs because ASV doesn't support any other way of skipping tests.
| 42 | +class Insert(ABC):
82 | 43 |     """
83 | 44 |     A benchmark that times the performance of various kinds
84 | 45 |     of inserting tabular data.
85 | 46 |     """
86 | 47 |
| 48 | +    timeout = 100000
| 49 | +
| 50 | +    @abc.abstractmethod
87 | 51 |     def setup(self):
88 | | -        db[collection_names[CUR_SIZE]].drop()
| 52 | +        raise NotImplementedError
89 | 53 |
90 | 54 |     def time_insert_arrow(self):
91 | | -        write(db[collection_names[CUR_SIZE]], arrow_tables[CUR_SIZE])
| 55 | +        write(db.benchmark, self.arrow_table)
92 | 56 |
93 | 57 |     def time_insert_conventional(self):
94 | | -        tab = arrow_tables[CUR_SIZE].to_pylist()
95 | | -        db[collection_names[CUR_SIZE]].insert_many(tab)
| 58 | +        tab = self.arrow_table.to_pylist()
| 59 | +        db.benchmark.insert_many(tab)
96 | 60 |
97 | 61 |     def time_insert_pandas(self):
98 | | -        write(db[collection_names[CUR_SIZE]], pandas_tables[CUR_SIZE])
| 62 | +        write(db.benchmark, self.pandas_table)
99 | 63 |
100 | 64 |     def time_insert_numpy(self):
101 | | -        write(db[collection_names[CUR_SIZE]], numpy_arrays[CUR_SIZE])
| 65 | +        write(db.benchmark, self.numpy_arrays)
102 | 66 |
103 | 67 |
104 | | -class ProfileRead:
| 68 | +class Read(ABC):
105 | 69 |     """
106 | 70 |     A benchmark that times the performance of various kinds
107 | 71 |     of reading MongoDB data.
108 | 72 |     """
109 | 73 |
| 74 | +    timeout = 100000
| 75 | +
| 76 | +    @abc.abstractmethod
110 | 77 |     def setup(self):
111 | | -        db[collection_names[CUR_SIZE]].drop()
| 78 | +        raise NotImplementedError
| 79 | +
| 80 | +    # We need this because the naive methods don't always convert nested objects.
| 81 | +    @staticmethod
| 82 | +    def exercise_table(table):
| 83 | +        pass
112 | 84 |
113 | 85 |     def time_conventional_ndarray(self):
114 | | -        collection = db[collection_names[CUR_SIZE]]
| 86 | +        collection = db.benchmark
115 | 87 |         cursor = collection.find()
116 | | -        dtype = dtypes[CUR_SIZE]
117 | | -
118 | | -        if CUR_SIZE == LARGE:
119 | | -            np.array([tuple(doc[k] for k in large_doc_keys) for doc in cursor], dtype=dtype)
| 88 | +        dtype = self.dtypes
| 89 | +        if "Large" in type(self).__name__:
| 90 | +            np.array([tuple(doc[k] for k in self.large_doc_keys) for doc in cursor], dtype=dtype)
120 | 91 |         else:
121 | 92 |             np.array([(doc["x"], doc["y"]) for doc in cursor], dtype=dtype)
122 | 93 |
123 | 94 |     def time_to_numpy(self):
124 | | -        c = db[collection_names[CUR_SIZE]]
125 | | -        schema = schemas[CUR_SIZE]
126 | | -        find_numpy_all(c, {}, schema=schema)
| 95 | +        c = db.benchmark
| 96 | +        find_numpy_all(c, {}, schema=self.schema)
127 | 97 |
128 | 98 |     def time_conventional_pandas(self):
129 | | -        collection = db[collection_names[CUR_SIZE]]
130 | | -        _ = dtypes[CUR_SIZE]
| 99 | +        collection = db.benchmark
131 | 100 |         cursor = collection.find(projection={"_id": 0})
132 | 101 |         _ = pd.DataFrame(list(cursor))
133 | 102 |
134 | 103 |     def time_to_pandas(self):
135 | | -        c = db[collection_names[CUR_SIZE]]
136 | | -        schema = schemas[CUR_SIZE]
137 | | -        find_pandas_all(c, {}, schema=schema)
| 104 | +        c = db.benchmark
| 105 | +        find_pandas_all(c, {}, schema=self.schema, projection={"_id": 0})
138 | 106 |
139 | 107 |     def time_to_arrow(self):
140 | | -        c = db[collection_names[CUR_SIZE]]
141 | | -        schema = schemas[CUR_SIZE]
142 | | -        find_arrow_all(c, {}, schema=schema)
| 108 | +        c = db.benchmark
| 109 | +        table = find_arrow_all(c, {}, schema=self.schema)
| 110 | +        self.exercise_table(table)
| 111 | +
| 112 | +    def time_conventional_arrow(self):
| 113 | +        c = db.benchmark
| 114 | +        f = list(c.find({}, projection={"_id": 0}))
| 115 | +        table = pyarrow.Table.from_pylist(f)
| 116 | +        self.exercise_table(table)
| 117 | +
| 118 | +
| 119 | +class ProfileReadArray(Read):
| 120 | +    def setup(self):
| 121 | +        coll = db.benchmark
| 122 | +        coll.drop()
| 123 | +        base_dict = collections.OrderedDict(
| 124 | +            [("x", 1), ("y", math.pi), ("emb", [math.pi for _ in range(64)])]
| 125 | +        )
| 126 | +        schema_dict = {"x": pyarrow.int64(), "y": pyarrow.float64()}
| 127 | +        dtypes_list = np.dtype([("x", np.int64), ("y", np.float64)])
| 128 | +        self.schema = Schema(schema_dict)
| 129 | +        self.dtypes = np.dtype(dtypes_list)
| 130 | +        coll.insert_many([base_dict.copy() for _ in range(N_DOCS)])
| 131 | +        print(
| 132 | +            "%d docs, %dk each with %d keys"
| 133 | +            % (N_DOCS, len(BSON.encode(base_dict)) // 1024, len(base_dict))
| 134 | +        )
| 135 | +
| 136 | +    # We need this because the naive methods don't always convert nested objects.
| 137 | +    @staticmethod
| 138 | +    def exercise_table(table):
| 139 | +        [
| 140 | +            [[n for n in i.values] if isinstance(i, pyarrow.ListScalar) else i for i in column]
| 141 | +            for column in table.columns
| 142 | +        ]
| 143 | +
| 144 | +    # All of the following tests are being skipped because NumPy/Pandas do not work with nested arrays.
| 145 | +    def time_to_numpy(self):
| 146 | +        pass
| 147 | +
| 148 | +    def time_to_pandas(self):
| 149 | +        pass
| 150 | +
| 151 | +    def time_conventional_ndarray(self):
| 152 | +        pass
| 153 | +
| 154 | +    def time_conventional_pandas(self):
| 155 | +        pass
| 156 | +
| 157 | +
| 158 | +class ProfileReadSmall(Read):
| 159 | +    schema = None
| 160 | +    dtypes = None
| 161 | +
| 162 | +    def setup(self):
| 163 | +        coll = db.benchmark
| 164 | +        coll.drop()
| 165 | +        base_dict = collections.OrderedDict(
| 166 | +            [
| 167 | +                ("x", 1),
| 168 | +                ("y", math.pi),
| 169 | +            ]
| 170 | +        )
| 171 | +        schema_dict = {"x": pyarrow.int64(), "y": pyarrow.float64()}
| 172 | +        dtypes_list = np.dtype([("x", np.int64), ("y", np.float64)])
| 173 | +        self.schema = Schema(schema_dict)
| 174 | +        self.dtypes = np.dtype(dtypes_list)
| 175 | +        coll.insert_many([base_dict.copy() for _ in range(N_DOCS)])
| 176 | +        print(
| 177 | +            "%d docs, %dk each with %d keys"
| 178 | +            % (N_DOCS, len(BSON.encode(base_dict)) // 1024, len(base_dict))
| 179 | +        )
| 180 | +
| 181 | +
| 182 | +class ProfileReadLarge(Read):
| 183 | +    schema = None
| 184 | +    dtypes = None
| 185 | +
| 186 | +    def setup(self):
| 187 | +        coll = db.benchmark
| 188 | +        coll.drop()
| 189 | +        large_doc_keys = self.large_doc_keys = [f"a{i}" for i in range(LARGE_DOC_SIZE)]
| 190 | +        base_dict = collections.OrderedDict([(k, math.pi) for k in large_doc_keys])
| 191 | +        dtypes_list = np.dtype([(k, np.float64) for k in large_doc_keys])
| 192 | +        schema_dict = {k: pyarrow.float64() for k in large_doc_keys}
| 193 | +        self.schema = Schema(schema_dict)
| 194 | +        self.dtypes = np.dtype(dtypes_list)
| 195 | +        coll.insert_many([base_dict.copy() for _ in range(N_DOCS)])
| 196 | +        print(
| 197 | +            "%d docs, %dk each with %d keys"
| 198 | +            % (N_DOCS, len(BSON.encode(base_dict)) // 1024, len(base_dict))
| 199 | +        )
| 200 | +
| 201 | +
| 202 | +class ProfileInsertSmall(Insert):
| 203 | +    arrow_table = None
| 204 | +    pandas_table = None
| 205 | +    numpy_arrays = None
| 206 | +    dtypes = None
| 207 | +
| 208 | +    def setup(self):
| 209 | +        coll = db.benchmark
| 210 | +        coll.drop()
| 211 | +        base_dict = collections.OrderedDict([("x", 1), ("y", math.pi)])
| 212 | +        dtypes_list = np.dtype([("x", np.int64), ("y", np.float64)])
| 213 | +        self.dtypes = np.dtype(dtypes_list)
| 214 | +        coll.insert_many([base_dict.copy() for _ in range(N_DOCS)])
| 215 | +        print(
| 216 | +            "%d docs, %dk each with %d keys"
| 217 | +            % (N_DOCS, len(BSON.encode(base_dict)) // 1024, len(base_dict))
| 218 | +        )
| 219 | +        schema = Schema({"x": pyarrow.int64(), "y": pyarrow.float64()})
| 220 | +
| 221 | +        self.arrow_table = find_arrow_all(db.benchmark, {}, schema=schema)
| 222 | +        self.pandas_table = find_pandas_all(db.benchmark, {}, schema=schema)
| 223 | +        self.numpy_arrays = find_numpy_all(db.benchmark, {}, schema=schema)
| 224 | +
| 225 | +
| 226 | +class ProfileInsertLarge(Insert):
| 227 | +    arrow_table = None
| 228 | +    pandas_table = None
| 229 | +    numpy_arrays = None
| 230 | +    dtypes = None
| 231 | +
| 232 | +    def setup(self):
| 233 | +        coll = db.benchmark
| 234 | +        coll.drop()
| 235 | +        large_doc_keys = [f"a{i}" for i in range(LARGE_DOC_SIZE)]
| 236 | +        base_dict = collections.OrderedDict([(k, math.pi) for k in large_doc_keys])
| 237 | +        dtypes_list = np.dtype([(k, np.float64) for k in large_doc_keys])
| 238 | +        self.dtypes = np.dtype(dtypes_list)
| 239 | +        coll.insert_many([base_dict.copy() for _ in range(N_DOCS)])
| 240 | +        print(
| 241 | +            "%d docs, %dk each with %d keys"
| 242 | +            % (N_DOCS, len(BSON.encode(base_dict)) // 1024, len(base_dict))
| 243 | +        )
| 244 | +        schema = Schema({k: pyarrow.float64() for k in large_doc_keys})
| 245 | +        self.arrow_table = find_arrow_all(db.benchmark, {}, schema=schema)
| 246 | +        self.pandas_table = find_pandas_all(db.benchmark, {}, schema=schema)
| 247 | +        self.numpy_arrays = find_numpy_all(db.benchmark, {}, schema=schema)
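
A note on the ABC comment at new line 41: asv has no explicit skip decorator, but a benchmark whose `setup` raises `NotImplementedError` is documented to be skipped, which is what the abstract base classes above rely on. A minimal sketch of that pattern, separate from this commit (all class and attribute names here are hypothetical):

```python
import abc
from abc import ABC


class BaseBench(ABC):
    """Abstract benchmark; asv collects it but skips its timings."""

    @abc.abstractmethod
    def setup(self):
        # asv treats NotImplementedError raised in setup() as
        # "skip this benchmark", so the base class never runs.
        raise NotImplementedError

    def time_sum(self):
        sum(self.data)


class ConcreteBench(BaseBench):
    """Concrete subclass; setup() succeeds, so asv times time_sum()."""

    def setup(self):
        self.data = list(range(10_000))  # hypothetical payload
```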
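For context on the API under test, here is a rough sketch of the read/write round-trip these classes time, assuming a local `mongod` on the default port and `pymongoarrow` installed (the collection name mirrors `db.benchmark` in the diff):

```python
import math

import pymongo
import pyarrow
from pymongoarrow.api import Schema, find_arrow_all, write

coll = pymongo.MongoClient().pymongoarrow_test.benchmark
coll.drop()
coll.insert_many([{"x": 1, "y": math.pi} for _ in range(1000)])

# What time_to_arrow measures: materialize the collection as an Arrow table.
schema = Schema({"x": pyarrow.int64(), "y": pyarrow.float64()})
table = find_arrow_all(coll, {}, schema=schema)

# What time_insert_arrow measures: write the tabular data back to MongoDB.
coll.drop()
write(coll, table)
```

Each concrete class builds its test data in `setup()` (which asv calls before every repeat), so only the conversion and round-trip costs land in the timings.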