Skip to content

Commit 4fb6fcc

Browse files
authored
ARROW-145 Update benchmark test to include Embedded Documents (#144)
1 parent edaa054 commit 4fb6fcc

File tree

1 file changed

+56
-4
lines changed

1 file changed

+56
-4
lines changed

bindings/python/benchmarks/benchmarks.py

Lines changed: 56 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,9 @@
3636
db = pymongo.MongoClient().pymongoarrow_test
3737

3838
LARGE_DOC_SIZE = 50
# How many values or key/value pairs each embedded object (array or document)
# carries in the nested-document benchmarks.
EMBEDDED_OBJECT_SIZE = 64
3942

4043

4144
# We have to use ABCs because ASV doesn't support any other way of skipping tests.
@@ -121,12 +124,14 @@ def setup(self):
121124
coll = db.benchmark
122125
coll.drop()
123126
base_dict = collections.OrderedDict(
124-
[("x", 1), ("y", math.pi), ("emb", [math.pi for _ in range(64)])]
127+
[("x", 1), ("y", math.pi), ("emb", [math.pi for _ in range(EMBEDDED_OBJECT_SIZE)])]
125128
)
126-
schema_dict = {"x": pyarrow.int64(), "y": pyarrow.float64()}
127-
dtypes_list = np.dtype([("x", np.int64), ("y", np.float64)])
129+
schema_dict = {
130+
"x": pyarrow.int64(),
131+
"y": pyarrow.float64(),
132+
"emb": pyarrow.list_(pyarrow.float64()),
133+
}
128134
self.schema = Schema(schema_dict)
129-
self.dtypes = np.dtype(dtypes_list)
130135
coll.insert_many([base_dict.copy() for _ in range(N_DOCS)])
131136
print(
132137
"%d docs, %dk each with %d keys"
@@ -155,6 +160,53 @@ def time_conventional_pandas(self):
155160
pass
156161

157162

163+
class ProfileReadDocument(Read):
    """Read benchmark whose documents carry an embedded sub-document.

    Each benchmarked document has two scalar fields plus an "emb" field
    holding EMBEDDED_OBJECT_SIZE float key/value pairs, mapped to a
    pyarrow struct in the schema.
    """

    def setup(self):
        collection = db.benchmark
        collection.drop()
        # Build the template document; OrderedDict keeps the BSON key order stable.
        document = collections.OrderedDict()
        document["x"] = 1
        document["y"] = math.pi
        document["emb"] = {f"a{i}": math.pi for i in range(EMBEDDED_OBJECT_SIZE)}
        # Mirror the document layout in the Arrow schema: the embedded
        # document becomes a struct with one float64 field per key.
        struct_fields = [
            pyarrow.field(f"a{i}", pyarrow.float64()) for i in range(EMBEDDED_OBJECT_SIZE)
        ]
        self.schema = Schema(
            {
                "x": pyarrow.int64(),
                "y": pyarrow.float64(),
                "emb": pyarrow.struct(struct_fields),
            }
        )
        collection.insert_many([document.copy() for _ in range(N_DOCS)])
        print(
            "%d docs, %dk each with %d keys"
            % (N_DOCS, len(BSON.encode(document)) // 1024, len(document))
        )

    # We need this because the naive methods don't always convert nested objects.
    @staticmethod
    def exercise_table(table):
        materialized = []
        for column in table.columns:
            cells = []
            for cell in column:
                if isinstance(cell, pyarrow.StructScalar):
                    # Force conversion of every field inside the struct.
                    cells.append([value for value in cell.values()])
                else:
                    cells.append(cell)
            materialized.append(cells)

    # All of the following tests are being skipped because NumPy/Pandas do not work with nested documents.
    def time_to_numpy(self):
        pass

    def time_to_pandas(self):
        pass

    def time_conventional_ndarray(self):
        pass

    def time_conventional_pandas(self):
        pass
208+
209+
158210
class ProfileReadSmall(Read):
159211
schema = None
160212
dtypes = None

0 commit comments

Comments
 (0)