8
8
9
9
import numpy as np
10
10
import pandas as pd
11
+ import pyarrow
11
12
import pymongo
12
-
13
13
from bson import BSON , CodecOptions , Int64 , ObjectId
14
14
from bson .raw_bson import RawBSONDocument
15
-
16
- import pyarrow
17
- from pymongoarrow .api import find_arrow_all , find_numpy_all , Schema , find_pandas_all
15
+ from pymongoarrow .api import Schema , find_arrow_all , find_numpy_all , find_pandas_all
18
16
19
17
assert pymongo .has_c ()
20
18
@@ -39,45 +37,49 @@ def _setup():
39
37
small = db [collection_names [SMALL ]]
40
38
small .drop ()
41
39
42
- print ("%d small docs, %d bytes each with 3 keys" % (
43
- N_SMALL_DOCS ,
44
- len (BSON .encode ({'_id' : ObjectId (), 'x' : 1 , 'y' : math .pi }))))
40
+ print (
41
+ "%d small docs, %d bytes each with 3 keys"
42
+ % (N_SMALL_DOCS , len (BSON .encode ({"_id" : ObjectId (), "x" : 1 , "y" : math .pi })))
43
+ )
45
44
46
- small .insert_many ([
47
- collections .OrderedDict ([('x' , 1 ), ('y' , math .pi )])
48
- for _ in range ( N_SMALL_DOCS )] )
45
+ small .insert_many (
46
+ [ collections .OrderedDict ([("x" , 1 ), ("y" , math .pi )]) for _ in range ( N_SMALL_DOCS )]
47
+ )
49
48
50
- dtypes [SMALL ] = np .dtype ([('x' , np .int64 ), ('y' , np .float64 )])
51
- schemas [SMALL ] = Schema ({'x' : pyarrow .int64 (), 'y' : pyarrow .float64 ()})
49
+ dtypes [SMALL ] = np .dtype ([("x" , np .int64 ), ("y" , np .float64 )])
50
+ schemas [SMALL ] = Schema ({"x" : pyarrow .int64 (), "y" : pyarrow .float64 ()})
52
51
53
52
large = db [collection_names [LARGE ]]
54
53
large .drop ()
55
54
# 2600 keys: 'a', 'aa', 'aaa', .., 'zz..z'
56
- large_doc_keys = [c * i for c in string .ascii_lowercase
57
- for i in range (1 , 101 )]
55
+ large_doc_keys = [c * i for c in string .ascii_lowercase for i in range (1 , 101 )]
58
56
large_doc = collections .OrderedDict ([(k , math .pi ) for k in large_doc_keys ])
59
- print ("%d large docs, %dk each with %d keys" % (
60
- N_LARGE_DOCS , len (BSON .encode (large_doc )) // 1024 , len (large_doc_keys )))
57
+ print (
58
+ "%d large docs, %dk each with %d keys"
59
+ % (N_LARGE_DOCS , len (BSON .encode (large_doc )) // 1024 , len (large_doc_keys ))
60
+ )
61
61
62
62
large .insert_many ([large_doc .copy () for _ in range (N_LARGE_DOCS )])
63
63
64
64
dtypes [LARGE ] = np .dtype ([(k , np .float64 ) for k in large_doc_keys ])
65
65
schemas [LARGE ] = Schema ({k : pyarrow .float64 () for k in large_doc_keys })
66
66
67
67
# Ignore for now that the first batch defaults to 101 documents.
68
- raw_bson_docs_small = [{'x' : 1 , 'y' : math .pi } for _ in range (N_SMALL_DOCS )]
69
- raw_bson_small = BSON .encode ({'ok' : 1 ,
70
- 'cursor' : {
71
- 'id' : Int64 (1234 ),
72
- 'ns' : 'db.collection' ,
73
- 'firstBatch' : raw_bson_docs_small }})
68
+ raw_bson_docs_small = [{"x" : 1 , "y" : math .pi } for _ in range (N_SMALL_DOCS )]
69
+ raw_bson_small = BSON .encode (
70
+ {
71
+ "ok" : 1 ,
72
+ "cursor" : {"id" : Int64 (1234 ), "ns" : "db.collection" , "firstBatch" : raw_bson_docs_small },
73
+ }
74
+ )
74
75
75
76
raw_bson_docs_large = [large_doc .copy () for _ in range (N_LARGE_DOCS )]
76
- raw_bson_large = BSON .encode ({'ok' : 1 ,
77
- 'cursor' : {
78
- 'id' : Int64 (1234 ),
79
- 'ns' : 'db.collection' ,
80
- 'firstBatch' : raw_bson_docs_large }})
77
+ raw_bson_large = BSON .encode (
78
+ {
79
+ "ok" : 1 ,
80
+ "cursor" : {"id" : Int64 (1234 ), "ns" : "db.collection" , "firstBatch" : raw_bson_docs_large },
81
+ }
82
+ )
81
83
82
84
raw_bsons [SMALL ] = raw_bson_small
83
85
raw_bsons [LARGE ] = raw_bson_large
@@ -99,62 +101,61 @@ def assign_name(fn):
99
101
return assign_name
100
102
101
103
102
@bench("conventional-to-ndarray")
def conventional_ndarray(use_large):
    """Read a collection with a plain cursor and build a NumPy structured array.

    ``use_large`` is used as the key into ``collection_names`` and ``dtypes``;
    when truthy every key in ``large_doc_keys`` is read from each document,
    otherwise only ``x`` and ``y`` are read.

    The resulting array is not kept.
    """
    cursor = db[collection_names[use_large]].find()
    dtype = dtypes[use_large]

    # Each document becomes one tuple whose field order matches the dtype.
    if use_large:
        rows = [tuple(doc[k] for k in large_doc_keys) for doc in cursor]
    else:
        rows = [(doc["x"], doc["y"]) for doc in cursor]

    np.array(rows, dtype=dtype)
113
114
114
115
115
116
# Note: this is called "to-numpy" and not "to-ndarray" because find_numpy_all
# does not produce an ndarray.
@bench("pymongoarrow-to-numpy")
def to_numpy(use_large):
    """Run ``find_numpy_all`` over the selected collection.

    ``use_large`` is used as the key into ``collection_names`` and ``schemas``.
    The query filter is empty and the result is not kept.
    """
    find_numpy_all(db[collection_names[use_large]], {}, schema=schemas[use_large])
122
123
123
124
124
@bench("conventional-to-pandas")
def conventional_pandas(use_large):
    """Read a collection with a plain cursor and build a pandas DataFrame.

    ``use_large`` is used as the key into ``collection_names``. The ``_id``
    field is excluded through the projection so that only the data fields end
    up as columns. The resulting frame is not kept.
    """
    # The projection key must be exactly "_id"; any other spelling leaves the
    # ObjectId column in the result.
    cursor = db[collection_names[use_large]].find(projection={"_id": 0})
    pd.DataFrame(list(cursor))
130
131
131
132
132
@bench("pymongoarrow-to-pandas")
def to_pandas(use_large):
    """Run ``find_pandas_all`` over the selected collection.

    ``use_large`` is used as the key into ``collection_names`` and ``schemas``.
    The query filter is empty and the result is not kept.
    """
    find_pandas_all(db[collection_names[use_large]], {}, schema=schemas[use_large])
137
138
138
139
139
@bench("pymongoarrow-to-arrow")
def to_arrow(use_large):
    """Run ``find_arrow_all`` over the selected collection.

    ``use_large`` is used as the key into ``collection_names`` and ``schemas``.
    The query filter is empty and the result is not kept.
    """
    find_arrow_all(db[collection_names[use_large]], {}, schema=schemas[use_large])
144
145
145
146
146
- parser = argparse .ArgumentParser (formatter_class = argparse .RawTextHelpFormatter ,
147
- epilog = """
147
+ parser = argparse .ArgumentParser (
148
+ formatter_class = argparse .RawTextHelpFormatter ,
149
+ epilog = """
148
150
Available benchmark functions:
149
151
%s
150
- """ % ("\n " .join (bench_fns .keys ()),))
151
- parser .add_argument ('--large' , action = 'store_true' ,
152
- help = 'only test with large documents' )
153
- parser .add_argument ('--small' , action = 'store_true' ,
154
- help = 'only test with small documents' )
155
- parser .add_argument ('--test' , action = 'store_true' ,
156
- help = 'quick test of benchmark.py' )
157
- parser .add_argument ('funcs' , nargs = '*' , default = bench_fns .keys ())
152
+ """
153
+ % ("\n " .join (bench_fns .keys ()),),
154
+ )
155
+ parser .add_argument ("--large" , action = "store_true" , help = "only test with large documents" )
156
+ parser .add_argument ("--small" , action = "store_true" , help = "only test with small documents" )
157
+ parser .add_argument ("--test" , action = "store_true" , help = "quick test of benchmark.py" )
158
+ parser .add_argument ("funcs" , nargs = "*" , default = bench_fns .keys ())
158
159
options = parser .parse_args ()
159
160
160
161
if options .test :
@@ -175,7 +176,7 @@ def to_arrow(use_large):
175
176
176
177
for name in options .funcs :
177
178
if name not in bench_fns :
178
- sys .stderr .write (" Unknown function \ " %s\ "\n " % name )
179
+ sys .stderr .write (' Unknown function "%s"\n ' % name )
179
180
sys .stderr .write ("Available functions:\n %s\n " % ("\n " .join (bench_fns )))
180
181
sys .exit (1 )
181
182
0 commit comments