Skip to content

Commit 4752e45

Browse files
authored
Refactor python benchmarks (#11)
This PR attempts to make all arguments and outputs consistent across all Python (sklearn and daal4py) benchmarks here.

- Support --header and --verbose in all Python benchmarks.
- Support all previous bookkeeping options, but in all benchmarks: --batch, --arch, --prefix.
- Support multiple dtypes in certain benchmarks.
- Add a mechanism for supporting sklearn n_jobs (currently disabled).
- Detect and report the number of threads used in daal4py.
- Use the same measurement methodology (timeit.default_timer with mean of inner loops and min of outer loops) for all Python benchmarks.
- Update the Makefile.
- Run Python benchmarks through flake8.

Adding new Python benchmarks should be very easy after this change, since many useful functions are already available in {sklearn,daal4py}/bench.py.
1 parent 62f153d commit 4752e45

25 files changed

+1692
-1991
lines changed

Makefile

Lines changed: 47 additions & 150 deletions
Original file line numberDiff line numberDiff line change
@@ -29,21 +29,20 @@ SHELL = bash -o pipefail
2929

3030
# Other options
3131
NUM_THREADS = -1
32-
SVM_NUM_THREADS = 0
33-
DFCLF_NUM_THREADS = 0
34-
DFREG_NUM_THREADS = 0
35-
LOGREG_NUM_THREADS = 0
3632
MULTIPLIER = 100
3733
DATA_DIR = data/
3834
DATA_kmeans = data/kmeans_$(KMEANS_SIZE).npy
3935

36+
COMMON_ARGS = --batch '$(BATCH)' --arch '$(HOST)' \
37+
--num-threads '$(NUM_THREADS)' --header
38+
4039
# Define which benchmarks to run
4140
NATIVE_BENCHMARKS = distances ridge linear kmeans svm2 svm5 \
4241
logreg2 logreg5 dfclf2 dfclf5 dfreg pca_daal pca_full
4342
SKLEARN_BENCHMARKS = distances ridge linear kmeans svm2 svm5 \
4443
logreg2 logreg5 dfclf2 dfclf5 dfreg pca_full
4544
DAAL4PY_BENCHMARKS = distances ridge linear kmeans svm2 svm5 \
46-
pca_daal pca_full
45+
logreg2 logreg5 dfclf2 dfclf5 dfreg pca_daal pca_full
4746

4847
# Define native benchmark binary names
4948
NATIVE_distances = distances
@@ -61,30 +60,21 @@ NATIVE_pca_daal = pca
6160
NATIVE_pca_full = pca
6261

6362
# Define arguments for native benchmarks
64-
ARGS_NATIVE_distances = --batch "$(BATCH)" --arch "$(HOST)" \
65-
--num-threads "$(NUM_THREADS)" \
63+
ARGS_NATIVE_distances = --num-threads "$(NUM_THREADS)" \
6664
--size "$(DISTANCES_SIZE)" --header
67-
ARGS_NATIVE_ridge = --batch "$(BATCH)" --arch "$(HOST)" \
68-
--num-threads "$(NUM_THREADS)" \
65+
ARGS_NATIVE_ridge = --num-threads "$(NUM_THREADS)" \
6966
--size "$(REGRESSION_SIZE)" --header
70-
ARGS_NATIVE_linear = --batch "$(BATCH)" --arch "$(HOST)" \
71-
--num-threads "$(NUM_THREADS)" \
67+
ARGS_NATIVE_linear = --num-threads "$(NUM_THREADS)" \
7268
--size "$(REGRESSION_SIZE)" --header
73-
ARGS_NATIVE_pca_daal = --batch "$(BATCH)" --arch "$(HOST)" \
74-
--num-threads "$(NUM_THREADS)" \
75-
--size "$(REGRESSION_SIZE)" --header \
76-
--svd-solver daal
77-
ARGS_NATIVE_pca_full = --batch "$(BATCH)" --arch "$(HOST)" \
78-
--num-threads "$(NUM_THREADS)" \
79-
--size "$(REGRESSION_SIZE)" --header \
80-
--svd-solver full
81-
ARGS_NATIVE_kmeans = --batch "$(BATCH)" --arch "$(HOST)" \
82-
--num-threads "$(NUM_THREADS)" \
69+
ARGS_NATIVE_pca_daal = --num-threads "$(NUM_THREADS)" --header \
70+
--size "$(REGRESSION_SIZE)" --svd-solver daal
71+
ARGS_NATIVE_pca_full = --num-threads "$(NUM_THREADS)" --header \
72+
--size "$(REGRESSION_SIZE)" --svd-solver full
73+
ARGS_NATIVE_kmeans = --num-threads "$(NUM_THREADS)" --header \
8374
--data-multiplier "$(MULTIPLIER)" \
8475
--filex data/kmeans_$(KMEANS_SIZE).npy \
8576
--filei data/kmeans_$(KMEANS_SIZE).init.npy \
86-
--filet data/kmeans_$(KMEANS_SIZE).tol.npy \
87-
--header
77+
--filet data/kmeans_$(KMEANS_SIZE).tol.npy
8878
ARGS_NATIVE_svm2 = --fileX data/two/X-$(SVM_SIZE).npy \
8979
--fileY data/two/y-$(SVM_SIZE).npy \
9080
--num-threads $(SVM_NUM_THREADS) --header
@@ -113,133 +103,73 @@ SKLEARN_linear = linear
113103
SKLEARN_pca_full = pca
114104
SKLEARN_pca_daal = pca
115105
SKLEARN_kmeans = kmeans
116-
SKLEARN_svm2 = svm_bench
117-
SKLEARN_svm5 = svm_bench
106+
SKLEARN_svm2 = svm
107+
SKLEARN_svm5 = svm
118108
SKLEARN_logreg2 = log_reg
119109
SKLEARN_logreg5 = log_reg
120110
SKLEARN_dfclf2 = df_clsf
121111
SKLEARN_dfclf5 = df_clsf
122112
SKLEARN_dfreg = df_regr
123113

124-
ARGS_SKLEARN_distances = --batchID "$(BATCH)" --arch "$(HOST)" \
125-
--num-threads "$(NUM_THREADS)" \
126-
--size "$(DISTANCES_SIZE)" \
127-
--iteration "$(ITERATIONS)" --prefix sklearn
128-
ARGS_SKLEARN_ridge = --batchID "$(BATCH)" --arch "$(HOST)" \
129-
--num-threads "$(NUM_THREADS)" \
130-
--size "$(REGRESSION_SIZE)" \
131-
--iteration "$(ITERATIONS)" --prefix sklearn
132-
ARGS_SKLEARN_linear = --batchID "$(BATCH)" --arch "$(HOST)" \
133-
--num-threads "$(NUM_THREADS)" \
134-
--size "$(REGRESSION_SIZE)" \
135-
--iteration "$(ITERATIONS)" --prefix sklearn
136-
ARGS_SKLEARN_pca_daal = --batchID "$(BATCH)" --arch "$(HOST)" \
137-
--num-threads "$(NUM_THREADS)" \
138-
--size "$(REGRESSION_SIZE)" \
139-
--iteration "$(ITERATIONS)" --prefix sklearn \
140-
--svd-solver daal
141-
ARGS_SKLEARN_pca_full = --batchID "$(BATCH)" --arch "$(HOST)" \
142-
--num-threads "$(NUM_THREADS)" \
143-
--size "$(REGRESSION_SIZE)" \
144-
--iteration "$(ITERATIONS)" --prefix sklearn \
145-
--svd-solver full
146-
ARGS_SKLEARN_kmeans = --batchID "$(BATCH)" --arch "$(HOST)" \
147-
--num-threads "$(NUM_THREADS)" \
148-
--data-multiplier "$(MULTIPLIER)" \
114+
ARGS_SKLEARN_distances = --size "$(DISTANCES_SIZE)"
115+
ARGS_SKLEARN_ridge = --size "$(REGRESSION_SIZE)"
116+
ARGS_SKLEARN_linear = --size "$(REGRESSION_SIZE)"
117+
ARGS_SKLEARN_pca_daal = --size "$(REGRESSION_SIZE)" --svd-solver daal
118+
ARGS_SKLEARN_pca_full = --size "$(REGRESSION_SIZE)" --svd-solver full
119+
ARGS_SKLEARN_kmeans = --data-multiplier "$(MULTIPLIER)" \
149120
--filex data/kmeans_$(KMEANS_SIZE).npy \
150-
--filei data/kmeans_$(KMEANS_SIZE).init.npy \
151-
--iteration "$(ITERATIONS)" --prefix sklearn \
152-
--size "$(KMEANS_SIZE)"
121+
--filei data/kmeans_$(KMEANS_SIZE).init.npy
153122
ARGS_SKLEARN_svm2 = --fileX data/two/X-$(SVM_SIZE).npy \
154-
--fileY data/two/y-$(SVM_SIZE).npy \
155-
--num-threads $(SVM_NUM_THREADS) --header
123+
--fileY data/two/y-$(SVM_SIZE).npy
156124
ARGS_SKLEARN_svm5 = --fileX data/multi/X-$(SVM_SIZE).npy \
157-
--fileY data/multi/y-$(SVM_SIZE).npy \
158-
--num-threads $(SVM_NUM_THREADS) --header
125+
--fileY data/multi/y-$(SVM_SIZE).npy
159126
ARGS_SKLEARN_logreg2 = --fileX data/two/X-$(LOGREG_SIZE).npy \
160-
--fileY data/two/y-$(LOGREG_SIZE).npy \
161-
--num-threads $(LOGREG_NUM_THREADS) --header \
162-
--multiclass ovr
127+
--fileY data/two/y-$(LOGREG_SIZE).npy
163128
ARGS_SKLEARN_logreg5 = --fileX data/multi/X-$(LOGREG_SIZE).npy \
164-
--fileY data/multi/y-$(LOGREG_SIZE).npy \
165-
--num-threads $(LOGREG_NUM_THREADS) --header \
166-
--multiclass multinomial
129+
--fileY data/multi/y-$(LOGREG_SIZE).npy
167130
ARGS_SKLEARN_dfclf2 = --fileX data/two/X-$(DFCLF_SIZE).npy \
168-
--fileY data/two/y-$(DFCLF_SIZE).npy \
169-
--num-threads $(DFCLF_NUM_THREADS) --header
131+
--fileY data/two/y-$(DFCLF_SIZE).npy
170132
ARGS_SKLEARN_dfclf5 = --fileX data/multi/X-$(DFCLF_SIZE).npy \
171-
--fileY data/multi/y-$(DFCLF_SIZE).npy \
172-
--num-threads $(DFCLF_NUM_THREADS) --header
133+
--fileY data/multi/y-$(DFCLF_SIZE).npy
173134
ARGS_SKLEARN_dfreg = --fileX data/multi/X-$(DFREG_SIZE).npy \
174-
--fileY data/multi/y-$(DFREG_SIZE).npy \
175-
--num-threads $(DFREG_NUM_THREADS) --header
135+
--fileY data/multi/y-$(DFREG_SIZE).npy
176136

177137
DAAL4PY_distances = distances
178138
DAAL4PY_ridge = ridge
179139
DAAL4PY_linear = linear
180140
DAAL4PY_pca_full = pca
181141
DAAL4PY_pca_daal = pca
182142
DAAL4PY_kmeans = kmeans
183-
DAAL4PY_svm2 = svm_bench
184-
DAAL4PY_svm5 = svm_bench
143+
DAAL4PY_svm2 = svm
144+
DAAL4PY_svm5 = svm
185145
DAAL4PY_logreg2 = log_reg
186146
DAAL4PY_logreg5 = log_reg
187147
DAAL4PY_dfclf2 = df_clsf
188148
DAAL4PY_dfclf5 = df_clsf
189149
DAAL4PY_dfreg = df_regr
190150

191-
ARGS_DAAL4PY_distances = --batchID "$(BATCH)" --arch "$(HOST)" \
192-
--num-threads "$(NUM_THREADS)" \
193-
--size "$(DISTANCES_SIZE)" \
194-
--iteration "$(ITERATIONS)" --prefix daal4py
195-
ARGS_DAAL4PY_ridge = --batchID "$(BATCH)" --arch "$(HOST)" \
196-
--num-threads "$(NUM_THREADS)" \
197-
--size "$(REGRESSION_SIZE)" \
198-
--iteration "$(ITERATIONS)" --prefix daal4py
199-
ARGS_DAAL4PY_linear = --batchID "$(BATCH)" --arch "$(HOST)" \
200-
--num-threads "$(NUM_THREADS)" \
201-
--size "$(REGRESSION_SIZE)" \
202-
--iteration "$(ITERATIONS)" --prefix daal4py
203-
ARGS_DAAL4PY_pca_daal = --batchID "$(BATCH)" --arch "$(HOST)" \
204-
--num-threads "$(NUM_THREADS)" \
205-
--size "$(REGRESSION_SIZE)" \
206-
--iteration "$(ITERATIONS)" --prefix daal4py \
207-
--svd-solver daal
208-
ARGS_DAAL4PY_pca_full = --batchID "$(BATCH)" --arch "$(HOST)" \
209-
--num-threads "$(NUM_THREADS)" \
210-
--size "$(REGRESSION_SIZE)" \
211-
--iteration "$(ITERATIONS)" --prefix daal4py \
212-
--svd-solver full
213-
ARGS_DAAL4PY_kmeans = --batchID "$(BATCH)" --arch "$(HOST)" \
214-
--num-threads "$(NUM_THREADS)" \
215-
--data-multiplier "$(MULTIPLIER)" \
151+
ARGS_DAAL4PY_distances = --size "$(DISTANCES_SIZE)"
152+
ARGS_DAAL4PY_ridge = --size "$(REGRESSION_SIZE)"
153+
ARGS_DAAL4PY_linear = --size "$(REGRESSION_SIZE)"
154+
ARGS_DAAL4PY_pca_daal = --size "$(REGRESSION_SIZE)" --svd-solver daal
155+
ARGS_DAAL4PY_pca_full = --size "$(REGRESSION_SIZE)" --svd-solver full
156+
ARGS_DAAL4PY_kmeans = --data-multiplier "$(MULTIPLIER)" \
216157
--filex data/kmeans_$(KMEANS_SIZE).npy \
217-
--filei data/kmeans_$(KMEANS_SIZE).init.npy \
218-
--iteration "$(ITERATIONS)" --prefix daal4py \
219-
--size "$(KMEANS_SIZE)"
158+
--filei data/kmeans_$(KMEANS_SIZE).init.npy
220159
ARGS_DAAL4PY_svm2 = --fileX data/two/X-$(SVM_SIZE).npy \
221-
--fileY data/two/y-$(SVM_SIZE).npy \
222-
--num-threads $(SVM_NUM_THREADS) --header
160+
--fileY data/two/y-$(SVM_SIZE).npy
223161
ARGS_DAAL4PY_svm5 = --fileX data/multi/X-$(SVM_SIZE).npy \
224-
--fileY data/multi/y-$(SVM_SIZE).npy \
225-
--num-threads $(SVM_NUM_THREADS) --header
162+
--fileY data/multi/y-$(SVM_SIZE).npy
226163
ARGS_DAAL4PY_logreg2 = --fileX data/two/X-$(LOGREG_SIZE).npy \
227-
--fileY data/two/y-$(LOGREG_SIZE).npy \
228-
--num-threads $(LOGREG_NUM_THREADS) --header \
229-
--multiclass ovr
164+
--fileY data/two/y-$(LOGREG_SIZE).npy
230165
ARGS_DAAL4PY_logreg5 = --fileX data/multi/X-$(LOGREG_SIZE).npy \
231-
--fileY data/multi/y-$(LOGREG_SIZE).npy \
232-
--num-threads $(LOGREG_NUM_THREADS) --header \
233-
--multiclass multinomial
166+
--fileY data/multi/y-$(LOGREG_SIZE).npy
234167
ARGS_DAAL4PY_dfclf2 = --fileX data/two/X-$(DFCLF_SIZE).npy \
235-
--fileY data/two/y-$(DFCLF_SIZE).npy \
236-
--num-threads $(DFCLF_NUM_THREADS) --header
168+
--fileY data/two/y-$(DFCLF_SIZE).npy
237169
ARGS_DAAL4PY_dfclf5 = --fileX data/multi/X-$(DFCLF_SIZE).npy \
238-
--fileY data/multi/y-$(DFCLF_SIZE).npy \
239-
--num-threads $(DFCLF_NUM_THREADS) --header
170+
--fileY data/multi/y-$(DFCLF_SIZE).npy
240171
ARGS_DAAL4PY_dfreg = --fileX data/multi/X-$(DFREG_SIZE).npy \
241-
--fileY data/multi/y-$(DFREG_SIZE).npy \
242-
--num-threads $(DFREG_NUM_THREADS) --header
172+
--fileY data/multi/y-$(DFREG_SIZE).npy
243173

244174
comma = ,
245175

@@ -263,10 +193,10 @@ output/native/%.out: | DATA_% output/native/
263193
native/bin/$(NATIVE_$*) $(ARGS_NATIVE_$*) | tee $@
264194

265195
output/sklearn/%.out: | DATA_% output/sklearn/
266-
python sklearn/$(SKLEARN_$*).py $(ARGS_SKLEARN_$*) | tee $@
196+
python sklearn/$(SKLEARN_$*).py $(COMMON_ARGS) $(ARGS_SKLEARN_$*) | tee $@
267197

268198
output/daal4py/%.out: | DATA_% output/daal4py/
269-
python daal4py/$(DAAL4PY_$*).py $(ARGS_DAAL4PY_$*) | tee $@
199+
python daal4py/$(DAAL4PY_$*).py $(COMMON_ARGS) $(ARGS_DAAL4PY_$*) | tee $@
270200

271201
output/%/:
272202
mkdir -p $@
@@ -277,39 +207,6 @@ sklearn: $(addsuffix .out,$(addprefix output/sklearn/,$(SKLEARN_BENCHMARKS))) da
277207

278208
daal4py: $(addsuffix .out,$(addprefix output/daal4py/,$(DAAL4PY_BENCHMARKS))) data
279209

280-
281-
daal4py_: data
282-
@echo "# Running daal4py benchmarks"
283-
python daal4py/distances.py --batchID $(BATCH) --arch $(HOST) \
284-
--prefix python --core-number $(NUM_THREADS) \
285-
--size $(subst x,$(comma),$(DISTANCES_SIZE)) --iteration $(ITERATIONS)
286-
python daal4py/ridge.py --batchID $(BATCH) --arch $(HOST) \
287-
--prefix python --core-number $(NUM_THREADS) \
288-
--size $(subst x,$(comma),$(REGRESSION_SIZE)) --iteration $(ITERATIONS)
289-
python daal4py/linear.py --batchID $(BATCH) --arch $(HOST) \
290-
--prefix python --core-number $(NUM_THREADS) \
291-
--size $(subst x,$(comma),$(REGRESSION_SIZE)) --iteration $(ITERATIONS)
292-
python daal4py/kmeans.py --batchID $(BATCH) --arch $(HOST) \
293-
--prefix python --core-number $(NUM_THREADS) \
294-
--size $(subst x,$(comma),$(KMEANS_SIZE)) --iteration $(ITERATIONS) \
295-
-x $(KMEANS_DATA) -i $(basename $(KMEANS_DATA)).init.npy
296-
python daal4py/svm_bench.py --core-number $(NUM_THREADS) \
297-
--fileX data/two/X-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
298-
--fileY data/two/y-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
299-
--header
300-
python daal4py/svm_bench.py --core-number $(NUM_THREADS) \
301-
--fileX data/multi/X-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
302-
--fileY data/multi/y-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
303-
--header
304-
python daal4py/df_clsf.py --num-threads $(NUM_THREADS) \
305-
--fileX data/two/X-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
306-
--fileY data/two/y-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
307-
--header
308-
python daal4py/df_clsf.py --num-threads $(NUM_THREADS) \
309-
--fileX data/multi/X-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
310-
--fileY data/multi/y-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
311-
--header
312-
313210
data: $(KMEANS_DATA) svm_data logreg_data df_clf_data
314211

315212
DATA_kmeans: data/kmeans_$(KMEANS_SIZE).npy

daal4py/args.py

Lines changed: 0 additions & 28 deletions
This file was deleted.

0 commit comments

Comments
 (0)