Skip to content

Commit 77fcc63

Browse files
authored
Fixes to align benchmarks with sklearn (#15)
Native KMeans: fix wrong parameters passed to DAAL prediction object.
Native logistic regression: scale DAAL's loss function value and gradient by n_samples.
daal4py logistic regression: don't compute loss function value and gradient twice.

* Align kmeans native benchmark to sklearn
* Don't evaluate func, grad if not necessary
* Scale loss value and gradient like sklearn does
* Specify maxIterations, accuracyThreshold for multi_class_classifier
* Fix indentation in Makefiles
* Fix logic to select n_features_per_node in daal4py df_regr
* Specifically say memorySavingMode=false in native df_regr
* Reformat native df_regr bench
1 parent 4752e45 commit 77fcc63

File tree

11 files changed

+251
-175
lines changed

11 files changed

+251
-175
lines changed

Makefile

Lines changed: 61 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -4,17 +4,17 @@ REGRESSION_SIZE = 1000000x50
44
KMEANS_SAMPLES = 1000000
55
KMEANS_FEATURES = 50
66
KMEANS_SIZE = $(KMEANS_SAMPLES)x$(KMEANS_FEATURES)
7-
SVM_SAMPLES = 50000
7+
SVM_SAMPLES = 100000
88
SVM_FEATURES = 100
99
SVM_SIZE = $(SVM_SAMPLES)x$(SVM_FEATURES)
1010
LOGREG_SAMPLES = 100000
1111
LOGREG_FEATURES = 100
1212
LOGREG_SIZE = $(LOGREG_SAMPLES)x$(LOGREG_FEATURES)
1313
DFCLF_SAMPLES = 10000
14-
DFCLF_FEATURES = 50
14+
DFCLF_FEATURES = 100
1515
DFCLF_SIZE = $(DFCLF_SAMPLES)x$(DFCLF_FEATURES)
1616
DFREG_SAMPLES = 10000
17-
DFREG_FEATURES = 50
17+
DFREG_FEATURES = 100
1818
DFREG_SIZE = $(DFREG_SAMPLES)x$(DFREG_FEATURES)
1919

2020
ITERATIONS = 10
@@ -29,20 +29,24 @@ SHELL = bash -o pipefail
2929

3030
# Other options
3131
NUM_THREADS = -1
32+
SVM_NUM_THREADS = 0
33+
LOGREG_NUM_THREADS = $(SVM_NUM_THREADS)
34+
DFCLF_NUM_THREADS = $(SVM_NUM_THREADS)
35+
DFREG_NUM_THREADS = $(SVM_NUM_THREADS)
3236
MULTIPLIER = 100
3337
DATA_DIR = data/
3438
DATA_kmeans = data/kmeans_$(KMEANS_SIZE).npy
3539

36-
COMMON_ARGS = --batch '$(BATCH)' --arch '$(HOST)' \
37-
--num-threads '$(NUM_THREADS)' --header
40+
COMMON_ARGS = --batch '$(BATCH)' --arch '$(HOST)' \
41+
--num-threads '$(NUM_THREADS)' --header
3842

3943
# Define which benchmarks to run
40-
NATIVE_BENCHMARKS = distances ridge linear kmeans svm2 svm5 \
41-
logreg2 logreg5 dfclf2 dfclf5 dfreg pca_daal pca_full
44+
NATIVE_BENCHMARKS = distances ridge linear kmeans svm2 svm5 \
45+
logreg2 logreg5 dfclf2 dfclf5 dfreg pca_daal pca_full
4246
SKLEARN_BENCHMARKS = distances ridge linear kmeans svm2 svm5 \
43-
logreg2 logreg5 dfclf2 dfclf5 dfreg pca_full
47+
logreg2 logreg5 dfclf2 dfclf5 dfreg pca_full
4448
DAAL4PY_BENCHMARKS = distances ridge linear kmeans svm2 svm5 \
45-
logreg2 logreg5 dfclf2 dfclf5 dfreg pca_daal pca_full
49+
logreg2 logreg5 dfclf2 dfclf5 dfreg pca_daal pca_full
4650

4751
# Define native benchmark binary names
4852
NATIVE_distances = distances
@@ -61,41 +65,41 @@ NATIVE_pca_full = pca
6165

6266
# Define arguments for native benchmarks
6367
ARGS_NATIVE_distances = --num-threads "$(NUM_THREADS)" \
64-
--size "$(DISTANCES_SIZE)" --header
68+
--size "$(DISTANCES_SIZE)" --header
6569
ARGS_NATIVE_ridge = --num-threads "$(NUM_THREADS)" \
66-
--size "$(REGRESSION_SIZE)" --header
70+
--size "$(REGRESSION_SIZE)" --header
6771
ARGS_NATIVE_linear = --num-threads "$(NUM_THREADS)" \
68-
--size "$(REGRESSION_SIZE)" --header
72+
--size "$(REGRESSION_SIZE)" --header
6973
ARGS_NATIVE_pca_daal = --num-threads "$(NUM_THREADS)" --header \
70-
--size "$(REGRESSION_SIZE)" --svd-solver daal
74+
--size "$(REGRESSION_SIZE)" --svd-solver daal
7175
ARGS_NATIVE_pca_full = --num-threads "$(NUM_THREADS)" --header \
72-
--size "$(REGRESSION_SIZE)" --svd-solver full
76+
--size "$(REGRESSION_SIZE)" --svd-solver full
7377
ARGS_NATIVE_kmeans = --num-threads "$(NUM_THREADS)" --header \
74-
--data-multiplier "$(MULTIPLIER)" \
75-
--filex data/kmeans_$(KMEANS_SIZE).npy \
76-
--filei data/kmeans_$(KMEANS_SIZE).init.npy \
77-
--filet data/kmeans_$(KMEANS_SIZE).tol.npy
78-
ARGS_NATIVE_svm2 = --fileX data/two/X-$(SVM_SIZE).npy \
79-
--fileY data/two/y-$(SVM_SIZE).npy \
80-
--num-threads $(SVM_NUM_THREADS) --header
81-
ARGS_NATIVE_svm5 = --fileX data/multi/X-$(SVM_SIZE).npy \
82-
--fileY data/multi/y-$(SVM_SIZE).npy \
83-
--num-threads $(SVM_NUM_THREADS) --header
78+
--data-multiplier "$(MULTIPLIER)" \
79+
--filex data/kmeans_$(KMEANS_SIZE).npy \
80+
--filei data/kmeans_$(KMEANS_SIZE).init.npy \
81+
--filet data/kmeans_$(KMEANS_SIZE).tol.npy
82+
ARGS_NATIVE_svm2 = --fileX data/two/X-$(SVM_SIZE).npy \
83+
--fileY data/two/y-$(SVM_SIZE).npy \
84+
--num-threads $(SVM_NUM_THREADS) --header
85+
ARGS_NATIVE_svm5 = --fileX data/multi/X-$(SVM_SIZE).npy \
86+
--fileY data/multi/y-$(SVM_SIZE).npy \
87+
--num-threads $(SVM_NUM_THREADS) --header
8488
ARGS_NATIVE_logreg2 = --fileX data/two/X-$(LOGREG_SIZE).npy \
85-
--fileY data/two/y-$(LOGREG_SIZE).npy \
86-
--num-threads $(LOGREG_NUM_THREADS) --header
89+
--fileY data/two/y-$(LOGREG_SIZE).npy \
90+
--num-threads $(LOGREG_NUM_THREADS) --header
8791
ARGS_NATIVE_logreg5 = --fileX data/multi/X-$(LOGREG_SIZE).npy \
88-
--fileY data/multi/y-$(LOGREG_SIZE).npy \
89-
--num-threads $(LOGREG_NUM_THREADS) --header
92+
--fileY data/multi/y-$(LOGREG_SIZE).npy \
93+
--num-threads $(LOGREG_NUM_THREADS) --header
9094
ARGS_NATIVE_dfclf2 = --fileX data/two/X-$(DFCLF_SIZE).npy \
91-
--fileY data/two/y-$(DFCLF_SIZE).npy \
92-
--num-threads $(DFCLF_NUM_THREADS) --header
95+
--fileY data/two/y-$(DFCLF_SIZE).npy \
96+
--num-threads $(DFCLF_NUM_THREADS) --header
9397
ARGS_NATIVE_dfclf5 = --fileX data/multi/X-$(DFCLF_SIZE).npy \
94-
--fileY data/multi/y-$(DFCLF_SIZE).npy \
95-
--num-threads $(DFCLF_NUM_THREADS) --header
98+
--fileY data/multi/y-$(DFCLF_SIZE).npy \
99+
--num-threads $(DFCLF_NUM_THREADS) --header
96100
ARGS_NATIVE_dfreg = --fileX data/reg/X-$(DFREG_SIZE).npy \
97-
--fileY data/reg/y-$(DFREG_SIZE).npy \
98-
--num-threads $(DFREG_NUM_THREADS) --header
101+
--fileY data/reg/y-$(DFREG_SIZE).npy \
102+
--num-threads $(DFREG_NUM_THREADS) --header
99103

100104
SKLEARN_distances = distances
101105
SKLEARN_ridge = ridge
@@ -117,22 +121,22 @@ ARGS_SKLEARN_linear = --size "$(REGRESSION_SIZE)"
117121
ARGS_SKLEARN_pca_daal = --size "$(REGRESSION_SIZE)" --svd-solver daal
118122
ARGS_SKLEARN_pca_full = --size "$(REGRESSION_SIZE)" --svd-solver full
119123
ARGS_SKLEARN_kmeans = --data-multiplier "$(MULTIPLIER)" \
120-
--filex data/kmeans_$(KMEANS_SIZE).npy \
121-
--filei data/kmeans_$(KMEANS_SIZE).init.npy
122-
ARGS_SKLEARN_svm2 = --fileX data/two/X-$(SVM_SIZE).npy \
123-
--fileY data/two/y-$(SVM_SIZE).npy
124+
--filex data/kmeans_$(KMEANS_SIZE).npy \
125+
--filei data/kmeans_$(KMEANS_SIZE).init.npy
126+
ARGS_SKLEARN_svm2 = --fileX data/two/X-$(SVM_SIZE).npy \
127+
--fileY data/two/y-$(SVM_SIZE).npy
124128
ARGS_SKLEARN_svm5 = --fileX data/multi/X-$(SVM_SIZE).npy \
125-
--fileY data/multi/y-$(SVM_SIZE).npy
129+
--fileY data/multi/y-$(SVM_SIZE).npy
126130
ARGS_SKLEARN_logreg2 = --fileX data/two/X-$(LOGREG_SIZE).npy \
127-
--fileY data/two/y-$(LOGREG_SIZE).npy
131+
--fileY data/two/y-$(LOGREG_SIZE).npy
128132
ARGS_SKLEARN_logreg5 = --fileX data/multi/X-$(LOGREG_SIZE).npy \
129-
--fileY data/multi/y-$(LOGREG_SIZE).npy
133+
--fileY data/multi/y-$(LOGREG_SIZE).npy
130134
ARGS_SKLEARN_dfclf2 = --fileX data/two/X-$(DFCLF_SIZE).npy \
131-
--fileY data/two/y-$(DFCLF_SIZE).npy
135+
--fileY data/two/y-$(DFCLF_SIZE).npy
132136
ARGS_SKLEARN_dfclf5 = --fileX data/multi/X-$(DFCLF_SIZE).npy \
133-
--fileY data/multi/y-$(DFCLF_SIZE).npy
134-
ARGS_SKLEARN_dfreg = --fileX data/multi/X-$(DFREG_SIZE).npy \
135-
--fileY data/multi/y-$(DFREG_SIZE).npy
137+
--fileY data/multi/y-$(DFCLF_SIZE).npy
138+
ARGS_SKLEARN_dfreg = --fileX data/reg/X-$(DFREG_SIZE).npy \
139+
--fileY data/reg/y-$(DFREG_SIZE).npy
136140

137141
DAAL4PY_distances = distances
138142
DAAL4PY_ridge = ridge
@@ -154,22 +158,23 @@ ARGS_DAAL4PY_linear = --size "$(REGRESSION_SIZE)"
154158
ARGS_DAAL4PY_pca_daal = --size "$(REGRESSION_SIZE)" --svd-solver daal
155159
ARGS_DAAL4PY_pca_full = --size "$(REGRESSION_SIZE)" --svd-solver full
156160
ARGS_DAAL4PY_kmeans = --data-multiplier "$(MULTIPLIER)" \
157-
--filex data/kmeans_$(KMEANS_SIZE).npy \
158-
--filei data/kmeans_$(KMEANS_SIZE).init.npy
159-
ARGS_DAAL4PY_svm2 = --fileX data/two/X-$(SVM_SIZE).npy \
160-
--fileY data/two/y-$(SVM_SIZE).npy
161+
--filex data/kmeans_$(KMEANS_SIZE).npy \
162+
--filei data/kmeans_$(KMEANS_SIZE).init.npy \
163+
--filet data/kmeans_$(KMEANS_SIZE).tol.npy
164+
ARGS_DAAL4PY_svm2 = --fileX data/two/X-$(SVM_SIZE).npy \
165+
--fileY data/two/y-$(SVM_SIZE).npy
161166
ARGS_DAAL4PY_svm5 = --fileX data/multi/X-$(SVM_SIZE).npy \
162-
--fileY data/multi/y-$(SVM_SIZE).npy
167+
--fileY data/multi/y-$(SVM_SIZE).npy
163168
ARGS_DAAL4PY_logreg2 = --fileX data/two/X-$(LOGREG_SIZE).npy \
164-
--fileY data/two/y-$(LOGREG_SIZE).npy
169+
--fileY data/two/y-$(LOGREG_SIZE).npy
165170
ARGS_DAAL4PY_logreg5 = --fileX data/multi/X-$(LOGREG_SIZE).npy \
166-
--fileY data/multi/y-$(LOGREG_SIZE).npy
171+
--fileY data/multi/y-$(LOGREG_SIZE).npy
167172
ARGS_DAAL4PY_dfclf2 = --fileX data/two/X-$(DFCLF_SIZE).npy \
168-
--fileY data/two/y-$(DFCLF_SIZE).npy
173+
--fileY data/two/y-$(DFCLF_SIZE).npy
169174
ARGS_DAAL4PY_dfclf5 = --fileX data/multi/X-$(DFCLF_SIZE).npy \
170-
--fileY data/multi/y-$(DFCLF_SIZE).npy
171-
ARGS_DAAL4PY_dfreg = --fileX data/multi/X-$(DFREG_SIZE).npy \
172-
--fileY data/multi/y-$(DFREG_SIZE).npy
175+
--fileY data/multi/y-$(DFCLF_SIZE).npy
176+
ARGS_DAAL4PY_dfreg = --fileX data/reg/X-$(DFREG_SIZE).npy \
177+
--fileY data/reg/y-$(DFREG_SIZE).npy
173178

174179
comma = ,
175180

daal4py/df_regr.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ def df_regr_fit(X, y, n_trees=100, seed=12345, n_features_per_node=0,
1818
fptype = getFPType(X)
1919

2020
features_per_node = X.shape[1]
21-
if n_features_per_node > 0 or n_features_per_node <= features_per_node:
21+
if n_features_per_node > 0 and n_features_per_node <= features_per_node:
2222
features_per_node = n_features_per_node
2323

2424
engine = engines_mt2203(seed=seed, fptype=fptype)

daal4py/kmeans.py

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,17 @@
55
import argparse
66
from bench import parse_args, time_mean_min, print_header, print_row, size_str
77
from daal4py import kmeans
8+
from daal4py.sklearn.utils import getFPType
89
import numpy as np
910

1011
parser = argparse.ArgumentParser(description='daal4py K-Means clustering '
1112
'benchmark')
12-
parser.add_argument('-x', '--filex', '--fileX', '--input',
13+
parser.add_argument('-x', '--filex', '--fileX', '--input', required=True,
1314
type=str, help='Points to cluster')
14-
parser.add_argument('-i', '--filei', '--fileI', '--init',
15+
parser.add_argument('-i', '--filei', '--fileI', '--init', required=True,
1516
type=str, help='Initial clusters')
16-
# parser.add_argument('-t', '--filet', '--fileT', '--tol',
17-
# type=str, help='Absolute threshold')
17+
parser.add_argument('-t', '--filet', '--fileT', '--tol', required=True,
18+
type=str, help='Absolute threshold')
1819
parser.add_argument('-m', '--data-multiplier', default=100,
1920
type=int, help='Data multiplier')
2021
parser.add_argument('--maxiter', type=int, default=100,
@@ -25,6 +26,7 @@
2526
X = np.load(params.filex)
2627
X_init = np.load(params.filei)
2728
X_mult = np.vstack((X,) * params.data_multiplier)
29+
tol = np.load(params.filet)
2830

2931
params.size = size_str(X.shape)
3032
params.n_clusters = X_init.shape[0]
@@ -34,17 +36,23 @@
3436
# Define functions to time
3537
def test_fit(X, X_init):
3638
algorithm = kmeans(
39+
fptype=getFPType(X),
3740
nClusters=params.n_clusters,
38-
maxIterations=params.maxiter
39-
) # FIXME tolerance?
41+
maxIterations=params.maxiter,
42+
assignFlag=True,
43+
accuracyThreshold=tol
44+
)
4045
return algorithm.compute(X, X_init)
4146

4247

4348
def test_predict(X, X_init):
4449
algorithm = kmeans(
50+
fptype=getFPType(X),
4551
nClusters=params.n_clusters,
46-
maxIterations=0
47-
) # FIXME tolerance
52+
maxIterations=0,
53+
assignFlag=True,
54+
accuracyThreshold=0.0
55+
)
4856
return algorithm.compute(X, X_init)
4957

5058

daal4py/log_reg.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,19 @@ def __init__(self, X, y, beta, hess=False, fit_intercept=True):
3838
self.X = make2d(X)
3939
self.y = make2d(y)
4040

41+
self.last_beta = beta.copy()
42+
43+
self.func = None
44+
self.grad = None
45+
self.hess = None
46+
4147
def compute(self, beta):
48+
# Don't compute if we have already cached func, grad, hess
49+
if self.func is not None and np.array_equal(beta, self.last_beta):
50+
return
51+
4252
result = self.algo.compute(self.X, self.y, make2d(beta))
53+
np.copyto(self.last_beta, beta)
4354
self.func = result.valueIdx[0, 0] * self.n
4455
self.grad = result.gradientIdx.ravel() * self.n
4556
if self.compute_hess:

native/Makefile

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,18 @@
33
# SPDX-License-Identifier: MIT
44

55
BENCHMARKS += distances kmeans linear ridge pca \
6-
two_class_svm multi_class_svm log_reg_lbfgs \
7-
decision_forest_regr decision_forest_clsf
6+
two_class_svm multi_class_svm log_reg_lbfgs \
7+
decision_forest_regr decision_forest_clsf
88
FOBJ = $(addprefix lbfgsb/,lbfgsb.o linpack.o timer.o)
99
CXXSRCS = $(addsuffix _bench.cpp,$(BENCHMARKS))
1010

1111
CXX = icc
12+
1213
CXXFLAGS += -m64 -fPIC -fp-model strict -O3 -fomit-frame-pointer \
13-
-xSSE4.2 -axCORE-AVX2,COMMON-AVX512
14+
-xSSE4.2 -axCORE-AVX2,COMMON-AVX512
1415
CXXFLAGS += -std=c++14 -g
15-
LDFLAGS += -ltbb -lstdc++ -lpthread -lm -ldaal_core -ldaal_thread \
16-
-Wl,-rpath,$(CONDA_PREFIX)/lib
16+
LDFLAGS += -ltbb -lstdc++ -lpthread -lm -ldaal_core -ldaal_thread \
17+
-Wl,-rpath,$(CONDA_PREFIX)/lib
1718
CXXINCLUDE += include
1819

1920
ifneq ($(CONDA_PREFIX),)

0 commit comments

Comments
 (0)