
Commit 002fa9d

Update of parameters for sklearn & cuml benchmarks (#35)
1 parent 30df9be commit 002fa9d

File tree: 17 files changed, +280 -55 lines changed

.gitignore

Lines changed: 13 additions & 0 deletions

@@ -0,0 +1,13 @@
+# Logs
+*.log
+
+# Release and work directories
+__pycache__*
+__work*
+
+# Visual Studio related files, e.g., ".vscode"
+.vs*
+
+# Datasets
+dataset
+*.csv

cuml/dbscan.py

Lines changed: 9 additions & 3 deletions

@@ -3,8 +3,9 @@
 # SPDX-License-Identifier: MIT

 import argparse
-from bench import parse_args, measure_function_time, load_data, print_output
+from bench import parse_args, measure_function_time, load_data, print_output, convert_to_numpy
 from cuml import DBSCAN
+from sklearn.metrics.cluster import davies_bouldin_score

 parser = argparse.ArgumentParser(description='cuML DBSCAN benchmark')
 parser.add_argument('-e', '--eps', '--epsilon', type=float, default=10.,

@@ -27,9 +28,14 @@
 # Time fit
 time, _ = measure_function_time(dbscan.fit, X, params=params)
 labels = dbscan.labels_
-params.n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
+
+X_host = convert_to_numpy(X)
+labels_host = convert_to_numpy(labels)
+
+acc = davies_bouldin_score(X_host, labels_host)
+params.n_clusters = len(set(labels_host)) - (1 if -1 in labels_host else 0)

 print_output(library='cuml', algorithm='dbscan', stages=['training'],
              columns=columns, params=params, functions=['DBSCAN'],
-             times=[time], accuracies=[None], accuracy_type=None, data=[X],
+             times=[time], accuracies=[acc], accuracy_type='davies_bouldin_score', data=[X],
              alg_instance=dbscan)
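
Note: davies_bouldin_score is a host (CPU) metric, so the benchmark first pulls the cuML device data back to NumPy via the newly imported bench.convert_to_numpy. That helper's body is outside this diff; a minimal sketch of what such a converter typically does (the body below is an assumption, not the repository's actual code):

    import numpy as np

    def convert_to_numpy(data):
        # cuDF objects expose a to_numpy()-style converter, CuPy arrays
        # expose .get(); anything else is assumed host-resident already.
        if hasattr(data, 'to_numpy'):
            return data.to_numpy()
        if hasattr(data, 'get'):
            return np.asarray(data.get())
        return np.asarray(data)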

cuml/elasticnet.py

Lines changed: 51 additions & 0 deletions

@@ -0,0 +1,51 @@
+# Copyright (C) 2020 Intel Corporation
+#
+# SPDX-License-Identifier: MIT
+
+import argparse
+from bench import (
+    parse_args, measure_function_time, load_data, print_output, rmse_score
+)
+from cuml.linear_model import ElasticNet
+
+parser = argparse.ArgumentParser(description='cuML elastic-net regression '
+                                             'benchmark')
+parser.add_argument('--no-fit-intercept', dest='fit_intercept', default=True,
+                    action='store_false',
+                    help="Don't fit intercept (assume data already centered)")
+parser.add_argument('--alpha', dest='alpha', type=float, default=1.0,
+                    help='Regularization strength')
+parser.add_argument('--maxiter', type=int, default=1000,
+                    help='Maximum iterations for the iterative solver')
+parser.add_argument('--l1_ratio', dest='l1_ratio', type=float, default=0.5,
+                    help='ElasticNet mixing parameter (1 = L1 only, 0 = L2 only)')
+parser.add_argument('--tol', type=float, default=0.0,
+                    help='Tolerance for the solver')
+params = parse_args(parser)
+
+# Load data
+X_train, X_test, y_train, y_test = load_data(params)
+
+# Create our regression object
+regr = ElasticNet(fit_intercept=params.fit_intercept, l1_ratio=params.l1_ratio,
+                  alpha=params.alpha, tol=params.tol, max_iter=params.maxiter)
+
+columns = ('batch', 'arch', 'prefix', 'function', 'threads', 'dtype', 'size',
+           'time')
+
+# Time fit
+fit_time, _ = measure_function_time(regr.fit, X_train, y_train, params=params)
+
+# Time predict
+predict_time, pred_train = measure_function_time(regr.predict, X_train, params=params)
+
+train_rmse = rmse_score(pred_train, y_train)
+pred_test = regr.predict(X_test)
+test_rmse = rmse_score(pred_test, y_test)
+
+print_output(library='cuml', algorithm='elastic-net',
+             stages=['training', 'prediction'], columns=columns,
+             params=params, functions=['ElasticNet.fit', 'ElasticNet.predict'],
+             times=[fit_time, predict_time], accuracy_type='rmse',
+             accuracies=[train_rmse, test_rmse], data=[X_train, X_train],
+             alg_instance=regr)
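
A note on the two regularization flags above: together they parameterize the standard elastic-net objective as documented for both scikit-learn's and cuML's ElasticNet, with alpha scaling the whole penalty and l1_ratio mixing the L1 and L2 terms; l1_ratio = 1 recovers the lasso and l1_ratio = 0 a pure ridge penalty:

    \min_w \; \frac{1}{2n}\,\lVert y - Xw\rVert_2^2
        + \alpha\,\rho\,\lVert w\rVert_1
        + \frac{\alpha\,(1-\rho)}{2}\,\lVert w\rVert_2^2,
    \qquad \rho = \texttt{l1\_ratio}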

cuml/kmeans.py

Lines changed: 19 additions & 8 deletions

@@ -4,12 +4,12 @@

 import argparse
 from bench import (
-    parse_args, measure_function_time, load_data, print_output
+    parse_args, measure_function_time, load_data, print_output, convert_to_numpy
 )
 import numpy as np
 from cuml import KMeans
 import warnings
-
+from sklearn.metrics.cluster import davies_bouldin_score

 warnings.filterwarnings('ignore', category=FutureWarning)
 parser = argparse.ArgumentParser(description='cuML K-means benchmark')

@@ -27,8 +27,10 @@
 # Load and convert generated data
 X_train, X_test, _, _ = load_data(params)

+if params.filei == 'k-means++':
+    X_init = 'k-means++'
 # Load initial centroids from specified path
-if params.filei is not None:
+elif params.filei is not None:
     X_init = np.load(params.filei).astype(params.dtype)
     params.n_clusters = X_init.shape[0]
 # or choose random centroids from training data

@@ -57,15 +59,24 @@ def kmeans_fit(X):

 # Time fit
 fit_time, kmeans = measure_function_time(kmeans_fit, X_train, params=params)
-train_inertia = float(kmeans.inertia_)
+train_predict = kmeans.predict(X_train)

 # Time predict
-predict_time, _ = measure_function_time(kmeans.predict, X_test, params=params)
-test_inertia = float(kmeans.inertia_)
+predict_time, test_predict = measure_function_time(kmeans.predict, X_test, params=params)
+
+X_train_host = convert_to_numpy(X_train)
+train_predict_host = convert_to_numpy(train_predict)
+acc_train = davies_bouldin_score(X_train_host, train_predict_host)
+
+X_test_host = convert_to_numpy(X_test)
+test_predict_host = convert_to_numpy(test_predict)
+
+acc_test = davies_bouldin_score(X_test_host, test_predict_host)

 print_output(library='cuml', algorithm='kmeans',
              stages=['training', 'prediction'], columns=columns,
              params=params, functions=['KMeans.fit', 'KMeans.predict'],
-             times=[fit_time, predict_time], accuracy_type='inertia',
-             accuracies=[train_inertia, test_inertia], data=[X_train, X_test],
+             times=[fit_time, predict_time], accuracy_type='davies_bouldin_score',
+             accuracies=[acc_train, acc_test], data=[X_train, X_test],
              alg_instance=kmeans)
+
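
Switching the reported accuracy from inertia to the Davies-Bouldin index makes cross-library comparison fairer: inertia is the value of k-means' own objective as computed by each library, while Davies-Bouldin can be computed identically for any library's labels (lower is better). A self-contained example of the metric as used above, with toy data that is purely illustrative:

    import numpy as np
    from sklearn.cluster import KMeans
    from sklearn.metrics import davies_bouldin_score

    X = np.random.RandomState(0).rand(200, 5)
    labels = KMeans(n_clusters=3, random_state=0).fit_predict(X)
    print(davies_bouldin_score(X, labels))  # lower = better-separated clusters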

cuml/lasso.py

Lines changed: 49 additions & 0 deletions

@@ -0,0 +1,49 @@
+# Copyright (C) 2020 Intel Corporation
+#
+# SPDX-License-Identifier: MIT
+
+import argparse
+from bench import (
+    parse_args, measure_function_time, load_data, print_output, rmse_score
+)
+from cuml.linear_model import Lasso
+
+parser = argparse.ArgumentParser(description='cuML lasso regression '
+                                             'benchmark')
+parser.add_argument('--no-fit-intercept', dest='fit_intercept', default=True,
+                    action='store_false',
+                    help="Don't fit intercept (assume data already centered)")
+parser.add_argument('--alpha', dest='alpha', type=float, default=1.0,
+                    help='Regularization strength')
+parser.add_argument('--maxiter', type=int, default=1000,
+                    help='Maximum iterations for the iterative solver')
+parser.add_argument('--tol', type=float, default=0.0,
+                    help='Tolerance for the solver')
+params = parse_args(parser)
+
+# Load data
+X_train, X_test, y_train, y_test = load_data(params)
+
+# Create our regression object
+regr = Lasso(fit_intercept=params.fit_intercept, alpha=params.alpha,
+             tol=params.tol, max_iter=params.maxiter)
+
+columns = ('batch', 'arch', 'prefix', 'function', 'threads', 'dtype', 'size',
+           'time')
+
+# Time fit
+fit_time, _ = measure_function_time(regr.fit, X_train, y_train, params=params)
+
+# Time predict
+predict_time, pred_train = measure_function_time(regr.predict, X_train, params=params)
+
+train_rmse = rmse_score(pred_train, y_train)
+pred_test = regr.predict(X_test)
+test_rmse = rmse_score(pred_test, y_test)
+
+print_output(library='cuml', algorithm='lasso',
+             stages=['training', 'prediction'], columns=columns,
+             params=params, functions=['Lasso.fit', 'Lasso.predict'],
+             times=[fit_time, predict_time], accuracy_type='rmse',
+             accuracies=[train_rmse, test_rmse], data=[X_train, X_train],
+             alg_instance=regr)
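
Both new regression benchmarks score with bench.rmse_score, whose body is not part of this diff. It presumably amounts to the usual root-mean-square error; a sketch consistent with the call sites above (an assumption, not the repository's code):

    import numpy as np

    def rmse_score(y_pred, y_true):
        # Argument order matches rmse_score(pred_train, y_train) above.
        diff = np.asarray(y_pred) - np.asarray(y_true)
        return float(np.sqrt(np.mean(diff ** 2)))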

cuml/log_reg.py

Lines changed: 2 additions & 2 deletions

@@ -33,8 +33,8 @@
 # Create our classifier object
 clf = LogisticRegression(penalty='l2', C=params.C,
                          linesearch_max_iter=params.linesearch_max_iter,
-                         fit_intercept=params.fit_intercept,
-                         verbose=params.verbose, tol=params.tol,
+                         fit_intercept=params.fit_intercept, verbose=params.verbose,
+                         tol=params.tol,
                          max_iter=params.maxiter, solver=params.solver)

 columns = ('batch', 'arch', 'prefix', 'function', 'threads', 'dtype', 'size',

cuml/svm.py

Lines changed: 5 additions & 4 deletions

@@ -74,16 +74,17 @@ def get_optimal_cache_size(n_rows, dtype=np.double, max_cache=64):
 # Time fit and predict
 fit_time, _ = measure_function_time(clf.fit, X_train, y_train, params=params)
 params.sv_len = clf.support_.shape[0]
-y_pred = clf.predict(X_train)
-train_acc = 100 * accuracy_score(y_pred, y_train)

 predict_time, y_pred = measure_function_time(
-    clf.predict, X_test, params=params)
+    clf.predict, X_train, params=params)
+train_acc = 100 * accuracy_score(y_pred, y_train)
+
+y_pred = clf.predict(X_test)
 test_acc = 100 * accuracy_score(y_pred, y_test)

 print_output(library='cuml', algorithm='svc',
              stages=['training', 'prediction'], columns=columns,
              params=params, functions=['SVM.fit', 'SVM.predict'],
              times=[fit_time, predict_time], accuracy_type='accuracy[%]',
-             accuracies=[train_acc, test_acc], data=[X_train, X_test],
+             accuracies=[train_acc, test_acc], data=[X_train, X_train],
              alg_instance=clf)
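
This fix works because measure_function_time returns both the elapsed time and the wrapped call's result, so the timed train-set prediction can be reused for the train accuracy instead of predicting twice; the timed stage and the reported data size (data=[X_train, X_train]) now agree. The helper lives in bench.py and is not shown in this commit; a simplified sketch of its contract (the real helper presumably also handles repetitions driven by params):

    import timeit

    def measure_function_time(func, *args, params=None, **kwargs):
        # Time a single call and hand back (elapsed_seconds, result).
        t0 = timeit.default_timer()
        result = func(*args, **kwargs)
        return timeit.default_timer() - t0, result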

daal4py/svm.py

Lines changed: 1 addition & 18 deletions

@@ -140,11 +140,8 @@ def construct_dual_coefs(model, num_classes, X, y):
         del tmp

         support_ = two_class_sv_ind_[perm]
-        # support_vectors_ = X[support_]
-
         dual_coef_ = model.ClassificationCoefficients.T
         dual_coef_ = dual_coef_[:, perm]
-        # intercept_ = np.array([model.Bias])

     else:
         # multi-class

@@ -170,10 +167,6 @@ def construct_dual_coefs(model, num_classes, X, y):
                 two_class_sv_ind_.ravel())
             sv_ind_by_clf.append(sv_ind)

-            # svs_ = getArrayFromNumericTable(
-            #     svm_model.getSupportVectors())
-            # assert np.array_equal(svs_, X[sv_ind])
-
             intercepts.append(-svm_model.Bias)
             coefs.append(-svm_model.ClassificationCoefficients)
             model_id += 1

@@ -190,8 +183,6 @@ def construct_dual_coefs(model, num_classes, X, y):
             sv_coef_by_clf,  # classification coeffs by two-class classifiers
             y.squeeze().astype(np.intp, copy=False)  # integer labels
         )
-        # support_vectors_ = X[support_]
-        # intercept_ = np.array(intercepts)

     return support_

@@ -210,12 +201,8 @@ def test_fit(X, y, params):
     fptype = getFPType(X)
     kf = daal_kernel(params.kernel, fptype, gamma=params.gamma)

-    if params.n_classes == 2:
-        y[y == 0] = -1
-    else:
-        y[y == -1] = 0
-
     svm_train = svm_training(
+        method='thunder',
         fptype=fptype,
         C=params.C,
         maxIterations=params.maxiter,

@@ -269,7 +256,6 @@ def test_predict(X, training_result, params):
     else:
         prdct = multi_class_classifier_prediction(
             nClasses=params.n_classes,
-            method='thunder',
             fptype=fptype,
             maxIterations=params.maxiter,
             accuracyThreshold=params.tol,

@@ -323,9 +309,6 @@ def main():
     params.cache_size_bytes = cache_size_bytes
     params.n_classes = np.unique(y_train).size

-    # This is necessary for daal
-    y_train[y_train == 0] = -1
-
     columns = ('batch', 'arch', 'prefix', 'function', 'threads', 'dtype',
                'size', 'kernel', 'cache_size_mb', 'C', 'sv_len', 'n_classes',
                'accuracy', 'time')
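
Two behavioral changes here besides the deleted dead code: training now explicitly requests the faster 'thunder' SMO solver, and the manual remapping of {0, 1} labels to {-1, 1} is dropped, which assumes a daal4py version whose SVM accepts 0-based class labels directly. A minimal two-class training call in the same style (toy data; the kernel and parameter values are illustrative, not the benchmark's defaults):

    import numpy as np
    import daal4py

    X = np.random.rand(128, 8)
    y = np.random.randint(0, 2, size=(128, 1)).astype(np.float64)  # 0/1, no remap

    kernel = daal4py.kernel_function_linear(fptype='double')
    svm_train = daal4py.svm_training(method='thunder', fptype='double', C=1.0,
                                     maxIterations=1000, kernel=kernel)
    train_result = svm_train.compute(X, y)  # train_result.model feeds prediction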

make_datasets.py

Lines changed: 1 addition & 5 deletions

@@ -2,7 +2,6 @@
 #
 # SPDX-License-Identifier: MIT

-
 import argparse
 import sys

@@ -14,7 +13,7 @@
 def gen_blobs(args):
     X, y = make_blobs(n_samples=args.samples + args.test_samples,
                       n_features=args.features,
-                      centers=None,
+                      centers=args.clusters,
                       center_box=(-32, 32),
                       shuffle=True,
                       random_state=args.seed)

@@ -55,19 +54,16 @@ def gen_classification(args):
     np.save(args.fileytest, y[args.samples:])
     return 0

-
 def _ch_size(n):
     return n * (n + 1) // 2

-
 def _get_cluster_centers(clusters, features):
     import numpy.random_intel as nri
     rs = nri.RandomState(1234, brng='SFMT19937')
     cluster_centers = rs.randn(clusters, features)
     cluster_centers *= np.double(clusters)
     return cluster_centers

-
 def gen_kmeans(args):
     try:
         import numpy.random_intel as nri
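
The gen_blobs change matters because make_blobs with centers=None generates only 3 centers by default, ignoring the requested cluster count; passing args.clusters makes the blob count follow the dataset config. A quick check against scikit-learn's public API (sample sizes here are arbitrary):

    from sklearn.datasets import make_blobs

    X, y = make_blobs(n_samples=1000, n_features=20, centers=10,
                      center_box=(-32, 32), shuffle=True, random_state=0)
    print(len(set(y)))  # 10 distinct blob labels, not the default 3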

runner.py

Lines changed: 3 additions & 0 deletions

@@ -188,6 +188,7 @@ def is_ht_enabled():
         generate_cases(params)
         verbose_print(f'{algorithm} algorithm: {len(libs) * len(cases)} case(s),'
                       f' {len(params_set["dataset"])} dataset(s)\n')
+
         for dataset in params_set['dataset']:
             if dataset['source'] in ['csv', 'npy']:
                 paths = f'--file-X-train {dataset["training"]["x"]}'

@@ -212,6 +213,8 @@ class GenerationArgs:
         else:
             gen_args.seed = 777

+        # default values
+        gen_args.clusters = 10
         gen_args.type = dataset['type']
         gen_args.samples = dataset['training']['n_samples']
         gen_args.features = dataset['n_features']
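
The new default exists because gen_blobs() in make_datasets.py now reads args.clusters unconditionally. A stripped-down illustration of the failure the default prevents (SimpleNamespace stands in for the runner's GenerationArgs):

    from types import SimpleNamespace

    gen_args = SimpleNamespace()   # stand-in for runner.py's GenerationArgs
    gen_args.seed = 777
    gen_args.clusters = 10         # default added in this commit; without it,
                                   # gen_blobs(gen_args) raises AttributeError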
