Skip to content

Commit 25199da

Browse files
authored
Add benchmarks for daal4py random forest, fix native RF benches for consistency (#6)
* Add daal4py benchmarks for random forests * Native random forests: Use MT2203 engine for fit and single precision float for prediction for consistency with sklearn.
1 parent a475be0 commit 25199da

File tree

5 files changed

+374
-4
lines changed

5 files changed

+374
-4
lines changed

Makefile

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -301,6 +301,14 @@ daal4py_: data
301301
--fileX data/multi/X-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
302302
--fileY data/multi/y-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
303303
--header
304+
python daal4py/df_clsf.py --num-threads $(NUM_THREADS) \
305+
--fileX data/two/X-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
306+
--fileY data/two/y-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
307+
--header
308+
python daal4py/df_clsf.py --num-threads $(NUM_THREADS) \
309+
--fileX data/multi/X-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
310+
--fileY data/multi/y-$(SVM_SAMPLES)x$(SVM_FEATURES).npy \
311+
--header
304312

305313
data: $(KMEANS_DATA) svm_data logreg_data df_clf_data
306314

daal4py/df_clsf.py

Lines changed: 180 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,180 @@
1+
# Copyright (C) 2018-2019 Intel Corporation
2+
#
3+
# SPDX-License-Identifier: MIT
4+
5+
6+
import numpy as np
7+
import bench
8+
from daal4py import decision_forest_classification_training
9+
from daal4py import decision_forest_classification_prediction
10+
from daal4py import engines_mt2203
11+
from daal4py.sklearn.utils import getFPType
12+
13+
14+
def df_clsf_fit(X, y, n_classes, n_trees=100, seed=12345,
                n_features_per_node=0, max_depth=0, min_impurity=0,
                bootstrap=True, verbose=False):
    """Train a DAAL decision-forest classifier and return the training result.

    X, y -- training features and labels (numpy arrays; y as a column).
    n_classes -- number of target classes.
    n_features_per_node -- features sampled per split; 0 (or any value
        not strictly between 0 and the feature count) means use all.
    The remaining keywords map directly onto DAAL training parameters.
    `verbose` is accepted for interface parity but unused here.
    """
    fptype = getFPType(X)

    # Fall back to every feature unless a valid subset size was given.
    per_node = X.shape[1]
    if 0 < n_features_per_node < per_node:
        per_node = n_features_per_node

    # MT2203 engine, per the commit intent: consistent RNG with the
    # native benchmarks.
    rng_engine = engines_mt2203(seed=seed, fptype=fptype)

    trainer = decision_forest_classification_training(
        nClasses=n_classes,
        fptype=fptype,
        method='defaultDense',
        nTrees=n_trees,
        observationsPerTreeFraction=1.,
        featuresPerNode=per_node,
        maxTreeDepth=max_depth,
        minObservationsInLeafNode=1,
        engine=rng_engine,
        impurityThreshold=min_impurity,
        varImportance='MDI',
        resultsToCompute='',
        memorySavingMode=False,
        bootstrap=bootstrap
    )

    return trainer.compute(X, y)
46+
47+
48+
def df_clsf_predict(X, training_result, n_classes, verbose=False):
    """Predict class labels for X with a trained decision-forest model.

    training_result -- object exposing `.model` (as returned by
    df_clsf_fit). Returns the DAAL prediction table.
    `verbose` is accepted for interface parity but unused here.
    """
    # Single precision is chosen deliberately to match sklearn.
    predictor = decision_forest_classification_prediction(
        nClasses=n_classes,
        fptype='float',  # we give float here specifically to match sklearn
    )
    return predictor.compute(X, training_result.model).prediction
58+
59+
60+
if __name__ == '__main__':
    # Command-line benchmark driver: times daal4py decision-forest
    # classification fit/predict on data loaded from .npy files and
    # prints one CSV row of results.
    import argparse

    def getArguments(argParser):
        # Declare all benchmark options on the given parser and parse argv.
        argParser.add_argument('--prefix', type=str, default='daal4py',
                               help="Identifier of the bench being executed")
        argParser.add_argument('--fileX', type=argparse.FileType('r'),
                               help="Input file with features")
        argParser.add_argument('--fileY', type=argparse.FileType('r'),
                               help="Input file with labels")
        argParser.add_argument('--num-trees', type=int, default=100,
                               help="Number of trees in decision forest")
        argParser.add_argument('--max-features', type=int, default=0,
                               help="Max features used to build trees")
        argParser.add_argument('--max-depth', type=int, default=0,
                               help="Maximal depth of trees constructed")

        # NOTE(review): declared but never consulted later in this script.
        argParser.add_argument('--use-sklearn-class', action='store_true',
                               help="Force use of sklearn.ensemble.RandomForestClassifier")
        argParser.add_argument('--seed', type=int, default=12345,
                               help="Seed to pass as random_state to the class")

        # Inner repetitions are averaged into one timing sample; outer
        # repetitions form the population that min/median are taken over.
        argParser.add_argument('--fit-repetitions', dest="fit_inner_reps", type=int, default=1,
                               help="Count of operations whose execution time is being clocked, average time reported")
        argParser.add_argument('--fit-samples', dest="fit_outer_reps", type=int, default=5,
                               help="Count of repetitions of time measurements to collect statistics ")
        argParser.add_argument('--predict-repetitions', dest="predict_inner_reps", type=int, default=50,
                               help="Count of operations whose execution time is being clocked, average time reported")
        argParser.add_argument('--predict-samples', dest="predict_outer_reps", type=int, default=5,
                               help="Count of repetitions of time measurements to collect statistics ")

        argParser.add_argument('--verbose', action="store_true",
                               help="Whether to print additional information.")
        argParser.add_argument('--header', action="store_true",
                               help="Whether to print header.")
        argParser.add_argument('--num-threads', type=int, dest="num_threads", default=0,
                               help="Number of threads for DAAL to use")

        args = argParser.parse_args()

        return args

    argParser = argparse.ArgumentParser(prog="df_clsf_bench.py",
                                        description="Execute RandomForest classification",
                                        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    args = getArguments(argParser)
    # Applies --num-threads and reports the DAAL build in use.
    num_threads, daal_version = bench.prepare_benchmark(args)

    import timeit

    if args.fileX is None or args.fileY is None:
        argParser.error("Please specify data for the algorithm to train on. Use --fileX and --fileY or --generate options.")
    else:
        X = np.load(args.fileX.name)
        # Labels reshaped to a column vector for DAAL training input.
        y = np.load(args.fileY.name)[:,np.newaxis]

    if args.verbose:
        print("@ {", end='')
        print(" FIT_SAMPLES : {0}, FIT_REPETITIONS : {1}, PREDICT_SAMPLES: {2}, PREDICT_REPETITIONS: {3}".format(
            args.fit_outer_reps, args.fit_inner_reps, args.predict_outer_reps, args.predict_inner_reps
        ), end='')
        print("}")

    if args.verbose:
        print("@ {", end='')
        print("'n_estimators': {0}, 'max_depth': {1}, 'max_features': {2}, 'random_state': {3}".format(
            args.num_trees, args.max_depth, args.max_features, args.seed
        ), end='')
        print("}")

    # NOTE(review): assumes labels are contiguous integers so that the
    # class count equals max-min+1 — confirm against the data generator.
    n_classes = np.max(y) - np.min(y) + 1

    fit_times = []
    for outer_it in range(args.fit_outer_reps):
        t0 = timeit.default_timer()
        for _ in range(args.fit_inner_reps):
            result = df_clsf_fit(X, y, n_classes, n_trees=args.num_trees,
                                 seed=args.seed,
                                 n_features_per_node=args.max_features,
                                 max_depth=args.max_depth, verbose=args.verbose)
        t1 = timeit.default_timer()
        # One sample = wall time averaged over the inner repetitions.
        fit_times.append((t1 - t0) / args.fit_inner_reps)

    predict_times = []
    for outer_it in range(args.predict_outer_reps):

        t0 = timeit.default_timer()
        for _ in range(args.predict_inner_reps):
            # Predicts on the training data; `result` is the model from
            # the last fit iteration above.
            y_pred = df_clsf_predict(X, result, n_classes,
                                     verbose=args.verbose)
        t1 = timeit.default_timer()
        predict_times.append((t1 - t0) / args.predict_inner_reps)

    from sklearn.metrics import accuracy_score
    # Training-set accuracy of the final model.
    acc = accuracy_score(y, y_pred)

    num_classes = np.unique(y).shape[0]
    if args.header:
        print("prefix_ID,function,threads,rows,features,fit,predict,accuracy,classes")
    # CSV row: best (minimum) fit/predict sample times; accuracy in percent.
    print(",".join((
        args.prefix,
        'df_clsf',
        str(num_threads),
        str(X.shape[0]),
        str(X.shape[1]),
        "{0:.3f}".format(min(fit_times)),
        "{0:.3f}".format(min(predict_times)),
        "{0:.4f}".format(100*acc),
        str(num_classes)
    )))

    if args.verbose:
        print("")
        print("@ Median of {0} runs of .fit averaging over {1} executions is {2:3.3f}".format(args.fit_outer_reps, args.fit_inner_reps, np.percentile(fit_times, 50)))
        print("@ Median of {0} runs of .predict averaging over {1} executions is {2:3.3f}".format(args.predict_outer_reps, args.predict_inner_reps, np.percentile(predict_times, 50)))

daal4py/df_regr.py

Lines changed: 180 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,180 @@
1+
# Copyright (C) 2018-2019 Intel Corporation
2+
#
3+
# SPDX-License-Identifier: MIT
4+
5+
6+
import numpy as np
7+
import bench
8+
from daal4py import decision_forest_regression_training
9+
from daal4py import decision_forest_regression_prediction
10+
from daal4py import engines_mt2203
11+
from daal4py.sklearn.utils import getFPType
12+
13+
14+
def df_regr_fit(X, y, n_trees=100, seed=12345, n_features_per_node=0,
                max_depth=0, min_impurity=0, bootstrap=True, verbose=False):
    """Train a DAAL decision-forest regressor and return the training result.

    X, y -- training features and targets (numpy arrays).
    n_features_per_node -- features sampled per split; 0 (or any value
        not strictly between 0 and the feature count) means use all.
    The remaining keywords map directly onto DAAL training parameters.
    verbose -- accepted for interface parity with df_clsf_fit (the
        __main__ driver passes verbose=...; without this parameter the
        call raised TypeError). Unused here.
    """
    fptype = getFPType(X)

    features_per_node = X.shape[1]
    # BUGFIX: was `n_features_per_node > 0 or n_features_per_node <=
    # features_per_node`, which is true even for the default 0 and would
    # request 0 features per node. Use the same guard as df_clsf_fit.
    if n_features_per_node > 0 and n_features_per_node < features_per_node:
        features_per_node = n_features_per_node

    # MT2203 engine for consistency with the native benchmarks.
    engine = engines_mt2203(seed=seed, fptype=fptype)

    algorithm = decision_forest_regression_training(
        fptype=fptype,
        method='defaultDense',
        nTrees=n_trees,
        observationsPerTreeFraction=1.,
        featuresPerNode=features_per_node,
        maxTreeDepth=max_depth,
        minObservationsInLeafNode=1,
        engine=engine,
        impurityThreshold=min_impurity,
        varImportance='MDI',
        resultsToCompute='',
        memorySavingMode=False,
        bootstrap=bootstrap
    )

    df_regr_result = algorithm.compute(X, y)

    return df_regr_result
44+
45+
46+
def df_regr_predict(X, training_result, verbose=False):
    """Predict targets for X with a trained decision-forest regressor.

    training_result -- object exposing `.model` (as returned by
    df_regr_fit). Returns the DAAL prediction table.
    `verbose` is accepted for interface parity but unused here.
    """
    # Single precision, matching the classification bench / sklearn.
    predictor = decision_forest_regression_prediction(
        fptype='float'
    )
    return predictor.compute(X, training_result.model).prediction
55+
56+
57+
if __name__ == '__main__':
    # Command-line benchmark driver: times daal4py decision-forest
    # regression fit/predict on data loaded from .npy files and prints
    # one CSV row of results.
    import argparse

    def getArguments(argParser):
        # Declare all benchmark options on the given parser and parse argv.
        argParser.add_argument('--prefix', type=str, default='daal4py',
                               help="Identifier of the bench being executed")
        argParser.add_argument('--fileX', type=argparse.FileType('r'),
                               help="Input file with features")
        argParser.add_argument('--fileY', type=argparse.FileType('r'),
                               help="Input file with labels")
        argParser.add_argument('--num-trees', type=int, default=100,
                               help="Number of trees in decision forest")
        argParser.add_argument('--max-features', type=int, default=0,
                               help="Max features used to build trees")
        argParser.add_argument('--max-depth', type=int, default=0,
                               help="Maximal depth of trees constructed")

        argParser.add_argument('--use-sklearn-class', action='store_true',
                               help="Force use of sklearn.ensemble.RandomForestRegressor")
        argParser.add_argument('--seed', type=int, default=12345,
                               help="Seed to pass as random_state to the class")

        # Inner repetitions are averaged into one timing sample; outer
        # repetitions form the population that min/median are taken over.
        argParser.add_argument('--fit-repetitions', dest="fit_inner_reps", type=int, default=1,
                               help="Count of operations whose execution time is being clocked, average time reported")
        argParser.add_argument('--fit-samples', dest="fit_outer_reps", type=int, default=5,
                               help="Count of repetitions of time measurements to collect statistics ")
        argParser.add_argument('--predict-repetitions', dest="predict_inner_reps", type=int, default=50,
                               help="Count of operations whose execution time is being clocked, average time reported")
        argParser.add_argument('--predict-samples', dest="predict_outer_reps", type=int, default=5,
                               help="Count of repetitions of time measurements to collect statistics ")

        argParser.add_argument('--verbose', action="store_true",
                               help="Whether to print additional information.")
        argParser.add_argument('--header', action="store_true",
                               help="Whether to print header.")
        argParser.add_argument('--num-threads', type=int, dest="num_threads", default=0,
                               help="Number of threads for DAAL to use")

        args = argParser.parse_args()

        return args

    argParser = argparse.ArgumentParser(prog="df_regr_bench.py",
                                        description="Execute RandomForest Regression",
                                        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    args = getArguments(argParser)
    # Applies --num-threads and reports the DAAL build in use.
    num_threads, daal_version = bench.prepare_benchmark(args)

    import sklearn
    # NOTE(review): rfRegressor is imported (and --use-sklearn-class
    # honored) but never used below — the script always calls
    # df_regr_fit/df_regr_predict directly.
    try:
        from daal4py.sklearn.ensemble import RandomForestRegressor as rfRegressor
    except ImportError:
        from sklearn.ensemble import RandomForestRegressor as rfRegressor

    if args.use_sklearn_class:
        from sklearn.ensemble import RandomForestRegressor as rfRegressor

    import timeit

    if args.fileX is None or args.fileY is None:
        argParser.error("Please specify data for the algorithm to train on. Use --fileX and --fileY or --generate options.")
    else:
        X = np.load(args.fileX.name)
        y = np.load(args.fileY.name)

    if args.verbose:
        print("@ {", end='')
        print(" FIT_SAMPLES : {0}, FIT_REPETITIONS : {1}, PREDICT_SAMPLES: {2}, PREDICT_REPETITIONS: {3}".format(
            args.fit_outer_reps, args.fit_inner_reps, args.predict_outer_reps, args.predict_inner_reps
        ), end='')
        print("}")

    if args.verbose:
        print("@ {", end='')
        print("'n_estimators': {0}, 'max_depth': {1}, 'max_features': {2}, 'random_state': {3}".format(
            args.num_trees, args.max_depth, args.max_features, args.seed
        ), end='')
        print("}")

    fit_times = []
    for outer_it in range(args.fit_outer_reps):
        t0 = timeit.default_timer()
        for _ in range(args.fit_inner_reps):
            # NOTE(review): passes verbose= — confirm df_regr_fit's
            # signature accepts a `verbose` keyword.
            result = df_regr_fit(X, y, n_trees=args.num_trees, seed=args.seed,
                                 n_features_per_node=args.max_features,
                                 max_depth=args.max_depth, verbose=args.verbose)
        t1 = timeit.default_timer()
        # One sample = wall time averaged over the inner repetitions.
        fit_times.append((t1 - t0) / args.fit_inner_reps)

    predict_times = []
    for outer_it in range(args.predict_outer_reps):

        t0 = timeit.default_timer()
        for _ in range(args.predict_inner_reps):
            # Predicts on the training data; `result` is the model from
            # the last fit iteration above.
            y_pred = df_regr_predict(X, result, verbose=args.verbose)
        t1 = timeit.default_timer()
        predict_times.append((t1 - t0) / args.predict_inner_reps)

    # Explained-variance score of the final model on the training data.
    # NOTE(review): relies on `sklearn.metrics` being reachable through
    # the bare `import sklearn` above — confirm, or import it explicitly.
    acc = sklearn.metrics.explained_variance_score(y, y_pred)

    # NOTE(review): defined but never used below.
    num_classes = lambda c: 2 if c.shape[0] == 1 else c.shape[0]
    if args.header:
        print("prefix_ID,function,threads,rows,features,fit,predict,accuracy,targets")
    # CSV row: best (minimum) sample times; last column is y.ndim (targets).
    print(",".join((
        args.prefix,
        'df_regr',
        str(num_threads),
        str(X.shape[0]),
        str(X.shape[1]),
        "{0:.3f}".format(min(fit_times)),
        "{0:.3f}".format(min(predict_times)),
        "{0:.4f}".format(100*acc),
        str(y.ndim)
    )))

    if args.verbose:
        print("")
        print("@ Median of {0} runs of .fit averaging over {1} executions is {2:3.3f}".format(args.fit_outer_reps, args.fit_inner_reps, np.percentile(fit_times, 50)))
        print("@ Median of {0} runs of .predict averaging over {1} executions is {2:3.3f}".format(args.predict_outer_reps, args.predict_inner_reps, np.percentile(predict_times, 50)))
        print("")

native/decision_forest_clsf_bench.cpp

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -57,7 +57,7 @@ df_classification_fit(
5757
df_clsf_alg.parameter.minObservationsInLeafNode = 1;
5858
df_clsf_alg.parameter.impurityThreshold = min_impurity;
5959
df_clsf_alg.parameter.bootstrap = bootsrap;
60-
df_clsf_alg.parameter.engine = da::engines::mt19937::Batch<double>::create(seed);
60+
df_clsf_alg.parameter.engine = da::engines::mt2203::Batch<double>::create(seed);
6161

6262
if (verbose) {
6363
std::cout << "@ {'nTrees': " << nTrees <<
@@ -88,7 +88,8 @@ df_classification_predict(
8888
bool verbose
8989
)
9090
{
91-
dfc::prediction::Batch<double> pred_alg(nClasses);
91+
// We explicitly specify float here to match sklearn.
92+
dfc::prediction::Batch<float> pred_alg(nClasses);
9293
pred_alg.input.set(da::classifier::prediction::data, Xt);
9394
pred_alg.input.set(da::classifier::prediction::model,
9495
training_result_ptr->get(da::classifier::training::model));

0 commit comments

Comments
 (0)