Skip to content

Commit 62f153d

Browse files
authored
Add daal4py logistic regression benchmark (#10)
This PR adds logistic regression replicating the results of sklearn.linear_model.LogisticRegression but implemented in daal4py. It supports solvers lbfgs and newton-cg. test_fit uses the canonical form of logistic regression for the binary case (which in scikit-learn is multi_class='ovr') and the multinomial (softmax of exponentiated scores, multi_class='multinomial') for the multi-class case. test_predict supports any combination of n_classes and multi_class, but we use the same multi_class used in test_fit. While we cannot directly use daal4py's logistic regression because it isn't as easy as native DAAL to pass in a custom solver (see #8), we use daal4py's logistic regression objective functions and math primitives to compute logistic regression.
1 parent 25199da commit 62f153d

File tree

1 file changed

+307
-0
lines changed

1 file changed

+307
-0
lines changed

daal4py/log_reg.py

Lines changed: 307 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,307 @@
1+
# Copyright (C) 2018-2019 Intel Corporation
2+
#
3+
# SPDX-License-Identifier: MIT
4+
5+
6+
import numpy as np
7+
from bench import prepare_benchmark
8+
import daal4py
9+
from daal4py import math_logistic, math_softmax
10+
from daal4py.sklearn.utils import getFPType, make2d
11+
import scipy.optimize
12+
13+
_logistic_loss = daal4py.optimization_solver_logistic_loss
14+
_cross_entropy_loss = daal4py.optimization_solver_cross_entropy_loss
15+
16+
17+
def _results_to_compute(value=True, gradient=True, hessian=False):
18+
19+
results_to_compute = []
20+
if value:
21+
results_to_compute.append('value')
22+
if gradient:
23+
results_to_compute.append('gradient')
24+
if hessian:
25+
results_to_compute.append('hessian')
26+
27+
return '|'.join(results_to_compute)
28+
29+
30+
class Loss:
31+
32+
def __init__(self, X, y, beta, hess=False, fit_intercept=True):
33+
self.compute_hess = hess
34+
self.n = X.shape[0]
35+
self.fptype = getFPType(X)
36+
self.fit_intercept = fit_intercept
37+
self.X = make2d(X)
38+
self.y = make2d(y)
39+
40+
def compute(self, beta):
41+
result = self.algo.compute(self.X, self.y, make2d(beta))
42+
self.func = result.valueIdx[0, 0] * self.n
43+
self.grad = result.gradientIdx.ravel() * self.n
44+
if self.compute_hess:
45+
self.hess = result.hessianIdx * self.n
46+
47+
def get_value(self, arg):
48+
self.compute(arg)
49+
return self.func
50+
51+
def get_grad(self, arg):
52+
self.compute(arg)
53+
return self.grad
54+
55+
def get_hess(self, arg):
56+
if not self.compute_hess:
57+
raise ValueError('You asked for Hessian but compute_hess=False')
58+
self.compute(arg)
59+
return self.hess
60+
61+
62+
class LogisticLoss(Loss):
63+
64+
def __init__(self, *args, l1=0.0, l2=0.0, **kwargs):
65+
66+
super().__init__(*args, **kwargs)
67+
68+
self.algo = _logistic_loss(
69+
numberOfTerms=self.n,
70+
fptype=self.fptype,
71+
method='defaultDense',
72+
interceptFlag=self.fit_intercept,
73+
penaltyL1=l1 / self.n,
74+
penaltyL2=l2 / self.n,
75+
resultsToCompute=_results_to_compute(hessian=self.compute_hess)
76+
)
77+
78+
79+
class CrossEntropyLoss(Loss):
80+
81+
def __init__(self, n_classes, *args, l1=0.0, l2=0.0, **kwargs):
82+
83+
super().__init__(*args, **kwargs)
84+
85+
self.algo = _cross_entropy_loss(
86+
nClasses=n_classes,
87+
numberOfTerms=self.n,
88+
fptype=self.fptype,
89+
method='defaultDense',
90+
interceptFlag=self.fit_intercept,
91+
penaltyL1=l1 / self.n,
92+
penaltyL2=l2 / self.n,
93+
resultsToCompute=_results_to_compute(hessian=self.compute_hess)
94+
)
95+
96+
97+
def test_fit(X, y, penalty='l2', C=1.0, fit_intercept=True, tol=1e-4,
98+
max_iter=100, solver='lbfgs', verbose=0):
99+
100+
if penalty == 'l2':
101+
l2 = 0.5 / C
102+
else:
103+
l2 = 0.0
104+
105+
n_features = X.shape[1]
106+
n_classes = len(np.unique(y))
107+
compute_hessian = (solver == 'newton-cg')
108+
109+
if n_classes == 2:
110+
# Use the standard logistic regression formulation
111+
multi_class = 'ovr'
112+
else:
113+
# Use the multinomial logistic regression formulation
114+
multi_class = 'multinomial'
115+
116+
if multi_class == 'ovr':
117+
beta = np.zeros(n_features + 1, dtype='f8')
118+
loss_obj = LogisticLoss(X, y, beta, fit_intercept=fit_intercept, l2=l2,
119+
hess=compute_hessian)
120+
else:
121+
beta = np.zeros((n_classes, n_features + 1), dtype='f8')
122+
beta = beta.ravel()
123+
loss_obj = CrossEntropyLoss(n_classes, X, y, beta,
124+
hess=compute_hessian,
125+
fit_intercept=fit_intercept, l2=l2)
126+
127+
if solver == 'lbfgs':
128+
result = scipy.optimize.minimize(loss_obj.get_value, beta,
129+
method='L-BFGS-B',
130+
jac=loss_obj.get_grad,
131+
options=dict(disp=verbose, gtol=tol))
132+
else:
133+
result = scipy.optimize.minimize(loss_obj.get_value, beta,
134+
method='Newton-CG',
135+
jac=loss_obj.get_grad,
136+
hess=loss_obj.get_hess,
137+
options=dict(disp=verbose))
138+
139+
beta = result.x
140+
beta_n_classes = n_classes if n_classes > 2 else 1
141+
beta = beta.reshape((beta_n_classes, n_features + 1))
142+
143+
return beta[:, 1:], beta[:, 0], result, multi_class
144+
145+
146+
def test_predict(X, beta, intercept=0, multi_class='ovr'):
147+
148+
fptype = getFPType(X)
149+
150+
scores = np.dot(X, beta.T) + intercept
151+
if multi_class == 'ovr':
152+
# use binary logistic regressions and normalize
153+
logistic = math_logistic(fptype=fptype, method='defaultDense')
154+
prob = logistic.compute(scores).value
155+
if prob.shape[1] == 1:
156+
return np.c_[1 - prob, prob]
157+
else:
158+
return prob / prob.sum(axis=1)[:, np.newaxis]
159+
else:
160+
# use softmax of exponentiated scores
161+
if scores.shape[1] == 1:
162+
scores = np.c_[-scores, scores]
163+
softmax = math_softmax(fptype=fptype, method='defaultDense')
164+
return softmax.compute(scores).value
165+
166+
167+
if __name__ == '__main__':
168+
import argparse
169+
170+
def getArguments(argParser):
171+
argParser.add_argument('--prefix', type=str, default='daal4py',
172+
help="Identifier of the bench being executed")
173+
argParser.add_argument('--fileX', type=argparse.FileType('r'),
174+
help="Input file with features")
175+
argParser.add_argument('--fileY', type=argparse.FileType('r'),
176+
help="Input file with labels")
177+
argParser.add_argument('--intercept', action="store_true",
178+
help="Whether to fit intercept")
179+
argParser.add_argument('--solver', default='lbfgs',
180+
choices=['lbfgs', 'newton-cg'],
181+
help="Solver to use.")
182+
argParser.add_argument('--maxiter', type=int, default=100,
183+
help="Maximum iterations setting for the "
184+
"iterative solver of choice")
185+
argParser.add_argument('--fit-repetitions', dest="fit_inner_reps",
186+
type=int, default=1,
187+
help="Count of operations whose execution time "
188+
"is being clocked, average time reported")
189+
argParser.add_argument('--fit-samples', dest="fit_outer_reps",
190+
type=int, default=5,
191+
help="Count of repetitions of time "
192+
"measurements to collect statistics ")
193+
argParser.add_argument('--verbose', action="store_const",
194+
const=1, default=0,
195+
help="Whether to print additional information.")
196+
argParser.add_argument('--header', action="store_true",
197+
help="Whether to print header.")
198+
argParser.add_argument('--predict-repetitions',
199+
dest="predict_inner_reps", type=int, default=50,
200+
help="Count of operations whose execution time "
201+
"is being clocked, average time reported")
202+
argParser.add_argument('--predict-samples', dest="predict_outer_reps",
203+
type=int, default=5,
204+
help="Count of repetitions of time "
205+
"measurements to collect statistics ")
206+
argParser.add_argument('--num-threads', type=int, dest="num_threads",
207+
default=0,
208+
help="Number of threads for DAAL to use")
209+
210+
args = argParser.parse_args()
211+
return args
212+
213+
argParser = argparse.ArgumentParser(description="daal4py logistic "
214+
"regression benchmark")
215+
216+
args = getArguments(argParser)
217+
218+
num_threads, daal_version = prepare_benchmark(args)
219+
220+
import timeit
221+
222+
X = np.load(args.fileX.name)
223+
y = np.load(args.fileY.name)
224+
225+
if args.verbose:
226+
print("@ {", end='')
227+
print(" FIT_SAMPLES : {0}, FIT_REPETITIONS : {1},"
228+
" PREDICT_SAMPLES: {2}, PREDICT_REPETITIONS: {3}".format(
229+
args.fit_outer_reps, args.fit_inner_reps,
230+
args.predict_outer_reps, args.predict_inner_reps
231+
), end='')
232+
print("}")
233+
234+
C = 1.0
235+
tol = 1e-3 if args.solver == 'newton-cg' else 1e-10
236+
fit_intercept = args.intercept
237+
238+
if args.verbose:
239+
print("@ {", end='')
240+
print("'fit_intercept' : {0}, 'C' : {1}, 'max_iter' : {2}, "
241+
"'tol' : {3}, 'solver' : {4}".format(
242+
fit_intercept, C, args.maxiter, tol, args.solver
243+
), end='')
244+
print("}")
245+
246+
fit_times = []
247+
n_iters = []
248+
for outer_it in range(args.fit_outer_reps):
249+
t0 = timeit.default_timer()
250+
for _ in range(args.fit_inner_reps):
251+
w, w0, r, mc = test_fit(X, y, penalty='l2', C=C,
252+
verbose=args.verbose,
253+
fit_intercept=fit_intercept, tol=tol,
254+
max_iter=args.maxiter, solver=args.solver)
255+
t1 = timeit.default_timer()
256+
fit_times.append((t1 - t0) / args.fit_inner_reps)
257+
n_iters.append(r.nit)
258+
259+
predict_times = []
260+
for outer_it in range(args.predict_outer_reps):
261+
262+
t0 = timeit.default_timer()
263+
for _ in range(args.predict_inner_reps):
264+
y_proba = test_predict(X, w, intercept=w0, multi_class=mc)
265+
y_pred = np.argmax(y_proba, axis=1)
266+
t1 = timeit.default_timer()
267+
predict_times.append((t1 - t0) / args.predict_inner_reps)
268+
269+
acc = np.mean(abs(y_pred - y) < 0.5)
270+
271+
def num_classes(c):
272+
if c.shape[0] == 1:
273+
return 2
274+
else:
275+
return c.shape[0]
276+
277+
if args.header:
278+
print("prefix_ID,function,solver,threads,rows,features,fit,predict,"
279+
"accuracy,classes")
280+
print(",".join((
281+
args.prefix,
282+
'log_reg',
283+
args.solver,
284+
"Serial" if num_threads == 1 else "Threaded",
285+
str(X.shape[0]),
286+
str(X.shape[1]),
287+
"{0:.3f}".format(min(fit_times)),
288+
"{0:.3f}".format(min(predict_times)),
289+
"{0:.4f}".format(100*acc),
290+
str(num_classes(w))
291+
)))
292+
293+
if args.verbose:
294+
print("")
295+
print("@ Median of {0} runs of .fit averaging over {1} executions is "
296+
"{2:3.3f}".format(args.fit_outer_reps, args.fit_inner_reps,
297+
np.percentile(fit_times, 50)))
298+
print("@ Median of {0} runs of .predict averaging over {1} executions "
299+
"is {2:3.3f}".format(args.predict_outer_reps,
300+
args.predict_inner_reps,
301+
np.percentile(predict_times, 50)))
302+
print("")
303+
print("@ Number of iterations: {}".format(r.nit))
304+
print("@ fit coefficients:")
305+
print("@ {}".format(w.tolist()))
306+
print("@ fit intercept")
307+
print("@ {}".format(w0.tolist()))

0 commit comments

Comments
 (0)