
Commit 8f73c23

Update scaling and initialization.

Regularization scaling is now ON by default. I think this is sensible, because it keeps the choice of regularization strength independent of the data split. Adagrad seems very sensitive to the initial norm of P, so I changed the initialization to unit variance rather than 0.01. This makes the benchmark more reasonable, but the norms are still weird. Finicky tests (FM warm starts) had to be updated, but most things behave well.
Parent: 292512b
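
To make the scaling concrete, here is a small illustration that is not part of the commit (the numbers are made up): with scale_regularization=True the objective is the mean loss plus 0.5 * beta * ||params||^2, and multiplying that by n_samples gives back a summed-loss objective whose effective penalty is 0.5 * beta * n_samples, which is the conversion the new branch in fit() applies for the CD solver.

import numpy as np

rng = np.random.RandomState(0)
n_samples, beta = 500, 0.01
losses = rng.rand(n_samples)   # stand-in per-sample losses l_i
penalty = 3.7                  # stand-in for ||params||^2

scaled = losses.mean() + 0.5 * beta * penalty    # scale_regularization=True
unscaled = losses.sum() + beta * penalty         # scale_regularization=False

# n_samples * scaled equals the summed-loss form with an effective
# penalty of 0.5 * beta * n_samples (same minimizer, rescaled beta).
print(np.isclose(n_samples * scaled,
                 losses.sum() + 0.5 * beta * n_samples * penalty))  # True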

File tree

4 files changed: +94 lines, -53 lines

benchmarks/bench_20newsgroups.py

Lines changed: 23 additions & 17 deletions
@@ -16,42 +16,43 @@
 
 
 estimators = {
-    'fm-2': FactorizationMachineClassifier(beta=0.0001,
-                                            n_components=30,
+    'fm-2': FactorizationMachineClassifier(beta=1e-15,
+                                            scale_regularization=True,
+                                            n_components=12,
                                             fit_linear=False,
                                             fit_lower=None,
                                             degree=2,
                                             random_state=0,
-                                            max_iter=20,
+                                            max_iter=100,
                                             tol=1e-10),
 
-    'polynet-2': PolynomialNetworkClassifier(beta=0.0001,
-                                              n_components=15,
+    'polynet-2': PolynomialNetworkClassifier(beta=1e-15,
+                                              n_components=6,
                                               degree=2,
                                               fit_lower=None,
-                                              max_iter=20,
+                                              max_iter=100,
                                               random_state=0,
                                               tol=1e-10)
 }
 
 estimators['fm-3'] = clone(estimators['fm-2']).set_params(degree=3)
-estimators['fm-2-ada'] = clone(estimators['fm-2']).set_params(
-    solver='adagrad')
-estimators['fm-3-ada'] = clone(estimators['fm-3']).set_params(
-    solver='adagrad')
+estimators['fm-2-ada'] = clone(estimators['fm-2']).set_params(solver='adagrad')
+estimators['fm-3-ada'] = clone(estimators['fm-3']).set_params(solver='adagrad')
 estimators['polynet-3'] = (clone(estimators['polynet-2'])
-                           .set_params(degree=3, n_components=10))
+                           .set_params(degree=3, n_components=4))
 
 if __name__ == '__main__':
     data_train = fetch_20newsgroups_vectorized(subset="train")
     data_test = fetch_20newsgroups_vectorized(subset="test")
-    X_train_csc = sp.csc_matrix(data_train.data)
-    X_test_csc = sp.csc_matrix(data_test.data)
-    X_train_csr = sp.csr_matrix(data_train.data)
-    X_test_csr = sp.csr_matrix(data_test.data)
+    train_mask = np.in1d(data_train.target, [0, 15])  # atheism vs christianity
+    test_mask = np.in1d(data_test.target, [0, 15])  # atheism vs christianity
+    X_train_csc = sp.csc_matrix(data_train.data[train_mask])
+    X_test_csc = sp.csc_matrix(data_test.data[test_mask])
+    X_train_csr = sp.csr_matrix(data_train.data[train_mask])
+    X_test_csr = sp.csr_matrix(data_test.data[test_mask])
 
-    y_train = data_train.target == 0  # atheism vs rest
-    y_test = data_test.target == 0
+    y_train = data_train.target[train_mask] == 0  # atheism is positive
+    y_test = data_test.target[test_mask] == 0
 
     print("20 newsgroups")
     print("=============")
@@ -60,6 +61,7 @@
     print("X_train density = {0}"
           "".format(X_train_csr.nnz / np.product(X_train_csr.shape)))
     print("y_train {0}".format(y_train.shape))
+    print("Training class ratio: {0}".format(y_train.mean()))
     print("X_test {0}".format(X_test_csr.shape))
     print("X_test.dtype = {0}".format(X_test_csr.dtype))
     print("y_test {0}".format(y_test.shape))
@@ -84,6 +86,10 @@
         accuracy[name] = accuracy_score(y_test, y_pred)
         f1[name] = f1_score(y_test, y_pred)
         print("done")
+        try:
+            print("||P|| =", np.linalg.norm(clf.P_))
+        except:
+            pass
 
     print("Classification performance:")
     print("===========================")
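
As a small aside on the masking pattern above, a standalone sketch with synthetic targets (not the real 20 newsgroups labels): np.in1d builds a boolean mask that keeps only the two classes of interest, and the class ratio the benchmark now prints is simply the mean of the boolean labels.

import numpy as np

target = np.array([0, 3, 15, 7, 0, 15, 15, 2])  # fake newsgroup labels
mask = np.in1d(target, [0, 15])                 # keep classes 0 and 15 only
y = target[mask] == 0                           # class 0 (atheism) is positive
print(mask)      # [ True False  True False  True  True  True False]
print(y.mean())  # 0.4, i.e. the "Training class ratio" line above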

polylearn/factorization_machine.py

Lines changed: 51 additions & 21 deletions
@@ -32,9 +32,10 @@ class _BaseFactorizationMachine(six.with_metaclass(ABCMeta, _BasePoly)):
     @abstractmethod
     def __init__(self, degree=2, loss='squared', n_components=2, alpha=1,
                  beta=1, tol=1e-6, fit_lower='explicit', fit_linear=True,
-                 learning_rate=0.001, solver='cd', warm_start=False,
-                 init_lambdas='ones', max_iter=10000, verbose=False,
-                 callback=None, n_calls=100, random_state=None):
+                 learning_rate=0.001, scale_regularization=True,
+                 solver='cd', warm_start=False, init_lambdas='ones',
+                 max_iter=10000, verbose=False, callback=None, n_calls=100,
+                 random_state=None):
         self.degree = degree
         self.loss = loss
         self.n_components = n_components
@@ -44,6 +45,7 @@ def __init__(self, degree=2, loss='squared', n_components=2, alpha=1,
         self.fit_lower = fit_lower
         self.fit_linear = fit_linear
         self.learning_rate = learning_rate
+        self.scale_regularization = scale_regularization
         self.solver = solver
         self.warm_start = warm_start
         self.init_lambdas = init_lambdas
@@ -82,10 +84,20 @@ def fit(self, X, y):
 
         X, y = self._check_X_y(X, y)
         X = self._augment(X)
-        n_features = X.shape[1]  # augmented
+        n_samples, n_features = X.shape  # augmented
         rng = check_random_state(self.random_state)
         loss_obj = self._get_loss(self.loss)
 
+        # Scale regularization params to make losses equivalent.
+        if self.scale_regularization and self.solver == 'cd':
+            alpha = 0.5 * self.alpha * n_samples
+            beta = 0.5 * self.beta * n_samples
+        elif not self.scale_regularization and self.solver == 'adagrad':
+            alpha = self.alpha / 0.5 * n_samples
+            beta = self.beta / 0.5 * n_samples
+        else:
+            alpha, beta = self.alpha, self.beta
+
         if not (self.warm_start and hasattr(self, 'w_')):
             self.w_ = np.zeros(n_features, dtype=np.double)
 
@@ -95,7 +107,7 @@ def fit(self, X, y):
             n_orders = 1
 
         if not (self.warm_start and hasattr(self, 'P_')):
-            self.P_ = 0.01 * rng.randn(n_orders, self.n_components, n_features)
+            self.P_ = rng.randn(n_orders, self.n_components, n_features)
             if 'ada' in self.solver:
                 # ensure each slice P[0], P[1]... is in F-order
                 self.P_ = np.transpose(self.P_, [1, 2, 0])
@@ -125,7 +137,7 @@ def fit(self, X, y):
 
             converged = _cd_direct_ho(self.P_, self.w_, dataset, X_col_norms,
                                       y, y_pred, self.lams_, self.degree,
-                                      self.alpha, self.beta, self.fit_linear,
+                                      alpha, beta, self.fit_linear,
                                       self.fit_lower == 'explicit', loss_obj,
                                       self.max_iter, self.tol, self.verbose)
             if not converged:
@@ -141,9 +153,9 @@ def fit(self, X, y):
 
             dataset = get_dataset(X, order="c")
             _fast_fm_adagrad(self, self.w_, self.P_[0], dataset, y,
-                             self.degree, self.alpha, self.beta,
-                             self.fit_linear, loss_obj, self.max_iter,
-                             self.learning_rate, self.callback, self.n_calls)
+                             self.degree, alpha, beta, self.fit_linear,
+                             loss_obj, self.max_iter, self.learning_rate,
+                             self.callback, self.n_calls)
         return self
 
     def _get_output(self, X):
@@ -212,9 +224,17 @@ class FactorizationMachineRegressor(_BaseFactorizationMachine,
         coordinate descent. If False, the model can still capture linear
         effects if ``fit_lower == 'augment'``.
 
-    learning_rate: double, default: 0.001
+    learning_rate : double, default: 0.001
         Learning rate for 'adagrad' solver. Ignored by other solvers.
 
+    scale_regularization : boolean, default: True
+        Whether to adjust regularization according to the number of samples.
+        This helps if, after tuning regularization, the model will be retrained
+        on more data.
+
+        If set, the loss optimized is mean_i(l_i) + 0.5 || params || ^2
+        If not set, the loss becomes sum_i(l_i) + || params || ^ 2
+
     solver : {'cd'|'adagrad'}, default: 'cd'
         - 'cd': Uses a coordinate descent solver. Currently limited to
           degree=3.
@@ -292,14 +312,15 @@ class FactorizationMachineRegressor(_BaseFactorizationMachine,
     """
     def __init__(self, degree=2, n_components=2, alpha=1, beta=1, tol=1e-6,
                  fit_lower='explicit', fit_linear=True, learning_rate=0.001,
-                 solver='cd', warm_start=False, init_lambdas='ones',
-                 max_iter=10000, verbose=False, callback=None, n_calls=100,
-                 random_state=None):
+                 scale_regularization=True, solver='cd', warm_start=False,
+                 init_lambdas='ones', max_iter=10000, verbose=False,
+                 callback=None, n_calls=100, random_state=None):
 
         super(FactorizationMachineRegressor, self).__init__(
             degree, 'squared', n_components, alpha, beta, tol, fit_lower,
-            fit_linear, learning_rate, solver, warm_start, init_lambdas,
-            max_iter, verbose, callback, n_calls, random_state)
+            fit_linear, learning_rate, scale_regularization, solver,
+            warm_start, init_lambdas, max_iter, verbose, callback, n_calls,
+            random_state)
 
 
 class FactorizationMachineClassifier(_BaseFactorizationMachine,
@@ -355,9 +376,17 @@ class FactorizationMachineClassifier(_BaseFactorizationMachine,
         coordinate descent. If False, the model can still capture linear
         effects if ``fit_lower == 'augment'``.
 
-    learning_rate: double, default: 0.001
+    learning_rate : double, default: 0.001
         Learning rate for 'adagrad' solver. Ignored by other solvers.
 
+    scale_regularization : boolean, default: True
+        Whether to adjust regularization according to the number of samples.
+        This helps if, after tuning regularization, the model will be retrained
+        on more data.
+
+        If set, the loss optimized is mean_i(l_i) + 0.5 || params || ^2
+        If not set, the loss becomes sum_i(l_i) + || params || ^ 2
+
     solver : {'cd'|'adagrad'}, default: 'cd'
         - 'cd': Uses a coordinate descent solver. Currently limited to
           degree=3.
@@ -436,11 +465,12 @@ class FactorizationMachineClassifier(_BaseFactorizationMachine,
 
     def __init__(self, degree=2, loss='squared_hinge', n_components=2, alpha=1,
                  beta=1, tol=1e-6, fit_lower='explicit', fit_linear=True,
-                 learning_rate=0.001, solver='cd', warm_start=False,
-                 init_lambdas='ones', max_iter=10000, verbose=False,
-                 callback=None, n_calls=100, random_state=None):
+                 learning_rate=0.001, scale_regularization=True, solver='cd',
+                 warm_start=False, init_lambdas='ones', max_iter=10000,
+                 verbose=False, callback=None, n_calls=100, random_state=None):
 
         super(FactorizationMachineClassifier, self).__init__(
             degree, loss, n_components, alpha, beta, tol, fit_lower,
-            fit_linear, learning_rate, solver, warm_start, init_lambdas,
-            max_iter, verbose, callback, n_calls, random_state)
+            fit_linear, learning_rate, scale_regularization, solver,
+            warm_start, init_lambdas, max_iter, verbose, callback, n_calls,
+            random_state)
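
A minimal usage sketch of the new parameter (the estimator and scale_regularization come from this commit; the data and hyperparameter values below are made up): with scaling on, a beta tuned on a subsample should stay reasonable when the model is refit on the full training set.

import numpy as np
from polylearn import FactorizationMachineClassifier

rng = np.random.RandomState(0)
X = rng.randn(200, 10)
y = (X[:, 0] * X[:, 1] > 0).astype(int)  # a purely pairwise target

fm = FactorizationMachineClassifier(degree=2, n_components=5, beta=1e-3,
                                    scale_regularization=True,  # new default
                                    random_state=0)
fm.fit(X[:100], y[:100])   # tune beta on half the data...
fm.fit(X, y)               # ...then refit on everything with the same beta
print((fm.predict(X) == y).mean())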

polylearn/tests/test_adagrad.py

Lines changed: 12 additions & 7 deletions
@@ -12,13 +12,17 @@
 from .test_kernels import dumb_anova_grad
 
 
-def sg_adagrad_slow(P, X, y, degree, beta, max_iter, learning_rate):
+def sg_adagrad_slow(P, X, y, degree, beta, max_iter, learning_rate,
+                    scale_regularization=True):
 
     n_samples = X.shape[0]
     n_components = P.shape[0]
 
     grad_norms = np.zeros_like(P)
 
+    if not scale_regularization:
+        beta /= 0.5 * n_samples
+
     for it in range(max_iter):
 
         for i in range(n_samples):
@@ -92,13 +96,14 @@ def test_adagrad_decrease():
 def check_adagrad_fit(degree):
     y = _poly_predict(X, P, lams, kernel="anova", degree=degree)
 
-    est = FactorizationMachineRegressor(degree=degree, n_components=3,
+    est = FactorizationMachineRegressor(degree=degree, n_components=5,
                                         fit_linear=True, fit_lower=None,
                                         solver='adagrad',
                                         init_lambdas='ones',
-                                        max_iter=30000,
-                                        learning_rate=0.1,
-                                        beta=1e-8,
+                                        max_iter=2000,
+                                        learning_rate=0.25,
+                                        alpha=1e-10,
+                                        beta=1e-10,
                                         random_state=0)
 
     est.fit(X, y)
@@ -116,7 +121,7 @@ def test_adagrad_fit():
 
 
 def check_adagrad_same_as_slow(degree, sparse):
-    beta = 0.00001
+    beta = 1e-5
     lr = 0.01
 
     if sparse:
@@ -128,7 +133,7 @@ def check_adagrad_same_as_slow(degree, sparse):
 
     y = _poly_predict(X, P, lams, kernel="anova", degree=degree)
 
-    P_fast = 0.01 * np.random.RandomState(42).randn(1, P.shape[0], P.shape[1])
+    P_fast = np.random.RandomState(42).randn(1, P.shape[0], P.shape[1])
     P_slow = P_fast[0].copy()
 
     reg = FactorizationMachineRegressor(degree=degree, n_components=P.shape[0],
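
For readers unfamiliar with the solver under test, a generic AdaGrad step (the textbook rule, not the library's sg_adagrad_slow reference, whose body is not shown here): squared gradients accumulate and shrink the effective step size, which is part of why the solver is sensitive to the initial scale of P.

import numpy as np

def adagrad_step(params, grad, grad_norms, learning_rate=0.25, eps=1e-6):
    # Accumulate squared gradients, then take a per-coordinate scaled step.
    grad_norms += grad ** 2
    params -= learning_rate * grad / (np.sqrt(grad_norms) + eps)
    return params, grad_norms

P = np.random.RandomState(0).randn(5)  # unit-variance init, as in this commit
grad_norms = np.zeros_like(P)
for _ in range(100):
    grad = 2 * P                       # gradient of ||P||^2, a stand-in objective
    P, grad_norms = adagrad_step(P, grad, grad_norms)
print(np.linalg.norm(P))               # norm shrinks towards zero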

polylearn/tests/test_factorization_machine.py

Lines changed: 8 additions & 8 deletions
@@ -26,7 +26,7 @@ def cd_direct_slow(X, y, lams=None, degree=2, n_components=5, beta=1.,
     n_samples, n_features = X.shape
 
     rng = check_random_state(random_state)
-    P = 0.01 * rng.randn(n_components, n_features)
+    P = rng.randn(n_components, n_features)
     if lams is None:
         lams = np.ones(n_components)
 
@@ -222,6 +222,7 @@ def check_same_as_slow(degree):
     y = _poly_predict(X, P, lams, kernel="anova", degree=degree)
 
     reg = FactorizationMachineRegressor(degree=degree, n_components=5,
+                                        scale_regularization=False,
                                         fit_lower=None, fit_linear=False,
                                         beta=1, warm_start=False, tol=1e-3,
                                         max_iter=5, random_state=0)
@@ -303,21 +304,20 @@ def check_warm_start(degree):
     X_train, X_test = X[:10], X[10:]
     y_train, y_test = noisy_y[:10], noisy_y[10:]
 
-    beta_low = 0.5
-    beta = 0.1
-    beta_hi = 1
+    beta_low = 0.001
+    beta = 0.002
+    beta_hi = 0.003
     ref = FactorizationMachineRegressor(degree=degree, n_components=5,
                                         fit_linear=False, fit_lower=None,
-                                        beta=beta, max_iter=20000,
-                                        random_state=0)
+                                        beta=beta, random_state=0)
     ref.fit(X_train, y_train)
     y_pred_ref = ref.predict(X_test)
 
     # (a) starting from lower beta, increasing and refitting
     from_low = FactorizationMachineRegressor(degree=degree, n_components=5,
                                              fit_lower=None, fit_linear=False,
                                              beta=beta_low, warm_start=True,
-                                             random_state=0)
+                                             max_iter=20000, random_state=0)
     from_low.fit(X_train, y_train)
     from_low.set_params(beta=beta)
     from_low.fit(X_train, y_train)
@@ -339,4 +339,4 @@ def check_warm_start(degree):
 
 def test_warm_start():
     yield check_warm_start, 2
-    yield check_warm_start, 3
+    yield check_warm_start, 3
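
For context on what check_warm_start exercises, a minimal warm-start sketch on synthetic data (all values here are illustrative): fit once at a low beta, then raise beta with set_params and refit, so the second fit starts from the previous solution instead of a fresh random P.

import numpy as np
from polylearn import FactorizationMachineRegressor

rng = np.random.RandomState(0)
X = rng.randn(20, 5)
y = rng.randn(20)

fm = FactorizationMachineRegressor(degree=2, n_components=5, fit_lower=None,
                                   fit_linear=False, beta=0.001,
                                   warm_start=True, max_iter=20000,
                                   random_state=0)
fm.fit(X, y)                 # fit at the low beta first
fm.set_params(beta=0.002)    # then increase beta...
fm.fit(X, y)                 # ...and refit from the previous w_ and P_
print(fm.predict(X[:3]))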
