
Commit c245ff5

[MRG] fix bug and add regression test (#145)
* fix bug and add regression test
* fix examples
* deprecated parameter in example and black
* black
* fix job name
* fix tests
* fix test
* fix example clustering
1 parent f664751 commit c245ff5

File tree

5 files changed: +35 -101 lines changed
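Taken together, this commit removes the deprecated eta0 and burn_in constructor parameters from the robust estimators. A minimal sketch of the post-change usage (assuming a scikit-learn-extra build that includes this commit; the toy data is purely illustrative):

# Sketch of the public API after this commit: eta0 and burn_in are no
# longer accepted by the constructors (illustrative data and values).
import numpy as np
from sklearn_extra.robust import RobustWeightedRegressor

rng = np.random.RandomState(42)
X = rng.randn(100, 3)
y = X @ np.array([1.0, 0.0, 0.0]) + 0.1 * rng.randn(100)

reg = RobustWeightedRegressor(weighting="huber", c=1, max_iter=100)
reg.fit(X, y)
print(reg.coef_)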

examples/cluster/plot_clustering.py

Lines changed: 0 additions & 1 deletion
@@ -70,7 +70,6 @@
 # Define two other clustering algorithms
 kmeans_rob = RobustWeightedKMeans(
     n_clusters,
-    eta0=0.01,
     weighting="mom",
     max_iter=100,
     k=int(n_samples / 20),

examples/robust/plot_robust_classification_diabete.py

Lines changed: 0 additions & 2 deletions
@@ -42,7 +42,6 @@
     weighting="huber",
     loss="hinge",
     c=1.35,
-    eta0=1e-3,
     max_iter=300,
 )
 
@@ -60,7 +59,6 @@
     weighting="huber",
     loss="hinge",
     c=1.35,
-    eta0=1e-3,
     max_iter=300,
     random_state=rng,
 )

examples/robust/plot_robust_regression_california_houses.py

Lines changed: 1 addition & 2 deletions
@@ -48,14 +48,13 @@ def quadratic_loss(est, X, y, X_test, y_test):
 estimators = [
     (
         "SGD",
-        SGDRegressor(learning_rate="adaptive", eta0=1e-2),
+        SGDRegressor(learning_rate="adaptive"),
     ),
     (
         "RobustWeightedRegressor",
         RobustWeightedRegressor(
             weighting="huber",
             c=0.1,
-            eta0=1e-2,
             sgd_args={
                 "learning_rate": "invscaling",
             },

sklearn_extra/robust/robust_weighted_estimator.py

Lines changed: 7 additions & 79 deletions
@@ -127,16 +127,7 @@ class _RobustWeightedEstimator(BaseEstimator):
 
     max_iter : int, default=100
         Maximum number of iterations.
-        For more information, see the optimization scheme of base_estimator
-        and the eta0 and burn_in parameter.
-
-    burn_in : int, default=10
-        Number of steps used without changing the learning rate.
-        Can be useful to make the weight estimation better at the beginning.
-
-    eta0 : float, default=0.01
-        Constant step-size used during the burn_in period. Used only if
-        burn_in>0. Can have a big effect on efficiency.
+        For more information, see the optimization scheme of base_estimator.
 
     c : float>0 or None, default=None
         Parameter used for Huber weighting procedure, used only if weightings
@@ -189,8 +180,7 @@ class _RobustWeightedEstimator(BaseEstimator):
     For now only scikit-learn SGDRegressor and SGDClassifier are officially
     supported but one can use any estimator compatible with scikit-learn,
     as long as this estimator support partial_fit, warm_start and sample_weight
-    . It must have the parameters max_iter and support "constant" learning rate
-    with learning rate called "eta0".
+    . It must have the parameters max_iter.
 
     For now, only binary classification is implemented. See sklearn.multiclass
     if you want to use this algorithm in multiclass classification.
@@ -221,8 +211,6 @@ def __init__(
         loss,
         weighting="huber",
         max_iter=100,
-        burn_in=10,
-        eta0=0.1,
         c=None,
         k=0,
         tol=1e-5,
@@ -232,8 +220,6 @@ def __init__(
     ):
         self.base_estimator = base_estimator
         self.weighting = weighting
-        self.eta0 = eta0
-        self.burn_in = burn_in
         self.c = c
         self.k = k
         self.loss = loss
@@ -282,16 +268,10 @@ def fit(self, X, y=None):
         if ("loss" in parameters) and (loss_param != "squared_error"):
             base_estimator.set_params(loss=loss_param)
 
-        if "eta0" in parameters:
-            base_estimator.set_params(eta0=self.eta0)
-
         if "n_iter_no_change" in parameters:
             base_estimator.set_params(n_iter_no_change=self.n_iter_no_change)
 
         base_estimator.set_params(random_state=random_state)
-        if self.burn_in > 0:
-            learning_rate = base_estimator.learning_rate
-            base_estimator.set_params(learning_rate="constant", eta0=self.eta0)
 
         # Initialization
         if self._estimator_type == "classifier":
@@ -329,11 +309,6 @@ def fit(self, X, y=None):
         # Optimization algorithm
         for epoch in range(self.max_iter):
 
-            if epoch > self.burn_in and self.burn_in > 0:
-                # If not in the burn_in phase anymore, change the learning_rate
-                # calibration to the one edicted by self.base_estimator.
-                base_estimator.set_params(learning_rate=learning_rate)
-
             if self._estimator_type == "classifier":
                 # If in classification, use decision_function
                 pred = base_estimator.decision_function(X)
@@ -448,12 +423,6 @@ def _validate_hyperparameters(self, n):
         if not (self.c is None) and (self.c <= 0):
             raise ValueError("c must be > 0, got %s." % self.c)
 
-        if self.burn_in < 0:
-            raise ValueError("burn_in must be >= 0, got %s." % self.burn_in)
-
-        if (self.burn_in > 0) and (self.eta0 <= 0):
-            raise ValueError("eta0 must be > 0, got %s." % self.eta0)
-
         if not (self.k is None) and (
             not isinstance(self.k, int)
             or self.k < 0
@@ -619,16 +588,7 @@ class RobustWeightedClassifier(BaseEstimator, ClassifierMixin):
 
     max_iter : int, default=100
         Maximum number of iterations.
-        For more information, see the optimization scheme of base_estimator
-        and the eta0 and burn_in parameter.
-
-    burn_in : int, default=10
-        Number of steps used without changing the learning rate.
-        Can be useful to make the weight estimation better at the beginning.
-
-    eta0 : float, default=0.01
-        Constant step-size used during the burn_in period. Used only if
-        burn_in>0. Can have a big effect on efficiency.
+        For more information, see the optimization scheme of base_estimator.
 
     c : float>0 or None, default=None
         Parameter used for Huber weighting procedure, used only if weightings
@@ -748,8 +708,6 @@ def __init__(
         self,
         weighting="huber",
         max_iter=100,
-        burn_in=10,
-        eta0=0.01,
         c=None,
         k=0,
         loss="log",
@@ -763,8 +721,6 @@ def __init__(
     ):
         self.weighting = weighting
         self.max_iter = max_iter
-        self.burn_in = burn_in
-        self.eta0 = eta0
         self.c = c
         self.k = k
         self.loss = loss
@@ -802,13 +758,11 @@ def fit(self, X, y):
         X, y = self._validate_data(X, y, y_numeric=False)
 
         base_robust_estimator_ = _RobustWeightedEstimator(
-            SGDClassifier(**sgd_args, eta0=self.eta0),
+            SGDClassifier(**sgd_args),
             weighting=self.weighting,
             loss=self.loss,
-            burn_in=self.burn_in,
             c=self.c,
             k=self.k,
-            eta0=self.eta0,
             max_iter=self.max_iter,
             tol=self.tol,
             n_iter_no_change=self.n_iter_no_change,
@@ -951,16 +905,7 @@ class RobustWeightedRegressor(BaseEstimator, RegressorMixin):
 
     max_iter : int, default=100
         Maximum number of iterations.
-        For more information, see the optimization scheme of base_estimator
-        and the eta0 and burn_in parameter.
-
-    burn_in : int, default=10
-        Number of steps used without changing the learning rate.
-        Can be useful to make the weight estimation better at the beginning.
-
-    eta0 : float, default=0.01
-        Constant step-size used during the burn_in period. Used only if
-        burn_in>0. Can have a big effect on efficiency.
+        For more information, see the optimization scheme of base_estimator.
 
     c : float>0 or None, default=None
         Parameter used for Huber weighting procedure, used only if weightings
@@ -1062,8 +1007,6 @@ def __init__(
         self,
         weighting="huber",
         max_iter=100,
-        burn_in=10,
-        eta0=0.01,
         c=None,
         k=0,
         loss=SQ_LOSS,
@@ -1076,8 +1019,6 @@ def __init__(
 
         self.weighting = weighting
         self.max_iter = max_iter
-        self.burn_in = burn_in
-        self.eta0 = eta0
         self.c = c
         self.k = k
         self.loss = loss
@@ -1113,13 +1054,11 @@ def fit(self, X, y):
         X, y = self._validate_data(X, y, y_numeric=True)
 
         self.base_estimator_ = _RobustWeightedEstimator(
-            SGDRegressor(**sgd_args, eta0=self.eta0),
+            SGDRegressor(**sgd_args),
             weighting=self.weighting,
             loss=self.loss,
-            burn_in=self.burn_in,
             c=self.c,
             k=self.k,
-            eta0=self.eta0,
             max_iter=self.max_iter,
             tol=self.tol,
             n_iter_no_change=self.n_iter_no_change,
@@ -1202,12 +1141,7 @@ class RobustWeightedKMeans(BaseEstimator, ClusterMixin):
 
     max_iter : int, default=100
         Maximum number of iterations.
-        For more information, see the optimization scheme of base_estimator
-        and the eta0 and burn_in parameter.
-
-    eta0 : float, default=0.01
-        Constant step-size used during the burn_in period. Used only if
-        burn_in>0. Can have a big effect on efficiency.
+        For more information, see the optimization scheme of base_estimator.
 
     c : float>0 or None, default=None
         Parameter used for Huber weighting procedure, used only if weightings
@@ -1314,7 +1248,6 @@ def __init__(
         n_clusters=8,
         weighting="huber",
         max_iter=100,
-        eta0=0.01,
         c=None,
         k=0,
         kmeans_args=None,
@@ -1326,7 +1259,6 @@ def __init__(
         self.n_clusters = n_clusters
         self.weighting = weighting
         self.max_iter = max_iter
-        self.eta0 = eta0
         self.c = c
         self.k = k
         self.kmeans_args = kmeans_args
@@ -1369,13 +1301,9 @@ def fit(self, X, y=None):
                 random_state=self.random_state,
                 **kmeans_args
             ),
-            burn_in=0,  # Important because it does not mean anything to
-            # have burn-in
-            # steps for kmeans. It must be 0.
             weighting=self.weighting,
             loss=_kmeans_loss,
             max_iter=self.max_iter,
-            eta0=self.eta0,
             c=self.c,
             k=self.k,
             tol=self.tol,
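
With eta0 and burn_in gone from _RobustWeightedEstimator, any step-size or learning-rate tuning now goes through the wrapped SGD estimator itself. A hedged sketch of how that looks via the public sgd_args pass-through shown in this diff (the parameter values below are illustrative, not recommendations):

# Sketch: after this commit, learning-rate settings are forwarded to the
# underlying SGDClassifier/SGDRegressor via sgd_args rather than set
# through eta0/burn_in on the robust wrapper (illustrative values only).
import numpy as np
from sklearn_extra.robust import RobustWeightedClassifier

rng = np.random.RandomState(0)
X = rng.randn(200, 2)
y = (X[:, 0] > 0).astype(int)

clf = RobustWeightedClassifier(
    weighting="huber",
    c=1.35,
    max_iter=100,
    sgd_args={"learning_rate": "adaptive", "eta0": 1e-2},  # passed to SGDClassifier
    random_state=rng,
)
clf.fit(X, y)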

sklearn_extra/robust/tests/test_robust_weighted_estimator.py

Lines changed: 27 additions & 17 deletions
@@ -85,14 +85,6 @@ def test_robust_estimator_input_validation_and_fit_check():
     with pytest.raises(ValueError, match=msg):
         RobustWeightedKMeans(c=0).fit(X_cc)
 
-    msg = "burn_in must be >= 0, got -1."
-    with pytest.raises(ValueError, match=msg):
-        RobustWeightedClassifier(burn_in=-1).fit(X_cc, y_cc)
-
-    msg = "eta0 must be > 0, got 0."
-    with pytest.raises(ValueError, match=msg):
-        RobustWeightedClassifier(burn_in=1, eta0=0).fit(X_cc, y_cc)
-
     msg = "k must be integer >= 0, and smaller than floor"
     with pytest.raises(ValueError, match=msg):
         RobustWeightedKMeans(k=-1).fit(X_cc)
@@ -145,7 +137,6 @@ def test_not_robust_classif(loss, weighting, multi_class):
         weighting=weighting,
         k=0,
         c=1e7,
-        burn_in=0,
         multi_class=multi_class,
         random_state=rng,
     )
@@ -172,7 +163,6 @@ def test_classif_binary(weighting):
         weighting=weighting,
         k=0,
         c=1e7,
-        burn_in=0,
         multi_class="binary",
         random_state=rng,
     )
@@ -203,7 +193,6 @@ def test_classif_corrupted_weights(weighting):
         weighting=weighting,
         k=5,
         c=1,
-        burn_in=0,
         multi_class="binary",
         random_state=rng,
     )
@@ -219,7 +208,6 @@ def test_predict_proba(weighting):
         weighting=weighting,
         k=0,
         c=1e7,
-        burn_in=0,
         random_state=rng,
     )
     clf_not_rob = SGDClassifier(loss="log", random_state=rng)
@@ -268,6 +256,31 @@ def test_corrupted_regression(loss, weighting, k, c):
     assert np.abs(reg.intercept_[0]) < 0.3
 
 
+@pytest.mark.parametrize("loss", regression_losses)
+@pytest.mark.parametrize("weighting", weightings)
+def test_corrupted_regression_multidim(loss, weighting):
+
+    n = 1000
+    d = 10
+
+    coef = np.zeros((d, 1))
+    coef[0, 0] = 1
+    X = np.array(np.random.randn(n, d))
+    y = X @ coef + np.array(np.random.randn(n, 1))
+
+    reg = RobustWeightedRegressor(
+        loss=loss,
+        max_iter=100,
+        weighting=weighting,
+        k=1,
+        c=1,
+        random_state=rng,
+        n_iter_no_change=20,
+    )
+    reg.fit(X, y)
+    assert np.linalg.norm(reg.coef_ - coef) < 2 * np.sqrt(d)
+
+
 # Check that weights_ parameter can be used as outlier score.
 @pytest.mark.parametrize("weighting", weightings)
 def test_regression_corrupted_weights(weighting):
@@ -276,7 +289,6 @@ def test_regression_corrupted_weights(weighting):
         weighting=weighting,
         k=5,
         c=1,
-        burn_in=0,
         random_state=rng,
     )
     reg.fit(X_rc, y_rc)
@@ -297,7 +309,6 @@ def test_not_robust_regression(loss, weighting):
         weighting=weighting,
         k=0,
         c=1e7,
-        burn_in=0,
         random_state=rng,
     )
     reg_not_rob = SGDRegressor(loss=loss, random_state=rng)
@@ -308,7 +319,7 @@ def test_not_robust_regression(loss, weighting):
     difference = [
         np.linalg.norm(pred1[i] - pred2[i]) for i in range(len(pred1))
     ]
-    assert np.mean(difference) < 1
+    assert np.mean(difference) < 2
     assert_almost_equal(reg.score(X_r, y_r), r2_score(y_r, reg.predict(X_r)))
 
 
@@ -325,7 +336,6 @@ def test_vs_huber():
         weighting="huber",
         k=5,
         c=1,
-        burn_in=0,
         sgd_args={"learning_rate": "adaptive"},  # test sgd_args
         random_state=rng,
    )
@@ -394,7 +404,7 @@ def test_not_robust_cluster(weighting):
     difference = [
         np.linalg.norm(pred1[i] - pred2[i]) for i in range(len(pred1))
     ]
-    assert np.mean(difference) < 1
+    assert np.mean(difference) < 2
 
 
 def test_transform():
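
To exercise the new regression test locally, one option (assuming pytest is available in the environment) is to select it by name from the repository root:

# Run only the new multidimensional regression test added by this commit
# (assumes pytest is installed and the repo root is the working directory).
import pytest

pytest.main([
    "sklearn_extra/robust/tests/test_robust_weighted_estimator.py",
    "-k", "test_corrupted_regression_multidim",
])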
