
Commit c245ff5

[MRG] fix bug and add regression test (#145)
* fix bug and add regression test
* fix examples
* deprecated parameter in example and black
* black
* fix job name
* fix tests
* fix test
* fix example clustering
1 parent f664751 commit c245ff5

File tree

5 files changed: +35 -101 lines changed
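Taken together, this commit removes the deprecated eta0 and burn_in constructor parameters from the robust estimators. A minimal sketch of the post-change usage (assuming a scikit-learn-extra build that includes this commit; the toy data is purely illustrative):

# Sketch of the public API after this commit: eta0 and burn_in are no
# longer accepted by the constructors (illustrative data and values).
import numpy as np
from sklearn_extra.robust import RobustWeightedRegressor

rng = np.random.RandomState(42)
X = rng.randn(100, 3)
y = X @ np.array([1.0, 0.0, 0.0]) + 0.1 * rng.randn(100)

reg = RobustWeightedRegressor(weighting="huber", c=1, max_iter=100)
reg.fit(X, y)
print(reg.coef_)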

examples/cluster/plot_clustering.py

Lines changed: 0 additions & 1 deletion
@@ -70,7 +70,6 @@
 # Define two other clustering algorithms
 kmeans_rob = RobustWeightedKMeans(
     n_clusters,
-    eta0=0.01,
     weighting="mom",
     max_iter=100,
     k=int(n_samples / 20),

examples/robust/plot_robust_classification_diabete.py

Lines changed: 0 additions & 2 deletions
@@ -42,7 +42,6 @@
     weighting="huber",
     loss="hinge",
     c=1.35,
-    eta0=1e-3,
     max_iter=300,
 )
 
@@ -60,7 +59,6 @@
     weighting="huber",
     loss="hinge",
     c=1.35,
-    eta0=1e-3,
     max_iter=300,
     random_state=rng,
 )

examples/robust/plot_robust_regression_california_houses.py

Lines changed: 1 addition & 2 deletions
@@ -48,14 +48,13 @@ def quadratic_loss(est, X, y, X_test, y_test):
 estimators = [
     (
         "SGD",
-        SGDRegressor(learning_rate="adaptive", eta0=1e-2),
+        SGDRegressor(learning_rate="adaptive"),
     ),
     (
         "RobustWeightedRegressor",
         RobustWeightedRegressor(
             weighting="huber",
             c=0.1,
-            eta0=1e-2,
             sgd_args={
                 "learning_rate": "invscaling",
             },

sklearn_extra/robust/robust_weighted_estimator.py

Lines changed: 7 additions & 79 deletions
@@ -127,16 +127,7 @@ class _RobustWeightedEstimator(BaseEstimator):
 
     max_iter : int, default=100
         Maximum number of iterations.
-        For more information, see the optimization scheme of base_estimator
-        and the eta0 and burn_in parameter.
-
-    burn_in : int, default=10
-        Number of steps used without changing the learning rate.
-        Can be useful to make the weight estimation better at the beginning.
-
-    eta0 : float, default=0.01
-        Constant step-size used during the burn_in period. Used only if
-        burn_in>0. Can have a big effect on efficiency.
+        For more information, see the optimization scheme of base_estimator.
 
     c : float>0 or None, default=None
         Parameter used for Huber weighting procedure, used only if weightings
@@ -189,8 +180,7 @@ class _RobustWeightedEstimator(BaseEstimator):
     For now only scikit-learn SGDRegressor and SGDClassifier are officially
     supported but one can use any estimator compatible with scikit-learn,
     as long as this estimator support partial_fit, warm_start and sample_weight
-    . It must have the parameters max_iter and support "constant" learning rate
-    with learning rate called "eta0".
+    . It must have the parameters max_iter.
 
     For now, only binary classification is implemented. See sklearn.multiclass
     if you want to use this algorithm in multiclass classification.
@@ -221,8 +211,6 @@ def __init__(
         loss,
         weighting="huber",
         max_iter=100,
-        burn_in=10,
-        eta0=0.1,
         c=None,
         k=0,
         tol=1e-5,
@@ -232,8 +220,6 @@ def __init__(
     ):
         self.base_estimator = base_estimator
         self.weighting = weighting
-        self.eta0 = eta0
-        self.burn_in = burn_in
         self.c = c
         self.k = k
         self.loss = loss
@@ -282,16 +268,10 @@ def fit(self, X, y=None):
         if ("loss" in parameters) and (loss_param != "squared_error"):
             base_estimator.set_params(loss=loss_param)
 
-        if "eta0" in parameters:
-            base_estimator.set_params(eta0=self.eta0)
-
         if "n_iter_no_change" in parameters:
             base_estimator.set_params(n_iter_no_change=self.n_iter_no_change)
 
         base_estimator.set_params(random_state=random_state)
-        if self.burn_in > 0:
-            learning_rate = base_estimator.learning_rate
-            base_estimator.set_params(learning_rate="constant", eta0=self.eta0)
 
         # Initialization
         if self._estimator_type == "classifier":
@@ -329,11 +309,6 @@ def fit(self, X, y=None):
         # Optimization algorithm
         for epoch in range(self.max_iter):
 
-            if epoch > self.burn_in and self.burn_in > 0:
-                # If not in the burn_in phase anymore, change the learning_rate
-                # calibration to the one edicted by self.base_estimator.
-                base_estimator.set_params(learning_rate=learning_rate)
-
             if self._estimator_type == "classifier":
                 # If in classification, use decision_function
                 pred = base_estimator.decision_function(X)
@@ -448,12 +423,6 @@ def _validate_hyperparameters(self, n):
         if not (self.c is None) and (self.c <= 0):
             raise ValueError("c must be > 0, got %s." % self.c)
 
-        if self.burn_in < 0:
-            raise ValueError("burn_in must be >= 0, got %s." % self.burn_in)
-
-        if (self.burn_in > 0) and (self.eta0 <= 0):
-            raise ValueError("eta0 must be > 0, got %s." % self.eta0)
-
         if not (self.k is None) and (
             not isinstance(self.k, int)
             or self.k < 0
@@ -619,16 +588,7 @@ class RobustWeightedClassifier(BaseEstimator, ClassifierMixin):
 
     max_iter : int, default=100
         Maximum number of iterations.
-        For more information, see the optimization scheme of base_estimator
-        and the eta0 and burn_in parameter.
-
-    burn_in : int, default=10
-        Number of steps used without changing the learning rate.
-        Can be useful to make the weight estimation better at the beginning.
-
-    eta0 : float, default=0.01
-        Constant step-size used during the burn_in period. Used only if
-        burn_in>0. Can have a big effect on efficiency.
+        For more information, see the optimization scheme of base_estimator.
 
     c : float>0 or None, default=None
         Parameter used for Huber weighting procedure, used only if weightings
@@ -748,8 +708,6 @@ def __init__(
         self,
         weighting="huber",
         max_iter=100,
-        burn_in=10,
-        eta0=0.01,
         c=None,
         k=0,
         loss="log",
@@ -763,8 +721,6 @@ def __init__(
     ):
         self.weighting = weighting
         self.max_iter = max_iter
-        self.burn_in = burn_in
-        self.eta0 = eta0
         self.c = c
         self.k = k
         self.loss = loss
@@ -802,13 +758,11 @@ def fit(self, X, y):
         X, y = self._validate_data(X, y, y_numeric=False)
 
         base_robust_estimator_ = _RobustWeightedEstimator(
-            SGDClassifier(**sgd_args, eta0=self.eta0),
+            SGDClassifier(**sgd_args),
             weighting=self.weighting,
             loss=self.loss,
-            burn_in=self.burn_in,
             c=self.c,
             k=self.k,
-            eta0=self.eta0,
             max_iter=self.max_iter,
             tol=self.tol,
             n_iter_no_change=self.n_iter_no_change,
@@ -951,16 +905,7 @@ class RobustWeightedRegressor(BaseEstimator, RegressorMixin):
 
     max_iter : int, default=100
         Maximum number of iterations.
-        For more information, see the optimization scheme of base_estimator
-        and the eta0 and burn_in parameter.
-
-    burn_in : int, default=10
-        Number of steps used without changing the learning rate.
-        Can be useful to make the weight estimation better at the beginning.
-
-    eta0 : float, default=0.01
-        Constant step-size used during the burn_in period. Used only if
-        burn_in>0. Can have a big effect on efficiency.
+        For more information, see the optimization scheme of base_estimator.
 
     c : float>0 or None, default=None
         Parameter used for Huber weighting procedure, used only if weightings
@@ -1062,8 +1007,6 @@ def __init__(
         self,
         weighting="huber",
         max_iter=100,
-        burn_in=10,
-        eta0=0.01,
         c=None,
         k=0,
         loss=SQ_LOSS,
@@ -1076,8 +1019,6 @@ def __init__(
 
         self.weighting = weighting
         self.max_iter = max_iter
-        self.burn_in = burn_in
-        self.eta0 = eta0
         self.c = c
         self.k = k
         self.loss = loss
@@ -1113,13 +1054,11 @@ def fit(self, X, y):
         X, y = self._validate_data(X, y, y_numeric=True)
 
         self.base_estimator_ = _RobustWeightedEstimator(
-            SGDRegressor(**sgd_args, eta0=self.eta0),
+            SGDRegressor(**sgd_args),
             weighting=self.weighting,
             loss=self.loss,
-            burn_in=self.burn_in,
             c=self.c,
             k=self.k,
-            eta0=self.eta0,
             max_iter=self.max_iter,
             tol=self.tol,
             n_iter_no_change=self.n_iter_no_change,
@@ -1202,12 +1141,7 @@ class RobustWeightedKMeans(BaseEstimator, ClusterMixin):
 
     max_iter : int, default=100
         Maximum number of iterations.
-        For more information, see the optimization scheme of base_estimator
-        and the eta0 and burn_in parameter.
-
-    eta0 : float, default=0.01
-        Constant step-size used during the burn_in period. Used only if
-        burn_in>0. Can have a big effect on efficiency.
+        For more information, see the optimization scheme of base_estimator.
 
     c : float>0 or None, default=None
         Parameter used for Huber weighting procedure, used only if weightings
@@ -1314,7 +1248,6 @@ def __init__(
         n_clusters=8,
         weighting="huber",
         max_iter=100,
-        eta0=0.01,
         c=None,
         k=0,
         kmeans_args=None,
@@ -1326,7 +1259,6 @@ def __init__(
         self.n_clusters = n_clusters
         self.weighting = weighting
         self.max_iter = max_iter
-        self.eta0 = eta0
         self.c = c
         self.k = k
         self.kmeans_args = kmeans_args
@@ -1369,13 +1301,9 @@ def fit(self, X, y=None):
                 random_state=self.random_state,
                 **kmeans_args
             ),
-            burn_in=0,  # Important because it does not mean anything to
-            # have burn-in
-            # steps for kmeans. It must be 0.
             weighting=self.weighting,
             loss=_kmeans_loss,
             max_iter=self.max_iter,
-            eta0=self.eta0,
             c=self.c,
             k=self.k,
             tol=self.tol,
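
With eta0 and burn_in gone from _RobustWeightedEstimator, any step-size or learning-rate tuning now goes through the wrapped SGD estimator itself. A hedged sketch of how that looks via the public sgd_args pass-through shown in this diff (the parameter values below are illustrative, not recommendations):

# Sketch: after this commit, learning-rate settings are forwarded to the
# underlying SGDClassifier/SGDRegressor via sgd_args rather than set
# through eta0/burn_in on the robust wrapper (illustrative values only).
import numpy as np
from sklearn_extra.robust import RobustWeightedClassifier

rng = np.random.RandomState(0)
X = rng.randn(200, 2)
y = (X[:, 0] > 0).astype(int)

clf = RobustWeightedClassifier(
    weighting="huber",
    c=1.35,
    max_iter=100,
    sgd_args={"learning_rate": "adaptive", "eta0": 1e-2},  # passed to SGDClassifier
    random_state=rng,
)
clf.fit(X, y)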

sklearn_extra/robust/tests/test_robust_weighted_estimator.py

Lines changed: 27 additions & 17 deletions
@@ -85,14 +85,6 @@ def test_robust_estimator_input_validation_and_fit_check():
     with pytest.raises(ValueError, match=msg):
         RobustWeightedKMeans(c=0).fit(X_cc)
 
-    msg = "burn_in must be >= 0, got -1."
-    with pytest.raises(ValueError, match=msg):
-        RobustWeightedClassifier(burn_in=-1).fit(X_cc, y_cc)
-
-    msg = "eta0 must be > 0, got 0."
-    with pytest.raises(ValueError, match=msg):
-        RobustWeightedClassifier(burn_in=1, eta0=0).fit(X_cc, y_cc)
-
     msg = "k must be integer >= 0, and smaller than floor"
     with pytest.raises(ValueError, match=msg):
         RobustWeightedKMeans(k=-1).fit(X_cc)
@@ -145,7 +137,6 @@ def test_not_robust_classif(loss, weighting, multi_class):
         weighting=weighting,
         k=0,
         c=1e7,
-        burn_in=0,
         multi_class=multi_class,
         random_state=rng,
     )
@@ -172,7 +163,6 @@ def test_classif_binary(weighting):
         weighting=weighting,
         k=0,
         c=1e7,
-        burn_in=0,
         multi_class="binary",
         random_state=rng,
     )
@@ -203,7 +193,6 @@ def test_classif_corrupted_weights(weighting):
         weighting=weighting,
         k=5,
         c=1,
-        burn_in=0,
         multi_class="binary",
         random_state=rng,
     )
@@ -219,7 +208,6 @@ def test_predict_proba(weighting):
         weighting=weighting,
         k=0,
         c=1e7,
-        burn_in=0,
         random_state=rng,
     )
     clf_not_rob = SGDClassifier(loss="log", random_state=rng)
@@ -268,6 +256,31 @@ def test_corrupted_regression(loss, weighting, k, c):
     assert np.abs(reg.intercept_[0]) < 0.3
 
 
+@pytest.mark.parametrize("loss", regression_losses)
+@pytest.mark.parametrize("weighting", weightings)
+def test_corrupted_regression_multidim(loss, weighting):
+
+    n = 1000
+    d = 10
+
+    coef = np.zeros((d, 1))
+    coef[0, 0] = 1
+    X = np.array(np.random.randn(n, d))
+    y = X @ coef + np.array(np.random.randn(n, 1))
+
+    reg = RobustWeightedRegressor(
+        loss=loss,
+        max_iter=100,
+        weighting=weighting,
+        k=1,
+        c=1,
+        random_state=rng,
+        n_iter_no_change=20,
+    )
+    reg.fit(X, y)
+    assert np.linalg.norm(reg.coef_ - coef) < 2 * np.sqrt(d)
+
+
 # Check that weights_ parameter can be used as outlier score.
 @pytest.mark.parametrize("weighting", weightings)
 def test_regression_corrupted_weights(weighting):
@@ -276,7 +289,6 @@ def test_regression_corrupted_weights(weighting):
         weighting=weighting,
         k=5,
         c=1,
-        burn_in=0,
         random_state=rng,
     )
     reg.fit(X_rc, y_rc)
@@ -297,7 +309,6 @@ def test_not_robust_regression(loss, weighting):
         weighting=weighting,
         k=0,
         c=1e7,
-        burn_in=0,
         random_state=rng,
     )
     reg_not_rob = SGDRegressor(loss=loss, random_state=rng)
@@ -308,7 +319,7 @@ def test_not_robust_regression(loss, weighting):
     difference = [
         np.linalg.norm(pred1[i] - pred2[i]) for i in range(len(pred1))
     ]
-    assert np.mean(difference) < 1
+    assert np.mean(difference) < 2
     assert_almost_equal(reg.score(X_r, y_r), r2_score(y_r, reg.predict(X_r)))
 
 
@@ -325,7 +336,6 @@ def test_vs_huber():
         weighting="huber",
         k=5,
         c=1,
-        burn_in=0,
         sgd_args={"learning_rate": "adaptive"},  # test sgd_args
         random_state=rng,
    )
@@ -394,7 +404,7 @@ def test_not_robust_cluster(weighting):
     difference = [
         np.linalg.norm(pred1[i] - pred2[i]) for i in range(len(pred1))
     ]
-    assert np.mean(difference) < 1
+    assert np.mean(difference) < 2
 
 
 def test_transform():
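
To exercise the new regression test locally, one option (assuming pytest is available in the environment) is to select it by name from the repository root:

# Run only the new multidimensional regression test added by this commit
# (assumes pytest is installed and the repo root is the working directory).
import pytest

pytest.main([
    "sklearn_extra/robust/tests/test_robust_weighted_estimator.py",
    "-k", "test_corrupted_regression_multidim",
])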
