.. _robust:

===================================================
Robust algorithms for Regression and Classification
===================================================

.. currentmodule:: sklearn_extra.robust

Robust statistics are mostly about how to deal with data corrupted with
outliers (i.e. abnormal data, unique data in some sense).
The aim is to modify classical methods in order to deal with outliers while
losing as little efficiency as possible compared to classical (non-robust)
methods applied to non-corrupted datasets.
In particular, in machine learning, we want to bound the influence that any
minority of the dataset can have on the prediction; see the figure below for
an example in regression.

.. |robust_regression| image:: ../images/robust_plot_regression.png
   :target: ../examples/plot_robust_regression_toy.py
   :scale: 70

.. centered:: |robust_regression|

What is an outlier?
====================

The term "outlier" refers to a discordant minority of the dataset. It is
generally assumed to be a set of points situated outside the bulk of the data,
but more complex cases exist, as illustrated in the figure below.

Formally, we define outliers for a given task as points for which the loss
function takes unusually high values.
In the case of classification, one can consider that in the following scatter
plot the points in the upper-right corner are outliers while the points in the
bottom-left corner are not.

.. |outlier| image:: ../images/robust_def_outliers.png
   :scale: 80

.. centered:: |outlier|

Outliers can have many causes, among them human errors, sensor errors, or
inherent causes. They are often found, for example, in biology, econometrics,
or datasets that describe human relationships.

Here, we limit ourselves to linear estimators, but non-linear estimators are
also plagued by the same non-robustness issues. See the scikit-learn
`RANSAC documentation <https://scikit-learn.org/stable/modules/linear_model.html#ransac-random-sample-consensus>`__
for an example of outliers for non-linear estimators.

Robust estimation with robust weighting
=======================================

Many learning algorithms are based on a paradigm known as empirical risk
minimization (ERM), which consists in finding the estimator :math:`\widehat{f}`
that minimizes an estimation of the risk:

.. math::

    \widehat{f} = \text{argmin}_{f\in F}\frac{1}{n}\sum_{i=1}^n\ell(f(X_i),y_i),

where :math:`\ell` is a loss function (e.g. the squared distance in
regression problems). In other words, we are trying to minimize an estimation
of the expected risk, and this estimation corresponds to an empirical mean.
However, it is well known that the empirical mean is not robust to extreme
data, and such extreme values will have a big influence on the estimation of
:math:`\widehat{f}`. The principle behind the robust weighting algorithm is to
rely on a robust estimator (such as the median-of-means (MOM) or the Huber
estimator) in place of the empirical mean in the equation above [1]_.

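To build intuition about the median-of-means, here is a minimal sketch of a
MOM estimate of a one-dimensional mean using only NumPy; the toy data and the
number of blocks are illustrative assumptions, not part of this module.

.. code-block:: python

    import numpy as np

    rng = np.random.default_rng(0)
    # 195 well-behaved samples around 1.0 plus 5 gross outliers at 100.
    x = np.concatenate([rng.normal(loc=1.0, scale=1.0, size=195),
                        np.full(5, 100.0)])

    def median_of_means(values, n_blocks, rng):
        """Shuffle, split into blocks, average each block, take the median."""
        blocks = np.array_split(rng.permutation(values), n_blocks)
        return np.median([block.mean() for block in blocks])

    print(np.mean(x))                   # pulled towards the outliers (~3.5)
    print(median_of_means(x, 20, rng))  # close to the inlier mean (~1.0)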

In practice, one can define weights :math:`w_i` that depend on the
:math:`i^{th}` sample, with the weight :math:`w_i` being very small when
the :math:`i^{th}` sample is an outlier and large otherwise.
This way, the problem is reduced to the following optimization:

.. math::

    \min_{f}\, \frac{1}{n} \sum_{i=1}^n w_i\ell(f(X_i),y_i)

Remark that the weights :math:`w_i` depend on :math:`\widehat{f}`; the
resulting algorithm is thus an alternating optimization scheme, iteratively
doing one step to optimize with respect to :math:`f` while the weights stay
fixed, and then one step to estimate the weights while :math:`f` stays fixed.
These two steps are repeated until convergence.

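As a rough illustration of this alternating scheme (a simplified sketch, not
the actual implementation of this module), the loop below alternates one
partial_fit step of scikit-learn's SGDRegressor with a re-estimation of
median-of-means style weights that keep only the block whose mean loss is the
median; the helper names, the number of blocks and the learning rate are
illustrative assumptions.

.. code-block:: python

    import numpy as np
    from sklearn.linear_model import SGDRegressor

    def mom_weights(losses, k, rng):
        """Weight 1 for the block whose mean loss is the median, 0 elsewhere."""
        blocks = np.array_split(rng.permutation(len(losses)), 2 * k + 1)
        block_means = np.array([losses[block].mean() for block in blocks])
        median_block = blocks[np.argsort(block_means)[len(blocks) // 2]]
        weights = np.zeros(len(losses))
        weights[median_block] = 1.0
        return weights

    def robust_fit(X, y, k=5, max_iter=100, random_state=0):
        rng = np.random.default_rng(random_state)
        est = SGDRegressor(learning_rate="constant", eta0=0.01,
                           random_state=random_state)
        weights = np.ones(len(y))
        for _ in range(max_iter):
            # One optimization step with the weights held fixed.
            est.partial_fit(X, y, sample_weight=weights)
            # Re-estimate the weights with the estimator held fixed.
            losses = (y - est.predict(X)) ** 2
            weights = mom_weights(losses, k, rng)
        return est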

Robust estimation in practice
=============================

The algorithm
-------------

The approach is implemented as a meta-algorithm that takes as input a base
estimator (e.g., SGDClassifier or SGDRegressor). To be compatible, the base
estimator must support partial_fit and sample_weight. Refer to the KMeans
example for a template to adapt the method to other estimators.

At each step, the algorithm estimates sample weights that are meant to be
small for outliers and large for inliers, and then performs one optimization
step using the base_estimator optimization algorithm.

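For example, a minimal usage sketch with the RobustWeightedRegressor estimator
from this module (the toy data, the corruption and the parameter values below
are illustrative assumptions):

.. code-block:: python

    import numpy as np
    from sklearn_extra.robust import RobustWeightedRegressor

    rng = np.random.default_rng(42)
    X = rng.uniform(-1, 1, size=(100, 1))
    y = 3 * X.ravel() + rng.normal(scale=0.1, size=100)
    y[:5] = 20.0  # corrupt a few labels

    reg = RobustWeightedRegressor(weighting="mom", k=8, max_iter=100)
    reg.fit(X, y)
    print(reg.predict([[0.5]]))  # close to 1.5 despite the corrupted labels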

There are two weighting schemes supported in this algorithm: Huber-like
weights and median-of-means weights. Both types of weights come with a
parameter that determines the robustness/efficiency trade-off of the
estimation; a short configuration sketch follows the table below.

* Huber weights: the parameter "c" is a positive real number. For small
  values of c the estimator is more robust but less efficient than it is
  for large values of c.
  A good heuristic consists in choosing c as an estimate of the standard
  deviation of the losses of the inliers. In practice, if c=None, it is
  estimated with the inter-quartile range.

* Median-of-means weights: the parameter "k" is a non-negative integer.
  When k=0 the estimator is exactly the same as base_estimator, and when
  k=sample_size/2 the estimator is very robust but less efficient on inliers.
  A good heuristic consists in choosing k as an estimate of the number of
  outliers. In practice, if k=None, it is estimated as the number of points
  whose distance to the median is more than 1.45 times the inter-quartile
  range.

.. table:: Robustness/efficiency trade-off and choice of parameters
   :widths: auto
   :align: center

   +-----------+----------------------+-----------------+-----------------+
   | weighting | Robustness parameter | Small parameter | Large parameter |
   +===========+======================+=================+=================+
   | mom       | k                    | Non robust      | Robust          |
   +-----------+----------------------+-----------------+-----------------+
   | huber     | c                    | Robust          | Non robust      |
   +-----------+----------------------+-----------------+-----------------+

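For instance, both weighting schemes can be configured as follows (a sketch
with illustrative parameter values; c and k should be tuned for the data at
hand):

.. code-block:: python

    from sklearn_extra.robust import RobustWeightedClassifier

    # Huber-like weights: c is roughly the standard deviation of the inlier
    # losses; left to None, it is estimated from the inter-quartile range.
    clf_huber = RobustWeightedClassifier(weighting="huber", c=1.35, max_iter=100)

    # Median-of-means weights: k is roughly the number of suspected outliers.
    clf_mom = RobustWeightedClassifier(weighting="mom", k=10, max_iter=100)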

The choice of the optimization parameters max_iter and eta0 is also very
important for the efficiency of this estimator. It is recommended to use
`cross-validation <https://scikit-learn.org/stable/modules/cross_validation.html>`__
to fix these hyper-parameters. Choosing eta0 too large can make the estimator
non-robust. One should also take care to rescale the data (just as it is
important for SGD). In the context of a corrupted dataset, please use the
`RobustScaler <https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html>`__.

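A possible way to combine these recommendations (a sketch; the parameter grid
and pipeline step names are illustrative, not prescriptions):

.. code-block:: python

    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import RobustScaler
    from sklearn_extra.robust import RobustWeightedRegressor

    # Rescale robustly, then fit the robust estimator; tune the robustness
    # parameter and max_iter by cross-validation.
    pipe = make_pipeline(RobustScaler(),
                         RobustWeightedRegressor(weighting="huber"))
    param_grid = {
        "robustweightedregressor__c": [0.5, 1.0, 2.0],
        "robustweightedregressor__max_iter": [100, 300],
    }
    search = GridSearchCV(pipe, param_grid, cv=5)
    # search.fit(X, y)  # X, y: your (possibly corrupted) training data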

This algorithm has been studied in the context of "mom" weights in the
article [1]_, while the context of "huber" weights has been mentioned in [2]_.
Both weighting schemes can be seen as special cases of the algorithm in [3]_.

Comparison with other robust estimators
---------------------------------------

There are already some robust algorithms in scikit-learn, but one major
difference is that they are primarily meant for regression; see
`robustness in regression <https://scikit-learn.org/stable/modules/linear_model.html#robustness-regression-outliers-and-modeling-errors>`__.
Hence, we do not discuss classification algorithms in this comparison.

As such, we only compare ourselves to TheilSenRegressor and RANSACRegressor,
as they both deal with outliers in X and in Y and are the closest to
RobustWeightedEstimator.

**Warning:** The Huber weights used in our algorithm should not be confused
with HuberRegressor or other regressions with "robust losses". Those types of
regressions are robust only to outliers in the label Y, but not in X.

Pro: RANSACRegressor and TheilSenRegressor both use a hard rejection of
outliers. This can be interpreted as an outlier detection step followed by a
regression step, whereas RobustWeightedEstimator is directly robust to
outliers. This often increases the performance on moderately corrupted
datasets.

Con: In general, this algorithm is slower than both TheilSenRegressor and
RANSACRegressor.

One other advantage of RobustWeightedEstimator is that it can be used for a
broad range of algorithms. For example, one can do robust unsupervised
learning with RobustWeightedEstimator; see the example using the KMeans
algorithm.

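For instance, a minimal sketch of robust clustering with the
RobustWeightedKMeans estimator from this module (the number of clusters and
the parameter values are illustrative assumptions):

.. code-block:: python

    from sklearn_extra.robust import RobustWeightedKMeans

    km = RobustWeightedKMeans(n_clusters=3, weighting="mom", k=10, max_iter=100)
    # km.fit(X)  # X: data that possibly contains outlying points
    # The fitted centers are less attracted by outliers than vanilla KMeans.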

Speed and limits of the algorithm
---------------------------------

Most of the time, robust statistics are only worth using when there are
outliers; note that many datasets have already been "cleaned" of outliers, in
which case this algorithm is no better than base_estimator.

In high dimension, the algorithm is expected to be as good (or as bad) as
base_estimator is in high dimension.

Complexity and limitations:

* weighting="huber": the complexity is larger than that of base_estimator,
  but it is still of the same order of magnitude.
* weighting="mom": the larger k is, the faster the algorithm will perform when
  sample_size is large. This weighting scheme is advised only with
  sufficiently large datasets (rule of thumb: sample_size > 500; the specifics
  depend on the dataset).

**Warning:** On a real dataset, one should be aware that there can be outliers
not only in the training set but also in the test set when the loss is not
bounded. See the example on the real California housing dataset for further
discussion.

.. topic:: References:

   .. [1] Guillaume Lecué, Matthieu Lerasle and Timothée Mathieu.
      `"Robust classification via MOM minimization" <https://doi.org/10.1007/s10994-019-05863-6>`_,
      Machine Learning Journal (2020).

   .. [2] Christian Brownlees, Emilien Joly and Gábor Lugosi.
      `"Empirical risk minimization for heavy-tailed losses" <https://projecteuclid.org/euclid.aos/1444222083>`_,
      Ann. Statist. Volume 43, Number 6 (2015), 2507-2536.

   .. [3] Stanislav Minsker and Timothée Mathieu.
      `"Excess risk bounds in robust empirical risk minimization" <https://arxiv.org/abs/1910.07485>`_,
      arXiv preprint (2019). arXiv:1910.07485.