Commit bdd117d

DOC advance a bit the documentation
1 parent 5d8e920 commit bdd117d

5 files changed: +174 -5 lines changed

doc/miscellaneous.rst

Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
.. _miscellaneous:

======================
Miscellaneous samplers
======================

.. currentmodule:: imblearn.misc

.. _function_sampler:

Custom samplers
---------------

A fully customizable sampler, :class:`FunctionSampler`, is available in
imbalanced-learn so that you can quickly prototype your own sampler by
defining a single function. Additional parameters can be added using the
``kw_args`` attribute, which accepts a dictionary. The following example
illustrates how to retain the first 10 elements of the arrays ``X`` and
``y``::
19+
20+
>>> from sklearn.datasets import make_classification
21+
>>> X, y = make_classification(n_samples=5000, n_features=2, n_informative=2,
22+
... n_redundant=0, n_repeated=0, n_classes=3,
23+
... n_clusters_per_class=1,
24+
... weights=[0.01, 0.05, 0.94],
25+
... class_sep=0.8, random_state=0)
26+
>>> def func(X, y):
27+
... return X[:10], y[10:]
28+
>>> sampler = FunctionSampler(func=func)
29+
>>> X_res, y_res = sampler.fit_sample(X, y)
30+
>>> (X_res == X[:10]).all()
31+
True
32+
>>> (y_res == y_res[:10]).all()
33+
True
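
Extra parameters declared in ``kw_args`` are forwarded to ``func`` at sampling
time. As a sketch, the function below delegates to
:class:`imblearn.under_sampling.RandomUnderSampler`, mirroring the example
given in the :class:`FunctionSampler` docstring::

  >>> from imblearn.under_sampling import RandomUnderSampler
  >>> def func(X, y, ratio, random_state):
  ...     return RandomUnderSampler(ratio=ratio,
  ...                               random_state=random_state).fit_sample(X, y)
  >>> sampler = FunctionSampler(func=func,
  ...                           kw_args={'ratio': 'auto', 'random_state': 0})
  >>> X_res, y_res = sampler.fit_sample(X, y)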

We illustrate the use of such a sampler to implement an outlier rejection
estimator which can be easily used within a
:class:`imblearn.pipeline.Pipeline`:
:ref:`sphx_glr_auto_examples_plot_outlier_rejections.py`
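
A minimal sketch of wiring such a sampler into a pipeline, assuming a
hypothetical ``outlier_rejection(X, y)`` function that returns only the
inlier samples (the full version is in the example linked above)::

  >>> from imblearn.pipeline import make_pipeline
  >>> from sklearn.linear_model import LogisticRegression
  >>> pipe = make_pipeline(FunctionSampler(func=outlier_rejection),
  ...                      LogisticRegression())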

doc/user_guide.rst

Lines changed: 1 addition & 0 deletions
@@ -14,6 +14,7 @@ User Guide
    under_sampling.rst
    combine.rst
    ensemble.rst
+   miscellaneous.rst
    metrics.rst
    Dataset loading utilities <datasets/index.rst>
    developers_utils.rst

examples/plot_outlier_rejections.py

Lines changed: 88 additions & 0 deletions
@@ -0,0 +1,88 @@
"""
==============================================================
Customized sampler to implement an outlier rejection estimator
==============================================================

This example illustrates the use of a custom sampler to implement an outlier
rejection estimator. It can easily be used within a pipeline in which the
number of samples can vary during training, which usually is a limitation of
the current scikit-learn pipeline.

"""

# Authors: Guillaume Lemaitre <[email protected]>
# License: MIT

import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import make_moons, make_blobs
from sklearn.ensemble import IsolationForest
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

from imblearn.misc import FunctionSampler
from imblearn.pipeline import make_pipeline

print(__doc__)

rng = np.random.RandomState(42)


def plot_scatter(X, y, title):
    """Scatter plot of the samples of each class."""
    plt.figure()
    plt.scatter(X[y == 1, 0], X[y == 1, 1], label='Class #1')
    plt.scatter(X[y == 0, 0], X[y == 0, 1], label='Class #0')
    plt.legend()
    plt.title(title)


# Generate contaminated training data
moons, _ = make_moons(n_samples=500, noise=0.05)
blobs, _ = make_blobs(n_samples=500, centers=[(-0.75, 2.25),
                                              (1.0, 2.0)],
                      cluster_std=0.25)
outliers = rng.uniform(low=-3, high=3, size=(500, 2))
X_train = np.vstack([moons, blobs, outliers])
y_train = np.hstack([np.ones(moons.shape[0], dtype=np.int8),
                     np.zeros(blobs.shape[0], dtype=np.int8),
                     rng.randint(0, 2, size=outliers.shape[0],
                                 dtype=np.int8)])

plot_scatter(X_train, y_train, 'Training dataset')

# Generate non-contaminated testing data
moons, _ = make_moons(n_samples=50, noise=0.05)
blobs, _ = make_blobs(n_samples=50, centers=[(-0.75, 2.25),
                                             (1.0, 2.0)],
                      cluster_std=0.25)
X_test = np.vstack([moons, blobs])
y_test = np.hstack([np.ones(moons.shape[0], dtype=np.int8),
                    np.zeros(blobs.shape[0], dtype=np.int8)])

plot_scatter(X_test, y_test, 'Testing dataset')


def outlier_rejection(X, y):
    """Detect outliers with an IsolationForest and keep only the inliers."""
    model = IsolationForest(max_samples=100,
                            contamination=0.4,
                            random_state=rng)
    model.fit(X)
    y_pred = model.predict(X)
    # IsolationForest predicts +1 for inliers and -1 for outliers
    return X[y_pred == 1], y[y_pred == 1]


reject_sampler = FunctionSampler(func=outlier_rejection)
X_inliers, y_inliers = reject_sampler.fit_sample(X_train, y_train)
plot_scatter(X_inliers, y_inliers, 'Training data without outliers')

pipe = make_pipeline(FunctionSampler(func=outlier_rejection),
                     LogisticRegression(random_state=rng))
y_pred = pipe.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred))
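
# Note: the sampling step only runs during ``fit``; at prediction time the
# pipeline passes ``X_test`` through unchanged, so both classifiers below are
# evaluated on exactly the same test samples.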

# For comparison, train a classifier on the contaminated data directly
clf = LogisticRegression(random_state=rng)
y_pred = clf.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred))

plt.show()

imblearn/misc.py

Lines changed: 45 additions & 3 deletions
@@ -28,14 +28,56 @@ class FunctionSampler(SamplerMixin):
         same arguments as transform, with args and kwargs forwarded. If func is
         None, then func will be the identity function.
 
+    accept_sparse : bool, optional (default=True)
+        Whether sparse inputs are supported. By default, sparse inputs are
+        supported.
+
+    kw_args : dict, optional (default=None)
+        The keyword arguments expected by ``func``.
+
+    Notes
+    -----
+    See
+    :ref:`sphx_glr_auto_examples_plot_outlier_rejections.py`.
+
+    Examples
+    --------
+    >>> from sklearn.datasets import fetch_mldata
+    >>> from imblearn.misc import FunctionSampler
+    >>> pima = fetch_mldata('diabetes_scale')
+    >>> X, y = pima['data'], pima['target']
+
+    For instance, we can create a function to select only the first ten
+    samples.
+
+    >>> def func(X, y):
+    ...     return X[:10], y[:10]
+    >>> sampler = FunctionSampler(func=func)
+    >>> X_res, y_res = sampler.fit_sample(X, y)
+    >>> (X_res == X[:10]).all()
+    True
+    >>> (y_res == y[:10]).all()
+    True
+
+    We can also create a specific function which takes some arguments.
+
+    >>> from collections import Counter
+    >>> from imblearn.under_sampling import RandomUnderSampler
+    >>> def func(X, y, ratio, random_state):
+    ...     return RandomUnderSampler(ratio=ratio,
+    ...                               random_state=random_state).fit_sample(X, y)
+    >>> sampler = FunctionSampler(func=func,
+    ...                           kw_args={'ratio': 'auto', 'random_state': 0})
+    >>> X_res, y_res = sampler.fit_sample(X, y)
+    >>> print('Resampled dataset shape {}'.format(Counter(y_res)))
+    Resampled dataset shape Counter({-1: 268, 1: 268})
+
     """
 
-    def __init__(self, func=None, accept_sparse=True, kw_args=None,
-                 random_state=None):
+    def __init__(self, func=None, accept_sparse=True, kw_args=None):
         self.func = func
         self.accept_sparse = accept_sparse
         self.kw_args = kw_args
-        self.random_state = random_state
         self.logger = logging.getLogger(__name__)
 
     def _check_X_y(self, X, y):

imblearn/utils/estimator_checks.py

Lines changed: 2 additions & 2 deletions
@@ -76,8 +76,8 @@ def check_estimator(Estimator):
     sklearn_check_estimator(Estimator)
     check_parameters_default_constructible(name, Estimator)
     for check in _yield_all_checks(name, Estimator):
-        if name not in NOT_TESTED_SAMPLERS:
-            check(name, Estimator)
+        # if name not in NOT_TESTED_SAMPLERS:
+        check(name, Estimator)
 
 
 def check_target_type(name, Estimator):
