Commit bdd117d

DOC advance a bit the documentation
1 parent 5d8e920 commit bdd117d

5 files changed: +174 -5 lines changed

doc/miscellaneous.rst

Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
.. _miscellaneous:

======================
Miscellaneous samplers
======================

.. currentmodule:: imblearn.misc

.. _function_sampler:

Custom samplers
---------------

A fully customizable sampler, :class:`FunctionSampler`, is available in
imbalanced-learn so that you can quickly prototype your own sampler by
defining a single function. Additional parameters can be added using the
``kw_args`` attribute, which accepts a dictionary. The following example
illustrates how to retain the first 10 elements of the arrays ``X`` and
``y``::
19+
20+
>>> from sklearn.datasets import make_classification
21+
>>> X, y = make_classification(n_samples=5000, n_features=2, n_informative=2,
22+
... n_redundant=0, n_repeated=0, n_classes=3,
23+
... n_clusters_per_class=1,
24+
... weights=[0.01, 0.05, 0.94],
25+
... class_sep=0.8, random_state=0)
26+
>>> def func(X, y):
27+
... return X[:10], y[10:]
28+
>>> sampler = FunctionSampler(func=func)
29+
>>> X_res, y_res = sampler.fit_sample(X, y)
30+
>>> (X_res == X[:10]).all()
31+
True
32+
>>> (y_res == y_res[:10]).all()
33+
True
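
Extra parameters declared in ``kw_args`` are forwarded to ``func`` at sampling
time. As a sketch, the function below delegates to
:class:`imblearn.under_sampling.RandomUnderSampler`, mirroring the example
given in the :class:`FunctionSampler` docstring::

  >>> from imblearn.under_sampling import RandomUnderSampler
  >>> def func(X, y, ratio, random_state):
  ...     return RandomUnderSampler(ratio=ratio,
  ...                               random_state=random_state).fit_sample(X, y)
  >>> sampler = FunctionSampler(func=func,
  ...                           kw_args={'ratio': 'auto', 'random_state': 0})
  >>> X_res, y_res = sampler.fit_sample(X, y)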

We illustrate the use of such a sampler to implement an outlier rejection
estimator which can be easily used within a
:class:`imblearn.pipeline.Pipeline`:
:ref:`sphx_glr_auto_examples_plot_outlier_rejections.py`
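
A minimal sketch of wiring such a sampler into a pipeline, assuming a
hypothetical ``outlier_rejection(X, y)`` function that returns only the
inlier samples (the full version is in the example linked above)::

  >>> from imblearn.pipeline import make_pipeline
  >>> from sklearn.linear_model import LogisticRegression
  >>> pipe = make_pipeline(FunctionSampler(func=outlier_rejection),
  ...                      LogisticRegression())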

doc/user_guide.rst

Lines changed: 1 addition & 0 deletions
@@ -14,6 +14,7 @@ User Guide
    under_sampling.rst
    combine.rst
    ensemble.rst
+   miscellaneous.rst
    metrics.rst
    Dataset loading utilities <datasets/index.rst>
    developers_utils.rst

examples/plot_outlier_rejections.py

Lines changed: 88 additions & 0 deletions
@@ -0,0 +1,88 @@
"""
==============================================================
Customized sampler to implement an outlier rejection estimator
==============================================================

This example illustrates the use of a custom sampler to implement an outlier
rejection estimator. It can easily be used within a pipeline in which the
number of samples can vary during training, which usually is a limitation of
the current scikit-learn pipeline.

"""

# Authors: Guillaume Lemaitre <[email protected]>
# License: MIT

import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import make_moons, make_blobs
from sklearn.ensemble import IsolationForest
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

from imblearn.misc import FunctionSampler
from imblearn.pipeline import make_pipeline

print(__doc__)

rng = np.random.RandomState(42)


def plot_scatter(X, y, title):
    """Scatter plot of the samples of each class."""
    plt.figure()
    plt.scatter(X[y == 1, 0], X[y == 1, 1], label='Class #1')
    plt.scatter(X[y == 0, 0], X[y == 0, 1], label='Class #0')
    plt.legend()
    plt.title(title)


# Generate contaminated training data
moons, _ = make_moons(n_samples=500, noise=0.05)
blobs, _ = make_blobs(n_samples=500, centers=[(-0.75, 2.25),
                                              (1.0, 2.0)],
                      cluster_std=0.25)
outliers = rng.uniform(low=-3, high=3, size=(500, 2))
X_train = np.vstack([moons, blobs, outliers])
y_train = np.hstack([np.ones(moons.shape[0], dtype=np.int8),
                     np.zeros(blobs.shape[0], dtype=np.int8),
                     rng.randint(0, 2, size=outliers.shape[0],
                                 dtype=np.int8)])

plot_scatter(X_train, y_train, 'Training dataset')

# Generate non-contaminated testing data
moons, _ = make_moons(n_samples=50, noise=0.05)
blobs, _ = make_blobs(n_samples=50, centers=[(-0.75, 2.25),
                                             (1.0, 2.0)],
                      cluster_std=0.25)
X_test = np.vstack([moons, blobs])
y_test = np.hstack([np.ones(moons.shape[0], dtype=np.int8),
                    np.zeros(blobs.shape[0], dtype=np.int8)])

plot_scatter(X_test, y_test, 'Testing dataset')


def outlier_rejection(X, y):
    """Detect outliers with an IsolationForest and keep only the inliers."""
    model = IsolationForest(max_samples=100,
                            contamination=0.4,
                            random_state=rng)
    model.fit(X)
    y_pred = model.predict(X)
    # IsolationForest predicts +1 for inliers and -1 for outliers
    return X[y_pred == 1], y[y_pred == 1]


reject_sampler = FunctionSampler(func=outlier_rejection)
X_inliers, y_inliers = reject_sampler.fit_sample(X_train, y_train)
plot_scatter(X_inliers, y_inliers, 'Training data without outliers')

pipe = make_pipeline(FunctionSampler(func=outlier_rejection),
                     LogisticRegression(random_state=rng))
y_pred = pipe.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred))
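
# Note: the sampling step only runs during ``fit``; at prediction time the
# pipeline passes ``X_test`` through unchanged, so both classifiers below are
# evaluated on exactly the same test samples.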

# For comparison, train a classifier on the contaminated data directly
clf = LogisticRegression(random_state=rng)
y_pred = clf.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred))

plt.show()

imblearn/misc.py

Lines changed: 45 additions & 3 deletions
@@ -28,14 +28,56 @@ class FunctionSampler(SamplerMixin):
         same arguments as transform, with args and kwargs forwarded. If func is
         None, then func will be the identity function.
 
+    accept_sparse : bool, optional (default=True)
+        Whether sparse inputs are supported. By default, sparse inputs are
+        supported.
+
+    kw_args : dict, optional (default=None)
+        The keyword arguments expected by ``func``.
+
+    Notes
+    -----
+    See
+    :ref:`sphx_glr_auto_examples_plot_outlier_rejections.py`.
+
+    Examples
+    --------
+    >>> from sklearn.datasets import fetch_mldata
+    >>> from imblearn.misc import FunctionSampler
+    >>> pima = fetch_mldata('diabetes_scale')
+    >>> X, y = pima['data'], pima['target']
+
+    For instance, we can create a function to select only the first ten
+    samples.
+
+    >>> def func(X, y):
+    ...     return X[:10], y[:10]
+    >>> sampler = FunctionSampler(func=func)
+    >>> X_res, y_res = sampler.fit_sample(X, y)
+    >>> (X_res == X[:10]).all()
+    True
+    >>> (y_res == y[:10]).all()
+    True
+
+    We can also create a specific function which takes some arguments.
+
+    >>> from collections import Counter
+    >>> from imblearn.under_sampling import RandomUnderSampler
+    >>> def func(X, y, ratio, random_state):
+    ...     return RandomUnderSampler(ratio=ratio,
+    ...                               random_state=random_state).fit_sample(X, y)
+    >>> sampler = FunctionSampler(func=func,
+    ...                           kw_args={'ratio': 'auto', 'random_state': 0})
+    >>> X_res, y_res = sampler.fit_sample(X, y)
+    >>> print('Resampled dataset shape {}'.format(Counter(y_res)))
+    Resampled dataset shape Counter({-1: 268, 1: 268})
+
     """
 
-    def __init__(self, func=None, accept_sparse=True, kw_args=None,
-                 random_state=None):
+    def __init__(self, func=None, accept_sparse=True, kw_args=None):
         self.func = func
         self.accept_sparse = accept_sparse
         self.kw_args = kw_args
-        self.random_state = random_state
         self.logger = logging.getLogger(__name__)
 
     def _check_X_y(self, X, y):

imblearn/utils/estimator_checks.py

Lines changed: 2 additions & 2 deletions
@@ -76,8 +76,8 @@ def check_estimator(Estimator):
     sklearn_check_estimator(Estimator)
     check_parameters_default_constructible(name, Estimator)
     for check in _yield_all_checks(name, Estimator):
-        if name not in NOT_TESTED_SAMPLERS:
-            check(name, Estimator)
+        # if name not in NOT_TESTED_SAMPLERS:
+        check(name, Estimator)
 
 
 def check_target_type(name, Estimator):
