Merge pull request #73 from dvro/renn

glemaitre · web-flow · commit e9b5a812a8fc · 2016-06-24T18:45:17.000+02:00
RENN - Repeated Edited Nearest Neighbors undersampling method
diff --git a/examples/under-sampling/plot_repeated_edited_nearest_neighbours.py b/examples/under-sampling/plot_repeated_edited_nearest_neighbours.py
@@ -0,0 +1,74 @@
+"""
+==================================
+Repeated Edited nearest-neighbours
+==================================
+
+An illustration of the repeated edited nearest-neighbours method.
+
+"""
+
+print(__doc__)
+
+import matplotlib.pyplot as plt
+import seaborn as sns
+sns.set()
+
+# Define some color for the plotting
+almost_black = '#262626'
+palette = sns.color_palette()
+
+from sklearn.datasets import make_classification
+from sklearn.decomposition import PCA
+
+from unbalanced_dataset.under_sampling import EditedNearestNeighbours, \
+    RepeatedEditedNearestNeighbours
+
+# Generate the dataset
+X, y = make_classification(n_classes=2, class_sep=1.25, weights=[0.3, 0.7],
+                           n_informative=3, n_redundant=1, flip_y=0,
+                           n_features=5, n_clusters_per_class=1,
+                           n_samples=5000, random_state=10)
+
+# Instantiate a PCA object for the sake of easy visualisation
+pca = PCA(n_components=2)
+# Fit and transform x to visualise inside a 2D feature space
+X_vis = pca.fit_transform(X)
+
+# Three subplots, unpack the axes array immediately
+f, (ax1, ax2, ax3) = plt.subplots(1, 3)
+
+ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=.5,
+            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
+ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=.5,
+            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
+ax1.set_title('Original set')
+
+# Apply the ENN
+print('ENN')
+enn = EditedNearestNeighbours()
+X_resampled, y_resampled = enn.fit_transform(X, y)
+X_res_vis = pca.transform(X_resampled)
+
+ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
+            label="Class #0", alpha=.5, edgecolor=almost_black,
+            facecolor=palette[0], linewidth=0.15)
+ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
+            label="Class #1", alpha=.5, edgecolor=almost_black,
+            facecolor=palette[2], linewidth=0.15)
+ax2.set_title('Edited nearest neighbours')
+
+# Apply the RENN
+print('RENN')
+renn = RepeatedEditedNearestNeighbours()
+X_resampled, y_resampled = renn.fit_transform(X, y)
+X_res_vis = pca.transform(X_resampled)
+
+ax3.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
+            label="Class #0", alpha=.5, edgecolor=almost_black,
+            facecolor=palette[0], linewidth=0.15)
+ax3.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
+            label="Class #1", alpha=.5, edgecolor=almost_black,
+            facecolor=palette[2], linewidth=0.15)
+ax3.set_title('Repeated Edited nearest neighbours')
+
+plt.show()
diff --git a/unbalanced_dataset/under_sampling/__init__.py b/unbalanced_dataset/under_sampling/__init__.py
@@ -12,6 +12,7 @@
 from .one_sided_selection import OneSidedSelection
 from .neighbourhood_cleaning_rule import NeighbourhoodCleaningRule
 from .edited_nearest_neighbours import EditedNearestNeighbours
+from .edited_nearest_neighbours import RepeatedEditedNearestNeighbours
 from .instance_hardness_threshold import InstanceHardnessThreshold
 
 __all__ = ['UnderSampler',
@@ -23,4 +24,5 @@
            'OneSidedSelection',
            'NeighbourhoodCleaningRule',
            'EditedNearestNeighbours',
+           'RepeatedEditedNearestNeighbours',
            'InstanceHardnessThreshold']
diff --git a/unbalanced_dataset/under_sampling/edited_nearest_neighbours.py b/unbalanced_dataset/under_sampling/edited_nearest_neighbours.py
@@ -251,3 +251,219 @@ def transform(self, X, y):
             return X_resampled, y_resampled, idx_under
         else:
             return X_resampled, y_resampled
+
+
+class RepeatedEditedNearestNeighbours(UnderSampler):
+    """Class to perform under-sampling based on the repeated edited nearest
+    neighbour method.
+
+    Parameters
+    ----------
+    return_indices : bool, optional (default=False)
+        Either to return or not the indices which will be selected from
+        the majority class.
+
+    random_state : int or None, optional (default=None)
+        Seed for random number generation.
+
+    verbose : bool, optional (default=True)
+        Whether or not to print information about the processing.
+
+    size_ngh : int, optional (default=3)
+        Size of the neighbourhood to consider to compute the average
+        distance to the minority point samples.
+
+    kind_sel : str, optional (default='all')
+        Strategy to use in order to exclude samples.
+
+        - If 'all', all neighbours will have to agree with the samples of
+        interest to not be excluded.
+        - If 'mode', the majority vote of the neighbours will be used in
+        order to exclude a sample.
+
+    n_jobs : int, optional (default=-1)
+        The number of threads to open when it is possible.
+
+    Attributes
+    ----------
+    ratio_ : str or float, optional (default='auto')
+        If 'auto', the ratio will be defined automatically to balance
+        the dataset. Otherwise, the ratio will correspond to the number
+        of samples in the minority class over the number of samples
+        in the majority class.
+
+    rs_ : int or None, optional (default=None)
+        Seed for random number generation.
+
+    min_c_ : str or int
+        The identifier of the minority class.
+
+    max_c_ : str or int
+        The identifier of the majority class.
+
+    stats_c_ : dict of str/int : int
+        A dictionary in which the number of occurrences of each class is
+        reported.
+
+    max_iter : int, optional (default=100)
+        Maximum number of iterations of the edited nearest neighbours
+        algorithm for a single run.
+
+    Notes
+    -----
+    The method is based on [1]_.
+
+    This class supports multi-class.
+
+    References
+    ----------
+    .. [1] I. Tomek, “An Experiment with the Edited Nearest-Neighbor
+       Rule,” IEEE Trans. Systems, Man, and Cybernetics, vol. 6, no. 6,
+       pp. 448-452, June 1976.
+
+    """
+
+    def __init__(self, return_indices=False, random_state=None, verbose=True,
+                 size_ngh=3, max_iter=100, kind_sel='all', n_jobs=-1):
+        """Initialisation of RENN object.
+
+        Parameters
+        ----------
+        return_indices : bool, optional (default=False)
+            Either to return or not the indices which will be selected from
+            the majority class.
+
+        random_state : int or None, optional (default=None)
+            Seed for random number generation.
+
+        verbose : bool, optional (default=True)
+            Whether or not to print information about the processing.
+
+        size_ngh : int, optional (default=3)
+            Size of the neighbourhood to consider to compute the average
+            distance to the minority point samples.
+
+        max_iter : int, optional (default=100)
+            Maximum number of iterations of the edited nearest neighbours
+            algorithm for a single run.
+
+        kind_sel : str, optional (default='all')
+            Strategy to use in order to exclude samples.
+
+            - If 'all', all neighbours will have to agree with the samples of
+            interest to not be excluded.
+            - If 'mode', the majority vote of the neighbours will be used in
+            order to exclude a sample.
+
+        n_jobs : int, optional (default=-1)
+            The number of threads to open when it is possible.
+
+        Returns
+        -------
+        None
+
+        """
+        super(RepeatedEditedNearestNeighbours, self).__init__(
+            return_indices=return_indices,
+            random_state=random_state,
+            verbose=verbose)
+
+        self.size_ngh = size_ngh
+        possible_kind_sel = ('all', 'mode')
+        if kind_sel not in possible_kind_sel:
+            raise NotImplementedError
+        else:
+            self.kind_sel = kind_sel
+        self.n_jobs = n_jobs
+
+        if max_iter < 2:
+            raise ValueError('max_iter must be greater than 1.')
+        else:
+            self.max_iter = max_iter
+
+        self.enn_ = EditedNearestNeighbours(
+            return_indices=return_indices,
+            random_state=random_state, verbose=False,
+            size_ngh=size_ngh, kind_sel=kind_sel,
+            n_jobs=n_jobs)
+
+    def fit(self, X, y):
+        """Find the classes statistics before performing sampling.
+
+        Parameters
+        ----------
+        X : ndarray, shape (n_samples, n_features)
+            Matrix containing the data which have to be sampled.
+
+        y : ndarray, shape (n_samples, )
+            Corresponding label for each sample in X.
+
+        Returns
+        -------
+        self : object,
+            Return self.
+
+        """
+        # Check the consistency of X and y
+        X, y = check_X_y(X, y)
+
+        super(RepeatedEditedNearestNeighbours, self).fit(X, y)
+        self.enn_.fit(X, y)
+
+        return self
+
+    def transform(self, X, y):
+        """Resample the dataset.
+
+        Parameters
+        ----------
+        X : ndarray, shape (n_samples, n_features)
+            Matrix containing the data which have to be sampled.
+
+        y : ndarray, shape (n_samples, )
+            Corresponding label for each sample in X.
+
+        Returns
+        -------
+        X_resampled : ndarray, shape (n_samples_new, n_features)
+            The array containing the resampled data.
+
+        y_resampled : ndarray, shape (n_samples_new, )
+            The corresponding label of `X_resampled`
+
+        idx_under : ndarray, shape (n_samples_new, )
+            If `return_indices` is `True`, an array will be returned
+            containing the indices of the samples which have been selected.
+
+        """
+        # Check the consistency of X and y
+        X, y = check_X_y(X, y)
+        X_, y_ = X.copy(), y.copy()
+
+        if self.return_indices:
+            idx_under = np.arange(X.shape[0], dtype=int)
+
+        prev_len = y.shape[0]
+
+        for n_iter in range(self.max_iter):
+            prev_len = y_.shape[0]
+            if self.return_indices:
+                X_, y_, idx_ = self.enn_.transform(X_, y_)
+                idx_under = idx_under[idx_]
+            else:
+                X_, y_ = self.enn_.transform(X_, y_)
+
+            if prev_len == y_.shape[0]:
+                break
+
+        if self.verbose:
+            print("Under-sampling performed: {}".format(Counter(y_)))
+
+        X_resampled, y_resampled = X_, y_
+
+        # Check if the indices of the samples selected should be returned too
+        if self.return_indices:
+            # Return the indices of interest
+            return X_resampled, y_resampled, idx_under
+        else:
+            return X_resampled, y_resampled