Added references to Instance Selection Algorithms #153

dpuenteramirez · dpuenteramirez · commit 94f740e5c04c · 2022-03-31T11:59:00.000+02:00
diff --git a/instance_selection/_CNN.py b/instance_selection/_CNN.py
@@ -18,6 +18,9 @@ def __init__(self):
 
     def filter(self, samples, y):
         """
+        Hart, P. (1968). The condensed nearest neighbor rule (corresp.). IEEE
+            transactions on information theory, 14(3), 515-516.
+
         Implementation of The Condensed Nearest Neighbor Rule
 
         The first sample of each class is placed in *store*. Thus we only have
diff --git a/instance_selection/_DROP3.py b/instance_selection/_DROP3.py
@@ -57,6 +57,10 @@ def __init__(self, nearest_neighbors=3, power_parameter=2):
 
     def filter(self, samples, y):
         """
+        Wilson, D. R., & Martinez, T. R. (2000). Reduction techniques for
+            instance-based learning algorithms. Machine learning, 38(3),
+            257-286.
+
         Implementation of DROP3.
 
         The Decremental Reduction Optimization Procedure (DROP) algorithms base
diff --git a/instance_selection/_ENN.py b/instance_selection/_ENN.py
@@ -3,13 +3,13 @@
 # @Filename:    ENN.py
 # @Author:      Daniel Puente Ramírez
 # @Time:        16/11/21 17:14
-# @Version:     4.0
+# @Version:     5.0
 
 import numpy as np
 import pandas as pd
 from sklearn.neighbors import NearestNeighbors
 
-from .utils import transform
+from .utils import transform, transform_original_complete
 
 
 class ENN:
@@ -20,6 +20,10 @@ def __init__(self, nearest_neighbors=3, power_parameter=2):
 
     def filter(self, samples, y):
         """
+        Wilson, D. L. (1972). Asymptotic properties of nearest neighbor rules
+            using edited data. IEEE Transactions on Systems, Man, and
+            Cybernetics, (3), 408-421.
+
         Implementation of the Wilson Editing algorithm.
 
         For each sample locates the *k* nearest neighbors and selects the
@@ -62,3 +66,58 @@ def filter(self, samples, y):
         y = pd.DataFrame(s_targets)
 
         return samples, y
+
+    def filter_original_complete(self, original, original_y, complete,
+                                 complete_y):
+        """
+        Modification of the Wilson Editing algorithm.
+
+        Every sample of *complete* is classified by a k-NN vote over the
+        other samples still kept. A sample whose label differs from the
+        vote is removed, unless it is equal to a sample of *original*:
+        those are never discarded.
+        If fewer than *k* other samples remain, all of them vote.
+
+        :param original: DataFrame: dataset with the initial samples.
+        :param original_y: DataFrame: labels of *original*.
+        :param complete: DataFrame: dataset with the initial samples and the
+        new ones added by self-training.
+        :param complete_y: labels of *complete*.
+        :return: DataFrames (samples, y) with the kept samples and labels.
+        """
+        self.x_attr = original.keys()
+        original, complete = transform_original_complete(original, original_y,
+                                                         complete, complete_y)
+        s_samples = list(complete['data'])
+        s_targets = list(complete['target'])
+        o_samples = list(original['data'])
+        pos = 0  # cursor in the shrinking lists; it stays put after a removal
+
+        for _ in range(len(s_samples)):
+            x_sample, x_target = s_samples[pos], s_targets[pos]
+            samples_not_x = s_samples[:pos] + s_samples[pos + 1:]
+            targets_not_x = s_targets[:pos] + s_targets[pos + 1:]
+            if not samples_not_x:
+                break  # a lone sample has no neighbors to vote, keep it
+            # kneighbors() rejects k larger than the fitted set: clamp it.
+            k = min(self.nearest_neighbors, len(samples_not_x))
+            # NOTE(review): p is fixed to 2; confirm whether the constructor's
+            # power_parameter was meant to be used here instead.
+            knn = NearestNeighbors(n_jobs=-1, n_neighbors=k, p=2)
+            knn.fit(samples_not_x)
+            _, neigh_ind = knn.kneighbors([x_sample])
+            votes = np.bincount([targets_not_x[i] for i in neigh_ind[0]])
+            # argmax yields the first maximum, so ties go to the lowest label.
+            misclassified = np.argmax(votes) != x_target
+            # Samples also present in *original* are protected from removal.
+            if misclassified and not any(
+                    np.array_equal(o_sample, x_sample)
+                    for o_sample in o_samples):
+                s_samples, s_targets = samples_not_x, targets_not_x
+            else:
+                pos += 1
+
+        samples = pd.DataFrame(s_samples, columns=self.x_attr)
+        y = pd.DataFrame(s_targets)
+
+        return samples, y
diff --git a/instance_selection/_ENN_self_training.py b/instance_selection/_ENN_self_training.py
diff --git a/instance_selection/_ICF.py b/instance_selection/_ICF.py
@@ -80,7 +80,11 @@ def __init__(self, nearest_neighbors=3, power_parameter=2):
 
     def filter(self, samples, y):
         """
-        Implementation of Iterative Case Filtering
+        Brighton, H., & Mellish, C. (2002). Advances in instance selection for
+            instance-based learning algorithms. Data mining and knowledge
+            discovery, 6(2), 153-172.
+
+        Implementation of Iterative Case Filtering.
 
         ICF is based on coverage and reachable, due to this two concepts it
         performs deletion of samples based on the rule: "If the reachability
diff --git a/instance_selection/_LocalSets.py b/instance_selection/_LocalSets.py
@@ -11,6 +11,12 @@
 
 
 class LocalSets:
+    """
+    Leyva, E., González, A., & Pérez, R. (2015). Three new instance selection
+        methods based on local sets: A comparative study with several approaches
+        from a bi-objective perspective. Pattern Recognition, 48(4), 1523-1537.
+    """
+
     def __init__(self):
         self.local_sets = None
         self.n_id = 0
@@ -67,6 +73,8 @@ def __init__(self):
     def filter(self, instances, labels):
         names = instances.keys()
         instances = instances.to_numpy()
+        import numpy as np
+        instances = [np.ravel(i) for i in instances]
         if len(instances) != len(labels):
             raise ValueError(
                 f'The dimension of the labeled data must be the same as the '
@@ -88,7 +96,7 @@ def filter(self, instances, labels):
                 s_samples.append(instances[index])
                 s_labels.append(labels[index])
 
-        x = pd.DataFrame(s_samples, columns=names)
+        x = pd.DataFrame(s_samples)
         y = pd.DataFrame(s_labels)
         return x, y
 
diff --git a/instance_selection/_MSS.py b/instance_selection/_MSS.py
@@ -45,7 +45,12 @@ def __init__(self):
 
     def filter(self, samples, y):
         """
-        Implementation of Modified Selective Subset
+        Barandela, R., Ferri, F. J., & Sánchez, J. S. (2005). Decision boundary
+            preserving prototype selection for nearest neighbor classification.
+            International Journal of Pattern Recognition and Artificial
+            Intelligence, 19(06), 787-806.
+
+        Implementation of Modified Selective Subset.
 
         It starts with two empty arrays *dat* and *tar*, which will contain the
         instances selected. The first approach is to sort based on Dj all the
diff --git a/instance_selection/_RNN.py b/instance_selection/_RNN.py
@@ -21,7 +21,10 @@ def __init__(self):
 
     def filter(self, samples, y):
         """
-        Implementation of The Reduced Nearest Neighbor
+        Gates, G. (1972). The reduced nearest neighbor rule (corresp.).
+            IEEE transactions on information theory, 18(3), 431-433.
+
+        Implementation of The Reduced Nearest Neighbor.
 
         RNN is an extension of CNN. Firstly CNN will be executed in order to
         have S-CCN. It will perform iterative sample removal from S, and
diff --git a/instance_selection/utils/__init__.py b/instance_selection/utils/__init__.py
@@ -1,6 +1,8 @@
-from ._transformer import transform, delete_multiple_element
+from ._transformer import transform, transform_original_complete,\
+    delete_multiple_element
 
 __all__ = [
     "transform",
+    "transform_original_complete",
     "delete_multiple_element"
 ]
diff --git a/instance_selection/utils/_transformer.py b/instance_selection/utils/_transformer.py
@@ -7,6 +7,10 @@ def transform(samples, y):
     return Bunch(data=x_transformed, target=y_transformed)
 
 
+def transform_original_complete(original, original_y, complete, complete_y):
+    """
+    Run ``transform`` on the original dataset and on the complete dataset.
+
+    :param original: samples of the original dataset.
+    :param original_y: labels of *original*.
+    :param complete: samples of the complete dataset.
+    :param complete_y: labels of *complete*.
+    :return: tuple with the transformed original first and the transformed
+    complete second.
+    """
+    original_bunch = transform(original, original_y)
+    complete_bunch = transform(complete, complete_y)
+    return original_bunch, complete_bunch
+
+
 def delete_multiple_element(list_object, indices, reverse=True):
     indices = sorted(indices, reverse=reverse)
     for idx in indices:

Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,8 @@`
`1`		`-from ._transformer import transform, delete_multiple_element`
	`1`	`+from ._transformer import transform, transform_original_complete,\`
	`2`	`+ delete_multiple_element`
`2`	`3`
`3`	`4`	`__all__ = [`
`4`	`5`	`"transform",`
	`6`	`+ "transform_original_complete",`
`5`	`7`	`"delete_multiple_element"`
`6`	`8`	`]`