Added tests for instance selection algorithms #174

dpuenteramirez · dpuenteramirez · commit 4ed22a217672 · 2022-04-15T19:34:38.000+02:00
diff --git a/instance_selection/_DROP3.py b/instance_selection/_DROP3.py
@@ -138,10 +138,6 @@ def filter(self, samples, y):
                         if np.array_equal(neigh, x_sample):
                             break
                     a_neighs = a_neighs[:index_a] + a_neighs[index_a + 1:]
-                    try:
-                        assert len(a_neighs) == self.nearest_neighbors
-                    except AssertionError:
-                        breakpoint()
                     # Find a new neigh for the associate
                     remaining_samples = [x for x, _, _ in initial_distances]
                     knn = NearestNeighbors(
@@ -162,20 +158,12 @@ def filter(self, samples, y):
                             a_neighs.append(pos_neigh)
                             break
 
-                    try:
-                        assert len(a_neighs) == self.nearest_neighbors + 1
-                    except AssertionError:
-                        print('Duplicated instances')
-
                     samples_info[tuple(a_associate_of_x)][0] = a_neighs
 
                     # Add a_associate to the associates list of the new neigh
                     new_neigh = a_neighs[-1]
-                    try:
-                        samples_info[tuple(new_neigh)][1].append(
-                            a_associate_of_x)
-                    except TypeError:
-                        pass
+                    samples_info[tuple(new_neigh)][1].append(
+                        a_associate_of_x)
 
         samples = pd.DataFrame([x for x, _, _ in initial_distances],
                                columns=self.x_attr)
diff --git a/instance_selection/_ENN.py b/instance_selection/_ENN.py
@@ -18,7 +18,7 @@ def __init__(self, nearest_neighbors=3, power_parameter=2):
         self.power_parameter = power_parameter
         self.x_attr = None
 
-    def neighs(self, s_samples, s_targets, index, removed):
+    def __neighs(self, s_samples, s_targets, index, removed):
         x_sample = s_samples[index - removed]
         x_target = s_targets[index - removed]
         knn = NearestNeighbors(n_jobs=-1,
@@ -56,8 +56,8 @@ def filter(self, samples, y):
         removed = 0
 
         for index in range(size):
-            _, x_target, targets_not_x, samples_not_x, neigh_ind = self.neighs(
-                s_samples, s_targets, index, removed)
+            _, x_target, targets_not_x, samples_not_x, neigh_ind = \
+                self.__neighs(s_samples, s_targets, index, removed)
             y_targets = np.ravel(
                 np.array([targets_not_x[x] for x in neigh_ind[0]])).astype(int)
             count = np.bincount(y_targets)
@@ -100,9 +100,9 @@ def filter_original_complete(self, original, original_y, complete,
 
         for index in range(size):
             x_sample, x_target, targets_not_x, samples_not_x, neigh_ind = \
-                self.neighs(s_samples, s_targets, index, removed)
+                self.__neighs(s_samples, s_targets, index, removed)
             y_targets = [targets_not_x[x] for x in neigh_ind[0]]
-            count = np.bincount(y_targets)
+            count = np.bincount(np.ravel(y_targets))
             max_class = np.where(count == np.amax(count))[0][0]
             if max_class != x_target:
                 delete = True
diff --git a/instance_selection/_LocalSets.py b/instance_selection/_LocalSets.py
@@ -6,6 +6,7 @@
 # @Version:     2.0
 import sys
 
+import numpy as np
 import pandas as pd
 from sklearn.metrics import pairwise_distances
 
@@ -65,6 +66,12 @@ def usefulness(self, e):
     def get_local_sets(self):
         return self.local_sets
 
+    @staticmethod
+    def check_frame_to_numpy(y):
+        """Flatten DataFrame labels into a 1-D NumPy array.
+
+        :param y: labels; anything that is not a ``pd.DataFrame`` is
+            returned untouched.
+        :return: ``np.ravel`` of the frame's values, or ``y`` itself.
+        """
+        return np.ravel(y.to_numpy()) if isinstance(y, pd.DataFrame) else y
+
 
 class LSSm(LocalSets):
     def __init__(self):
@@ -73,8 +80,8 @@ def __init__(self):
     def filter(self, instances, labels):
         names = instances.keys()
         instances = instances.to_numpy()
-        import numpy as np
         instances = [np.ravel(i) for i in instances]
+        labels = self.check_frame_to_numpy(labels)
         if len(instances) != len(labels):
             raise ValueError(
                 f'The dimension of the labeled data must be the same as the '
@@ -113,6 +120,7 @@ def filter(self, instances, labels):
                 f'number of labels given. {len(instances)} != {len(labels)}'
             )
         self.n_id = len(instances)
+        labels = self.check_frame_to_numpy(labels)
         lssm = LSSm()
         instances, labels = lssm.filter(instances, labels)
         instances = instances.to_numpy()
diff --git a/instance_selection/utils/__init__.py b/instance_selection/utils/__init__.py
@@ -1,4 +1,4 @@
-from ._transformer import transform, transform_original_complete,\
+from ._transformer import transform, transform_original_complete, \
     delete_multiple_element
 
 __all__ = [
diff --git a/is-ssl.yml b/is-ssl.yml
@@ -2,10 +2,12 @@ name: IS-SSL
 channels:
   - conda-forge
   - default
+  - anaconda
 dependencies:
   - numpy=1.20.3
   - scikit-learn=0.24.2
   - matplotlib=3.4.3
   - pandas=1.3.4
   - yagmail=0.15.277
-  - scipy~=1.7.1
+  - scipy=1.7.1
+  - pytest=7.1.1
diff --git a/requirements.txt b/requirements.txt
@@ -3,4 +3,5 @@ scikit-learn~=0.24.2
 matplotlib~=3.4.3
 pandas~=1.3.4
 yagmail~=0.15.277
-scipy~=1.7.1
+scipy~=1.7.1
+pytest~=7.1.1
diff --git a/tests/InstanceSelection.py b/tests/InstanceSelection.py
@@ -0,0 +1,129 @@
+#!/usr/bin/env python
+# -*- coding:utf-8 -*-
+# @Filename:    InstanceSelection.py
+# @Author:      Daniel Puente Ramírez
+# @Time:        15/4/22 16:20
+
+import random
+
+import numpy as np
+import pandas as pd
+import pytest
+from sklearn.datasets import load_iris
+
+from instance_selection import ENN, CNN, RNN, ICF, MSS, DROP3, LSSm, LSBo
+
+
+def to_dataframe(y):
+    """Return ``y`` itself if it is a DataFrame, else ``pd.DataFrame(y)``."""
+    return y if isinstance(y, pd.DataFrame) else pd.DataFrame(y)
+
+
+@pytest.fixture
+def iris_dataset():
+    """Provide the Iris samples and targets, both as DataFrames."""
+    features, target = load_iris(return_X_y=True, as_frame=True)
+    return features, to_dataframe(target)
+
+
+@pytest.fixture
+def iris_dataset_ss():
+    """Provide a semi-supervised split of the Iris dataset.
+
+    A random 30 % of the row positions is treated as unlabeled; the
+    remaining rows form the labeled ("original") subset, while the full
+    dataset is handed back as the "complete" set.
+
+    :return: ``(original, original_labels, complete, complete_labels)``,
+        every element being a DataFrame.
+    """
+    features, target = load_iris(return_X_y=True, as_frame=True)
+    target = to_dataframe(target)
+    n_samples = features.shape[0]
+
+    # A set gives O(1) membership tests while building the labeled list.
+    unlabeled = set(random.sample(range(n_samples), int(n_samples * 0.3)))
+    labeled = [i for i in range(n_samples) if i not in unlabeled]
+
+    # NOTE(review): ``.loc`` is label-based; this assumes load_iris
+    # yields a default 0..n-1 index so positions equal labels - confirm.
+    return features.loc[labeled], target.loc[labeled], features, target
+
+
+def base(x, y, algorithm, params=None):
+    """Run ``algorithm`` on ``(x, y)`` and check the filtered output.
+
+    :param x: samples, as a DataFrame.
+    :param y: labels, as a DataFrame.
+    :param algorithm: class whose instances expose ``filter(x, y)``.
+    :param params: optional keyword arguments for the constructor.
+    :raises AssertionError: if an input is not a DataFrame, a column
+        count changes, the outputs differ in length, or no row is removed.
+    """
+    assert isinstance(x, pd.DataFrame)
+    assert isinstance(y, pd.DataFrame)
+
+    kwargs = {} if params is None else params
+    x_filtered, y_filtered = algorithm(**kwargs).filter(x, y)
+
+    # Filtering may only drop rows; the column layout must survive.
+    assert x_filtered.shape[1] == x.shape[1]
+    assert y_filtered.shape[1] == y.shape[1]
+
+    assert x_filtered.shape[0] == y_filtered.shape[0]
+    assert x_filtered.shape[0] < x.shape[0]
+
+
+def test_enn_original(iris_dataset):
+    """ENN (nearest_neighbors=3, power_parameter=2) passes ``base``."""
+    features, target = iris_dataset
+    enn_params = {'nearest_neighbors': 3, 'power_parameter': 2}
+    base(features, target, ENN, enn_params)
+
+
+def test_cnn(iris_dataset):
+    """CNN with default arguments passes the ``base`` checks on Iris."""
+    base(*iris_dataset, CNN)
+
+
+def test_rnn(iris_dataset):
+    """RNN with default arguments passes the ``base`` checks on Iris."""
+    base(*iris_dataset, RNN)
+
+
+def test_icf(iris_dataset):
+    """ICF (nearest_neighbors=3, power_parameter=2) passes ``base``."""
+    features, target = iris_dataset
+    icf_params = {'nearest_neighbors': 3, 'power_parameter': 2}
+    base(features, target, ICF, icf_params)
+
+
+def test_mss(iris_dataset):
+    """MSS with default arguments passes the ``base`` checks on Iris."""
+    base(*iris_dataset, MSS)
+
+
+def test_drop3(iris_dataset):
+    """DROP3 (nearest_neighbors=3, power_parameter=2) passes ``base``."""
+    features, target = iris_dataset
+    drop3_params = {'nearest_neighbors': 3, 'power_parameter': 2}
+    base(features, target, DROP3, drop3_params)
+
+
+def test_local_sets_lssm(iris_dataset):
+    """LSSm with default arguments passes the ``base`` checks on Iris."""
+    base(*iris_dataset, LSSm)
+
+
+def test_local_sets_lsbo(iris_dataset):
+    """LSBo with default arguments passes the ``base`` checks on Iris."""
+    base(*iris_dataset, LSBo)
+
+
+def test_enn_ss(iris_dataset_ss):
+    """Check ``ENN.filter_original_complete`` on the semi-supervised split.
+
+    The labels looked up in the result for the original samples must
+    equal ``original_labels``; the result must keep the column count of
+    the complete set and must not have more rows than it.
+    """
+    original, original_labels, complete, complete_labels = iris_dataset_ss
+
+    x, y = ENN().filter_original_complete(
+        original, original_labels, complete, complete_labels)
+
+    # Position of the first row of the result equal to each original
+    # sample; originals with no match contribute nothing.
+    filtered_rows = x.to_numpy()
+    new_orig = []
+    for ori in original.to_numpy():
+        match = next(
+            (pos for pos, row in enumerate(filtered_rows)
+             if np.array_equal(ori, row)),
+            None)
+        if match is not None:
+            new_orig.append(match)
+
+    # NOTE(review): positions are used as ``.loc`` labels below, which
+    # assumes the returned frame has a default 0..n-1 index - confirm.
+    found_labels = np.ravel(y.loc[new_orig].to_numpy())
+    expected_labels = np.ravel(original_labels.to_numpy())
+    assert np.array_equal(expected_labels, found_labels)
+    assert complete.shape[1] == x.shape[1]
+    assert complete.shape[0] >= x.shape[0]
+
+
+def test_different_len(iris_dataset):
+    """LSSm and LSBo must reject inputs whose lengths disagree.
+
+    One label row is dropped so that ``len(x) != len(y)``; both filters
+    are expected to raise ``ValueError``.
+    """
+    x, y = iris_dataset
+    # Positional slice: drop exactly the last row. The label-based
+    # ``.loc[:-1]`` looks up the label -1 instead, which on a default
+    # 0..n-1 index selects no rows at all rather than "all but the last".
+    y = y.iloc[:-1]
+    for algorithm in (LSSm, LSBo):
+        with pytest.raises(ValueError):
+            algorithm().filter(x, y)
diff --git a/tests/__init__.py b/tests/__init__.py
@@ -0,0 +1,7 @@
+#!/usr/bin/env python
+# -*- coding:utf-8 -*-
+# @Filename:    __init__.py
+# @Author:      Daniel Puente Ramírez
+# @Time:        15/4/22 16:19
+
+"""Python module for testing"""

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-from ._transformer import transform, transform_original_complete,\`
	`1`	`+from ._transformer import transform, transform_original_complete, \`
`2`	`2`	`delete_multiple_element`
`3`	`3`
`4`	`4`	`__all__ = [`