Skip to content

Commit 94f740e

Browse files
Added references to Instance Selection Algorithms #153
1 parent 8a61453 commit 94f740e

File tree

10 files changed

+99
-76
lines changed

10 files changed

+99
-76
lines changed

instance_selection/_CNN.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@ def __init__(self):
1818

1919
def filter(self, samples, y):
2020
"""
21+
Hart, P. (1968). The condensed nearest neighbor rule (corresp.). IEEE
22+
transactions on information theory, 14(3), 515-516.
23+
2124
Implementation of The Condensed Nearest Neighbor Rule
2225
2326
The first sample of each class is placed in *store*. Thus we only have

instance_selection/_DROP3.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,10 @@ def __init__(self, nearest_neighbors=3, power_parameter=2):
5757

5858
def filter(self, samples, y):
5959
"""
60+
Wilson, D. R., & Martinez, T. R. (2000). Reduction techniques for
61+
instance-based learning algorithms. Machine learning, 38(3),
62+
257-286.
63+
6064
Implementation of DROP3.
6165
6266
The Decremental Reduction Optimization Procedure (DROP) algorithms base

instance_selection/_ENN.py

Lines changed: 61 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,13 @@
33
# @Filename: ENN.py
44
# @Author: Daniel Puente Ramírez
55
# @Time: 16/11/21 17:14
6-
# @Version: 4.0
6+
# @Version: 5.0
77

88
import numpy as np
99
import pandas as pd
1010
from sklearn.neighbors import NearestNeighbors
1111

12-
from .utils import transform
12+
from .utils import transform, transform_original_complete
1313

1414

1515
class ENN:
@@ -20,6 +20,10 @@ def __init__(self, nearest_neighbors=3, power_parameter=2):
2020

2121
def filter(self, samples, y):
2222
"""
23+
Wilson, D. L. (1972). Asymptotic properties of nearest neighbor rules
24+
using edited data. IEEE Transactions on Systems, Man, and
25+
Cybernetics, (3), 408-421.
26+
2327
Implementation of the Wilson Editing algorithm.
2428
2529
For each sample locates the *k* nearest neighbors and selects the
@@ -62,3 +66,58 @@ def filter(self, samples, y):
6266
y = pd.DataFrame(s_targets)
6367

6468
return samples, y
69+
70+
def filter_original_complete(self, original, original_y, complete,
                             complete_y):
    """
    Modification of the Wilson Editing algorithm.

    For each sample of the complete dataset, the *k* nearest neighbors
    among the remaining samples are located and the majority class among
    them is computed. If that class differs from the label of the sample,
    the sample is removed, unless it is also present in the original
    dataset, in which case it is always kept.

    :param original: DataFrame: dataset with the initial samples.
    :param original_y: DataFrame: labels of the initial samples.
    :param complete: DataFrame: dataset with the initial samples and the
    new ones added by self-training.
    :param complete_y: labels of the complete dataset.
    :return: a tuple ``(samples, y)`` of DataFrames with the remaining
    samples (columns named after ``original.keys()``) and their labels.
    """
    self.x_attr = original.keys()
    original, complete = transform_original_complete(original, original_y,
                                                     complete, complete_y)
    s_samples = list(complete['data'])
    s_targets = list(complete['target'])
    o_samples = list(original['data'])

    # The constructor accepts ``power_parameter`` (default 2). Use it when
    # it is stored on the instance; otherwise keep the Minkowski power of
    # 2 that this method used before.
    power = getattr(self, 'power_parameter', 2)

    # ``position`` always points at the next sample to evaluate. It only
    # advances when the current sample is kept, because a removal shifts
    # the following samples one place to the left.
    position = 0
    for _ in range(len(s_samples)):
        x_sample = s_samples[position]
        x_target = s_targets[position]
        samples_not_x = s_samples[:position] + s_samples[position + 1:]
        targets_not_x = s_targets[:position] + s_targets[position + 1:]

        # Never ask for more neighbors than there are other samples, and
        # stop when the current sample is the only one left.
        n_neighbors = min(self.nearest_neighbors, len(samples_not_x))
        if n_neighbors == 0:
            break

        knn = NearestNeighbors(n_jobs=-1, n_neighbors=n_neighbors, p=power)
        knn.fit(samples_not_x)
        _, neigh_ind = knn.kneighbors([x_sample])
        y_targets = [targets_not_x[i] for i in neigh_ind[0]]
        # argmax returns the first maximum, i.e. the lowest class label
        # wins a tie, exactly as the previous where/amax expression did.
        max_class = np.argmax(np.bincount(y_targets))

        # Only misclassified samples that do not belong to the original
        # dataset may be dropped.
        if max_class != x_target and not any(
                np.array_equal(o_sample, x_sample)
                for o_sample in o_samples):
            s_samples = samples_not_x
            s_targets = targets_not_x
        else:
            position += 1

    samples = pd.DataFrame(s_samples, columns=self.x_attr)
    y = pd.DataFrame(s_targets)

    return samples, y

instance_selection/_ENN_self_training.py

Lines changed: 0 additions & 69 deletions
This file was deleted.

instance_selection/_ICF.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,11 @@ def __init__(self, nearest_neighbors=3, power_parameter=2):
8080

8181
def filter(self, samples, y):
8282
"""
83-
Implementation of Iterative Case Filtering
83+
Brighton, H., & Mellish, C. (2002). Advances in instance selection for
84+
instance-based learning algorithms. Data mining and knowledge
85+
discovery, 6(2), 153-172.
86+
87+
Implementation of Iterative Case Filtering.
8488
8589
ICF is based on coverage and reachable, due to these two concepts it
8690
performs deletion of samples based on the rule: "If the reachability

instance_selection/_LocalSets.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,12 @@
1111

1212

1313
class LocalSets:
14+
"""
15+
Leyva, E., González, A., & Pérez, R. (2015). Three new instance selection
16+
methods based on local sets: A comparative study with several approaches
17+
from a bi-objective perspective. Pattern Recognition, 48(4), 1523-1537.
18+
"""
19+
1420
def __init__(self):
1521
self.local_sets = None
1622
self.n_id = 0
@@ -67,6 +73,8 @@ def __init__(self):
6773
def filter(self, instances, labels):
6874
names = instances.keys()
6975
instances = instances.to_numpy()
76+
import numpy as np
77+
instances = [np.ravel(i) for i in instances]
7078
if len(instances) != len(labels):
7179
raise ValueError(
7280
f'The dimension of the labeled data must be the same as the '
@@ -88,7 +96,7 @@ def filter(self, instances, labels):
8896
s_samples.append(instances[index])
8997
s_labels.append(labels[index])
9098

91-
x = pd.DataFrame(s_samples, columns=names)
99+
x = pd.DataFrame(s_samples)
92100
y = pd.DataFrame(s_labels)
93101
return x, y
94102

instance_selection/_MSS.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,12 @@ def __init__(self):
4545

4646
def filter(self, samples, y):
4747
"""
48-
Implementation of Modified Selective Subset
48+
Barandela, R., Ferri, F. J., & Sánchez, J. S. (2005). Decision boundary
49+
preserving prototype selection for nearest neighbor classification.
50+
International Journal of Pattern Recognition and Artificial
51+
Intelligence, 19(06), 787-806.
52+
53+
Implementation of Modified Selective Subset.
4954
5055
It starts with two empty arrays *dat* and *tar*, which will contain the
5156
instances selected. The first approach is to sort based on Dj all the

instance_selection/_RNN.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,10 @@ def __init__(self):
2121

2222
def filter(self, samples, y):
2323
"""
24-
Implementation of The Reduced Nearest Neighbor
24+
Gates, G. (1972). The reduced nearest neighbor rule (corresp.).
25+
IEEE transactions on information theory, 18(3), 431-433.
26+
27+
Implementation of The Reduced Nearest Neighbor.
2528
2629
RNN is an extension of CNN. Firstly CNN will be executed in order to
2730
have S-CNN. It will perform iterative sample removal from S, and
Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
1-
from ._transformer import transform, delete_multiple_element
1+
from ._transformer import transform, transform_original_complete,\
2+
delete_multiple_element
23

34
__all__ = [
45
"transform",
6+
"transform_original_complete",
57
"delete_multiple_element"
68
]

instance_selection/utils/_transformer.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@ def transform(samples, y):
77
return Bunch(data=x_transformed, target=y_transformed)
88

99

10+
def transform_original_complete(original, original_y, complete, complete_y):
    """
    Apply ``transform`` to the original and to the complete dataset.

    :param original: samples of the original dataset.
    :param original_y: labels of the original dataset.
    :param complete: samples of the complete dataset.
    :param complete_y: labels of the complete dataset.
    :return: a tuple with the result of ``transform`` for the original
    data first and for the complete data second.
    """
    original_transformed = transform(original, original_y)
    complete_transformed = transform(complete, complete_y)
    return original_transformed, complete_transformed
12+
13+
1014
def delete_multiple_element(list_object, indices, reverse=True):
1115
indices = sorted(indices, reverse=reverse)
1216
for idx in indices:

0 commit comments

Comments
 (0)