33# @Filename: ENN.py
44# @Author: Daniel Puente Ramírez
55# @Time: 16/11/21 17:14
6- # @Version: 4 .0
6+ # @Version: 5 .0
77
88import numpy as np
99import pandas as pd
1010from sklearn .neighbors import NearestNeighbors
1111
12- from .utils import transform
12+ from .utils import transform , transform_original_complete
1313
1414
1515class ENN :
@@ -20,6 +20,10 @@ def __init__(self, nearest_neighbors=3, power_parameter=2):
2020
2121 def filter (self , samples , y ):
2222 """
23+ Wilson, D. L. (1972). Asymptotic properties of nearest neighbor rules
24+ using edited data. IEEE Transactions on Systems, Man, and
25+ Cybernetics, (3), 408-421.
26+
2327 Implementation of the Wilson Editing algorithm.
2428
2529 For each sample locates the *k* nearest neighbors and selects the
@@ -62,3 +66,58 @@ def filter(self, samples, y):
6266 y = pd .DataFrame (s_targets )
6367
6468 return samples , y
69+
def filter_original_complete(self, original, original_y, complete,
                             complete_y):
    """
    Wilson Editing variant that never discards the original samples.

    Every sample of the complete dataset is classified by majority vote
    among its *k* nearest neighbors (the sample itself is left out of the
    search). A sample whose voted class differs from its own label is
    dropped, unless an identical sample is present in the original
    dataset, in which case it is always kept.

    :param original: DataFrame: dataset with the initial samples.
    :param original_y: DataFrame: labels of the initial samples.
    :param complete: DataFrame: dataset with the initial samples and the
    new ones added by self-training.
    :param complete_y: labels of the complete dataset.
    :return: tuple ``(samples, y)`` of DataFrames holding the samples that
    survived the editing and their labels.
    """
    # Column names are stored before `original` is rebound below.
    self.x_attr = original.keys()
    # Project helper; its results are only accessed through the 'data' and
    # 'target' keys here.
    original, complete = transform_original_complete(
        original, original_y, complete, complete_y)

    kept_samples = list(complete['data'])
    kept_targets = list(complete['target'])
    protected = list(original['data'])
    total = len(complete['data'])

    # `cursor` is the position, inside the shrinking lists, of the sample
    # under evaluation. It only advances when the sample is kept, because
    # a removal shifts the next candidate into the current position.
    cursor = 0
    for _ in range(total):
        candidate = kept_samples[cursor]
        candidate_label = kept_targets[cursor]

        # Leave-one-out pool: everything still kept except the candidate.
        other_samples = kept_samples[:cursor] + kept_samples[cursor + 1:]
        other_targets = kept_targets[:cursor] + kept_targets[cursor + 1:]

        # NOTE(review): p is hard-coded to 2 although the constructor
        # accepts a `power_parameter` argument - confirm whether that
        # value should be used here instead.
        finder = NearestNeighbors(n_jobs=-1,
                                  n_neighbors=self.nearest_neighbors, p=2)
        finder.fit(other_samples)
        _, neighbor_idx = finder.kneighbors([candidate])

        votes = np.bincount([other_targets[pos] for pos in neighbor_idx[0]])
        # argmax returns the first maximum, so ties go to the lowest label.
        winner = np.argmax(votes)

        if winner != candidate_label and not any(
                np.array_equal(seed, candidate) for seed in protected):
            # Misclassified and not part of the original data: drop it.
            kept_samples, kept_targets = other_samples, other_targets
        else:
            cursor += 1

    samples = pd.DataFrame(kept_samples, columns=self.x_attr)
    y = pd.DataFrame(kept_targets)

    return samples, y
0 commit comments