
Commit 0cd8979

Reduced cognitive complexity #197
1 parent ca00cc8

5 files changed: 240 additions & 162 deletions

semisupervised/DensityPeaks.py

Lines changed: 47 additions & 25 deletions
@@ -22,12 +22,12 @@
 class STDPNF:
     """
     Li, J., Zhu, Q., & Wu, Q. (2019). A self-training method based on density
-    peaks and an extended parameter-free local noise filter for k nearest
-    neighbor. Knowledge-Based Systems, 184, 104895.
+    peaks and an extended parameter-free local noise filter for k nearest
+    neighbor. Knowledge-Based Systems, 184, 104895.

     Wu, D., Shang, M., Luo, X., Xu, J., Yan, H., Deng, W., & Wang, G. (2018).
-    Self-training semi-supervised classification based on density peaks of
-    data. Neurocomputing, 275, 180-191.
+    Self-training semi-supervised classification based on density peaks of
+    data. Neurocomputing, 275, 180-191.
     """

     def __init__(self,
@@ -455,27 +455,9 @@ def _fit_stdpnf(self):
             complete = labeled_data['sample']
             complete_y = labeled_data['label']

-            if isinstance(self.filter, ENN):
-                original = pd.DataFrame(self.l)
-                original_y = pd.DataFrame(self.y)
-                result, _ = self.filter.filter_original_complete(
-                    original, original_y, complete, complete_y)
-            else:
-                result, _ = self.filter.filter(complete, complete_y)
-
-            results_to_unlabeled = []
-            for r in result.to_numpy():
-                is_in = False
-                for c in complete:
-                    if np.array_equal(r, c):
-                        is_in = True
-                if not is_in:
-                    results_to_unlabeled.append(r)
-
-            for r in results_to_unlabeled:
-                self.structure_stdnpf.at[
-                    np.array(self.structure_stdnpf['sample'],
-                             r)]['label'] = -1
+            result = self._if_filter(complete, complete_y)
+
+            self._results_to_structure(complete, result)

             labeled_data = self.structure_stdnpf.loc[self.structure_stdnpf[
                 'label'] != -1]
@@ -489,6 +471,46 @@ def _fit_stdpnf(self):
         self.classifier_stdpnf.fit(
             labeled_data['sample'].tolist(), labeled_data['label'].tolist())

+    def _results_to_structure(self, complete, result):
+        """
+        Compare the filter's result against the complete dataset. Every
+        row of the result that does not appear in the complete dataset is
+        marked as unlabeled (label -1) in the structure dataset.
+
+        :param complete: the complete dataset
+        :param result: the result of the filtering
+        """
+        results_to_unlabeled = []
+        for r in result.to_numpy():
+            is_in = False
+            for c in complete:
+                if np.array_equal(r, c):
+                    is_in = True
+            if not is_in:
+                results_to_unlabeled.append(r)
+        for r in results_to_unlabeled:
+            self.structure_stdnpf.at[
+                np.array(self.structure_stdnpf['sample'],
+                         r)]['label'] = -1
+
+    def _if_filter(self, complete, complete_y):
+        """
+        If the filter is an ENN, filter the original data against the
+        complete data; otherwise filter the complete data alone.
+
+        :param complete: the complete dataframe
+        :param complete_y: the complete y values
+        :return: a dataframe with the filtered data
+        """
+        if isinstance(self.filter, ENN):
+            original = pd.DataFrame(self.l)
+            original_y = pd.DataFrame(self.y)
+            result, _ = self.filter.filter_original_complete(
+                original, original_y, complete, complete_y)
+        else:
+            result, _ = self.filter.filter(complete, complete_y)
+        return result
+
     def fit(self, samples, y):
         """Fit method."""
         try:
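Review note: the membership test that _results_to_structure inherits from the old inline code can be read in isolation. Every row of the filter result is compared against every row of the complete set, and unmatched rows are collected. A minimal standalone sketch, assuming only NumPy; rows_not_in is an illustrative name, not part of the repository:

import numpy as np

def rows_not_in(result, complete):
    # Quadratic scan: keep rows of `result` with no exact match in `complete`.
    return [r for r in result
            if not any(np.array_equal(r, c) for c in complete)]

complete = np.array([[1.0, 2.0], [3.0, 4.0]])
result = np.array([[1.0, 2.0], [5.0, 6.0]])
print(rows_not_in(result, complete))  # [array([5., 6.])]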

semisupervised/TriTraining.py

Lines changed: 114 additions & 103 deletions
@@ -3,7 +3,7 @@
 # @Filename: TriTraining.py
 # @Author: Daniel Puente Ramírez
 # @Time: 27/12/21 10:25
-# @Version: 4.0
+# @Version: 5.0

 from math import floor, ceil

@@ -81,6 +81,16 @@ def __init__(self, random_state=None,
             np.random.randint(low=0, high=10e5, size=1)[0]

     def _subsample(self, l_t, s):
+        """
+        Take a Bunch object, a dictionary-like object that contains the
+        data and target arrays, and a sample size, and return a Bunch
+        object with the data and target arrays sub-sampled to the
+        specified size.
+
+        :param l_t: the labeled and unlabeled data
+        :param s: the number of samples to be drawn from the dataset
+        :return: A Bunch object with the data and target attributes.
+        """
         np.random.seed(self.random_state)
         rng = np.random.default_rng()
         data = np.array(l_t['data'])
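The new docstring describes a paired draw without replacement. A minimal sketch of that behaviour, assuming Bunch is sklearn.utils.Bunch as used by the class; subsample and the explicit seed parameter are illustrative, not the exact body of _subsample:

import numpy as np
from sklearn.utils import Bunch

def subsample(l_t, s, seed=None):
    # Draw s (data, target) pairs without replacement, keeping them aligned.
    rng = np.random.default_rng(seed)
    idx = rng.choice(len(l_t['data']), size=s, replace=False)
    return Bunch(data=np.asarray(l_t['data'])[idx],
                 target=np.asarray(l_t['target'])[idx])

pool = Bunch(data=[[0, 0], [1, 1], [2, 2]], target=[0, 1, 0])
print(subsample(pool, 2, seed=42))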
@@ -91,6 +101,17 @@ def _subsample(self, l_t, s):
         return Bunch(data=samples, target=targets)

     def fit(self, samples, y):
+        """
+        Take the training data and the labels and split them into labeled
+        and unlabeled sets. Create three classifiers, h_i, h_j, and h_k,
+        train them on the labeled data, and keep enlarging their training
+        sets with newly labeled samples until none of the three
+        classifiers changes between rounds.
+
+        :param samples: The samples to train the classifiers on
+        :param y: the labels
+        """
         try:
             labeled, u, y = split(samples, y)
         except IndexError:
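The acceptance check applied per classifier in the hunk below is tri-training's usual one: a round's newly labeled pool L_t is kept when its error mass shrinks, i.e. e_t * |L_t| < e_{t-1} * |L_{t-1}|, sub-sampling L_t when it grows too fast. A minimal standalone sketch mirroring the tail of _train_classifier; should_update is a hypothetical name, and the function assumes e < ep, as the caller guarantees:

from math import ceil, floor

def should_update(e, ep, n_new, lp):
    # Return (accept?, pool size to keep) for one round; assumes e < ep.
    if lp == 0:
        lp = floor(e / (ep - e) + 1)  # bootstrap the previous pool size
    if lp < n_new:
        if e * n_new < ep * lp:
            return True, n_new  # accept the whole pool
        if lp > e / (ep - e):
            return True, ceil(ep * lp / e - 1)  # accept a subsample
    return False, n_new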
@@ -121,114 +142,104 @@ def fit(self, samples, y):
             hash_j = h_j.__hash__()
             hash_k = h_k.__hash__()

-            update_j = False
-            l_j = Bunch(data=[], target=[])
-            e_j = measure_error(h_j, h_k, labeled)
-
-            if e_j < ep_j:
-                for sample in u:
-                    sample_s = sample.reshape(1, -1)
-                    if h_j.predict(sample_s) == h_k.predict(sample_s):
-                        pred = h_i.predict(sample_s)
-                        prev_dat = list(l_j['data'])
-                        prev_tar = list(l_j['target'])
-                        prev_dat.append(sample)
-                        l_j['data'] = np.array(prev_dat)
-                        prev_tar.append(pred)
-                        l_j['target'] = np.array(prev_tar)
-
-                if lp_j == 0:
-                    lp_j = floor(e_j / (ep_j - e_j) + 1)
-
-                if lp_j < len(l_j['data']):
-                    if e_j * len(l_j['data']) < ep_j * lp_j:
-                        update_j = True
-                    elif lp_j > e_j / (ep_j - e_j):
-                        l_j = self._subsample(l_j, ceil(((ep_j * lp_j) / e_j)
-                                                        - 1))
-                        update_j = True
-
-            update_k = False
-            l_k = Bunch(data=np.array([]), target=np.array([]))
-            e_k = measure_error(h_j, h_k, labeled)
-
-            if e_k < ep_k:
-                for sample in u:
-                    sample_s = sample.reshape(1, -1)
-                    if h_j.predict(sample_s) == h_k.predict(sample_s):
-                        pred = h_i.predict(sample_s)
-                        prev_dat = list(l_k['data'])
-                        prev_tar = list(l_k['target'])
-                        prev_dat.append(sample)
-                        l_k['data'] = np.array(prev_dat)
-                        prev_tar.append(pred)
-                        l_k['target'] = np.array(prev_tar)
-
-                if lp_k == 0:
-                    lp_k = floor(e_k / (ep_k - e_k) + 1)
-
-                if lp_k < len(l_k['data']):
-                    if e_k * len(l_k['data']) < ep_k * lp_k:
-                        update_k = True
-                    elif lp_k > e_k / (ep_k - e_k):
-                        l_k = self._subsample(l_k, ceil(((ep_k * lp_k) / e_k)
-                                                        - 1))
-                        update_k = True
-
-            update_i = False
-            l_i = Bunch(data=np.array([]), target=np.array([]))
-            e_i = measure_error(h_j, h_k, labeled)
-
-            if e_i < ep_i:
-                for sample in u:
-                    sample_s = sample.reshape(1, -1)
-                    if h_j.predict(sample_s) == h_k.predict(sample_s):
-                        pred = h_i.predict(sample_s)
-                        prev_dat = list(l_i['data'])
-                        prev_tar = list(l_i['target'])
-                        prev_dat.append(sample)
-                        l_i['data'] = np.array(prev_dat)
-                        prev_tar.append(pred)
-                        l_i['target'] = np.array(prev_tar)
-
-                if lp_i == 0:
-                    lp_i = floor(e_i / (ep_i - e_i) + 1)
-
-                if lp_i < len(l_i['data']):
-                    if e_i * len(l_i['data']) < ep_i * lp_i:
-                        update_i = True
-                    elif lp_i > e_i / (ep_i - e_i):
-                        l_i = self._subsample(l_i, ceil(((ep_i * lp_i) / e_i)
-                                                        - 1))
-                        update_i = True
-
-            if update_j:
-                train = np.concatenate((labeled, l_j['data']), axis=0)
-                test = np.concatenate((y, np.ravel(l_j['target'])),
-                                      axis=0)
-                h_j = self.hj.fit(train, test)
-                ep_j = e_j
-                lp_j = len(l_j)
-            if update_k:
-                train = np.concatenate((labeled, l_k['data']), axis=0)
-                test = np.concatenate((y, np.ravel(l_k['target'])),
-                                      axis=0)
-                h_k = self.hk.fit(train, test)
-                ep_k = e_k
-                lp_k = len(l_k)
-            if update_i:
-                train = np.concatenate((labeled, l_i['data']), axis=0)
-                test = np.concatenate((y, np.ravel(l_i['target'])),
-                                      axis=0)
-                h_i = self.hi.fit(train, test)
-                ep_i = e_i
-                lp_i = len(l_i)
+            e_j, l_j, update_j = self._train_classifier(ep_j, h_i, h_j, h_k,
+                                                        labeled, lp_j, u)
+
+            e_k, l_k, update_k = self._train_classifier(ep_k, h_i, h_j, h_k,
+                                                        labeled, lp_k, u)
+
+            e_i, l_i, update_i = self._train_classifier(ep_i, h_i, h_j, h_k,
+                                                        labeled, lp_i, u)
+
+            ep_j, h_j, lp_j = self._check_for_update(e_j, ep_j, h_j, l_j,
+                                                     labeled, lp_j, update_j, y)
+            ep_k, h_k, lp_k = self._check_for_update(e_k, ep_k, h_k, l_k,
+                                                     labeled, lp_k, update_k,
+                                                     y)
+
+            ep_i, h_i, lp_i = self._check_for_update(e_i, ep_i, h_i, l_i,
+                                                     labeled, lp_i, update_i, y)

             if h_i.__hash__() == hash_i and h_j.__hash__() == hash_j and \
                     h_k.__hash__() == hash_k:
                 break

+    def _check_for_update(self, e_j, ep_j, h_j, l_j, labeled, lp_j, update_j,
+                          y):
+        """
+        If the update_j flag is set, concatenate the labeled data with the
+        newly labeled samples and refit the classifier on the result.
+
+        :param e_j: the error of the current round
+        :param ep_j: the error of the previous round
+        :param h_j: the classifier being updated
+        :param l_j: the newly labeled samples
+        :param labeled: the labeled data
+        :param lp_j: the size of the previous round's labeled pool
+        :param update_j: boolean, whether to refit the classifier or not
+        :param y: the true labels of the labeled data
+        :return: the previous-round error, the classifier, and the labeled
+            pool size, updated if a refit took place.
+        """
+        if update_j:
+            train = np.concatenate((labeled, l_j['data']), axis=0)
+            test = np.concatenate((y, np.ravel(l_j['target'])),
+                                  axis=0)
+            h_j = self.hj.fit(train, test)
+            ep_j = e_j
+            lp_j = len(l_j)
+        return ep_j, h_j, lp_j
+
+    def _train_classifier(self, ep_k, h_i, h_j, h_k, labeled, lp_k, u):
+        """
+        Measure the error of h_j and h_k on the labeled data. If it
+        improved on the previous round, label every unlabeled sample on
+        which h_j and h_k agree, using h_i's prediction, and apply the
+        tri-training acceptance test to decide whether the classifier
+        should be refitted.
+
+        :param ep_k: the error of the previous round
+        :param h_i: the classifier whose predictions label the new samples
+        :param h_j: one of the two classifiers whose agreement selects
+            samples
+        :param h_k: the other classifier whose agreement selects samples
+        :param labeled: the labeled data
+        :param lp_k: the size of the previous round's labeled pool
+        :param u: the unlabeled data
+        :return: The error, the newly labeled samples, and a boolean
+            indicating whether the classifier should be refitted.
+        """
+        update_k = False
+        l_k = Bunch(data=np.array([]), target=np.array([]))
+        e_k = measure_error(h_j, h_k, labeled)
+        if e_k < ep_k:
+            for sample in u:
+                sample_s = sample.reshape(1, -1)
+                if h_j.predict(sample_s) == h_k.predict(sample_s):
+                    pred = h_i.predict(sample_s)
+                    prev_dat = list(l_k['data'])
+                    prev_tar = list(l_k['target'])
+                    prev_dat.append(sample)
+                    l_k['data'] = np.array(prev_dat)
+                    prev_tar.append(pred)
+                    l_k['target'] = np.array(prev_tar)
+
+            if lp_k == 0:
+                lp_k = floor(e_k / (ep_k - e_k) + 1)
+
+            if lp_k < len(l_k['data']):
+                if e_k * len(l_k['data']) < ep_k * lp_k:
+                    update_k = True
+                elif lp_k > e_k / (ep_k - e_k):
+                    l_k = self._subsample(l_k, ceil(((ep_k * lp_k) / e_k)
+                                                    - 1))
+                    update_k = True
+        return e_k, l_k, update_k
+
     def predict(self, samples):
+        """
+        For each sample, predict the label with each of the three
+        classifiers, then take the majority vote of the three predictions.
+
+        :param samples: the data to be classified
+        :return: The labels of the samples.
+        """
         labels = []
         pred1 = self.hi.predict(samples)
         pred2 = self.hj.predict(samples)
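The majority vote that the new predict docstring describes can be sketched standalone. majority_vote is an illustrative name, not the repository's API; three-way ties fall back to the first classifier's vote through Counter's insertion order:

from collections import Counter
import numpy as np

def majority_vote(pred1, pred2, pred3):
    # One label per sample: the most common of the three votes.
    return np.array([Counter(votes).most_common(1)[0][0]
                     for votes in zip(pred1, pred2, pred3)])

print(majority_vote([0, 1, 1], [0, 1, 0], [1, 1, 0]))  # [0 1 0]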
