Skip to content

Commit 2e355a7

Browse files
Implemented RESSEL algorithm #188
1 parent c259556 commit 2e355a7

File tree

3 files changed

+196
-0
lines changed

3 files changed

+196
-0
lines changed

SSL_RELEASE.md renamed to SSL_REALEASE.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ This project has the following algorithms:
77
- Density Peaks
88
- STDPNF
99

10+
- Ensemble
11+
- RESSEL
1012

1113
# Release Notes
1214

semisupervised/ensemble/_RESSEL.py

Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
#!/usr/bin/env python
2+
# -*- coding:utf-8 -*-
3+
# @Filename: _RESSEL.py
4+
# @Author: Daniel Puente Ramírez
5+
# @Time: 25/4/22 18:49
6+
7+
import numpy as np
8+
import pandas as pd
9+
from sklearn.metrics import f1_score
10+
11+
12+
class RESSEL:
    """
    Reliable semi-supervised ensemble learning (RESSEL).

    Builds ``k`` classifiers, each one trained on a bootstrap sample of the
    labeled data and then enriched by self-training on a random subset of
    the unlabeled data. The out-of-bag labeled samples of every bootstrap
    are used to decide which self-training iteration produced the best
    classifier. Predictions are made by majority vote.

    de Vries, S., & Thierens, D. (2021). A reliable ensemble based approach
    to semi-supervised learning. Knowledge-Based Systems, 215, 106738.
    """

    def __init__(self, n=10, m=10, k=8, unlabeled_sample_frac=0.75,
                 random_state=42, reuse_samples=True):
        """
        Store the hyper-parameters; no training happens here.

        :param n: number of pseudo-labeled samples added per self-training
            iteration (split among classes following the class distribution).
        :param m: maximum number of self-training iterations per classifier.
        :param k: number of classifiers in the ensemble.
        :param unlabeled_sample_frac: fraction of the unlabeled samples
            drawn (without replacement) for each classifier.
        :param random_state: int, ``None``, or an indexable collection with
            one seed per classifier.
        :param reuse_samples: if False, unlabeled samples that have been
            pseudo-labeled are removed from the pool of that classifier.
        """
        self.n = n
        self.m = m
        self.k = k
        self.unlabeled_sample_frac = unlabeled_sample_frac
        self.random_state = random_state
        self.reuse_samples = reuse_samples
        self.ensemble = []

    def _member_seed(self, position):
        """Return the sampling seed for the ensemble member at `position`."""
        state = self.random_state
        if state is None:
            return None
        if hasattr(state, '__iter__'):
            return state[position]
        if isinstance(state, (int, np.integer)):
            # A single shared seed would give every member the very same
            # bootstrap sample; offsetting keeps runs reproducible while
            # making the samples differ.
            return state + position
        # Random generators advance on their own between calls.
        return state

    def fit(self, labeled, unlabeled, base_estimator, estimator_params=None):
        """
        Build an ensemble based on the base_estimator.

        The input DataFrames are not modified.

        :param labeled: pandas DataFrame with labeled samples; the last
            column holds the label.
        :param unlabeled: pandas DataFrame with unlabeled samples.
        :param base_estimator: base Classifier class (not an instance). It
            must provide ``fit``, ``predict`` and ``predict_proba``.
        :param estimator_params: dict of params to pass to the estimator.
        :return: the list of fitted classifiers, in case it is needed.
        :raises AttributeError: if an input is not a DataFrame or the base
            estimator is None.
        :raises ValueError: if `labeled` does not have exactly one more
            column than `unlabeled`.
        """
        if not isinstance(labeled, pd.DataFrame):
            raise AttributeError(
                "Labeled samples object needs to be a Pandas DataFrame. "
                f"Not a {type(labeled)}")

        if not isinstance(unlabeled, pd.DataFrame):
            raise AttributeError(
                "Unlabeled samples object needs to be a Pandas DataFrame. "
                f"Not a {type(unlabeled)}")

        if labeled.shape[1] != unlabeled.shape[1] + 1:
            raise ValueError(
                "Labeled samples must have one more attribute than the "
                f"unlabeled ones. Got {labeled.shape[1]} and "
                f"{unlabeled.shape[1]}.")

        if base_estimator is None:
            raise AttributeError("The base estimator can not be None.")

        params = {} if estimator_params is None else estimator_params
        # Start from scratch so that calling fit() twice does not leave
        # stale or unfitted classifiers in the ensemble.
        self.ensemble = [base_estimator(**params) for _ in range(self.k)]

        # Work on re-indexed copies: positional column names and a unique
        # row index, without touching the caller's objects.
        labeled = labeled.reset_index(drop=True)
        labeled.columns = range(labeled.shape[1])
        unlabeled = unlabeled.reset_index(drop=True)
        unlabeled.columns = range(unlabeled.shape[1])

        for i in range(self.k):
            seed = self._member_seed(i)

            bootstrap = labeled.sample(n=len(labeled), replace=True,
                                       random_state=seed)
            # Out-of-bag = labeled rows that were never drawn.
            oob_i = labeled.drop(index=bootstrap.index.unique())
            if oob_i.empty:
                # Tiny data sets can be fully covered by the bootstrap;
                # evaluate on all labeled samples rather than failing.
                oob_i = labeled
            l_i = bootstrap.reset_index(drop=True)

            u_i = unlabeled.sample(frac=self.unlabeled_sample_frac,
                                   replace=False,
                                   random_state=seed).reset_index(drop=True)

            # Relative frequency of each label, indexed by the label itself.
            d_class_i = l_i.iloc[:, -1].value_counts(normalize=True)

            self.ensemble[i].fit(l_i.iloc[:, :-1], np.ravel(l_i.iloc[:, -1:]))
            self.__robust_self_training(i, l_i, u_i, oob_i, d_class_i)

        return self.ensemble

    def __robust_self_training(self, iteration, l_i, u_i, oob_i, d_class_i):
        """
        Procedure to enrich a given classifier.

        Repeatedly pseudo-labels the most confident unlabeled samples
        (keeping the class proportions of `l_i`), retrains the classifier
        and keeps the version with the best weighted F1 on `oob_i`.

        :param iteration: position of the classifier inside the ensemble.
        :param l_i: DataFrame with the labeled training samples (label last).
        :param u_i: DataFrame with the unlabeled samples available.
        :param oob_i: DataFrame with the evaluation samples (label last).
        :param d_class_i: pandas Series mapping label -> relative frequency.
        """
        from copy import deepcopy
        import warnings

        classifier = self.ensemble[iteration]
        x_oob = oob_i.iloc[:, :-1]
        y_oob = np.ravel(oob_i.iloc[:, -1:])
        label_column = l_i.columns[-1]
        u_i = u_i.reset_index(drop=True)

        best_score_i = f1_score(y_true=y_oob,
                                y_pred=classifier.predict(x_oob),
                                average="weighted")
        # The classifier is refitted in place below, so a snapshot is
        # required to be able to roll back to the best version.
        best_c_i = deepcopy(classifier)

        for _ in range(self.m):
            if u_i.empty:
                break

            prob_i = np.asarray(classifier.predict_proba(u_i))
            predicted = prob_i.argmax(axis=1)
            confidence = prob_i.max(axis=1)
            # predict_proba columns follow classifier.classes_; fall back to
            # the column position when the estimator does not expose it.
            labels = getattr(classifier, "classes_", range(prob_i.shape[1]))

            chosen_blocks = []
            not_enough = False
            for column, label in enumerate(labels):
                quota = int(d_class_i.get(label, 0) * self.n)
                candidates = np.flatnonzero(predicted == column)
                # Most confident predictions first.
                ranking = np.argsort(-confidence[candidates], kind="stable")
                picked = candidates[ranking][:quota]
                if len(picked) < quota:
                    not_enough = True
                if len(picked):
                    block = u_i.iloc[picked].copy()
                    block[label_column] = label
                    chosen_blocks.append(block)

            if not_enough:
                warnings.warn(
                    "Warning: There are not enough samples to keep the "
                    "proportion, consider changing the problem to be able "
                    "to reuse samples or change the model parametrization. ",
                    RuntimeWarning)

            if not chosen_blocks:
                # Nothing to add: further iterations would be identical.
                break

            samples_u_best = pd.concat(chosen_blocks)
            l_i = pd.concat([l_i, samples_u_best], ignore_index=True, axis=0)

            if not self.reuse_samples:
                u_i = u_i.drop(index=samples_u_best.index)
                u_i = u_i.reset_index(drop=True)

            classifier.fit(l_i.iloc[:, :-1], np.ravel(l_i.iloc[:, -1:]))

            current_score_i = f1_score(y_true=y_oob,
                                       y_pred=classifier.predict(x_oob),
                                       average="weighted")

            # F1 is a score: higher is better.
            if current_score_i > best_score_i:
                best_score_i = current_score_i
                best_c_i = deepcopy(classifier)

        self.ensemble[iteration] = best_c_i

    def predict(self, samples):
        """
        Predict using the trained ensemble with a majority vote.

        Ties are resolved in favour of the smallest label.

        :param samples: pandas DataFrame or vector shape (n_samples,
            n_attributes)

        :return: numpy array: predicted labels, one per sample.
        :raises InterruptedError: if the ensemble has not been fitted.
        """
        if len(self.ensemble) == 0:
            raise InterruptedError("To be able to predict, fitting is needed "
                                   "to be already done.")
        if isinstance(samples, pd.DataFrame):
            samples = samples.to_numpy()

        votes = np.array([classifier.predict(samples)
                          for classifier in self.ensemble])
        winners = pd.DataFrame(votes).mode().iloc[0].to_numpy()
        # mode() pads tied columns with NaN, which upcasts the first row to
        # float; restore the dtype the classifiers produced.
        return winners.astype(votes.dtype)
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
#!/usr/bin/env python
2+
# -*- coding:utf-8 -*-
3+
# @Filename: __init__.py
4+
# @Author: Daniel Puente Ramírez
5+
# @Time: 27/4/22 10:41
6+
7+
from ._RESSEL import RESSEL
8+
9+
# Public names exported by the ensemble package.
__all__ = ["RESSEL"]

0 commit comments

Comments
 (0)