Commit 4f036b3
Improved SSL algorithms' documentation #194
1 parent 068dcac commit 4f036b3

File tree

8 files changed: +423, -116 lines changed

semisupervised/CoTraining.py

Lines changed: 99 additions & 18 deletions
@@ -3,7 +3,7 @@
 # @Filename: CoTraining.py
 # @Author: Daniel Puente Ramírez
 # @Time: 22/12/21 09:27
-# @Version: 4.0
+# @Version: 5.0
 
 from math import ceil, floor

@@ -15,15 +15,66 @@
 
 
 class CoTraining:
-    """Blum, A., & Mitchell, T. (1998, July). Combining labeled and unlabeled
-    data with co-training. In Proceedings of the eleventh annual conference
-    on Computational learning theory (pp. 92-100).
+    """
+    Blum, A., & Mitchell, T. (1998, July). Combining labeled and unlabeled
+    data with co-training. In Proceedings of the eleventh annual conference
+    on Computational learning theory (pp. 92-100).
+
+    Parameters
+    ----------
+    p : int, default=1
+        The number of positive samples.
+
+    n : int, default=3
+        The number of negative samples.
+
+    k : int, default=30
+        The number of iterations to train the classifiers.
+
+    u : int, default=75
+        The number of unlabeled samples to use in the training set.
+
+    random_state : int, default=None
+        The random seed used to generate the initial population.
+
+    c1 : base_estimator, default=GaussianNB
+        The first classifier to be used.
+
+    c1_params : dict, default=None
+        Parameters for the first classifier.
+
+    c2 : base_estimator, default=GaussianNB
+        The second classifier to be used.
+
+    c2_params : dict, default=None
+        Parameters for the second classifier.
     """
 
     def __init__(self, p=1, n=3, k=30, u=75, random_state=None,
                  c1=None, c1_params=None,
-                 c2=None, c2_params=None,
-                 ):
+                 c2=None, c2_params=None):
+        """
+        Build the two base classifiers from the given parameters. If a
+        classifier is None, the default GaussianNB is used.
+
+        :param p: The number of positive samples, defaults to 1 (optional)
+        :param n: The number of negative samples, defaults to 3 (optional)
+        :param k: The number of iterations to train the classifiers,
+            defaults to 30 (optional)
+        :param u: The number of unlabeled samples to use in the training
+            set, defaults to 75 (optional)
+        :param random_state: The random seed used to generate the initial
+            population
+        :param c1: The first classifier to be used
+        :param c1_params: Parameters for the first classifier
+        :param c2: The second classifier to be used
+        :param c2_params: Parameters for the second classifier
+        """
         self.p = p
         self.n = n
         self.k = k
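
For reference, a minimal usage sketch of the constructor this hunk documents. The import path and the estimator-class-plus-params convention for c1/c2 are assumptions for illustration, not something this commit states:

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

from semisupervised.CoTraining import CoTraining  # assumed import path

# Defaults: both views fall back to GaussianNB.
ct_default = CoTraining()

# Custom base estimators; c1/c2 are assumed to take an estimator class
# plus a keyword dict, mirroring the c1_params/c2_params arguments.
ct_custom = CoTraining(p=1, n=3, k=30, u=75, random_state=42,
                       c1=DecisionTreeClassifier,
                       c1_params={'max_depth': 3},
                       c2=GaussianNB)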
@@ -46,10 +97,17 @@ def __init__(self, p=1, n=3, k=30, u=75, random_state=None,
         self.h1, self.h2 = configs
 
     def fit(self, samples, y):
-        try:
-            labeled, u, y = split(samples, y)
-        except IndexError:
-            raise ValueError('Dimensions do not match.')
+        """
+        Split the input into labeled and unlabeled samples, train the two
+        classifiers on the labeled ones, and use them to predict the
+        unlabeled ones. The samples predicted with the highest confidence
+        are added to the labeled set, and the process is repeated k times.
+
+        :param samples: the training data, labeled and unlabeled
+        :param y: the labels of the samples
+        """
+        labeled, rng, u, u_random_index, y = self._check_parameters(samples, y)
 
         le = LabelEncoder()
         le.fit(y)
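
The loop this docstring summarizes can be sketched roughly as follows. It is a simplified, self-contained illustration of one co-training round, assuming a binary problem (as the p/n naming suggests); cotraining_round is a hypothetical helper, not the actual body of fit:

import numpy as np

def cotraining_round(h1, h2, x1, x2, y, u_prime, p, n):
    # Fit one classifier per feature view (each view is half the columns).
    h1.fit(x1, y)
    h2.fit(x2, y)
    u1, u2 = np.array_split(u_prime, 2, axis=1)
    promoted = set()
    for h, u_view in ((h1, u1), (h2, u2)):
        proba = h.predict_proba(u_view)
        # p most confidently positive and n most confidently negative rows.
        promoted.update(np.argsort(proba[:, 1])[-p:].tolist())
        promoted.update(np.argsort(proba[:, 0])[-n:].tolist())
    # Indices into u_prime that the caller would move, with their
    # predicted labels, into the labeled pool before the next round.
    return sorted(promoted)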
@@ -58,14 +116,6 @@ def fit(self, samples, y):
 
         self.size_x1 = ceil(len(labeled[0]) / 2)
 
-        rng = np.random.default_rng()
-        try:
-            u_random_index = rng.choice(len(u), size=floor(self.u),
-                                        replace=False, shuffle=False)
-        except ValueError:
-            raise ValueError('The model was incorrectly parametrized, '
-                             'total between _p_ and _u_ is too big.')
-
         u_prime = u[u_random_index]
         u1, u2 = np.array_split(u_prime, 2, axis=1)

@@ -118,7 +168,38 @@ def fit(self, samples, y):
 
             u_prime = np.concatenate((u_prime, u[u_random_index]))
 
+    def _check_parameters(self, samples, y):
+        """
+        Check the parameters of the model and return the labeled samples,
+        the random number generator, the unlabeled samples, the random
+        index into the unlabeled samples, and the labels.
+
+        :param samples: the training data, labeled and unlabeled
+        :param y: the labels of the samples
+        :return: labeled, rng, u, u_random_index, y
+        """
+        try:
+            labeled, u, y = split(samples, y)
+        except IndexError:
+            raise ValueError('Dimensions do not match.')
+        rng = np.random.default_rng()
+        try:
+            u_random_index = rng.choice(len(u), size=floor(self.u),
+                                        replace=False, shuffle=False)
+        except ValueError:
+            raise ValueError('The model was incorrectly parametrized, '
+                             'total between _p_ and _u_ is too big.')
+        return labeled, rng, u, u_random_index, y
+
     def predict(self, samples):
+        """
+        If the predictions of the two classifiers agree, return that
+        prediction; if they disagree, return the prediction of the
+        classifier with the higher probability.
+
+        :param samples: the data to be predicted
+        :return: The labels of the samples.
+        """
         x1, x2 = np.array_split(samples, 2, axis=1)
         pred1, pred_proba1 = self.h1.predict(x1), self.h1.predict_proba(x1)
         pred2, pred_proba2 = self.h2.predict(x2), self.h2.predict_proba(x2)
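
A sketch of the disagreement rule the new predict docstring describes, assuming each proba row is aligned with the estimator's class ordering; resolve is a hypothetical helper, not part of the module:

import numpy as np

def resolve(pred1, proba1, pred2, proba2):
    # Keep the shared label when both views agree; otherwise take the
    # label of the view that is more confident about its own prediction.
    labels = np.empty_like(pred1)
    for i, (a, b) in enumerate(zip(pred1, pred2)):
        if a == b:
            labels[i] = a
        else:
            labels[i] = a if proba1[i].max() >= proba2[i].max() else b
    return labels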

semisupervised/DemocraticCoLearning.py

Lines changed: 66 additions & 8 deletions
@@ -3,7 +3,7 @@
 # @Filename: DemocraticCoLearning.py
 # @Author: Daniel Puente Ramírez
 # @Time: 29/12/21 15:39
-# @Version: 4.0
+# @Version: 5.0
 
 import copy
 from math import sqrt
@@ -18,11 +18,12 @@
 
 
 def check_bounds(wi):
-    """Check upper and lower bounds. The left minimum value can be 0, and the
-    right minimum value can be 1.
+    """
+    Clamp wi so that the lower bound is not less than 0 and the upper
+    bound is not greater than 1.
 
     :param wi: lower and upper mean confidence
-    :return: wi fixed
+    :return: the fixed wi.
     """
     if wi[0] < 0:
         wi[0] = 0
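
For intuition, a worked sketch of where these bounds come from and why clamping is needed. The module only shows const = 1.96 (95%) and the sqrt import; the normal-approximation interval below is an assumption about the formula, not taken from this diff:

from math import sqrt

def mean_confidence_interval(acc, n_samples, const=1.96):
    # Normal-approximation 95% interval around an observed accuracy.
    margin = const * sqrt(acc * (1 - acc) / n_samples)
    return [acc - margin, acc + margin]

wi = mean_confidence_interval(0.97, 50)   # [0.9227..., 1.0172...]
# The upper bound exceeds 1, so check_bounds(wi) clamps it back to
# [0.9227..., 1.0]; fit() then uses the mid-point sum(wi) / 2 as the
# classifier's confidence weight.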
@@ -32,16 +33,54 @@ def check_bounds(wi):
 
 
 class DemocraticCoLearning:
-    """Democratic Co-Learning Implementation. Based on:
-    Zhou, Y., & Goldman, S. (2004, November). Democratic co-learning.
-    In 16th IEEE International Conference on Tools with Artificial
-    Intelligence (pp. 594-602). IEEE.
+    """
+    Democratic Co-Learning implementation. Based on:
+    Zhou, Y., & Goldman, S. (2004, November). Democratic co-learning.
+    In 16th IEEE International Conference on Tools with Artificial
+    Intelligence (pp. 594-602). IEEE.
+
+    Parameters
+    ----------
+    random_state : int, default=None
+        The random seed used to initialize the classifiers.
+
+    c1 : base_estimator, default=MultinomialNB
+        The first classifier to be used.
+
+    c1_params : dict, default=None
+        Parameters for the first classifier.
+
+    c2 : base_estimator, default=KNeighborsClassifier
+        The second classifier to be used.
+
+    c2_params : dict, default=None
+        Parameters for the second classifier.
+
+    c3 : base_estimator, default=DecisionTreeClassifier
+        The third classifier to be used.
+
+    c3_params : dict, default=None
+        Parameters for the third classifier.
     """
 
     def __init__(self, random_state=None,
                  c1=None, c1_params=None,
                  c2=None, c2_params=None,
                  c3=None, c3_params=None):
+        """
+        Build the three classifiers from the given parameters. If a
+        classifier is not provided, its default is used: MultinomialNB,
+        KNeighborsClassifier, and DecisionTreeClassifier, respectively.
+
+        :param random_state: The random seed used to initialize the
+            classifiers
+        :param c1: The first classifier to be used
+        :param c1_params: Parameters for the first classifier
+        :param c2: The second classifier to be used
+        :param c2_params: Parameters for the second classifier
+        :param c3: The third classifier to be used
+        :param c3_params: Parameters for the third classifier
+        """
         self.const = 1.96  # 95%
         self.random_state = random_state if random_state is not None else \
             np.random.randint(low=0, high=10e5, size=1)[0]
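
As with CoTraining, a minimal construction sketch; the import path and the estimator-class-plus-params convention are assumptions for illustration:

from sklearn.svm import SVC

from semisupervised.DemocraticCoLearning import DemocraticCoLearning

# Defaults: MultinomialNB, KNeighborsClassifier, DecisionTreeClassifier.
dcl_default = DemocraticCoLearning(random_state=42)

# Swapping in a custom third learner (estimator class plus keyword
# dict, mirroring the c3_params argument).
dcl_custom = DemocraticCoLearning(random_state=42,
                                  c3=SVC, c3_params={'kernel': 'linear'})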
@@ -69,6 +108,17 @@ def __init__(self, random_state=None,
         self.h1, self.h2, self.h3 = configs
 
     def fit(self, samples, y):
+        """
+        Train the three classifiers on the labeled data, then use them to
+        predict the labels of the unlabeled data. Correctly predicted
+        samples are not added to the training set; incorrectly predicted
+        ones are. The process is repeated until the training set stops
+        changing.
+
+        :param samples: the training data
+        :param y: the labels of the samples
+        """
         try:
             labeled, u, y = split(samples, y)
         except IndexError:
@@ -291,6 +341,14 @@ def fit(self, samples, y):
         self.w3 = sum(check_bounds(w3)) / 2
 
     def predict(self, samples):
+        """
+        For each sample, collect the predictions of the three classifiers
+        and count how many times each label appears; the label that
+        appears the most is returned.
+
+        :param samples: the samples to be classified
+        :return: The labels of the samples.
+        """
         all_instances = samples
 
         gj = [0 for _ in range(self.n_labels)]
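
The counting this docstring describes can be sketched as below; majority_vote is a hypothetical helper showing plain unweighted voting, which is what the docstring states (the library's gj counter may additionally involve the confidence weights computed in fit):

import numpy as np

def majority_vote(preds, n_labels):
    # preds: one row of encoded label predictions per classifier,
    # shape (3, n_samples). For each sample, count the votes per label
    # (the gj counter in predict) and return the most voted label.
    preds = np.asarray(preds)
    labels = np.empty(preds.shape[1], dtype=preds.dtype)
    for i in range(preds.shape[1]):
        gj = np.bincount(preds[:, i], minlength=n_labels)
        labels[i] = gj.argmax()
    return labels

majority_vote([[0, 1, 2], [0, 1, 1], [1, 1, 2]], n_labels=3)
# -> array([0, 1, 2])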
