33# @Filename: TriTraining.py
44# @Author: Daniel Puente Ramírez
55# @Time: 27/12/21 10:25
6- # @Version: 4.0
6+ # @Version: 5.0
77
88from math import floor , ceil
99
@@ -81,6 +81,16 @@ def __init__(self, random_state=None,
8181 np .random .randint (low = 0 , high = 10e5 , size = 1 )[0 ]
8282
8383 def _subsample (self , l_t , s ):
84+ """
85+ > The function takes in a Bunch object, which is a dictionary-like
86+ object that contains the data and target arrays, and a sample size,
87+ and returns a Bunch object with the data and target arrays sub-sampled
88+ to the specified size
89+
90+ :param l_t: the labeled and unlabeled data
91+ :param s: the number of samples to be drawn from the dataset
92+ :return: A Bunch object with the data and target attributes.
93+ """
8494 np .random .seed (self .random_state )
8595 rng = np .random .default_rng ()
8696 data = np .array (l_t ['data' ])
@@ -91,6 +101,17 @@ def _subsample(self, l_t, s):
91101 return Bunch (data = samples , target = targets )
92102
93103 def fit (self , samples , y ):
104+ """
105+ The function takes in the training data and the labels, and then splits
106+ the data into three parts: labeled, unlabeled, and test. It then
107+ creates three classifiers, h_i, h_j, and h_k, and trains them on the
108+ labeled data. It then checks to see if the classifiers are accurate
109+ enough, and if they are, it returns them. If they are not, it trains
110+ them again on the labeled data, and then checks again
111+
112+ :param samples: The samples to train the classifier on
113+ :param y: the labels
114+ """
94115 try :
95116 labeled , u , y = split (samples , y )
96117 except IndexError :
@@ -121,114 +142,104 @@ def fit(self, samples, y):
121142 hash_j = h_j .__hash__ ()
122143 hash_k = h_k .__hash__ ()
123144
124- update_j = False
125- l_j = Bunch (data = [], target = [])
126- e_j = measure_error (h_j , h_k , labeled )
127-
128- if e_j < ep_j :
129- for sample in u :
130- sample_s = sample .reshape (1 , - 1 )
131- if h_j .predict (sample_s ) == h_k .predict (sample_s ):
132- pred = h_i .predict (sample_s )
133- prev_dat = list (l_j ['data' ])
134- prev_tar = list (l_j ['target' ])
135- prev_dat .append (sample )
136- l_j ['data' ] = np .array (prev_dat )
137- prev_tar .append (pred )
138- l_j ['target' ] = np .array (prev_tar )
139-
140- if lp_j == 0 :
141- lp_j = floor (e_j / (ep_j - e_j ) + 1 )
142-
143- if lp_j < len (l_j ['data' ]):
144- if e_j * len (l_j ['data' ]) < ep_j * lp_j :
145- update_j = True
146- elif lp_j > e_j / (ep_j - e_j ):
147- l_j = self ._subsample (l_j , ceil (((ep_j * lp_j ) / e_j )
148- - 1 ))
149- update_j = True
150-
151- update_k = False
152- l_k = Bunch (data = np .array ([]), target = np .array ([]))
153- e_k = measure_error (h_j , h_k , labeled )
154-
155- if e_k < ep_k :
156- for sample in u :
157- sample_s = sample .reshape (1 , - 1 )
158- if h_j .predict (sample_s ) == h_k .predict (sample_s ):
159- pred = h_i .predict (sample_s )
160- prev_dat = list (l_k ['data' ])
161- prev_tar = list (l_k ['target' ])
162- prev_dat .append (sample )
163- l_k ['data' ] = np .array (prev_dat )
164- prev_tar .append (pred )
165- l_k ['target' ] = np .array (prev_tar )
166-
167- if lp_k == 0 :
168- lp_k = floor (e_k / (ep_k - e_k ) + 1 )
169-
170- if lp_k < len (l_k ['data' ]):
171- if e_k * len (l_k ['data' ]) < ep_k * lp_k :
172- update_k = True
173- elif lp_k > e_k / (ep_k - e_k ):
174- l_k = self ._subsample (l_k , ceil (((ep_k * lp_k ) / e_k )
175- - 1 ))
176- update_k = True
177-
178- update_i = False
179- l_i = Bunch (data = np .array ([]), target = np .array ([]))
180- e_i = measure_error (h_j , h_k , labeled )
181-
182- if e_i < ep_i :
183- for sample in u :
184- sample_s = sample .reshape (1 , - 1 )
185- if h_j .predict (sample_s ) == h_k .predict (sample_s ):
186- pred = h_i .predict (sample_s )
187- prev_dat = list (l_i ['data' ])
188- prev_tar = list (l_i ['target' ])
189- prev_dat .append (sample )
190- l_i ['data' ] = np .array (prev_dat )
191- prev_tar .append (pred )
192- l_i ['target' ] = np .array (prev_tar )
193-
194- if lp_i == 0 :
195- lp_i = floor (e_i / (ep_i - e_i ) + 1 )
196-
197- if lp_i < len (l_i ['data' ]):
198- if e_i * len (l_i ['data' ]) < ep_i * lp_i :
199- update_i = True
200- elif lp_i > e_i / (ep_i - e_i ):
201- l_i = self ._subsample (l_i , ceil (((ep_i * lp_i ) / e_i )
202- - 1 ))
203- update_i = True
204-
205- if update_j :
206- train = np .concatenate ((labeled , l_j ['data' ]), axis = 0 )
207- test = np .concatenate ((y , np .ravel (l_j ['target' ])),
208- axis = 0 )
209- h_j = self .hj .fit (train , test )
210- ep_j = e_j
211- lp_j = len (l_j )
212- if update_k :
213- train = np .concatenate ((labeled , l_k ['data' ]), axis = 0 )
214- test = np .concatenate ((y , np .ravel (l_k ['target' ])),
215- axis = 0 )
216- h_k = self .hk .fit (train , test )
217- ep_k = e_k
218- lp_k = len (l_k )
219- if update_i :
220- train = np .concatenate ((labeled , l_i ['data' ]), axis = 0 )
221- test = np .concatenate ((y , np .ravel (l_i ['target' ])),
222- axis = 0 )
223- h_i = self .hi .fit (train , test )
224- ep_i = e_i
225- lp_i = len (l_i )
145+ e_j , l_j , update_j = self ._train_classifier (ep_j , h_i , h_j , h_k ,
146+ labeled , lp_j , u )
147+
148+ e_k , l_k , update_k = self ._train_classifier (ep_k , h_i , h_j , h_k ,
149+ labeled , lp_k , u )
150+
151+ e_i , l_i , update_i = self ._train_classifier (ep_i , h_i , h_j , h_k ,
152+ labeled , lp_i , u )
153+
154+ ep_j , h_j , lp_j = self ._check_for_update (e_j , ep_j , h_j , l_j ,
155+ labeled , lp_j , update_j , y )
156+ ep_k , h_k , lp_k = self ._check_for_update (e_k , ep_k , h_k , l_k ,
157+ labeled , lp_k , update_k ,
158+ y )
159+
160+ ep_i , h_i , lp_i = self ._check_for_update (e_i , ep_i , h_i , l_i ,
161+ labeled , lp_i , update_i , y )
226162
227163 if h_i .__hash__ () == hash_i and h_j .__hash__ () == hash_j and \
228164 h_k .__hash__ () == hash_k :
229165 break
230166
167+ def _check_for_update (self , e_j , ep_j , h_j , l_j , labeled , lp_j , update_j ,
168+ y ):
169+ """
170+ If the update_j flag is True, then we concatenate the labeled data with
171+ the new data, and fit the model to the new data
172+
173+ :param e_j: the error of the current hypothesis
174+ :param ep_j: the error of the previous iteration
175+ :param h_j: the classifier for the jth class
176+ :param l_j: the labeled data
177+ :param labeled: the labeled data
178+ :param lp_j: the number of labeled points in the current iteration
179+ :param update_j: boolean, whether to update the model or not
180+ :param y: the true labels of the data
181+ :return: the error, the hypothesis, and the length of the labeled data.
182+ """
183+ if update_j :
184+ train = np .concatenate ((labeled , l_j ['data' ]), axis = 0 )
185+ test = np .concatenate ((y , np .ravel (l_j ['target' ])),
186+ axis = 0 )
187+ h_j = self .hj .fit (train , test )
188+ ep_j = e_j
189+ lp_j = len (l_j )
190+ return ep_j , h_j , lp_j
191+
192+ def _train_classifier (self , ep_k , h_i , h_j , h_k , labeled , lp_k , u ):
193+ """
194+ If the error of the classifier is less than the error threshold, and the
195+ number of samples in the labeled set is less than the number of samples
196+ in the unlabeled set, then add the samples to the labeled set
197+
198+ :param ep_k: the error threshold for the classifier
199+ :param h_i: the classifier that is being trained
200+ :param h_j: the classifier that is being compared to h_k
201+ :param h_k: the classifier we're training
202+ :param labeled: the labeled data
203+ :param lp_k: the number of samples that have been labeled by h_k
204+ :param u: the unlabeled data
205+ :return: The error, the new labeled data, and a boolean indicating
206+ whether the classifier should be updated.
207+ """
208+ update_k = False
209+ l_k = Bunch (data = np .array ([]), target = np .array ([]))
210+ e_k = measure_error (h_j , h_k , labeled )
211+ if e_k < ep_k :
212+ for sample in u :
213+ sample_s = sample .reshape (1 , - 1 )
214+ if h_j .predict (sample_s ) == h_k .predict (sample_s ):
215+ pred = h_i .predict (sample_s )
216+ prev_dat = list (l_k ['data' ])
217+ prev_tar = list (l_k ['target' ])
218+ prev_dat .append (sample )
219+ l_k ['data' ] = np .array (prev_dat )
220+ prev_tar .append (pred )
221+ l_k ['target' ] = np .array (prev_tar )
222+
223+ if lp_k == 0 :
224+ lp_k = floor (e_k / (ep_k - e_k ) + 1 )
225+
226+ if lp_k < len (l_k ['data' ]):
227+ if e_k * len (l_k ['data' ]) < ep_k * lp_k :
228+ update_k = True
229+ elif lp_k > e_k / (ep_k - e_k ):
230+ l_k = self ._subsample (l_k , ceil (((ep_k * lp_k ) / e_k )
231+ - 1 ))
232+ update_k = True
233+ return e_k , l_k , update_k
234+
231235 def predict (self , samples ):
236+ """
237+ For each sample, we predict the label using each of the three
238+ classifiers, and then we take the majority vote of the three predictions
239+
240+ :param samples: the data to be classified
241+ :return: The labels of the samples.
242+ """
232243 labels = []
233244 pred1 = self .hi .predict (samples )
234245 pred2 = self .hj .predict (samples )
0 commit comments