# @Filename: CoTraining.py
# @Author: Daniel Puente Ramírez
# @Time: 22/12/21 09:27
-# @Version: 4.0
+# @Version: 5.0

from math import ceil, floor


class CoTraining:
18- """Blum, A., & Mitchell, T. (1998, July). Combining labeled and unlabeled
19- data with co-training. In Proceedings of the eleventh annual conference
20- on Computational learning theory (pp. 92-100).
18+ """
19+ Blum, A., & Mitchell, T. (1998, July). Combining labeled and unlabeled
20+ data with co-training. In Proceedings of the eleventh annual conference
21+ on Computational learning theory (pp. 92-100).
+
+    Parameters
+    ----------
+    p : int, default=1
+        The number of positive samples each classifier labels per iteration.
+
+    n : int, default=3
+        The number of negative samples each classifier labels per iteration.
+
+    k : int, default=30
+        The number of iterations used to train the classifiers.
+
+    u : int, default=75
+        The number of unlabeled samples drawn into the working pool.
+
+    random_state : int, default=None
+        The random seed used to generate the initial population.
+
+    c1 : base_estimator, default=GaussianNB
+        The first classifier to be used.
+
+    c1_params : dict, default=None
+        Parameters for the first classifier.
+
+    c2 : base_estimator, default=GaussianNB
+        The second classifier to be used.
+
+    c2_params : dict, default=None
+        Parameters for the second classifier.
    """

    def __init__(self, p=1, n=3, k=30, u=75, random_state=None,
                 c1=None, c1_params=None,
-                c2=None, c2_params=None,
-                ):
+                c2=None, c2_params=None,):
57+ """
58+ The function takes in the parameters for the two classifiers, and if the
59+ classifier is not None, it will use the parameters to create the
60+ classifier. If the classifier is None, it will use the default
61+ classifier, which is GaussianNB
62+
63+ :param p: The number of positive samples, defaults to 1
64+ (optional)
65+ :param n: The number of negative samples, defaults to 3
66+ (optional)
67+ :param k: The number of iterations to train the classifiers, defaults
68+ to 30 (optional)
69+ :param u: The number of unlabeled samples to use in the training set,
70+ defaults to 75 (optional)
71+ :param random_state: The random seed used to generate the initial
72+ population
73+ :param c1: The first classifier to be used
74+ :param c1_params: parameters for the first classifier
75+ :param c2: The second classifier to be used
76+ :param c2_params: The parameters for the second classifier
77+ """
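+        # The resolution just described happens further down in __init__
+        # (folded out of this diff). A sketch of the idea, assuming c1/c2
+        # are estimator classes and c1_params/c2_params are keyword dicts;
+        # the lines below are illustrative, not the elided code:
+        #
+        #     h1 = c1(**(c1_params or {})) if c1 is not None else GaussianNB()
+        #     h2 = c2(**(c2_params or {})) if c2 is not None else GaussianNB()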
        self.p = p
        self.n = n
        self.k = k
@@ -46,10 +97,17 @@ def __init__(self, p=1, n=3, k=30, u=75, random_state=None,
        self.h1, self.h2 = configs

    def fit(self, samples, y):
-        try:
-            labeled, u, y = split(samples, y)
-        except IndexError:
-            raise ValueError('Dimensions do not match.')
+        """
+        Train the two classifiers with co-training. The labeled samples
+        are used to fit both classifiers, each classifier then predicts
+        the unlabeled samples, the most confidently predicted samples are
+        added to the labeled set, and the process is repeated k times.
+
+        :param samples: the labeled and unlabeled samples
+        :param y: the labels of the samples
+        """
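+        # Shape of the body below (several steps sit in folded hunks):
+        # validate inputs, encode the labels, draw the working pool
+        # u_prime of size u, split it into the two feature views, iterate
+        # k times, and top u_prime back up after each round.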
+        labeled, rng, u, u_random_index, y = self._check_parameters(samples, y)

        le = LabelEncoder()
        le.fit(y)
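        # The encoder is fitted here so both classifiers see a consistent
        # integer encoding of the labels; the matching transform calls
        # presumably live in the folded part of the diff.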
@@ -58,14 +116,6 @@ def fit(self, samples, y):
        self.size_x1 = ceil(len(labeled[0]) / 2)

-        rng = np.random.default_rng()
-        try:
-            u_random_index = rng.choice(len(u), size=floor(self.u),
-                                        replace=False, shuffle=False)
-        except ValueError:
-            raise ValueError('The model was incorrectly parametrized, '
-                             'total between _p_ and _u_ is too big.')
        u_prime = u[u_random_index]
        u1, u2 = np.array_split(u_prime, 2, axis=1)
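        # Co-training's two "views" are obtained by simply halving the
        # feature vector: u1 holds the first ceil(n_features / 2) columns
        # (matching size_x1 above) and u2 the rest.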
@@ -118,7 +168,38 @@ def fit(self, samples, y):
        u_prime = np.concatenate((u_prime, u[u_random_index]))
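        # The per-iteration selection step sits in the fold above. Per the
        # cited Blum & Mitchell procedure, each classifier labels its p most
        # confident positive and n most confident negative samples from
        # u_prime; a sketch of that ranking (names are illustrative, not the
        # elided code):
        #
        #     proba = self.h1.predict_proba(u1)   # (n_samples, n_classes)
        #     conf = proba[:, positive_class]
        #     top_p = np.argsort(conf)[-self.p:]  # most confident positives
        #
        # Afterwards u_prime is topped back up from u, as the line above shows.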
+    def _check_parameters(self, samples, y):
+        """
+        Check the parameters of the model and return the labeled samples,
+        the random number generator, the unlabeled samples, the random
+        indices into the unlabeled samples, and the labels.
+
+        :param samples: the labeled and unlabeled samples
+        :param y: the target variable
+        :return: labeled, rng, u, u_random_index, y
+        """
+        try:
+            labeled, u, y = split(samples, y)
+        except IndexError:
+            raise ValueError('Dimensions do not match.')
+        rng = np.random.default_rng()
+        try:
+            u_random_index = rng.choice(len(u), size=floor(self.u),
+                                        replace=False, shuffle=False)
+        except ValueError:
+            raise ValueError('The model was incorrectly parametrized, '
+                             'total between _p_ and _u_ is too big.')
+        return labeled, rng, u, u_random_index, y
+
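+    # For example, with 100 unlabeled rows, u=75 draws 75 distinct indices,
+    # while u=150 cannot be drawn without replacement, so numpy raises the
+    # ValueError that is re-raised above.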
    def predict(self, samples):
+        """
+        If the predictions of the two classifiers agree, return that
+        prediction. If they disagree, return the prediction of the
+        classifier with the highest probability.
+
+        :param samples: the data to be predicted
+        :return: The labels of the samples.
+        """
        x1, x2 = np.array_split(samples, 2, axis=1)
        pred1, pred_proba1 = self.h1.predict(x1), self.h1.predict_proba(x1)
        pred2, pred_proba2 = self.h2.predict(x2), self.h2.predict_proba(x2)
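        # The docstring's disagreement rule, as a sketch of how the method
        # could continue (the rest of the body is cut off in this excerpt;
        # variable names below are illustrative):
        #
        #     labels = np.empty(len(samples), dtype=pred1.dtype)
        #     for i, (l1, l2) in enumerate(zip(pred1, pred2)):
        #         if l1 == l2:
        #             labels[i] = l1
        #         else:
        #             labels[i] = l1 if pred_proba1[i].max() >= \
        #                 pred_proba2[i].max() else l2
        #     return labels


# A minimal usage sketch, assuming scikit-learn-style arrays and that the
# split() helper (its import is in a folded region) flags unlabeled rows
# in y; the flag convention (e.g. -1) is an assumption, not shown here:
#
#     from sklearn.tree import DecisionTreeClassifier
#     ct = CoTraining(p=1, n=3, k=30, u=75,
#                     c1=DecisionTreeClassifier, c1_params={'max_depth': 3})
#     ct.fit(X, y)                 # y marks unlabeled rows, e.g. with -1
#     predictions = ct.predict(X_test)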