@@ -38,7 +38,6 @@ def expected_error_reduction(learner: ActiveLearner, X: modALinput,
 
     assert 0.0 <= p_subsample <= 1.0, 'p_subsample subsampling keep ratio must be between 0.0 and 1.0'
 
-    #expected_error = np.full(shape=(len(X), ), fill_value=-np.nan)
     expected_error = np.zeros(shape=(len(X), ))
     possible_labels = np.unique(learner.y_training)
 
@@ -68,3 +67,57 @@ def expected_error_reduction(learner: ActiveLearner, X: modALinput,
 
     return query_idx, X[query_idx]
 
+
+def expected_log_loss_reduction(learner: ActiveLearner, X: modALinput,
+                                p_subsample: float = 1.0, n_instances: int = 1) -> Tuple[np.ndarray, modALinput]:
+    """
+    Expected log loss reduction query strategy.
+
+    References:
+        Roy and McCallum, 2001 (http://groups.csail.mit.edu/rrg/papers/icml01.pdf)
+
+    Args:
+        learner: The ActiveLearner object for which the expected log loss is to be estimated.
+        X: The samples.
+        p_subsample: Probability of keeping a sample from the pool when calculating expected log loss.
+            Significantly improves runtime for large sample pools.
+        n_instances: The number of instances to be sampled.
+
+    Returns:
+        The indices of the instances from X chosen to be labelled; the instances from X chosen to be labelled.
+    """
+
+    assert 0.0 <= p_subsample <= 1.0, 'p_subsample subsampling keep ratio must be between 0.0 and 1.0'
+
+    expected_log_loss = np.zeros(shape=(len(X), ))
+    possible_labels = np.unique(learner.y_training)
+
+    try:
+        X_proba = learner.predict_proba(X)
+    except NotFittedError:
+        # TODO: implement a proper cold-start
+        return 0, X[0]
+
+    for x_idx, x in enumerate(X):
+        # subsample the data if needed
+        if np.random.rand() <= p_subsample:
+            # estimate the expected log loss: entropy of the refitted predictions,
+            # weighted by the current class probabilities of x
+            for y_idx, y in enumerate(possible_labels):
+                X_new = data_vstack((learner.X_training, x.reshape(1, -1)))
+                y_new = data_vstack((learner.y_training, np.array(y).reshape(1, )))
+
+                refitted_estimator = clone(learner.estimator).fit(X_new, y_new)
+                refitted_proba = refitted_estimator.predict_proba(X)
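+                # scipy's entropy works column-wise, so transpose to get one entropy value per pool sample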
+                entr = entropy(np.transpose(refitted_proba))
+
+                expected_log_loss[x_idx] += np.sum(entr)*X_proba[x_idx, y_idx]
+
+        else:
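+            # samples skipped by subsampling get infinite loss so they are never selected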
+            expected_log_loss[x_idx] = np.inf
+
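+    # the strategy minimizes the expected log loss, so negate before taking the argmax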
+    query_idx = multi_argmax(-expected_log_loss, n_instances)
+
+    return query_idx, X[query_idx]
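
For reference, a minimal usage sketch of the new strategy. The classifier, seed data, and pool below are illustrative assumptions, not part of this patch, and it assumes `expected_log_loss_reduction` is importable from `modAL.expected_error` alongside the existing `expected_error_reduction`:

```python
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from modAL.models import ActiveLearner
from modAL.expected_error import expected_log_loss_reduction

# toy labelled seed set and unlabelled pool (illustrative only)
X_seed = np.random.rand(10, 2)
y_seed = np.random.randint(0, 2, size=10)
X_pool = np.random.rand(100, 2)

learner = ActiveLearner(
    estimator=RandomForestClassifier(n_estimators=10),
    query_strategy=expected_log_loss_reduction,
    X_training=X_seed, y_training=y_seed
)

# keyword arguments are forwarded to the query strategy;
# p_subsample < 1 trades exactness for speed on large pools
query_idx, query_instance = learner.query(X_pool, p_subsample=0.5, n_instances=2)
```

Note that the strategy refits the estimator once per pool sample per candidate label, so the cost is on the order of `len(X_pool) * n_classes` model fits; `p_subsample` is the main lever for keeping that tractable.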