
Commit 3f6bbcb

add: expected log loss reduction
1 parent: bdd7415

2 files changed: +54 -2 lines changed


modAL/expected_error.py

Lines changed: 52 additions & 1 deletion
@@ -38,7 +38,6 @@ def expected_error_reduction(learner: ActiveLearner, X: modALinput,

     assert 0.0 <= p_subsample <= 1.0, 'p_subsample subsampling keep ratio must be between 0.0 and 1.0'

-    #expected_error = np.full(shape=(len(X), ), fill_value=-np.nan)
     expected_error = np.zeros(shape=(len(X), ))
     possible_labels = np.unique(learner.y_training)

@@ -68,3 +67,55 @@ def expected_error_reduction(learner: ActiveLearner, X: modALinput,

     return query_idx, X[query_idx]

+
+def expected_log_loss_reduction(learner: ActiveLearner, X: modALinput,
+                                p_subsample: float = 1.0, n_instances: int = 1) -> Tuple[np.ndarray, modALinput]:
+    """
+    Expected log loss reduction query strategy.
+
+    References:
+        Roy and McCallum, 2001 (http://groups.csail.mit.edu/rrg/papers/icml01.pdf)
+
+    Args:
+        learner: The ActiveLearner object for which the expected log loss is to be estimated.
+        X: The samples.
+        p_subsample: Probability of keeping a sample from the pool when calculating the expected log loss.
+            Significantly improves runtime for large sample pools.
+        n_instances: The number of instances to be sampled.
+
+    Returns:
+        The indices of the instances from X chosen to be labelled; the instances from X chosen to be labelled.
+    """
+    assert 0.0 <= p_subsample <= 1.0, 'p_subsample subsampling keep ratio must be between 0.0 and 1.0'
+
+    expected_log_loss = np.zeros(shape=(len(X), ))
+    possible_labels = np.unique(learner.y_training)
+
+    try:
+        X_proba = learner.predict_proba(X)
+    except NotFittedError:
+        # TODO: implement a proper cold-start
+        return 0, X[0]
+
+    for x_idx, x in enumerate(X):
+        # subsample the pool if needed
+        if np.random.rand() <= p_subsample:
+            # estimate the expected log loss of labelling x with each possible label
+            for y_idx, y in enumerate(possible_labels):
+                X_new = data_vstack((learner.X_training, x.reshape(1, -1)))
+                y_new = data_vstack((learner.y_training, np.array(y).reshape(1, )))
+
+                refitted_estimator = clone(learner.estimator).fit(X_new, y_new)
+                refitted_proba = refitted_estimator.predict_proba(X)
+                # per-sample entropy of the refitted predictions over the whole pool
+                entr = np.transpose(entropy(np.transpose(refitted_proba)))
+
+                expected_log_loss[x_idx] += np.sum(entr)*X_proba[x_idx, y_idx]
+        else:
+            # mark skipped samples so multi_argmax below never selects them
+            expected_log_loss[x_idx] = -np.inf
+
+    query_idx = multi_argmax(expected_log_loss, n_instances)
+
+    return query_idx, X[query_idx]
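
For reference, the quantity the added loop estimates is the expected total predictive entropy over the pool after hypothetically labelling a candidate x: for each possible label y, the estimator is refitted on the training data plus (x, y), the entropy of its predictions is summed over the whole pool, and the sum is weighted by the current model's estimate of P(y | x). In notation (ours, not the paper's exact symbols):

    E(x) = \sum_{y \in \mathcal{Y}} P_{\theta}(y \mid x) \sum_{x' \in X} H\!\left( P_{\theta^{+(x,y)}}(\,\cdot \mid x') \right),
    \qquad H(p) = -\sum_{i} p_i \log p_i

where \theta is the current model and \theta^{+(x,y)} is the model refitted with (x, y) added. Each kept candidate therefore costs one refit per possible label, which is the cost that the p_subsample knob amortizes.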

tests/core_tests.py

Lines changed: 2 additions & 1 deletion
@@ -386,13 +386,14 @@ def test_KL_max_disagreement(self):

 class TestEER(unittest.TestCase):
     def test_eer(self):
-        for n_pool, n_features, n_classes in product(range(1, 10), range(1, 5), range(2, 5)):
+        for n_pool, n_features, n_classes in product(range(5, 10), range(1, 5), range(2, 5)):
             X_training, y_training = np.random.rand(10, n_features), np.random.randint(0, n_classes, size=10)
             X_pool, y_pool = np.random.rand(n_pool, n_features), np.random.randint(0, n_classes+1, size=n_pool)

             learner = modAL.models.ActiveLearner(RandomForestClassifier(n_estimators=2),
                                                  X_training=X_training, y_training=y_training)

+            modAL.expected_error.expected_log_loss_reduction(learner, X_pool)
             modAL.expected_error.expected_error_reduction(learner, X_pool)
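For context, here is a minimal sketch of how the new strategy plugs into a learner, mirroring the test above. The data shapes and the p_subsample value are illustrative assumptions, not part of the commit; the sketch relies on ActiveLearner.query() forwarding extra keyword arguments to the query strategy, as modAL does.

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from modAL.models import ActiveLearner
from modAL.expected_error import expected_log_loss_reduction

# Illustrative toy data; the sizes are arbitrary.
X_training = np.random.rand(10, 2)
y_training = np.random.randint(0, 2, size=10)
X_pool = np.random.rand(100, 2)

# Plug the new function in as the learner's query strategy.
learner = ActiveLearner(estimator=RandomForestClassifier(n_estimators=2),
                        query_strategy=expected_log_loss_reduction,
                        X_training=X_training, y_training=y_training)

# Keep roughly 20% of the pool when estimating the loss: each kept candidate
# costs one refit per possible label, so subsampling bounds the runtime.
query_idx, query_inst = learner.query(X_pool, p_subsample=0.2, n_instances=1)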