
Commit 3f6bbcb

add: expected log loss reduction
1 parent: bdd7415

2 files changed: +54 -2 lines changed


modAL/expected_error.py

Lines changed: 52 additions & 1 deletion
@@ -38,7 +38,6 @@ def expected_error_reduction(learner: ActiveLearner, X: modALinput,

     assert 0.0 <= p_subsample <= 1.0, 'p_subsample subsampling keep ratio must be between 0.0 and 1.0'

-    #expected_error = np.full(shape=(len(X), ), fill_value=-np.nan)
     expected_error = np.zeros(shape=(len(X), ))
     possible_labels = np.unique(learner.y_training)

@@ -68,3 +67,55 @@ def expected_error_reduction(learner: ActiveLearner, X: modALinput,

     return query_idx, X[query_idx]

+
+def expected_log_loss_reduction(learner: ActiveLearner, X: modALinput,
+                                p_subsample: float = 1.0, n_instances: int = 1) -> Tuple[np.ndarray, modALinput]:
+    """
+    Expected log loss reduction query strategy.
+
+    References:
+        Roy and McCallum, 2001 (http://groups.csail.mit.edu/rrg/papers/icml01.pdf)
+
+    Args:
+        learner: The ActiveLearner object for which the expected log loss is to be estimated.
+        X: The samples.
+        p_subsample: Probability of keeping a sample from the pool when calculating the expected log loss.
+            Significantly improves runtime for large sample pools.
+        n_instances: The number of instances to be sampled.
+
+    Returns:
+        The indices of the instances from X chosen to be labelled; the instances from X chosen to be labelled.
+    """
+    assert 0.0 <= p_subsample <= 1.0, 'p_subsample subsampling keep ratio must be between 0.0 and 1.0'
+
+    expected_log_loss = np.zeros(shape=(len(X), ))
+    possible_labels = np.unique(learner.y_training)
+
+    try:
+        X_proba = learner.predict_proba(X)
+    except NotFittedError:
+        # TODO: implement a proper cold-start
+        return 0, X[0]
+
+    for x_idx, x in enumerate(X):
+        # subsample the pool if needed
+        if np.random.rand() <= p_subsample:
+            # estimate the expected log loss of labelling x with each possible label
+            for y_idx, y in enumerate(possible_labels):
+                X_new = data_vstack((learner.X_training, x.reshape(1, -1)))
+                y_new = data_vstack((learner.y_training, np.array(y).reshape(1, )))
+
+                refitted_estimator = clone(learner.estimator).fit(X_new, y_new)
+                refitted_proba = refitted_estimator.predict_proba(X)
+                # per-sample entropy of the refitted predictions over the whole pool
+                entr = np.transpose(entropy(np.transpose(refitted_proba)))
+
+                expected_log_loss[x_idx] += np.sum(entr)*X_proba[x_idx, y_idx]
+        else:
+            # mark skipped samples so multi_argmax below never selects them
+            expected_log_loss[x_idx] = -np.inf
+
+    query_idx = multi_argmax(expected_log_loss, n_instances)
+
+    return query_idx, X[query_idx]
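
For reference, the quantity the added loop estimates is the expected total predictive entropy over the pool after hypothetically labelling a candidate x: for each possible label y, the estimator is refitted on the training data plus (x, y), the entropy of its predictions is summed over the whole pool, and the sum is weighted by the current model's estimate of P(y | x). In notation (ours, not the paper's exact symbols):

    E(x) = \sum_{y \in \mathcal{Y}} P_{\theta}(y \mid x) \sum_{x' \in X} H\!\left( P_{\theta^{+(x,y)}}(\,\cdot \mid x') \right),
    \qquad H(p) = -\sum_{i} p_i \log p_i

where \theta is the current model and \theta^{+(x,y)} is the model refitted with (x, y) added. Each kept candidate therefore costs one refit per possible label, which is the cost that the p_subsample knob amortizes.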

tests/core_tests.py

Lines changed: 2 additions & 1 deletion
@@ -386,13 +386,14 @@ def test_KL_max_disagreement(self):

 class TestEER(unittest.TestCase):
     def test_eer(self):
-        for n_pool, n_features, n_classes in product(range(1, 10), range(1, 5), range(2, 5)):
+        for n_pool, n_features, n_classes in product(range(5, 10), range(1, 5), range(2, 5)):
             X_training, y_training = np.random.rand(10, n_features), np.random.randint(0, n_classes, size=10)
             X_pool, y_pool = np.random.rand(n_pool, n_features), np.random.randint(0, n_classes+1, size=n_pool)

             learner = modAL.models.ActiveLearner(RandomForestClassifier(n_estimators=2),
                                                  X_training=X_training, y_training=y_training)

+            modAL.expected_error.expected_log_loss_reduction(learner, X_pool)
             modAL.expected_error.expected_error_reduction(learner, X_pool)
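For context, here is a minimal sketch of how the new strategy plugs into a learner, mirroring the test above. The data shapes and the p_subsample value are illustrative assumptions, not part of the commit; the sketch relies on ActiveLearner.query() forwarding extra keyword arguments to the query strategy, as modAL does.

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from modAL.models import ActiveLearner
from modAL.expected_error import expected_log_loss_reduction

# Illustrative toy data; the sizes are arbitrary.
X_training = np.random.rand(10, 2)
y_training = np.random.randint(0, 2, size=10)
X_pool = np.random.rand(100, 2)

# Plug the new function in as the learner's query strategy.
learner = ActiveLearner(estimator=RandomForestClassifier(n_estimators=2),
                        query_strategy=expected_log_loss_reduction,
                        X_training=X_training, y_training=y_training)

# Keep roughly 20% of the pool when estimating the loss: each kept candidate
# costs one refit per possible label, so subsampling bounds the runtime.
query_idx, query_inst = learner.query(X_pool, p_subsample=0.2, n_instances=1)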