add: Mean Max Loss and Max Loss multilabel SVM query strategies from Li et al.

cosmic-cortex · cosmic-cortex · commit f2cf52efd79f · 2018-10-25T15:14:39.000+02:00
diff --git a/examples/multilabel_svm.py b/examples/multilabel_svm.py
@@ -24,7 +24,7 @@
     plt.scatter(X[y[:, 1] == 1, 0], X[y[:, 1] == 1, 1],
                 facecolors='none', edgecolors='r', s=100, linewidths=2, label='class 2')
     plt.legend()
-    #plt.show()
+    plt.show()
 
 learner = ActiveLearner(
     estimator=OneVsRestClassifier(SVC(probability=True)),
diff --git a/modAL/multilabel.py b/modAL/multilabel.py
@@ -5,6 +5,7 @@
 
 from modAL.utils.data import modALinput
 from typing import Tuple, Optional
+from itertools import combinations
 
 
 def _SVM_loss(multiclass_classifier: OneVsRestClassifier,
@@ -30,7 +31,7 @@ def _SVM_loss(multiclass_classifier: OneVsRestClassifier,
     if most_certain_classes is None:
         cls_mtx = 2*np.eye(n_classes, n_classes) - 1
         loss_mtx = np.maximum(1-np.dot(predictions, cls_mtx), 0)
-        return loss_mtx.mean(axis=0)
+        return loss_mtx.mean(axis=1)
     else:
         cls_mtx = -np.ones(shape=(len(X), n_classes))
         for inst_idx, most_certain_class in enumerate(most_certain_classes):
@@ -63,10 +64,80 @@ def SVM_binary_minimum(classifier: BaseEstimator,
 def max_loss(classifier: BaseEstimator,
              X_pool: modALinput,
              n_instances: int = 1) -> Tuple[np.ndarray, modALinput]:
-    pass
+
+    """
+    Max Loss query strategy for SVM multilabel classification.
+
+    For more details on this query strategy, see
+    Li et al., Multilabel SVM active learning for image classification
+    (http://dx.doi.org/10.1109/ICIP.2004.1421535)
+
+    Args:
+        classifier: The multilabel classifier for which the labels are to be queried. Should be an SVM model
+            such as the ones from sklearn.svm. Although the function will execute for other models as well,
+            the mathematical calculations in Li et al. work only for SVM-s.
+        X_pool: The pool of samples to query from.
+
+    Returns:
+        The index (or indices, when n_instances > 1) of the instance(s) from X_pool chosen to be labelled; the chosen instance(s) from X_pool.
+    """
+
+    most_certain_classes = classifier.predict_proba(X_pool).argmax(axis=1)
+    loss = _SVM_loss(classifier, X_pool, most_certain_classes=most_certain_classes)
+
+    assert len(X_pool) >= n_instances, 'n_instances cannot be larger than len(X_pool)'
+
+    if n_instances == 1:
+        query_idx = np.argmax(loss)
+        return query_idx, X_pool[query_idx]
+    else:
+        max_val = -np.inf
+        max_idx = None
+        for subset_idx in combinations(range(len(X_pool)), n_instances):
+            subset_sum = loss[list(subset_idx)].sum()
+            if subset_sum > max_val:
+                max_val = subset_sum
+                max_idx = subset_idx
+
+        query_idx = np.array(max_idx)
+        return query_idx, X_pool[query_idx]
 
 
 def mean_max_loss(classifier: BaseEstimator,
                   X_pool: modALinput,
                   n_instances: int = 1) -> Tuple[np.ndarray, modALinput]:
-    pass
+    """
+    Mean Max Loss query strategy for SVM multilabel classification.
+
+    For more details on this query strategy, see
+    Li et al., Multilabel SVM active learning for image classification
+    (http://dx.doi.org/10.1109/ICIP.2004.1421535)
+
+    Args:
+        classifier: The multilabel classifier for which the labels are to be queried. Should be an SVM model
+            such as the ones from sklearn.svm. Although the function will execute for other models as well,
+            the mathematical calculations in Li et al. work only for SVM-s.
+        X_pool: The pool of samples to query from.
+
+    Returns:
+        The index (or indices, when n_instances > 1) of the instance(s) from X_pool chosen to be labelled; the chosen instance(s) from X_pool.
+    """
+
+    loss = _SVM_loss(classifier, X_pool)
+
+    assert len(X_pool) >= n_instances, 'n_instances cannot be larger than len(X_pool)'
+
+    if n_instances == 1:
+        query_idx = np.argmax(loss)
+        return query_idx, X_pool[query_idx]
+    else:
+        max_val = -np.inf
+        max_idx = None
+        for subset_idx in combinations(range(len(X_pool)), n_instances):
+            subset_sum = loss[list(subset_idx)].sum()
+            if subset_sum > max_val:
+                max_val = subset_sum
+                max_idx = subset_idx
+
+        query_idx = np.array(max_idx)
+        return query_idx, X_pool[query_idx]
diff --git a/tests/core_tests.py b/tests/core_tests.py
@@ -928,17 +928,43 @@ def test_vote(self):
 
 class TestMultilabel(unittest.TestCase):
     def test_SVM_loss(self):
-        for n_classes in range(3, 10):
-            for n_instances in range(5, 10):
+        for n_classes in range(2, 10):
+            for n_instances in range(1, 10):
                 X_training = np.random.rand(n_instances, 5)
                 y_training = np.random.randint(0, 2, size=(n_instances, n_classes))
                 X_pool = np.random.rand(n_instances, 5)
                 y_pool = np.random.randint(0, 2, size=(n_instances, n_classes))
                 classifier = OneVsRestClassifier(SVC())
                 classifier.fit(X_training, y_training)
-                loss = modAL.multilabel._SVM_loss(classifier, X_pool)
-                loss = modAL.multilabel._SVM_loss(classifier, X_pool,
-                                           most_certain_classes=np.random.randint(0, n_classes, size=(n_instances)))
+                avg_loss = modAL.multilabel._SVM_loss(classifier, X_pool)
+                mcc_loss = modAL.multilabel._SVM_loss(classifier, X_pool,
+                                                      most_certain_classes=np.random.randint(0, n_classes, size=(n_instances)))
+                self.assertEqual(avg_loss.shape, (len(X_pool), ))
+                self.assertEqual(mcc_loss.shape, (len(X_pool),))
+
+    def test_mean_max_loss(self):
+        for n_classes in range(2, 10):
+            for n_pool_instances in range(1, 10):
+                for n_query_instances in range(1, min(n_pool_instances, 3)):
+                    X_training = np.random.rand(n_pool_instances, 5)
+                    y_training = np.random.randint(0, 2, size=(n_pool_instances, n_classes))
+                    X_pool = np.random.rand(n_pool_instances, 5)
+                    y_pool = np.random.randint(0, 2, size=(n_pool_instances, n_classes))
+                    classifier = OneVsRestClassifier(SVC())
+                    classifier.fit(X_training, y_training)
+                    query_idx, query_inst = modAL.multilabel.mean_max_loss(classifier, X_pool, n_query_instances)
+
+    def test_max_loss(self):
+        for n_classes in range(2, 10):
+            for n_pool_instances in range(1, 10):
+                for n_query_instances in range(1, min(n_pool_instances, 3)):
+                    X_training = np.random.rand(n_pool_instances, 5)
+                    y_training = np.random.randint(0, 2, size=(n_pool_instances, n_classes))
+                    X_pool = np.random.rand(n_pool_instances, 5)
+                    y_pool = np.random.randint(0, 2, size=(n_pool_instances, n_classes))
+                    classifier = OneVsRestClassifier(SVC(probability=True))
+                    classifier.fit(X_training, y_training)
+                    query_idx, query_inst = modAL.multilabel.max_loss(classifier, X_pool, n_query_instances)
 
 
 class TestExamples(unittest.TestCase):