batch uncertainty sampling fixed for higher-dimensional datasets

cosmic-cortex · cosmic-cortex · commit a8eca52816c3 · 2019-05-09T15:48:22.000+02:00
diff --git a/modAL/batch.py b/modAL/batch.py
@@ -79,9 +79,11 @@ def select_instance(
         Index of the best index from X chosen to be labelled; a single record from our unlabeled set that is considered
         the most optimal incremental record for including in our query set.
     """
+    X_pool_masked = X_pool[mask]
+
     # Extract the number of labeled and unlabeled records.
-    n_labeled_records, _ = X_training.shape
-    n_unlabeled, _ = X_pool[mask].shape
+    n_labeled_records, *rest = X_training.shape
+    n_unlabeled, *rest = X_pool_masked.shape
 
     # Determine our alpha parameter as |U| / (|U| + |D|). Note that because we
     # append to X_training and remove from X_pool within `ranked_batch`,
@@ -90,10 +92,15 @@ def select_instance(
 
     # Compute pairwise distance (and then similarity) scores from every unlabeled record
     # to every record in X_training. The result is an array of shape (n_samples, ).
+
     if n_jobs == 1 or n_jobs is None:
-        _, distance_scores = pairwise_distances_argmin_min(X_pool[mask], X_training, metric=metric)
+        _, distance_scores = pairwise_distances_argmin_min(X_pool_masked.reshape(n_unlabeled, -1),
+                                                           X_training.reshape(n_labeled_records, -1),
+                                                           metric=metric)
     else:
-        distance_scores = pairwise_distances(X_pool[mask], X_training, metric=metric, n_jobs=n_jobs).min(axis=1)
+        distance_scores = pairwise_distances(X_pool_masked.reshape(n_unlabeled, -1),
+                                             X_training.reshape(n_labeled_records, -1),
+                                             metric=metric, n_jobs=n_jobs).min(axis=1)
 
     similarity_scores = 1 / (1 + distance_scores)
 
@@ -103,11 +110,11 @@ def select_instance(
 
     # Isolate and return our best instance for labeling as the one with the largest score.
     best_instance_index_in_unlabeled = np.argmax(scores)
-    n_pool, _ = X_pool.shape
+    n_pool, *rest = X_pool.shape
     unlabeled_indices = [i for i in range(n_pool) if mask[i]]
     best_instance_index = unlabeled_indices[best_instance_index_in_unlabeled]
     mask[best_instance_index] = 0
-    return best_instance_index, X_pool[best_instance_index].reshape(1, -1), mask
+    return best_instance_index, np.expand_dims(X_pool[best_instance_index], axis=0), mask
 
 
 def ranked_batch(classifier: Union[BaseLearner, BaseCommittee],
diff --git a/tests/core_tests.py b/tests/core_tests.py
@@ -1071,6 +1071,7 @@ def test_strategies(self):
 class TestExamples(unittest.TestCase):
 
     def test_examples(self):
+        import example_tests.multidimensional_data
         import example_tests.active_regression
         import example_tests.bagging
         import example_tests.ensemble
diff --git a/tests/example_tests/multidimensional_data.py b/tests/example_tests/multidimensional_data.py
@@ -0,0 +1,38 @@
+import numpy as np
+from modAL.models import ActiveLearner
+from modAL.uncertainty import margin_sampling, entropy_sampling
+from modAL.batch import uncertainty_batch_sampling
+from modAL.expected_error import expected_error_reduction
+
+
+class MockClassifier:  # minimal stand-in estimator; only len(X) is ever read from the input
+    def __init__(self, n_classes=2):
+        self.n_classes = n_classes  # number of classes used by predict and predict_proba
+
+    def fit(self, X, y):  # no-op: nothing is learned from (X, y); returns the instance itself
+        return self
+
+    def predict(self, X):  # random integer labels in [0, n_classes), shape (len(X), 1)
+        return np.random.randint(0, self.n_classes, size=(len(X), 1))  # randint takes `size`; `shape=` raises TypeError
+
+    def predict_proba(self, X):  # uniform class probabilities, shape (len(X), n_classes)
+        return np.ones(shape=(len(X), self.n_classes)) / self.n_classes
+
+
+if __name__ == '__main__':  # body runs only when executed as a script, not when the module is imported
+    X_train = np.random.rand(10, 5, 5)  # 10 training samples, each a 5x5 array (3-D input overall)
+    y_train = np.random.rand(10, 1)
+    X_pool = np.random.rand(10, 5, 5)  # unlabeled pool with the same per-sample shape as X_train
+    y_pool = np.random.rand(10, 1)
+
+    strategies = [margin_sampling, entropy_sampling, uncertainty_batch_sampling]
+
+    for query_strategy in strategies:
+        print("testing %s..." % query_strategy.__name__)
+        # build a fresh learner around the mock estimator for the current query strategy
+        learner = ActiveLearner(
+            estimator=MockClassifier(), query_strategy=query_strategy,
+            X_training=X_train, y_training=y_train
+        )
+        learner.query(X_pool)  # result is discarded; the call only has to succeed on the 3-D pool
+        learner.teach(X_pool, y_pool)  # teaches the learner with the entire pool and its labels