#20, #104 - added tests for on_transformed functionality and pandas support; small fixes

Boyan Hristov · Boyan Hristov · commit 171e2e956d51 · 2020-09-28T15:48:32.000+02:00
diff --git a/modAL/models/base.py b/modAL/models/base.py
@@ -300,6 +300,8 @@ def __init__(self, learner_list: List[BaseLearner], query_strategy: Callable, on
         self.learner_list = learner_list
         self.query_strategy = query_strategy
         self.on_transformed = on_transformed
+        # TODO: update training data when using fit() and teach() methods
+        self.X_training = None
 
     def __iter__(self) -> Iterator[BaseLearner]:
         for learner in self.learner_list:
diff --git a/modAL/models/learners.py b/modAL/models/learners.py
@@ -7,7 +7,7 @@
 
 from modAL.models.base import BaseLearner, BaseCommittee
 from modAL.utils.validation import check_class_labels, check_class_proba
-from modAL.utils.data import modALinput
+from modAL.utils.data import modALinput, retrieve_rows
 from modAL.uncertainty import uncertainty_sampling
 from modAL.disagreement import vote_entropy_sampling, max_std_sampling
 from modAL.acquisition import max_EI
@@ -187,7 +187,7 @@ def __init__(self,
         # setting the maximum value
         if self.y_training is not None:
             max_idx = np.argmax(self.y_training)
-            self.X_max = self.X_training[max_idx]
+            self.X_max = retrieve_rows(self.X_training, max_idx)
             self.y_max = self.y_training[max_idx]
         else:
             self.X_max = None
@@ -198,7 +198,7 @@ def _set_max(self, X: modALinput, y: modALinput) -> None:
         y_max = y[max_idx]
         if y_max > self.y_max:
             self.y_max = y_max
-            self.X_max = X[max_idx]
+            self.X_max = retrieve_rows(X, max_idx)
 
     def get_max(self) -> Tuple:
         """
@@ -248,6 +248,8 @@ class Committee(BaseCommittee):
         learner_list: A list of ActiveLearners forming the Committee.
         query_strategy: Query strategy function. Committee supports disagreement-based query strategies from
             :mod:`modAL.disagreement`, but uncertainty-based ones from :mod:`modAL.uncertainty` are also supported.
+        on_transformed: Whether to transform samples with the pipeline defined by each learner's estimator
+            when applying the query strategy.
 
     Attributes:
         classes_: Class labels known by the Committee.
@@ -288,8 +290,9 @@ class Committee(BaseCommittee):
         ...     y=iris['target'][query_idx].reshape(1, )
         ... )
     """
-    def __init__(self, learner_list: List[ActiveLearner], query_strategy: Callable = vote_entropy_sampling) -> None:
-        super().__init__(learner_list, query_strategy)
+    def __init__(self, learner_list: List[ActiveLearner], query_strategy: Callable = vote_entropy_sampling,
+                 on_transformed: bool = False) -> None:
+        super().__init__(learner_list, query_strategy, on_transformed)
         self._set_classes()
 
     def _set_classes(self):
@@ -456,6 +459,8 @@ class CommitteeRegressor(BaseCommittee):
     Args:
         learner_list: A list of ActiveLearners forming the CommitteeRegressor.
         query_strategy: Query strategy function.
+        on_transformed: Whether to transform samples with the pipeline defined by each learner's estimator
+            when applying the query strategy.
 
     Examples:
 
@@ -499,8 +504,9 @@ class CommitteeRegressor(BaseCommittee):
         ...     query_idx, query_instance = committee.query(X.reshape(-1, 1))
         ...     committee.teach(X[query_idx].reshape(-1, 1), y[query_idx].reshape(-1, 1))
     """
-    def __init__(self, learner_list: List[ActiveLearner], query_strategy: Callable = max_std_sampling) -> None:
-        super().__init__(learner_list, query_strategy)
+    def __init__(self, learner_list: List[ActiveLearner], query_strategy: Callable = max_std_sampling,
+                 on_transformed: bool = False) -> None:
+        super().__init__(learner_list, query_strategy, on_transformed)
 
     def predict(self, X: modALinput, return_std: bool = False, **predict_kwargs) -> Any:
         """
diff --git a/tests/core_tests.py b/tests/core_tests.py
@@ -1,6 +1,7 @@
 import random
 import unittest
 import numpy as np
+import pandas as pd
 
 import mock
 import modAL.models.base
@@ -26,6 +27,8 @@
 from sklearn.metrics import confusion_matrix
 from sklearn.svm import SVC
 from sklearn.multiclass import OneVsRestClassifier
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import FunctionTransformer
 from scipy.stats import entropy, norm
 from scipy.special import ndtr
 from scipy import sparse as sp
@@ -788,6 +791,68 @@ def test_sparse_matrices(self):
             query_idx, query_inst = learner.query(X_pool)
             learner.teach(X_pool[query_idx], y_pool[query_idx])
 
+    def test_on_transformed(self):
+        n_samples = 10
+        n_features = 5
+        query_strategies = [
+            modAL.batch.uncertainty_batch_sampling
+            # add further strategies which work with instance representations
+            # no further ones as of 25.09.2020
+        ]
+        X_pool = np.random.rand(n_samples, n_features)
+
+        # use a pandas DataFrame as X_pool; the sklearn pipeline converts it back to a numpy array
+        X_pool = pd.DataFrame(X_pool)
+
+        y_pool = np.random.randint(0, 2, size=(n_samples,))
+        train_idx = np.random.choice(range(n_samples), size=2, replace=False)
+
+        for query_strategy in query_strategies:
+            learner = modAL.models.learners.ActiveLearner(
+                estimator=make_pipeline(
+                    FunctionTransformer(func=pd.DataFrame.to_numpy),
+                    RandomForestClassifier(n_estimators=10)
+                ),
+                query_strategy=query_strategy,
+                X_training=X_pool.iloc[train_idx],
+                y_training=y_pool[train_idx],
+                on_transformed=True
+            )
+            query_idx, query_inst = learner.query(X_pool)
+            learner.teach(X_pool.iloc[query_idx], y_pool[query_idx])
+
+    def test_old_query_strategy_interface(self):
+        n_samples = 10
+        n_features = 5
+        X_pool = np.random.rand(n_samples, n_features)
+        y_pool = np.random.randint(0, 2, size=(n_samples,))
+
+        # Define a custom query strategy that also returns the selected instance.
+        # Make sure that even if a query strategy behaves in an unusual way
+        # (e.g. the returned instance does not match the returned index),
+        # the old interface still hands back its output unchanged.
+        query_idx_ = np.random.choice(n_samples, 2)
+        query_instance_ = X_pool[(query_idx_ + 1) % len(X_pool)]
+
+        def custom_query_strategy(classifier, X):
+            return query_idx_, query_instance_
+
+
+        train_idx = np.random.choice(range(n_samples), size=2, replace=False)
+        custom_query_learner = modAL.models.learners.ActiveLearner(
+            estimator=RandomForestClassifier(n_estimators=10),
+            query_strategy=custom_query_strategy,
+            X_training=X_pool[train_idx], y_training=y_pool[train_idx]
+        )
+
+        query_idx, query_instance = custom_query_learner.query(X_pool)
+        custom_query_learner.teach(
+            X=X_pool[query_idx],
+            y=y_pool[query_idx]
+        )
+        np.testing.assert_equal(query_idx, query_idx_)
+        np.testing.assert_equal(query_instance, query_instance_)
+
 
 class TestBayesianOptimizer(unittest.TestCase):
     def test_set_max(self):
@@ -897,6 +962,39 @@ def test_teach(self):
                     )
                     learner.teach(X, y, bootstrap=bootstrap, only_new=only_new)
 
+    def test_on_transformed(self):
+        n_samples = 10
+        n_features = 5
+        query_strategies = [
+            # TODO: remove; added only to make sure on_transformed doesn't break anything.
+            # on_transformed has no influence on this mocked strategy, so nothing specific is tested here.
+            mock.MockFunction(return_val=[np.random.randint(0, n_samples)])
+
+            # add further strategies which work with instance representations
+            # no further ones as of 25.09.2020
+        ]
+        X_pool = np.random.rand(n_samples, n_features)
+
+        # use a pandas DataFrame as X_pool; the sklearn pipeline converts it back to a numpy array
+        X_pool = pd.DataFrame(X_pool)
+
+        y_pool = np.random.rand(n_samples)
+        train_idx = np.random.choice(range(n_samples), size=2, replace=False)
+
+        for query_strategy in query_strategies:
+            learner = modAL.models.learners.BayesianOptimizer(
+                estimator=make_pipeline(
+                    FunctionTransformer(func=pd.DataFrame.to_numpy),
+                    GaussianProcessRegressor()
+                ),
+                query_strategy=query_strategy,
+                X_training=X_pool.iloc[train_idx],
+                y_training=y_pool[train_idx],
+                on_transformed=True
+            )
+            query_idx, query_inst = learner.query(X_pool)
+            learner.teach(X_pool.iloc[query_idx], y_pool[query_idx])
+
 
 class TestCommittee(unittest.TestCase):
 
@@ -1007,6 +1105,42 @@ def test_teach(self):
 
                 committee.teach(X, y, bootstrap=bootstrap, only_new=only_new)
 
+    def test_on_transformed(self):
+        n_samples = 10
+        n_features = 5
+        query_strategies = [
+            modAL.batch.uncertainty_batch_sampling
+            # add further strategies which work with instance representations
+            # no further ones as of 25.09.2020
+        ]
+        X_pool = np.random.rand(n_samples, n_features)
+
+        # use a pandas DataFrame as X_pool; the sklearn pipeline converts it back to a numpy array
+        X_pool = pd.DataFrame(X_pool)
+
+        y_pool = np.random.randint(0, 2, size=(n_samples,))
+        train_idx = np.random.choice(range(n_samples), size=5, replace=False)
+
+        learner_list = [modAL.models.learners.ActiveLearner(
+            estimator=make_pipeline(
+                FunctionTransformer(func=pd.DataFrame.to_numpy),
+                RandomForestClassifier(n_estimators=10)
+            ),
+            # committee learners may be trained on different numbers of
+            # (partly different) instances
+            X_training=X_pool.iloc[train_idx[(np.arange(i + 1) + i) % len(train_idx)]],
+            y_training=y_pool[train_idx[(np.arange(i + 1) + i) % len(train_idx)]],
+        ) for i in range(3)]
+
+        for query_strategy in query_strategies:
+            committee = modAL.models.learners.Committee(
+                learner_list=learner_list,
+                query_strategy=query_strategy,
+                on_transformed=True
+            )
+            query_idx, query_inst = committee.query(X_pool)
+            committee.teach(X_pool.iloc[query_idx], y_pool[query_idx])
+
 
 class TestCommitteeRegressor(unittest.TestCase):
 
@@ -1040,6 +1174,45 @@ def test_vote(self):
                     vote_output
                 )
 
+    def test_on_transformed(self):
+        n_samples = 10
+        n_features = 5
+        query_strategies = [
+            # TODO: remove; added only to make sure on_transformed doesn't break anything.
+            # on_transformed has no influence on this mocked strategy, so nothing specific is tested here.
+            mock.MockFunction(return_val=[np.random.randint(0, n_samples)])
+
+            # add further strategies which work with instance representations
+            # no further ones as of 25.09.2020
+        ]
+        X_pool = np.random.rand(n_samples, n_features)
+
+        # use a pandas DataFrame as X_pool; the sklearn pipeline converts it back to a numpy array
+        X_pool = pd.DataFrame(X_pool)
+
+        y_pool = np.random.rand(n_samples)
+        train_idx = np.random.choice(range(n_samples), size=2, replace=False)
+
+        learner_list = [modAL.models.learners.ActiveLearner(
+            estimator=make_pipeline(
+                FunctionTransformer(func=pd.DataFrame.to_numpy),
+                GaussianProcessRegressor()
+            ),
+            # committee learners may be trained on different numbers of
+            # (partly different) instances
+            X_training=X_pool.iloc[train_idx[(np.arange(i + 1) + i) % len(train_idx)]],
+            y_training=y_pool[train_idx[(np.arange(i + 1) + i) % len(train_idx)]],
+        ) for i in range(3)]
+
+        for query_strategy in query_strategies:
+            committee = modAL.models.learners.CommitteeRegressor(
+                learner_list=learner_list,
+                query_strategy=query_strategy,
+                on_transformed=True
+            )
+            query_idx, query_inst = committee.query(X_pool)
+            committee.teach(X_pool.iloc[query_idx], y_pool[query_idx])
+
 
 class TestMultilabel(unittest.TestCase):
     def test_SVM_loss(self):