@@ -1,6 +1,7 @@
 import random
 import unittest
 import numpy as np
+import pandas as pd

 import mock
 import modAL.models.base
@@ -26,6 +27,8 @@
 from sklearn.metrics import confusion_matrix
 from sklearn.svm import SVC
 from sklearn.multiclass import OneVsRestClassifier
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import FunctionTransformer
 from scipy.stats import entropy, norm
 from scipy.special import ndtr
 from scipy import sparse as sp
@@ -788,6 +791,68 @@ def test_sparse_matrices(self):
             query_idx, query_inst = learner.query(X_pool)
             learner.teach(X_pool[query_idx], y_pool[query_idx])

+    def test_on_transformed(self):
+        n_samples = 10
+        n_features = 5
+        query_strategies = [
+            modAL.batch.uncertainty_batch_sampling
+            # add further strategies that work on instance representations here
+            # (no further ones exist as of 25.09.2020)
+        ]
+        X_pool = np.random.rand(n_samples, n_features)
+
+        # use a pandas DataFrame as X_pool; the sklearn pipeline transforms it back to a numpy array
+        X_pool = pd.DataFrame(X_pool)
+
+        y_pool = np.random.randint(0, 2, size=(n_samples,))
+        train_idx = np.random.choice(range(n_samples), size=2, replace=False)
+
+        for query_strategy in query_strategies:
+            learner = modAL.models.learners.ActiveLearner(
+                estimator=make_pipeline(
+                    FunctionTransformer(func=pd.DataFrame.to_numpy),
+                    RandomForestClassifier(n_estimators=10)
+                ),
+                query_strategy=query_strategy,
+                X_training=X_pool.iloc[train_idx],
+                y_training=y_pool[train_idx],
+                on_transformed=True
+            )
+            query_idx, query_inst = learner.query(X_pool)
+            learner.teach(X_pool.iloc[query_idx], y_pool[query_idx])
+
+    def test_old_query_strategy_interface(self):
+        n_samples = 10
+        n_features = 5
+        X_pool = np.random.rand(n_samples, n_features)
+        y_pool = np.random.randint(0, 2, size=(n_samples,))
+
+        # define a custom query strategy that also returns the selected instances;
+        # this makes sure that even if a query strategy works in some funny way
+        # (e.g. the returned instance not matching the returned index),
+        # the old interface remains unchanged
+        query_idx_ = np.random.choice(n_samples, 2)
+        query_instance_ = X_pool[(query_idx_ + 1) % len(X_pool)]
+
+        def custom_query_strategy(classifier, X):
+            return query_idx_, query_instance_
+
+
+        train_idx = np.random.choice(range(n_samples), size=2, replace=False)
+        custom_query_learner = modAL.models.learners.ActiveLearner(
+            estimator=RandomForestClassifier(n_estimators=10),
+            query_strategy=custom_query_strategy,
+            X_training=X_pool[train_idx], y_training=y_pool[train_idx]
+        )
+
+        query_idx, query_instance = custom_query_learner.query(X_pool)
+        custom_query_learner.teach(
+            X=X_pool[query_idx],
+            y=y_pool[query_idx]
+        )
+        np.testing.assert_equal(query_idx, query_idx_)
+        np.testing.assert_equal(query_instance, query_instance_)
+

 class TestBayesianOptimizer(unittest.TestCase):
     def test_set_max(self):
@@ -897,6 +962,39 @@ def test_teach(self):
             )
             learner.teach(X, y, bootstrap=bootstrap, only_new=only_new)

+    def test_on_transformed(self):
+        n_samples = 10
+        n_features = 5
+        query_strategies = [
+            # TODO remove: added only to make sure on_transformed doesn't break anything;
+            # it has no influence on this strategy, so nothing specific is tested here
+            mock.MockFunction(return_val=[np.random.randint(0, n_samples)])
+
+            # add further strategies that work on instance representations here
+            # (no further ones exist as of 25.09.2020)
+        ]
+        X_pool = np.random.rand(n_samples, n_features)
+
+        # use a pandas DataFrame as X_pool; the sklearn pipeline transforms it back to a numpy array
+        X_pool = pd.DataFrame(X_pool)
+
+        y_pool = np.random.rand(n_samples)
+        train_idx = np.random.choice(range(n_samples), size=2, replace=False)
+
+        for query_strategy in query_strategies:
+            learner = modAL.models.learners.BayesianOptimizer(
+                estimator=make_pipeline(
+                    FunctionTransformer(func=pd.DataFrame.to_numpy),
+                    GaussianProcessRegressor()
+                ),
+                query_strategy=query_strategy,
+                X_training=X_pool.iloc[train_idx],
+                y_training=y_pool[train_idx],
+                on_transformed=True
+            )
+            query_idx, query_inst = learner.query(X_pool)
+            learner.teach(X_pool.iloc[query_idx], y_pool[query_idx])
+

 class TestCommittee(unittest.TestCase):

@@ -1007,6 +1105,42 @@ def test_teach(self):

             committee.teach(X, y, bootstrap=bootstrap, only_new=only_new)

+    def test_on_transformed(self):
+        n_samples = 10
+        n_features = 5
+        query_strategies = [
+            modAL.batch.uncertainty_batch_sampling
+            # add further strategies that work on instance representations here
+            # (no further ones exist as of 25.09.2020)
+        ]
+        X_pool = np.random.rand(n_samples, n_features)
+
+        # use a pandas DataFrame as X_pool; the sklearn pipeline transforms it back to a numpy array
+        X_pool = pd.DataFrame(X_pool)
+
+        y_pool = np.random.randint(0, 2, size=(n_samples,))
+        train_idx = np.random.choice(range(n_samples), size=5, replace=False)
+
+        learner_list = [modAL.models.learners.ActiveLearner(
+            estimator=make_pipeline(
+                FunctionTransformer(func=pd.DataFrame.to_numpy),
+                RandomForestClassifier(n_estimators=10)
+            ),
+            # committee members can be trained on different numbers of
+            # different instances
+            X_training=X_pool.iloc[train_idx[(np.arange(i + 1) + i) % len(train_idx)]],
+            y_training=y_pool[train_idx[(np.arange(i + 1) + i) % len(train_idx)]],
+        ) for i in range(3)]
+
+        for query_strategy in query_strategies:
+            committee = modAL.models.learners.Committee(
+                learner_list=learner_list,
+                query_strategy=query_strategy,
+                on_transformed=True
+            )
+            query_idx, query_inst = committee.query(X_pool)
+            committee.teach(X_pool.iloc[query_idx], y_pool[query_idx])
+

 class TestCommitteeRegressor(unittest.TestCase):

@@ -1040,6 +1174,45 @@ def test_vote(self):
                 vote_output
             )

+    def test_on_transformed(self):
+        n_samples = 10
+        n_features = 5
+        query_strategies = [
+            # TODO remove: added only to make sure on_transformed doesn't break anything;
+            # it has no influence on this strategy, so nothing specific is tested here
+            mock.MockFunction(return_val=[np.random.randint(0, n_samples)])
+
+            # add further strategies that work on instance representations here
+            # (no further ones exist as of 25.09.2020)
+        ]
+        X_pool = np.random.rand(n_samples, n_features)
+
+        # use a pandas DataFrame as X_pool; the sklearn pipeline transforms it back to a numpy array
+        X_pool = pd.DataFrame(X_pool)
+
+        y_pool = np.random.rand(n_samples)
+        train_idx = np.random.choice(range(n_samples), size=2, replace=False)
+
+        learner_list = [modAL.models.learners.ActiveLearner(
+            estimator=make_pipeline(
+                FunctionTransformer(func=pd.DataFrame.to_numpy),
+                GaussianProcessRegressor()
+            ),
+            # committee members can be trained on different numbers of
+            # different instances
+            X_training=X_pool.iloc[train_idx[(np.arange(i + 1) + i) % len(train_idx)]],
+            y_training=y_pool[train_idx[(np.arange(i + 1) + i) % len(train_idx)]],
+        ) for i in range(3)]
+
+        for query_strategy in query_strategies:
+            committee = modAL.models.learners.CommitteeRegressor(
+                learner_list=learner_list,
+                query_strategy=query_strategy,
+                on_transformed=True
+            )
+            query_idx, query_inst = committee.query(X_pool)
+            committee.teach(X_pool.iloc[query_idx], y_pool[query_idx])
+

 class TestMultilabel(unittest.TestCase):
     def test_SVM_loss(self):
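
# --- Editor's note, not part of the diff above: a minimal usage sketch of the
# on_transformed feature exercised by the new test_on_transformed tests. The reading
# that on_transformed=True makes the query strategy operate on the pipeline-transformed
# representation of the pool (rather than on the raw DataFrame) is an assumption
# inferred from the tests above, not a statement of the documented API.
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from modAL.models import ActiveLearner
from modAL.batch import uncertainty_batch_sampling

X_pool = pd.DataFrame(np.random.rand(10, 5))   # raw pool kept as a DataFrame
y_pool = np.random.randint(0, 2, size=10)

learner = ActiveLearner(
    estimator=make_pipeline(
        FunctionTransformer(func=pd.DataFrame.to_numpy),  # DataFrame -> ndarray
        RandomForestClassifier(n_estimators=10)
    ),
    query_strategy=uncertainty_batch_sampling,
    X_training=X_pool.iloc[:3],
    y_training=y_pool[:3],
    on_transformed=True  # query strategy sees the transformed (numpy) representation
)
query_idx, query_inst = learner.query(X_pool, n_instances=3)
learner.teach(X_pool.iloc[query_idx], y_pool[query_idx])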
|
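
# --- Editor's note, not part of the diff above: a sketch of the two query-strategy
# conventions that test_old_query_strategy_interface guards. The split below (a
# current-style strategy returns only the selected indices, while an old-style
# strategy returns an (indices, instances) tuple that learner.query() passes through
# unchanged) is inferred from the tests above; treat it as an assumption rather than
# documented behaviour.
import numpy as np

def index_only_strategy(classifier, X):
    # current-style strategy: return only the selected indices;
    # the learner is expected to look up the corresponding instances itself
    return np.array([0, 1])

def old_style_strategy(classifier, X):
    # old-style strategy: return indices together with instances; as the test
    # asserts, learner.query() returns exactly this pair, even when the instances
    # do not correspond to the returned indices
    idx = np.array([0, 1])
    return idx, X[(idx + 1) % len(X)]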