Skip to content

Commit e6ee09d

Browse files
amueller authored and Matthias Feurer committed
MRG Sklearn 0.21 compatibility and CI (#752)
* test against scikit-learn 0.21
* fix call to roc_auc
* added verbose parameter to pipeline in 0.21
* remove no-longer-existent categorical_features parameter
* more pipeline parameter checks
* more imputer replacements
* don't break on dev versions
* typo on roc_auc_score name
* use ordered dicts, avoid nan comparison
* undid weird merge artifact
* add missing file whoops
* flake8
* try fixing import in backport, pep8
* move SimpleImputer to testing module
* don't trust dicts to be ordered
* run CI mostly on 0.21.2
* failed to save lol
1 parent b59cc46 commit e6ee09d

File tree

8 files changed

+84
-72
lines changed

8 files changed

+84
-72
lines changed

.travis.yml

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,11 @@ env:
1515
- TEST_DIR=/tmp/test_dir/
1616
- MODULE=openml
1717
matrix:
18-
- DISTRIB="conda" PYTHON_VERSION="3.5" SKLEARN_VERSION="0.20.0"
19-
- DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.20.0"
20-
- DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.20.0" RUN_FLAKE8="true" SKIP_TESTS="true"
21-
- DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.20.0" COVERAGE="true" DOCPUSH="true"
18+
- DISTRIB="conda" PYTHON_VERSION="3.5" SKLEARN_VERSION="0.21.2"
19+
- DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.21.2"
20+
- DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.21.2" RUN_FLAKE8="true" SKIP_TESTS="true"
21+
- DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.21.2" COVERAGE="true" DOCPUSH="true"
22+
- DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.20.2"
2223
# Checks for older scikit-learn versions (which also don't nicely work with
2324
# Python3.7)
2425
- DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.19.2"

openml/testing.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -321,4 +321,10 @@ def _check_fold_timing_evaluations(
321321
self.assertLessEqual(evaluation, max_val)
322322

323323

324-
__all__ = ['TestBase']
324+
try:
325+
from sklearn.impute import SimpleImputer
326+
except ImportError:
327+
from sklearn.preprocessing import Imputer as SimpleImputer
328+
329+
330+
__all__ = ['TestBase', 'SimpleImputer']

tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py

Lines changed: 46 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -28,18 +28,15 @@
2828
import sklearn.tree
2929
import sklearn.cluster
3030

31-
if LooseVersion(sklearn.__version__) < "0.20":
32-
from sklearn.preprocessing import Imputer
33-
else:
34-
from sklearn.impute import SimpleImputer as Imputer
3531

3632
import openml
3733
from openml.extensions.sklearn import SklearnExtension
3834
from openml.exceptions import PyOpenMLError
3935
from openml.flows import OpenMLFlow
4036
from openml.flows.functions import assert_flows_equal
4137
from openml.runs.trace import OpenMLRunTrace
42-
from openml.testing import TestBase
38+
from openml.testing import TestBase, SimpleImputer
39+
4340

4441
this_directory = os.path.dirname(os.path.abspath(__file__))
4542
sys.path.append(this_directory)
@@ -285,11 +282,14 @@ def test_serialize_pipeline(self):
285282
# Comparing the pipeline
286283
# The parameters only have the name of base objects(not the whole flow)
287284
# as value
288-
# memory parameter has been added in 0.19
285+
# memory parameter has been added in 0.19, verbose in 0.21
289286
if LooseVersion(sklearn.__version__) < "0.19":
290287
self.assertEqual(len(serialization.parameters), 1)
291-
else:
288+
elif LooseVersion(sklearn.__version__) < "0.21":
292289
self.assertEqual(len(serialization.parameters), 2)
290+
else:
291+
self.assertEqual(len(serialization.parameters), 3)
292+
293293
# Hard to compare two representations of a dict due to possibly
294294
# different sorting. Making a json makes it easier
295295
self.assertEqual(
@@ -374,8 +374,10 @@ def test_serialize_pipeline_clustering(self):
374374
# memory parameter has been added in 0.19
375375
if LooseVersion(sklearn.__version__) < "0.19":
376376
self.assertEqual(len(serialization.parameters), 1)
377-
else:
377+
elif LooseVersion(sklearn.__version__) < "0.21":
378378
self.assertEqual(len(serialization.parameters), 2)
379+
else:
380+
self.assertEqual(len(serialization.parameters), 3)
379381
# Hard to compare two representations of a dict due to possibly
380382
# different sorting. Making a json makes it easier
381383
self.assertEqual(
@@ -624,7 +626,7 @@ def test_serialize_feature_union_switched_names(self):
624626
.format(module_name_encoder))
625627

626628
def test_serialize_complex_flow(self):
627-
ohe = sklearn.preprocessing.OneHotEncoder(categorical_features=[0])
629+
ohe = sklearn.preprocessing.OneHotEncoder()
628630
scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
629631
boosting = sklearn.ensemble.AdaBoostClassifier(
630632
base_estimator=sklearn.tree.DecisionTreeClassifier())
@@ -747,25 +749,26 @@ def test_serialize_simple_parameter_grid(self):
747749
# Examples from the scikit-learn documentation
748750
models = [sklearn.svm.SVC(), sklearn.ensemble.RandomForestClassifier()]
749751
grids = \
750-
[[{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
751-
{'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
752-
'kernel': ['rbf']}],
753-
{"max_depth": [3, None],
754-
"max_features": [1, 3, 10],
755-
"min_samples_split": [1, 3, 10],
756-
"min_samples_leaf": [1, 3, 10],
757-
"bootstrap": [True, False],
758-
"criterion": ["gini", "entropy"]}]
752+
[[OrderedDict([('C', [1, 10, 100, 1000]), ('kernel', ['linear'])]),
753+
OrderedDict([('C', [1, 10, 100, 1000]), ('gamma', [0.001, 0.0001]),
754+
('kernel', ['rbf'])])],
755+
OrderedDict([("bootstrap", [True, False]),
756+
("criterion", ["gini", "entropy"]),
757+
("max_depth", [3, None]),
758+
("max_features", [1, 3, 10]),
759+
("min_samples_leaf", [1, 3, 10]),
760+
("min_samples_split", [1, 3, 10])
761+
])]
759762

760763
for grid, model in zip(grids, models):
761764
serialized = self.extension.model_to_flow(grid)
762765
deserialized = self.extension.flow_to_model(serialized)
763766

764767
self.assertEqual(deserialized, grid)
765768
self.assertIsNot(deserialized, grid)
766-
769+
# providing error_score because nan != nan
767770
hpo = sklearn.model_selection.GridSearchCV(
768-
param_grid=grid, estimator=model)
771+
param_grid=grid, estimator=model, error_score=-1000)
769772

770773
serialized = self.extension.model_to_flow(hpo)
771774
deserialized = self.extension.flow_to_model(serialized)
@@ -943,7 +946,7 @@ def test_illegal_parameter_names(self):
943946
def test_illegal_parameter_names_pipeline(self):
944947
# illegal name: steps
945948
steps = [
946-
('Imputer', Imputer(strategy='median')),
949+
('Imputer', SimpleImputer(strategy='median')),
947950
('OneHotEncoder',
948951
sklearn.preprocessing.OneHotEncoder(sparse=False,
949952
handle_unknown='ignore')),
@@ -956,7 +959,7 @@ def test_illegal_parameter_names_featureunion(self):
956959
# illegal name: transformer_list
957960
transformer_list = [
958961
('transformer_list',
959-
Imputer(strategy='median')),
962+
SimpleImputer(strategy='median')),
960963
('OneHotEncoder',
961964
sklearn.preprocessing.OneHotEncoder(sparse=False,
962965
handle_unknown='ignore'))
@@ -1015,18 +1018,25 @@ def test_paralizable_check(self):
10151018
self.extension._prevent_optimize_n_jobs(model)
10161019

10171020
def test__get_fn_arguments_with_defaults(self):
1018-
if LooseVersion(sklearn.__version__) < "0.19":
1021+
sklearn_version = LooseVersion(sklearn.__version__)
1022+
if sklearn_version < "0.19":
10191023
fns = [
10201024
(sklearn.ensemble.RandomForestRegressor.__init__, 15),
10211025
(sklearn.tree.DecisionTreeClassifier.__init__, 12),
10221026
(sklearn.pipeline.Pipeline.__init__, 0)
10231027
]
1024-
else:
1028+
elif sklearn_version < "0.21":
10251029
fns = [
10261030
(sklearn.ensemble.RandomForestRegressor.__init__, 16),
10271031
(sklearn.tree.DecisionTreeClassifier.__init__, 13),
10281032
(sklearn.pipeline.Pipeline.__init__, 1)
10291033
]
1034+
else:
1035+
fns = [
1036+
(sklearn.ensemble.RandomForestRegressor.__init__, 16),
1037+
(sklearn.tree.DecisionTreeClassifier.__init__, 13),
1038+
(sklearn.pipeline.Pipeline.__init__, 2)
1039+
]
10301040

10311041
for fn, num_params_with_defaults in fns:
10321042
defaults, defaultless = (
@@ -1047,7 +1057,7 @@ def test_deserialize_with_defaults(self):
10471057
# used the 'initialize_with_defaults' flag of the deserialization
10481058
# method to return a flow that contains default hyperparameter
10491059
# settings.
1050-
steps = [('Imputer', Imputer()),
1060+
steps = [('Imputer', SimpleImputer()),
10511061
('OneHotEncoder', sklearn.preprocessing.OneHotEncoder()),
10521062
('Estimator', sklearn.tree.DecisionTreeClassifier())]
10531063
pipe_orig = sklearn.pipeline.Pipeline(steps=steps)
@@ -1071,7 +1081,7 @@ def test_deserialize_adaboost_with_defaults(self):
10711081
# used the 'initialize_with_defaults' flag of the deserialization
10721082
# method to return a flow that contains default hyperparameter
10731083
# settings.
1074-
steps = [('Imputer', Imputer()),
1084+
steps = [('Imputer', SimpleImputer()),
10751085
('OneHotEncoder', sklearn.preprocessing.OneHotEncoder()),
10761086
('Estimator', sklearn.ensemble.AdaBoostClassifier(
10771087
sklearn.tree.DecisionTreeClassifier()))]
@@ -1097,7 +1107,7 @@ def test_deserialize_complex_with_defaults(self):
10971107
# method to return a flow that contains default hyperparameter
10981108
# settings.
10991109
steps = [
1100-
('Imputer', Imputer()),
1110+
('Imputer', SimpleImputer()),
11011111
('OneHotEncoder', sklearn.preprocessing.OneHotEncoder()),
11021112
(
11031113
'Estimator',
@@ -1237,7 +1247,7 @@ def test_run_model_on_task(self):
12371247
class MyPipe(sklearn.pipeline.Pipeline):
12381248
pass
12391249
task = openml.tasks.get_task(1)
1240-
pipe = MyPipe([('imp', Imputer()),
1250+
pipe = MyPipe([('imp', SimpleImputer()),
12411251
('dummy', sklearn.dummy.DummyClassifier())])
12421252
openml.runs.run_model_on_task(pipe, task)
12431253

@@ -1309,7 +1319,7 @@ def test_run_model_on_fold_classification_1(self):
13091319
y_test = y[test_indices]
13101320

13111321
pipeline = sklearn.pipeline.Pipeline(steps=[
1312-
('imp', sklearn.preprocessing.Imputer()),
1322+
('imp', SimpleImputer()),
13131323
('clf', sklearn.tree.DecisionTreeClassifier()),
13141324
])
13151325
# TODO add some mocking here to actually test the innards of this function, too!
@@ -1435,11 +1445,11 @@ def predict_proba(*args, **kwargs):
14351445
y_train = y[train_indices]
14361446
X_test = X[test_indices]
14371447
clf1 = sklearn.pipeline.Pipeline(steps=[
1438-
('imputer', sklearn.preprocessing.Imputer()),
1448+
('imputer', SimpleImputer()),
14391449
('estimator', sklearn.naive_bayes.GaussianNB())
14401450
])
14411451
clf2 = sklearn.pipeline.Pipeline(steps=[
1442-
('imputer', sklearn.preprocessing.Imputer()),
1452+
('imputer', SimpleImputer()),
14431453
('estimator', HardNaiveBayes())
14441454
])
14451455

@@ -1492,7 +1502,7 @@ def test_run_model_on_fold_regression(self):
14921502
y_test = y[test_indices]
14931503

14941504
pipeline = sklearn.pipeline.Pipeline(steps=[
1495-
('imp', sklearn.preprocessing.Imputer()),
1505+
('imp', SimpleImputer()),
14961506
('clf', sklearn.tree.DecisionTreeRegressor()),
14971507
])
14981508
# TODO add some mocking here to actually test the innards of this function, too!
@@ -1537,7 +1547,7 @@ def test_run_model_on_fold_clustering(self):
15371547
X = task.get_X(dataset_format='array')
15381548

15391549
pipeline = sklearn.pipeline.Pipeline(steps=[
1540-
('imp', sklearn.preprocessing.Imputer()),
1550+
('imp', SimpleImputer()),
15411551
('clf', sklearn.cluster.KMeans()),
15421552
])
15431553
# TODO add some mocking here to actually test the innards of this function, too!
@@ -1626,7 +1636,7 @@ def test_trim_flow_name(self):
16261636
long = """sklearn.pipeline.Pipeline(
16271637
columntransformer=sklearn.compose._column_transformer.ColumnTransformer(
16281638
numeric=sklearn.pipeline.Pipeline(
1629-
imputer=sklearn.preprocessing.imputation.Imputer,
1639+
SimpleImputer=sklearn.preprocessing.imputation.Imputer,
16301640
standardscaler=sklearn.preprocessing.data.StandardScaler),
16311641
nominal=sklearn.pipeline.Pipeline(
16321642
simpleimputer=sklearn.impute.SimpleImputer,
@@ -1650,7 +1660,7 @@ def test_trim_flow_name(self):
16501660
self.assertEqual(short, SklearnExtension.trim_flow_name(long_stripped))
16511661

16521662
long = """sklearn.pipeline.Pipeline(
1653-
Imputer=sklearn.preprocessing.imputation.Imputer,
1663+
SimpleImputer=sklearn.preprocessing.imputation.Imputer,
16541664
VarianceThreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold, # noqa: E501
16551665
Estimator=sklearn.model_selection._search.RandomizedSearchCV(
16561666
estimator=sklearn.tree.tree.DecisionTreeClassifier))"""
@@ -1660,7 +1670,7 @@ def test_trim_flow_name(self):
16601670

16611671
long = """sklearn.model_selection._search.RandomizedSearchCV(
16621672
estimator=sklearn.pipeline.Pipeline(
1663-
Imputer=sklearn.preprocessing.imputation.Imputer,
1673+
SimpleImputer=sklearn.preprocessing.imputation.Imputer,
16641674
classifier=sklearn.ensemble.forest.RandomForestClassifier))"""
16651675
short = "sklearn.RandomizedSearchCV(Pipeline(Imputer,RandomForestClassifier))"
16661676
long_stripped, _ = re.subn(r'\s', '', long)

tests/test_flows/test_flow.py

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,18 +19,13 @@
1919
import sklearn.naive_bayes
2020
import sklearn.tree
2121

22-
if LooseVersion(sklearn.__version__) < "0.20":
23-
from sklearn.preprocessing import Imputer
24-
else:
25-
from sklearn.impute import SimpleImputer as Imputer
26-
2722
import xmltodict
2823

2924
import openml
3025
from openml._api_calls import _perform_api_call
3126
import openml.exceptions
3227
import openml.extensions.sklearn
33-
from openml.testing import TestBase
28+
from openml.testing import TestBase, SimpleImputer
3429
import openml.utils
3530

3631

@@ -318,8 +313,8 @@ def test_illegal_flow(self):
318313
# should throw error as it contains two imputers
319314
illegal = sklearn.pipeline.Pipeline(
320315
steps=[
321-
('imputer1', Imputer()),
322-
('imputer2', Imputer()),
316+
('imputer1', SimpleImputer()),
317+
('imputer2', SimpleImputer()),
323318
('classif', sklearn.tree.DecisionTreeClassifier())
324319
]
325320
)
@@ -350,7 +345,7 @@ def test_existing_flow_exists(self):
350345
if LooseVersion(sklearn.__version__) >= '0.20':
351346
ohe_params['categories'] = 'auto'
352347
steps = [
353-
('imputation', Imputer(strategy='median')),
348+
('imputation', SimpleImputer(strategy='median')),
354349
('hotencoding', sklearn.preprocessing.OneHotEncoder(**ohe_params)),
355350
(
356351
'variencethreshold',

tests/test_flows/test_flow_functions.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,7 @@ def test_get_flow_reinstantiate_model_no_extension(self):
288288
def test_get_flow_reinstantiate_model_wrong_version(self):
289289
# Note that CI does not test against 0.19.1.
290290
openml.config.server = self.production_server
291-
_, sklearn_major, _ = LooseVersion(sklearn.__version__).version
291+
_, sklearn_major, _ = LooseVersion(sklearn.__version__).version[:3]
292292
flow = 8175
293293
expected = 'Trying to deserialize a model with dependency sklearn==0.19.1 not satisfied.'
294294
self.assertRaisesRegex(ValueError,

tests/test_runs/test_run.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,8 @@
77
from sklearn.tree import DecisionTreeClassifier
88
from sklearn.model_selection import GridSearchCV
99
from sklearn.pipeline import Pipeline
10-
from sklearn.preprocessing import Imputer
1110

12-
from openml.testing import TestBase
11+
from openml.testing import TestBase, SimpleImputer
1312
import openml
1413
import openml.extensions.sklearn
1514

@@ -106,7 +105,7 @@ def _check_array(array, type_):
106105
def test_to_from_filesystem_vanilla(self):
107106

108107
model = Pipeline([
109-
('imputer', Imputer(strategy='mean')),
108+
('imputer', SimpleImputer(strategy='mean')),
110109
('classifier', DecisionTreeClassifier(max_depth=1)),
111110
])
112111
task = openml.tasks.get_task(119)
@@ -139,7 +138,7 @@ def test_to_from_filesystem_vanilla(self):
139138
def test_to_from_filesystem_search(self):
140139

141140
model = Pipeline([
142-
('imputer', Imputer(strategy='mean')),
141+
('imputer', SimpleImputer(strategy='mean')),
143142
('classifier', DecisionTreeClassifier(max_depth=1)),
144143
])
145144
model = GridSearchCV(
@@ -175,7 +174,7 @@ def test_to_from_filesystem_search(self):
175174
def test_to_from_filesystem_no_model(self):
176175

177176
model = Pipeline([
178-
('imputer', Imputer(strategy='mean')),
177+
('imputer', SimpleImputer(strategy='mean')),
179178
('classifier', DummyClassifier()),
180179
])
181180
task = openml.tasks.get_task(119)
@@ -205,7 +204,7 @@ def test_publish_with_local_loaded_flow(self):
205204
extension = openml.extensions.sklearn.SklearnExtension()
206205

207206
model = Pipeline([
208-
('imputer', Imputer(strategy='mean')),
207+
('imputer', SimpleImputer(strategy='mean')),
209208
('classifier', DummyClassifier()),
210209
])
211210
task = openml.tasks.get_task(119)

0 commit comments

Comments
 (0)