Skip to content

Commit d2945ba

Browse files
Neeratyoy, mfeurer, and PGijsbers
authored
Adding sklearn 0.24 support (#1016)
* Adding importable helper functions * Changing import of cat, cont * Better docstrings * Adding unit test to check ColumnTransformer * Refinements from @mfeurer * Editing example to support both NumPy and Pandas * Unit test fix to mark for deletion * Making some unit tests work * Waiting for dataset to be processed * Minor test collection fix * Template to handle missing tasks * Accounting for more missing tasks: * Fixing some more unit tests * Simplifying check_task_existence * black changes * Minor formatting * Handling task exists check * Testing edited check task func * Flake fix * More retries on connection error * Adding max_retries to config default * Update database retry unit test * Print to debug hash exception * Fixing checksum unit test * Retry on _download_text_file * Update datasets_tutorial.py * Update custom_flow_tutorial.py * Update test_study_functions.py * Update test_dataset_functions.py * more retries, but also more time between retries * allow for even more retries on get calls * Catching failed get task * undo stupid change * fix one more test * Refactoring md5 hash check inside _send_request * Fixing a fairly common unit test fail * Reverting loose check on unit test * Updating examples to run on sklearn 0.24 * Spawning tests for sklearn 0.24 * Adding numpy import * Fixing integer type check to allow np.integer * Making unit tests run on sklearn 0.24 * black fix * Trying to loosen check on unit test as fix * simplify examples * disable test for old python version Co-authored-by: Matthias Feurer <[email protected]> Co-authored-by: PGijsbers <[email protected]> Co-authored-by: neeratyoy <>
1 parent 80ae046 commit d2945ba

File tree

7 files changed

+53
-53
lines changed

7 files changed

+53
-53
lines changed

.github/workflows/ubuntu-test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ jobs:
99
strategy:
1010
matrix:
1111
python-version: [3.6, 3.7, 3.8]
12-
scikit-learn: [0.21.2, 0.22.2, 0.23.1]
12+
scikit-learn: [0.21.2, 0.22.2, 0.23.1, 0.24]
1313
exclude: # no scikit-learn 0.21.2 release for Python 3.8
1414
- python-version: 3.8
1515
scikit-learn: 0.21.2

examples/30_extended/flows_and_runs_tutorial.py

Lines changed: 20 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
# License: BSD 3-Clause
99

1010
import openml
11-
import numpy as np
1211
from sklearn import compose, ensemble, impute, neighbors, preprocessing, pipeline, tree
1312

1413
############################################################################
@@ -54,7 +53,7 @@
5453
task = openml.tasks.get_task(403)
5554

5655
# Build any classifier or pipeline
57-
clf = tree.ExtraTreeClassifier()
56+
clf = tree.DecisionTreeClassifier()
5857

5958
# Run the flow
6059
run = openml.runs.run_model_on_task(clf, task)
@@ -83,7 +82,10 @@
8382
# ############################
8483
#
8584
# When you need to handle 'dirty' data, build pipelines to model then automatically.
86-
task = openml.tasks.get_task(1)
85+
# To demonstrate this using the dataset `credit-a <https://test.openml.org/d/16>`_ via
86+
# `task <https://test.openml.org/t/96>`_ as it contains both numerical and categorical
87+
# variables and missing values in both.
88+
task = openml.tasks.get_task(96)
8789

8890
# OpenML helper functions for sklearn can be plugged in directly for complicated pipelines
8991
from openml.extensions.sklearn import cat, cont
@@ -96,20 +98,14 @@
9698
[
9799
(
98100
"categorical",
99-
pipeline.Pipeline(
100-
[
101-
("Imputer", impute.SimpleImputer(strategy="most_frequent")),
102-
(
103-
"Encoder",
104-
preprocessing.OneHotEncoder(
105-
sparse=False, handle_unknown="ignore"
106-
),
107-
),
108-
]
109-
),
101+
preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore"),
110102
cat, # returns the categorical feature indices
111103
),
112-
("continuous", "passthrough", cont), # returns the numeric feature indices
104+
(
105+
"continuous",
106+
impute.SimpleImputer(strategy="median"),
107+
cont,
108+
), # returns the numeric feature indices
113109
]
114110
),
115111
),
@@ -146,20 +142,14 @@
146142
[
147143
(
148144
"categorical",
149-
pipeline.Pipeline(
150-
[
151-
("Imputer", impute.SimpleImputer(strategy="most_frequent")),
152-
(
153-
"Encoder",
154-
preprocessing.OneHotEncoder(
155-
sparse=False, handle_unknown="ignore"
156-
),
157-
),
158-
]
159-
),
145+
preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore"),
160146
categorical_feature_indices,
161147
),
162-
("continuous", "passthrough", numeric_feature_indices),
148+
(
149+
"continuous",
150+
impute.SimpleImputer(strategy="median"),
151+
numeric_feature_indices,
152+
),
163153
]
164154
),
165155
),
@@ -182,7 +172,9 @@
182172
task = openml.tasks.get_task(6)
183173

184174
# The following lines can then be executed offline:
185-
run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False, upload_flow=False)
175+
run = openml.runs.run_model_on_task(
176+
pipe, task, avoid_duplicate_runs=False, upload_flow=False, dataset_format="array",
177+
)
186178

187179
# The run may be stored offline, and the flow will be stored along with it:
188180
run.to_filesystem(directory="myrun")

examples/30_extended/run_setup_tutorial.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -59,12 +59,9 @@
5959
# easy as you want it to be
6060

6161

62-
cat_imp = make_pipeline(
63-
SimpleImputer(strategy="most_frequent"),
64-
OneHotEncoder(handle_unknown="ignore", sparse=False),
65-
TruncatedSVD(),
66-
)
67-
ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", "passthrough", cont)])
62+
cat_imp = make_pipeline(OneHotEncoder(handle_unknown="ignore", sparse=False), TruncatedSVD(),)
63+
cont_imp = SimpleImputer(strategy="median")
64+
ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)])
6865
model_original = Pipeline(steps=[("transform", ct), ("estimator", RandomForestClassifier()),])
6966

7067
# Let's change some hyperparameters. Of course, in any good application we

examples/40_paper/2018_neurips_perrone_example.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -177,18 +177,14 @@ def list_categorical_attributes(flow_type="svm"):
177177
cat_cols = list_categorical_attributes(flow_type=flow_type)
178178
num_cols = list(set(X.columns) - set(cat_cols))
179179

180-
# Missing value imputers
181-
cat_imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value="None")
180+
# Missing value imputers for numeric columns
182181
num_imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=-1)
183182

184-
# Creating the one-hot encoder
183+
# Creating the one-hot encoder for numerical representation of categorical columns
185184
enc = OneHotEncoder(handle_unknown="ignore")
186185

187-
# Pipeline to handle categorical column transformations
188-
cat_transforms = Pipeline(steps=[("impute", cat_imputer), ("encode", enc)])
189-
190186
# Combining column transformers
191-
ct = ColumnTransformer([("cat", cat_transforms, cat_cols), ("num", num_imputer, num_cols)])
187+
ct = ColumnTransformer([("cat", enc, cat_cols), ("num", num_imputer, num_cols)])
192188

193189
# Creating the full pipeline with the surrogate model
194190
clf = RandomForestRegressor(n_estimators=50)

tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,8 @@ def test_serialize_model(self):
189189
if LooseVersion(sklearn.__version__) >= "0.22":
190190
fixture_parameters.update({"ccp_alpha": "0.0"})
191191
fixture_parameters.move_to_end("ccp_alpha", last=False)
192+
if LooseVersion(sklearn.__version__) >= "0.24":
193+
del fixture_parameters["presort"]
192194

193195
structure_fixture = {"sklearn.tree.{}.DecisionTreeClassifier".format(tree_name): []}
194196

@@ -1317,12 +1319,18 @@ def test__get_fn_arguments_with_defaults(self):
13171319
(sklearn.tree.DecisionTreeClassifier.__init__, 14),
13181320
(sklearn.pipeline.Pipeline.__init__, 2),
13191321
]
1320-
else:
1322+
elif sklearn_version < "0.24":
13211323
fns = [
13221324
(sklearn.ensemble.RandomForestRegressor.__init__, 18),
13231325
(sklearn.tree.DecisionTreeClassifier.__init__, 14),
13241326
(sklearn.pipeline.Pipeline.__init__, 2),
13251327
]
1328+
else:
1329+
fns = [
1330+
(sklearn.ensemble.RandomForestRegressor.__init__, 18),
1331+
(sklearn.tree.DecisionTreeClassifier.__init__, 13),
1332+
(sklearn.pipeline.Pipeline.__init__, 2),
1333+
]
13261334

13271335
for fn, num_params_with_defaults in fns:
13281336
defaults, defaultless = self.extension._get_fn_arguments_with_defaults(fn)
@@ -1523,7 +1531,7 @@ def test_obtain_parameter_values(self):
15231531
"bootstrap": [True, False],
15241532
"criterion": ["gini", "entropy"],
15251533
},
1526-
cv=sklearn.model_selection.StratifiedKFold(n_splits=2, random_state=1),
1534+
cv=sklearn.model_selection.StratifiedKFold(n_splits=2, random_state=1, shuffle=True),
15271535
n_iter=5,
15281536
)
15291537
flow = self.extension.model_to_flow(model)

tests/test_flows/test_flow_functions.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -325,8 +325,16 @@ def test_get_flow_reinstantiate_model_wrong_version(self):
325325
# Note that CI does not test against 0.19.1.
326326
openml.config.server = self.production_server
327327
_, sklearn_major, _ = LooseVersion(sklearn.__version__).version[:3]
328-
flow = 8175
329-
expected = "Trying to deserialize a model with dependency" " sklearn==0.19.1 not satisfied."
328+
if sklearn_major > 23:
329+
flow = 18587 # 18687, 18725 --- flows building random forest on >= 0.23
330+
flow_sklearn_version = "0.23.1"
331+
else:
332+
flow = 8175
333+
flow_sklearn_version = "0.19.1"
334+
expected = (
335+
"Trying to deserialize a model with dependency "
336+
"sklearn=={} not satisfied.".format(flow_sklearn_version)
337+
)
330338
self.assertRaisesRegex(
331339
ValueError, expected, openml.flows.get_flow, flow_id=flow, reinstantiate=True
332340
)

tests/test_study/test_study_examples.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# License: BSD 3-Clause
22

3-
from openml.testing import TestBase, SimpleImputer, CustomImputer
3+
from openml.testing import TestBase
44
from openml.extensions.sklearn import cat, cont
55

66
import sklearn
@@ -13,8 +13,8 @@ class TestStudyFunctions(TestBase):
1313
"""Test the example code of Bischl et al. (2018)"""
1414

1515
@unittest.skipIf(
16-
LooseVersion(sklearn.__version__) < "0.20",
17-
reason="columntransformer introduction in 0.20.0",
16+
LooseVersion(sklearn.__version__) < "0.24",
17+
reason="columntransformer introduction in 0.24.0",
1818
)
1919
def test_Figure1a(self):
2020
"""Test listing in Figure 1a on a single task and the old OpenML100 study.
@@ -39,15 +39,14 @@ def test_Figure1a(self):
3939
import openml
4040
import sklearn.metrics
4141
import sklearn.tree
42+
from sklearn.impute import SimpleImputer
4243
from sklearn.pipeline import Pipeline, make_pipeline
4344
from sklearn.compose import ColumnTransformer
4445
from sklearn.preprocessing import OneHotEncoder, StandardScaler
4546

4647
benchmark_suite = openml.study.get_study("OpenML100", "tasks") # obtain the benchmark suite
47-
cat_imp = make_pipeline(
48-
SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore")
49-
)
50-
cont_imp = make_pipeline(CustomImputer(), StandardScaler())
48+
cat_imp = OneHotEncoder(handle_unknown="ignore")
49+
cont_imp = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())
5150
ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)])
5251
clf = Pipeline(
5352
steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())]

0 commit comments

Comments
 (0)