Skip to content

Commit d2945ba

Browse files
Neeratyoy, mfeurer, and PGijsbers
authored
Adding sklearn 0.24 support (#1016)
* Adding importable helper functions * Changing import of cat, cont * Better docstrings * Adding unit test to check ColumnTransformer * Refinements from @mfeurer * Editing example to support both NumPy and Pandas * Unit test fix to mark for deletion * Making some unit tests work * Waiting for dataset to be processed * Minor test collection fix * Template to handle missing tasks * Accounting for more missing tasks: * Fixing some more unit tests * Simplifying check_task_existence * black changes * Minor formatting * Handling task exists check * Testing edited check task func * Flake fix * More retries on connection error * Adding max_retries to config default * Update database retry unit test * Print to debug hash exception * Fixing checksum unit test * Retry on _download_text_file * Update datasets_tutorial.py * Update custom_flow_tutorial.py * Update test_study_functions.py * Update test_dataset_functions.py * more retries, but also more time between retries * allow for even more retries on get calls * Catching failed get task * undo stupid change * fix one more test * Refactoring md5 hash check inside _send_request * Fixing a fairly common unit test fail * Reverting loose check on unit test * Updating examples to run on sklearn 0.24 * Spawning tests for sklearn 0.24 * Adding numpy import * Fixing integer type check to allow np.integer * Making unit tests run on sklearn 0.24 * black fix * Trying to loosen check on unit test as fix * simplify examples * disable test for old python version Co-authored-by: Matthias Feurer <[email protected]> Co-authored-by: PGijsbers <[email protected]> Co-authored-by: neeratyoy <>
1 parent 80ae046 commit d2945ba

File tree

7 files changed

+53
-53
lines changed

7 files changed

+53
-53
lines changed

.github/workflows/ubuntu-test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ jobs:
99
strategy:
1010
matrix:
1111
python-version: [3.6, 3.7, 3.8]
12-
scikit-learn: [0.21.2, 0.22.2, 0.23.1]
12+
scikit-learn: [0.21.2, 0.22.2, 0.23.1, 0.24]
1313
exclude: # no scikit-learn 0.21.2 release for Python 3.8
1414
- python-version: 3.8
1515
scikit-learn: 0.21.2

examples/30_extended/flows_and_runs_tutorial.py

Lines changed: 20 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
# License: BSD 3-Clause
99

1010
import openml
11-
import numpy as np
1211
from sklearn import compose, ensemble, impute, neighbors, preprocessing, pipeline, tree
1312

1413
############################################################################
@@ -54,7 +53,7 @@
5453
task = openml.tasks.get_task(403)
5554

5655
# Build any classifier or pipeline
57-
clf = tree.ExtraTreeClassifier()
56+
clf = tree.DecisionTreeClassifier()
5857

5958
# Run the flow
6059
run = openml.runs.run_model_on_task(clf, task)
@@ -83,7 +82,10 @@
8382
# ############################
8483
#
8584
# When you need to handle 'dirty' data, build pipelines to model then automatically.
86-
task = openml.tasks.get_task(1)
85+
# To demonstrate this using the dataset `credit-a <https://test.openml.org/d/16>`_ via
86+
# `task <https://test.openml.org/t/96>`_ as it contains both numerical and categorical
87+
# variables and missing values in both.
88+
task = openml.tasks.get_task(96)
8789

8890
# OpenML helper functions for sklearn can be plugged in directly for complicated pipelines
8991
from openml.extensions.sklearn import cat, cont
@@ -96,20 +98,14 @@
9698
[
9799
(
98100
"categorical",
99-
pipeline.Pipeline(
100-
[
101-
("Imputer", impute.SimpleImputer(strategy="most_frequent")),
102-
(
103-
"Encoder",
104-
preprocessing.OneHotEncoder(
105-
sparse=False, handle_unknown="ignore"
106-
),
107-
),
108-
]
109-
),
101+
preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore"),
110102
cat, # returns the categorical feature indices
111103
),
112-
("continuous", "passthrough", cont), # returns the numeric feature indices
104+
(
105+
"continuous",
106+
impute.SimpleImputer(strategy="median"),
107+
cont,
108+
), # returns the numeric feature indices
113109
]
114110
),
115111
),
@@ -146,20 +142,14 @@
146142
[
147143
(
148144
"categorical",
149-
pipeline.Pipeline(
150-
[
151-
("Imputer", impute.SimpleImputer(strategy="most_frequent")),
152-
(
153-
"Encoder",
154-
preprocessing.OneHotEncoder(
155-
sparse=False, handle_unknown="ignore"
156-
),
157-
),
158-
]
159-
),
145+
preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore"),
160146
categorical_feature_indices,
161147
),
162-
("continuous", "passthrough", numeric_feature_indices),
148+
(
149+
"continuous",
150+
impute.SimpleImputer(strategy="median"),
151+
numeric_feature_indices,
152+
),
163153
]
164154
),
165155
),
@@ -182,7 +172,9 @@
182172
task = openml.tasks.get_task(6)
183173

184174
# The following lines can then be executed offline:
185-
run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False, upload_flow=False)
175+
run = openml.runs.run_model_on_task(
176+
pipe, task, avoid_duplicate_runs=False, upload_flow=False, dataset_format="array",
177+
)
186178

187179
# The run may be stored offline, and the flow will be stored along with it:
188180
run.to_filesystem(directory="myrun")

examples/30_extended/run_setup_tutorial.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -59,12 +59,9 @@
5959
# easy as you want it to be
6060

6161

62-
cat_imp = make_pipeline(
63-
SimpleImputer(strategy="most_frequent"),
64-
OneHotEncoder(handle_unknown="ignore", sparse=False),
65-
TruncatedSVD(),
66-
)
67-
ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", "passthrough", cont)])
62+
cat_imp = make_pipeline(OneHotEncoder(handle_unknown="ignore", sparse=False), TruncatedSVD(),)
63+
cont_imp = SimpleImputer(strategy="median")
64+
ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)])
6865
model_original = Pipeline(steps=[("transform", ct), ("estimator", RandomForestClassifier()),])
6966

7067
# Let's change some hyperparameters. Of course, in any good application we

examples/40_paper/2018_neurips_perrone_example.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -177,18 +177,14 @@ def list_categorical_attributes(flow_type="svm"):
177177
cat_cols = list_categorical_attributes(flow_type=flow_type)
178178
num_cols = list(set(X.columns) - set(cat_cols))
179179

180-
# Missing value imputers
181-
cat_imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value="None")
180+
# Missing value imputers for numeric columns
182181
num_imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=-1)
183182

184-
# Creating the one-hot encoder
183+
# Creating the one-hot encoder for numerical representation of categorical columns
185184
enc = OneHotEncoder(handle_unknown="ignore")
186185

187-
# Pipeline to handle categorical column transformations
188-
cat_transforms = Pipeline(steps=[("impute", cat_imputer), ("encode", enc)])
189-
190186
# Combining column transformers
191-
ct = ColumnTransformer([("cat", cat_transforms, cat_cols), ("num", num_imputer, num_cols)])
187+
ct = ColumnTransformer([("cat", enc, cat_cols), ("num", num_imputer, num_cols)])
192188

193189
# Creating the full pipeline with the surrogate model
194190
clf = RandomForestRegressor(n_estimators=50)

tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,8 @@ def test_serialize_model(self):
189189
if LooseVersion(sklearn.__version__) >= "0.22":
190190
fixture_parameters.update({"ccp_alpha": "0.0"})
191191
fixture_parameters.move_to_end("ccp_alpha", last=False)
192+
if LooseVersion(sklearn.__version__) >= "0.24":
193+
del fixture_parameters["presort"]
192194

193195
structure_fixture = {"sklearn.tree.{}.DecisionTreeClassifier".format(tree_name): []}
194196

@@ -1317,12 +1319,18 @@ def test__get_fn_arguments_with_defaults(self):
13171319
(sklearn.tree.DecisionTreeClassifier.__init__, 14),
13181320
(sklearn.pipeline.Pipeline.__init__, 2),
13191321
]
1320-
else:
1322+
elif sklearn_version < "0.24":
13211323
fns = [
13221324
(sklearn.ensemble.RandomForestRegressor.__init__, 18),
13231325
(sklearn.tree.DecisionTreeClassifier.__init__, 14),
13241326
(sklearn.pipeline.Pipeline.__init__, 2),
13251327
]
1328+
else:
1329+
fns = [
1330+
(sklearn.ensemble.RandomForestRegressor.__init__, 18),
1331+
(sklearn.tree.DecisionTreeClassifier.__init__, 13),
1332+
(sklearn.pipeline.Pipeline.__init__, 2),
1333+
]
13261334

13271335
for fn, num_params_with_defaults in fns:
13281336
defaults, defaultless = self.extension._get_fn_arguments_with_defaults(fn)
@@ -1523,7 +1531,7 @@ def test_obtain_parameter_values(self):
15231531
"bootstrap": [True, False],
15241532
"criterion": ["gini", "entropy"],
15251533
},
1526-
cv=sklearn.model_selection.StratifiedKFold(n_splits=2, random_state=1),
1534+
cv=sklearn.model_selection.StratifiedKFold(n_splits=2, random_state=1, shuffle=True),
15271535
n_iter=5,
15281536
)
15291537
flow = self.extension.model_to_flow(model)

tests/test_flows/test_flow_functions.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -325,8 +325,16 @@ def test_get_flow_reinstantiate_model_wrong_version(self):
325325
# Note that CI does not test against 0.19.1.
326326
openml.config.server = self.production_server
327327
_, sklearn_major, _ = LooseVersion(sklearn.__version__).version[:3]
328-
flow = 8175
329-
expected = "Trying to deserialize a model with dependency" " sklearn==0.19.1 not satisfied."
328+
if sklearn_major > 23:
329+
flow = 18587 # 18687, 18725 --- flows building random forest on >= 0.23
330+
flow_sklearn_version = "0.23.1"
331+
else:
332+
flow = 8175
333+
flow_sklearn_version = "0.19.1"
334+
expected = (
335+
"Trying to deserialize a model with dependency "
336+
"sklearn=={} not satisfied.".format(flow_sklearn_version)
337+
)
330338
self.assertRaisesRegex(
331339
ValueError, expected, openml.flows.get_flow, flow_id=flow, reinstantiate=True
332340
)

tests/test_study/test_study_examples.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# License: BSD 3-Clause
22

3-
from openml.testing import TestBase, SimpleImputer, CustomImputer
3+
from openml.testing import TestBase
44
from openml.extensions.sklearn import cat, cont
55

66
import sklearn
@@ -13,8 +13,8 @@ class TestStudyFunctions(TestBase):
1313
"""Test the example code of Bischl et al. (2018)"""
1414

1515
@unittest.skipIf(
16-
LooseVersion(sklearn.__version__) < "0.20",
17-
reason="columntransformer introduction in 0.20.0",
16+
LooseVersion(sklearn.__version__) < "0.24",
17+
reason="columntransformer introduction in 0.24.0",
1818
)
1919
def test_Figure1a(self):
2020
"""Test listing in Figure 1a on a single task and the old OpenML100 study.
@@ -39,15 +39,14 @@ def test_Figure1a(self):
3939
import openml
4040
import sklearn.metrics
4141
import sklearn.tree
42+
from sklearn.impute import SimpleImputer
4243
from sklearn.pipeline import Pipeline, make_pipeline
4344
from sklearn.compose import ColumnTransformer
4445
from sklearn.preprocessing import OneHotEncoder, StandardScaler
4546

4647
benchmark_suite = openml.study.get_study("OpenML100", "tasks") # obtain the benchmark suite
47-
cat_imp = make_pipeline(
48-
SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore")
49-
)
50-
cont_imp = make_pipeline(CustomImputer(), StandardScaler())
48+
cat_imp = OneHotEncoder(handle_unknown="ignore")
49+
cont_imp = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())
5150
ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)])
5251
clf = Pipeline(
5352
steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())]

0 commit comments

Comments
 (0)