Commit bf3cd2e

Dataframe run on task (#777)

Authored by amueller, Neeratyoy, and anonymous99199

Squashed commit message:

* run on tasks allows dataframes
* don't force third subcomponent part to be list
* Making DataFrame default behaviour for runs; Fixing test cases for the same
* Fixing PEP8 + Adding docstring to CustomImputer()
* run on tasks allows dataframes
* Attempting rebase
* Fixing test cases
* Trying test case fixes
* run on tasks allows dataframes
* don't force third subcomponent part to be list
* Making DataFrame default behaviour for runs; Fixing test cases for the same
* Fixing PEP8 + Adding docstring to CustomImputer()
* Attempting rebase
* Fixing test cases
* Trying test case fixes
* Allowing functions in subcomponents
* Fixing test cases
* Adding dataset output param to run
* Fixing test cases
* Changes suggested by mfeurer
* Editing predict_proba function
* Test case fix
* Test case fix
* Edit unit test to bypass server issue
* Fixing unit test
* Reiterating with @PGijsbers comments
* Minor fixes to test cases
* Adding unit test and suggestions from @mfeurer
* Fixing test case for all sklearn versions
* Testing changes
* Fixing import in example
* Triggering unit tests
* Debugging failed example script
* Adding unit tests
* Push for debugging
* Push for @mfeurer to debug
* Resetting to debug
* Updating branch
* pre-commit fixes
* Handling failing examples
* Reiteration with clean-ups and minor fixes
* Closing comments
* Black fixes
* feedback from @mfeurer
* Minor fix
* suggestions from @PGijsbers

Co-authored-by: neeratyoy <[email protected]>
Co-authored-by: neeratyoy <[email protected]>

1 parent 88b7cc0 commit bf3cd2e

15 files changed: +560 −160 lines

.travis.yml

Lines changed: 15 additions & 14 deletions
@@ -15,20 +15,21 @@ env:
     - TEST_DIR=/tmp/test_dir/
     - MODULE=openml
   matrix:
-    - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.21.2" RUN_FLAKE8="true" SKIP_TESTS="true"
-    - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.21.2" COVERAGE="true" DOCPUSH="true"
-    - DISTRIB="conda" PYTHON_VERSION="3.8" SKLEARN_VERSION="0.23.1" TEST_DIST="true"
-    - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.23.1" TEST_DIST="true"
-    - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.23.1" TEST_DIST="true"
-    - DISTRIB="conda" PYTHON_VERSION="3.8" SKLEARN_VERSION="0.22.2" TEST_DIST="true"
-    - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.22.2" TEST_DIST="true"
-    - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.22.2" TEST_DIST="true"
-    - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.21.2" TEST_DIST="true"
-    - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.20.2"
-    # Checks for older scikit-learn versions (which also don't nicely work with
-    # Python3.7)
-    - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.19.2"
-    - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.18.2" SCIPY_VERSION=1.2.0
+    - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.23.1" COVERAGE="true" DOCPUSH="true" SKIP_TESTS="true"
+    - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.23.1" RUN_FLAKE8="true" SKIP_TESTS="true"
+    - DISTRIB="conda" PYTHON_VERSION="3.8" SKLEARN_VERSION="0.23.1" TEST_DIST="true"
+    - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.23.1" TEST_DIST="true"
+    - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.23.1" TEST_DIST="true"
+    - DISTRIB="conda" PYTHON_VERSION="3.8" SKLEARN_VERSION="0.22.2" TEST_DIST="true"
+    - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.22.2" TEST_DIST="true"
+    - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.22.2" TEST_DIST="true"
+    - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.21.2" TEST_DIST="true"
+    - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.21.2" TEST_DIST="true"
+    - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.20.2"
+    # Checks for older scikit-learn versions (which also don't nicely work with
+    # Python3.7)
+    - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.19.2"
+    - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.18.2" SCIPY_VERSION=1.2.0

   # Travis issue
   # https://github.com/travis-ci/travis-ci/issues/8920

examples/30_extended/datasets_tutorial.py

Lines changed: 1 addition & 1 deletion
@@ -66,7 +66,7 @@
 ############################################################################
 # Get the actual data.
 #
-# The dataset can be returned in 2 possible formats: as a NumPy array, a SciPy
+# The dataset can be returned in 3 possible formats: as a NumPy array, a SciPy
 # sparse matrix, or as a Pandas DataFrame. The format is
 # controlled with the parameter ``dataset_format`` which can be either 'array'
 # (default) or 'dataframe'. Let's first build our dataset from a NumPy array
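
A minimal sketch of the dataframe variant described above (dataset id 61, iris, is an illustrative assumption, not part of this diff):

import openml

dataset = openml.datasets.get_dataset(61)  # 61 = iris; illustrative choice
X, y, categorical_indicator, attribute_names = dataset.get_data(
    dataset_format="dataframe", target=dataset.default_target_attribute
)
print(X.dtypes)  # categorical features arrive as pandas 'category' columns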

examples/30_extended/flow_id_tutorial.py

Lines changed: 8 additions & 0 deletions
@@ -15,6 +15,11 @@

 import openml

+
+# Activating test server
+openml.config.start_using_configuration_for_example()
+
+
 clf = sklearn.tree.DecisionTreeClassifier()

 ####################################################################################################
@@ -69,3 +74,6 @@
 # This also works with the actual model (generalizing the first part of this example):
 flow_ids = openml.flows.get_flow_id(model=clf, exact_version=False)
 print(flow_ids)
+
+# Deactivating test server
+openml.config.stop_using_configuration_for_example()
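
The two added calls bracket the example so that any uploads go to the OpenML test server rather than production. A hedged sketch of the same pattern with explicit cleanup (the try/finally is our illustration, not part of the tutorial):

import openml

openml.config.start_using_configuration_for_example()
try:
    pass  # flow/run uploads in this block hit the test server
finally:
    openml.config.stop_using_configuration_for_example()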

examples/30_extended/run_setup_tutorial.py

Lines changed: 31 additions & 9 deletions
@@ -37,6 +37,11 @@
 import sklearn.ensemble
 import sklearn.impute
 import sklearn.preprocessing
+from sklearn.pipeline import make_pipeline, Pipeline
+from sklearn.compose import ColumnTransformer
+from sklearn.impute import SimpleImputer
+from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
+from sklearn.experimental import enable_hist_gradient_boosting


 openml.config.start_using_configuration_for_example()
@@ -52,22 +57,39 @@
 # we will create a fairly complex model, with many preprocessing components and
 # many potential hyperparameters. Of course, the model can be as complex and as
 # easy as you want it to be
-model_original = sklearn.pipeline.make_pipeline(
-    sklearn.impute.SimpleImputer(), sklearn.ensemble.RandomForestClassifier()
-)

+from sklearn.ensemble import HistGradientBoostingClassifier
+from sklearn.decomposition import TruncatedSVD
+
+
+# Helper functions to return required columns for ColumnTransformer
+def cont(X):
+    return X.dtypes != "category"
+
+
+def cat(X):
+    return X.dtypes == "category"
+
+
+cat_imp = make_pipeline(
+    SimpleImputer(strategy="most_frequent"),
+    OneHotEncoder(handle_unknown="ignore", sparse=False),
+    TruncatedSVD(),
+)
+ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", "passthrough", cont)])
+model_original = sklearn.pipeline.Pipeline(
+    steps=[("transform", ct), ("estimator", HistGradientBoostingClassifier()),]
+)

 # Let's change some hyperparameters. Of course, in any good application we
 # would tune them using, e.g., Random Search or Bayesian Optimization, but for
 # the purpose of this tutorial we set them to some specific values that might
 # or might not be optimal
 hyperparameters_original = {
-    "simpleimputer__strategy": "median",
-    "randomforestclassifier__criterion": "entropy",
-    "randomforestclassifier__max_features": 0.2,
-    "randomforestclassifier__min_samples_leaf": 1,
-    "randomforestclassifier__n_estimators": 16,
-    "randomforestclassifier__random_state": 42,
+    "estimator__loss": "auto",
+    "estimator__learning_rate": 0.15,
+    "estimator__max_iter": 50,
+    "estimator__min_samples_leaf": 1,
 }
 model_original.set_params(**hyperparameters_original)
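
The cat/cont helpers above work because ColumnTransformer accepts a callable as a column selector: it calls the function with the input DataFrame and picks columns from the returned boolean mask. A standalone sketch of that behaviour (the toy DataFrame is our own):

import pandas as pd

X = pd.DataFrame(
    {"age": [23.0, 31.0], "color": pd.Series(["red", "blue"], dtype="category")}
)
print((X.dtypes == "category").tolist())  # [False, True] -> columns routed to "cat"
print((X.dtypes != "category").tolist())  # [True, False] -> columns routed to "cont"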

examples/30_extended/study_tutorial.py

Lines changed: 29 additions & 8 deletions
@@ -17,8 +17,11 @@

 import numpy as np
 import sklearn.tree
-import sklearn.pipeline
-import sklearn.impute
+from sklearn.pipeline import make_pipeline, Pipeline
+from sklearn.compose import ColumnTransformer
+from sklearn.impute import SimpleImputer
+from sklearn.decomposition import TruncatedSVD
+from sklearn.preprocessing import OneHotEncoder, FunctionTransformer

 import openml

@@ -68,7 +71,7 @@
 )
 print(evaluations.head())

-############################################################################
+###########################################################from openml.testing import cat, cont#################
 # Uploading studies
 # =================
 #
@@ -78,12 +81,30 @@

 openml.config.start_using_configuration_for_example()

-# Very simple classifier which ignores the feature type
+# Model that can handle missing values
+from sklearn.experimental import enable_hist_gradient_boosting
+from sklearn.ensemble import HistGradientBoostingClassifier
+
+
+# Helper functions to return required columns for ColumnTransformer
+def cont(X):
+    return X.dtypes != "category"
+
+
+def cat(X):
+    return X.dtypes == "category"
+
+
+cat_imp = make_pipeline(
+    SimpleImputer(strategy="most_frequent"),
+    OneHotEncoder(handle_unknown="ignore", sparse=False),
+    TruncatedSVD(),
+)
+ct = ColumnTransformer(
+    [("cat", cat_imp, cat), ("cont", FunctionTransformer(lambda x: x, validate=False), cont)]
+)
 clf = sklearn.pipeline.Pipeline(
-    steps=[
-        ("imputer", sklearn.impute.SimpleImputer()),
-        ("estimator", sklearn.tree.DecisionTreeClassifier(max_depth=5)),
-    ]
+    steps=[("transform", ct), ("estimator", HistGradientBoostingClassifier()),]
 )

 suite = openml.study.get_suite(1)
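
From here the tutorial runs clf on the suite's tasks and collects run ids. A hedged sketch of that loop, assuming clf and suite as defined in the diff (one task keeps the sketch cheap):

import openml

run_ids = []
for task_id in suite.tasks[:1]:
    task = openml.tasks.get_task(task_id)
    run = openml.runs.run_model_on_task(clf, task)
    run.publish()  # uploads to the test server activated above
    run_ids.append(run.run_id)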

openml/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -113,6 +113,7 @@ def populate_cache(task_ids=None, dataset_ids=None, flow_ids=None, run_ids=None)
     "study",
     "utils",
     "_api_calls",
+    "__version__",
 ]

 # Load the scikit-learn extension by default
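
Adding "__version__" to __all__ makes the version string part of the package's declared public surface; a trivial check:

import openml

print(openml.__version__)  # now listed in openml.__all__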

openml/datasets/functions.py

Lines changed: 3 additions & 3 deletions
@@ -815,12 +815,12 @@ def edit_dataset(
 ) -> int:
     """
     Edits an OpenMLDataset.
-    Specify atleast one field to edit, apart from data_id
+    Specify at least one field to edit, apart from data_id
     - For certain fields, a new dataset version is created : attributes, data,
       default_target_attribute, ignore_attribute, row_id_attribute.

-    - For other fields, the uploader can edit the exisiting version.
-      Noone except the uploader can edit the exisitng version.
+    - For other fields, the uploader can edit the existing version.
+      No one except the uploader can edit the existing version.

     Parameters
     ----------
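
An illustrative call matching this docstring (the data_id and new description are assumptions; only the uploader may edit an existing version):

import openml

new_id = openml.datasets.edit_dataset(
    data_id=128, description="A clearer description of the dataset."
)
print(new_id)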

openml/exceptions.py

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@ def __init__(self, message: str, code: int = None, url: str = None):
         self.url = url
         super().__init__(message)

-    def __repr__(self):
+    def __str__(self):
         return "%s returned code %s: %s" % (self.url, self.code, self.message,)

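
Renaming __repr__ to __str__ changes what str(exc), print(exc), and most logging calls display. A simplified stand-in class (not the library code) showing the effect:

class ServerError(Exception):
    def __init__(self, message, code=None, url=None):
        self.message, self.code, self.url = message, code, url
        super().__init__(message)

    def __str__(self):
        return "%s returned code %s: %s" % (self.url, self.code, self.message)

err = ServerError("flow exists", code=171, url="https://www.openml.org/api/v1/xml/flow")
print(err)  # https://www.openml.org/api/v1/xml/flow returned code 171: flow exists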

openml/extensions/sklearn/extension.py

Lines changed: 57 additions & 30 deletions
@@ -11,7 +11,7 @@
 from re import IGNORECASE
 import sys
 import time
-from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union, cast
 import warnings

 import numpy as np
@@ -1546,7 +1546,7 @@ def _run_model_on_fold(
         fold_no: int,
         y_train: Optional[np.ndarray] = None,
         X_test: Optional[Union[np.ndarray, scipy.sparse.spmatrix, pd.DataFrame]] = None,
-    ) -> Tuple[np.ndarray, np.ndarray, "OrderedDict[str, float]", Optional[OpenMLRunTrace]]:
+    ) -> Tuple[np.ndarray, pd.DataFrame, "OrderedDict[str, float]", Optional[OpenMLRunTrace]]:
         """Run a model on a repeat,fold,subsample triplet of the task and return prediction
         information.

@@ -1579,24 +1579,21 @@ def _run_model_on_fold(

         Returns
         -------
-        arff_datacontent : List[List]
-            Arff representation (list of lists) of the predictions that were
-            generated by this fold (required to populate predictions.arff)
-        arff_tracecontent : List[List]
-            Arff representation (list of lists) of the trace data that was generated by this
-            fold
-            (will be used to populate trace.arff, leave it empty if the model did not perform
-            any
-            hyperparameter optimization).
+        pred_y : np.ndarray
+            Predictions on the training/test set, depending on the task type.
+            For supervised tasks, predicitons are on the test set.
+            For unsupervised tasks, predicitons are on the training set.
+        proba_y : pd.DataFrame
+            Predicted probabilities for the test set.
+            None, if task is not Classification or Learning Curve prediction.
         user_defined_measures : OrderedDict[str, float]
             User defined measures that were generated on this fold
-        model : Any
-            The model trained on this repeat,fold,subsample triple. Will be used to generate
-            trace
-            information later on (in ``obtain_arff_trace``).
+        trace : Optional[OpenMLRunTrace]]
+            arff trace object from a fitted model and the trace content obtained by
+            repeatedly calling ``run_model_on_task``
         """

-        def _prediction_to_probabilities(y: np.ndarray, classes: List[Any]) -> np.ndarray:
+        def _prediction_to_probabilities(y: np.ndarray, model_classes: List[Any]) -> pd.DataFrame:
            """Transforms predicted probabilities to match with OpenML class indices.

            Parameters
@@ -1609,16 +1606,31 @@ def _prediction_to_probabilities(y: np.ndarray, classes: List[Any]) -> np.ndarra

            Returns
            -------
-           np.ndarray
+           pd.DataFrame
           """
+
+           if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)):
+               if task.class_labels is not None:
+                   if isinstance(y_train, np.ndarray) and isinstance(task.class_labels[0], str):
+                       # mapping (decoding) the predictions to the categories
+                       # creating a separate copy to not change the expected pred_y type
+                       y = [task.class_labels[pred] for pred in y]
+               else:
+                   raise ValueError("The task has no class labels")
+           else:
+               return None
+
           # y: list or numpy array of predictions
           # model_classes: sklearn classifier mapping from original array id to
           # prediction index id
-          if not isinstance(classes, list):
-              raise ValueError("please convert model classes to list prior to " "calling this fn")
-          result = np.zeros((len(y), len(classes)), dtype=np.float32)
-          for obs, prediction_idx in enumerate(y):
-              result[obs][prediction_idx] = 1.0
+          if not isinstance(model_classes, list):
+              raise ValueError("please convert model classes to list prior to calling this fn")
+          # DataFrame allows more accurate mapping of classes as column names
+          result = pd.DataFrame(
+              0, index=np.arange(len(y)), columns=model_classes, dtype=np.float32
+          )
+          for obs, prediction in enumerate(y):
+              result.loc[obs, prediction] = 1.0
           return result

        if isinstance(task, OpenMLSupervisedTask):
@@ -1677,6 +1689,16 @@ def _prediction_to_probabilities(y: np.ndarray, classes: List[Any]) -> np.ndarra
            else:
                model_classes = used_estimator.classes_

+        if not isinstance(model_classes, list):
+            model_classes = model_classes.tolist()
+
+        # to handle the case when dataset is numpy and categories are encoded
+        # however the class labels stored in task are still categories
+        if isinstance(y_train, np.ndarray) and isinstance(
+            cast(List, task.class_labels)[0], str
+        ):
+            model_classes = [cast(List[str], task.class_labels)[i] for i in model_classes]
+
        modelpredict_start_cputime = time.process_time()
        modelpredict_start_walltime = time.time()

@@ -1708,9 +1730,10 @@ def _prediction_to_probabilities(y: np.ndarray, classes: List[Any]) -> np.ndarra

            try:
                proba_y = model_copy.predict_proba(X_test)
-            except AttributeError:
+                proba_y = pd.DataFrame(proba_y, columns=model_classes)  # handles X_test as numpy
+            except AttributeError:  # predict_proba is not available when probability=False
                if task.class_labels is not None:
-                    proba_y = _prediction_to_probabilities(pred_y, list(task.class_labels))
+                    proba_y = _prediction_to_probabilities(pred_y, model_classes)
                else:
                    raise ValueError("The task has no class labels")

@@ -1726,20 +1749,24 @@ def _prediction_to_probabilities(y: np.ndarray, classes: List[Any]) -> np.ndarra
                # then we need to add a column full of zeros into the probabilities
                # for class 3 because the rest of the library expects that the
                # probabilities are ordered the same way as the classes are ordered).
-                proba_y_new = np.zeros((proba_y.shape[0], len(task.class_labels)))
-                for idx, model_class in enumerate(model_classes):
-                    proba_y_new[:, model_class] = proba_y[:, idx]
-                proba_y = proba_y_new
-
-                if proba_y.shape[1] != len(task.class_labels):
                message = "Estimator only predicted for {}/{} classes!".format(
                    proba_y.shape[1], len(task.class_labels),
                )
                warnings.warn(message)
                openml.config.logger.warn(message)
+
+                for i, col in enumerate(task.class_labels):
+                    # adding missing columns with 0 probability
+                    if col not in model_classes:
+                        proba_y[col] = 0
+                proba_y = proba_y[task.class_labels]
            else:
                raise ValueError("The task has no class labels")

+        if not np.all(set(proba_y.columns) == set(task.class_labels)):
+            missing_cols = list(set(task.class_labels) - set(proba_y.columns))
+            raise ValueError("Predicted probabilities missing for the columns: ", missing_cols)
+
        elif isinstance(task, OpenMLRegressionTask):
            proba_y = None
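
A standalone sketch (not the library function itself) of the new DataFrame-based probability handling: hard predictions become one-hot rows with class labels as column names, so adding missing classes and reordering is plain column indexing:

import numpy as np
import pandas as pd

model_classes = ["setosa", "versicolor"]                   # classes the model saw
task_class_labels = ["setosa", "versicolor", "virginica"]  # all classes of the task
pred_y = ["setosa", "versicolor", "setosa"]

proba_y = pd.DataFrame(
    0, index=np.arange(len(pred_y)), columns=model_classes, dtype=np.float32
)
for obs, prediction in enumerate(pred_y):
    proba_y.loc[obs, prediction] = 1.0

for col in task_class_labels:  # add zero columns for never-predicted classes
    if col not in proba_y.columns:
        proba_y[col] = 0
proba_y = proba_y[task_class_labels]  # reorder to the task's label order
print(proba_y)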

openml/flows/flow.py

Lines changed: 7 additions & 1 deletion
@@ -263,7 +263,13 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]":
         for key in self.components:
             component_dict = OrderedDict()  # type: 'OrderedDict[str, Dict]'
             component_dict["oml:identifier"] = key
-            component_dict["oml:flow"] = self.components[key]._to_dict()["oml:flow"]
+            if self.components[key] in ["passthrough", "drop"]:
+                component_dict["oml:flow"] = {
+                    "oml-python:serialized_object": "component_reference",
+                    "value": {"key": self.components[key], "step_name": self.components[key]},
+                }
+            else:
+                component_dict["oml:flow"] = self.components[key]._to_dict()["oml:flow"]

             for key_ in component_dict:
                 # We only need to check if the key is a string, because the
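
This branch exists because scikit-learn accepts the strings "passthrough" and "drop" in place of estimator objects, and a plain string has no _to_dict(). A sketch of a model that now serializes as a flow (the column indices are illustrative):

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

ct = ColumnTransformer([
    ("keep", "passthrough", [0, 1]),  # string component, not an estimator
    ("discard", "drop", [2]),
])
model = Pipeline(steps=[("transform", ct), ("estimator", DecisionTreeClassifier())])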
