Commit bf3cd2e

Dataframe run on task (#777)

Authored by amueller, Neeratyoy, and anonymous99199

Squashed commit message:

* run on tasks allows dataframes
* don't force third subcomponent part to be list
* Making DataFrame default behaviour for runs; Fixing test cases for the same
* Fixing PEP8 + Adding docstring to CustomImputer()
* run on tasks allows dataframes
* Attempting rebase
* Fixing test cases
* Trying test case fixes
* run on tasks allows dataframes
* don't force third subcomponent part to be list
* Making DataFrame default behaviour for runs; Fixing test cases for the same
* Fixing PEP8 + Adding docstring to CustomImputer()
* Attempting rebase
* Fixing test cases
* Trying test case fixes
* Allowing functions in subcomponents
* Fixing test cases
* Adding dataset output param to run
* Fixing test cases
* Changes suggested by mfeurer
* Editing predict_proba function
* Test case fix
* Test case fix
* Edit unit test to bypass server issue
* Fixing unit test
* Reiterating with @PGijsbers comments
* Minor fixes to test cases
* Adding unit test and suggestions from @mfeurer
* Fixing test case for all sklearn versions
* Testing changes
* Fixing import in example
* Triggering unit tests
* Debugging failed example script
* Adding unit tests
* Push for debugging
* Push for @mfeurer to debug
* Resetting to debug
* Updating branch
* pre-commit fixes
* Handling failing examples
* Reiteration with clean-ups and minor fixes
* Closing comments
* Black fixes
* feedback from @mfeurer
* Minor fix
* suggestions from @PGijsbers

Co-authored-by: neeratyoy <[email protected]>
Co-authored-by: neeratyoy <[email protected]>

1 parent 88b7cc0 commit bf3cd2e

15 files changed: +560 −160 lines

.travis.yml

Lines changed: 15 additions & 14 deletions
@@ -15,20 +15,21 @@ env:
     - TEST_DIR=/tmp/test_dir/
     - MODULE=openml
   matrix:
-    - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.21.2" RUN_FLAKE8="true" SKIP_TESTS="true"
-    - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.21.2" COVERAGE="true" DOCPUSH="true"
-    - DISTRIB="conda" PYTHON_VERSION="3.8" SKLEARN_VERSION="0.23.1" TEST_DIST="true"
-    - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.23.1" TEST_DIST="true"
-    - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.23.1" TEST_DIST="true"
-    - DISTRIB="conda" PYTHON_VERSION="3.8" SKLEARN_VERSION="0.22.2" TEST_DIST="true"
-    - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.22.2" TEST_DIST="true"
-    - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.22.2" TEST_DIST="true"
-    - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.21.2" TEST_DIST="true"
-    - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.20.2"
-    # Checks for older scikit-learn versions (which also don't nicely work with
-    # Python3.7)
-    - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.19.2"
-    - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.18.2" SCIPY_VERSION=1.2.0
+    - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.23.1" COVERAGE="true" DOCPUSH="true" SKIP_TESTS="true"
+    - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.23.1" RUN_FLAKE8="true" SKIP_TESTS="true"
+    - DISTRIB="conda" PYTHON_VERSION="3.8" SKLEARN_VERSION="0.23.1" TEST_DIST="true"
+    - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.23.1" TEST_DIST="true"
+    - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.23.1" TEST_DIST="true"
+    - DISTRIB="conda" PYTHON_VERSION="3.8" SKLEARN_VERSION="0.22.2" TEST_DIST="true"
+    - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.22.2" TEST_DIST="true"
+    - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.22.2" TEST_DIST="true"
+    - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.21.2" TEST_DIST="true"
+    - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.21.2" TEST_DIST="true"
+    - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.20.2"
+    # Checks for older scikit-learn versions (which also don't nicely work with
+    # Python3.7)
+    - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.19.2"
+    - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.18.2" SCIPY_VERSION=1.2.0

   # Travis issue
   # https://github.com/travis-ci/travis-ci/issues/8920

examples/30_extended/datasets_tutorial.py

Lines changed: 1 addition & 1 deletion
@@ -66,7 +66,7 @@
 ############################################################################
 # Get the actual data.
 #
-# The dataset can be returned in 2 possible formats: as a NumPy array, a SciPy
+# The dataset can be returned in 3 possible formats: as a NumPy array, a SciPy
 # sparse matrix, or as a Pandas DataFrame. The format is
 # controlled with the parameter ``dataset_format`` which can be either 'array'
 # (default) or 'dataframe'. Let's first build our dataset from a NumPy array
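
A minimal sketch of the dataframe variant described above (dataset id 61, iris, is an illustrative assumption, not part of this diff):

import openml

dataset = openml.datasets.get_dataset(61)  # 61 = iris; illustrative choice
X, y, categorical_indicator, attribute_names = dataset.get_data(
    dataset_format="dataframe", target=dataset.default_target_attribute
)
print(X.dtypes)  # categorical features arrive as pandas 'category' columns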

examples/30_extended/flow_id_tutorial.py

Lines changed: 8 additions & 0 deletions
@@ -15,6 +15,11 @@

 import openml

+
+# Activating test server
+openml.config.start_using_configuration_for_example()
+
+
 clf = sklearn.tree.DecisionTreeClassifier()

 ####################################################################################################
@@ -69,3 +74,6 @@
 # This also works with the actual model (generalizing the first part of this example):
 flow_ids = openml.flows.get_flow_id(model=clf, exact_version=False)
 print(flow_ids)
+
+# Deactivating test server
+openml.config.stop_using_configuration_for_example()
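
The two added calls bracket the example so that any uploads go to the OpenML test server rather than production. A hedged sketch of the same pattern with explicit cleanup (the try/finally is our illustration, not part of the tutorial):

import openml

openml.config.start_using_configuration_for_example()
try:
    pass  # flow/run uploads in this block hit the test server
finally:
    openml.config.stop_using_configuration_for_example()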

examples/30_extended/run_setup_tutorial.py

Lines changed: 31 additions & 9 deletions
@@ -37,6 +37,11 @@
 import sklearn.ensemble
 import sklearn.impute
 import sklearn.preprocessing
+from sklearn.pipeline import make_pipeline, Pipeline
+from sklearn.compose import ColumnTransformer
+from sklearn.impute import SimpleImputer
+from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
+from sklearn.experimental import enable_hist_gradient_boosting


 openml.config.start_using_configuration_for_example()
@@ -52,22 +57,39 @@
 # we will create a fairly complex model, with many preprocessing components and
 # many potential hyperparameters. Of course, the model can be as complex and as
 # easy as you want it to be
-model_original = sklearn.pipeline.make_pipeline(
-    sklearn.impute.SimpleImputer(), sklearn.ensemble.RandomForestClassifier()
-)

+from sklearn.ensemble import HistGradientBoostingClassifier
+from sklearn.decomposition import TruncatedSVD
+
+
+# Helper functions to return required columns for ColumnTransformer
+def cont(X):
+    return X.dtypes != "category"
+
+
+def cat(X):
+    return X.dtypes == "category"
+
+
+cat_imp = make_pipeline(
+    SimpleImputer(strategy="most_frequent"),
+    OneHotEncoder(handle_unknown="ignore", sparse=False),
+    TruncatedSVD(),
+)
+ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", "passthrough", cont)])
+model_original = sklearn.pipeline.Pipeline(
+    steps=[("transform", ct), ("estimator", HistGradientBoostingClassifier()),]
+)

 # Let's change some hyperparameters. Of course, in any good application we
 # would tune them using, e.g., Random Search or Bayesian Optimization, but for
 # the purpose of this tutorial we set them to some specific values that might
 # or might not be optimal
 hyperparameters_original = {
-    "simpleimputer__strategy": "median",
-    "randomforestclassifier__criterion": "entropy",
-    "randomforestclassifier__max_features": 0.2,
-    "randomforestclassifier__min_samples_leaf": 1,
-    "randomforestclassifier__n_estimators": 16,
-    "randomforestclassifier__random_state": 42,
+    "estimator__loss": "auto",
+    "estimator__learning_rate": 0.15,
+    "estimator__max_iter": 50,
+    "estimator__min_samples_leaf": 1,
 }
 model_original.set_params(**hyperparameters_original)
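
The cat/cont helpers above work because ColumnTransformer accepts a callable as a column selector: it calls the function with the input DataFrame and picks columns from the returned boolean mask. A standalone sketch of that behaviour (the toy DataFrame is our own):

import pandas as pd

X = pd.DataFrame(
    {"age": [23.0, 31.0], "color": pd.Series(["red", "blue"], dtype="category")}
)
print((X.dtypes == "category").tolist())  # [False, True] -> columns routed to "cat"
print((X.dtypes != "category").tolist())  # [True, False] -> columns routed to "cont"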

examples/30_extended/study_tutorial.py

Lines changed: 29 additions & 8 deletions
@@ -17,8 +17,11 @@

 import numpy as np
 import sklearn.tree
-import sklearn.pipeline
-import sklearn.impute
+from sklearn.pipeline import make_pipeline, Pipeline
+from sklearn.compose import ColumnTransformer
+from sklearn.impute import SimpleImputer
+from sklearn.decomposition import TruncatedSVD
+from sklearn.preprocessing import OneHotEncoder, FunctionTransformer

 import openml

@@ -68,7 +71,7 @@
 )
 print(evaluations.head())

-############################################################################
+###########################################################from openml.testing import cat, cont#################
 # Uploading studies
 # =================
 #
@@ -78,12 +81,30 @@

 openml.config.start_using_configuration_for_example()

-# Very simple classifier which ignores the feature type
+# Model that can handle missing values
+from sklearn.experimental import enable_hist_gradient_boosting
+from sklearn.ensemble import HistGradientBoostingClassifier
+
+
+# Helper functions to return required columns for ColumnTransformer
+def cont(X):
+    return X.dtypes != "category"
+
+
+def cat(X):
+    return X.dtypes == "category"
+
+
+cat_imp = make_pipeline(
+    SimpleImputer(strategy="most_frequent"),
+    OneHotEncoder(handle_unknown="ignore", sparse=False),
+    TruncatedSVD(),
+)
+ct = ColumnTransformer(
+    [("cat", cat_imp, cat), ("cont", FunctionTransformer(lambda x: x, validate=False), cont)]
+)
 clf = sklearn.pipeline.Pipeline(
-    steps=[
-        ("imputer", sklearn.impute.SimpleImputer()),
-        ("estimator", sklearn.tree.DecisionTreeClassifier(max_depth=5)),
-    ]
+    steps=[("transform", ct), ("estimator", HistGradientBoostingClassifier()),]
 )

 suite = openml.study.get_suite(1)
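
From here the tutorial runs clf on the suite's tasks and collects run ids. A hedged sketch of that loop, assuming clf and suite as defined in the diff (one task keeps the sketch cheap):

import openml

run_ids = []
for task_id in suite.tasks[:1]:
    task = openml.tasks.get_task(task_id)
    run = openml.runs.run_model_on_task(clf, task)
    run.publish()  # uploads to the test server activated above
    run_ids.append(run.run_id)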

openml/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -113,6 +113,7 @@ def populate_cache(task_ids=None, dataset_ids=None, flow_ids=None, run_ids=None)
     "study",
     "utils",
     "_api_calls",
+    "__version__",
 ]

 # Load the scikit-learn extension by default
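
Adding "__version__" to __all__ makes the version string part of the package's declared public surface; a trivial check:

import openml

print(openml.__version__)  # now listed in openml.__all__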

openml/datasets/functions.py

Lines changed: 3 additions & 3 deletions
@@ -815,12 +815,12 @@ def edit_dataset(
 ) -> int:
     """
     Edits an OpenMLDataset.
-    Specify atleast one field to edit, apart from data_id
+    Specify at least one field to edit, apart from data_id
     - For certain fields, a new dataset version is created : attributes, data,
       default_target_attribute, ignore_attribute, row_id_attribute.

-    - For other fields, the uploader can edit the exisiting version.
-      Noone except the uploader can edit the exisitng version.
+    - For other fields, the uploader can edit the existing version.
+      No one except the uploader can edit the existing version.

     Parameters
     ----------
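
An illustrative call matching this docstring (the data_id and new description are assumptions; only the uploader may edit an existing version):

import openml

new_id = openml.datasets.edit_dataset(
    data_id=128, description="A clearer description of the dataset."
)
print(new_id)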

openml/exceptions.py

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@ def __init__(self, message: str, code: int = None, url: str = None):
         self.url = url
         super().__init__(message)

-    def __repr__(self):
+    def __str__(self):
         return "%s returned code %s: %s" % (self.url, self.code, self.message,)

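
Renaming __repr__ to __str__ changes what str(exc), print(exc), and most logging calls display. A simplified stand-in class (not the library code) showing the effect:

class ServerError(Exception):
    def __init__(self, message, code=None, url=None):
        self.message, self.code, self.url = message, code, url
        super().__init__(message)

    def __str__(self):
        return "%s returned code %s: %s" % (self.url, self.code, self.message)

err = ServerError("flow exists", code=171, url="https://www.openml.org/api/v1/xml/flow")
print(err)  # https://www.openml.org/api/v1/xml/flow returned code 171: flow exists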

openml/extensions/sklearn/extension.py

Lines changed: 57 additions & 30 deletions
@@ -11,7 +11,7 @@
 from re import IGNORECASE
 import sys
 import time
-from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union, cast
 import warnings

 import numpy as np
@@ -1546,7 +1546,7 @@ def _run_model_on_fold(
         fold_no: int,
         y_train: Optional[np.ndarray] = None,
         X_test: Optional[Union[np.ndarray, scipy.sparse.spmatrix, pd.DataFrame]] = None,
-    ) -> Tuple[np.ndarray, np.ndarray, "OrderedDict[str, float]", Optional[OpenMLRunTrace]]:
+    ) -> Tuple[np.ndarray, pd.DataFrame, "OrderedDict[str, float]", Optional[OpenMLRunTrace]]:
         """Run a model on a repeat,fold,subsample triplet of the task and return prediction
         information.

@@ -1579,24 +1579,21 @@ def _run_model_on_fold(

         Returns
         -------
-        arff_datacontent : List[List]
-            Arff representation (list of lists) of the predictions that were
-            generated by this fold (required to populate predictions.arff)
-        arff_tracecontent : List[List]
-            Arff representation (list of lists) of the trace data that was generated by this
-            fold
-            (will be used to populate trace.arff, leave it empty if the model did not perform
-            any
-            hyperparameter optimization).
+        pred_y : np.ndarray
+            Predictions on the training/test set, depending on the task type.
+            For supervised tasks, predicitons are on the test set.
+            For unsupervised tasks, predicitons are on the training set.
+        proba_y : pd.DataFrame
+            Predicted probabilities for the test set.
+            None, if task is not Classification or Learning Curve prediction.
         user_defined_measures : OrderedDict[str, float]
             User defined measures that were generated on this fold
-        model : Any
-            The model trained on this repeat,fold,subsample triple. Will be used to generate
-            trace
-            information later on (in ``obtain_arff_trace``).
+        trace : Optional[OpenMLRunTrace]]
+            arff trace object from a fitted model and the trace content obtained by
+            repeatedly calling ``run_model_on_task``
         """

-        def _prediction_to_probabilities(y: np.ndarray, classes: List[Any]) -> np.ndarray:
+        def _prediction_to_probabilities(y: np.ndarray, model_classes: List[Any]) -> pd.DataFrame:
            """Transforms predicted probabilities to match with OpenML class indices.

            Parameters
@@ -1609,16 +1606,31 @@ def _prediction_to_probabilities(y: np.ndarray, classes: List[Any]) -> np.ndarra

            Returns
            -------
-           np.ndarray
+           pd.DataFrame
           """
+
+           if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)):
+               if task.class_labels is not None:
+                   if isinstance(y_train, np.ndarray) and isinstance(task.class_labels[0], str):
+                       # mapping (decoding) the predictions to the categories
+                       # creating a separate copy to not change the expected pred_y type
+                       y = [task.class_labels[pred] for pred in y]
+               else:
+                   raise ValueError("The task has no class labels")
+           else:
+               return None
+
           # y: list or numpy array of predictions
           # model_classes: sklearn classifier mapping from original array id to
           # prediction index id
-          if not isinstance(classes, list):
-              raise ValueError("please convert model classes to list prior to " "calling this fn")
-          result = np.zeros((len(y), len(classes)), dtype=np.float32)
-          for obs, prediction_idx in enumerate(y):
-              result[obs][prediction_idx] = 1.0
+          if not isinstance(model_classes, list):
+              raise ValueError("please convert model classes to list prior to calling this fn")
+          # DataFrame allows more accurate mapping of classes as column names
+          result = pd.DataFrame(
+              0, index=np.arange(len(y)), columns=model_classes, dtype=np.float32
+          )
+          for obs, prediction in enumerate(y):
+              result.loc[obs, prediction] = 1.0
           return result

        if isinstance(task, OpenMLSupervisedTask):
@@ -1677,6 +1689,16 @@ def _prediction_to_probabilities(y: np.ndarray, classes: List[Any]) -> np.ndarra
            else:
                model_classes = used_estimator.classes_

+        if not isinstance(model_classes, list):
+            model_classes = model_classes.tolist()
+
+        # to handle the case when dataset is numpy and categories are encoded
+        # however the class labels stored in task are still categories
+        if isinstance(y_train, np.ndarray) and isinstance(
+            cast(List, task.class_labels)[0], str
+        ):
+            model_classes = [cast(List[str], task.class_labels)[i] for i in model_classes]
+
        modelpredict_start_cputime = time.process_time()
        modelpredict_start_walltime = time.time()

@@ -1708,9 +1730,10 @@ def _prediction_to_probabilities(y: np.ndarray, classes: List[Any]) -> np.ndarra

            try:
                proba_y = model_copy.predict_proba(X_test)
-            except AttributeError:
+                proba_y = pd.DataFrame(proba_y, columns=model_classes)  # handles X_test as numpy
+            except AttributeError:  # predict_proba is not available when probability=False
                if task.class_labels is not None:
-                    proba_y = _prediction_to_probabilities(pred_y, list(task.class_labels))
+                    proba_y = _prediction_to_probabilities(pred_y, model_classes)
                else:
                    raise ValueError("The task has no class labels")

@@ -1726,20 +1749,24 @@ def _prediction_to_probabilities(y: np.ndarray, classes: List[Any]) -> np.ndarra
                # then we need to add a column full of zeros into the probabilities
                # for class 3 because the rest of the library expects that the
                # probabilities are ordered the same way as the classes are ordered).
-                proba_y_new = np.zeros((proba_y.shape[0], len(task.class_labels)))
-                for idx, model_class in enumerate(model_classes):
-                    proba_y_new[:, model_class] = proba_y[:, idx]
-                proba_y = proba_y_new
-
-                if proba_y.shape[1] != len(task.class_labels):
                message = "Estimator only predicted for {}/{} classes!".format(
                    proba_y.shape[1], len(task.class_labels),
                )
                warnings.warn(message)
                openml.config.logger.warn(message)
+
+                for i, col in enumerate(task.class_labels):
+                    # adding missing columns with 0 probability
+                    if col not in model_classes:
+                        proba_y[col] = 0
+                proba_y = proba_y[task.class_labels]
            else:
                raise ValueError("The task has no class labels")

+        if not np.all(set(proba_y.columns) == set(task.class_labels)):
+            missing_cols = list(set(task.class_labels) - set(proba_y.columns))
+            raise ValueError("Predicted probabilities missing for the columns: ", missing_cols)
+
        elif isinstance(task, OpenMLRegressionTask):
            proba_y = None
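
A standalone sketch (not the library function itself) of the new DataFrame-based probability handling: hard predictions become one-hot rows with class labels as column names, so adding missing classes and reordering is plain column indexing:

import numpy as np
import pandas as pd

model_classes = ["setosa", "versicolor"]                   # classes the model saw
task_class_labels = ["setosa", "versicolor", "virginica"]  # all classes of the task
pred_y = ["setosa", "versicolor", "setosa"]

proba_y = pd.DataFrame(
    0, index=np.arange(len(pred_y)), columns=model_classes, dtype=np.float32
)
for obs, prediction in enumerate(pred_y):
    proba_y.loc[obs, prediction] = 1.0

for col in task_class_labels:  # add zero columns for never-predicted classes
    if col not in proba_y.columns:
        proba_y[col] = 0
proba_y = proba_y[task_class_labels]  # reorder to the task's label order
print(proba_y)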

openml/flows/flow.py

Lines changed: 7 additions & 1 deletion
@@ -263,7 +263,13 @@ def _to_dict(self) -> "OrderedDict[str, OrderedDict]":
         for key in self.components:
             component_dict = OrderedDict()  # type: 'OrderedDict[str, Dict]'
             component_dict["oml:identifier"] = key
-            component_dict["oml:flow"] = self.components[key]._to_dict()["oml:flow"]
+            if self.components[key] in ["passthrough", "drop"]:
+                component_dict["oml:flow"] = {
+                    "oml-python:serialized_object": "component_reference",
+                    "value": {"key": self.components[key], "step_name": self.components[key]},
+                }
+            else:
+                component_dict["oml:flow"] = self.components[key]._to_dict()["oml:flow"]

             for key_ in component_dict:
                 # We only need to check if the key is a string, because the
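
This branch exists because scikit-learn accepts the strings "passthrough" and "drop" in place of estimator objects, and a plain string has no _to_dict(). A sketch of a model that now serializes as a flow (the column indices are illustrative):

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

ct = ColumnTransformer([
    ("keep", "passthrough", [0, 1]),  # string component, not an estimator
    ("discard", "drop", [2]),
])
model = Pipeline(steps=[("transform", ct), ("estimator", DecisionTreeClassifier())])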
