"""
=========
Run Setup
=========

By: Jan N. van Rijn

One of the key features of the openml-python library is that it allows you
to reinstantiate flows with hyperparameter settings that were uploaded
before. This tutorial uses the concept of setups. Although setups are not
extensively described in the OpenML documentation (because most users will
not use them directly), they form an important concept within OpenML,
distinguishing between hyperparameter configurations.
A setup is the combination of a flow with all its hyperparameters set.

A key requirement for reinstantiating a flow is to have the same scikit-learn
version as the flow that was uploaded. However, this tutorial will upload the
flow (that will later be reinstantiated) itself, so it can be run with any
scikit-learn version that is supported by this library. In this case, the
requirement of matching scikit-learn versions is automatically met.

In this tutorial we will:
    1) Create a flow and use it to solve a task;
    2) Download the flow, reinstantiate the model with the same
       hyperparameters, and solve the same task again;
    3) Verify that the obtained results are exactly the same.
"""
import logging
import numpy as np
import openml
import sklearn.ensemble
import sklearn.impute
import sklearn.pipeline
import sklearn.preprocessing


root = logging.getLogger()
root.setLevel(logging.INFO)
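
# Publishing runs (further below) requires an OpenML API key. As a minimal
# sketch, assuming you have an account on openml.org: the key can either be
# stored in the ~/.openml/config file or be set programmatically, e.g.
# openml.config.apikey = 'YOURKEY'  # placeholder, replace with your own key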

###############################################################################
# 1) Create a flow and use it to solve a task
###############################################################################

# first, let's download the task that we are interested in
task = openml.tasks.get_task(6)
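
# as a quick optional check of what we just downloaded, we can look up the
# dataset the task is defined on (task_id, dataset_id and get_dataset are
# assumed here from the openml-python task and dataset objects)
print('Task %d is defined on dataset: %s'
      % (task.task_id, openml.datasets.get_dataset(task.dataset_id).name))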


# we will create a fairly complex model, with many preprocessing components
# and many potential hyperparameters. Of course, the model can be as complex
# or as simple as you want it to be
model_original = sklearn.pipeline.make_pipeline(
    sklearn.impute.SimpleImputer(),
    sklearn.ensemble.RandomForestClassifier()
)


# Let's change some hyperparameters. Of course, in any good application we
# would tune them using, e.g., Random Search or Bayesian Optimization, but for
# the purpose of this tutorial we set them to some specific values that might
# or might not be optimal
hyperparameters_original = {
    'simpleimputer__strategy': 'median',
    'randomforestclassifier__criterion': 'entropy',
    'randomforestclassifier__max_features': 0.2,
    'randomforestclassifier__min_samples_leaf': 1,
    'randomforestclassifier__n_estimators': 16,
    'randomforestclassifier__random_state': 42,
}
model_original.set_params(**hyperparameters_original)
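
# as an optional sanity check, we can confirm with scikit-learn's standard
# get_params() that the values above were actually applied to the pipeline
for name, value in hyperparameters_original.items():
    assert model_original.get_params()[name] == value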

# solve the task and upload the result (this implicitly creates the flow)
run = openml.runs.run_model_on_task(
    model_original,
    task,
    avoid_duplicate_runs=False)
run_original = run.publish()  # this implicitly uploads the flow

###############################################################################
# 2) Download the flow, reinstantiate the model with the same hyperparameters,
#    and solve the same task again.
###############################################################################

# obtain the setup id (note that the setup id is assigned by the OpenML
# server; therefore it was not yet available in our local copy of the run)
run_downloaded = openml.runs.get_run(run_original.run_id)
setup_id = run_downloaded.setup_id
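
# if you want to see which hyperparameter values the server stored for this
# setup, the setups module also exposes get_setup; a small sketch (the exact
# attribute names on the returned objects are assumptions and may differ
# between openml-python versions):
setup_downloaded = openml.setups.get_setup(setup_id)
for parameter in setup_downloaded.parameters.values():
    print(parameter.parameter_name, parameter.value)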

# after this, we can easily reinstantiate the model
model_duplicate = openml.setups.initialize_model(setup_id)
# it will automatically have all the hyperparameters set
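
# as an optional check before rerunning, we can compare the reinstantiated
# model's hyperparameters against the values we set originally, again using
# scikit-learn's get_params(); this only inspects the two local objects
for name, value in hyperparameters_original.items():
    print(name, value, model_duplicate.get_params()[name])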

# and run the task again
run_duplicate = openml.runs.run_model_on_task(
    model_duplicate, task, avoid_duplicate_runs=False)


###############################################################################
# 3) Verify that the obtained results are exactly the same
###############################################################################

# the run has stored all predictions in the data_content field
np.testing.assert_array_equal(run_original.data_content,
                              run_duplicate.data_content)
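
# a stronger, optional check would be to publish run_duplicate as well and to
# compare the setup ids assigned by the server: the same flow with the same
# hyperparameter values should map to the same setup on OpenML. This is left
# as a commented-out sketch here, since it performs another upload:
# run_duplicate_published = run_duplicate.publish()
# assert openml.runs.get_run(run_duplicate_published.run_id).setup_id == setup_id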