 # License: BSD 3-Clause

 import openml
-import numpy as np
 from sklearn import compose, ensemble, impute, neighbors, preprocessing, pipeline, tree

 ############################################################################
...
 task = openml.tasks.get_task(403)

 # Build any classifier or pipeline
-clf = tree.ExtraTreeClassifier()
+clf = tree.DecisionTreeClassifier()

 # Run the flow
 run = openml.runs.run_model_on_task(clf, task)
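A run produced this way is normally published to the (test) server afterwards. A minimal sketch of that follow-up step, assuming a configured API key and relying on openml-python's run API (`publish()`, `openml_url`) as used in the library's examples:

    import openml
    from sklearn import tree

    openml.config.start_using_configuration_for_example()  # talk to the OpenML test server
    task = openml.tasks.get_task(403)
    clf = tree.DecisionTreeClassifier()
    run = openml.runs.run_model_on_task(clf, task)
    run.publish()  # uploads the flow and run; needs openml.config.apikey to be set
    print(f"Run was uploaded to {run.openml_url}")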
...
 # ############################
 #
 # When you need to handle 'dirty' data, build pipelines to model them automatically.
-task = openml.tasks.get_task(1)
+# To demonstrate this, we use the dataset `credit-a <https://test.openml.org/d/16>`_ via
+# `task <https://test.openml.org/t/96>`_, as it contains both numerical and categorical
+# variables as well as missing values in both.
+task = openml.tasks.get_task(96)

 # OpenML helper functions for sklearn can be plugged in directly for complicated pipelines
 from openml.extensions.sklearn import cat, cont
...
                 [
                     (
                         "categorical",
-                        pipeline.Pipeline(
-                            [
-                                ("Imputer", impute.SimpleImputer(strategy="most_frequent")),
-                                (
-                                    "Encoder",
-                                    preprocessing.OneHotEncoder(
-                                        sparse=False, handle_unknown="ignore"
-                                    ),
-                                ),
-                            ]
-                        ),
+                        preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore"),
                         cat,  # returns the categorical feature indices
                     ),
-                    ("continuous", "passthrough", cont),  # returns the numeric feature indices
+                    (
+                        "continuous",
+                        impute.SimpleImputer(strategy="median"),
+                        cont,
+                    ),  # returns the numeric feature indices
                 ]
             ),
         ),
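The transformer above sits inside a larger scikit-learn pipeline that this hunk does not show. A rough sketch of how the changed pieces could fit together; the classifier step and the final `run_model_on_task` call are illustrative assumptions, not part of this change:

    import openml
    from openml.extensions.sklearn import cat, cont
    from sklearn import compose, ensemble, impute, pipeline, preprocessing

    openml.config.start_using_configuration_for_example()  # use the OpenML test server
    task = openml.tasks.get_task(96)

    pipe = pipeline.Pipeline(
        steps=[
            (
                "Preprocessing",
                compose.ColumnTransformer(
                    [
                        # One-hot encode the categorical columns selected by `cat`.
                        # (`sparse=` matches the diff; scikit-learn >= 1.2 renames it `sparse_output=`.)
                        ("categorical", preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore"), cat),
                        # Median-impute the numeric columns selected by `cont`.
                        ("continuous", impute.SimpleImputer(strategy="median"), cont),
                    ]
                ),
            ),
            ("Classifier", ensemble.RandomForestClassifier()),
        ]
    )

    run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False)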
...
                 [
                     (
                         "categorical",
-                        pipeline.Pipeline(
-                            [
-                                ("Imputer", impute.SimpleImputer(strategy="most_frequent")),
-                                (
-                                    "Encoder",
-                                    preprocessing.OneHotEncoder(
-                                        sparse=False, handle_unknown="ignore"
-                                    ),
-                                ),
-                            ]
-                        ),
+                        preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore"),
                         categorical_feature_indices,
                     ),
-                    ("continuous", "passthrough", numeric_feature_indices),
+                    (
+                        "continuous",
+                        impute.SimpleImputer(strategy="median"),
+                        numeric_feature_indices,
+                    ),
                 ]
             ),
         ),
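The explicit `categorical_feature_indices` and `numeric_feature_indices` have to be derived from the dataset, which this hunk does not show. One possible way to compute them, assuming openml-python's `OpenMLDataset.get_data`, which returns the data along with a per-column categorical indicator:

    import openml

    openml.config.start_using_configuration_for_example()  # use the OpenML test server
    task = openml.tasks.get_task(96)
    dataset = task.get_dataset()
    X, y, categorical_indicator, attribute_names = dataset.get_data(
        target=dataset.default_target_attribute
    )
    # Column positions flagged as categorical go to the encoder, the rest to the imputer.
    categorical_feature_indices = [i for i, is_cat in enumerate(categorical_indicator) if is_cat]
    numeric_feature_indices = [i for i, is_cat in enumerate(categorical_indicator) if not is_cat]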
...
 task = openml.tasks.get_task(6)

 # The following lines can then be executed offline:
-run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False, upload_flow=False)
+run = openml.runs.run_model_on_task(
+    pipe, task, avoid_duplicate_runs=False, upload_flow=False, dataset_format="array",
+)

 # The run may be stored offline, and the flow will be stored along with it:
 run.to_filesystem(directory="myrun")
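Because the run was written to disk with `to_filesystem`, it can be reloaded and uploaded later, for example once an internet connection and API key are available. A short sketch using openml-python's `OpenMLRun.from_filesystem` counterpart:

    import openml

    # Reload the offline run (and its flow) from disk, then upload it.
    run = openml.runs.OpenMLRun.from_filesystem(directory="myrun")
    run.publish()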