Skip to content

Commit b59cc46

Browse files
author
Matthias Feurer
authored
Update documentation (#740)
* improve examples * update year in license file * fix unit test
1 parent 4c71d1d commit b59cc46

File tree

14 files changed

+94
-59
lines changed

14 files changed

+94
-59
lines changed

LICENSE

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
BSD 3-Clause License
22

3-
Copyright (c) 2014-2018, Matthias Feurer, Jan van Rijn, Andreas Müller,
3+
Copyright (c) 2014-2019, Matthias Feurer, Jan van Rijn, Andreas Müller,
44
Joaquin Vanschoren and others.
55
All rights reserved.
66

doc/conf.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import os
1616
import sys
1717
import sphinx_bootstrap_theme
18+
import time
1819
import openml
1920

2021
# If extensions (or modules to document with autodoc) are in another directory,
@@ -65,7 +66,7 @@
6566
# General information about the project.
6667
project = u'OpenML'
6768
copyright = (
68-
u'2014-2019, the OpenML-Python team.'
69+
u'2014-{}, the OpenML-Python team.'.format(time.strftime("%Y,%m,%d,%H,%M,%S").split(',')[0])
6970
)
7071

7172
# The version info for the project you're documenting, acts as replacement for

doc/index.rst

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,16 +21,12 @@ Example
2121
.. code:: python
2222
2323
import openml
24-
from sklearn import preprocessing, tree, pipeline
25-
26-
# Set the OpenML API Key which is required to upload your runs.
27-
# You can get your own API by signing up to OpenML.org.
28-
openml.config.apikey = 'ABC'
24+
from sklearn import impute, tree, pipeline
2925
3026
# Define a scikit-learn classifier or pipeline
3127
clf = pipeline.Pipeline(
3228
steps=[
33-
('imputer', preprocessing.Imputer()),
29+
('imputer', impute.SimpleImputer()),
3430
('estimator', tree.DecisionTreeClassifier())
3531
]
3632
)
@@ -39,10 +35,13 @@ Example
3935
task = openml.tasks.get_task(31)
4036
# Run the scikit-learn model on the task.
4137
run = openml.runs.run_model_on_task(clf, task)
42-
# Publish the experiment on OpenML (optional, requires an API key).
38+
# Publish the experiment on OpenML (optional, requires an API key).
39+
# You can get your own API key by signing up to OpenML.org.
4340
run.publish()
4441
print('View the run online: %s/run/%d' % (openml.config.server, run.run_id))
4542
43+
You can find more examples in our `examples gallery <examples/index.html>`_.
44+
4645
----------------------------
4746
How to get OpenML for python
4847
----------------------------

examples/fetch_evaluations_tutorial.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020

2121
############################################################################
2222
import openml
23-
from pprint import pprint
2423

2524
############################################################################
2625
# Listing evaluations
@@ -37,7 +36,7 @@
3736
output_format='dataframe')
3837

3938
# Querying the returned results for precision above 0.98
40-
pprint(evals[evals.value > 0.98])
39+
print(evals[evals.value > 0.98])
4140

4241
#############################################################################
4342
# Viewing a sample task
@@ -47,7 +46,7 @@
4746
# We will start by displaying a simple *supervised classification* task:
4847
task_id = 167140 # https://www.openml.org/t/167140
4948
task = openml.tasks.get_task(task_id)
50-
pprint(vars(task))
49+
print(task)
5150

5251
#############################################################################
5352
# Obtaining all the evaluations for the task
@@ -60,11 +59,11 @@
6059
evals = openml.evaluations.list_evaluations(function=metric, task=[task_id],
6160
output_format='dataframe')
6261
# Displaying the first 10 rows
63-
pprint(evals.head(n=10))
62+
print(evals.head(n=10))
6463
# Sorting the evaluations in decreasing order of the metric chosen
6564
evals = evals.sort_values(by='value', ascending=False)
6665
print("\nDisplaying head of sorted dataframe: ")
67-
pprint(evals.head())
66+
print(evals.head())
6867

6968
#############################################################################
7069
# Obtaining CDF of metric for chosen task
@@ -147,4 +146,4 @@ def plot_flow_compare(evaluations, top_n=10, metric='predictive_accuracy'):
147146
flow_ids = evals.flow_id.unique()[:top_n]
148147
flow_names = evals.flow_name.unique()[:top_n]
149148
for i in range(top_n):
150-
pprint((flow_ids[i], flow_names[i]))
149+
print((flow_ids[i], flow_names[i]))

examples/flows_and_runs_tutorial.py

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
"""
77

88
import openml
9-
from pprint import pprint
109
from sklearn import compose, ensemble, impute, neighbors, preprocessing, pipeline, tree
1110

1211
############################################################################
@@ -58,7 +57,7 @@
5857
# Run the flow
5958
run = openml.runs.run_model_on_task(clf, task)
6059

61-
# pprint(vars(run), depth=2)
60+
print(run)
6261

6362
############################################################################
6463
# Share the run on the OpenML server
@@ -75,17 +74,37 @@
7574
# We can now also inspect the flow object which was automatically created:
7675

7776
flow = openml.flows.get_flow(run.flow_id)
78-
pprint(vars(flow), depth=1)
77+
print(flow)
7978

8079
############################################################################
8180
# It also works with pipelines
8281
# ############################
8382
#
8483
# When you need to handle 'dirty' data, build pipelines to model them automatically.
85-
task = openml.tasks.get_task(115)
84+
task = openml.tasks.get_task(1)
85+
features = task.get_dataset().features
86+
nominal_feature_indices = [
87+
i for i in range(len(features))
88+
if features[i].name != task.target_name and features[i].data_type == 'nominal'
89+
]
8690
pipe = pipeline.Pipeline(steps=[
87-
('Imputer', impute.SimpleImputer(strategy='median')),
88-
('OneHotEncoder', preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore')),
91+
(
92+
'Preprocessing',
93+
compose.ColumnTransformer([
94+
('Nominal', pipeline.Pipeline(
95+
[
96+
('Imputer', impute.SimpleImputer(strategy='most_frequent')),
97+
(
98+
'Encoder',
99+
preprocessing.OneHotEncoder(
100+
sparse=False, handle_unknown='ignore',
101+
)
102+
),
103+
]),
104+
nominal_feature_indices,
105+
),
106+
]),
107+
),
89108
('Classifier', ensemble.RandomForestClassifier(n_estimators=10))
90109
])
91110

examples/introduction_tutorial.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""
22
Introduction
3-
===================
3+
============
44
55
An introduction to OpenML, followed up by a simple example.
66
"""
@@ -15,6 +15,8 @@
1515
# * Works seamlessly with scikit-learn and other libraries
1616
# * Large scale benchmarking, compare to state of the art
1717
#
18+
19+
############################################################################
1820
# Installation
1921
# ^^^^^^^^^^^^
2022
# Installation is done via ``pip``:
@@ -26,6 +28,8 @@
2628
# For further information, please check out the installation guide at
2729
# https://openml.github.io/openml-python/master/contributing.html#installation
2830
#
31+
32+
############################################################################
2933
# Authentication
3034
# ^^^^^^^^^^^^^^
3135
#
@@ -49,6 +53,7 @@
4953
# .. warning:: This example uploads data. For that reason, this example
5054
# connects to the test server instead. This prevents the live server from
5155
# crowding with example datasets, tasks, studies, and so on.
56+
5257
############################################################################
5358
import openml
5459
from sklearn import neighbors

examples/tasks_tutorial.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@
133133
############################################################################
134134
# Properties of the task are stored as member variables:
135135

136-
print(vars(task))
136+
print(task)
137137

138138
############################################################################
139139
# And:

openml/datasets/data_feature.py

Lines changed: 24 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,19 @@
11
class OpenMLDataFeature(object):
2-
"""Data Feature (a.k.a. Attribute) object.
2+
"""
3+
Data Feature (a.k.a. Attribute) object.
34
4-
Parameters
5-
----------
6-
index : int
7-
The index of this feature
8-
name : str
9-
Name of the feature
10-
data_type : str
11-
can be nominal, numeric, string, date (corresponds to arff)
12-
nominal_values : list(str)
13-
list of the possible values, in case of nominal attribute
14-
number_missing_values : int
15-
"""
5+
Parameters
6+
----------
7+
index : int
8+
The index of this feature
9+
name : str
10+
Name of the feature
11+
data_type : str
12+
can be nominal, numeric, string, date (corresponds to arff)
13+
nominal_values : list(str)
14+
list of the possible values, in case of nominal attribute
15+
number_missing_values : int
16+
"""
1617
LEGAL_DATA_TYPES = ['nominal', 'numeric', 'string', 'date']
1718

1819
def __init__(self, index, name, data_type, nominal_values,
@@ -22,8 +23,16 @@ def __init__(self, index, name, data_type, nominal_values,
2223
if data_type not in self.LEGAL_DATA_TYPES:
2324
raise ValueError('data type should be in %s, found: %s' %
2425
(str(self.LEGAL_DATA_TYPES), data_type))
25-
if nominal_values is not None and type(nominal_values) != list:
26-
raise ValueError('Nominal_values is of wrong datatype')
26+
if data_type == 'nominal':
27+
if nominal_values is None:
28+
raise TypeError('Dataset features require attribute `nominal_values` for nominal '
29+
'feature type.')
30+
elif not isinstance(nominal_values, list):
31+
raise TypeError('Argument `nominal_values` is of wrong datatype, should be list, '
32+
'but is {}'.format(type(nominal_values)))
33+
else:
34+
if nominal_values is not None:
35+
raise TypeError('Argument `nominal_values` must be None for non-nominal feature.')
2736
if type(number_missing_values) != int:
2837
raise ValueError('number_missing_values is of wrong datatype')
2938

openml/datasets/dataset.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,6 @@ def __init__(self, name, description, format=None,
153153

154154
if features is not None:
155155
self.features = {}
156-
# todo add nominal values (currently not in database)
157156
for idx, xmlfeature in enumerate(features['oml:feature']):
158157
nr_missing = xmlfeature.get('oml:number_of_missing_values', 0)
159158
feature = OpenMLDataFeature(int(xmlfeature['oml:index']),

openml/setups/setup.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,14 @@
44
class OpenMLSetup(object):
55
"""Setup object (a.k.a. Configuration).
66
7-
Parameters
8-
----------
9-
setup_id : int
10-
The OpenML setup id
11-
flow_id : int
12-
The flow that it is build upon
13-
parameters : dict
14-
The setting of the parameters
7+
Parameters
8+
----------
9+
setup_id : int
10+
The OpenML setup id
11+
flow_id : int
12+
The flow that it is built upon
13+
parameters : dict
14+
The setting of the parameters
1515
"""
1616

1717
def __init__(self, setup_id, flow_id, parameters):

0 commit comments

Comments
 (0)