Update documentation (#740)

Matthias Feurer · web-flow · commit b59cc461f8ed · 2019-07-26T16:27:21.000+02:00
* improve examples

* update year in license file

* fix unit test
diff --git a/LICENSE b/LICENSE
@@ -1,6 +1,6 @@
 BSD 3-Clause License
 
-Copyright (c) 2014-2018, Matthias Feurer, Jan van Rijn, Andreas Müller, 
+Copyright (c) 2014-2019, Matthias Feurer, Jan van Rijn, Andreas Müller, 
 Joaquin Vanschoren and others.
 All rights reserved.
 
diff --git a/doc/conf.py b/doc/conf.py
@@ -15,6 +15,7 @@
 import os
 import sys
 import sphinx_bootstrap_theme
+import time
 import openml
 
 # If extensions (or modules to document with autodoc) are in another directory,
@@ -65,7 +66,7 @@
 # General information about the project.
 project = u'OpenML'
 copyright = (
-    u'2014-2019, the OpenML-Python team.'
+    u'2014-{}, the OpenML-Python team.'.format(time.strftime("%Y,%m,%d,%H,%M,%S").split(',')[0])
 )
 
 # The version info for the project you're documenting, acts as replacement for
diff --git a/doc/index.rst b/doc/index.rst
@@ -21,16 +21,12 @@ Example
 .. code:: python
 
     import openml
-    from sklearn import preprocessing, tree, pipeline
-
-    # Set the OpenML API Key which is required to upload your runs.
-    # You can get your own API by signing up to OpenML.org.
-    openml.config.apikey = 'ABC'
+    from sklearn import impute, tree, pipeline
 
     # Define a scikit-learn classifier or pipeline
     clf = pipeline.Pipeline(
         steps=[
-            ('imputer', preprocessing.Imputer()),
+            ('imputer', impute.SimpleImputer()),
             ('estimator', tree.DecisionTreeClassifier())
         ]
     )
@@ -39,10 +35,13 @@ Example
     task = openml.tasks.get_task(31)
     # Run the scikit-learn model on the task.
     run = openml.runs.run_model_on_task(clf, task)
-    # Publish the experiment on OpenML (optional, requires an API key).
+    # Publish the experiment on OpenML (optional, requires an API key).
+    # You can get your own API key by signing up to OpenML.org.
     run.publish()
     print('View the run online: %s/run/%d' % (openml.config.server, run.run_id))
 
+You can find more examples in our `examples gallery <examples/index.html>`_.
+
 ----------------------------
 How to get OpenML for python
 ----------------------------
diff --git a/examples/fetch_evaluations_tutorial.py b/examples/fetch_evaluations_tutorial.py
@@ -20,7 +20,6 @@
 
 ############################################################################
 import openml
-from pprint import pprint
 
 ############################################################################
 # Listing evaluations
@@ -37,7 +36,7 @@
                                             output_format='dataframe')
 
 # Querying the returned results for precision above 0.98
-pprint(evals[evals.value > 0.98])
+print(evals[evals.value > 0.98])
 
 #############################################################################
 # Viewing a sample task
@@ -47,7 +46,7 @@
 # We will start by displaying a simple *supervised classification* task:
 task_id = 167140        # https://www.openml.org/t/167140
 task = openml.tasks.get_task(task_id)
-pprint(vars(task))
+print(task)
 
 #############################################################################
 # Obtaining all the evaluations for the task
@@ -60,11 +59,11 @@
 evals = openml.evaluations.list_evaluations(function=metric, task=[task_id],
                                             output_format='dataframe')
 # Displaying the first 10 rows
-pprint(evals.head(n=10))
+print(evals.head(n=10))
 # Sorting the evaluations in decreasing order of the metric chosen
 evals = evals.sort_values(by='value', ascending=False)
 print("\nDisplaying head of sorted dataframe: ")
-pprint(evals.head())
+print(evals.head())
 
 #############################################################################
 # Obtaining CDF of metric for chosen task
@@ -147,4 +146,4 @@ def plot_flow_compare(evaluations, top_n=10, metric='predictive_accuracy'):
 flow_ids = evals.flow_id.unique()[:top_n]
 flow_names = evals.flow_name.unique()[:top_n]
 for i in range(top_n):
-    pprint((flow_ids[i], flow_names[i]))
+    print((flow_ids[i], flow_names[i]))
diff --git a/examples/flows_and_runs_tutorial.py b/examples/flows_and_runs_tutorial.py
@@ -6,7 +6,6 @@
 """
 
 import openml
-from pprint import pprint
 from sklearn import compose, ensemble, impute, neighbors, preprocessing, pipeline, tree
 
 ############################################################################
@@ -58,7 +57,7 @@
 # Run the flow
 run = openml.runs.run_model_on_task(clf, task)
 
-# pprint(vars(run), depth=2)
+print(run)
 
 ############################################################################
 # Share the run on the OpenML server
@@ -75,17 +74,37 @@
 # We can now also inspect the flow object which was automatically created:
 
 flow = openml.flows.get_flow(run.flow_id)
-pprint(vars(flow), depth=1)
+print(flow)
 
 ############################################################################
 # It also works with pipelines
 # ############################
 #
 # When you need to handle 'dirty' data, build pipelines to model then automatically.
-task = openml.tasks.get_task(115)
+task = openml.tasks.get_task(1)
+features = task.get_dataset().features
+nominal_feature_indices = [
+    i for i in range(len(features))
+    if features[i].name != task.target_name and features[i].data_type == 'nominal'
+]
 pipe = pipeline.Pipeline(steps=[
-    ('Imputer', impute.SimpleImputer(strategy='median')),
-    ('OneHotEncoder', preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore')),
+    (
+        'Preprocessing',
+        compose.ColumnTransformer([
+            ('Nominal', pipeline.Pipeline(
+                [
+                    ('Imputer', impute.SimpleImputer(strategy='most_frequent')),
+                    (
+                        'Encoder',
+                        preprocessing.OneHotEncoder(
+                            sparse=False, handle_unknown='ignore',
+                        )
+                    ),
+                ]),
+                nominal_feature_indices,
+             ),
+        ]),
+    ),
     ('Classifier', ensemble.RandomForestClassifier(n_estimators=10))
 ])
 
diff --git a/examples/introduction_tutorial.py b/examples/introduction_tutorial.py
@@ -1,6 +1,6 @@
 """
 Introduction
-===================
+============
 
 An introduction to OpenML, followed up by a simple example.
 """
@@ -15,6 +15,8 @@
 # * Works seamlessly with scikit-learn and other libraries
 # * Large scale benchmarking, compare to state of the art
 #
+
+############################################################################
 # Installation
 # ^^^^^^^^^^^^
 # Installation is done via ``pip``:
@@ -26,6 +28,8 @@
 # For further information, please check out the installation guide at
 # https://openml.github.io/openml-python/master/contributing.html#installation
 #
+
+############################################################################
 # Authentication
 # ^^^^^^^^^^^^^^
 #
@@ -49,6 +53,7 @@
 # .. warning:: This example uploads data. For that reason, this example
 #   connects to the test server instead. This prevents the live server from
 #   crowding with example datasets, tasks, studies, and so on.
+
 ############################################################################
 import openml
 from sklearn import neighbors
diff --git a/examples/tasks_tutorial.py b/examples/tasks_tutorial.py
@@ -133,7 +133,7 @@
 ############################################################################
 # Properties of the task are stored as member variables:
 
-print(vars(task))
+print(task)
 
 ############################################################################
 # And:
diff --git a/openml/datasets/data_feature.py b/openml/datasets/data_feature.py
@@ -1,18 +1,19 @@
 class OpenMLDataFeature(object):
-    """Data Feature (a.k.a. Attribute) object.
+    """
+    Data Feature (a.k.a. Attribute) object.
 
-       Parameters
-       ----------
-       index : int
-            The index of this feature
-        name : str
-            Name of the feature
-        data_type : str
-            can be nominal, numeric, string, date (corresponds to arff)
-        nominal_values : list(str)
-            list of the possible values, in case of nominal attribute
-        number_missing_values : int
-       """
+    Parameters
+    ----------
+    index : int
+        The index of this feature
+    name : str
+        Name of the feature
+    data_type : str
+        can be nominal, numeric, string, date (corresponds to arff)
+    nominal_values : list(str)
+        list of the possible values, in case of nominal attribute
+    number_missing_values : int
+    """
     LEGAL_DATA_TYPES = ['nominal', 'numeric', 'string', 'date']
 
     def __init__(self, index, name, data_type, nominal_values,
@@ -22,8 +23,16 @@ def __init__(self, index, name, data_type, nominal_values,
         if data_type not in self.LEGAL_DATA_TYPES:
             raise ValueError('data type should be in %s, found: %s' %
                              (str(self.LEGAL_DATA_TYPES), data_type))
-        if nominal_values is not None and type(nominal_values) != list:
-            raise ValueError('Nominal_values is of wrong datatype')
+        if data_type == 'nominal':
+            if nominal_values is None:
+                raise TypeError('Dataset features require attribute `nominal_values` for nominal '
+                                'feature type.')
+            elif not isinstance(nominal_values, list):
+                raise TypeError('Argument `nominal_values` is of wrong datatype, should be list, '
+                                'but is {}'.format(type(nominal_values)))
+        else:
+            if nominal_values is not None:
+                raise TypeError('Argument `nominal_values` must be None for non-nominal feature.')
         if type(number_missing_values) != int:
             raise ValueError('number_missing_values is of wrong datatype')
 
diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py
@@ -153,7 +153,6 @@ def __init__(self, name, description, format=None,
 
         if features is not None:
             self.features = {}
-            # todo add nominal values (currently not in database)
             for idx, xmlfeature in enumerate(features['oml:feature']):
                 nr_missing = xmlfeature.get('oml:number_of_missing_values', 0)
                 feature = OpenMLDataFeature(int(xmlfeature['oml:index']),
diff --git a/openml/setups/setup.py b/openml/setups/setup.py
@@ -4,14 +4,14 @@
 class OpenMLSetup(object):
     """Setup object (a.k.a. Configuration).
 
-       Parameters
-       ----------
-       setup_id : int
-            The OpenML setup id
-       flow_id : int
-            The flow that it is build upon
-        parameters : dict
-            The setting of the parameters
+    Parameters
+    ----------
+    setup_id : int
+        The OpenML setup id
+    flow_id : int
+        The flow that it is built upon
+    parameters : dict
+        The setting of the parameters
     """
 
     def __init__(self, setup_id, flow_id, parameters):
diff --git a/openml/study/functions.py b/openml/study/functions.py
@@ -182,8 +182,8 @@ def create_study(
     where the runs are the main entity (collection consists of runs and all
     entities (flows, tasks, etc) that are related to these runs)
 
-    Parameters:
-    -----------
+    Parameters
+    ----------
     alias : str (optional)
         a string ID, unique on server (url-friendly)
     benchmark_suite : int (optional)
@@ -195,8 +195,8 @@ def create_study(
     run_ids : list
         a list of run ids associated with this study
 
-    Returns:
-    --------
+    Returns
+    -------
     OpenMLStudy
         A local OpenML study object (call publish method to upload to server)
     """
@@ -228,8 +228,8 @@ def create_benchmark_suite(
     Creates an OpenML benchmark suite (collection of entity types, where
     the tasks are the linked entity)
 
-    Parameters:
-    -----------
+    Parameters
+    ----------
     alias : str (optional)
         a string ID, unique on server (url-friendly)
     name : str
@@ -239,8 +239,8 @@ def create_benchmark_suite(
     task_ids : list
         a list of task ids associated with this study
 
-    Returns:
-    --------
+    Returns
+    -------
     OpenMLStudy
         A local OpenML study object (call publish method to upload to server)
     """
diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py
@@ -133,14 +133,14 @@ def list_tasks(
 ) -> Union[Dict, pd.DataFrame]:
     """
     Return a number of tasks having the given tag and task_type_id
+
     Parameters
     ----------
     Filter task_type_id is separated from the other filters because
     it is used as task_type_id in the task description, but it is named
     type when used as a filter in list tasks call.
     task_type_id : int, optional
-        ID of the task type as detailed
-        `here <https://www.openml.org/search?type=task_type>`_.
+        ID of the task type as detailed `here <https://www.openml.org/search?type=task_type>`_.
         - Supervised classification: 1
         - Supervised regression: 2
         - Learning curve: 3
@@ -362,7 +362,7 @@ def get_task(task_id: int, download_data: bool = True) -> OpenMLTask:
         # List of class labels availaible in dataset description
         # Including class labels as part of task meta data handles
         #   the case where data download was initially disabled
-        if isinstance(task, OpenMLClassificationTask):
+        if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)):
             task.class_labels = \
                 dataset.retrieve_class_labels(task.target_name)
         # Clustering tasks do not have class labels
diff --git a/openml/testing.py b/openml/testing.py
@@ -73,7 +73,9 @@ def setUp(self, n_levels: int = 1):
             self.static_cache_dir = os.path.join(static_cache_dir, 'files')
 
         if self.static_cache_dir is None:
-            raise ValueError('Cannot find test cache dir!')
+            raise ValueError(
+                'Cannot find test cache dir, expected it to be {}!'.format(static_cache_dir)
+            )
 
         self.cwd = os.getcwd()
         workdir = os.path.dirname(os.path.abspath(__file__))
diff --git a/tests/files/org/openml/test/datasets/-1/features.xml b/tests/files/org/openml/test/datasets/-1/features.xml
@@ -180003,6 +180003,8 @@
     <oml:index>20000</oml:index>
     <oml:name>class</oml:name>
     <oml:data_type>nominal</oml:data_type>
+    <oml:nominal_value>-1</oml:nominal_value>
+    <oml:nominal_value>1</oml:nominal_value>
     <oml:is_target>false</oml:is_target>
     <oml:is_ignore>false</oml:is_ignore>
     <oml:is_row_identifier>false</oml:is_row_identifier>