
Commit 1f5d6a2

joaquinvanschoren authored and Matthias Feurer committed
Added documentation for creating tasks (#719)
* Added documentation for creating tasks
* PEP8 fix
* Pleasing PEP8
* Pleasing PEP8
* bugfix
* use test server IDs
* Upload new dataset to properly test task creation
* fixing dataset upload
* trailing whitespace madness
* fix unit test: it failed when the random task already existed
* Update test_clustering_task.py
* PEP8
* activate dataset
* Resolved review comments and reworked example
* Making suggested changes; Removing pprint; Using numpy to filter
* Returning to prod server after example
1 parent b9df112 commit 1f5d6a2

2 files changed: +83 -9 lines changed

examples/tasks_tutorial.py

Lines changed: 83 additions & 8 deletions
@@ -7,7 +7,6 @@
 
 import openml
 import pandas as pd
-from pprint import pprint
 
 ############################################################################
 #
@@ -40,11 +39,11 @@
 tasks = pd.DataFrame.from_dict(tasks, orient='index')
 print(tasks.columns)
 print("First 5 of %s tasks:" % len(tasks))
-pprint(tasks.head())
+print(tasks.head())
 
 # The same can be obtained through lesser lines of code
 tasks_df = openml.tasks.list_tasks(task_type_id=1, output_format='dataframe')
-pprint(tasks_df.head())
+print(tasks_df.head())
 
 ############################################################################
 # We can filter the list of tasks to only contain datasets with more than
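The filter referenced in that last context line is applied a few lines further down in the tutorial, outside this hunk. As a rough illustration only, here is a minimal sketch of such a filter, assuming the task listing exposes a NumberOfInstances column (column names can vary between openml-python versions):

import openml

# Sketch (assumption): keep only tasks whose dataset has more than 500 instances.
tasks_df = openml.tasks.list_tasks(task_type_id=1, output_format='dataframe')
filtered = tasks_df.query('NumberOfInstances > 500')
print("%d of %d tasks have more than 500 instances" % (len(filtered), len(tasks_df)))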
@@ -78,22 +77,22 @@
 tasks = openml.tasks.list_tasks(tag='OpenML100')
 tasks = pd.DataFrame.from_dict(tasks, orient='index')
 print("First 5 of %s tasks:" % len(tasks))
-pprint(tasks.head())
+print(tasks.head())
 
 ############################################################################
 # Furthermore, we can list tasks based on the dataset id:
 
 tasks = openml.tasks.list_tasks(data_id=1471)
 tasks = pd.DataFrame.from_dict(tasks, orient='index')
 print("First 5 of %s tasks:" % len(tasks))
-pprint(tasks.head())
+print(tasks.head())
 
 ############################################################################
 # In addition, a size limit and an offset can be applied both separately and simultaneously:
 
 tasks = openml.tasks.list_tasks(size=10, offset=50)
 tasks = pd.DataFrame.from_dict(tasks, orient='index')
-pprint(tasks)
+print(tasks)
 
 ############################################################################
 #
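The hunk above exercises the tag, data_id, size and offset filters one at a time; they can also be combined in a single list_tasks call. A brief sketch (the tid/did/name columns selected at the end are an assumption; inspect page.columns for your openml-python version):

import openml

# Sketch: page through classification tasks five at a time, starting at offset 10.
page = openml.tasks.list_tasks(task_type_id=1, size=5, offset=10,
                               output_format='dataframe')
print(page[['tid', 'did', 'name']])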
@@ -134,11 +133,87 @@
 ############################################################################
 # Properties of the task are stored as member variables:
 
-pprint(vars(task))
+print(vars(task))
 
 ############################################################################
 # And:
 
 ids = [2, 1891, 31, 9983]
 tasks = openml.tasks.get_tasks(ids)
-pprint(tasks[0])
+print(tasks[0])
+
+############################################################################
+# Creating tasks
+# ^^^^^^^^^^^^^^
+#
+# You can also create new tasks. Take the following into account:
+#
+# * You can only create tasks on _active_ datasets.
+# * For now, only the following task types are supported: classification,
+#   regression, clustering, and learning curve analysis.
+# * For now, tasks can only be created on a single dataset.
+# * The exact same task must not already exist.
+#
+# Creating a task requires the following input:
+#
+# * task_type_id: The task type ID (see below). Required.
+# * dataset_id: The dataset ID. Required.
+# * target_name: The name of the attribute you aim to predict.
+#   Optional.
+# * estimation_procedure_id: The ID of the estimation procedure used to create
+#   train-test splits. Optional.
+# * evaluation_measure: The name of the evaluation measure. Optional.
+# * Any additional inputs for specific tasks.
+#
+# It is best to leave the evaluation measure open if there is no strong prerequisite for a
+# specific measure. OpenML will always compute all appropriate measures and you can filter
+# or sort results on your favourite measure afterwards. Only add an evaluation measure if
+# necessary (e.g. when other measures make no sense), since it will create a new task, which
+# scatters results across tasks.
+
+
+############################################################################
+# Example
+# #######
+#
+# Let's create a classification task on a dataset. In this example we will do this on the
+# Iris dataset (ID=128 on the test server). We'll use 10-fold cross-validation (ID=1),
+# and _predictive accuracy_ as the predefined measure (this can also be left open).
+# If a task with these parameters already exists, we will get an appropriate exception.
+# If such a task doesn't exist, a task will be created and the corresponding task_id
+# will be returned.
+
+
+# using the test server for example uploads
+openml.config.start_using_configuration_for_example()
+
+try:
+    tasktypes = openml.tasks.TaskTypeEnum
+    my_task = openml.tasks.create_task(
+        task_type_id=tasktypes.SUPERVISED_CLASSIFICATION,
+        dataset_id=128,
+        target_name="class",
+        evaluation_measure="predictive_accuracy",
+        estimation_procedure_id=1)
+    my_task.publish()
+except openml.exceptions.OpenMLServerException as e:
+    # Error code for 'task already exists'
+    if e.code == 614:
+        # Look up the existing task
+        tasks = openml.tasks.list_tasks(data_id=128, output_format='dataframe').to_numpy()
+        tasks = tasks[tasks[:, 4] == "Supervised Classification"]
+        tasks = tasks[tasks[:, 6] == "10-fold Crossvalidation"]
+        tasks = tasks[tasks[:, 19] == "predictive_accuracy"]
+        task_id = tasks[0][0]
+        print("Task already exists. Task ID is", task_id)
+
+# reverting to the production server
+openml.config.stop_using_configuration_for_example()
+
+
+############################################################################
+# [Complete list of task types](https://www.openml.org/search?type=task_type)
+# [Complete list of model estimation procedures](
+# https://www.openml.org/search?q=%2520measure_type%3Aestimation_procedure&type=measure)
+# [Complete list of evaluation measures](
+# https://www.openml.org/search?q=measure_type%3Aevaluation_measure&type=measure)
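A note on the except branch above: it selects columns by integer position (4, 6 and 19) after converting the listing to a NumPy array, which breaks silently if the column order ever changes. A more defensive variant, sketched here under the assumption that the dataframe exposes columns named task_type, estimation_procedure, evaluation_measures and tid (verify against tasks.columns for your openml-python version), filters by column name instead:

import openml

tasks = openml.tasks.list_tasks(data_id=128, output_format='dataframe')
# Assumed column names; inspect tasks.columns if any of these raise a KeyError.
match = tasks[
    (tasks['task_type'] == 'Supervised Classification')
    & (tasks['estimation_procedure'] == '10-fold Crossvalidation')
    & (tasks['evaluation_measures'] == 'predictive_accuracy')
]
if not match.empty:
    print("Task already exists. Task ID is", match['tid'].iloc[0])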

tests/test_tasks/test_clustering_task.py

Lines changed: 0 additions & 1 deletion
@@ -40,7 +40,6 @@ def test_upload_task(self):
                     dataset_id=dataset_id,
                     estimation_procedure_id=self.estimation_procedure
                 )
-
                 task_id = task.publish()
                 TestBase._mark_entity_for_removal('task', task_id)
                 TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1],
