fork api (#944)

sahithyaravi · PGijsbers · web-flow · commit 9bc84a94a16d · 2020-10-23T16:57:31.000+02:00
* fork api * improve docs (+1 squashed commits) Squashed commits: [ec5c0d10] import changes * minor change (+1 squashed commits) Squashed commits: [1822c99] improve docs (+1 squashed commits) Squashed commits: [ec5c0d10] import changes * docs update * clarify example * Update doc/progress.rst * Fix whitespaces for docstring * fix error * Use id 999999 for unknown dataset Co-authored-by: PGijsbers <p.gijsbers@tue.nl>
diff --git a/doc/api.rst b/doc/api.rst
@@ -74,6 +74,8 @@ Modules
     list_datasets
     list_qualities
     status_update
+    edit_dataset
+    fork_dataset
 
 :mod:`openml.evaluations`: Evaluation Functions
 -----------------------------------------------
diff --git a/doc/progress.rst b/doc/progress.rst
@@ -8,7 +8,7 @@ Changelog
 
 0.11.0
 ~~~~~~
-* ADD #929: Add data edit API
+* ADD #929: Add ``edit_dataset`` and ``fork_dataset`` to allow editing and forking of uploaded datasets.
 * FIX #873: Fixes an issue which resulted in incorrect URLs when printing OpenML objects after
   switching the server.
 * FIX #885: Logger no longer registered by default. Added utility functions to easily register
diff --git a/examples/30_extended/create_upload_tutorial.py b/examples/30_extended/create_upload_tutorial.py
@@ -100,7 +100,7 @@
     # The attribute that represents the row-id column, if present in the
     # dataset.
     row_id_attribute=None,
-    # Attribute or list of attributes that should be excluded in modelling, such as 
+    # Attribute or list of attributes that should be excluded in modelling, such as
     # identifiers and indexes. E.g. "feat1" or ["feat1","feat2"]
     ignore_attribute=None,
     # How to cite the paper.
diff --git a/examples/30_extended/datasets_tutorial.py b/examples/30_extended/datasets_tutorial.py
@@ -11,7 +11,7 @@
 
 import openml
 import pandas as pd
-from openml.datasets.functions import edit_dataset, get_dataset
+from openml.datasets import edit_dataset, fork_dataset, get_dataset
 
 ############################################################################
 # Exercise 0
@@ -139,11 +139,23 @@
 
 
 ############################################################################
-# Edit critical fields, allowed only for owners of the dataset:
-# default_target_attribute, row_id_attribute, ignore_attribute
-# To edit critical fields of a dataset owned by you, configure the API key:
+# Editing critical fields (default_target_attribute, row_id_attribute, ignore_attribute) is allowed
+# only for the dataset owner. Further, critical fields cannot be edited if the dataset has any
+# tasks associated with it. To edit critical fields of a dataset (without tasks) owned by you,
+# configure the API key:
 # openml.config.apikey = 'FILL_IN_OPENML_API_KEY'
 data_id = edit_dataset(564, default_target_attribute="y")
 print(f"Edited dataset ID: {data_id}")
 
+
+############################################################################
+# Fork dataset
+# Used to create a copy of the dataset with you as the owner.
+# Use this API only if you are unable to edit the critical fields (default_target_attribute,
+# ignore_attribute, row_id_attribute) of a dataset through the edit_dataset API.
+# After the dataset is forked, you can edit the new version of the dataset using edit_dataset.
+
+data_id = fork_dataset(564)
+print(f"Forked dataset ID: {data_id}")
+
 openml.config.stop_using_configuration_for_example()
diff --git a/openml/datasets/__init__.py b/openml/datasets/__init__.py
@@ -9,6 +9,8 @@
     list_datasets,
     status_update,
     list_qualities,
+    edit_dataset,
+    fork_dataset,
 )
 from .dataset import OpenMLDataset
 from .data_feature import OpenMLDataFeature
@@ -24,4 +26,6 @@
     "OpenMLDataFeature",
     "status_update",
     "list_qualities",
+    "edit_dataset",
+    "fork_dataset",
 ]
diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py
@@ -813,56 +813,63 @@ def edit_dataset(
     original_data_url=None,
     paper_url=None,
 ) -> int:
+    """ Edits an OpenMLDataset.
+
+    In addition to providing the dataset id of the dataset to edit (through data_id),
+    you must specify a value for at least one of the optional function arguments,
+    i.e. one value for a field to edit.
+
+    This function allows editing of both non-critical and critical fields.
+    Critical fields are default_target_attribute, ignore_attribute, row_id_attribute.
+
+     - Editing non-critical data fields is allowed for all authenticated users.
+     - Editing critical fields is allowed only for the owner, provided there are no tasks
+       associated with this dataset.
+
+    If the dataset has tasks or if the user is not the owner, the only way
+    to edit critical fields is to use fork_dataset followed by edit_dataset.
+
+    Parameters
+    ----------
+    data_id : int
+        ID of the dataset.
+    description : str
+        Description of the dataset.
+    creator : str
+        The person who created the dataset.
+    contributor : str
+        People who contributed to the current version of the dataset.
+    collection_date : str
+        The date the data was originally collected, given by the uploader.
+    language : str
+        Language in which the data is represented.
+        Starts with 1 upper case letter, rest lower case, e.g. 'English'.
+    default_target_attribute : str
+        The default target attribute, if it exists.
+        Can have multiple values, comma separated.
+    ignore_attribute : str | list
+        Attributes that should be excluded in modelling,
+        such as identifiers and indexes.
+    citation : str
+        Reference(s) that should be cited when building on this data.
+    row_id_attribute : str, optional
+        The attribute that represents the row-id column, if present in the
+        dataset. If ``data`` is a dataframe and ``row_id_attribute`` is not
+        specified, the index of the dataframe will be used as the
+        ``row_id_attribute``. If the name of the index is ``None``, it will
+        be discarded.
+
+        .. versionadded:: 0.8
+            Inference of ``row_id_attribute`` from a dataframe.
+    original_data_url : str, optional
+        For derived data, the url to the original dataset.
+    paper_url : str, optional
+        Link to a paper describing the dataset.
+
+    Returns
+    -------
+    Dataset id
     """
-      Edits an OpenMLDataset.
-      Specify at least one field to edit, apart from data_id
-       - For certain fields, a new dataset version is created : attributes, data,
-       default_target_attribute, ignore_attribute, row_id_attribute.
-
-       - For other fields, the uploader can edit the existing version.
-        No one except the uploader can edit the existing version.
-
-      Parameters
-      ----------
-      data_id : int
-          ID of the dataset.
-      description : str
-          Description of the dataset.
-      creator : str
-          The person who created the dataset.
-      contributor : str
-          People who contributed to the current version of the dataset.
-      collection_date : str
-          The date the data was originally collected, given by the uploader.
-      language : str
-          Language in which the data is represented.
-          Starts with 1 upper case letter, rest lower case, e.g. 'English'.
-      default_target_attribute : str
-          The default target attribute, if it exists.
-          Can have multiple values, comma separated.
-      ignore_attribute : str | list
-          Attributes that should be excluded in modelling,
-          such as identifiers and indexes.
-      citation : str
-          Reference(s) that should be cited when building on this data.
-      row_id_attribute : str, optional
-          The attribute that represents the row-id column, if present in the
-          dataset. If ``data`` is a dataframe and ``row_id_attribute`` is not
-          specified, the index of the dataframe will be used as the
-          ``row_id_attribute``. If the name of the index is ``None``, it will
-          be discarded.
-
-          .. versionadded: 0.8
-              Inference of ``row_id_attribute`` from a dataframe.
-      original_data_url : str, optional
-          For derived data, the url to the original dataset.
-      paper_url : str, optional
-          Link to a paper describing the dataset.
-
-
-      Returns
-      -------
-      data_id of the existing edited version or the new version created and published"""
     if not isinstance(data_id, int):
         raise TypeError("`data_id` must be of type `int`, not {}.".format(type(data_id)))
 
@@ -897,6 +904,45 @@ def edit_dataset(
     return int(data_id)
 
 
+def fork_dataset(data_id: int) -> int:
+    """
+     Creates a new dataset version, with the authenticated user as the new owner.
+     The forked dataset can have distinct dataset meta-data,
+     but the actual data itself is shared with the original version.
+
+     This API is intended for use when a user is unable to edit the critical fields of a dataset
+     through the edit_dataset API.
+     (Critical fields are default_target_attribute, ignore_attribute, row_id_attribute.)
+
+     Specifically, this happens when the user is:
+            1. Not the owner of the dataset.
+            2. The owner of the dataset, but the dataset has tasks.
+
+     In these two cases the only way to edit critical fields is:
+            1. Fork the dataset using the fork_dataset API.
+            2. Call the edit_dataset API on the forked version.
+
+
+    Parameters
+    ----------
+    data_id : int
+        ID of the dataset to be forked.
+
+    Returns
+    -------
+    Dataset id of the forked dataset
+
+    """
+    if not isinstance(data_id, int):
+        raise TypeError("`data_id` must be of type `int`, not {}.".format(type(data_id)))
+    # compose data fork parameters
+    form_data = {"data_id": data_id}
+    result_xml = openml._api_calls._perform_api_call("data/fork", "post", data=form_data)
+    result = xmltodict.parse(result_xml)
+    data_id = result["oml:data_fork"]["oml:id"]
+    return int(data_id)
+
+
 def _get_dataset_description(did_cache_dir, dataset_id):
     """Get the dataset description as xml dictionary.
 
diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py
@@ -26,7 +26,6 @@
 from openml.utils import _tag_entity, _create_cache_directory_for_id
 from openml.datasets.functions import (
     create_dataset,
-    edit_dataset,
     attributes_arff_from_df,
     _get_cached_dataset,
     _get_cached_dataset_features,
@@ -40,6 +39,7 @@
     _get_online_dataset_format,
     DATASETS_CACHE_DIR_NAME,
 )
+from openml.datasets import fork_dataset, edit_dataset
 
 
 class TestOpenMLDataset(TestBase):
@@ -1386,10 +1386,10 @@ def test_data_edit_errors(self):
             OpenMLServerException,
             "Unknown dataset",
             edit_dataset,
-            data_id=100000,
+            data_id=999999,
             description="xor operation dataset",
         )
-        # Check server exception when owner/admin edits critical features of dataset with tasks
+        # Check server exception when owner/admin edits critical fields of dataset with tasks
         self.assertRaisesRegex(
             OpenMLServerException,
             "Critical features default_target_attribute, row_id_attribute and ignore_attribute "
@@ -1398,7 +1398,7 @@ def test_data_edit_errors(self):
             data_id=223,
             default_target_attribute="y",
         )
-        # Check server exception when a non-owner or non-admin tries to edit critical features
+        # Check server exception when a non-owner or non-admin tries to edit critical fields
         self.assertRaisesRegex(
             OpenMLServerException,
             "Critical features default_target_attribute, row_id_attribute and ignore_attribute "
@@ -1407,3 +1407,12 @@ def test_data_edit_errors(self):
             data_id=128,
             default_target_attribute="y",
         )
+
+    def test_data_fork(self):
+        did = 1
+        result = fork_dataset(did)
+        self.assertNotEqual(did, result)
+        # Check server exception when unknown dataset is provided
+        self.assertRaisesRegex(
+            OpenMLServerException, "Unknown dataset", fork_dataset, data_id=999999,
+        )