Change edit_dataset API to reflect server-side changes (#941)

sahithyaravi · web-flow · commit f70c720c1624 · 2020-08-31T20:27:31.000+02:00
* change edit_api to reflect server

* change test and example to reflect REST API changes

* tutorial comments

* Update datasets_tutorial.py
diff --git a/examples/30_extended/datasets_tutorial.py b/examples/30_extended/datasets_tutorial.py
@@ -21,7 +21,7 @@
 #
 #   * Use the output_format parameter to select output type
 #   * Default gives 'dict' (other option: 'dataframe', see below)
-
+#
 openml_list = openml.datasets.list_datasets()  # returns a dict
 
 # Show a nice table with some key data properties
@@ -117,31 +117,33 @@
 # This example uses the test server, to avoid editing a dataset on the main server.
 openml.config.start_using_configuration_for_example()
 ############################################################################
-# Changes to these field edits existing version: allowed only for dataset owner
+# Edit non-critical fields, allowed for all authorized users:
+# description, creator, contributor, collection_date, language, citation,
+# original_data_url, paper_url
+desc = (
+    "This data sets consists of 3 different types of irises' "
+    "(Setosa, Versicolour, and Virginica) petal and sepal length,"
+    " stored in a 150x4 numpy.ndarray"
+)
+did = 128
 data_id = edit_dataset(
-    564,
-    description="xor dataset represents XOR operation",
-    contributor="",
-    collection_date="2019-10-29 17:06:18",
-    original_data_url="https://www.kaggle.com/ancientaxe/and-or-xor",
-    paper_url="",
-    citation="kaggle",
+    did,
+    description=desc,
+    creator="R.A.Fisher",
+    collection_date="1937",
+    citation="The use of multiple measurements in taxonomic problems",
     language="English",
 )
 edited_dataset = get_dataset(data_id)
 print(f"Edited dataset ID: {data_id}")
 
 
 ############################################################################
-# Changes to these fields: attributes, default_target_attribute,
-# row_id_attribute, ignore_attribute generates a new edited version: allowed for anyone
-
-new_attributes = [
-    ("x0", "REAL"),
-    ("x1", "REAL"),
-    ("y", "REAL"),
-]
-data_id = edit_dataset(564, attributes=new_attributes)
+# Edit critical fields, allowed only for owners of the dataset:
+# default_target_attribute, row_id_attribute, ignore_attribute
+# To edit critical fields of a dataset owned by you, configure the API key:
+# openml.config.apikey = 'FILL_IN_OPENML_API_KEY'
+data_id = edit_dataset(564, default_target_attribute="y")
 print(f"Edited dataset ID: {data_id}")
 
 openml.config.stop_using_configuration_for_example()
diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py
@@ -806,8 +806,6 @@ def edit_dataset(
     contributor=None,
     collection_date=None,
     language=None,
-    attributes=None,
-    data=None,
     default_target_attribute=None,
     ignore_attribute=None,
     citation=None,
@@ -839,17 +837,6 @@ def edit_dataset(
       language : str
           Language in which the data is represented.
           Starts with 1 upper case letter, rest lower case, e.g. 'English'.
-      attributes : list, dict, or 'auto'
-          A list of tuples. Each tuple consists of the attribute name and type.
-          If passing a pandas DataFrame, the attributes can be automatically
-          inferred by passing ``'auto'``. Specific attributes can be manually
-          specified by a passing a dictionary where the key is the name of the
-          attribute and the value is the data type of the attribute.
-      data : ndarray, list, dataframe, coo_matrix, shape (n_samples, n_features)
-          An array that contains both the attributes and the targets. When
-          providing a dataframe, the attribute names and type can be inferred by
-          passing ``attributes='auto'``.
-          The target feature is indicated as meta-data of the dataset.
       default_target_attribute : str
           The default target attribute, if it exists.
           Can have multiple values, comma separated.
@@ -879,54 +866,6 @@ def edit_dataset(
     if not isinstance(data_id, int):
         raise TypeError("`data_id` must be of type `int`, not {}.".format(type(data_id)))
 
-    # case 1, changing these fields creates a new version of the dataset with changed field
-    if any(
-        field is not None
-        for field in [
-            data,
-            attributes,
-            default_target_attribute,
-            row_id_attribute,
-            ignore_attribute,
-        ]
-    ):
-        logger.warning("Creating a new version of dataset, cannot edit existing version")
-
-        # Get old dataset and features
-        dataset = get_dataset(data_id)
-        df, y, categorical, attribute_names = dataset.get_data(dataset_format="dataframe")
-        attributes_old = attributes_arff_from_df(df)
-
-        # Sparse data needs to be provided in a different format from dense data
-        if dataset.format == "sparse_arff":
-            df, y, categorical, attribute_names = dataset.get_data(dataset_format="array")
-            data_old = coo_matrix(df)
-        else:
-            data_old = df
-        data_new = data if data is not None else data_old
-        dataset_new = create_dataset(
-            name=dataset.name,
-            description=description or dataset.description,
-            creator=creator or dataset.creator,
-            contributor=contributor or dataset.contributor,
-            collection_date=collection_date or dataset.collection_date,
-            language=language or dataset.language,
-            licence=dataset.licence,
-            attributes=attributes or attributes_old,
-            data=data_new,
-            default_target_attribute=default_target_attribute or dataset.default_target_attribute,
-            ignore_attribute=ignore_attribute or dataset.ignore_attribute,
-            citation=citation or dataset.citation,
-            row_id_attribute=row_id_attribute or dataset.row_id_attribute,
-            original_data_url=original_data_url or dataset.original_data_url,
-            paper_url=paper_url or dataset.paper_url,
-            update_comment=dataset.update_comment,
-            version_label=dataset.version_label,
-        )
-        dataset_new.publish()
-        return dataset_new.dataset_id
-
-    # case 2, changing any of these fields will update existing dataset
     # compose data edit parameters as xml
     form_data = {"data_id": data_id}
     xml = OrderedDict()  # type: 'OrderedDict[str, OrderedDict]'
@@ -937,6 +876,9 @@ def edit_dataset(
     xml["oml:data_edit_parameters"]["oml:contributor"] = contributor
     xml["oml:data_edit_parameters"]["oml:collection_date"] = collection_date
     xml["oml:data_edit_parameters"]["oml:language"] = language
+    xml["oml:data_edit_parameters"]["oml:default_target_attribute"] = default_target_attribute
+    xml["oml:data_edit_parameters"]["oml:row_id_attribute"] = row_id_attribute
+    xml["oml:data_edit_parameters"]["oml:ignore_attribute"] = ignore_attribute
     xml["oml:data_edit_parameters"]["oml:citation"] = citation
     xml["oml:data_edit_parameters"]["oml:original_data_url"] = original_data_url
     xml["oml:data_edit_parameters"]["oml:paper_url"] = paper_url
diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py
@@ -1341,57 +1341,43 @@ def test_get_dataset_cache_format_feather(self):
         self.assertEqual(len(attribute_names), X.shape[1])
 
     def test_data_edit(self):
-
-        # admin key for test server (only admins or owners can edit datasets).
-        # all users can edit their own datasets)
-        openml.config.apikey = "d488d8afd93b32331cf6ea9d7003d4c3"
-
-        # case 1, editing description, creator, contributor, collection_date, original_data_url,
-        # paper_url, citation, language edits existing dataset.
-        did = 564
-        result = edit_dataset(
-            did,
-            description="xor dataset represents XOR operation",
-            contributor="",
-            collection_date="2019-10-29 17:06:18",
-            original_data_url="https://www.kaggle.com/ancientaxe/and-or-xor",
-            paper_url="",
-            citation="kaggle",
-            language="English",
+        # Case 1
+        # All users can edit non-critical fields of datasets
+        desc = (
+            "This data sets consists of 3 different types of irises' "
+            "(Setosa, Versicolour, and Virginica) petal and sepal length,"
+            " stored in a 150x4 numpy.ndarray"
         )
-        self.assertEqual(result, did)
-
-        # case 2, editing data, attributes, default_target_attribute, row_id_attribute,
-        # ignore_attribute generates a new dataset
-
-        column_names = [
-            ("input1", "REAL"),
-            ("input2", "REAL"),
-            ("y", "REAL"),
-        ]
-        desc = "xor dataset represents XOR operation"
+        did = 128
         result = edit_dataset(
-            564,
+            did,
             description=desc,
-            contributor="",
-            collection_date="2019-10-29 17:06:18",
-            attributes=column_names,
-            original_data_url="https://www.kaggle.com/ancientaxe/and-or-xor",
-            paper_url="",
-            citation="kaggle",
+            creator="R.A.Fisher",
+            collection_date="1937",
+            citation="The use of multiple measurements in taxonomic problems",
             language="English",
         )
-        self.assertNotEqual(did, result)
+        self.assertEqual(did, result)
+        edited_dataset = openml.datasets.get_dataset(did)
+        self.assertEqual(edited_dataset.description, desc)
+
+        # Case 2
+        # Only owners (or admins) can edit critical fields of datasets
+        # this is a dataset created by CI, so it is editable by this test
+        did = 315
+        result = edit_dataset(did, default_target_attribute="col_1", ignore_attribute="col_2")
+        self.assertEqual(did, result)
+        edited_dataset = openml.datasets.get_dataset(did)
+        self.assertEqual(edited_dataset.ignore_attribute, ["col_2"])
 
     def test_data_edit_errors(self):
-
-        # admin key for test server (only admins or owners can edit datasets).
-        openml.config.apikey = "d488d8afd93b32331cf6ea9d7003d4c3"
         # Check server exception when no field to edit is provided
         self.assertRaisesRegex(
             OpenMLServerException,
-            "Please provide atleast one field among description, creator, contributor, "
-            "collection_date, language, citation, original_data_url or paper_url to edit.",
+            "Please provide atleast one field among description, creator, "
+            "contributor, collection_date, language, citation, "
+            "original_data_url, default_target_attribute, row_id_attribute, "
+            "ignore_attribute or paper_url to edit.",
             edit_dataset,
             data_id=564,
         )
@@ -1403,12 +1389,21 @@ def test_data_edit_errors(self):
             data_id=100000,
             description="xor operation dataset",
         )
-        # Check server exception when a non-owner or non-admin tries to edit existing dataset
-        openml.config.apikey = "5f0b74b33503e4ad4a7181a91e28719f"
+        # Check server exception when owner/admin edits critical features of a dataset with tasks
         self.assertRaisesRegex(
             OpenMLServerException,
-            "Dataset is not owned by you",
+            "Critical features default_target_attribute, row_id_attribute and ignore_attribute "
+            "can only be edited for datasets without any tasks.",
             edit_dataset,
-            data_id=564,
-            description="xor data",
+            data_id=223,
+            default_target_attribute="y",
+        )
+        # Check server exception when a non-owner or non-admin tries to edit critical features
+        self.assertRaisesRegex(
+            OpenMLServerException,
+            "Critical features default_target_attribute, row_id_attribute and ignore_attribute "
+            "can be edited only by the owner. Fork the dataset if changes are required.",
+            edit_dataset,
+            data_id=128,
+            default_target_attribute="y",
         )