Skip to content

Commit 9bc84a9

Browse files
fork api (#944)
* fork api * improve docs (+1 squashed commits) Squashed commits: [ec5c0d10] import changes * minor change (+1 squashed commits) Squashed commits: [1822c99] improve docs (+1 squashed commits) Squashed commits: [ec5c0d10] import changes * docs update * clarify example * Update doc/progress.rst * Fix whitespaces for docstring * fix error * Use id 999999 for unknown dataset Co-authored-by: PGijsbers <[email protected]>
1 parent bf3cd2e commit 9bc84a9

File tree

7 files changed

+132
-59
lines changed

7 files changed

+132
-59
lines changed

doc/api.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,8 @@ Modules
7474
list_datasets
7575
list_qualities
7676
status_update
77+
edit_dataset
78+
fork_dataset
7779

7880
:mod:`openml.evaluations`: Evaluation Functions
7981
-----------------------------------------------

doc/progress.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ Changelog
88

99
0.11.0
1010
~~~~~~
11-
* ADD #929: Add data edit API
11+
* ADD #929: Add ``edit_dataset`` and ``fork_dataset`` to allow editing and forking of uploaded datasets.
1212
* FIX #873: Fixes an issue which resulted in incorrect URLs when printing OpenML objects after
1313
switching the server.
1414
* FIX #885: Logger no longer registered by default. Added utility functions to easily register

examples/30_extended/create_upload_tutorial.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@
100100
# The attribute that represents the row-id column, if present in the
101101
# dataset.
102102
row_id_attribute=None,
103-
# Attribute or list of attributes that should be excluded in modelling, such as
103+
# Attribute or list of attributes that should be excluded in modelling, such as
104104
# identifiers and indexes. E.g. "feat1" or ["feat1","feat2"]
105105
ignore_attribute=None,
106106
# How to cite the paper.

examples/30_extended/datasets_tutorial.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
import openml
1313
import pandas as pd
14-
from openml.datasets.functions import edit_dataset, get_dataset
14+
from openml.datasets import edit_dataset, fork_dataset, get_dataset
1515

1616
############################################################################
1717
# Exercise 0
@@ -139,11 +139,23 @@
139139

140140

141141
############################################################################
142-
# Edit critical fields, allowed only for owners of the dataset:
143-
# default_target_attribute, row_id_attribute, ignore_attribute
144-
# To edit critical fields of a dataset owned by you, configure the API key:
142+
# Editing critical fields (default_target_attribute, row_id_attribute, ignore_attribute) is allowed
143+
# only for the dataset owner. Further, critical fields cannot be edited if the dataset has any
144+
# tasks associated with it. To edit critical fields of a dataset (without tasks) owned by you,
145+
# configure the API key:
145146
# openml.config.apikey = 'FILL_IN_OPENML_API_KEY'
146147
data_id = edit_dataset(564, default_target_attribute="y")
147148
print(f"Edited dataset ID: {data_id}")
148149

150+
151+
############################################################################
152+
# Fork dataset
153+
# Forking creates a copy of the dataset with you as the owner.
154+
# Use this API only if you are unable to edit the critical fields (default_target_attribute,
155+
# ignore_attribute, row_id_attribute) of a dataset through the edit_dataset API.
156+
# After the dataset is forked, you can edit the new version of the dataset using edit_dataset.
157+
158+
data_id = fork_dataset(564)
159+
print(f"Forked dataset ID: {data_id}")
160+
149161
openml.config.stop_using_configuration_for_example()

openml/datasets/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
list_datasets,
1010
status_update,
1111
list_qualities,
12+
edit_dataset,
13+
fork_dataset,
1214
)
1315
from .dataset import OpenMLDataset
1416
from .data_feature import OpenMLDataFeature
@@ -24,4 +26,6 @@
2426
"OpenMLDataFeature",
2527
"status_update",
2628
"list_qualities",
29+
"edit_dataset",
30+
"fork_dataset",
2731
]

openml/datasets/functions.py

Lines changed: 95 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -813,56 +813,63 @@ def edit_dataset(
813813
original_data_url=None,
814814
paper_url=None,
815815
) -> int:
816+
""" Edits an OpenMLDataset.
817+
818+
In addition to providing the dataset id of the dataset to edit (through data_id),
819+
you must specify a value for at least one of the optional function arguments,
820+
i.e. one value for a field to edit.
821+
822+
This function allows editing of both non-critical and critical fields.
823+
Critical fields are default_target_attribute, ignore_attribute, row_id_attribute.
824+
825+
- Editing non-critical data fields is allowed for all authenticated users.
826+
- Editing critical fields is allowed only for the owner, provided there are no tasks
827+
associated with this dataset.
828+
829+
If the dataset has tasks or if the user is not the owner, the only way
830+
to edit critical fields is to use fork_dataset followed by edit_dataset.
831+
832+
Parameters
833+
----------
834+
data_id : int
835+
ID of the dataset.
836+
description : str
837+
Description of the dataset.
838+
creator : str
839+
The person who created the dataset.
840+
contributor : str
841+
People who contributed to the current version of the dataset.
842+
collection_date : str
843+
The date the data was originally collected, given by the uploader.
844+
language : str
845+
Language in which the data is represented.
846+
Starts with 1 upper case letter, rest lower case, e.g. 'English'.
847+
default_target_attribute : str
848+
The default target attribute, if it exists.
849+
Can have multiple values, comma separated.
850+
ignore_attribute : str | list
851+
Attributes that should be excluded in modelling,
852+
such as identifiers and indexes.
853+
citation : str
854+
Reference(s) that should be cited when building on this data.
855+
row_id_attribute : str, optional
856+
The attribute that represents the row-id column, if present in the
857+
dataset. If ``data`` is a dataframe and ``row_id_attribute`` is not
858+
specified, the index of the dataframe will be used as the
859+
``row_id_attribute``. If the name of the index is ``None``, it will
860+
be discarded.
861+
862+
.. versionadded:: 0.8
863+
Inference of ``row_id_attribute`` from a dataframe.
864+
original_data_url : str, optional
865+
For derived data, the url to the original dataset.
866+
paper_url : str, optional
867+
Link to a paper describing the dataset.
868+
869+
Returns
870+
-------
871+
Dataset id
816872
"""
817-
Edits an OpenMLDataset.
818-
Specify at least one field to edit, apart from data_id
819-
- For certain fields, a new dataset version is created : attributes, data,
820-
default_target_attribute, ignore_attribute, row_id_attribute.
821-
822-
- For other fields, the uploader can edit the existing version.
823-
No one except the uploader can edit the existing version.
824-
825-
Parameters
826-
----------
827-
data_id : int
828-
ID of the dataset.
829-
description : str
830-
Description of the dataset.
831-
creator : str
832-
The person who created the dataset.
833-
contributor : str
834-
People who contributed to the current version of the dataset.
835-
collection_date : str
836-
The date the data was originally collected, given by the uploader.
837-
language : str
838-
Language in which the data is represented.
839-
Starts with 1 upper case letter, rest lower case, e.g. 'English'.
840-
default_target_attribute : str
841-
The default target attribute, if it exists.
842-
Can have multiple values, comma separated.
843-
ignore_attribute : str | list
844-
Attributes that should be excluded in modelling,
845-
such as identifiers and indexes.
846-
citation : str
847-
Reference(s) that should be cited when building on this data.
848-
row_id_attribute : str, optional
849-
The attribute that represents the row-id column, if present in the
850-
dataset. If ``data`` is a dataframe and ``row_id_attribute`` is not
851-
specified, the index of the dataframe will be used as the
852-
``row_id_attribute``. If the name of the index is ``None``, it will
853-
be discarded.
854-
855-
.. versionadded: 0.8
856-
Inference of ``row_id_attribute`` from a dataframe.
857-
original_data_url : str, optional
858-
For derived data, the url to the original dataset.
859-
paper_url : str, optional
860-
Link to a paper describing the dataset.
861-
862-
863-
Returns
864-
-------
865-
data_id of the existing edited version or the new version created and published"""
866873
if not isinstance(data_id, int):
867874
raise TypeError("`data_id` must be of type `int`, not {}.".format(type(data_id)))
868875

@@ -897,6 +904,45 @@ def edit_dataset(
897904
return int(data_id)
898905

899906

907+
def fork_dataset(data_id: int) -> int:
908+
"""
909+
Creates a new dataset version, with the authenticated user as the new owner.
910+
The forked dataset can have distinct dataset meta-data,
911+
but the actual data itself is shared with the original version.
912+
913+
This API is intended for use when a user is unable to edit the critical fields of a dataset
914+
through the edit_dataset API.
915+
(Critical fields are default_target_attribute, ignore_attribute, row_id_attribute.)
916+
917+
Specifically, this happens when the user is:
918+
1. Not the owner of the dataset.
919+
2. The owner of the dataset, but the dataset has tasks.
920+
921+
In these two cases the only way to edit critical fields is:
922+
1. Fork the dataset using the fork_dataset API.
923+
2. Call the edit_dataset API on the forked version.
924+
925+
926+
Parameters
927+
----------
928+
data_id : int
929+
ID of the dataset to be forked.
930+
931+
Returns
932+
-------
933+
Dataset id of the forked dataset
934+
935+
"""
936+
if not isinstance(data_id, int):
937+
raise TypeError("`data_id` must be of type `int`, not {}.".format(type(data_id)))
938+
# compose data fork parameters
939+
form_data = {"data_id": data_id}
940+
result_xml = openml._api_calls._perform_api_call("data/fork", "post", data=form_data)
941+
result = xmltodict.parse(result_xml)
942+
data_id = result["oml:data_fork"]["oml:id"]
943+
return int(data_id)
944+
945+
900946
def _get_dataset_description(did_cache_dir, dataset_id):
901947
"""Get the dataset description as xml dictionary.
902948

tests/test_datasets/test_dataset_functions.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@
2626
from openml.utils import _tag_entity, _create_cache_directory_for_id
2727
from openml.datasets.functions import (
2828
create_dataset,
29-
edit_dataset,
3029
attributes_arff_from_df,
3130
_get_cached_dataset,
3231
_get_cached_dataset_features,
@@ -40,6 +39,7 @@
4039
_get_online_dataset_format,
4140
DATASETS_CACHE_DIR_NAME,
4241
)
42+
from openml.datasets import fork_dataset, edit_dataset
4343

4444

4545
class TestOpenMLDataset(TestBase):
@@ -1386,10 +1386,10 @@ def test_data_edit_errors(self):
13861386
OpenMLServerException,
13871387
"Unknown dataset",
13881388
edit_dataset,
1389-
data_id=100000,
1389+
data_id=999999,
13901390
description="xor operation dataset",
13911391
)
1392-
# Check server exception when owner/admin edits critical features of dataset with tasks
1392+
# Check server exception when owner/admin edits critical fields of dataset with tasks
13931393
self.assertRaisesRegex(
13941394
OpenMLServerException,
13951395
"Critical features default_target_attribute, row_id_attribute and ignore_attribute "
@@ -1398,7 +1398,7 @@ def test_data_edit_errors(self):
13981398
data_id=223,
13991399
default_target_attribute="y",
14001400
)
1401-
# Check server exception when a non-owner or non-admin tries to edit critical features
1401+
# Check server exception when a non-owner or non-admin tries to edit critical fields
14021402
self.assertRaisesRegex(
14031403
OpenMLServerException,
14041404
"Critical features default_target_attribute, row_id_attribute and ignore_attribute "
@@ -1407,3 +1407,12 @@ def test_data_edit_errors(self):
14071407
data_id=128,
14081408
default_target_attribute="y",
14091409
)
1410+
1411+
def test_data_fork(self):
1412+
did = 1
1413+
result = fork_dataset(did)
1414+
self.assertNotEqual(did, result)
1415+
# Check server exception when unknown dataset is provided
1416+
self.assertRaisesRegex(
1417+
OpenMLServerException, "Unknown dataset", fork_dataset, data_id=999999,
1418+
)

0 commit comments

Comments
 (0)