Skip to content

Commit 9c93f5b

Browse files
authored
Edit api (#935)
* version1 * minor fixes * tests * reformat code * check new version * remove get data * code format * review comments * fix duplicate * type annotate * example * tests for exceptions * fix pep8 * black format
1 parent 1670050 commit 9c93f5b

File tree

4 files changed

+269
-5
lines changed

4 files changed

+269
-5
lines changed

doc/progress.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ Changelog
88

99
0.11.0
1010
~~~~~~
11-
11+
* ADD #929: Add data edit API
1212
* FIX #873: Fixes an issue which resulted in incorrect URLs when printing OpenML objects after
1313
switching the server.
1414
* FIX #885: Logger no longer registered by default. Added utility functions to easily register

examples/30_extended/datasets_tutorial.py

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,13 @@
55
66
How to list and download datasets.
77
"""
8-
############################################################################
8+
""
99

1010
# License: BSD 3-Clauses
1111

1212
import openml
1313
import pandas as pd
14+
from openml.datasets.functions import edit_dataset, get_dataset
1415

1516
############################################################################
1617
# Exercise 0
@@ -42,9 +43,9 @@
4243
# * Find a dataset called 'eeg_eye_state'.
4344
# * Find all datasets with more than 50 classes.
4445
datalist[datalist.NumberOfInstances > 10000].sort_values(["NumberOfInstances"]).head(n=20)
45-
############################################################################
46+
""
4647
datalist.query('name == "eeg-eye-state"')
47-
############################################################################
48+
""
4849
datalist.query("NumberOfClasses > 50")
4950

5051
############################################################################
@@ -108,3 +109,39 @@
108109
alpha=0.8,
109110
cmap="plasma",
110111
)
112+
113+
114+
############################################################################
# Edit a created dataset
# =================================================
# This example uses the test server, to avoid editing a dataset on the main
# server.
openml.config.start_using_configuration_for_example()
############################################################################
# Editing any of these metadata fields updates the existing dataset version
# in place; this is allowed only for the dataset owner.
data_id = edit_dataset(
    564,
    description="xor dataset represents XOR operation",
    contributor="",
    collection_date="2019-10-29 17:06:18",
    original_data_url="https://www.kaggle.com/ancientaxe/and-or-xor",
    paper_url="",
    citation="kaggle",
    language="English",
)
edited_dataset = get_dataset(data_id)
print(f"Edited dataset ID: {data_id}")


############################################################################
# Editing any of these fields -- attributes, default_target_attribute,
# row_id_attribute, ignore_attribute -- creates and publishes a new edited
# version of the dataset; this is allowed for anyone.

new_attributes = [
    ("x0", "REAL"),
    ("x1", "REAL"),
    ("y", "REAL"),
]
data_id = edit_dataset(564, attributes=new_attributes)
print(f"Edited dataset ID: {data_id}")

openml.config.stop_using_configuration_for_example()

openml/datasets/functions.py

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -799,6 +799,154 @@ def status_update(data_id, status):
799799
raise ValueError("Data id/status does not collide")
800800

801801

802+
def edit_dataset(
    data_id,
    description=None,
    creator=None,
    contributor=None,
    collection_date=None,
    language=None,
    attributes=None,
    data=None,
    default_target_attribute=None,
    ignore_attribute=None,
    citation=None,
    row_id_attribute=None,
    original_data_url=None,
    paper_url=None,
) -> int:
    """
    Edits an OpenMLDataset.

    Specify at least one field to edit, apart from ``data_id``:

    - For certain fields, a new dataset version is created: attributes, data,
      default_target_attribute, ignore_attribute, row_id_attribute.

    - For other fields, the uploader can edit the existing version.
      No one except the uploader can edit the existing version.

    Parameters
    ----------
    data_id : int
        ID of the dataset.
    description : str
        Description of the dataset.
    creator : str
        The person who created the dataset.
    contributor : str
        People who contributed to the current version of the dataset.
    collection_date : str
        The date the data was originally collected, given by the uploader.
    language : str
        Language in which the data is represented.
        Starts with 1 upper case letter, rest lower case, e.g. 'English'.
    attributes : list, dict, or 'auto'
        A list of tuples. Each tuple consists of the attribute name and type.
        If passing a pandas DataFrame, the attributes can be automatically
        inferred by passing ``'auto'``. Specific attributes can be manually
        specified by a passing a dictionary where the key is the name of the
        attribute and the value is the data type of the attribute.
    data : ndarray, list, dataframe, coo_matrix, shape (n_samples, n_features)
        An array that contains both the attributes and the targets. When
        providing a dataframe, the attribute names and type can be inferred by
        passing ``attributes='auto'``.
        The target feature is indicated as meta-data of the dataset.
    default_target_attribute : str
        The default target attribute, if it exists.
        Can have multiple values, comma separated.
    ignore_attribute : str | list
        Attributes that should be excluded in modelling,
        such as identifiers and indexes.
    citation : str
        Reference(s) that should be cited when building on this data.
    row_id_attribute : str, optional
        The attribute that represents the row-id column, if present in the
        dataset. If ``data`` is a dataframe and ``row_id_attribute`` is not
        specified, the index of the dataframe will be used as the
        ``row_id_attribute``. If the name of the index is ``None``, it will
        be discarded.

        .. versionadded: 0.8
            Inference of ``row_id_attribute`` from a dataframe.
    original_data_url : str, optional
        For derived data, the url to the original dataset.
    paper_url : str, optional
        Link to a paper describing the dataset.

    Returns
    -------
    int
        data_id of the existing edited version, or of the new version created
        and published."""
    if not isinstance(data_id, int):
        raise TypeError("`data_id` must be of type `int`, not {}.".format(type(data_id)))

    # case 1, changing any of these fields creates a new version of the
    # dataset with the changed field
    if any(
        field is not None
        for field in [
            data,
            attributes,
            default_target_attribute,
            row_id_attribute,
            ignore_attribute,
        ]
    ):
        logger.warning("Creating a new version of dataset, cannot edit existing version")
        dataset = get_dataset(data_id)

        # Rebuild the dataset from its decoded ARFF, substituting the fields
        # the caller supplied. NOTE(review): the ``or`` fallbacks mean that
        # falsy values (e.g. "") also fall back to the current dataset values.
        decoded_arff = dataset._get_arff(format="arff")
        data_old = decoded_arff["data"]
        data_new = data if data is not None else data_old
        dataset_new = create_dataset(
            name=dataset.name,
            description=description or dataset.description,
            creator=creator or dataset.creator,
            contributor=contributor or dataset.contributor,
            collection_date=collection_date or dataset.collection_date,
            language=language or dataset.language,
            licence=dataset.licence,
            attributes=attributes or decoded_arff["attributes"],
            data=data_new,
            default_target_attribute=default_target_attribute or dataset.default_target_attribute,
            ignore_attribute=ignore_attribute or dataset.ignore_attribute,
            citation=citation or dataset.citation,
            row_id_attribute=row_id_attribute or dataset.row_id_attribute,
            original_data_url=original_data_url or dataset.original_data_url,
            paper_url=paper_url or dataset.paper_url,
            update_comment=dataset.update_comment,
            version_label=dataset.version_label,
        )
        dataset_new.publish()
        return dataset_new.dataset_id

    # case 2, changing any of these fields will update the existing dataset
    # compose the data edit parameters as xml
    form_data = {"data_id": data_id}
    xml = OrderedDict()  # type: 'OrderedDict[str, OrderedDict]'
    xml["oml:data_edit_parameters"] = OrderedDict()
    xml["oml:data_edit_parameters"]["@xmlns:oml"] = "http://openml.org/openml"
    xml["oml:data_edit_parameters"]["oml:description"] = description
    xml["oml:data_edit_parameters"]["oml:creator"] = creator
    xml["oml:data_edit_parameters"]["oml:contributor"] = contributor
    xml["oml:data_edit_parameters"]["oml:collection_date"] = collection_date
    xml["oml:data_edit_parameters"]["oml:language"] = language
    xml["oml:data_edit_parameters"]["oml:citation"] = citation
    xml["oml:data_edit_parameters"]["oml:original_data_url"] = original_data_url
    xml["oml:data_edit_parameters"]["oml:paper_url"] = paper_url

    # drop parameters the caller did not set, so the server only receives
    # genuine edits (note: this removes all falsy values, including empty
    # strings, not only None)
    for k in list(xml["oml:data_edit_parameters"]):
        if not xml["oml:data_edit_parameters"][k]:
            del xml["oml:data_edit_parameters"][k]

    file_elements = {"edit_parameters": ("description.xml", xmltodict.unparse(xml))}
    result_xml = openml._api_calls._perform_api_call(
        "data/edit", "post", data=form_data, file_elements=file_elements
    )
    result = xmltodict.parse(result_xml)
    data_id = result["oml:data_edit"]["oml:id"]
    return int(data_id)
948+
949+
802950
def _get_dataset_description(did_cache_dir, dataset_id):
803951
"""Get the dataset description as xml dictionary.
804952

tests/test_datasets/test_dataset_functions.py

Lines changed: 80 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,17 @@
1616

1717
import openml
1818
from openml import OpenMLDataset
19-
from openml.exceptions import OpenMLCacheException, OpenMLHashException, OpenMLPrivateDatasetError
19+
from openml.exceptions import (
20+
OpenMLCacheException,
21+
OpenMLHashException,
22+
OpenMLPrivateDatasetError,
23+
OpenMLServerException,
24+
)
2025
from openml.testing import TestBase
2126
from openml.utils import _tag_entity, _create_cache_directory_for_id
2227
from openml.datasets.functions import (
2328
create_dataset,
29+
edit_dataset,
2430
attributes_arff_from_df,
2531
_get_cached_dataset,
2632
_get_cached_dataset_features,
@@ -1331,3 +1337,76 @@ def test_get_dataset_cache_format_feather(self):
13311337
self.assertEqual(X.shape, (150, 5))
13321338
self.assertEqual(len(categorical), X.shape[1])
13331339
self.assertEqual(len(attribute_names), X.shape[1])
1340+
1341+
def test_data_edit(self):
1342+
1343+
# admin key for test server (only admins or owners can edit datasets).
1344+
# all users can edit their own datasets)
1345+
openml.config.apikey = "d488d8afd93b32331cf6ea9d7003d4c3"
1346+
1347+
# case 1, editing description, creator, contributor, collection_date, original_data_url,
1348+
# paper_url, citation, language edits existing dataset.
1349+
did = 564
1350+
result = edit_dataset(
1351+
did,
1352+
description="xor dataset represents XOR operation",
1353+
contributor="",
1354+
collection_date="2019-10-29 17:06:18",
1355+
original_data_url="https://www.kaggle.com/ancientaxe/and-or-xor",
1356+
paper_url="",
1357+
citation="kaggle",
1358+
language="English",
1359+
)
1360+
self.assertEqual(result, did)
1361+
1362+
# case 2, editing data, attributes, default_target_attribute, row_id_attribute,
1363+
# ignore_attribute generates a new dataset
1364+
1365+
column_names = [
1366+
("input1", "REAL"),
1367+
("input2", "REAL"),
1368+
("y", "REAL"),
1369+
]
1370+
desc = "xor dataset represents XOR operation"
1371+
result = edit_dataset(
1372+
564,
1373+
description=desc,
1374+
contributor="",
1375+
collection_date="2019-10-29 17:06:18",
1376+
attributes=column_names,
1377+
original_data_url="https://www.kaggle.com/ancientaxe/and-or-xor",
1378+
paper_url="",
1379+
citation="kaggle",
1380+
language="English",
1381+
)
1382+
self.assertNotEqual(did, result)
1383+
1384+
def test_data_edit_errors(self):
1385+
1386+
# admin key for test server (only admins or owners can edit datasets).
1387+
openml.config.apikey = "d488d8afd93b32331cf6ea9d7003d4c3"
1388+
# Check server exception when no field to edit is provided
1389+
self.assertRaisesRegex(
1390+
OpenMLServerException,
1391+
"Please provide atleast one field among description, creator, contributor, "
1392+
"collection_date, language, citation, original_data_url or paper_url to edit.",
1393+
edit_dataset,
1394+
data_id=564,
1395+
)
1396+
# Check server exception when unknown dataset is provided
1397+
self.assertRaisesRegex(
1398+
OpenMLServerException,
1399+
"Unknown dataset",
1400+
edit_dataset,
1401+
data_id=100000,
1402+
description="xor operation dataset",
1403+
)
1404+
# Check server exception when a non-owner or non-admin tries to edit existing dataset
1405+
openml.config.apikey = "5f0b74b33503e4ad4a7181a91e28719f"
1406+
self.assertRaisesRegex(
1407+
OpenMLServerException,
1408+
"Dataset is not owned by you",
1409+
edit_dataset,
1410+
data_id=564,
1411+
description="xor data",
1412+
)

0 commit comments

Comments
 (0)