Skip to content

Commit 8665b34

Browse files
Add Feature Descriptions Rebase Clean (#1316)
Co-authored-by: Jan van Rijn <[email protected]>
1 parent b06ecee commit 8665b34

File tree

4 files changed

+101
-0
lines changed

4 files changed

+101
-0
lines changed

openml/datasets/data_feature.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@ class OpenMLDataFeature:
2323
list of the possible values, in case of nominal attribute
2424
number_missing_values : int
2525
Number of rows that have a missing value for this feature.
26+
ontologies : list(str)
27+
list of ontologies attached to this feature. An ontology describes the
28+
concepts that are described in a feature. An ontology is defined by a
29+
URL where the information is provided.
2630
"""
2731

2832
LEGAL_DATA_TYPES: ClassVar[Sequence[str]] = ["nominal", "numeric", "string", "date"]
@@ -34,6 +38,7 @@ def __init__( # noqa: PLR0913
3438
data_type: str,
3539
nominal_values: list[str],
3640
number_missing_values: int,
41+
ontologies: list[str] | None = None,
3742
):
3843
if not isinstance(index, int):
3944
raise TypeError(f"Index must be `int` but is {type(index)}")
@@ -67,6 +72,7 @@ def __init__( # noqa: PLR0913
6772
self.data_type = str(data_type)
6873
self.nominal_values = nominal_values
6974
self.number_missing_values = number_missing_values
75+
self.ontologies = ontologies
7076

7177
def __repr__(self) -> str:
7278
return "[%d - %s (%s)]" % (self.index, self.name, self.data_type)

openml/datasets/dataset.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1069,6 +1069,7 @@ def _parse_features_xml(features_xml_string: str) -> dict[int, OpenMLDataFeature
10691069
xmlfeature["oml:data_type"],
10701070
xmlfeature.get("oml:nominal_value"),
10711071
int(nr_missing),
1072+
xmlfeature.get("oml:ontology"),
10721073
)
10731074
if idx != feature.index:
10741075
raise ValueError("Data features not provided in right order")

openml/datasets/functions.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1061,6 +1061,59 @@ def fork_dataset(data_id: int) -> int:
10611061
return int(data_id)
10621062

10631063

1064+
def data_feature_add_ontology(data_id: int, index: int, ontology: str) -> bool:
    """Attach an ontology (URL) to a single feature of a dataset.

    An ontology describes the concepts that are captured by a feature and is
    identified by a URL where that information is provided. The feature is
    addressed by the dataset id and its index. The dataset has to exist on
    OpenML and needs to have been processed by the evaluation engine.

    Parameters
    ----------
    data_id : int
        id of the dataset to which the feature belongs
    index : int
        index of the feature in dataset (0-based)
    ontology : str
        URL to ontology (max. 256 characters)

    Returns
    -------
    bool
        Always ``True``; an unsuccessful request raises an OpenML server
        exception instead of returning.
    """
    payload: dict[str, int | str] = {
        "data_id": data_id,
        "index": index,
        "ontology": ontology,
    }
    # The response is not inspected: a failed request surfaces as an exception.
    openml._api_calls._perform_api_call(
        "data/feature/ontology/add",
        "post",
        data=payload,
    )
    return True
1089+
1090+
1091+
def data_feature_remove_ontology(data_id: int, index: int, ontology: str) -> bool:
    """Detach an existing ontology (URL) from a single feature of a dataset.

    The feature is addressed by the dataset id and its index. The dataset has
    to exist on OpenML and needs to have been processed by the evaluation
    engine, and the ontology needs to be attached to that specific feature.

    Parameters
    ----------
    data_id : int
        id of the dataset to which the feature belongs
    index : int
        index of the feature in dataset (0-based)
    ontology : str
        URL to ontology (max. 256 characters)

    Returns
    -------
    bool
        Always ``True``; an unsuccessful request raises an OpenML server
        exception instead of returning.
    """
    payload: dict[str, int | str] = {
        "data_id": data_id,
        "index": index,
        "ontology": ontology,
    }
    # The response is not inspected: a failed request surfaces as an exception.
    openml._api_calls._perform_api_call(
        "data/feature/ontology/remove",
        "post",
        data=payload,
    )
    return True
1115+
1116+
10641117
def _topic_add_dataset(data_id: int, topic: str) -> int:
10651118
"""
10661119
Adds a topic for a dataset.

tests/test_datasets/test_dataset.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -330,6 +330,47 @@ def test_tagging(self):
330330
datasets = openml.datasets.list_datasets(tag=tag, output_format="dataframe")
331331
assert datasets.empty
332332

333+
def test_get_feature_with_ontology_data_id_11(self):
    """Features of dataset 11 expose the ontologies attached to them."""
    # Dataset 11 (car dataset per the original test) has built-in ontology references.
    dataset = openml.datasets.get_dataset(11)
    assert len(dataset.features) == 7

    # (feature index, minimum number of ontologies expected on that feature)
    expected_minimums = ((1, 2), (2, 1), (3, 1))
    for feature_index, minimum in expected_minimums:
        assert len(dataset.features[feature_index].ontologies) >= minimum
340+
341+
def test_add_remove_ontology_to_dataset(self):
    """An ontology can be added to a feature and removed again."""
    dataset_id, feature_index = 1, 1
    # The timestamp suffix makes the URL differ between runs.
    url = "https://www.openml.org/unittest/" + str(time())
    ontology_api = openml.datasets.functions
    ontology_api.data_feature_add_ontology(dataset_id, feature_index, url)
    ontology_api.data_feature_remove_ontology(dataset_id, feature_index, url)
347+
348+
def test_add_same_ontology_multiple_features(self):
    """The same ontology URL can be attached to several features of one dataset."""
    dataset_id = 1
    # The timestamp suffix makes the URL differ between runs.
    url = "https://www.openml.org/unittest/" + str(time())

    add_ontology = openml.datasets.functions.data_feature_add_ontology
    for feature_index in (0, 1, 2):
        add_ontology(dataset_id, feature_index, url)
354+
355+
356+
def test_add_illegal_long_ontology(self):
    """Adding an over-long ontology URL must fail with server error code 1105."""
    did = 1
    ontology = 'http://www.google.com/' + ('a' * 257)
    # pytest.raises fails the test when no exception is raised; unlike the
    # previous `assert False` guard it is not stripped when running with -O.
    with pytest.raises(openml.exceptions.OpenMLServerException) as exc_info:
        openml.datasets.functions.data_feature_add_ontology(did, 1, ontology)
    assert exc_info.value.code == 1105
364+
365+
def test_add_illegal_url_ontology(self):
    """Adding an ontology that is not a URL must fail with server error code 1106."""
    did = 1
    ontology = 'not_a_url' + str(time())
    # pytest.raises fails the test when no exception is raised; unlike the
    # previous `assert False` guard it is not stripped when running with -O.
    with pytest.raises(openml.exceptions.OpenMLServerException) as exc_info:
        openml.datasets.functions.data_feature_add_ontology(did, 1, ontology)
    assert exc_info.value.code == 1106
373+
333374
@pytest.mark.production()
334375
class OpenMLDatasetTestSparse(TestBase):
335376
_multiprocess_can_split_ = True

0 commit comments

Comments
 (0)