Skip to content

Commit e42bdff

Browse files
authored
Merge branch 'develop' into fix_608
2 parents b256aec + b660d7d commit e42bdff

File tree

5 files changed

+140
-30
lines changed

5 files changed

+140
-30
lines changed

doc/progress.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ Changelog
99
0.10.0
1010
~~~~~~
1111
* FIX #608: Fixing dataset_id referenced before assignment error in get_run function.
12+
* FIX #589: Fixing a bug that did not successfully upload the columns to ignore when creating and publishing a dataset.
13+
* DOC #639: More descriptive documentation for function to convert array format.
1214
* ADD #687: Adds a function to retrieve the list of evaluation measures available.
1315
* ADD #695: A function to retrieve all the data quality measures available.
1416

@@ -27,6 +29,7 @@ Changelog
2729
* ADD #659: Lazy loading of task splits.
2830
* ADD #516: `run_flow_on_task` flow uploading is now optional.
2931
* ADD #680: Adds `openml.config.start_using_configuration_for_example` (and resp. stop) to easily connect to the test server.
32+
* ADD #75, #653: Adds a pretty print for objects of the top-level classes.
3033
* FIX #642: `check_datasets_active` now correctly also returns active status of deactivated datasets.
3134
* FIX #304, #636: Allow serialization of numpy datatypes and list of lists of more types (e.g. bools, ints) for flows.
3235
* FIX #651: Fixed a bug that would prevent openml-python from finding the user's config file.

openml/datasets/dataset.py

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -132,9 +132,9 @@ def __init__(self, name, description, format=None,
132132
self.default_target_attribute = default_target_attribute
133133
self.row_id_attribute = row_id_attribute
134134
if isinstance(ignore_attribute, str):
135-
self.ignore_attributes = [ignore_attribute]
135+
self.ignore_attribute = [ignore_attribute]
136136
elif isinstance(ignore_attribute, list) or ignore_attribute is None:
137-
self.ignore_attributes = ignore_attribute
137+
self.ignore_attribute = ignore_attribute
138138
else:
139139
raise ValueError('Wrong data type for ignore_attribute. '
140140
'Should be list.')
@@ -472,7 +472,7 @@ def get_data(
472472
self,
473473
target: Optional[Union[List[str], str]] = None,
474474
include_row_id: bool = False,
475-
include_ignore_attributes: bool = False,
475+
include_ignore_attribute: bool = False,
476476
dataset_format: str = "dataframe",
477477
) -> Tuple[
478478
Union[np.ndarray, pd.DataFrame, scipy.sparse.csr_matrix],
@@ -489,7 +489,7 @@ def get_data(
489489
Splitting multiple columns is currently not supported.
490490
include_row_id : boolean (default=False)
491491
Whether to include row ids in the returned dataset.
492-
include_ignore_attributes : boolean (default=False)
492+
include_ignore_attribute : boolean (default=False)
493493
Whether to include columns that are marked as "ignore"
494494
on the server in the dataset.
495495
dataset_format : string (default='dataframe')
@@ -528,11 +528,11 @@ def get_data(
528528
elif isinstance(self.row_id_attribute, Iterable):
529529
to_exclude.extend(self.row_id_attribute)
530530

531-
if not include_ignore_attributes and self.ignore_attributes is not None:
532-
if isinstance(self.ignore_attributes, str):
533-
to_exclude.append(self.ignore_attributes)
534-
elif isinstance(self.ignore_attributes, Iterable):
535-
to_exclude.extend(self.ignore_attributes)
531+
if not include_ignore_attribute and self.ignore_attribute is not None:
532+
if isinstance(self.ignore_attribute, str):
533+
to_exclude.append(self.ignore_attribute)
534+
elif isinstance(self.ignore_attribute, Iterable):
535+
to_exclude.extend(self.ignore_attribute)
536536

537537
if len(to_exclude) > 0:
538538
logger.info("Going to remove the following attributes:"
@@ -615,7 +615,7 @@ def retrieve_class_labels(self, target_name: str = 'class') -> Union[None, List[
615615
return None
616616

617617
def get_features_by_type(self, data_type, exclude=None,
618-
exclude_ignore_attributes=True,
618+
exclude_ignore_attribute=True,
619619
exclude_row_id_attribute=True):
620620
"""
621621
Return indices of features of a given type, e.g. all nominal features.
@@ -628,7 +628,7 @@ def get_features_by_type(self, data_type, exclude=None,
628628
exclude : list(int)
629629
Indices to exclude (and adapt the return values as if these indices
630630
are not present)
631-
exclude_ignore_attributes : bool
631+
exclude_ignore_attribute : bool
632632
Whether to exclude the defined ignore attributes (and adapt the
633633
return values as if these indices are not present)
634634
exclude_row_id_attribute : bool
@@ -642,9 +642,9 @@ def get_features_by_type(self, data_type, exclude=None,
642642
"""
643643
if data_type not in OpenMLDataFeature.LEGAL_DATA_TYPES:
644644
raise TypeError("Illegal feature type requested")
645-
if self.ignore_attributes is not None:
646-
if not isinstance(self.ignore_attributes, list):
647-
raise TypeError("ignore_attributes should be a list")
645+
if self.ignore_attribute is not None:
646+
if not isinstance(self.ignore_attribute, list):
647+
raise TypeError("ignore_attribute should be a list")
648648
if self.row_id_attribute is not None:
649649
if not isinstance(self.row_id_attribute, str):
650650
raise TypeError("row id attribute should be a str")
@@ -656,8 +656,8 @@ def get_features_by_type(self, data_type, exclude=None,
656656
to_exclude = []
657657
if exclude is not None:
658658
to_exclude.extend(exclude)
659-
if exclude_ignore_attributes and self.ignore_attributes is not None:
660-
to_exclude.extend(self.ignore_attributes)
659+
if exclude_ignore_attribute and self.ignore_attribute is not None:
660+
to_exclude.extend(self.ignore_attribute)
661661
if exclude_row_id_attribute and self.row_id_attribute is not None:
662662
to_exclude.append(self.row_id_attribute)
663663

openml/datasets/functions.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -301,10 +301,10 @@ def __list_datasets(api_call, output_format='dict'):
301301

302302
datasets = dict()
303303
for dataset_ in datasets_dict['oml:data']['oml:dataset']:
304-
ignore_attributes = ['oml:file_id', 'oml:quality']
304+
ignore_attribute = ['oml:file_id', 'oml:quality']
305305
dataset = {k.replace('oml:', ''): v
306306
for (k, v) in dataset_.items()
307-
if k not in ignore_attributes}
307+
if k not in ignore_attribute}
308308
dataset['did'] = int(dataset['did'])
309309
dataset['version'] = int(dataset['version'])
310310

tests/test_datasets/test_dataset.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ def test_get_data_with_target_pandas(self):
141141
self.assertNotIn("class", attribute_names)
142142

143143
def test_get_data_rowid_and_ignore_and_target(self):
144-
self.dataset.ignore_attributes = ["condition"]
144+
self.dataset.ignore_attribute = ["condition"]
145145
self.dataset.row_id_attribute = ["hardness"]
146146
X, y, categorical, names = self.dataset.get_data(target="class")
147147
self.assertEqual(X.shape, (898, 36))
@@ -151,15 +151,15 @@ def test_get_data_rowid_and_ignore_and_target(self):
151151
self.assertEqual(y.shape, (898, ))
152152

153153
def test_get_data_with_ignore_attributes(self):
154-
self.dataset.ignore_attributes = ["condition"]
155-
rval, _, categorical, _ = self.dataset.get_data(include_ignore_attributes=True)
154+
self.dataset.ignore_attribute = ["condition"]
155+
rval, _, categorical, _ = self.dataset.get_data(include_ignore_attribute=True)
156156
for (dtype, is_cat) in zip(rval.dtypes, categorical):
157157
expected_type = 'category' if is_cat else 'float64'
158158
self.assertEqual(dtype.name, expected_type)
159159
self.assertEqual(rval.shape, (898, 39))
160160
self.assertEqual(len(categorical), 39)
161161

162-
rval, _, categorical, _ = self.dataset.get_data(include_ignore_attributes=False)
162+
rval, _, categorical, _ = self.dataset.get_data(include_ignore_attribute=False)
163163
for (dtype, is_cat) in zip(rval.dtypes, categorical):
164164
expected_type = 'category' if is_cat else 'float64'
165165
self.assertEqual(dtype.name, expected_type)
@@ -271,17 +271,17 @@ def test_get_sparse_dataset_with_rowid(self):
271271
self.assertEqual(len(categorical), 20000)
272272

273273
def test_get_sparse_dataset_with_ignore_attributes(self):
274-
self.sparse_dataset.ignore_attributes = ["V256"]
274+
self.sparse_dataset.ignore_attribute = ["V256"]
275275
rval, _, categorical, _ = self.sparse_dataset.get_data(
276-
dataset_format='array', include_ignore_attributes=True
276+
dataset_format='array', include_ignore_attribute=True
277277
)
278278
self.assertTrue(sparse.issparse(rval))
279279
self.assertEqual(rval.dtype, np.float32)
280280
self.assertEqual(rval.shape, (600, 20001))
281281

282282
self.assertEqual(len(categorical), 20001)
283283
rval, _, categorical, _ = self.sparse_dataset.get_data(
284-
dataset_format='array', include_ignore_attributes=False
284+
dataset_format='array', include_ignore_attribute=False
285285
)
286286
self.assertTrue(sparse.issparse(rval))
287287
self.assertEqual(rval.dtype, np.float32)
@@ -290,13 +290,13 @@ def test_get_sparse_dataset_with_ignore_attributes(self):
290290

291291
def test_get_sparse_dataset_rowid_and_ignore_and_target(self):
292292
# TODO: re-add row_id and ignore attributes
293-
self.sparse_dataset.ignore_attributes = ["V256"]
293+
self.sparse_dataset.ignore_attribute = ["V256"]
294294
self.sparse_dataset.row_id_attribute = ["V512"]
295295
X, y, categorical, _ = self.sparse_dataset.get_data(
296296
dataset_format='array',
297297
target="class",
298298
include_row_id=False,
299-
include_ignore_attributes=False,
299+
include_ignore_attribute=False,
300300
)
301301
self.assertTrue(sparse.issparse(X))
302302
self.assertEqual(X.dtype, np.float32)

tests/test_datasets/test_dataset_functions.py

Lines changed: 110 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1012,9 +1012,10 @@ def test_ignore_attributes_dataset(self):
10121012
original_data_url=original_data_url,
10131013
paper_url=paper_url
10141014
)
1015-
self.assertEqual(dataset.ignore_attributes, ['outlook'])
1015+
self.assertEqual(dataset.ignore_attribute, ['outlook'])
10161016

10171017
# pass a list to ignore_attribute
1018+
ignore_attribute = ['outlook', 'windy']
10181019
dataset = openml.datasets.functions.create_dataset(
10191020
name=name,
10201021
description=description,
@@ -1025,15 +1026,15 @@ def test_ignore_attributes_dataset(self):
10251026
licence=licence,
10261027
default_target_attribute=default_target_attribute,
10271028
row_id_attribute=None,
1028-
ignore_attribute=['outlook', 'windy'],
1029+
ignore_attribute=ignore_attribute,
10291030
citation=citation,
10301031
attributes='auto',
10311032
data=df,
10321033
version_label='test',
10331034
original_data_url=original_data_url,
10341035
paper_url=paper_url
10351036
)
1036-
self.assertEqual(dataset.ignore_attributes, ['outlook', 'windy'])
1037+
self.assertEqual(dataset.ignore_attribute, ignore_attribute)
10371038

10381039
# raise an error if unknown type
10391040
err_msg = 'Wrong data type for ignore_attribute. Should be list.'
@@ -1057,6 +1058,112 @@ def test_ignore_attributes_dataset(self):
10571058
paper_url=paper_url
10581059
)
10591060

1061+
def test___publish_fetch_ignore_attribute(self):
1062+
"""(Part 1) Test to upload and retrieve dataset and check ignore_attributes
1063+
1064+
DEPENDS on test_publish_fetch_ignore_attribute() to be executed after this
1065+
This test is split into two parts:
1066+
1) test___publish_fetch_ignore_attribute()
1067+
This will be executed earlier, owing to alphabetical sorting.
1068+
This test creates and publish() a dataset and checks for a valid ID.
1069+
2) test_publish_fetch_ignore_attribute()
1070+
This will be executed after test___publish_fetch_ignore_attribute(),
1071+
owing to alphabetical sorting. The time gap is to allow the server
1072+
more time to compute data qualities.
1073+
The dataset ID obtained previously is used to fetch the dataset.
1074+
The retrieved dataset is checked for valid ignore_attributes.
1075+
"""
1076+
# the returned fixt
1077+
data = [
1078+
['a', 'sunny', 85.0, 85.0, 'FALSE', 'no'],
1079+
['b', 'sunny', 80.0, 90.0, 'TRUE', 'no'],
1080+
['c', 'overcast', 83.0, 86.0, 'FALSE', 'yes'],
1081+
['d', 'rainy', 70.0, 96.0, 'FALSE', 'yes'],
1082+
['e', 'rainy', 68.0, 80.0, 'FALSE', 'yes']
1083+
]
1084+
column_names = ['rnd_str', 'outlook', 'temperature', 'humidity',
1085+
'windy', 'play']
1086+
df = pd.DataFrame(data, columns=column_names)
1087+
# enforce the type of each column
1088+
df['outlook'] = df['outlook'].astype('category')
1089+
df['windy'] = df['windy'].astype('bool')
1090+
df['play'] = df['play'].astype('category')
1091+
# meta-information
1092+
name = '%s-pandas_testing_dataset' % self._get_sentinel()
1093+
description = 'Synthetic dataset created from a Pandas DataFrame'
1094+
creator = 'OpenML tester'
1095+
collection_date = '01-01-2018'
1096+
language = 'English'
1097+
licence = 'MIT'
1098+
default_target_attribute = 'play'
1099+
citation = 'None'
1100+
original_data_url = 'http://openml.github.io/openml-python'
1101+
paper_url = 'http://openml.github.io/openml-python'
1102+
1103+
# pass a list to ignore_attribute
1104+
ignore_attribute = ['outlook', 'windy']
1105+
dataset = openml.datasets.functions.create_dataset(
1106+
name=name,
1107+
description=description,
1108+
creator=creator,
1109+
contributor=None,
1110+
collection_date=collection_date,
1111+
language=language,
1112+
licence=licence,
1113+
default_target_attribute=default_target_attribute,
1114+
row_id_attribute=None,
1115+
ignore_attribute=ignore_attribute,
1116+
citation=citation,
1117+
attributes='auto',
1118+
data=df,
1119+
version_label='test',
1120+
original_data_url=original_data_url,
1121+
paper_url=paper_url
1122+
)
1123+
1124+
# publish dataset
1125+
upload_did = dataset.publish()
1126+
# test if publish was successful
1127+
self.assertIsInstance(upload_did, int)
1128+
# variables to carry forward for test_publish_fetch_ignore_attribute()
1129+
self.__class__.test_publish_fetch_ignore_attribute_did = upload_did
1130+
self.__class__.test_publish_fetch_ignore_attribute_list = ignore_attribute
1131+
1132+
def test_publish_fetch_ignore_attribute(self):
1133+
"""(Part 2) Test to upload and retrieve dataset and check ignore_attributes
1134+
1135+
DEPENDS on test___publish_fetch_ignore_attribute() to be executed first
1136+
This will be executed after test___publish_fetch_ignore_attribute(),
1137+
owing to alphabetical sorting. The time gap is to allow the server
1138+
more time to compute data qualities.
1139+
The dataset ID obtained previously is used to fetch the dataset.
1140+
The retrieved dataset is checked for valid ignore_attributes.
1141+
"""
1142+
# Retrieving variables from test___publish_fetch_ignore_attribute()
1143+
upload_did = self.__class__.test_publish_fetch_ignore_attribute_did
1144+
ignore_attribute = self.__class__.test_publish_fetch_ignore_attribute_list
1145+
trials = 1
1146+
timeout_limit = 200
1147+
dataset = None
1148+
# fetching from server
1149+
# loop till timeout or fetch not successful
1150+
while True:
1151+
if trials > timeout_limit:
1152+
break
1153+
try:
1154+
dataset = openml.datasets.get_dataset(upload_did)
1155+
break
1156+
except Exception as e:
1157+
# returned code 273: Dataset not processed yet
1158+
# returned code 362: No qualities found
1159+
print("Trial {}/{}: ".format(trials, timeout_limit))
1160+
print("\tFailed to fetch dataset:{} with '{}'.".format(upload_did, str(e)))
1161+
trials += 1
1162+
continue
1163+
if dataset is None:
1164+
raise ValueError("TIMEOUT: Failed to fetch uploaded dataset - {}".format(upload_did))
1165+
self.assertEqual(dataset.ignore_attribute, ignore_attribute)
1166+
10601167
def test_create_dataset_row_id_attribute_error(self):
10611168
# meta-information
10621169
name = '%s-pandas_testing_dataset' % self._get_sentinel()

0 commit comments

Comments
 (0)