@@ -1012,9 +1012,10 @@ def test_ignore_attributes_dataset(self):
10121012 original_data_url = original_data_url ,
10131013 paper_url = paper_url
10141014 )
1015- self .assertEqual (dataset .ignore_attributes , ['outlook' ])
1015+ self .assertEqual (dataset .ignore_attribute , ['outlook' ])
10161016
10171017 # pass a list to ignore_attribute
1018+ ignore_attribute = ['outlook' , 'windy' ]
10181019 dataset = openml .datasets .functions .create_dataset (
10191020 name = name ,
10201021 description = description ,
@@ -1025,15 +1026,15 @@ def test_ignore_attributes_dataset(self):
10251026 licence = licence ,
10261027 default_target_attribute = default_target_attribute ,
10271028 row_id_attribute = None ,
1028- ignore_attribute = [ 'outlook' , 'windy' ] ,
1029+ ignore_attribute = ignore_attribute ,
10291030 citation = citation ,
10301031 attributes = 'auto' ,
10311032 data = df ,
10321033 version_label = 'test' ,
10331034 original_data_url = original_data_url ,
10341035 paper_url = paper_url
10351036 )
1036- self .assertEqual (dataset .ignore_attributes , [ 'outlook' , 'windy' ] )
1037+ self .assertEqual (dataset .ignore_attribute , ignore_attribute )
10371038
10381039 # raise an error if unknown type
10391040 err_msg = 'Wrong data type for ignore_attribute. Should be list.'
@@ -1057,6 +1058,112 @@ def test_ignore_attributes_dataset(self):
10571058 paper_url = paper_url
10581059 )
10591060
def test___publish_fetch_ignore_attribute(self):
    """(Part 1) Create and publish a dataset that declares ignore_attribute.

    This is the first half of a two-part test and must run BEFORE
    test_publish_fetch_ignore_attribute():
        1) test___publish_fetch_ignore_attribute() (this test)
            Expected to be executed earlier, owing to alphabetical sorting.
            It builds a dataset, calls publish() and checks that an integer
            ID is returned.
        2) test_publish_fetch_ignore_attribute()
            Expected to be executed later, owing to alphabetical sorting.
            The time gap is meant to give the server more time to compute
            data qualities. It fetches the dataset by the ID stored here and
            checks its ignore_attribute.

    The dataset ID and the ignore_attribute list are stored as class
    attributes so that part 2 can read them.
    """
    # small synthetic weather table with an extra random-string column
    column_names = ['rnd_str', 'outlook', 'temperature', 'humidity',
                    'windy', 'play']
    rows = [
        ['a', 'sunny', 85.0, 85.0, 'FALSE', 'no'],
        ['b', 'sunny', 80.0, 90.0, 'TRUE', 'no'],
        ['c', 'overcast', 83.0, 86.0, 'FALSE', 'yes'],
        ['d', 'rainy', 70.0, 96.0, 'FALSE', 'yes'],
        ['e', 'rainy', 68.0, 80.0, 'FALSE', 'yes'],
    ]
    df = pd.DataFrame(rows, columns=column_names)

    # enforce the type of each column
    # NOTE(review): casting the 'TRUE'/'FALSE' strings with astype('bool')
    # presumably yields True for every row (non-empty strings are truthy)
    # -- confirm this is intended for the fixture.
    column_dtypes = (
        ('outlook', 'category'),
        ('windy', 'bool'),
        ('play', 'category'),
    )
    for column, dtype in column_dtypes:
        df[column] = df[column].astype(dtype)

    # the list of attributes the dataset should declare as ignored
    ignore_attribute = ['outlook', 'windy']

    # meta-information
    project_url = 'http://openml.github.io/openml-python'
    metadata = {
        'name': '%s-pandas_testing_dataset' % self._get_sentinel(),
        'description': 'Synthetic dataset created from a Pandas DataFrame',
        'creator': 'OpenML tester',
        'contributor': None,
        'collection_date': '01-01-2018',
        'language': 'English',
        'licence': 'MIT',
        'default_target_attribute': 'play',
        'row_id_attribute': None,
        'ignore_attribute': ignore_attribute,
        'citation': 'None',
        'attributes': 'auto',
        'data': df,
        'version_label': 'test',
        'original_data_url': project_url,
        'paper_url': project_url,
    }
    dataset = openml.datasets.functions.create_dataset(**metadata)

    # publish the dataset and make sure an integer ID came back
    upload_did = dataset.publish()
    self.assertIsInstance(upload_did, int)

    # variables to carry forward for test_publish_fetch_ignore_attribute()
    test_class = self.__class__
    test_class.test_publish_fetch_ignore_attribute_did = upload_did
    test_class.test_publish_fetch_ignore_attribute_list = ignore_attribute
1131+
def test_publish_fetch_ignore_attribute(self):
    """(Part 2) Fetch the dataset published in part 1 and check ignore_attribute.

    DEPENDS on test___publish_fetch_ignore_attribute() having been executed
    first; this test is expected to run afterwards, owing to alphabetical
    sorting. The time gap is meant to give the server more time to compute
    data qualities.

    The dataset ID stored on the test class by part 1 is used to fetch the
    dataset, retrying with a pause between attempts. The retrieved dataset's
    ignore_attribute is then compared with the list that was uploaded.

    Raises
    ------
    ValueError
        If the dataset could not be fetched within the allowed number of
        trials.
    """
    # Function-scope import so the retry pause does not depend on the
    # file's top-level imports.
    import time

    # Retrieving variables from test___publish_fetch_ignore_attribute().
    # getattr with a default turns a missing prerequisite into an explicit
    # failure instead of an opaque AttributeError.
    test_class = self.__class__
    upload_did = getattr(
        test_class, 'test_publish_fetch_ignore_attribute_did', None)
    ignore_attribute = getattr(
        test_class, 'test_publish_fetch_ignore_attribute_list', None)
    if upload_did is None:
        self.fail("test___publish_fetch_ignore_attribute() did not store an "
                  "uploaded dataset ID; it must run (and pass) before this test")

    timeout_limit = 200
    # seconds to wait between attempts, so the server is not hit with
    # back-to-back requests while it is still processing the upload
    retry_delay = 5
    dataset = None
    # fetching from server: stop at the first success or after
    # timeout_limit attempts
    for trial in range(1, timeout_limit + 1):
        try:
            dataset = openml.datasets.get_dataset(upload_did)
            break
        except Exception as e:
            # Deliberately broad: any failure is treated as "not ready yet".
            # returned code 273: Dataset not processed yet
            # returned code 362: No qualities found
            print("Trial {}/{}: ".format(trial, timeout_limit))
            print("\t Failed to fetch dataset:{} with '{}'.".format(upload_did, str(e)))
            # no point in sleeping after the final attempt
            if trial < timeout_limit:
                time.sleep(retry_delay)
    if dataset is None:
        raise ValueError("TIMEOUT: Failed to fetch uploaded dataset - {}".format(upload_did))
    self.assertEqual(dataset.ignore_attribute, ignore_attribute)
1166+
10601167 def test_create_dataset_row_id_attribute_error (self ):
10611168 # meta-information
10621169 name = '%s-pandas_testing_dataset' % self ._get_sentinel ()
0 commit comments