Merge pull request #710 from openml/fix_589

PGijsbers · web-flow · commit b660d7d54055 · 2019-06-25T14:19:06.000+02:00
Fixing ignoring of features on upload
diff --git a/doc/progress.rst b/doc/progress.rst
@@ -8,7 +8,8 @@ Changelog
 
 0.10.0
 ~~~~~~
-
+* FIX #589: Fixes a bug where the columns to ignore were not uploaded when creating and publishing a dataset.
+* DOC #639: More descriptive documentation for function to convert array format.
 * ADD #687: Adds a function to retrieve the list of evaluation measures available.
 * ADD #695: A function to retrieve all the data quality measures available.
 
@@ -27,6 +28,7 @@ Changelog
 * ADD #659: Lazy loading of task splits.
 * ADD #516: `run_flow_on_task` flow uploading is now optional.
 * ADD #680: Adds `openml.config.start_using_configuration_for_example` (and resp. stop) to easily connect to the test server.
+* ADD #75, #653: Adds a pretty print for objects of the top-level classes.
 * FIX #642: `check_datasets_active` now correctly also returns active status of deactivated datasets.
 * FIX #304, #636: Allow serialization of numpy datatypes and list of lists of more types (e.g. bools, ints) for flows.
 * FIX #651: Fixed a bug that would prevent openml-python from finding the user's config file.
diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py
@@ -132,9 +132,9 @@ def __init__(self, name, description, format=None,
         self.default_target_attribute = default_target_attribute
         self.row_id_attribute = row_id_attribute
         if isinstance(ignore_attribute, str):
-            self.ignore_attributes = [ignore_attribute]
+            self.ignore_attribute = [ignore_attribute]
         elif isinstance(ignore_attribute, list) or ignore_attribute is None:
-            self.ignore_attributes = ignore_attribute
+            self.ignore_attribute = ignore_attribute
         else:
             raise ValueError('Wrong data type for ignore_attribute. '
                              'Should be list.')
@@ -472,7 +472,7 @@ def get_data(
             self,
             target: Optional[Union[List[str], str]] = None,
             include_row_id: bool = False,
-            include_ignore_attributes: bool = False,
+            include_ignore_attribute: bool = False,
             dataset_format: str = "dataframe",
     ) -> Tuple[
             Union[np.ndarray, pd.DataFrame, scipy.sparse.csr_matrix],
@@ -489,7 +489,7 @@ def get_data(
             Splitting multiple columns is currently not supported.
         include_row_id : boolean (default=False)
             Whether to include row ids in the returned dataset.
-        include_ignore_attributes : boolean (default=False)
+        include_ignore_attribute : boolean (default=False)
             Whether to include columns that are marked as "ignore"
             on the server in the dataset.
         dataset_format : string (default='dataframe')
@@ -528,11 +528,11 @@ def get_data(
             elif isinstance(self.row_id_attribute, Iterable):
                 to_exclude.extend(self.row_id_attribute)
 
-        if not include_ignore_attributes and self.ignore_attributes is not None:
-            if isinstance(self.ignore_attributes, str):
-                to_exclude.append(self.ignore_attributes)
-            elif isinstance(self.ignore_attributes, Iterable):
-                to_exclude.extend(self.ignore_attributes)
+        if not include_ignore_attribute and self.ignore_attribute is not None:
+            if isinstance(self.ignore_attribute, str):
+                to_exclude.append(self.ignore_attribute)
+            elif isinstance(self.ignore_attribute, Iterable):
+                to_exclude.extend(self.ignore_attribute)
 
         if len(to_exclude) > 0:
             logger.info("Going to remove the following attributes:"
@@ -615,7 +615,7 @@ def retrieve_class_labels(self, target_name: str = 'class') -> Union[None, List[
         return None
 
     def get_features_by_type(self, data_type, exclude=None,
-                             exclude_ignore_attributes=True,
+                             exclude_ignore_attribute=True,
                              exclude_row_id_attribute=True):
         """
         Return indices of features of a given type, e.g. all nominal features.
@@ -628,7 +628,7 @@ def get_features_by_type(self, data_type, exclude=None,
         exclude : list(int)
             Indices to exclude (and adapt the return values as if these indices
                         are not present)
-        exclude_ignore_attributes : bool
+        exclude_ignore_attribute : bool
             Whether to exclude the defined ignore attributes (and adapt the
             return values as if these indices are not present)
         exclude_row_id_attribute : bool
@@ -642,9 +642,9 @@ def get_features_by_type(self, data_type, exclude=None,
         """
         if data_type not in OpenMLDataFeature.LEGAL_DATA_TYPES:
             raise TypeError("Illegal feature type requested")
-        if self.ignore_attributes is not None:
-            if not isinstance(self.ignore_attributes, list):
-                raise TypeError("ignore_attributes should be a list")
+        if self.ignore_attribute is not None:
+            if not isinstance(self.ignore_attribute, list):
+                raise TypeError("ignore_attribute should be a list")
         if self.row_id_attribute is not None:
             if not isinstance(self.row_id_attribute, str):
                 raise TypeError("row id attribute should be a str")
@@ -656,8 +656,8 @@ def get_features_by_type(self, data_type, exclude=None,
         to_exclude = []
         if exclude is not None:
             to_exclude.extend(exclude)
-        if exclude_ignore_attributes and self.ignore_attributes is not None:
-            to_exclude.extend(self.ignore_attributes)
+        if exclude_ignore_attribute and self.ignore_attribute is not None:
+            to_exclude.extend(self.ignore_attribute)
         if exclude_row_id_attribute and self.row_id_attribute is not None:
             to_exclude.append(self.row_id_attribute)
 
diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py
@@ -301,10 +301,10 @@ def __list_datasets(api_call, output_format='dict'):
 
     datasets = dict()
     for dataset_ in datasets_dict['oml:data']['oml:dataset']:
-        ignore_attributes = ['oml:file_id', 'oml:quality']
+        ignore_attribute = ['oml:file_id', 'oml:quality']
         dataset = {k.replace('oml:', ''): v
                    for (k, v) in dataset_.items()
-                   if k not in ignore_attributes}
+                   if k not in ignore_attribute}
         dataset['did'] = int(dataset['did'])
         dataset['version'] = int(dataset['version'])
 
diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py
@@ -141,7 +141,7 @@ def test_get_data_with_target_pandas(self):
         self.assertNotIn("class", attribute_names)
 
     def test_get_data_rowid_and_ignore_and_target(self):
-        self.dataset.ignore_attributes = ["condition"]
+        self.dataset.ignore_attribute = ["condition"]
         self.dataset.row_id_attribute = ["hardness"]
         X, y, categorical, names = self.dataset.get_data(target="class")
         self.assertEqual(X.shape, (898, 36))
@@ -151,15 +151,15 @@ def test_get_data_rowid_and_ignore_and_target(self):
         self.assertEqual(y.shape, (898, ))
 
     def test_get_data_with_ignore_attributes(self):
-        self.dataset.ignore_attributes = ["condition"]
-        rval, _, categorical, _ = self.dataset.get_data(include_ignore_attributes=True)
+        self.dataset.ignore_attribute = ["condition"]
+        rval, _, categorical, _ = self.dataset.get_data(include_ignore_attribute=True)
         for (dtype, is_cat) in zip(rval.dtypes, categorical):
             expected_type = 'category' if is_cat else 'float64'
             self.assertEqual(dtype.name, expected_type)
         self.assertEqual(rval.shape, (898, 39))
         self.assertEqual(len(categorical), 39)
 
-        rval, _, categorical, _ = self.dataset.get_data(include_ignore_attributes=False)
+        rval, _, categorical, _ = self.dataset.get_data(include_ignore_attribute=False)
         for (dtype, is_cat) in zip(rval.dtypes, categorical):
             expected_type = 'category' if is_cat else 'float64'
             self.assertEqual(dtype.name, expected_type)
@@ -271,17 +271,17 @@ def test_get_sparse_dataset_with_rowid(self):
         self.assertEqual(len(categorical), 20000)
 
     def test_get_sparse_dataset_with_ignore_attributes(self):
-        self.sparse_dataset.ignore_attributes = ["V256"]
+        self.sparse_dataset.ignore_attribute = ["V256"]
         rval, _, categorical, _ = self.sparse_dataset.get_data(
-            dataset_format='array', include_ignore_attributes=True
+            dataset_format='array', include_ignore_attribute=True
         )
         self.assertTrue(sparse.issparse(rval))
         self.assertEqual(rval.dtype, np.float32)
         self.assertEqual(rval.shape, (600, 20001))
 
         self.assertEqual(len(categorical), 20001)
         rval, _, categorical, _ = self.sparse_dataset.get_data(
-            dataset_format='array', include_ignore_attributes=False
+            dataset_format='array', include_ignore_attribute=False
         )
         self.assertTrue(sparse.issparse(rval))
         self.assertEqual(rval.dtype, np.float32)
@@ -290,13 +290,13 @@ def test_get_sparse_dataset_with_ignore_attributes(self):
 
     def test_get_sparse_dataset_rowid_and_ignore_and_target(self):
         # TODO: re-add row_id and ignore attributes
-        self.sparse_dataset.ignore_attributes = ["V256"]
+        self.sparse_dataset.ignore_attribute = ["V256"]
         self.sparse_dataset.row_id_attribute = ["V512"]
         X, y, categorical, _ = self.sparse_dataset.get_data(
             dataset_format='array',
             target="class",
             include_row_id=False,
-            include_ignore_attributes=False,
+            include_ignore_attribute=False,
         )
         self.assertTrue(sparse.issparse(X))
         self.assertEqual(X.dtype, np.float32)
diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py
@@ -1012,9 +1012,10 @@ def test_ignore_attributes_dataset(self):
             original_data_url=original_data_url,
             paper_url=paper_url
         )
-        self.assertEqual(dataset.ignore_attributes, ['outlook'])
+        self.assertEqual(dataset.ignore_attribute, ['outlook'])
 
         # pass a list to ignore_attribute
+        ignore_attribute = ['outlook', 'windy']
         dataset = openml.datasets.functions.create_dataset(
             name=name,
             description=description,
@@ -1025,15 +1026,15 @@ def test_ignore_attributes_dataset(self):
             licence=licence,
             default_target_attribute=default_target_attribute,
             row_id_attribute=None,
-            ignore_attribute=['outlook', 'windy'],
+            ignore_attribute=ignore_attribute,
             citation=citation,
             attributes='auto',
             data=df,
             version_label='test',
             original_data_url=original_data_url,
             paper_url=paper_url
         )
-        self.assertEqual(dataset.ignore_attributes, ['outlook', 'windy'])
+        self.assertEqual(dataset.ignore_attribute, ignore_attribute)
 
         # raise an error if unknown type
         err_msg = 'Wrong data type for ignore_attribute. Should be list.'
@@ -1057,6 +1058,112 @@ def test_ignore_attributes_dataset(self):
                 paper_url=paper_url
             )
 
+    def test___publish_fetch_ignore_attribute(self):
+        """(Part 1) Test to upload and retrieve dataset and check ignore_attribute
+
+        DEPENDS on test_publish_fetch_ignore_attribute() to be executed after this.
+        This test is split into two parts:
+        1) test___publish_fetch_ignore_attribute()
+            This will be executed earlier, owing to alphabetical sorting.
+            This test creates and publishes a dataset and checks the ID is an int.
+        2) test_publish_fetch_ignore_attribute()
+            This will be executed after test___publish_fetch_ignore_attribute(),
+            owing to alphabetical sorting. The time gap is to allow the server
+            more time to compute data qualities.
+            The dataset ID obtained previously is used to fetch the dataset.
+            The retrieved dataset is checked for the expected ignore_attribute.
+        """
+        # rows of a small toy dataset; the column names are given below
+        data = [
+            ['a', 'sunny', 85.0, 85.0, 'FALSE', 'no'],
+            ['b', 'sunny', 80.0, 90.0, 'TRUE', 'no'],
+            ['c', 'overcast', 83.0, 86.0, 'FALSE', 'yes'],
+            ['d', 'rainy', 70.0, 96.0, 'FALSE', 'yes'],
+            ['e', 'rainy', 68.0, 80.0, 'FALSE', 'yes']
+        ]
+        column_names = ['rnd_str', 'outlook', 'temperature', 'humidity',
+                        'windy', 'play']
+        df = pd.DataFrame(data, columns=column_names)
+        # enforce the type of each column
+        df['outlook'] = df['outlook'].astype('category')
+        df['windy'] = df['windy'].astype('bool')
+        df['play'] = df['play'].astype('category')
+        # meta-information
+        name = '%s-pandas_testing_dataset' % self._get_sentinel()
+        description = 'Synthetic dataset created from a Pandas DataFrame'
+        creator = 'OpenML tester'
+        collection_date = '01-01-2018'
+        language = 'English'
+        licence = 'MIT'
+        default_target_attribute = 'play'
+        citation = 'None'
+        original_data_url = 'http://openml.github.io/openml-python'
+        paper_url = 'http://openml.github.io/openml-python'
+
+        # pass a list to ignore_attribute
+        ignore_attribute = ['outlook', 'windy']
+        dataset = openml.datasets.functions.create_dataset(
+            name=name,
+            description=description,
+            creator=creator,
+            contributor=None,
+            collection_date=collection_date,
+            language=language,
+            licence=licence,
+            default_target_attribute=default_target_attribute,
+            row_id_attribute=None,
+            ignore_attribute=ignore_attribute,
+            citation=citation,
+            attributes='auto',
+            data=df,
+            version_label='test',
+            original_data_url=original_data_url,
+            paper_url=paper_url
+        )
+
+        # publish dataset
+        upload_did = dataset.publish()
+        # test if publish was successful
+        self.assertIsInstance(upload_did, int)
+        # variables to carry forward for test_publish_fetch_ignore_attribute()
+        self.__class__.test_publish_fetch_ignore_attribute_did = upload_did
+        self.__class__.test_publish_fetch_ignore_attribute_list = ignore_attribute
+
+    def test_publish_fetch_ignore_attribute(self):
+        """(Part 2) Test to upload and retrieve dataset and check ignore_attribute
+
+        DEPENDS on test___publish_fetch_ignore_attribute() to be executed first.
+        This will be executed after test___publish_fetch_ignore_attribute(),
+        owing to alphabetical sorting. The time gap is to allow the server
+        more time to compute data qualities.
+        The dataset ID obtained previously is used to fetch the dataset.
+        The retrieved dataset is checked for the expected ignore_attribute.
+        """
+        # Retrieving variables from test___publish_fetch_ignore_attribute()
+        upload_did = self.__class__.test_publish_fetch_ignore_attribute_did
+        ignore_attribute = self.__class__.test_publish_fetch_ignore_attribute_list
+        trials = 1
+        timeout_limit = 200
+        dataset = None
+        # fetching from server
+        # retry until the fetch succeeds or the trial limit is exceeded (no delay between trials)
+        while True:
+            if trials > timeout_limit:
+                break
+            try:
+                dataset = openml.datasets.get_dataset(upload_did)
+                break
+            except Exception as e:
+                # returned code 273: Dataset not processed yet
+                # returned code 362: No qualities found
+                print("Trial {}/{}: ".format(trials, timeout_limit))
+                print("\tFailed to fetch dataset:{} with '{}'.".format(upload_did, str(e)))
+                trials += 1
+                continue
+        if dataset is None:
+            raise ValueError("TIMEOUT: Failed to fetch uploaded dataset - {}".format(upload_did))
+        self.assertEqual(dataset.ignore_attribute, ignore_attribute)
+
     def test_create_dataset_row_id_attribute_error(self):
         # meta-information
         name = '%s-pandas_testing_dataset' % self._get_sentinel()