
Commit 696db49

glemaitre authored and mfeurer committed
[MRG] EHN: inferred row_id_attribute from dataframe to create a dataset (#586)
* EHN: inferred row_id_attribute from dataframe to create a dataset
* reset the index of dataframe after inference
* TST: check the size of the dataset
* PEP8
* TST: check that an error is raised when row_id_attributes is not a known attribute
* DOC: Update the docstring
* PEP8
1 parent 6c75554 commit 696db49
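
The change in a nutshell: `create_dataset` no longer requires `row_id_attribute`; when `data` is a pandas dataframe and the argument is omitted, the named dataframe index is used instead. A minimal usage sketch of that behaviour, assembled from the test fixtures in the diff below (the metadata strings are placeholders and a configured OpenML server is assumed for publishing):

    import pandas as pd
    import openml

    # Dataframe whose index carries the row identifier; naming the index is
    # what triggers the inference added by this commit.
    df = pd.DataFrame({'rnd_str': ['a', 'b', 'c'],
                       'integer': [1, 2, 3],
                       'target': [0, 1, 0]})
    df.index.name = 'index_name'

    dataset = openml.datasets.functions.create_dataset(
        name='Pandas_testing_dataset',
        description='Synthetic dataset created from a Pandas DataFrame',
        creator='OpenML tester',
        contributor=None,
        collection_date='01-01-2018',
        language='English',
        licence='MIT',
        default_target_attribute='target',
        ignore_attribute=None,
        citation='None',
        attributes='auto',
        data=df,
        row_id_attribute=None,  # omitted on purpose: inferred from df.index.name
        format=None,
        version_label='test',
        original_data_url='http://openml.github.io/openml-python',
        paper_url='http://openml.github.io/openml-python',
    )
    print(dataset.row_id_attribute)  # 'index_name'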

File tree

2 files changed: +134 -9 lines changed


openml/datasets/functions.py

Lines changed: 35 additions & 9 deletions
@@ -417,8 +417,9 @@ def attributes_arff_from_df(df):
 def create_dataset(name, description, creator, contributor,
                    collection_date, language,
                    licence, attributes, data,
-                   default_target_attribute, row_id_attribute,
-                   ignore_attribute, citation, format=None,
+                   default_target_attribute,
+                   ignore_attribute, citation,
+                   row_id_attribute=None, format=None,
                    original_data_url=None, paper_url=None,
                    update_comment=None, version_label=None):
     """Create a dataset.
@@ -433,11 +434,6 @@ def create_dataset(name, description, creator, contributor,
         Name of the dataset.
     description : str
         Description of the dataset.
-    format : str, optional
-        Format of the dataset which can be either 'arff' or 'sparse_arff'.
-        By default, the format is automatically inferred.
-        .. deprecated: 0.8
-            ``format`` is deprecated in 0.8 and will be removed in 0.10.
     creator : str
         The person who created the dataset.
     contributor : str
@@ -463,14 +459,25 @@ def create_dataset(name, description, creator, contributor,
     default_target_attribute : str
         The default target attribute, if it exists.
         Can have multiple values, comma separated.
-    row_id_attribute : str
-        The attribute that represents the row-id column, if present in the dataset.
     ignore_attribute : str | list
         Attributes that should be excluded in modelling, such as identifiers and indexes.
     citation : str
         Reference(s) that should be cited when building on this data.
     version_label : str, optional
         Version label provided by user, can be a date, hash, or some other type of id.
+    row_id_attribute : str, optional
+        The attribute that represents the row-id column, if present in the
+        dataset. If ``data`` is a dataframe and ``row_id_attribute`` is not
+        specified, the index of the dataframe will be used as the
+        ``row_id_attribute``. If the name of the index is ``None``, it will
+        be discarded.
+        .. versionadded: 0.8
+            Inference of ``row_id_attribute`` from a dataframe.
+    format : str, optional
+        Format of the dataset which can be either 'arff' or 'sparse_arff'.
+        By default, the format is automatically inferred.
+        .. deprecated: 0.8
+            ``format`` is deprecated in 0.8 and will be removed in 0.10.
     original_data_url : str, optional
         For derived data, the url to the original dataset.
     paper_url : str, optional
@@ -483,6 +490,15 @@ def create_dataset(name, description, creator, contributor,
     class:`openml.OpenMLDataset`
         Dataset description."""

+    if isinstance(data, (pd.DataFrame, pd.SparseDataFrame)):
+        # infer the row id from the index of the dataset
+        if row_id_attribute is None:
+            row_id_attribute = data.index.name
+        # When calling data.values, the index will be skipped. We need to reset
+        # the index such that it is part of the data.
+        if data.index.name is not None:
+            data = data.reset_index()
+
     if attributes == 'auto' or isinstance(attributes, dict):
         if not hasattr(data, "columns"):
             raise ValueError("Automatically inferring the attributes required "
@@ -499,6 +515,16 @@ def create_dataset(name, description, creator, contributor,
     else:
         attributes_ = attributes

+    if row_id_attribute is not None:
+        is_row_id_an_attribute = any([attr[0] == row_id_attribute
+                                      for attr in attributes_])
+        if not is_row_id_an_attribute:
+            raise ValueError(
+                "'row_id_attribute' should be one of the data attribute. "
+                " Got '{}' while candidates are {}."
+                .format(row_id_attribute, [attr[0] for attr in attributes_])
+            )
+
     data = data.values if hasattr(data, "columns") else data

     if format is not None:
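
In isolation, the inference block added above amounts to the following pandas-only sketch (hypothetical column names, no openml calls): an explicit `row_id_attribute` is left untouched, a named index is promoted to a regular column so it survives the later `data.values` call, and the resulting name is then validated against the inferred attribute list (raising ValueError if it is not one of them).

    import pandas as pd

    df = pd.DataFrame({'integer': [1, 2, 3], 'target': [0, 1, 0]})
    df.index.name = 'index_name'

    row_id_attribute = df.index.name   # inferred: 'index_name'
    df = df.reset_index()              # the named index becomes a regular column
    print(list(df.columns))            # ['index_name', 'integer', 'target']
    print(df.values.shape)             # (3, 3): the row id survives the .values call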

tests/test_datasets/test_dataset_functions.py

Lines changed: 99 additions & 0 deletions
@@ -2,6 +2,7 @@
 import os
 import sys
 import random
+from itertools import product
 if sys.version_info[0] >= 3:
     from unittest import mock
 else:
@@ -803,6 +804,104 @@ def test_create_dataset_pandas(self):
         self.assertTrue(
             '@ATTRIBUTE rnd_str {a, b, c, d, e, f, g}' in downloaded_data)

+    def test_create_dataset_row_id_attribute_error(self):
+        # meta-information
+        name = 'Pandas_testing_dataset'
+        description = 'Synthetic dataset created from a Pandas DataFrame'
+        creator = 'OpenML tester'
+        collection_date = '01-01-2018'
+        language = 'English'
+        licence = 'MIT'
+        default_target_attribute = 'target'
+        citation = 'None'
+        original_data_url = 'http://openml.github.io/openml-python'
+        paper_url = 'http://openml.github.io/openml-python'
+        # Check that the index name is well inferred.
+        data = [['a', 1, 0],
+                ['b', 2, 1],
+                ['c', 3, 0],
+                ['d', 4, 1],
+                ['e', 5, 0]]
+        column_names = ['rnd_str', 'integer', 'target']
+        df = pd.DataFrame(data, columns=column_names)
+        # affecting row_id_attribute to an unknown column should raise an error
+        err_msg = ("should be one of the data attribute.")
+        with pytest.raises(ValueError, match=err_msg):
+            openml.datasets.functions.create_dataset(
+                name=name,
+                description=description,
+                creator=creator,
+                contributor=None,
+                collection_date=collection_date,
+                language=language,
+                licence=licence,
+                default_target_attribute=default_target_attribute,
+                ignore_attribute=None,
+                citation=citation,
+                attributes='auto',
+                data=df,
+                row_id_attribute='unknown_row_id',
+                format=None,
+                version_label='test',
+                original_data_url=original_data_url,
+                paper_url=paper_url
+            )
+
+    def test_create_dataset_row_id_attribute_inference(self):
+        # meta-information
+        name = 'Pandas_testing_dataset'
+        description = 'Synthetic dataset created from a Pandas DataFrame'
+        creator = 'OpenML tester'
+        collection_date = '01-01-2018'
+        language = 'English'
+        licence = 'MIT'
+        default_target_attribute = 'target'
+        citation = 'None'
+        original_data_url = 'http://openml.github.io/openml-python'
+        paper_url = 'http://openml.github.io/openml-python'
+        # Check that the index name is well inferred.
+        data = [['a', 1, 0],
+                ['b', 2, 1],
+                ['c', 3, 0],
+                ['d', 4, 1],
+                ['e', 5, 0]]
+        column_names = ['rnd_str', 'integer', 'target']
+        df = pd.DataFrame(data, columns=column_names)
+        row_id_attr = [None, 'integer']
+        df_index_name = [None, 'index_name']
+        expected_row_id = [None, 'index_name', 'integer', 'integer']
+        for output_row_id, (row_id, index_name) in zip(expected_row_id,
+                                                       product(row_id_attr,
+                                                               df_index_name)):
+            df.index.name = index_name
+            dataset = openml.datasets.functions.create_dataset(
+                name=name,
+                description=description,
+                creator=creator,
+                contributor=None,
+                collection_date=collection_date,
+                language=language,
+                licence=licence,
+                default_target_attribute=default_target_attribute,
+                ignore_attribute=None,
+                citation=citation,
+                attributes='auto',
+                data=df,
+                row_id_attribute=row_id,
+                format=None,
+                version_label='test',
+                original_data_url=original_data_url,
+                paper_url=paper_url
+            )
+            self.assertEqual(dataset.row_id_attribute, output_row_id)
+            upload_did = dataset.publish()
+            arff_dataset = arff.loads(_get_online_dataset_arff(upload_did))
+            arff_data = np.array(arff_dataset['data'], dtype=object)
+            # if we set the name of the index then the index will be added to
+            # the data
+            expected_shape = (5, 3) if index_name is None else (5, 4)
+            self.assertEqual(arff_data.shape, expected_shape)
+
     def test_create_dataset_attributes_auto_without_df(self):
         # attributes cannot be inferred without passing a dataframe
         data = np.array([[1, 2, 3],
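
For reference, the zip/product pairing in test_create_dataset_row_id_attribute_inference walks through four cases; spelled out as a standalone sketch (values copied from the test), it shows that an explicit row_id_attribute always wins, an inferred index name is used otherwise, and an unnamed index yields no row id:

    from itertools import product

    row_id_attr = [None, 'integer']
    df_index_name = [None, 'index_name']
    expected_row_id = [None, 'index_name', 'integer', 'integer']

    for expected, (row_id, index_name) in zip(expected_row_id,
                                              product(row_id_attr, df_index_name)):
        # prints: (None, None) -> None, (None, 'index_name') -> 'index_name',
        #         ('integer', None) -> 'integer', ('integer', 'index_name') -> 'integer'
        print(row_id, index_name, '->', expected)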
