[MRG] EHN: support SparseDataFrame when creating a dataset (#583)

glemaitre · mfeurer · commit b9035c44ce78 · 2018-11-16T16:41:41.000+01:00
* EHN: support SparseDataFrame when creating a dataset * TST: check attributes inference dtype * PEP8 * EXA: add sparse dataframe in the example * Fix typos. * Fix typo. * Refactoring task.py (#588) * [MRG] EHN: inferred row_id_attribute from dataframe to create a dataset (#586) * EHN: inferred row_id_attribute from dataframe to create a dataset * reset the index of dataframe after inference * TST: check the size of the dataset * PEP8 * TST: check that an error is raised when row_id_attributes is not a known attribute * DOC: Update the docstring * PEP8 * add examples to the menu, remove double progress (#554) * PEP8 * PEP8
diff --git a/examples/create_upload_tutorial.py b/examples/create_upload_tutorial.py
@@ -24,6 +24,7 @@
 # * A list
 # * A pandas dataframe
 # * A sparse matrix
+# * A pandas sparse dataframe
 
 ############################################################################
 # Dataset is a numpy array
@@ -243,7 +244,7 @@
 
 sparse_data = coo_matrix((
     [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
-    ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1]),
+    ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])
 ))
 
 column_names = [
@@ -273,3 +274,38 @@
 
 upload_did = xor_dataset.publish()
 print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did))
+
+
+############################################################################
+# Dataset is a pandas sparse dataframe
+# ====================================
+
+sparse_data = coo_matrix((
+    [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
+    ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])
+))
+column_names = ['input1', 'input2', 'y']
+df = pd.SparseDataFrame(sparse_data, columns=column_names)
+df.info()  # info() prints the summary itself and returns None
+
+xor_dataset = create_dataset(
+    name="XOR",
+    description='Dataset representing the XOR operation',
+    creator=None,
+    contributor=None,
+    collection_date=None,
+    language='English',
+    licence=None,
+    default_target_attribute='y',
+    row_id_attribute=None,
+    ignore_attribute=None,
+    citation=None,
+    attributes='auto',
+    data=df,
+    version_label='example',
+)
+
+############################################################################
+
+upload_did = xor_dataset.publish()
+print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did))
diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py
@@ -502,8 +502,8 @@ def create_dataset(name, description, creator, contributor,
     if attributes == 'auto' or isinstance(attributes, dict):
         if not hasattr(data, "columns"):
             raise ValueError("Automatically inferring the attributes required "
-                             "a pandas DataFrame. A {!r} was given instead."
-                             .format(data))
+                             "a pandas DataFrame or SparseDataFrame. "
+                             "A {!r} was given instead.".format(data))
         # infer the type of data for each column of the DataFrame
         attributes_ = attributes_arff_from_df(data)
         if isinstance(attributes, dict):
@@ -525,7 +525,16 @@ def create_dataset(name, description, creator, contributor,
                 .format(row_id_attribute, [attr[0] for attr in attributes_])
             )
 
-    data = data.values if hasattr(data, "columns") else data
+    if hasattr(data, "columns"):
+        if isinstance(data, pd.SparseDataFrame):
+            data = data.to_coo()
+            # liac-arff needs sorted rows; also order columns within each row
+            row_idx_sorted = np.lexsort((data.col, data.row))
+            data.row = data.row[row_idx_sorted]
+            data.col = data.col[row_idx_sorted]
+            data.data = data.data[row_idx_sorted]
+        else:
+            data = data.values
 
     if format is not None:
         warn("The format parameter will be deprecated in the future,"
diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py
@@ -411,6 +411,7 @@ def test_data_status(self):
         self.assertEqual(result[did]['status'], 'active')
 
     def test_attributes_arff_from_df(self):
+        # DataFrame case
         df = pd.DataFrame(
             [[1, 1.0, 'xxx', 'A', True], [2, 2.0, 'yyy', 'B', False]],
             columns=['integer', 'floating', 'string', 'category', 'boolean']
@@ -422,6 +423,16 @@ def test_attributes_arff_from_df(self):
                                       ('string', 'STRING'),
                                       ('category', ['A', 'B']),
                                       ('boolean', ['True', 'False'])])
+        # SparseDataFrame case
+        df = pd.SparseDataFrame([[1, 1.0],
+                                 [2, 2.0],
+                                 [0, 0]],
+                                columns=['integer', 'floating'],
+                                default_fill_value=0)
+        df['integer'] = df['integer'].astype(np.int64)
+        attributes = attributes_arff_from_df(df)
+        self.assertEqual(attributes, [('integer', 'INTEGER'),
+                                      ('floating', 'REAL')])
 
     def test_attributes_arff_from_df_mixed_dtype_categories(self):
         # liac-arff imposed categorical attributes to be of sting dtype. We
@@ -769,6 +780,46 @@ def test_create_dataset_pandas(self):
             "Uploaded ARFF does not match original one"
         )
 
+        # Check that SparseDataFrame are supported properly
+        sparse_data = scipy.sparse.coo_matrix((
+            [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
+            ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])
+        ))
+        column_names = ['input1', 'input2', 'y']
+        df = pd.SparseDataFrame(sparse_data, columns=column_names)
+        # meta-information
+        description = 'Synthetic dataset created from a Pandas SparseDataFrame'
+        dataset = openml.datasets.functions.create_dataset(
+            name=name,
+            description=description,
+            creator=creator,
+            contributor=None,
+            collection_date=collection_date,
+            language=language,
+            licence=licence,
+            default_target_attribute=default_target_attribute,
+            row_id_attribute=None,
+            ignore_attribute=None,
+            citation=citation,
+            attributes='auto',
+            data=df,
+            format=None,
+            version_label='test',
+            original_data_url=original_data_url,
+            paper_url=paper_url
+        )
+        upload_did = dataset.publish()
+        self.assertEqual(
+            _get_online_dataset_arff(upload_did),
+            dataset._dataset,
+            "Uploaded ARFF does not match original one"
+        )
+        self.assertEqual(
+            _get_online_dataset_format(upload_did),
+            'sparse_arff',
+            "Wrong format for dataset"
+        )
+
         # Check that we can overwrite the attributes
         data = [['a'], ['b'], ['c'], ['d'], ['e']]
         column_names = ['rnd_str']