Removing support for deprecated pandas SparseDataFrame (#897)

Neeratyoy · web-flow · commit 249abc901c34 · 2020-03-06T14:01:52.000+01:00
* Removing support for pandas SparseDataFrame

* Fixing rebase loss

* Reiterating with Matthias' changes

* Rolling back setup

* Fixing PEP8

* Changing check to detect sparse dataframes

* Fixing edge case to handle server side arff issue

* Removing stray comment

* Failing test case fix

* Removing stray comment
diff --git a/.travis.yml b/.travis.yml
@@ -15,7 +15,6 @@ env:
   - TEST_DIR=/tmp/test_dir/
   - MODULE=openml
   matrix:
-  - DISTRIB="conda" PYTHON_VERSION="3.5" SKLEARN_VERSION="0.21.2"
   - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.21.2" TEST_DIST="true"
   - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.21.2" RUN_FLAKE8="true" SKIP_TESTS="true"
   - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.21.2" COVERAGE="true" DOCPUSH="true"
diff --git a/doc/progress.rst b/doc/progress.rst
@@ -10,11 +10,13 @@ Changelog
 ~~~~~~
 
 * FIX #873: Fixes an issue which resulted in incorrect URLs when printing OpenML objects after
-  switching the server
+  switching the server.
 * FIX #885: Logger no longer registered by default. Added utility functions to easily register
   logging to console and file.
 * MAINT #767: Source distribution installation is now unit-tested.
+* MAINT #836: OpenML supports only pandas version 1.0.0 or above.
 * MAINT #865: OpenML no longer bundles test files in the source distribution.
+* MAINT #897: Dropping support for Python 3.5.
 * ADD #894: Support caching of datasets using feather format as an option.
 
 0.10.2
diff --git a/examples/30_extended/create_upload_tutorial.py b/examples/30_extended/create_upload_tutorial.py
@@ -283,15 +283,15 @@
 
 
 ############################################################################
-# Dataset is a pandas sparse dataframe
-# ====================================
+# Dataset is a pandas dataframe with sparse columns
+# =================================================
 
 sparse_data = coo_matrix((
-    [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
+    [1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0],
     ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])
 ))
 column_names = ['input1', 'input2', 'y']
-df = pd.SparseDataFrame(sparse_data, columns=column_names)
+df = pd.DataFrame.sparse.from_spmatrix(sparse_data, columns=column_names)
 print(df.info())
 
 xor_dataset = create_dataset(
diff --git a/examples/30_extended/datasets_tutorial.py b/examples/30_extended/datasets_tutorial.py
@@ -68,7 +68,7 @@
 # Get the actual data.
 #
 # The dataset can be returned in 2 possible formats: as a NumPy array, a SciPy
-# sparse matrix, or as a Pandas DataFrame (or SparseDataFrame). The format is
+# sparse matrix, or as a Pandas DataFrame. The format is
 # controlled with the parameter ``dataset_format`` which can be either 'array'
 # (default) or 'dataframe'. Let's first build our dataset from a NumPy array
 # and manually create a dataframe.
diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py
@@ -551,9 +551,7 @@ def _encode_if_category(column):
                 )
         elif array_format == "dataframe":
             if scipy.sparse.issparse(data):
-                return pd.SparseDataFrame(data, columns=attribute_names)
-            else:
-                return data
+                return pd.DataFrame.sparse.from_spmatrix(data, columns=attribute_names)
         else:
             data_type = "sparse-data" if scipy.sparse.issparse(data) else "non-sparse data"
             logger.warning(
@@ -602,7 +600,7 @@ def get_data(
         dataset_format : string (default='dataframe')
             The format of returned dataset.
             If ``array``, the returned dataset will be a NumPy array or a SciPy sparse matrix.
-            If ``dataframe``, the returned dataset will be a Pandas DataFrame or SparseDataFrame.
+            If ``dataframe``, the returned dataset will be a Pandas DataFrame.
 
         Returns
         -------
diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py
@@ -672,7 +672,7 @@ def create_dataset(name, description, creator, contributor,
     class:`openml.OpenMLDataset`
         Dataset description."""
 
-    if isinstance(data, (pd.DataFrame, pd.SparseDataFrame)):
+    if isinstance(data, pd.DataFrame):
         # infer the row id from the index of the dataset
         if row_id_attribute is None:
             row_id_attribute = data.index.name
@@ -684,8 +684,7 @@ def create_dataset(name, description, creator, contributor,
     if attributes == 'auto' or isinstance(attributes, dict):
         if not hasattr(data, "columns"):
             raise ValueError("Automatically inferring attributes requires "
-                             "a pandas DataFrame or SparseDataFrame. "
-                             "A {!r} was given instead.".format(data))
+                             "a pandas DataFrame. A {!r} was given instead.".format(data))
         # infer the type of data for each column of the DataFrame
         attributes_ = attributes_arff_from_df(data)
         if isinstance(attributes, dict):
@@ -708,8 +707,8 @@ def create_dataset(name, description, creator, contributor,
             )
 
     if hasattr(data, "columns"):
-        if isinstance(data, pd.SparseDataFrame):
-            data = data.to_coo()
+        if all(isinstance(dtype, pd.SparseDtype) for dtype in data.dtypes):
+            data = data.sparse.to_coo()
             # liac-arff only support COO matrices with sorted rows
             row_idx_sorted = np.argsort(data.row)
             data.row = data.row[row_idx_sorted]
diff --git a/setup.py b/setup.py
@@ -9,9 +9,9 @@
 with open("openml/__version__.py") as fh:
     version = fh.readlines()[-1].split()[-1].strip("\"'")
 
-if sys.version_info < (3, 5):
+if sys.version_info < (3, 6):
     raise ValueError(
-        'Unsupported Python version {}.{}.{} found. OpenML requires Python 3.5 or higher.'
+        'Unsupported Python version {}.{}.{} found. OpenML requires Python 3.6 or higher.'
         .format(sys.version_info.major, sys.version_info.minor, sys.version_info.micro)
     )
 
@@ -42,14 +42,14 @@
                      exclude=["*.tests", "*.tests.*", "tests.*", "tests"],
                  ),
                  package_data={'': ['*.txt', '*.md']},
-                 python_requires=">=3.5",
+                 python_requires=">=3.6",
                  install_requires=[
                      'liac-arff>=2.4.0',
                      'xmltodict',
                      'requests',
                      'scikit-learn>=0.18',
                      'python-dateutil',  # Installed through pandas anyway.
-                     'pandas>=0.19.2, <1.0.0',
+                     'pandas>=1.0.0',
                      'scipy>=0.13.3',
                      'numpy>=1.6.2',
                  ],
@@ -92,6 +92,5 @@
                               'Operating System :: Unix',
                               'Operating System :: MacOS',
                               'Programming Language :: Python :: 3',
-                              'Programming Language :: Python :: 3.5',
                               'Programming Language :: Python :: 3.6',
                               'Programming Language :: Python :: 3.7'])
diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py
@@ -286,7 +286,9 @@ def test_get_sparse_dataset(self):
 
     def test_get_sparse_dataframe(self):
         rval, *_ = self.sparse_dataset.get_data()
-        self.assertTrue(isinstance(rval, pd.SparseDataFrame))
+        self.assertIsInstance(rval, pd.DataFrame)
+        np.testing.assert_array_equal(
+            [pd.SparseDtype(np.float32, fill_value=0.0)] * len(rval.dtypes), rval.dtypes)
         self.assertEqual((600, 20001), rval.shape)
 
     def test_get_sparse_dataset_with_rowid(self):
diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py
@@ -561,12 +561,9 @@ def test_attributes_arff_from_df(self):
                                       ('string', 'STRING'),
                                       ('category', ['A', 'B']),
                                       ('boolean', ['True', 'False'])])
-        # SparseDataFrame case
-        df = pd.SparseDataFrame([[1, 1.0],
-                                 [2, 2.0],
-                                 [0, 0]],
-                                columns=['integer', 'floating'],
-                                default_fill_value=0)
+        # DataFrame with Sparse columns case
+        df = pd.DataFrame({"integer": pd.arrays.SparseArray([1, 2, 0], fill_value=0),
+                           "floating": pd.arrays.SparseArray([1.0, 2.0, 0], fill_value=0.0)})
         df['integer'] = df['integer'].astype(np.int64)
         attributes = attributes_arff_from_df(df)
         self.assertEqual(attributes, [('integer', 'INTEGER'),
@@ -925,15 +922,15 @@ def test_create_dataset_pandas(self):
             "Uploaded ARFF does not match original one"
         )
 
-        # Check that SparseDataFrame are supported properly
+        # Check that DataFrame with Sparse columns are supported properly
         sparse_data = scipy.sparse.coo_matrix((
-            [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
+            [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
             ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])
         ))
         column_names = ['input1', 'input2', 'y']
-        df = pd.SparseDataFrame(sparse_data, columns=column_names)
+        df = pd.DataFrame.sparse.from_spmatrix(sparse_data, columns=column_names)
         # meta-information
-        description = 'Synthetic dataset created from a Pandas SparseDataFrame'
+        description = 'Synthetic dataset created from a Pandas DataFrame with Sparse columns'
         dataset = openml.datasets.functions.create_dataset(
             name=name,
             description=description,

Original file line number	Diff line number	Diff line change
`@@ -68,7 +68,7 @@`
`68`	`68`	`# Get the actual data.`
`69`	`69`	`#`
`70`	`70`	`# The dataset can be returned in 2 possible formats: as a NumPy array, a SciPy`
`71`		`-# sparse matrix, or as a Pandas DataFrame (or SparseDataFrame). The format is`
	`71`	`+# sparse matrix, or as a Pandas DataFrame. The format is`
`72`	`72`	```# controlled with the parameter ``dataset_format`` which can be either 'array'```
`73`	`73`	`# (default) or 'dataframe'. Let's first build our dataset from a NumPy array`
`74`	`74`	`# and manually create a dataframe.`