Skip to content

Commit 249abc9

Browse files
authored
Removing support for deprecated pandas SparseDataFrame (#897)
* Removing support for pandas SparseDataFrame
* Fixing rebase loss
* Reiterating with Matthias' changes
* Rolling back setup
* Fixing PEP8
* Changing check to detect sparse dataframes
* Fixing edge case to handle server side arff issue
* Removing stray comment
* Failing test case fix
* Removing stray comment
1 parent 4b9b873 commit 249abc9

File tree

9 files changed

+28
-32
lines changed

9 files changed

+28
-32
lines changed

.travis.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@ env:
1515
- TEST_DIR=/tmp/test_dir/
1616
- MODULE=openml
1717
matrix:
18-
- DISTRIB="conda" PYTHON_VERSION="3.5" SKLEARN_VERSION="0.21.2"
1918
- DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.21.2" TEST_DIST="true"
2019
- DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.21.2" RUN_FLAKE8="true" SKIP_TESTS="true"
2120
- DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.21.2" COVERAGE="true" DOCPUSH="true"

doc/progress.rst

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,13 @@ Changelog
1010
~~~~~~
1111

1212
* FIX #873: Fixes an issue which resulted in incorrect URLs when printing OpenML objects after
13-
switching the server
13+
switching the server.
1414
* FIX #885: Logger no longer registered by default. Added utility functions to easily register
1515
logging to console and file.
1616
* MAINT #767: Source distribution installation is now unit-tested.
17+
* MAINT #836: OpenML supports only pandas version 1.0.0 or above.
1718
* MAINT #865: OpenML no longer bundles test files in the source distribution.
19+
* MAINT #897: Dropping support for Python 3.5.
1820
* ADD #894: Support caching of datasets using feather format as an option.
1921

2022
0.10.2

examples/30_extended/create_upload_tutorial.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -283,15 +283,15 @@
283283

284284

285285
############################################################################
286-
# Dataset is a pandas sparse dataframe
287-
# ====================================
286+
# Dataset is a pandas dataframe with sparse columns
287+
# =================================================
288288

289289
sparse_data = coo_matrix((
290-
[0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
290+
[1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0],
291291
([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])
292292
))
293293
column_names = ['input1', 'input2', 'y']
294-
df = pd.SparseDataFrame(sparse_data, columns=column_names)
294+
df = pd.DataFrame.sparse.from_spmatrix(sparse_data, columns=column_names)
295295
print(df.info())
296296

297297
xor_dataset = create_dataset(

examples/30_extended/datasets_tutorial.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@
6868
# Get the actual data.
6969
#
7070
# The dataset can be returned in 3 possible formats: as a NumPy array, a SciPy
71-
# sparse matrix, or as a Pandas DataFrame (or SparseDataFrame). The format is
71+
# sparse matrix, or as a Pandas DataFrame. The format is
7272
# controlled with the parameter ``dataset_format`` which can be either 'array'
7373
# (default) or 'dataframe'. Let's first build our dataset from a NumPy array
7474
# and manually create a dataframe.

openml/datasets/dataset.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -551,9 +551,7 @@ def _encode_if_category(column):
551551
)
552552
elif array_format == "dataframe":
553553
if scipy.sparse.issparse(data):
554-
return pd.SparseDataFrame(data, columns=attribute_names)
555-
else:
556-
return data
554+
return pd.DataFrame.sparse.from_spmatrix(data, columns=attribute_names)
557555
else:
558556
data_type = "sparse-data" if scipy.sparse.issparse(data) else "non-sparse data"
559557
logger.warning(
@@ -602,7 +600,7 @@ def get_data(
602600
dataset_format : string (default='dataframe')
603601
The format of returned dataset.
604602
If ``array``, the returned dataset will be a NumPy array or a SciPy sparse matrix.
605-
If ``dataframe``, the returned dataset will be a Pandas DataFrame or SparseDataFrame.
603+
If ``dataframe``, the returned dataset will be a Pandas DataFrame.
606604
607605
Returns
608606
-------

openml/datasets/functions.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -672,7 +672,7 @@ def create_dataset(name, description, creator, contributor,
672672
class:`openml.OpenMLDataset`
673673
Dataset description."""
674674

675-
if isinstance(data, (pd.DataFrame, pd.SparseDataFrame)):
675+
if isinstance(data, pd.DataFrame):
676676
# infer the row id from the index of the dataset
677677
if row_id_attribute is None:
678678
row_id_attribute = data.index.name
@@ -684,8 +684,7 @@ def create_dataset(name, description, creator, contributor,
684684
if attributes == 'auto' or isinstance(attributes, dict):
685685
if not hasattr(data, "columns"):
686686
raise ValueError("Automatically inferring attributes requires "
687-
"a pandas DataFrame or SparseDataFrame. "
688-
"A {!r} was given instead.".format(data))
687+
"a pandas DataFrame. A {!r} was given instead.".format(data))
689688
# infer the type of data for each column of the DataFrame
690689
attributes_ = attributes_arff_from_df(data)
691690
if isinstance(attributes, dict):
@@ -708,8 +707,8 @@ def create_dataset(name, description, creator, contributor,
708707
)
709708

710709
if hasattr(data, "columns"):
711-
if isinstance(data, pd.SparseDataFrame):
712-
data = data.to_coo()
710+
if all(isinstance(dtype, pd.SparseDtype) for dtype in data.dtypes):
711+
data = data.sparse.to_coo()
713712
# liac-arff only support COO matrices with sorted rows
714713
row_idx_sorted = np.argsort(data.row)
715714
data.row = data.row[row_idx_sorted]

setup.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,9 @@
99
with open("openml/__version__.py") as fh:
1010
version = fh.readlines()[-1].split()[-1].strip("\"'")
1111

12-
if sys.version_info < (3, 5):
12+
if sys.version_info < (3, 6):
1313
raise ValueError(
14-
'Unsupported Python version {}.{}.{} found. OpenML requires Python 3.5 or higher.'
14+
'Unsupported Python version {}.{}.{} found. OpenML requires Python 3.6 or higher.'
1515
.format(sys.version_info.major, sys.version_info.minor, sys.version_info.micro)
1616
)
1717

@@ -42,14 +42,14 @@
4242
exclude=["*.tests", "*.tests.*", "tests.*", "tests"],
4343
),
4444
package_data={'': ['*.txt', '*.md']},
45-
python_requires=">=3.5",
45+
python_requires=">=3.6",
4646
install_requires=[
4747
'liac-arff>=2.4.0',
4848
'xmltodict',
4949
'requests',
5050
'scikit-learn>=0.18',
5151
'python-dateutil', # Installed through pandas anyway.
52-
'pandas>=0.19.2, <1.0.0',
52+
'pandas>=1.0.0',
5353
'scipy>=0.13.3',
5454
'numpy>=1.6.2',
5555
],
@@ -92,6 +92,5 @@
9292
'Operating System :: Unix',
9393
'Operating System :: MacOS',
9494
'Programming Language :: Python :: 3',
95-
'Programming Language :: Python :: 3.5',
9695
'Programming Language :: Python :: 3.6',
9796
'Programming Language :: Python :: 3.7'])

tests/test_datasets/test_dataset.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -286,7 +286,9 @@ def test_get_sparse_dataset(self):
286286

287287
def test_get_sparse_dataframe(self):
288288
rval, *_ = self.sparse_dataset.get_data()
289-
self.assertTrue(isinstance(rval, pd.SparseDataFrame))
289+
self.assertIsInstance(rval, pd.DataFrame)
290+
np.testing.assert_array_equal(
291+
[pd.SparseDtype(np.float32, fill_value=0.0)] * len(rval.dtypes), rval.dtypes)
290292
self.assertEqual((600, 20001), rval.shape)
291293

292294
def test_get_sparse_dataset_with_rowid(self):

tests/test_datasets/test_dataset_functions.py

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -561,12 +561,9 @@ def test_attributes_arff_from_df(self):
561561
('string', 'STRING'),
562562
('category', ['A', 'B']),
563563
('boolean', ['True', 'False'])])
564-
# SparseDataFrame case
565-
df = pd.SparseDataFrame([[1, 1.0],
566-
[2, 2.0],
567-
[0, 0]],
568-
columns=['integer', 'floating'],
569-
default_fill_value=0)
564+
# DataFrame with Sparse columns case
565+
df = pd.DataFrame({"integer": pd.arrays.SparseArray([1, 2, 0], fill_value=0),
566+
"floating": pd.arrays.SparseArray([1.0, 2.0, 0], fill_value=0.0)})
570567
df['integer'] = df['integer'].astype(np.int64)
571568
attributes = attributes_arff_from_df(df)
572569
self.assertEqual(attributes, [('integer', 'INTEGER'),
@@ -925,15 +922,15 @@ def test_create_dataset_pandas(self):
925922
"Uploaded ARFF does not match original one"
926923
)
927924

928-
# Check that SparseDataFrame are supported properly
925+
# Check that DataFrames with sparse columns are supported properly
929926
sparse_data = scipy.sparse.coo_matrix((
930-
[0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
927+
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
931928
([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1])
932929
))
933930
column_names = ['input1', 'input2', 'y']
934-
df = pd.SparseDataFrame(sparse_data, columns=column_names)
931+
df = pd.DataFrame.sparse.from_spmatrix(sparse_data, columns=column_names)
935932
# meta-information
936-
description = 'Synthetic dataset created from a Pandas SparseDataFrame'
933+
description = 'Synthetic dataset created from a Pandas DataFrame with Sparse columns'
937934
dataset = openml.datasets.functions.create_dataset(
938935
name=name,
939936
description=description,

0 commit comments

Comments
 (0)