Skip to content

Commit 5dadf0e

Browse files
committed
Merge remote-tracking branch 'upstream/main' into BUG-56994/pyarrow-assignment-unexpected-dtypes
2 parents fadf487 + 0320b3c commit 5dadf0e

File tree

13 files changed

+92
-62
lines changed

13 files changed

+92
-62
lines changed

.github/workflows/unit-tests.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -388,6 +388,8 @@ jobs:
388388
389389
- name: Run Tests
390390
uses: ./.github/actions/run-tests
391+
env:
392+
PYTHON_GIL: 0
391393

392394
emscripten:
393395
# Note: the Python version, Emscripten toolchain version are determined

.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ repos:
6767
- id: fix-encoding-pragma
6868
args: [--remove]
6969
- id: trailing-whitespace
70+
args: [--markdown-linebreak-ext=md]
7071
- repo: https://github.com/PyCQA/isort
7172
rev: 5.13.2
7273
hooks:

doc/source/whatsnew/v3.0.0.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -559,6 +559,7 @@ I/O
559559
- Bug in :meth:`DataFrame.to_stata` when writing :class:`DataFrame` and ``byteorder="big"``. (:issue:`58969`)
560560
- Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`)
561561
- Bug in :meth:`HDFStore.get` failing to save data of dtype datetime64[s] correctly (:issue:`59004`)
562+
- Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`)
562563
- Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
563564
- Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
564565
- Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
@@ -599,7 +600,7 @@ Reshaping
599600
Sparse
600601
^^^^^^
601602
- Bug in :class:`SparseDtype` for equal comparison with na fill value. (:issue:`54770`)
602-
-
603+
- Bug in :meth:`DataFrame.sparse.from_spmatrix` which hard coded an invalid ``fill_value`` for certain subtypes. (:issue:`59063`)
603604

604605
ExtensionArray
605606
^^^^^^^^^^^^^^

pandas/core/arrays/sparse/accessor.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -291,12 +291,12 @@ def from_spmatrix(cls, data, index=None, columns=None) -> DataFrame:
291291
Examples
292292
--------
293293
>>> import scipy.sparse
294-
>>> mat = scipy.sparse.eye(3, dtype=float)
294+
>>> mat = scipy.sparse.eye(3, dtype=int)
295295
>>> pd.DataFrame.sparse.from_spmatrix(mat)
296296
0 1 2
297-
0 1.0 0 0
298-
1 0 1.0 0
299-
2 0 0 1.0
297+
0 1 0 0
298+
1 0 1 0
299+
2 0 0 1
300300
"""
301301
from pandas._libs.sparse import IntIndex
302302

@@ -313,7 +313,7 @@ def from_spmatrix(cls, data, index=None, columns=None) -> DataFrame:
313313
indices = data.indices
314314
indptr = data.indptr
315315
array_data = data.data
316-
dtype = SparseDtype(array_data.dtype, 0)
316+
dtype = SparseDtype(array_data.dtype)
317317
arrays = []
318318
for i in range(n_columns):
319319
sl = slice(indptr[i], indptr[i + 1])
@@ -393,8 +393,6 @@ def to_coo(self) -> spmatrix:
393393
cols, rows, data = [], [], []
394394
for col, (_, ser) in enumerate(self._parent.items()):
395395
sp_arr = ser.array
396-
if sp_arr.fill_value != 0:
397-
raise ValueError("fill value must be 0 when converting to COO matrix")
398396

399397
row = sp_arr.sp_index.indices
400398
cols.append(np.repeat(col, len(row)))

pandas/core/computation/eval.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -193,8 +193,11 @@ def eval(
193193
corresponding bitwise operators. :class:`~pandas.Series` and
194194
:class:`~pandas.DataFrame` objects are supported and behave as they would
195195
with plain ol' Python evaluation.
196-
`eval` can run arbitrary code which can make you vulnerable to code
197-
injection if you pass user input to this function.
196+
197+
.. warning::
198+
199+
``eval`` can run arbitrary code which can make you vulnerable to code
200+
injection if you pass user input to this function.
198201
199202
Parameters
200203
----------

pandas/core/dtypes/dtypes.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1666,7 +1666,7 @@ class SparseDtype(ExtensionDtype):
16661666
"""
16671667
Dtype for data stored in :class:`SparseArray`.
16681668
1669-
`SparseDtype` is used as the data type for :class:`SparseArray`, enabling
1669+
``SparseDtype`` is used as the data type for :class:`SparseArray`, enabling
16701670
more efficient storage of data that contains a significant number of
16711671
repetitive values typically represented by a fill value. It supports any
16721672
scalar dtype as the underlying data type of the non-fill values.
@@ -1677,19 +1677,20 @@ class SparseDtype(ExtensionDtype):
16771677
The dtype of the underlying array storing the non-fill values.
16781678
fill_value : scalar, optional
16791679
The scalar value not stored in the SparseArray. By default, this
1680-
depends on `dtype`.
1680+
depends on ``dtype``.
16811681
16821682
=========== ==========
16831683
dtype na_value
16841684
=========== ==========
16851685
float ``np.nan``
1686+
complex ``np.nan``
16861687
int ``0``
16871688
bool ``False``
16881689
datetime64 ``pd.NaT``
16891690
timedelta64 ``pd.NaT``
16901691
=========== ==========
16911692
1692-
The default value may be overridden by specifying a `fill_value`.
1693+
The default value may be overridden by specifying a ``fill_value``.
16931694
16941695
Attributes
16951696
----------

pandas/core/dtypes/missing.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -618,6 +618,8 @@ def na_value_for_dtype(dtype: DtypeObj, compat: bool = True):
618618
nan
619619
>>> na_value_for_dtype(np.dtype("float64"))
620620
nan
621+
>>> na_value_for_dtype(np.dtype("complex128"))
622+
nan
621623
>>> na_value_for_dtype(np.dtype("bool"))
622624
False
623625
>>> na_value_for_dtype(np.dtype("datetime64[ns]"))
@@ -629,7 +631,7 @@ def na_value_for_dtype(dtype: DtypeObj, compat: bool = True):
629631
elif dtype.kind in "mM":
630632
unit = np.datetime_data(dtype)[0]
631633
return dtype.type("NaT", unit)
632-
elif dtype.kind == "f":
634+
elif dtype.kind in "fc":
633635
return np.nan
634636
elif dtype.kind in "iu":
635637
if compat:

pandas/core/frame.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4782,6 +4782,7 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame:
47824782
ValueError
47834783
* If both of ``include`` and ``exclude`` are empty
47844784
* If ``include`` and ``exclude`` have overlapping elements
4785+
TypeError
47854786
* If any kind of string dtype is passed in.
47864787
47874788
See Also

pandas/io/parsers/readers.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -674,6 +674,14 @@ def _read(
674674
# Extract some of the arguments (pass chunksize on).
675675
iterator = kwds.get("iterator", False)
676676
chunksize = kwds.get("chunksize", None)
677+
678+
# Check type of encoding_errors
679+
errors = kwds.get("encoding_errors", "strict")
680+
if not isinstance(errors, str):
681+
raise ValueError(
682+
f"encoding_errors must be a string, got {type(errors).__name__}"
683+
)
684+
677685
if kwds.get("engine") == "pyarrow":
678686
if iterator:
679687
raise ValueError(

pandas/tests/arrays/sparse/test_accessor.py

Lines changed: 39 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -105,28 +105,36 @@ def test_accessor_raises(self):
105105

106106
@pytest.mark.parametrize("format", ["csc", "csr", "coo"])
107107
@pytest.mark.parametrize("labels", [None, list(string.ascii_letters[:10])])
108-
@pytest.mark.parametrize("dtype", ["float64", "int64"])
108+
@pytest.mark.parametrize("dtype", [np.complex128, np.float64, np.int64, bool])
109109
def test_from_spmatrix(self, format, labels, dtype):
110110
sp_sparse = pytest.importorskip("scipy.sparse")
111111

112-
sp_dtype = SparseDtype(dtype, np.array(0, dtype=dtype).item())
112+
sp_dtype = SparseDtype(dtype)
113113

114-
mat = sp_sparse.eye(10, format=format, dtype=dtype)
115-
result = pd.DataFrame.sparse.from_spmatrix(mat, index=labels, columns=labels)
114+
sp_mat = sp_sparse.eye(10, format=format, dtype=dtype)
115+
result = pd.DataFrame.sparse.from_spmatrix(sp_mat, index=labels, columns=labels)
116+
mat = np.eye(10, dtype=dtype)
116117
expected = pd.DataFrame(
117-
np.eye(10, dtype=dtype), index=labels, columns=labels
118+
np.ma.array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value),
119+
index=labels,
120+
columns=labels,
118121
).astype(sp_dtype)
119122
tm.assert_frame_equal(result, expected)
120123

121124
@pytest.mark.parametrize("format", ["csc", "csr", "coo"])
122-
def test_from_spmatrix_including_explicit_zero(self, format):
125+
@pytest.mark.parametrize("dtype", [np.int64, bool])
126+
def test_from_spmatrix_including_explicit_zero(self, format, dtype):
123127
sp_sparse = pytest.importorskip("scipy.sparse")
124128

125-
mat = sp_sparse.random(10, 2, density=0.5, format=format)
126-
mat.data[0] = 0
127-
result = pd.DataFrame.sparse.from_spmatrix(mat)
128-
dtype = SparseDtype("float64", 0.0)
129-
expected = pd.DataFrame(mat.todense()).astype(dtype)
129+
sp_dtype = SparseDtype(dtype)
130+
131+
sp_mat = sp_sparse.random(10, 2, density=0.5, format=format, dtype=dtype)
132+
sp_mat.data[0] = 0
133+
result = pd.DataFrame.sparse.from_spmatrix(sp_mat)
134+
mat = sp_mat.toarray()
135+
expected = pd.DataFrame(
136+
np.ma.array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value)
137+
).astype(sp_dtype)
130138
tm.assert_frame_equal(result, expected)
131139

132140
@pytest.mark.parametrize(
@@ -136,41 +144,34 @@ def test_from_spmatrix_including_explicit_zero(self, format):
136144
def test_from_spmatrix_columns(self, columns):
137145
sp_sparse = pytest.importorskip("scipy.sparse")
138146

139-
dtype = SparseDtype("float64", 0.0)
147+
sp_dtype = SparseDtype(np.float64)
140148

141-
mat = sp_sparse.random(10, 2, density=0.5)
142-
result = pd.DataFrame.sparse.from_spmatrix(mat, columns=columns)
143-
expected = pd.DataFrame(mat.toarray(), columns=columns).astype(dtype)
149+
sp_mat = sp_sparse.random(10, 2, density=0.5)
150+
result = pd.DataFrame.sparse.from_spmatrix(sp_mat, columns=columns)
151+
mat = sp_mat.toarray()
152+
expected = pd.DataFrame(
153+
np.ma.array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value),
154+
columns=columns,
155+
).astype(sp_dtype)
144156
tm.assert_frame_equal(result, expected)
145157

146158
@pytest.mark.parametrize(
147-
"colnames", [("A", "B"), (1, 2), (1, pd.NA), (0.1, 0.2), ("x", "x"), (0, 0)]
159+
"columns", [("A", "B"), (1, 2), (1, pd.NA), (0.1, 0.2), ("x", "x"), (0, 0)]
148160
)
149-
def test_to_coo(self, colnames):
161+
@pytest.mark.parametrize("dtype", [np.complex128, np.float64, np.int64, bool])
162+
def test_to_coo(self, columns, dtype):
150163
sp_sparse = pytest.importorskip("scipy.sparse")
151164

152-
df = pd.DataFrame(
153-
{colnames[0]: [0, 1, 0], colnames[1]: [1, 0, 0]}, dtype="Sparse[int64, 0]"
154-
)
155-
result = df.sparse.to_coo()
156-
expected = sp_sparse.coo_matrix(np.asarray(df))
157-
assert (result != expected).nnz == 0
165+
sp_dtype = SparseDtype(dtype)
158166

159-
@pytest.mark.parametrize("fill_value", [1, np.nan])
160-
def test_to_coo_nonzero_fill_val_raises(self, fill_value):
161-
pytest.importorskip("scipy")
162-
df = pd.DataFrame(
163-
{
164-
"A": SparseArray(
165-
[fill_value, fill_value, fill_value, 2], fill_value=fill_value
166-
),
167-
"B": SparseArray(
168-
[fill_value, 2, fill_value, fill_value], fill_value=fill_value
169-
),
170-
}
171-
)
172-
with pytest.raises(ValueError, match="fill value must be 0"):
173-
df.sparse.to_coo()
167+
expected = sp_sparse.random(10, 2, density=0.5, format="coo", dtype=dtype)
168+
mat = expected.toarray()
169+
result = pd.DataFrame(
170+
np.ma.array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value),
171+
columns=columns,
172+
dtype=sp_dtype,
173+
).sparse.to_coo()
174+
assert (result != expected).nnz == 0
174175

175176
def test_to_coo_midx_categorical(self):
176177
# GH#50996

0 commit comments

Comments
 (0)