Merge remote-tracking branch 'upstream/main' into split-arrow

yuanx749 · yuanx749 · commit ab5f3373ecb2 · 2024-06-28T12:45:45.000+08:00
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -67,6 +67,7 @@ repos:
     -   id: fix-encoding-pragma
         args: [--remove]
     -   id: trailing-whitespace
+        args: [--markdown-linebreak-ext=md]
 -   repo: https://github.com/PyCQA/isort
     rev: 5.13.2
     hooks:
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -230,6 +230,7 @@ Other API changes
 - 3rd party ``py.path`` objects are no longer explicitly supported in IO methods. Use :py:class:`pathlib.Path` objects instead (:issue:`57091`)
 - :func:`read_table`'s ``parse_dates`` argument defaults to ``None`` to improve consistency with :func:`read_csv` (:issue:`57476`)
 - Made ``dtype`` a required argument in :meth:`ExtensionArray._from_sequence_of_strings` (:issue:`56519`)
+- Passing a :class:`Series` input to :func:`json_normalize` will now retain the :class:`Series` :class:`Index`, previously output had a new :class:`RangeIndex` (:issue:`51452`)
 - Updated :meth:`DataFrame.to_excel` so that the output spreadsheet has no styling. Custom styling can still be done using :meth:`Styler.to_excel` (:issue:`54154`)
 - pickle and HDF (``.h5``) files created with Python 2 are no longer explicitly supported (:issue:`57387`)
 - pickled objects from pandas version less than ``1.0.0`` are no longer supported (:issue:`57155`)
@@ -559,6 +560,7 @@ I/O
 - Bug in :meth:`DataFrame.to_stata` when writing :class:`DataFrame` and ``byteorder=`big```. (:issue:`58969`)
 - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`)
 - Bug in :meth:`HDFStore.get` was failing to save data of dtype datetime64[s] correctly (:issue:`59004`)
+- Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`)
 - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
 - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
 - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
@@ -599,7 +601,7 @@ Reshaping
 Sparse
 ^^^^^^
 - Bug in :class:`SparseDtype` for equal comparison with na fill value. (:issue:`54770`)
--
+- Bug in :meth:`DataFrame.sparse.from_spmatrix` which hard coded an invalid ``fill_value`` for certain subtypes. (:issue:`59063`)
 
 ExtensionArray
 ^^^^^^^^^^^^^^
diff --git a/pandas/_libs/tslibs/dtypes.pyx b/pandas/_libs/tslibs/dtypes.pyx
@@ -453,10 +453,6 @@ class Resolution(Enum):
         """
         cdef:
             str abbrev
-        if freq in {"T", "t", "L", "l", "U", "u", "N", "n"}:
-            raise ValueError(
-                f"Frequency \'{freq}\' is no longer supported."
-            )
         try:
             if freq in c_DEPR_ABBREVS:
                 abbrev = c_DEPR_ABBREVS[freq]
diff --git a/pandas/_libs/tslibs/timedeltas.pyi b/pandas/_libs/tslibs/timedeltas.pyi
@@ -39,8 +39,6 @@ UnitChoices: TypeAlias = Literal[
     "minute",
     "min",
     "minutes",
-    "T",
-    "t",
     "s",
     "seconds",
     "sec",
@@ -50,21 +48,17 @@ UnitChoices: TypeAlias = Literal[
     "millisecond",
     "milli",
     "millis",
-    "L",
-    "l",
     "us",
     "microseconds",
     "microsecond",
     "µs",
     "micro",
     "micros",
-    "u",
     "ns",
     "nanoseconds",
     "nano",
     "nanos",
     "nanosecond",
-    "n",
 ]
 _S = TypeVar("_S", bound=timedelta)
 
diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx
@@ -1818,11 +1818,6 @@ class Timedelta(_Timedelta):
         * 'microseconds', 'microsecond', 'micros', 'micro', or 'us'
         * 'nanoseconds', 'nanosecond', 'nanos', 'nano', or 'ns'.
 
-        .. deprecated:: 2.2.0
-
-            Values `H`, `T`, `S`, `L`, `U`, and `N` are deprecated in favour
-            of the values `h`, `min`, `s`, `ms`, `us`, and `ns`.
-
         .. deprecated:: 3.0.0
 
             Allowing the values `w`, `d`, `MIN`, `MS`, `US` and `NS` to denote units
diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py
@@ -291,12 +291,12 @@ def from_spmatrix(cls, data, index=None, columns=None) -> DataFrame:
         Examples
         --------
         >>> import scipy.sparse
-        >>> mat = scipy.sparse.eye(3, dtype=float)
+        >>> mat = scipy.sparse.eye(3, dtype=int)
         >>> pd.DataFrame.sparse.from_spmatrix(mat)
              0    1    2
-        0  1.0    0    0
-        1    0  1.0    0
-        2    0    0  1.0
+        0    1    0    0
+        1    0    1    0
+        2    0    0    1
         """
         from pandas._libs.sparse import IntIndex
 
@@ -313,7 +313,7 @@ def from_spmatrix(cls, data, index=None, columns=None) -> DataFrame:
         indices = data.indices
         indptr = data.indptr
         array_data = data.data
-        dtype = SparseDtype(array_data.dtype, 0)
+        dtype = SparseDtype(array_data.dtype)
         arrays = []
         for i in range(n_columns):
             sl = slice(indptr[i], indptr[i + 1])
@@ -393,8 +393,6 @@ def to_coo(self) -> spmatrix:
         cols, rows, data = [], [], []
         for col, (_, ser) in enumerate(self._parent.items()):
             sp_arr = ser.array
-            if sp_arr.fill_value != 0:
-                raise ValueError("fill value must be 0 when converting to COO matrix")
 
             row = sp_arr.sp_index.indices
             cols.append(np.repeat(col, len(row)))
diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py
@@ -193,8 +193,11 @@ def eval(
     corresponding bitwise operators.  :class:`~pandas.Series` and
     :class:`~pandas.DataFrame` objects are supported and behave as they would
     with plain ol' Python evaluation.
-    `eval` can run arbitrary code which can make you vulnerable to code
-    injection if you pass user input to this function.
+
+    .. warning::
+
+        ``eval`` can run arbitrary code which can make you vulnerable to code
+         injection and untrusted data.
 
     Parameters
     ----------
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
@@ -1666,7 +1666,7 @@ class SparseDtype(ExtensionDtype):
     """
     Dtype for data stored in :class:`SparseArray`.
 
-    `SparseDtype` is used as the data type for :class:`SparseArray`, enabling
+    ``SparseDtype`` is used as the data type for :class:`SparseArray`, enabling
     more efficient storage of data that contains a significant number of
     repetitive values typically represented by a fill value. It supports any
     scalar dtype as the underlying data type of the non-fill values.
@@ -1677,19 +1677,20 @@ class SparseDtype(ExtensionDtype):
         The dtype of the underlying array storing the non-fill value values.
     fill_value : scalar, optional
         The scalar value not stored in the SparseArray. By default, this
-        depends on `dtype`.
+        depends on ``dtype``.
 
         =========== ==========
         dtype       na_value
         =========== ==========
         float       ``np.nan``
+        complex     ``np.nan``
         int         ``0``
         bool        ``False``
         datetime64  ``pd.NaT``
         timedelta64 ``pd.NaT``
         =========== ==========
 
-        The default value may be overridden by specifying a `fill_value`.
+        The default value may be overridden by specifying a ``fill_value``.
 
     Attributes
     ----------
diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py
@@ -618,6 +618,8 @@ def na_value_for_dtype(dtype: DtypeObj, compat: bool = True):
     nan
     >>> na_value_for_dtype(np.dtype("float64"))
     nan
+    >>> na_value_for_dtype(np.dtype("complex128"))
+    nan
     >>> na_value_for_dtype(np.dtype("bool"))
     False
     >>> na_value_for_dtype(np.dtype("datetime64[ns]"))
@@ -629,7 +631,7 @@ def na_value_for_dtype(dtype: DtypeObj, compat: bool = True):
     elif dtype.kind in "mM":
         unit = np.datetime_data(dtype)[0]
         return dtype.type("NaT", unit)
-    elif dtype.kind == "f":
+    elif dtype.kind in "fc":
         return np.nan
     elif dtype.kind in "iu":
         if compat:
diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
@@ -674,6 +674,14 @@ def _read(
     # Extract some of the arguments (pass chunksize on).
     iterator = kwds.get("iterator", False)
     chunksize = kwds.get("chunksize", None)
+
+    # Check type of encoding_errors
+    errors = kwds.get("encoding_errors", "strict")
+    if not isinstance(errors, str):
+        raise ValueError(
+            f"encoding_errors must be a string, got {type(errors).__name__}"
+        )
+
     if kwds.get("engine") == "pyarrow":
         if iterator:
             raise ValueError(
diff --git a/pandas/tests/arrays/sparse/test_accessor.py b/pandas/tests/arrays/sparse/test_accessor.py
@@ -105,28 +105,36 @@ def test_accessor_raises(self):
 
     @pytest.mark.parametrize("format", ["csc", "csr", "coo"])
     @pytest.mark.parametrize("labels", [None, list(string.ascii_letters[:10])])
-    @pytest.mark.parametrize("dtype", ["float64", "int64"])
+    @pytest.mark.parametrize("dtype", [np.complex128, np.float64, np.int64, bool])
     def test_from_spmatrix(self, format, labels, dtype):
         sp_sparse = pytest.importorskip("scipy.sparse")
 
-        sp_dtype = SparseDtype(dtype, np.array(0, dtype=dtype).item())
+        sp_dtype = SparseDtype(dtype)
 
-        mat = sp_sparse.eye(10, format=format, dtype=dtype)
-        result = pd.DataFrame.sparse.from_spmatrix(mat, index=labels, columns=labels)
+        sp_mat = sp_sparse.eye(10, format=format, dtype=dtype)
+        result = pd.DataFrame.sparse.from_spmatrix(sp_mat, index=labels, columns=labels)
+        mat = np.eye(10, dtype=dtype)
         expected = pd.DataFrame(
-            np.eye(10, dtype=dtype), index=labels, columns=labels
+            np.ma.array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value),
+            index=labels,
+            columns=labels,
         ).astype(sp_dtype)
         tm.assert_frame_equal(result, expected)
 
     @pytest.mark.parametrize("format", ["csc", "csr", "coo"])
-    def test_from_spmatrix_including_explicit_zero(self, format):
+    @pytest.mark.parametrize("dtype", [np.int64, bool])
+    def test_from_spmatrix_including_explicit_zero(self, format, dtype):
         sp_sparse = pytest.importorskip("scipy.sparse")
 
-        mat = sp_sparse.random(10, 2, density=0.5, format=format)
-        mat.data[0] = 0
-        result = pd.DataFrame.sparse.from_spmatrix(mat)
-        dtype = SparseDtype("float64", 0.0)
-        expected = pd.DataFrame(mat.todense()).astype(dtype)
+        sp_dtype = SparseDtype(dtype)
+
+        sp_mat = sp_sparse.random(10, 2, density=0.5, format=format, dtype=dtype)
+        sp_mat.data[0] = 0
+        result = pd.DataFrame.sparse.from_spmatrix(sp_mat)
+        mat = sp_mat.toarray()
+        expected = pd.DataFrame(
+            np.ma.array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value)
+        ).astype(sp_dtype)
         tm.assert_frame_equal(result, expected)
 
     @pytest.mark.parametrize(
@@ -136,41 +144,34 @@ def test_from_spmatrix_including_explicit_zero(self, format):
     def test_from_spmatrix_columns(self, columns):
         sp_sparse = pytest.importorskip("scipy.sparse")
 
-        dtype = SparseDtype("float64", 0.0)
+        sp_dtype = SparseDtype(np.float64)
 
-        mat = sp_sparse.random(10, 2, density=0.5)
-        result = pd.DataFrame.sparse.from_spmatrix(mat, columns=columns)
-        expected = pd.DataFrame(mat.toarray(), columns=columns).astype(dtype)
+        sp_mat = sp_sparse.random(10, 2, density=0.5)
+        result = pd.DataFrame.sparse.from_spmatrix(sp_mat, columns=columns)
+        mat = sp_mat.toarray()
+        expected = pd.DataFrame(
+            np.ma.array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value),
+            columns=columns,
+        ).astype(sp_dtype)
         tm.assert_frame_equal(result, expected)
 
     @pytest.mark.parametrize(
-        "colnames", [("A", "B"), (1, 2), (1, pd.NA), (0.1, 0.2), ("x", "x"), (0, 0)]
+        "columns", [("A", "B"), (1, 2), (1, pd.NA), (0.1, 0.2), ("x", "x"), (0, 0)]
     )
-    def test_to_coo(self, colnames):
+    @pytest.mark.parametrize("dtype", [np.complex128, np.float64, np.int64, bool])
+    def test_to_coo(self, columns, dtype):
         sp_sparse = pytest.importorskip("scipy.sparse")
 
-        df = pd.DataFrame(
-            {colnames[0]: [0, 1, 0], colnames[1]: [1, 0, 0]}, dtype="Sparse[int64, 0]"
-        )
-        result = df.sparse.to_coo()
-        expected = sp_sparse.coo_matrix(np.asarray(df))
-        assert (result != expected).nnz == 0
+        sp_dtype = SparseDtype(dtype)
 
-    @pytest.mark.parametrize("fill_value", [1, np.nan])
-    def test_to_coo_nonzero_fill_val_raises(self, fill_value):
-        pytest.importorskip("scipy")
-        df = pd.DataFrame(
-            {
-                "A": SparseArray(
-                    [fill_value, fill_value, fill_value, 2], fill_value=fill_value
-                ),
-                "B": SparseArray(
-                    [fill_value, 2, fill_value, fill_value], fill_value=fill_value
-                ),
-            }
-        )
-        with pytest.raises(ValueError, match="fill value must be 0"):
-            df.sparse.to_coo()
+        expected = sp_sparse.random(10, 2, density=0.5, format="coo", dtype=dtype)
+        mat = expected.toarray()
+        result = pd.DataFrame(
+            np.ma.array(mat, mask=(mat == 0)).filled(sp_dtype.fill_value),
+            columns=columns,
+            dtype=sp_dtype,
+        ).sparse.to_coo()
+        assert (result != expected).nnz == 0
 
     def test_to_coo_midx_categorical(self):
         # GH#50996
diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py
@@ -697,6 +697,9 @@ def test_array_equivalent_index_with_tuples():
         ("f2", np.nan),
         ("f4", np.nan),
         ("f8", np.nan),
+        # Complex
+        ("c8", np.nan),
+        ("c16", np.nan),
         # Object
         ("O", np.nan),
         # Interval
diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py
@@ -1281,7 +1281,7 @@ def test_loc_getitem_time_object(self, frame_or_series):
         tm.assert_equal(result, expected)
 
     @pytest.mark.parametrize("spmatrix_t", ["coo_matrix", "csc_matrix", "csr_matrix"])
-    @pytest.mark.parametrize("dtype", [np.int64, np.float64, complex])
+    @pytest.mark.parametrize("dtype", [np.complex128, np.float64, np.int64, bool])
     def test_loc_getitem_range_from_spmatrix(self, spmatrix_t, dtype):
         sp_sparse = pytest.importorskip("scipy.sparse")
 
@@ -1296,13 +1296,13 @@ def test_loc_getitem_range_from_spmatrix(self, spmatrix_t, dtype):
 
         # regression test for GH#34526
         itr_idx = range(2, rows)
-        result = df.loc[itr_idx].values
+        result = np.nan_to_num(df.loc[itr_idx].values)
         expected = spmatrix.toarray()[itr_idx]
         tm.assert_numpy_array_equal(result, expected)
 
         # regression test for GH#34540
         result = df.loc[itr_idx].dtypes.values
-        expected = np.full(cols, SparseDtype(dtype, fill_value=0))
+        expected = np.full(cols, SparseDtype(dtype))
         tm.assert_numpy_array_equal(result, expected)
 
     def test_loc_getitem_listlike_all_retains_sparse(self):
@@ -1314,18 +1314,16 @@ def test_loc_getitem_sparse_frame(self):
         # GH34687
         sp_sparse = pytest.importorskip("scipy.sparse")
 
-        df = DataFrame.sparse.from_spmatrix(sp_sparse.eye(5))
+        df = DataFrame.sparse.from_spmatrix(sp_sparse.eye(5, dtype=np.int64))
         result = df.loc[range(2)]
         expected = DataFrame(
-            [[1.0, 0.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0, 0.0]],
-            dtype=SparseDtype("float64", 0.0),
+            [[1, 0, 0, 0, 0], [0, 1, 0, 0, 0]],
+            dtype=SparseDtype(np.int64),
         )
         tm.assert_frame_equal(result, expected)
 
         result = df.loc[range(2)].loc[range(1)]
-        expected = DataFrame(
-            [[1.0, 0.0, 0.0, 0.0, 0.0]], dtype=SparseDtype("float64", 0.0)
-        )
+        expected = DataFrame([[1, 0, 0, 0, 0]], dtype=SparseDtype(np.int64))
         tm.assert_frame_equal(result, expected)
 
     def test_loc_getitem_sparse_series(self):
diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py
@@ -555,7 +555,7 @@ def test_explicit_encoding(io_class, mode, msg):
             expected.to_csv(buffer, mode=f"w{mode}")
 
 
-@pytest.mark.parametrize("encoding_errors", [None, "strict", "replace"])
+@pytest.mark.parametrize("encoding_errors", ["strict", "replace"])
 @pytest.mark.parametrize("format", ["csv", "json"])
 def test_encoding_errors(encoding_errors, format):
     # GH39450
@@ -590,6 +590,17 @@ def test_encoding_errors(encoding_errors, format):
             tm.assert_frame_equal(df, expected)
 
 
+@pytest.mark.parametrize("encoding_errors", [0, None])
+def test_encoding_errors_badtype(encoding_errors):
+    # GH 59075
+    content = StringIO("A,B\n1,2\n3,4\n")
+    reader = partial(pd.read_csv, encoding_errors=encoding_errors)
+    expected_error = "encoding_errors must be a string, got "
+    expected_error += f"{type(encoding_errors).__name__}"
+    with pytest.raises(ValueError, match=expected_error):
+        reader(content)
+
+
 def test_bad_encdoing_errors():
     # GH 39777
     with tm.ensure_clean() as path:
diff --git a/pandas/tests/tslibs/test_resolution.py b/pandas/tests/tslibs/test_resolution.py