Merge remote-tracking branch 'upstream/main' into ref/json/lessstate

mroeschke · mroeschke · commit f765bfe43d60 · 2024-07-01T09:40:17.000-07:00
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -230,6 +230,7 @@ Other API changes
 - 3rd party ``py.path`` objects are no longer explicitly supported in IO methods. Use :py:class:`pathlib.Path` objects instead (:issue:`57091`)
 - :func:`read_table`'s ``parse_dates`` argument defaults to ``None`` to improve consistency with :func:`read_csv` (:issue:`57476`)
 - Made ``dtype`` a required argument in :meth:`ExtensionArray._from_sequence_of_strings` (:issue:`56519`)
+- Passing a :class:`Series` input to :func:`json_normalize` will now retain the :class:`Series` :class:`Index`, previously output had a new :class:`RangeIndex` (:issue:`51452`)
 - Updated :meth:`DataFrame.to_excel` so that the output spreadsheet has no styling. Custom styling can still be done using :meth:`Styler.to_excel` (:issue:`54154`)
 - pickle and HDF (``.h5``) files created with Python 2 are no longer explicitly supported (:issue:`57387`)
 - pickled objects from pandas version less than ``1.0.0`` are no longer supported (:issue:`57155`)
@@ -558,6 +559,7 @@ I/O
 - Bug in :meth:`DataFrame.to_stata` when writing :class:`DataFrame` and ``byteorder=`big```. (:issue:`58969`)
 - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`)
 - Bug in :meth:`HDFStore.get` was failing to save data of dtype datetime64[s] correctly (:issue:`59004`)
+- Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`)
 - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
 - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
 - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
@@ -594,6 +596,7 @@ Reshaping
 ^^^^^^^^^
 - Bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`)
 - Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`)
+- Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtentionDtype` (:issue:`59123`)
 
 Sparse
 ^^^^^^
diff --git a/pandas/_libs/src/datetime/pd_datetime.c b/pandas/_libs/src/datetime/pd_datetime.c
@@ -245,7 +245,12 @@ static int pandas_datetime_exec(PyObject *Py_UNUSED(module)) {
 }
 
 static PyModuleDef_Slot pandas_datetime_slots[] = {
-    {Py_mod_exec, pandas_datetime_exec}, {0, NULL}};
+    {Py_mod_exec, pandas_datetime_exec},
+#if PY_VERSION_HEX >= 0x030D0000
+    {Py_mod_gil, Py_MOD_GIL_NOT_USED},
+#endif
+    {0, NULL},
+};
 
 static struct PyModuleDef pandas_datetimemodule = {
     PyModuleDef_HEAD_INIT,
diff --git a/pandas/_libs/src/parser/pd_parser.c b/pandas/_libs/src/parser/pd_parser.c
@@ -161,7 +161,12 @@ static int pandas_parser_exec(PyObject *Py_UNUSED(module)) {
 }
 
 static PyModuleDef_Slot pandas_parser_slots[] = {
-    {Py_mod_exec, pandas_parser_exec}, {0, NULL}};
+    {Py_mod_exec, pandas_parser_exec},
+#if PY_VERSION_HEX >= 0x030D0000
+    {Py_mod_gil, Py_MOD_GIL_NOT_USED},
+#endif
+    {0, NULL},
+};
 
 static struct PyModuleDef pandas_parsermodule = {
     PyModuleDef_HEAD_INIT,
diff --git a/pandas/_libs/tslibs/dtypes.pyx b/pandas/_libs/tslibs/dtypes.pyx
@@ -453,10 +453,6 @@ class Resolution(Enum):
         """
         cdef:
             str abbrev
-        if freq in {"T", "t", "L", "l", "U", "u", "N", "n"}:
-            raise ValueError(
-                f"Frequency \'{freq}\' is no longer supported."
-            )
         try:
             if freq in c_DEPR_ABBREVS:
                 abbrev = c_DEPR_ABBREVS[freq]
diff --git a/pandas/_libs/tslibs/timedeltas.pyi b/pandas/_libs/tslibs/timedeltas.pyi
@@ -39,8 +39,6 @@ UnitChoices: TypeAlias = Literal[
     "minute",
     "min",
     "minutes",
-    "T",
-    "t",
     "s",
     "seconds",
     "sec",
@@ -50,21 +48,17 @@ UnitChoices: TypeAlias = Literal[
     "millisecond",
     "milli",
     "millis",
-    "L",
-    "l",
     "us",
     "microseconds",
     "microsecond",
     "µs",
     "micro",
     "micros",
-    "u",
     "ns",
     "nanoseconds",
     "nano",
     "nanos",
     "nanosecond",
-    "n",
 ]
 _S = TypeVar("_S", bound=timedelta)
 
diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx
@@ -1818,11 +1818,6 @@ class Timedelta(_Timedelta):
         * 'microseconds', 'microsecond', 'micros', 'micro', or 'us'
         * 'nanoseconds', 'nanosecond', 'nanos', 'nano', or 'ns'.
 
-        .. deprecated:: 2.2.0
-
-            Values `H`, `T`, `S`, `L`, `U`, and `N` are deprecated in favour
-            of the values `h`, `min`, `s`, `ms`, `us`, and `ns`.
-
         .. deprecated:: 3.0.0
 
             Allowing the values `w`, `d`, `MIN`, `MS`, `US` and `NS` to denote units
diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py
@@ -193,8 +193,11 @@ def eval(
     corresponding bitwise operators.  :class:`~pandas.Series` and
     :class:`~pandas.DataFrame` objects are supported and behave as they would
     with plain ol' Python evaluation.
-    `eval` can run arbitrary code which can make you vulnerable to code
-    injection if you pass user input to this function.
+
+    .. warning::
+
+        ``eval`` can run arbitrary code which can make you vulnerable to code
+         injection and untrusted data.
 
     Parameters
     ----------
diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
@@ -288,21 +288,19 @@ def get_new_values(self, values, fill_value=None):
 
         dtype = values.dtype
 
-        # if our mask is all True, then we can use our existing dtype
-        if mask_all:
-            dtype = values.dtype
-            new_values = np.empty(result_shape, dtype=dtype)
-        else:
-            if isinstance(dtype, ExtensionDtype):
-                # GH#41875
-                # We are assuming that fill_value can be held by this dtype,
-                #  unlike the non-EA case that promotes.
-                cls = dtype.construct_array_type()
-                new_values = cls._empty(result_shape, dtype=dtype)
+        if isinstance(dtype, ExtensionDtype):
+            # GH#41875
+            # We are assuming that fill_value can be held by this dtype,
+            #  unlike the non-EA case that promotes.
+            cls = dtype.construct_array_type()
+            new_values = cls._empty(result_shape, dtype=dtype)
+            if not mask_all:
                 new_values[:] = fill_value
-            else:
+        else:
+            if not mask_all:
                 dtype, fill_value = maybe_promote(dtype, fill_value)
-                new_values = np.empty(result_shape, dtype=dtype)
+            new_values = np.empty(result_shape, dtype=dtype)
+            if not mask_all:
                 new_values.fill(fill_value)
 
         name = dtype.name
diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
@@ -674,6 +674,14 @@ def _read(
     # Extract some of the arguments (pass chunksize on).
     iterator = kwds.get("iterator", False)
     chunksize = kwds.get("chunksize", None)
+
+    # Check type of encoding_errors
+    errors = kwds.get("encoding_errors", "strict")
+    if not isinstance(errors, str):
+        raise ValueError(
+            f"encoding_errors must be a string, got {type(errors).__name__}"
+        )
+
     if kwds.get("engine") == "pyarrow":
         if iterator:
             raise ValueError(
diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py
@@ -555,7 +555,7 @@ def test_explicit_encoding(io_class, mode, msg):
             expected.to_csv(buffer, mode=f"w{mode}")
 
 
-@pytest.mark.parametrize("encoding_errors", [None, "strict", "replace"])
+@pytest.mark.parametrize("encoding_errors", ["strict", "replace"])
 @pytest.mark.parametrize("format", ["csv", "json"])
 def test_encoding_errors(encoding_errors, format):
     # GH39450
@@ -590,6 +590,17 @@ def test_encoding_errors(encoding_errors, format):
             tm.assert_frame_equal(df, expected)
 
 
+@pytest.mark.parametrize("encoding_errors", [0, None])
+def test_encoding_errors_badtype(encoding_errors):
+    # GH 59075
+    content = StringIO("A,B\n1,2\n3,4\n")
+    reader = partial(pd.read_csv, encoding_errors=encoding_errors)
+    expected_error = "encoding_errors must be a string, got "
+    expected_error += f"{type(encoding_errors).__name__}"
+    with pytest.raises(ValueError, match=expected_error):
+        reader(content)
+
+
 def test_bad_encdoing_errors():
     # GH 39777
     with tm.ensure_clean() as path:
diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
@@ -2769,3 +2769,17 @@ def test_unstack_copy(self, m):
         result = df.unstack(sort=False)
         result.iloc[0, 0] = -1
         tm.assert_frame_equal(df, df_orig)
+
+    def test_pivot_empty_with_datetime(self):
+        # GH#59126
+        df = DataFrame(
+            {
+                "timestamp": Series([], dtype=pd.DatetimeTZDtype(tz="UTC")),
+                "category": Series([], dtype=str),
+                "value": Series([], dtype=str),
+            }
+        )
+        df_pivoted = df.pivot_table(
+            index="category", columns="value", values="timestamp"
+        )
+        assert df_pivoted.empty
diff --git a/pandas/tests/tslibs/test_resolution.py b/pandas/tests/tslibs/test_resolution.py
@@ -56,10 +56,3 @@ def test_units_H_S_deprecated_from_attrname_to_abbrevs(freq):
 
     with tm.assert_produces_warning(FutureWarning, match=msg):
         Resolution.get_reso_from_freqstr(freq)
-
-
-@pytest.mark.parametrize("freq", ["T", "t", "L", "U", "N", "n"])
-def test_reso_abbrev_T_L_U_N_raises(freq):
-    msg = f"Frequency '{freq}' is no longer supported."
-    with pytest.raises(ValueError, match=msg):
-        Resolution.get_reso_from_freqstr(freq)
diff --git a/pandas/util/_print_versions.py b/pandas/util/_print_versions.py
@@ -45,7 +45,7 @@ def _get_sys_info() -> dict[str, JSONSerializable]:
     language_code, encoding = locale.getlocale()
     return {
         "commit": _get_commit_hash(),
-        "python": ".".join([str(i) for i in sys.version_info]),
+        "python": platform.python_version(),
         "python-bits": struct.calcsize("P") * 8,
         "OS": uname_result.system,
         "OS-release": uname_result.release,
@@ -70,33 +70,25 @@ def _get_dependency_info() -> dict[str, JSONSerializable]:
         "pytz",
         "dateutil",
         # install / build,
-        "setuptools",
         "pip",
         "Cython",
-        # test
-        "pytest",
-        "hypothesis",
         # docs
         "sphinx",
-        # Other, need a min version
-        "blosc",
-        "feather",
-        "xlsxwriter",
-        "lxml.etree",
-        "html5lib",
-        "pymysql",
-        "psycopg2",
-        "jinja2",
         # Other, not imported.
         "IPython",
-        "pandas_datareader",
     ]
+    # Optional dependencies
     deps.extend(list(VERSIONS))
 
     result: dict[str, JSONSerializable] = {}
     for modname in deps:
-        mod = import_optional_dependency(modname, errors="ignore")
-        result[modname] = get_version(mod) if mod else None
+        try:
+            mod = import_optional_dependency(modname, errors="ignore")
+        except Exception:
+            # Dependency conflicts may cause a non ImportError
+            result[modname] = "N/A"
+        else:
+            result[modname] = get_version(mod) if mod else None
     return result