Merge branch 'main' into tst-string-xfails

jbrockmendel · jbrockmendel · commit b6d762b3b77b · 2025-06-30T14:17:53.000-07:00
diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst
@@ -31,39 +31,6 @@ Other enhancements
 - The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for :class:`StringDtype` columns (:issue:`60633`)
 - The :meth:`~Series.sum` reduction is now implemented for :class:`StringDtype` columns (:issue:`59853`)
 
-.. ---------------------------------------------------------------------------
-.. _whatsnew_230.notable_bug_fixes:
-
-Notable bug fixes
-~~~~~~~~~~~~~~~~~
-
-These are bug fixes that might have notable behavior changes.
-
-.. _whatsnew_230.notable_bug_fixes.string_comparisons:
-
-Comparisons between different string dtypes
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-In previous versions, comparing :class:`Series` of different string dtypes (e.g. ``pd.StringDtype("pyarrow", na_value=pd.NA)`` against ``pd.StringDtype("python", na_value=np.nan)``) would result in inconsistent resulting dtype or incorrectly raise. pandas will now use the hierarchy
-
-    object < (python, NaN) < (pyarrow, NaN) < (python, NA) < (pyarrow, NA)
-
-in determining the result dtype when there are different string dtypes compared. Some examples:
-
-- When ``pd.StringDtype("pyarrow", na_value=pd.NA)`` is compared against any other string dtype, the result will always be ``boolean[pyarrow]``.
-- When ``pd.StringDtype("python", na_value=pd.NA)`` is compared against ``pd.StringDtype("pyarrow", na_value=np.nan)``, the result will be ``boolean``, the NumPy-backed nullable extension array.
-- When ``pd.StringDtype("python", na_value=pd.NA)`` is compared against ``pd.StringDtype("python", na_value=np.nan)``, the result will be ``boolean``, the NumPy-backed nullable extension array.
-
-.. _whatsnew_230.api_changes:
-
-API changes
-~~~~~~~~~~~
-
-- When enabling the ``future.infer_string`` option, :class:`Index` set operations (like
-  union or intersection) will now ignore the dtype of an empty :class:`RangeIndex` or
-  empty :class:`Index` with ``object`` dtype when determining the dtype of the resulting
-  Index (:issue:`60797`)
-
 .. ---------------------------------------------------------------------------
 .. _whatsnew_230.deprecations:
 
@@ -85,8 +52,6 @@ Numeric
 
 Strings
 ^^^^^^^
-- Bug in :meth:`.DataFrameGroupBy.min`, :meth:`.DataFrameGroupBy.max`, :meth:`.Resampler.min`, :meth:`.Resampler.max` where all NA values of string dtype would return float instead of string dtype (:issue:`60810`)
-- Bug in :meth:`DataFrame.sum` with ``axis=1``, :meth:`.DataFrameGroupBy.sum` or :meth:`.SeriesGroupBy.sum` with ``skipna=True``, and :meth:`.Resampler.sum` with all NA values of :class:`StringDtype` resulted in ``0`` instead of the empty string ``""`` (:issue:`60229`)
 - Bug in :meth:`Series.__pos__` and :meth:`DataFrame.__pos__` where an ``Exception`` was not raised for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`60710`)
 - Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` that incorrectly returned integer results with ``method="average"`` and raised an error if it would truncate results (:issue:`59768`)
 - Bug in :meth:`Series.replace` with :class:`StringDtype` when replacing with a non-string value was not upcasting to ``object`` dtype (:issue:`60282`)
diff --git a/doc/source/whatsnew/v2.3.1.rst b/doc/source/whatsnew/v2.3.1.rst
@@ -9,11 +9,57 @@ including other versions of pandas.
 {{ header }}
 
 .. ---------------------------------------------------------------------------
-.. _whatsnew_231.enhancements:
+.. _whatsnew_231.string_fixes:
+
+Improvements and fixes for the StringDtype
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. _whatsnew_231.string_fixes.string_comparisons:
+
+Comparisons between different string dtypes
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In previous versions, comparing :class:`Series` of different string dtypes (e.g. ``pd.StringDtype("pyarrow", na_value=pd.NA)`` against ``pd.StringDtype("python", na_value=np.nan)``) would result in inconsistent resulting dtype or incorrectly raise. pandas will now use the hierarchy
+
+    object < (python, NaN) < (pyarrow, NaN) < (python, NA) < (pyarrow, NA)
+
+in determining the result dtype when there are different string dtypes compared. Some examples:
+
+- When ``pd.StringDtype("pyarrow", na_value=pd.NA)`` is compared against any other string dtype, the result will always be ``boolean[pyarrow]``.
+- When ``pd.StringDtype("python", na_value=pd.NA)`` is compared against ``pd.StringDtype("pyarrow", na_value=np.nan)``, the result will be ``boolean``, the NumPy-backed nullable extension array.
+- When ``pd.StringDtype("python", na_value=pd.NA)`` is compared against ``pd.StringDtype("python", na_value=np.nan)``, the result will be ``boolean``, the NumPy-backed nullable extension array.
+
+.. _whatsnew_231.string_fixes.ignore_empty:
+
+Index set operations ignore empty RangeIndex and object dtype Index
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+When enabling the ``future.infer_string`` option, :class:`Index` set operations (like
+union or intersection) will now ignore the dtype of an empty :class:`RangeIndex` or
+empty :class:`Index` with ``object`` dtype when determining the dtype of the resulting
+Index (:issue:`60797`).
+
+This ensures that combining such empty Index with strings will infer the string dtype
+correctly, rather than defaulting to ``object`` dtype. For example:
+
+.. code-block:: python
+
+    >>> pd.options.mode.infer_string = True
+    >>> df = pd.DataFrame()
+    >>> df.columns.dtype
+    dtype('int64')               # default RangeIndex for empty columns
+    >>> df["a"] = [1, 2, 3]
+    >>> df.columns.dtype
+    <StringDtype(na_value=nan)>  # new columns use string dtype instead of object dtype
+
+.. _whatsnew_231.string_fixes.bugs:
+
+Bug fixes
+^^^^^^^^^
+- Bug in :meth:`.DataFrameGroupBy.min`, :meth:`.DataFrameGroupBy.max`, :meth:`.Resampler.min`, :meth:`.Resampler.max` where all NA values of string dtype would return float instead of string dtype (:issue:`60810`)
+- Bug in :meth:`DataFrame.sum` with ``axis=1``, :meth:`.DataFrameGroupBy.sum` or :meth:`.SeriesGroupBy.sum` with ``skipna=True``, and :meth:`.Resampler.sum` with all NA values of :class:`StringDtype` resulted in ``0`` instead of the empty string ``""`` (:issue:`60229`)
+- Fixed bug in :meth:`DataFrame.explode` and :meth:`Series.explode` where methods would fail with ``dtype="str"`` (:issue:`61623`)
 
-Enhancements
-~~~~~~~~~~~~
--
 
 .. _whatsnew_231.regressions:
 
@@ -26,7 +72,7 @@ Fixed regressions
 
 Bug fixes
 ~~~~~~~~~
-- Fixed bug in :meth:`DataFrame.explode` and :meth:`Series.explode` where methods would fail with ``dtype="str"`` (:issue:`61623`)
+-
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_231.other:
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -28,6 +28,9 @@ Enhancement2
 
 Other enhancements
 ^^^^^^^^^^^^^^^^^^
+- :func:`pandas.merge` propagates the ``attrs`` attribute to the result if all
+  inputs have identical ``attrs``, as has so far already been the case for
+  :func:`pandas.concat`.
 - :class:`pandas.api.typing.FrozenList` is available for typing the outputs of :attr:`MultiIndex.names`, :attr:`MultiIndex.codes` and :attr:`MultiIndex.levels` (:issue:`58237`)
 - :class:`pandas.api.typing.SASReader` is available for typing the output of :func:`read_sas` (:issue:`55689`)
 - Added :meth:`.Styler.to_typst` to write Styler objects to file, buffer or string in Typst format (:issue:`57617`)
@@ -745,6 +748,7 @@ Indexing
 - Bug in :meth:`DataFrame.__getitem__` returning modified columns when called with ``slice`` in Python 3.12 (:issue:`57500`)
 - Bug in :meth:`DataFrame.__getitem__` when slicing a :class:`DataFrame` with many rows raised an ``OverflowError`` (:issue:`59531`)
 - Bug in :meth:`DataFrame.from_records` throwing a ``ValueError`` when passed an empty list in ``index`` (:issue:`58594`)
+- Bug in :meth:`DataFrame.loc` and :meth:`DataFrame.iloc` returning incorrect dtype when selecting from a :class:`DataFrame` with mixed data types. (:issue:`60600`)
 - Bug in :meth:`DataFrame.loc` with inconsistent behavior of loc-set with 2 given indexes to Series (:issue:`59933`)
 - Bug in :meth:`Index.get_indexer` and similar methods when ``NaN`` is located at or after position 128 (:issue:`58924`)
 - Bug in :meth:`MultiIndex.insert` when a new value inserted to a datetime-like level gets cast to ``NaT`` and fails indexing (:issue:`60388`)
@@ -777,6 +781,7 @@ I/O
 - Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`)
 - Bug in :meth:`DataFrame.to_excel` where the :class:`MultiIndex` index with a period level was not a date (:issue:`60099`)
 - Bug in :meth:`DataFrame.to_stata` when exporting a column containing both long strings (Stata strL) and :class:`pd.NA` values (:issue:`23633`)
+- Bug in :meth:`DataFrame.to_stata` when input encoded length and normal length are mismatched (:issue:`61583`)
 - Bug in :meth:`DataFrame.to_stata` when writing :class:`DataFrame` and ``byteorder=`big```. (:issue:`58969`)
 - Bug in :meth:`DataFrame.to_stata` when writing more than 32,000 value labels. (:issue:`60107`)
 - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`)
diff --git a/pandas/_libs/src/datetime/pd_datetime.c b/pandas/_libs/src/datetime/pd_datetime.c
@@ -192,6 +192,10 @@ static npy_datetime PyDateTimeToEpoch(PyObject *dt, NPY_DATETIMEUNIT base) {
   return npy_dt;
 }
 
+/* Initializes and exposes a customer datetime C-API from the pandas library
+ * by creating a PyCapsule that stores function pointers, which can be accessed
+ * later by other C code or Cython code that imports the capsule.
+ */
 static int pandas_datetime_exec(PyObject *Py_UNUSED(module)) {
   PyDateTime_IMPORT;
   PandasDateTime_CAPI *capi = PyMem_Malloc(sizeof(PandasDateTime_CAPI));
diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py
@@ -152,8 +152,8 @@ def import_optional_dependency(
     install_name = package_name if package_name is not None else name
 
     msg = (
-        f"Missing optional dependency '{install_name}'. {extra} "
-        f"Use pip or conda to install {install_name}."
+        f"`Import {install_name}` failed. {extra} "
+        f"Use pip or conda to install the {install_name} package."
     )
     try:
         module = importlib.import_module(name)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -330,8 +330,8 @@ def attrs(self) -> dict[Hashable, Any]:
         -----
         Many operations that create new datasets will copy ``attrs``. Copies
         are always deep so that changing ``attrs`` will only affect the
-        present dataset. ``pandas.concat`` copies ``attrs`` only if all input
-        datasets have the same ``attrs``.
+        present dataset. :func:`pandas.concat` and :func:`pandas.merge` will
+        only copy ``attrs`` if all input datasets have the same ``attrs``.
 
         Examples
         --------
@@ -6090,11 +6090,11 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self:
                 assert isinstance(name, str)
                 object.__setattr__(self, name, getattr(other, name, None))
 
-        if method == "concat":
-            objs = other.objs
-            # propagate attrs only if all concat arguments have the same attrs
+        elif hasattr(other, "input_objs"):
+            objs = other.input_objs
+            # propagate attrs only if all inputs have the same attrs
             if all(bool(obj.attrs) for obj in objs):
-                # all concatenate arguments have non-empty attrs
+                # all inputs have non-empty attrs
                 attrs = objs[0].attrs
                 have_same_attrs = all(obj.attrs == attrs for obj in objs[1:])
                 if have_same_attrs:
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
@@ -1066,8 +1066,10 @@ def _getitem_lowerdim(self, tup: tuple):
 
         tup = self._validate_key_length(tup)
 
-        for i, key in enumerate(tup):
-            if is_label_like(key):
+        # Reverse tuple so that we are indexing along columns before rows
+        # and avoid unintended dtype inference. # GH60600
+        for i, key in zip(range(len(tup) - 1, -1, -1), reversed(tup)):
+            if is_label_like(key) or is_list_like(key):
                 # We don't need to check for tuples here because those are
                 #  caught by the _is_nested_tuple_indexer check above.
                 section = self._getitem_axis(key, axis=i)
diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py
@@ -550,7 +550,7 @@ def _get_result(
             result = sample._constructor_from_mgr(mgr, axes=mgr.axes)
             result._name = name
             return result.__finalize__(
-                types.SimpleNamespace(objs=objs), method="concat"
+                types.SimpleNamespace(input_objs=objs), method="concat"
             )
 
         # combine as columns in a frame
@@ -571,7 +571,9 @@ def _get_result(
             )
             df = cons(data, index=index, copy=False)
             df.columns = columns
-            return df.__finalize__(types.SimpleNamespace(objs=objs), method="concat")
+            return df.__finalize__(
+                types.SimpleNamespace(input_objs=objs), method="concat"
+            )
 
     # combine block managers
     else:
@@ -610,7 +612,7 @@ def _get_result(
         )
 
         out = sample._constructor_from_mgr(new_data, axes=new_data.axes)
-        return out.__finalize__(types.SimpleNamespace(objs=objs), method="concat")
+        return out.__finalize__(types.SimpleNamespace(input_objs=objs), method="concat")
 
 
 def new_axes(
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
@@ -10,6 +10,7 @@
 )
 import datetime
 from functools import partial
+import types
 from typing import (
     TYPE_CHECKING,
     Literal,
@@ -1134,7 +1135,10 @@ def get_result(self) -> DataFrame:
         join_index, left_indexer, right_indexer = self._get_join_info()
 
         result = self._reindex_and_concat(join_index, left_indexer, right_indexer)
-        result = result.__finalize__(self, method=self._merge_type)
+        result = result.__finalize__(
+            types.SimpleNamespace(input_objs=[self.left, self.right]),
+            method=self._merge_type,
+        )
 
         if self.indicator:
             result = self._indicator_post_merge(result)
@@ -1143,7 +1147,9 @@ def get_result(self) -> DataFrame:
 
         self._maybe_restore_index_levels(result)
 
-        return result.__finalize__(self, method="merge")
+        return result.__finalize__(
+            types.SimpleNamespace(input_objs=[self.left, self.right]), method="merge"
+        )
 
     @final
     @cache_readonly
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -2739,7 +2739,7 @@ def _encode_strings(self) -> None:
                 encoded = self.data[col].str.encode(self._encoding)
                 # If larger than _max_string_length do nothing
                 if (
-                    max_len_string_array(ensure_object(encoded._values))
+                    max_len_string_array(ensure_object(self.data[col]._values))
                     <= self._max_string_length
                 ):
                     self.data[col] = encoded
@@ -3263,11 +3263,15 @@ def generate_blob(self, gso_table: dict[str, tuple[int, int]]) -> bytes:
             bio.write(gso_type)
 
             # llll
-            utf8_string = bytes(strl, "utf-8")
-            bio.write(struct.pack(len_type, len(utf8_string) + 1))
+            if isinstance(strl, str):
+                strl_convert = bytes(strl, "utf-8")
+            else:
+                strl_convert = strl
+
+            bio.write(struct.pack(len_type, len(strl_convert) + 1))
 
             # xxx...xxx
-            bio.write(utf8_string)
+            bio.write(strl_convert)
             bio.write(null)
 
         return bio.getvalue()
diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py
@@ -315,7 +315,7 @@ def test_attrs(self):
         result = df.rename(columns=str)
         assert result.attrs == {"version": 1}
 
-    def test_attrs_deepcopy(self):
+    def test_attrs_is_deepcopy(self):
         df = DataFrame({"A": [2, 3]})
         assert df.attrs == {}
         df.attrs["tags"] = {"spam", "ham"}
@@ -324,6 +324,30 @@ def test_attrs_deepcopy(self):
         assert result.attrs == df.attrs
         assert result.attrs["tags"] is not df.attrs["tags"]
 
+    def test_attrs_concat(self):
+        # concat propagates attrs if all input attrs are equal
+        df1 = DataFrame({"A": [2, 3]})
+        df1.attrs = {"a": 1, "b": 2}
+        df2 = DataFrame({"A": [4, 5]})
+        df2.attrs = df1.attrs.copy()
+        df3 = DataFrame({"A": [6, 7]})
+        df3.attrs = df1.attrs.copy()
+        assert pd.concat([df1, df2, df3]).attrs == df1.attrs
+        # concat does not propagate attrs if input attrs are different
+        df2.attrs = {"c": 3}
+        assert pd.concat([df1, df2, df3]).attrs == {}
+
+    def test_attrs_merge(self):
+        # merge propagates attrs if all input attrs are equal
+        df1 = DataFrame({"key": ["a", "b"], "val1": [1, 2]})
+        df1.attrs = {"a": 1, "b": 2}
+        df2 = DataFrame({"key": ["a", "b"], "val2": [3, 4]})
+        df2.attrs = df1.attrs.copy()
+        assert pd.merge(df1, df2).attrs == df1.attrs
+        # merge does not propagate attrs if input attrs are different
+        df2.attrs = {"c": 3}
+        assert pd.merge(df1, df2).attrs == {}
+
     @pytest.mark.parametrize("allows_duplicate_labels", [True, False, None])
     def test_set_flags(
         self,
diff --git a/pandas/tests/generic/test_duplicate_labels.py b/pandas/tests/generic/test_duplicate_labels.py
@@ -164,7 +164,6 @@ def test_concat(self, objs, kwargs):
                     allows_duplicate_labels=False
                 ),
                 False,
-                marks=not_implemented,
             ),
             # false true false
             pytest.param(
@@ -173,7 +172,6 @@ def test_concat(self, objs, kwargs):
                 ),
                 pd.DataFrame({"B": [0, 1]}, index=["a", "d"]),
                 False,
-                marks=not_implemented,
             ),
             # true true true
             (
@@ -296,7 +294,6 @@ def test_concat_raises(self):
         with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
             pd.concat(objs, axis=1)
 
-    @not_implemented
     def test_merge_raises(self):
         a = pd.DataFrame({"A": [0, 1, 2]}, index=["a", "b", "c"]).set_flags(
             allows_duplicate_labels=False
diff --git a/pandas/tests/generic/test_frame.py b/pandas/tests/generic/test_frame.py
@@ -80,12 +80,16 @@ def test_metadata_propagation_indiv(self, monkeypatch):
         def finalize(self, other, method=None, **kwargs):
             for name in self._metadata:
                 if method == "merge":
-                    left, right = other.left, other.right
+                    left, right = other.input_objs
                     value = getattr(left, name, "") + "|" + getattr(right, name, "")
                     object.__setattr__(self, name, value)
                 elif method == "concat":
                     value = "+".join(
-                        [getattr(o, name) for o in other.objs if getattr(o, name, None)]
+                        [
+                            getattr(o, name)
+                            for o in other.input_objs
+                            if getattr(o, name, None)
+                        ]
                     )
                     object.__setattr__(self, name, value)
                 else:
diff --git a/pandas/tests/generic/test_series.py b/pandas/tests/generic/test_series.py
@@ -97,7 +97,7 @@ def finalize(self, other, method=None, **kwargs):
                     value = "+".join(
                         [
                             getattr(obj, name)
-                            for obj in other.objs
+                            for obj in other.input_objs
                             if getattr(obj, name, None)
                         ]
                     )
diff --git a/pandas/tests/indexing/multiindex/test_loc.py b/pandas/tests/indexing/multiindex/test_loc.py
@@ -757,7 +757,7 @@ def test_missing_keys_raises_keyerror(self):
         df = DataFrame(np.arange(12).reshape(4, 3), columns=["A", "B", "C"])
         df2 = df.set_index(["A", "B"])
 
-        with pytest.raises(KeyError, match="1"):
+        with pytest.raises(KeyError, match="6"):
             df2.loc[(1, 6)]
 
     def test_missing_key_raises_keyerror2(self):
diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py
diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
diff --git a/pandas/tests/test_optional_dependency.py b/pandas/tests/test_optional_dependency.py
diff --git a/pandas/tests/util/test_numba.py b/pandas/tests/util/test_numba.py