Merge branch 'pandas-dev:main' into pytz-link

star1327p · web-flow · commit 081254ca85c7 · 2025-07-10T18:30:24.000-07:00
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -758,6 +758,7 @@ Indexing
 - Bug in :meth:`DataFrame.from_records` throwing a ``ValueError`` when passed an empty list in ``index`` (:issue:`58594`)
 - Bug in :meth:`DataFrame.loc` and :meth:`DataFrame.iloc` returning incorrect dtype when selecting from a :class:`DataFrame` with mixed data types. (:issue:`60600`)
 - Bug in :meth:`DataFrame.loc` with inconsistent behavior of loc-set with 2 given indexes to Series (:issue:`59933`)
+- Bug in :meth:`Index.equals` when comparing :class:`Series` objects whose :class:`Index` has a string dtype (:issue:`61099`)
 - Bug in :meth:`Index.get_indexer` and similar methods when ``NaN`` is located at or after position 128 (:issue:`58924`)
 - Bug in :meth:`MultiIndex.insert` when a new value inserted to a datetime-like level gets cast to ``NaT`` and fails indexing (:issue:`60388`)
 - Bug in :meth:`Series.__setitem__` when assigning boolean series with boolean indexer will raise ``LossySetitemError`` (:issue:`57338`)
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -5481,11 +5481,7 @@ def equals(self, other: Any) -> bool:
             # quickly return if the lengths are different
             return False
 
-        if (
-            isinstance(self.dtype, StringDtype)
-            and self.dtype.na_value is np.nan
-            and other.dtype != self.dtype
-        ):
+        if isinstance(self.dtype, StringDtype) and other.dtype != self.dtype:
             # TODO(infer_string) can we avoid this special case?
             # special case for object behavior
             return other.equals(self.astype(object))
diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py
@@ -6,10 +6,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
-from pandas.compat import HAS_PYARROW
-
 from pandas.core.dtypes.common import (
     is_float_dtype,
     is_integer_dtype,
@@ -444,13 +440,12 @@ def test_constructor_str_unknown(self):
         with pytest.raises(ValueError, match="Unknown dtype"):
             Categorical([1, 2], dtype="foo")
 
-    @pytest.mark.xfail(
-        using_string_dtype() and HAS_PYARROW, reason="Can't be NumPy strings"
-    )
     def test_constructor_np_strs(self):
         # GH#31499 Hashtable.map_locations needs to work on np.str_ objects
-        cat = Categorical(["1", "0", "1"], [np.str_("0"), np.str_("1")])
-        assert all(isinstance(x, np.str_) for x in cat.categories)
+        #  We can't pass all-strings because the constructor would cast
+        #  those to StringDtype post-PDEP14
+        cat = Categorical(["1", "0", "1", 2], [np.str_("0"), np.str_("1"), 2])
+        assert all(isinstance(x, (np.str_, int)) for x in cat.categories)
 
     def test_constructor_from_categorical_with_dtype(self):
         dtype = CategoricalDtype(["a", "b", "c"], ordered=True)
diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py
@@ -1,7 +1,4 @@
 import numpy as np
-import pytest
-
-from pandas._config import using_string_dtype
 
 from pandas import (
     Categorical,
@@ -77,17 +74,19 @@ def test_print_none_width(self):
         with option_context("display.width", None):
             assert exp == repr(a)
 
-    @pytest.mark.skipif(
-        using_string_dtype(),
-        reason="Change once infer_string is set to True by default",
-    )
-    def test_unicode_print(self):
+    def test_unicode_print(self, using_infer_string):
         c = Categorical(["aaaaa", "bb", "cccc"] * 20)
         expected = """\
 ['aaaaa', 'bb', 'cccc', 'aaaaa', 'bb', ..., 'bb', 'cccc', 'aaaaa', 'bb', 'cccc']
 Length: 60
 Categories (3, object): ['aaaaa', 'bb', 'cccc']"""
 
+        if using_infer_string:
+            expected = expected.replace(
+                "(3, object): ['aaaaa', 'bb', 'cccc']",
+                "(3, str): [aaaaa, bb, cccc]",
+            )
+
         assert repr(c) == expected
 
         c = Categorical(["ああああ", "いいいいい", "ううううううう"] * 20)
@@ -96,6 +95,12 @@ def test_unicode_print(self):
 Length: 60
 Categories (3, object): ['ああああ', 'いいいいい', 'ううううううう']"""  # noqa: E501
 
+        if using_infer_string:
+            expected = expected.replace(
+                "(3, object): ['ああああ', 'いいいいい', 'ううううううう']",
+                "(3, str): [ああああ, いいいいい, ううううううう]",
+            )
+
         assert repr(c) == expected
 
         # unicode option should not affect to Categorical, as it doesn't care
@@ -106,6 +111,12 @@ def test_unicode_print(self):
 Length: 60
 Categories (3, object): ['ああああ', 'いいいいい', 'ううううううう']"""  # noqa: E501
 
+        if using_infer_string:
+            expected = expected.replace(
+                "(3, object): ['ああああ', 'いいいいい', 'ううううううう']",
+                "(3, str): [ああああ, いいいいい, ううううううう]",
+            )
+
             assert repr(c) == expected
 
     def test_categorical_repr(self):
diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py
@@ -3,8 +3,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 import pandas.util._test_decorators as td
 
 import pandas as pd
@@ -745,10 +743,7 @@ def test_astype_tz_object_conversion(self, tz):
         result = result.astype({"tz": "datetime64[ns, Europe/London]"})
         tm.assert_frame_equal(result, expected)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) GH#60639")
-    def test_astype_dt64_to_string(
-        self, frame_or_series, tz_naive_fixture, using_infer_string
-    ):
+    def test_astype_dt64_to_string(self, frame_or_series, tz_naive_fixture):
         # GH#41409
         tz = tz_naive_fixture
 
@@ -766,10 +761,7 @@ def test_astype_dt64_to_string(
         item = result.iloc[0]
         if frame_or_series is DataFrame:
             item = item.iloc[0]
-        if using_infer_string:
-            assert item is np.nan
-        else:
-            assert item is pd.NA
+        assert item is pd.NA
 
         # For non-NA values, we should match what we get for non-EA str
         alt = obj.astype(str)
diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py
@@ -11,8 +11,6 @@
 import numpy as np
 import pytest
 
-from pandas.compat import HAS_PYARROW
-
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -2183,19 +2181,14 @@ def test_enum_column_equality():
     tm.assert_series_equal(result, expected)
 
 
-def test_mixed_col_index_dtype(using_infer_string):
+def test_mixed_col_index_dtype(string_dtype_no_object):
     # GH 47382
     df1 = DataFrame(columns=list("abc"), data=1.0, index=[0])
     df2 = DataFrame(columns=list("abc"), data=0.0, index=[0])
-    df1.columns = df2.columns.astype("string")
+    df1.columns = df2.columns.astype(string_dtype_no_object)
     result = df1 + df2
     expected = DataFrame(columns=list("abc"), data=1.0, index=[0])
-    if using_infer_string:
-        # df2.columns.dtype will be "str" instead of object,
-        #  so the aligned result will be "string", not object
-        if HAS_PYARROW:
-            dtype = "string[pyarrow]"
-        else:
-            dtype = "string"
-        expected.columns = expected.columns.astype(dtype)
+
+    expected.columns = expected.columns.astype(string_dtype_no_object)
+
     tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py
@@ -11,8 +11,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -76,10 +74,7 @@ def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper):
 
 
 class TestGroupBy:
-    # TODO(infer_string) resample sum introduces 0's
-    # https://github.com/pandas-dev/pandas/issues/60229
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
-    def test_groupby_with_timegrouper(self):
+    def test_groupby_with_timegrouper(self, using_infer_string):
         # GH 4161
         # TimeGrouper requires a sorted index
         # also verifies that the resultant index has the correct name
@@ -116,8 +111,11 @@ def test_groupby_with_timegrouper(self):
                 {"Buyer": 0, "Quantity": 0},
                 index=exp_dti,
             )
-            # Cast to object to avoid implicit cast when setting entry to "CarlCarlCarl"
+            # Cast to object/str to avoid implicit cast when setting
+            #  entry to "CarlCarlCarl"
             expected = expected.astype({"Buyer": object})
+            if using_infer_string:
+                expected = expected.astype({"Buyer": "str"})
             expected.iloc[0, 0] = "CarlCarlCarl"
             expected.iloc[6, 0] = "CarlCarl"
             expected.iloc[18, 0] = "Joe"
diff --git a/pandas/tests/indexes/base_class/test_formats.py b/pandas/tests/indexes/base_class/test_formats.py
@@ -1,7 +1,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
 import pandas._config.config as cf
 
 from pandas import Index
@@ -16,7 +15,6 @@ def test_repr_is_valid_construction_code(self):
         res = eval(repr(idx))
         tm.assert_index_equal(res, idx)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="repr different")
     @pytest.mark.parametrize(
         "index,expected",
         [
@@ -77,11 +75,13 @@ def test_repr_is_valid_construction_code(self):
             ),
         ],
     )
-    def test_string_index_repr(self, index, expected):
+    def test_string_index_repr(self, index, expected, using_infer_string):
         result = repr(index)
+        if using_infer_string:
+            expected = expected.replace("dtype='object'", "dtype='str'")
+
         assert result == expected
 
-    @pytest.mark.xfail(using_string_dtype(), reason="repr different")
     @pytest.mark.parametrize(
         "index,expected",
         [
@@ -121,11 +121,16 @@ def test_string_index_repr(self, index, expected):
             ),
         ],
     )
-    def test_string_index_repr_with_unicode_option(self, index, expected):
+    def test_string_index_repr_with_unicode_option(
+        self, index, expected, using_infer_string
+    ):
         # Enable Unicode option -----------------------------------------
         with cf.option_context("display.unicode.east_asian_width", True):
             result = repr(index)
-            assert result == expected
+
+        if using_infer_string:
+            expected = expected.replace("dtype='object'", "dtype='str'")
+        assert result == expected
 
     def test_repr_summary(self):
         with cf.option_context("display.max_seq_items", 10):
diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py
@@ -199,6 +199,7 @@ def test_unique(self, data, categories, expected_data, ordered):
         expected = CategoricalIndex(expected_data, dtype=dtype)
         tm.assert_index_equal(idx.unique(), expected)
 
+    # TODO(3.0): remove this test once using_string_dtype() is always True
     @pytest.mark.xfail(using_string_dtype(), reason="repr doesn't roundtrip")
     def test_repr_roundtrip(self):
         ci = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True)
diff --git a/pandas/tests/indexes/categorical/test_formats.py b/pandas/tests/indexes/categorical/test_formats.py
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py