Merge remote-tracking branch 'upstream/main' into deps/optional/bump

mroeschke · mroeschke · commit 6ffc7a9a5a0b · 2025-09-05T11:13:15.000-07:00
diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst
@@ -24,6 +24,7 @@ Version 2.3
 .. toctree::
    :maxdepth: 2
 
+   v2.3.3
    v2.3.2
    v2.3.1
    v2.3.0
diff --git a/doc/source/whatsnew/v2.3.3.rst b/doc/source/whatsnew/v2.3.3.rst
@@ -0,0 +1,32 @@
+.. _whatsnew_233:
+
+What's new in 2.3.3 (September XX, 2025)
+----------------------------------------
+
+These are the changes in pandas 2.3.3. See :ref:`release` for a full changelog
+including other versions of pandas.
+
+{{ header }}
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_233.string_fixes:
+
+Improvements and fixes for the StringDtype
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Most changes in this release are related to :class:`StringDtype` which will
+become the default string dtype in pandas 3.0. See
+:ref:`whatsnew_230.upcoming_changes` for more details.
+
+.. _whatsnew_233.string_fixes.bugs:
+
+Bug fixes
+^^^^^^^^^
+- Fix regression in :meth:`~Series.str.contains`, :meth:`~Series.str.match` and :meth:`~Series.str.fullmatch`
+  with a compiled regex and custom flags (:issue:`62240`)
+
+.. ---------------------------------------------------------------------------
+.. _whatsnew_233.contributors:
+
+Contributors
+~~~~~~~~~~~~
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -965,6 +965,7 @@ Indexing
 - Bug in :meth:`Series.__setitem__` when assigning boolean series with boolean indexer will raise ``LossySetitemError`` (:issue:`57338`)
 - Bug in printing :attr:`Index.names` and :attr:`MultiIndex.levels` would not escape single quotes (:issue:`60190`)
 - Bug in reindexing of :class:`DataFrame` with :class:`PeriodDtype` columns in case of consolidated block (:issue:`60980`, :issue:`60273`)
+- Bug in :meth:`DataFrame.loc.__getitem__` and :meth:`DataFrame.iloc.__getitem__` with a :class:`CategoricalDtype` column with integer categories raising when trying to index a row containing a ``NaN`` entry (:issue:`58954`)
 - Bug in :meth:`Index.__getitem__` incorrectly raising with a 0-dim ``np.ndarray`` key (:issue:`55601`)
 
 Missing
diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py
@@ -300,29 +300,23 @@ def _str_contains(
 
     def _str_match(
         self,
-        pat: str | re.Pattern,
+        pat: str,
         case: bool = True,
         flags: int = 0,
         na: Scalar | lib.NoDefault = lib.no_default,
     ):
-        if isinstance(pat, re.Pattern):
-            # GH#61952
-            pat = pat.pattern
-        if isinstance(pat, str) and not pat.startswith("^"):
+        if not pat.startswith("^"):
             pat = f"^{pat}"
         return self._str_contains(pat, case, flags, na, regex=True)
 
     def _str_fullmatch(
         self,
-        pat: str | re.Pattern,
+        pat: str,
         case: bool = True,
         flags: int = 0,
         na: Scalar | lib.NoDefault = lib.no_default,
     ):
-        if isinstance(pat, re.Pattern):
-            # GH#61952
-            pat = pat.pattern
-        if isinstance(pat, str) and (not pat.endswith("$") or pat.endswith("\\$")):
+        if not pat.endswith("$") or pat.endswith("\\$"):
             pat = f"{pat}$"
         return self._str_match(pat, case, flags, na)
 
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
@@ -54,6 +54,7 @@
         ArrayLike,
         Dtype,
         NpDtype,
+        Scalar,
         npt,
     )
 
@@ -335,8 +336,6 @@ def astype(self, dtype, copy: bool = True):
     _str_startswith = ArrowStringArrayMixin._str_startswith
     _str_endswith = ArrowStringArrayMixin._str_endswith
     _str_pad = ArrowStringArrayMixin._str_pad
-    _str_match = ArrowStringArrayMixin._str_match
-    _str_fullmatch = ArrowStringArrayMixin._str_fullmatch
     _str_lower = ArrowStringArrayMixin._str_lower
     _str_upper = ArrowStringArrayMixin._str_upper
     _str_strip = ArrowStringArrayMixin._str_strip
@@ -351,6 +350,28 @@ def astype(self, dtype, copy: bool = True):
     _str_len = ArrowStringArrayMixin._str_len
     _str_slice = ArrowStringArrayMixin._str_slice
 
+    @staticmethod
+    def _is_re_pattern_with_flags(pat: str | re.Pattern) -> bool:
+        # check if `pat` is a compiled regex pattern with flags that are not
+        # supported by pyarrow
+        return (
+            isinstance(pat, re.Pattern)
+            and (pat.flags & ~(re.IGNORECASE | re.UNICODE)) != 0
+        )
+
+    @staticmethod
+    def _preprocess_re_pattern(pat: re.Pattern, case: bool) -> tuple[str, bool, int]:
+        pattern = pat.pattern
+        flags = pat.flags
+        # flags is not supported by pyarrow, but `case` is -> extract and remove
+        if flags & re.IGNORECASE:
+            case = False
+            flags = flags & ~re.IGNORECASE
+        # when creating a pattern with re.compile and a string, it automatically
+        # gets a UNICODE flag, while pyarrow assumes unicode for strings anyway
+        flags = flags & ~re.UNICODE
+        return pattern, case, flags
+
     def _str_contains(
         self,
         pat,
@@ -359,13 +380,42 @@ def _str_contains(
         na=lib.no_default,
         regex: bool = True,
     ):
-        if flags:
+        if flags or self._is_re_pattern_with_flags(pat):
             return super()._str_contains(pat, case, flags, na, regex)
         if isinstance(pat, re.Pattern):
-            pat = pat.pattern
+            # TODO flags passed separately by user are ignored
+            pat, case, flags = self._preprocess_re_pattern(pat, case)
 
         return ArrowStringArrayMixin._str_contains(self, pat, case, flags, na, regex)
 
+    def _str_match(
+        self,
+        pat: str | re.Pattern,
+        case: bool = True,
+        flags: int = 0,
+        na: Scalar | lib.NoDefault = lib.no_default,
+    ):
+        if flags or self._is_re_pattern_with_flags(pat):
+            return super()._str_match(pat, case, flags, na)
+        if isinstance(pat, re.Pattern):
+            pat, case, flags = self._preprocess_re_pattern(pat, case)
+
+        return ArrowStringArrayMixin._str_match(self, pat, case, flags, na)
+
+    def _str_fullmatch(
+        self,
+        pat: str | re.Pattern,
+        case: bool = True,
+        flags: int = 0,
+        na: Scalar | lib.NoDefault = lib.no_default,
+    ):
+        if flags or self._is_re_pattern_with_flags(pat):
+            return super()._str_fullmatch(pat, case, flags, na)
+        if isinstance(pat, re.Pattern):
+            pat, case, flags = self._preprocess_re_pattern(pat, case)
+
+        return ArrowStringArrayMixin._str_fullmatch(self, pat, case, flags, na)
+
     def _str_replace(
         self,
         pat: str | re.Pattern,
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
@@ -50,6 +50,7 @@
     is_list_like,
 )
 from pandas.core.dtypes.dtypes import (
+    CategoricalDtype,
     DatetimeTZDtype,
     ExtensionDtype,
     SparseDtype,
@@ -1138,7 +1139,24 @@ def fast_xs(self, loc: int) -> SingleBlockManager:
             # Such assignment may incorrectly coerce NaT to None
             # result[blk.mgr_locs] = blk._slice((slice(None), loc))
             for i, rl in enumerate(blk.mgr_locs):
-                result[rl] = blk.iget((i, loc))
+                item = blk.iget((i, loc))
+                if (
+                    result.dtype.kind in "iub"
+                    and lib.is_float(item)
+                    and isna(item)
+                    and isinstance(blk.dtype, CategoricalDtype)
+                ):
+                    # GH#58954 caused by interleaved_dtype being wrong for Categorical
+                    # TODO(GH#38240) this will be unnecessary
+                    # Note that doing this in a try/except would work for the
+                    #  integer case, but not for bool, which will cast the NaN
+                    #  entry to True.
+                    if result.dtype.kind == "b":
+                        new_dtype = object
+                    else:
+                        new_dtype = np.float64
+                    result = result.astype(new_dtype)
+                result[rl] = item
 
         if isinstance(dtype, ExtensionDtype):
             cls = dtype.construct_array_type()
diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py
@@ -262,8 +262,7 @@ def _str_match(
     ):
         if not case:
             flags |= re.IGNORECASE
-        if isinstance(pat, re.Pattern):
-            pat = pat.pattern
+
         regex = re.compile(pat, flags=flags)
 
         f = lambda x: regex.match(x) is not None
@@ -278,8 +277,7 @@ def _str_fullmatch(
     ):
         if not case:
             flags |= re.IGNORECASE
-        if isinstance(pat, re.Pattern):
-            pat = pat.pattern
+
         regex = re.compile(pat, flags=flags)
 
         f = lambda x: regex.fullmatch(x) is not None
diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py
@@ -571,3 +571,25 @@ def test_getitem_categorical_with_nan(self):
         df = DataFrame(ser)
         assert df.loc[np.nan, 0] == 2
         assert df.loc[np.nan][0] == 2
+
+    def test_getitem_row_categorical_with_nan(self):
+        # GH#58954
+        df = DataFrame({"a": [1, 2], "b": CategoricalIndex([1, None])})
+
+        res = df.iloc[1]
+        expected = Series([2, np.nan], index=df.columns, name=1)
+        tm.assert_series_equal(res, expected)
+
+        res = df.loc[1]
+        tm.assert_series_equal(res, expected)
+
+    def test_getitem_row_categorical_with_nan_bool(self):
+        # GH#58954
+        df = DataFrame({"a": [True, False], "b": CategoricalIndex([False, None])})
+
+        res = df.iloc[1]
+        expected = Series([False, np.nan], index=df.columns, dtype=object, name=1)
+        tm.assert_series_equal(res, expected)
+
+        res = df.loc[1]
+        tm.assert_series_equal(res, expected)
diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py
@@ -283,13 +283,60 @@ def test_contains_nan(any_string_dtype):
 
 def test_contains_compiled_regex(any_string_dtype):
     # GH#61942
-    ser = Series(["foo", "bar", "baz"], dtype=any_string_dtype)
+    expected_dtype = (
+        np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
+    )
+
+    ser = Series(["foo", "bar", "Baz"], dtype=any_string_dtype)
+
     pat = re.compile("ba.")
     result = ser.str.contains(pat)
+    expected = Series([False, True, False], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    # TODO this currently works for pyarrow-backed dtypes but raises for python
+    if any_string_dtype == "string" and any_string_dtype.storage == "pyarrow":
+        result = ser.str.contains(pat, case=False)
+        expected = Series([False, True, True], dtype=expected_dtype)
+        tm.assert_series_equal(result, expected)
+    else:
+        with pytest.raises(
+            ValueError, match="cannot process flags argument with a compiled pattern"
+        ):
+            ser.str.contains(pat, case=False)
+
+    pat = re.compile("ba.", flags=re.IGNORECASE)
+    result = ser.str.contains(pat)
+    expected = Series([False, True, True], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    # TODO should this be supported?
+    with pytest.raises(
+        ValueError, match="cannot process flags argument with a compiled pattern"
+    ):
+        ser.str.contains(pat, flags=re.IGNORECASE)
+
 
+def test_contains_compiled_regex_flags(any_string_dtype):
+    # ensure other (than ignorecase) flags are respected
     expected_dtype = (
         np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
     )
+
+    ser = Series(["foobar", "foo\nbar", "Baz"], dtype=any_string_dtype)
+
+    pat = re.compile("^ba")
+    result = ser.str.contains(pat)
+    expected = Series([False, False, False], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    pat = re.compile("^ba", flags=re.MULTILINE)
+    result = ser.str.contains(pat)
+    expected = Series([False, True, False], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    pat = re.compile("^ba", flags=re.MULTILINE | re.IGNORECASE)
+    result = ser.str.contains(pat)
     expected = Series([False, True, True], dtype=expected_dtype)
     tm.assert_series_equal(result, expected)
 
@@ -833,14 +880,36 @@ def test_match_case_kwarg(any_string_dtype):
 
 def test_match_compiled_regex(any_string_dtype):
     # GH#61952
-    values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype)
-    result = values.str.match(re.compile(r"ab"), case=False)
     expected_dtype = (
         np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
     )
+
+    values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype)
+
+    result = values.str.match(re.compile("ab"))
+    expected = Series([True, False, True, False], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    # TODO this currently works for pyarrow-backed dtypes but raises for python
+    if any_string_dtype == "string" and any_string_dtype.storage == "pyarrow":
+        result = values.str.match(re.compile("ab"), case=False)
+        expected = Series([True, True, True, True], dtype=expected_dtype)
+        tm.assert_series_equal(result, expected)
+    else:
+        with pytest.raises(
+            ValueError, match="cannot process flags argument with a compiled pattern"
+        ):
+            values.str.match(re.compile("ab"), case=False)
+
+    result = values.str.match(re.compile("ab", flags=re.IGNORECASE))
     expected = Series([True, True, True, True], dtype=expected_dtype)
     tm.assert_series_equal(result, expected)
 
+    with pytest.raises(
+        ValueError, match="cannot process flags argument with a compiled pattern"
+    ):
+        values.str.match(re.compile("ab"), flags=re.IGNORECASE)
+
 
 # --------------------------------------------------------------------------------------
 # str.fullmatch
@@ -913,14 +982,36 @@ def test_fullmatch_case_kwarg(any_string_dtype):
 
 def test_fullmatch_compiled_regex(any_string_dtype):
     # GH#61952
-    values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype)
-    result = values.str.fullmatch(re.compile(r"ab"), case=False)
     expected_dtype = (
         np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
     )
+
+    values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype)
+
+    result = values.str.fullmatch(re.compile("ab"))
+    expected = Series([True, False, False, False], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    # TODO this currently works for pyarrow-backed dtypes but raises for python
+    if any_string_dtype == "string" and any_string_dtype.storage == "pyarrow":
+        result = values.str.fullmatch(re.compile("ab"), case=False)
+        expected = Series([True, True, False, False], dtype=expected_dtype)
+        tm.assert_series_equal(result, expected)
+    else:
+        with pytest.raises(
+            ValueError, match="cannot process flags argument with a compiled pattern"
+        ):
+            values.str.fullmatch(re.compile("ab"), case=False)
+
+    result = values.str.fullmatch(re.compile("ab", flags=re.IGNORECASE))
     expected = Series([True, True, False, False], dtype=expected_dtype)
     tm.assert_series_equal(result, expected)
 
+    with pytest.raises(
+        ValueError, match="cannot process flags argument with a compiled pattern"
+    ):
+        values.str.fullmatch(re.compile("ab"), flags=re.IGNORECASE)
+
 
 # --------------------------------------------------------------------------------------
 # str.findall