From 62a8c218839e0c1ce5410a41f712037b66c5df4c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 4 Sep 2025 17:01:00 +0200 Subject: [PATCH 1/5] REGR: fix string contains/match methods with compiled regex with flags --- pandas/core/arrays/_arrow_string_mixins.py | 10 +-- pandas/core/arrays/string_arrow.py | 52 +++++++++++++- pandas/core/strings/object_array.py | 6 +- pandas/tests/strings/test_find_replace.py | 83 +++++++++++++++++++--- 4 files changed, 128 insertions(+), 23 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 55eddb8045ca6..d80b097066c27 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -316,10 +316,7 @@ def _str_match( flags: int = 0, na: Scalar | lib.NoDefault = lib.no_default, ): - if isinstance(pat, re.Pattern): - # GH#61952 - pat = pat.pattern - if isinstance(pat, str) and not pat.startswith("^"): + if not pat.startswith("^"): pat = f"^{pat}" return self._str_contains(pat, case, flags, na, regex=True) @@ -330,10 +327,7 @@ def _str_fullmatch( flags: int = 0, na: Scalar | lib.NoDefault = lib.no_default, ): - if isinstance(pat, re.Pattern): - # GH#61952 - pat = pat.pattern - if isinstance(pat, str) and (not pat.endswith("$") or pat.endswith("\\$")): + if not pat.endswith("$") or pat.endswith("\\$"): pat = f"{pat}$" return self._str_match(pat, case, flags, na) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 6e29848171ace..c59b22a135329 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -55,6 +55,7 @@ ArrayLike, Dtype, NpDtype, + Scalar, npt, ) @@ -333,8 +334,6 @@ def astype(self, dtype, copy: bool = True): _str_startswith = ArrowStringArrayMixin._str_startswith _str_endswith = ArrowStringArrayMixin._str_endswith _str_pad = ArrowStringArrayMixin._str_pad - _str_match = ArrowStringArrayMixin._str_match - _str_fullmatch = ArrowStringArrayMixin._str_fullmatch _str_lower = ArrowStringArrayMixin._str_lower _str_upper = ArrowStringArrayMixin._str_upper _str_strip = ArrowStringArrayMixin._str_strip @@ -349,6 +348,19 @@ def astype(self, dtype, copy: bool = True): _str_len = ArrowStringArrayMixin._str_len _str_slice = ArrowStringArrayMixin._str_slice + @staticmethod + def _preprocess_re_pattern(pat: re.Pattern, case: bool): + flags = pat.flags + pat = pat.pattern + # flags is not supported by pyarrow, but `case` is -> extract and remove + if flags & re.IGNORECASE: + case = False + flags = flags & ~re.IGNORECASE + # when creating a pattern with re.compile and a string, it automatically + # gets a UNICODE flag, while pyarrow assumes unicode for strings anyway + flags = flags & ~re.UNICODE + return pat, case, flags + def _str_contains( self, pat, @@ -360,10 +372,44 @@ def _str_contains( if flags: return super()._str_contains(pat, case, flags, na, regex) if isinstance(pat, re.Pattern): - pat = pat.pattern + pat, case, flags = self._preprocess_re_pattern(pat, case) + if flags: + return super()._str_contains(pat, case, flags, na, regex) return ArrowStringArrayMixin._str_contains(self, pat, case, flags, na, regex) + def _str_match( + self, + pat: str | re.Pattern, + case: bool = True, + flags: int = 0, + na: Scalar | lib.NoDefault = lib.no_default, + ): + if flags: + return super()._str_match(pat, case, flags, na) + if isinstance(pat, re.Pattern): + pat, case, flags = self._preprocess_re_pattern(pat, case) + if flags: + return super()._str_match(pat, case, flags, na) + + return ArrowStringArrayMixin._str_match(self, pat, case, flags, na) + + def _str_fullmatch( + self, + pat: str | re.Pattern, + case: bool = True, + flags: int = 0, + na: Scalar | lib.NoDefault = lib.no_default, + ): + if flags: + return super()._str_fullmatch(pat, case, flags, na) + if isinstance(pat, re.Pattern): + pat, case, flags = self._preprocess_re_pattern(pat, case) + if flags: + return super()._str_fullmatch(pat, case, flags, na) + + return ArrowStringArrayMixin._str_fullmatch(self, pat, case, flags, na) + def _str_replace( self, pat: str | re.Pattern, diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 397fdcc5cac38..ba35542b7f112 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -262,8 +262,7 @@ def _str_match( ): if not case: flags |= re.IGNORECASE - if isinstance(pat, re.Pattern): - pat = pat.pattern + regex = re.compile(pat, flags=flags) f = lambda x: regex.match(x) is not None @@ -278,8 +277,7 @@ def _str_fullmatch( ): if not case: flags |= re.IGNORECASE - if isinstance(pat, re.Pattern): - pat = pat.pattern + regex = re.compile(pat, flags=flags) f = lambda x: regex.fullmatch(x) is not None diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index cce96f38d216a..8a235dc9a8105 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -283,16 +283,39 @@ def test_contains_nan(any_string_dtype): def test_contains_compiled_regex(any_string_dtype): # GH#61942 - ser = Series(["foo", "bar", "baz"], dtype=any_string_dtype) - pat = re.compile("ba.") - result = ser.str.contains(pat) - expected_dtype = ( np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" ) + + ser = Series(["foo", "bar", "Baz"], dtype=any_string_dtype) + + pat = re.compile("ba.") + result = ser.str.contains(pat) + expected = Series([False, True, False], dtype=expected_dtype) + tm.assert_series_equal(result, expected) + + # TODO this currently works for pyarrow-backed dtypes but raises for python + if any_string_dtype == "string" and any_string_dtype.storage == "pyarrow": + result = ser.str.contains(pat, case=False) + expected = Series([False, True, True], dtype=expected_dtype) + tm.assert_series_equal(result, expected) + else: + with pytest.raises( + ValueError, match="cannot process flags argument with a compiled pattern" + ): + ser.str.contains(pat, case=False) + + pat = re.compile("ba.", flags=re.IGNORECASE) + result = ser.str.contains(pat) expected = Series([False, True, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) + # TODO should this be supported? + with pytest.raises( + ValueError, match="cannot process flags argument with a compiled pattern" + ): + ser.str.contains(pat, flags=re.IGNORECASE) + # -------------------------------------------------------------------------------------- # str.startswith @@ -833,14 +856,36 @@ def test_match_case_kwarg(any_string_dtype): def test_match_compiled_regex(any_string_dtype): # GH#61952 - values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype) - result = values.str.match(re.compile(r"ab"), case=False) expected_dtype = ( np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" ) + + values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype) + + result = values.str.match(re.compile("ab")) + expected = Series([True, False, True, False], dtype=expected_dtype) + tm.assert_series_equal(result, expected) + + # TODO this currently works for pyarrow-backed dtypes but raises for python + if any_string_dtype == "string" and any_string_dtype.storage == "pyarrow": + result = values.str.match(re.compile("ab"), case=False) + expected = Series([True, True, True, True], dtype=expected_dtype) + tm.assert_series_equal(result, expected) + else: + with pytest.raises( + ValueError, match="cannot process flags argument with a compiled pattern" + ): + values.str.match(re.compile("ab"), case=False) + + result = values.str.match(re.compile("ab", flags=re.IGNORECASE)) expected = Series([True, True, True, True], dtype=expected_dtype) tm.assert_series_equal(result, expected) + with pytest.raises( + ValueError, match="cannot process flags argument with a compiled pattern" + ): + values.str.match(re.compile("ab"), flags=re.IGNORECASE) + # -------------------------------------------------------------------------------------- # str.fullmatch @@ -913,14 +958,36 @@ def test_fullmatch_case_kwarg(any_string_dtype): def test_fullmatch_compiled_regex(any_string_dtype): # GH#61952 - values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype) - result = values.str.fullmatch(re.compile(r"ab"), case=False) expected_dtype = ( np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" ) + + values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype) + + result = values.str.fullmatch(re.compile("ab")) + expected = Series([True, False, False, False], dtype=expected_dtype) + tm.assert_series_equal(result, expected) + + # TODO this currently works for pyarrow-backed dtypes but raises for python + if any_string_dtype == "string" and any_string_dtype.storage == "pyarrow": + result = values.str.fullmatch(re.compile("ab"), case=False) + expected = Series([True, True, False, False], dtype=expected_dtype) + tm.assert_series_equal(result, expected) + else: + with pytest.raises( + ValueError, match="cannot process flags argument with a compiled pattern" + ): + values.str.fullmatch(re.compile("ab"), case=False) + + result = values.str.fullmatch(re.compile("ab", flags=re.IGNORECASE)) expected = Series([True, True, False, False], dtype=expected_dtype) tm.assert_series_equal(result, expected) + with pytest.raises( + ValueError, match="cannot process flags argument with a compiled pattern" + ): + values.str.fullmatch(re.compile("ab"), flags=re.IGNORECASE) + # -------------------------------------------------------------------------------------- # str.findall From e1100e2bd4156f86c1d42a4ebababbc058919f12 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 5 Sep 2025 09:48:15 +0200 Subject: [PATCH 2/5] add additional test for custom flags being respected --- pandas/core/arrays/string_arrow.py | 22 ++++++++++++--------- pandas/tests/strings/test_find_replace.py | 24 +++++++++++++++++++++++ 2 files changed, 37 insertions(+), 9 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index c59b22a135329..ecf65d8f31149 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -348,6 +348,15 @@ def astype(self, dtype, copy: bool = True): _str_len = ArrowStringArrayMixin._str_len _str_slice = ArrowStringArrayMixin._str_slice + @staticmethod + def _is_re_pattern_with_flags(pat: str | re.Pattern) -> bool: + # check if `pat` is a compiled regex pattern with flags that are not + # supported by pyarrow + return ( + isinstance(pat, re.Pattern) + and (pat.flags & ~(re.IGNORECASE | re.UNICODE)) != 0 + ) + @staticmethod def _preprocess_re_pattern(pat: re.Pattern, case: bool): flags = pat.flags @@ -369,12 +378,11 @@ def _str_contains( na=lib.no_default, regex: bool = True, ): - if flags: + if flags or self._is_re_pattern_with_flags(pat): return super()._str_contains(pat, case, flags, na, regex) if isinstance(pat, re.Pattern): + # TODO flags passed separately by user are ignored pat, case, flags = self._preprocess_re_pattern(pat, case) - if flags: - return super()._str_contains(pat, case, flags, na, regex) return ArrowStringArrayMixin._str_contains(self, pat, case, flags, na, regex) @@ -385,12 +393,10 @@ def _str_match( flags: int = 0, na: Scalar | lib.NoDefault = lib.no_default, ): - if flags: + if flags or self._is_re_pattern_with_flags(pat): return super()._str_match(pat, case, flags, na) if isinstance(pat, re.Pattern): pat, case, flags = self._preprocess_re_pattern(pat, case) - if flags: - return super()._str_match(pat, case, flags, na) return ArrowStringArrayMixin._str_match(self, pat, case, flags, na) @@ -401,12 +407,10 @@ def _str_fullmatch( flags: int = 0, na: Scalar | lib.NoDefault = lib.no_default, ): - if flags: + if flags or self._is_re_pattern_with_flags(pat): return super()._str_fullmatch(pat, case, flags, na) if isinstance(pat, re.Pattern): pat, case, flags = self._preprocess_re_pattern(pat, case) - if flags: - return super()._str_fullmatch(pat, case, flags, na) return ArrowStringArrayMixin._str_fullmatch(self, pat, case, flags, na) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 8a235dc9a8105..093aa1aac27e2 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -317,6 +317,30 @@ def test_contains_compiled_regex(any_string_dtype): ser.str.contains(pat, flags=re.IGNORECASE) +def test_contains_compiled_regex_flags(any_string_dtype): + # ensure other (than ignorecase) flags are respected + expected_dtype = ( + np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" + ) + + ser = Series(["foobar", "foo\nbar", "Baz"], dtype=any_string_dtype) + + pat = re.compile("^ba") + result = ser.str.contains(pat) + expected = Series([False, False, False], dtype=expected_dtype) + tm.assert_series_equal(result, expected) + + pat = re.compile("^ba", flags=re.MULTILINE) + result = ser.str.contains(pat) + expected = Series([False, True, False], dtype=expected_dtype) + tm.assert_series_equal(result, expected) + + pat = re.compile("^ba", flags=re.MULTILINE | re.IGNORECASE) + result = ser.str.contains(pat) + expected = Series([False, True, True], dtype=expected_dtype) + tm.assert_series_equal(result, expected) + + # -------------------------------------------------------------------------------------- # str.startswith # -------------------------------------------------------------------------------------- From 5cb60e0b5b1661834f7e84fc08b1f9d0bdc4b51c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 5 Sep 2025 10:03:38 +0200 Subject: [PATCH 3/5] update type annotations --- pandas/core/arrays/_arrow_string_mixins.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index d80b097066c27..ad118d6be6b18 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -311,7 +311,7 @@ def _str_contains( def _str_match( self, - pat: str | re.Pattern, + pat: str, case: bool = True, flags: int = 0, na: Scalar | lib.NoDefault = lib.no_default, @@ -322,7 +322,7 @@ def _str_match( def _str_fullmatch( self, - pat: str | re.Pattern, + pat: str, case: bool = True, flags: int = 0, na: Scalar | lib.NoDefault = lib.no_default, From e065d332c0735c169a26d142be42a0fe310253c4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 5 Sep 2025 10:40:55 +0200 Subject: [PATCH 4/5] add whatsnew --- doc/source/whatsnew/v2.3.3.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.3.3.rst b/doc/source/whatsnew/v2.3.3.rst index e31ae4a8a647b..cbde6f52d4472 100644 --- a/doc/source/whatsnew/v2.3.3.rst +++ b/doc/source/whatsnew/v2.3.3.rst @@ -22,7 +22,8 @@ become the default string dtype in pandas 3.0. See Bug fixes ^^^^^^^^^ -- +- Fix regression in ``~Series.str.contains``, ``~Series.str.match`` and ``~Series.str.fullmatch`` + with a compiled regex and custom flags (:issue:`62240`) .. --------------------------------------------------------------------------- .. _whatsnew_233.contributors: From 6a1445125d0f948c6208bad3b18562b4ef300768 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 5 Sep 2025 11:11:01 +0200 Subject: [PATCH 5/5] try fixing typing issues --- pandas/core/arrays/string_arrow.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index ecf65d8f31149..e396ce91a293a 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -358,9 +358,9 @@ def _is_re_pattern_with_flags(pat: str | re.Pattern) -> bool: ) @staticmethod - def _preprocess_re_pattern(pat: re.Pattern, case: bool): + def _preprocess_re_pattern(pat: re.Pattern, case: bool) -> tuple[str, bool, int]: + pattern = pat.pattern flags = pat.flags - pat = pat.pattern # flags is not supported by pyarrow, but `case` is -> extract and remove if flags & re.IGNORECASE: case = False @@ -368,7 +368,7 @@ def _preprocess_re_pattern(pat: re.Pattern, case: bool): # when creating a pattern with re.compile and a string, it automatically # gets a UNICODE flag, while pyarrow assumes unicode for strings anyway flags = flags & ~re.UNICODE - return pat, case, flags + return pattern, case, flags def _str_contains( self,