From 0a4c2f5b33e6966c0dbc9fd847dbb212214413e3 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 29 Sep 2025 18:22:14 +0200 Subject: [PATCH] BUG: avoid validation error for ufunc with string[python] array (#62498) --- doc/source/whatsnew/v2.3.3.rst | 3 ++- pandas/core/arrays/numpy_.py | 8 +++++++ pandas/tests/arrays/string_/test_string.py | 27 ++++++++++++++++++++++ 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.3.3.rst b/doc/source/whatsnew/v2.3.3.rst index bc5a4c5b27a90..db710d29b66a0 100644 --- a/doc/source/whatsnew/v2.3.3.rst +++ b/doc/source/whatsnew/v2.3.3.rst @@ -47,7 +47,8 @@ Bug fixes - Fix regression in ``~Series.str.contains``, ``~Series.str.match`` and ``~Series.str.fullmatch`` with a compiled regex and custom flags (:issue:`62240`) - Fix :meth:`Series.str.match` and :meth:`Series.str.fullmatch` not matching patterns with groups correctly for the Arrow-backed string dtype (:issue:`61072`) - +- Fix comparing a :class:`StringDtype` Series with mixed objects raising an error (:issue:`60228`) +- Fix error being raised when using a numpy ufunc with a Python-backed string array (:issue:`40800`) Improvements and fixes for Copy-on-Write ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index e0031d3db6ca7..07fd207933fc1 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -209,6 +209,14 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): # e.g. test_np_max_nested_tuples return result else: + if self.dtype.type is str: # type: ignore[comparison-overlap] + # StringDtype + try: + return type(self)(result) + except ValueError: + # if validation of input fails (no strings) + # -> fallback to returning raw numpy array + return result # one return value; re-box array-like results return type(self)(result) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 8c15f0c98f90e..b468480cf5f86 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -864,3 +864,30 @@ def test_tolist(dtype): result = arr.tolist() expected = vals tm.assert_equal(result, expected) + + +@pytest.mark.parametrize("box", [pd.Series, pd.array]) +def test_numpy_array_ufunc(dtype, box): + arr = box(["a", "bb", "ccc"], dtype=dtype) + + # custom ufunc that works with string (object) input -> returning numeric + str_len_ufunc = np.frompyfunc(lambda x: len(x), 1, 1) + result = str_len_ufunc(arr) + expected_cls = pd.Series if box is pd.Series else np.array + # TODO we should infer int64 dtype here? + expected = expected_cls([1, 2, 3], dtype=object) + tm.assert_equal(result, expected) + + # custom ufunc returning strings + str_multiply_ufunc = np.frompyfunc(lambda x: x * 2, 1, 1) + result = str_multiply_ufunc(arr) + expected = box(["aa", "bbbb", "cccccc"], dtype=dtype) + if dtype.storage == "pyarrow": + # TODO ArrowStringArray should also preserve the class / dtype + if box is pd.array: + expected = np.array(["aa", "bbbb", "cccccc"], dtype=object) + else: + # not specifying the dtype because the exact dtype is not yet preserved + expected = pd.Series(["aa", "bbbb", "cccccc"]) + + tm.assert_equal(result, expected)