Skip to content

Commit b64f0df

Browse files
[backport 2.3.x] BUG: avoid validation error for ufunc with string[python] array (#62498) (#62505)
1 parent 058eb2b commit b64f0df

File tree

3 files changed

+36
-0
lines changed

3 files changed

+36
-0
lines changed

doc/source/whatsnew/v2.3.3.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ Bug fixes
4848
with a compiled regex and custom flags (:issue:`62240`)
4949
- Fix :meth:`Series.str.match` and :meth:`Series.str.fullmatch` not matching patterns with groups correctly for the Arrow-backed string dtype (:issue:`61072`)
5050
- Fix comparing a :class:`StringDtype` Series with mixed objects raising an error (:issue:`60228`)
51+
- Fix error being raised when using a NumPy ufunc with a Python-backed string array (:issue:`40800`)
5152

5253
Improvements and fixes for Copy-on-Write
5354
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

pandas/core/arrays/numpy_.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,14 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
209209
# e.g. test_np_max_nested_tuples
210210
return result
211211
else:
212+
if self.dtype.type is str: # type: ignore[comparison-overlap]
213+
# StringDtype
214+
try:
215+
return type(self)(result)
216+
except ValueError:
217+
# if validation of the result fails (e.g. the ufunc returned non-string values)
218+
# -> fallback to returning raw numpy array
219+
return result
212220
# one return value; re-box array-like results
213221
return type(self)(result)
214222

pandas/tests/arrays/string_/test_string.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -864,3 +864,30 @@ def test_tolist(dtype):
864864
result = arr.tolist()
865865
expected = vals
866866
tm.assert_equal(result, expected)
867+
868+
869+
@pytest.mark.parametrize("box", [pd.Series, pd.array])
870+
def test_numpy_array_ufunc(dtype, box):
871+
arr = box(["a", "bb", "ccc"], dtype=dtype)
872+
873+
# custom ufunc that accepts string (object) input and returns numeric values
874+
str_len_ufunc = np.frompyfunc(lambda x: len(x), 1, 1)
875+
result = str_len_ufunc(arr)
876+
expected_cls = pd.Series if box is pd.Series else np.array
877+
# TODO: should we infer int64 dtype here?
878+
expected = expected_cls([1, 2, 3], dtype=object)
879+
tm.assert_equal(result, expected)
880+
881+
# custom ufunc returning strings
882+
str_multiply_ufunc = np.frompyfunc(lambda x: x * 2, 1, 1)
883+
result = str_multiply_ufunc(arr)
884+
expected = box(["aa", "bbbb", "cccccc"], dtype=dtype)
885+
if dtype.storage == "pyarrow":
886+
# TODO ArrowStringArray should also preserve the class / dtype
887+
if box is pd.array:
888+
expected = np.array(["aa", "bbbb", "cccccc"], dtype=object)
889+
else:
890+
# not specifying the dtype because the exact dtype is not yet preserved
891+
expected = pd.Series(["aa", "bbbb", "cccccc"])
892+
893+
tm.assert_equal(result, expected)

0 commit comments

Comments
 (0)