Skip to content

Commit a55d441

Browse files
BUG: avoid validation error for ufunc with string[python] array
1 parent e97a56e commit a55d441

File tree

3 files changed

+37
-1
lines changed

3 files changed

+37
-1
lines changed

doc/source/whatsnew/v2.3.3.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ Bug fixes
4747
- Fix regression in ``~Series.str.contains``, ``~Series.str.match`` and ``~Series.str.fullmatch``
4848
with a compiled regex and custom flags (:issue:`62240`)
4949
- Fix :meth:`Series.str.match` and :meth:`Series.str.fullmatch` not matching patterns with groups correctly for the Arrow-backed string dtype (:issue:`61072`)
50-
50+
- Fix error being raised when using a NumPy ufunc with a Python-backed string array (:issue:`40800`)
5151

5252
Improvements and fixes for Copy-on-Write
5353
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

pandas/core/arrays/numpy_.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,15 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
234234
# e.g. test_np_max_nested_tuples
235235
return result
236236
else:
237+
if self.dtype.type is str:
238+
# StringDtype
239+
try:
240+
# specify dtype to preserve storage/na_value
241+
return type(self)(result, dtype=self.dtype)
242+
except ValueError:
243+
# if validation of the result fails (not all elements are strings)
244+
# -> fallback to returning raw numpy array
245+
return result
237246
# one return value; re-box array-like results
238247
return type(self)(result)
239248

pandas/tests/arrays/string_/test_string.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -840,3 +840,30 @@ def test_string_array_view_type_error():
840840
arr = pd.array(["a", "b", "c"], dtype="string")
841841
with pytest.raises(TypeError, match="Cannot change data-type for string array."):
842842
arr.view("i8")
843+
844+
845+
@pytest.mark.parametrize("box", [pd.Series, pd.array])
846+
def test_numpy_array_ufunc(dtype, box):
847+
arr = box(["a", "bb", "ccc"], dtype=dtype)
848+
849+
# custom ufunc that works with string (object) input -> returning numeric
850+
str_len_ufunc = np.frompyfunc(lambda x: len(x), 1, 1)
851+
result = str_len_ufunc(arr)
852+
expected_cls = pd.Series if box is pd.Series else np.array
853+
# TODO: should we infer int64 dtype here?
854+
expected = expected_cls([1, 2, 3], dtype=object)
855+
tm.assert_equal(result, expected)
856+
857+
# custom ufunc returning strings
858+
str_multiply_ufunc = np.frompyfunc(lambda x: x * 2, 1, 1)
859+
result = str_multiply_ufunc(arr)
860+
expected = box(["aa", "bbbb", "cccccc"], dtype=dtype)
861+
if dtype.storage == "pyarrow":
862+
# TODO ArrowStringArray should also preserve the class / dtype
863+
if box is pd.array:
864+
expected = np.array(["aa", "bbbb", "cccccc"], dtype=object)
865+
else:
866+
# not specifying the dtype because the exact dtype is not yet preserved
867+
expected = pd.Series(["aa", "bbbb", "cccccc"])
868+
869+
tm.assert_equal(result, expected)

0 commit comments

Comments
 (0)