Skip to content

Commit 46ce29b

Browse files
String dtype: fix isin() values handling for python storage
1 parent 352289b commit 46ce29b

File tree

3 files changed

+45
-6
lines changed

3 files changed

+45
-6
lines changed

pandas/conftest.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1310,7 +1310,13 @@ def string_storage(request):
13101310
pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")),
13111311
pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")),
13121312
("python", np.nan),
1313-
]
1313+
],
1314+
ids=[
1315+
"string=string[python]",
1316+
"string=string[pyarrow]",
1317+
"string=str[pyarrow]",
1318+
"string=str[python]",
1319+
],
13141320
)
13151321
def string_dtype_arguments(request):
13161322
"""
@@ -1341,6 +1347,7 @@ def dtype_backend(request):
13411347

13421348
# Alias so we can test with cartesian product of string_storage
13431349
string_storage2 = string_storage
1350+
string_dtype_arguments2 = string_dtype_arguments
13441351

13451352

13461353
@pytest.fixture(params=tm.BYTES_DTYPES)

pandas/core/arrays/string_.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
nanops,
4747
ops,
4848
)
49+
from pandas.core.algorithms import isin
4950
from pandas.core.array_algos import masked_reductions
5051
from pandas.core.arrays.base import ExtensionArray
5152
from pandas.core.arrays.floating import (
@@ -65,6 +66,7 @@
6566
import pyarrow
6667

6768
from pandas._typing import (
69+
ArrayLike,
6870
AxisInt,
6971
Dtype,
7072
DtypeObj,
@@ -731,6 +733,22 @@ def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None:
731733
# base class implementation that uses __setitem__
732734
ExtensionArray._putmask(self, mask, value)
733735

736+
def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:
    """
    Check, element by element, whether each entry of this array is in ``values``.

    Parameters
    ----------
    values : ArrayLike
        Candidates to look for. A ``BaseStringArray`` is cast to this
        array's dtype. Anything else is first stripped of entries that are
        neither ``str`` nor missing (as judged by ``isna``), unless it
        already passes ``lib.is_string_array(..., skipna=True)``, and is
        then converted with ``_from_sequence`` using this array's dtype.

    Returns
    -------
    np.ndarray of bool
        Result of the module-level ``isin`` applied to the ndarray forms of
        this array and of the converted candidates. When the filtering step
        leaves no candidates, an all-False array of ``self.shape`` is
        returned instead.
    """
    if isinstance(values, BaseStringArray):
        # Already a string array: only align it with our dtype.
        candidates = values.astype(self.dtype, copy=False)
    else:
        if not lib.is_string_array(np.asarray(values), skipna=True):
            # Keep only strings and missing values.
            kept = [item for item in values if isinstance(item, str) or isna(item)]
            values = np.array(kept, dtype=object)
            if len(values) == 0:
                return np.zeros(self.shape, dtype=bool)

        candidates = self._from_sequence(values, dtype=self.dtype)

    return isin(np.asarray(self), np.asarray(candidates))
751+
734752
def astype(self, dtype, copy: bool = True):
735753
dtype = pandas_dtype(dtype)
736754

pandas/tests/arrays/string_/test_string.py

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,12 @@ def dtype(string_dtype_arguments):
3030
return pd.StringDtype(storage=storage, na_value=na_value)
3131

3232

33+
@pytest.fixture
def dtype2(string_dtype_arguments2):
    """Fixture giving a StringDtype built from 'string_dtype_arguments2'"""
    backend, missing = string_dtype_arguments2
    return pd.StringDtype(na_value=missing, storage=backend)
37+
38+
3339
@pytest.fixture
3440
def cls(dtype):
3541
"""Fixture giving array type from parametrized 'dtype'"""
@@ -665,11 +671,7 @@ def test_isin(dtype, fixed_now_ts):
665671
tm.assert_series_equal(result, expected)
666672

667673
result = s.isin(["a", pd.NA])
668-
if dtype.storage == "python" and dtype.na_value is np.nan:
669-
# TODO(infer_string) we should make this consistent
670-
expected = pd.Series([True, False, False])
671-
else:
672-
expected = pd.Series([True, False, True])
674+
expected = pd.Series([True, False, True])
673675
tm.assert_series_equal(result, expected)
674676

675677
result = s.isin([])
@@ -681,6 +683,18 @@ def test_isin(dtype, fixed_now_ts):
681683
tm.assert_series_equal(result, expected)
682684

683685

686+
def test_isin_string_array(dtype, dtype2):
    """
    Series.isin must accept a string array as ``values``.

    ``dtype`` is the dtype of the Series being searched and ``dtype2`` the
    dtype of the array handed to ``isin``; the expected masks are the same
    for every combination.
    """
    haystack = pd.Series(["a", "b", None], dtype=dtype)

    # (candidate values, expected membership mask)
    cases = [
        # no missing candidate: only "a" is found
        (["a", "c"], [True, False, False]),
        # a missing candidate also matches the missing entry
        (["a", None], [True, False, True]),
    ]

    for candidates, mask in cases:
        result = haystack.isin(pd.array(candidates, dtype=dtype2))
        expected = pd.Series(mask)
        tm.assert_series_equal(result, expected)
696+
697+
684698
def test_setitem_scalar_with_mask_validation(dtype):
685699
# https://github.com/pandas-dev/pandas/issues/47628
686700
# setting None with a boolean mask (through _putmask) should still result

0 commit comments

Comments
 (0)