Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions pandas/core/arrays/string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -397,19 +397,19 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None):

def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None):
if dtype is None:
dtype = np.int64
dtype = np.bool_
dummies_pa, labels = ArrowExtensionArray(self._pa_array)._str_get_dummies(
sep, dtype
)
if len(labels) == 0:
return np.empty(shape=(0, 0), dtype=dtype), labels
dummies = np.vstack(dummies_pa.to_numpy())
_dtype = pandas_dtype(dtype)
dummies_dtype: NpDtype
if isinstance(_dtype, np.dtype):
dummies_dtype = _dtype
else:
dummies_dtype = np.bool_
if len(labels) == 0:
return np.empty(shape=(0, 0), dtype=dummies_dtype), labels
dummies = np.vstack(dummies_pa.to_numpy())
Comment on lines -404 to +412
Copy link
Author

@komo-fr komo-fr Jan 2, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the existing implementation, the following code raises `TypeError: Cannot interpret 'BooleanDtype' as a data type`, because of the line `return np.empty(shape=(0, 0), dtype=dtype)`:

# Empty Series
sr = pd.Series(dtype="string[pyarrow]")
sr.str.get_dummies(dtype=pd.BooleanDtype())

With this PR, the default dtype is changed to a boolean type, which makes similar issues more likely to occur. To address this, I modified the code to pass `dummies_dtype` to `np.empty()` instead of passing `dtype` directly.

Related test: https://github.com/pandas-dev/pandas/blob/main/pandas/tests/strings/test_strings.py#L136

return dummies.astype(dummies_dtype, copy=False), labels

def _convert_int_result(self, result):
Expand Down
43 changes: 32 additions & 11 deletions pandas/core/strings/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2489,7 +2489,7 @@ def get_dummies(
----------
sep : str, default "|"
String to split on.
dtype : dtype, default np.int64
dtype : dtype, default bool
Data type for new columns. Only a single dtype is allowed.

Returns
Expand All @@ -2505,27 +2505,48 @@ def get_dummies(
Examples
--------
>>> pd.Series(["a|b", "a", "a|c"]).str.get_dummies()
a b c
0 1 1 0
1 1 0 0
2 1 0 1
a b c
0 True True False
1 True False False
2 True False True

>>> pd.Series(["a|b", np.nan, "a|c"]).str.get_dummies()
a b c
0 True True False
1 False False False
2 True False True

>>> pd.Series(["a|b", np.nan, "a|c"]).str.get_dummies(dtype=np.int64)
a b c
0 1 1 0
1 0 0 0
2 1 0 1

>>> pd.Series(["a|b", np.nan, "a|c"]).str.get_dummies(dtype=bool)
a b c
0 True True False
1 False False False
2 True False True
"""
from pandas.core.frame import DataFrame

# we need to cast to Series of strings as only that has all
# methods available for making the dummies...
input_dtype = self._data.dtype
if dtype is None and not isinstance(input_dtype, ArrowDtype):
from pandas.core.arrays.string_ import StringDtype

if isinstance(input_dtype, CategoricalDtype):
input_dtype = input_dtype.categories.dtype

if isinstance(input_dtype, ArrowDtype):
import pyarrow as pa

dtype = ArrowDtype(pa.bool_())
elif (
isinstance(input_dtype, StringDtype)
and input_dtype.na_value is not np.nan
):
from pandas.core.dtypes.common import pandas_dtype

dtype = pandas_dtype("boolean")
else:
dtype = np.bool_
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I based this logic on the existing implementation of pd.get_dummies():
https://github.com/pandas-dev/pandas/blob/v2.2.3/pandas/core/reshape/encoding.py#L252-L269

I added the condition `if dtype is None and not isinstance(input_dtype, ArrowDtype):` to avoid errors when `input_dtype` is an `ArrowDtype`.
The reason is that, without excluding `ArrowDtype`, the following code would raise an error:

sr = pd.Series(["A", "B", "A"], dtype=pd.ArrowDtype(pa.string()))
sr.str.get_dummies(dtype=pd.ArrowDtype(pa.bool_()))

Output (this issue also exists in the implementation before this PR):

...
  File "/Users/komo_fr/P_Project/pandas_workspace/pandas-komo_fr/pandas/core/strings/accessor.py", line 2532, in get_dummies
    DataFrame(result, columns=name, dtype=dtype),
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
...
pyarrow.lib.ArrowNotImplementedError: Unsupported cast from list<item: bool> to bool using function cast_boolean

With this PR, the default dtype is changed to a boolean type, which makes similar issues more likely to occur.
Since I wasn’t sure how to fully resolve this problem and it could lead to a much larger PR, I chose to exclude ArrowDtype cases for now.


result, name = self._data.array._str_get_dummies(sep, dtype)
if is_extension_array_dtype(dtype) or isinstance(dtype, ArrowDtype):
return self._wrap_result(
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/strings/object_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,7 +416,7 @@ def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None):
from pandas import Series

if dtype is None:
dtype = np.int64
dtype = np.bool_
arr = Series(self).fillna("")
try:
arr = sep + arr + sep
Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/extension/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -2206,6 +2206,16 @@ def test_get_dummies():
)
tm.assert_frame_equal(result, expected)

ser = pd.Series(
["a", "b"],
dtype=pd.CategoricalDtype(pd.Index(["a", "b"], dtype=ArrowDtype(pa.string()))),
)
result = ser.str.get_dummies()
expected = pd.DataFrame(
[[True, False], [False, True]], dtype=ArrowDtype(pa.bool_()), columns=["a", "b"]
)
tm.assert_frame_equal(result, expected)


def test_str_partition():
ser = pd.Series(["abcba", None], dtype=ArrowDtype(pa.string()))
Expand Down
29 changes: 26 additions & 3 deletions pandas/tests/strings/test_get_dummies.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
import pandas.util._test_decorators as td

from pandas import (
NA,
CategoricalDtype,
DataFrame,
Index,
MultiIndex,
Expand All @@ -22,19 +24,28 @@
def test_get_dummies(any_string_dtype):
s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
result = s.str.get_dummies("|")
expected = DataFrame([[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc"))
exp_dtype = (
"boolean"
if any_string_dtype == "string" and any_string_dtype.na_value is NA
else "bool"
)
expected = DataFrame(
[[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc"), dtype=exp_dtype
)
tm.assert_frame_equal(result, expected)

s = Series(["a;b", "a", 7], dtype=any_string_dtype)
result = s.str.get_dummies(";")
expected = DataFrame([[0, 1, 1], [0, 1, 0], [1, 0, 0]], columns=list("7ab"))
expected = DataFrame(
[[0, 1, 1], [0, 1, 0], [1, 0, 0]], columns=list("7ab"), dtype=exp_dtype
)
tm.assert_frame_equal(result, expected)


def test_get_dummies_index():
# GH9980, GH8028
idx = Index(["a|b", "a|c", "b|c"])
result = idx.str.get_dummies("|")
result = idx.str.get_dummies("|", dtype=np.int64)
Comment on lines 45 to +48
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When the input is a `pd.Index`, the output becomes a `MultiIndex`; this behavior assumes that the dtype is not a boolean type:
https://github.com/pandas-dev/pandas/blob/main/pandas/core/strings/accessor.py#L381-L389

With this PR, the default behavior of `str.get_dummies()` changes to use a boolean dtype. To keep these test cases consistent with the intended `MultiIndex` behavior, I modified them to specify the dtype explicitly.


expected = MultiIndex.from_tuples(
[(1, 1, 0), (1, 0, 1), (0, 1, 1)], names=("a", "b", "c")
Expand Down Expand Up @@ -125,3 +136,15 @@ def test_get_dummies_with_pa_str_dtype(any_string_dtype):
dtype="str[pyarrow]",
)
tm.assert_frame_equal(result, expected)


# Check the result dtype of ``str.get_dummies()`` when no ``dtype`` argument is
# given, for the string dtypes supplied by the ``string_dtype_no_object``
# fixture, used either directly or as the categories dtype of a categorical.
@pytest.mark.parametrize("dtype_type", ["string", "category"])
def test_get_dummies_ea_dtype(dtype_type, string_dtype_no_object):
dtype = string_dtype_no_object
# Expected dtype: "boolean" when the string dtype's na_value is NA, else "bool".
exp_dtype = "boolean" if dtype.na_value is NA else "bool"
if dtype_type == "category":
# Reuse the string dtype as the dtype of the categories Index.
dtype = CategoricalDtype(Index(["a", "b"], dtype))
s = Series(["a", "b"], dtype=dtype)
# Called without ``dtype`` so the default is what gets tested.
result = s.str.get_dummies()
# Integer 0/1 literals are converted to ``exp_dtype`` by the DataFrame constructor.
expected = DataFrame([[1, 0], [0, 1]], columns=list("ab"), dtype=exp_dtype)
tm.assert_frame_equal(result, expected)
Loading