StringMethods get_dummies defers to pd.get_dummies

aaronchucarroll · aaronchucarroll · commit 2000f183caea · 2024-08-19T15:25:11.000-04:00
diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
@@ -2356,8 +2356,22 @@ def wrap(
         )
         return self._wrap_result(result)
 
+    from collections.abc import Iterable
+    from typing import TYPE_CHECKING
+
+    if TYPE_CHECKING:
+        from pandas._typing import NpDtype
+
     @forbid_nonstring_types(["bytes"])
-    def get_dummies(self, sep: str = "|"):
+    def get_dummies(
+        self,
+        sep: str = "|",
+        prefix=None,
+        prefix_sep: str | Iterable[str] | dict[str, str] = "_",
+        dummy_na: bool = False,
+        sparse: bool = False,
+        dtype: NpDtype | None = int,
+    ):
         """
         Return DataFrame of dummy/indicator variables for Series.
 
@@ -2395,13 +2409,69 @@ def get_dummies(self, sep: str = "|"):
         """
         # we need to cast to Series of strings as only that has all
         # methods available for making the dummies...
-        result, name = self._data.array._str_get_dummies(sep)
-        return self._wrap_result(
-            result,
-            name=name,
-            expand=True,
-            returns_string=False,
-        )
+        from pandas import (
+            MultiIndex,
+            Series,
+        )
+        from pandas.core.reshape.encoding import get_dummies
+
+        is_index = isinstance(self._data, ABCIndex)
+        input_series = Series(self._data) if is_index else self._data
+        string_series = input_series.apply(lambda x: str(x) if not isna(x) else x)
+
+        # Split on ``sep`` and stack so each token becomes its own entry, keyed
+        # by the original row label on index level 0.
+        split_series = string_series.str.split(sep, expand=True).stack()
+        # Filter the stacked entries: drop anything whose text form is "None",
+        # and drop NA entries unless they are the first entry of their row.
+        # NOTE(review): the "None" comparison also discards a genuine "None"
+        # token (e.g. "a|None") -- confirm whether that is intended.
+        is_repeated_na = (
+            split_series.index.get_level_values(0).duplicated(keep="first")
+            & split_series.isna()
+        )
+        valid_split_series = split_series[
+            (split_series.astype(str) != "None") & ~is_repeated_na
+        ]
+
+        dummy_df = get_dummies(
+            valid_split_series,
+            dummy_na=dummy_na,
+            sparse=sparse,
+            dtype=dtype,
+        )
+        # Collapse the per-token indicator rows back to one row per input row.
+        grouped_dummies = dummy_df.groupby(level=0)
+        if dtype == bool:
+            result_df = grouped_dummies.any()
+        else:
+            result_df = grouped_dummies.sum()
+
+        if isinstance(prefix, (dict, list)) and len(prefix) != len(result_df.columns):
+            len_msg = (
+                f"Length of 'prefix' ({len(prefix)}) did not match the "
+                "length of the columns being encoded "
+                f"({len(result_df.columns)})."
+            )
+            raise ValueError(len_msg)
+
+        if isinstance(prefix, str):
+            result_df.columns = [
+                f"{prefix}{prefix_sep}{col}" for col in result_df.columns
+            ]
+        elif isinstance(prefix, dict):
+            result_df.columns = [
+                f"{prefix[col]}{prefix_sep}{col}" for col in result_df.columns
+            ]
+        elif isinstance(prefix, list):
+            result_df.columns = [
+                f"{prefix[i]}{prefix_sep}{col}"
+                for i, col in enumerate(result_df.columns)
+            ]
+
+        if is_index:
+            return MultiIndex.from_frame(result_df)
+        return result_df
 
     @forbid_nonstring_types(["bytes"])
     def translate(self, table):
diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py
@@ -160,10 +160,6 @@ def _str_translate(self, table):
     def _str_wrap(self, width: int, **kwargs):
         pass
 
-    @abc.abstractmethod
-    def _str_get_dummies(self, sep: str = "|"):
-        pass
-
     @abc.abstractmethod
     def _str_isalnum(self):
         pass
diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py
@@ -372,32 +372,6 @@ def _str_wrap(self, width: int, **kwargs):
         tw = textwrap.TextWrapper(**kwargs)
         return self._str_map(lambda s: "\n".join(tw.wrap(s)))
 
-    def _str_get_dummies(self, sep: str = "|"):
-        from pandas import Series
-
-        arr = Series(self).fillna("")
-        try:
-            arr = sep + arr + sep
-        except (TypeError, NotImplementedError):
-            arr = sep + arr.astype(str) + sep
-
-        tags: set[str] = set()
-        for ts in Series(arr, copy=False).str.split(sep):
-            tags.update(ts)
-        tags2 = sorted(tags - {""})
-
-        dummies = np.empty((len(arr), len(tags2)), dtype=np.int64)
-
-        def _isin(test_elements: str, element: str) -> bool:
-            return element in test_elements
-
-        for i, t in enumerate(tags2):
-            pat = sep + t + sep
-            dummies[:, i] = lib.map_infer(
-                arr.to_numpy(), functools.partial(_isin, element=pat)
-            )
-        return dummies, tags2
-
     def _str_upper(self):
         return self._str_map(lambda x: x.upper())
 
diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py
@@ -5,6 +5,7 @@
     Index,
     MultiIndex,
     Series,
+    SparseDtype,
     _testing as tm,
 )
 
@@ -51,3 +52,78 @@ def test_get_dummies_with_name_dummy_index():
         [(1, 1, 0, 0), (0, 0, 1, 1), (0, 1, 0, 1)], names=("a", "b", "c", "name")
     )
     tm.assert_index_equal(result, expected)
+
+
+def test_get_dummies_with_prefix(any_string_dtype):
+    # a str prefix is joined to every token with the default "_" separator
+    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
+    result = s.str.get_dummies(sep="|", prefix="prefix")
+    expected = DataFrame(
+        [[1, 1, 0], [1, 0, 1], [0, 0, 0]],
+        columns=["prefix_a", "prefix_b", "prefix_c"],
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_get_dummies_with_prefix_sep(any_string_dtype):
+    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
+    # without a prefix, prefix_sep leaves the column labels untouched
+    result = s.str.get_dummies(sep="|", prefix=None, prefix_sep="__")
+    expected = DataFrame([[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=["a", "b", "c"])
+    tm.assert_frame_equal(result, expected)
+
+    # with a prefix, the custom separator joins prefix and token
+    result = s.str.get_dummies(sep="|", prefix="col", prefix_sep="__")
+    expected = DataFrame(
+        [[1, 1, 0], [1, 0, 1], [0, 0, 0]],
+        columns=["col__a", "col__b", "col__c"],
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_get_dummies_with_dummy_na(any_string_dtype):
+    # dummy_na=True adds a NaN indicator column that is set for the missing row
+    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
+    result = s.str.get_dummies(sep="|", dummy_na=True)
+    expected = DataFrame(
+        [[1, 1, 0, 0], [1, 0, 1, 0], [0, 0, 0, 1]],
+        columns=["a", "b", "c", np.nan],
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_get_dummies_with_sparse(any_string_dtype):
+    # sparse=True yields SparseDtype-backed indicator columns
+    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
+    result = s.str.get_dummies(sep="|", sparse=True)
+    expected = DataFrame(
+        [[1, 1, 0], [1, 0, 1], [0, 0, 0]],
+        columns=["a", "b", "c"],
+        dtype="Sparse[int]",
+    )
+    tm.assert_frame_equal(result, expected)
+    assert all(isinstance(dtype, SparseDtype) for dtype in result.dtypes)
+
+
+def test_get_dummies_with_dtype(any_string_dtype):
+    # dtype=bool yields boolean indicators instead of the default ints
+    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
+    result = s.str.get_dummies(sep="|", dtype=bool)
+    expected = DataFrame(
+        [[True, True, False], [True, False, True], [False, False, False]],
+        columns=["a", "b", "c"],
+    )
+    tm.assert_frame_equal(result, expected)
+    assert (result.dtypes == bool).all()
+
+
+def test_get_dummies_with_prefix_dict(any_string_dtype):
+    # a dict prefix maps each token to its own prefix
+    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
+    prefix = {"a": "alpha", "b": "beta", "c": "gamma"}
+    result = s.str.get_dummies(sep="|", prefix=prefix)
+    expected = DataFrame(
+        [[1, 1, 0], [1, 0, 1], [0, 0, 0]],
+        columns=["alpha_a", "beta_b", "gamma_c"],
+    )
+    tm.assert_frame_equal(result, expected)