Skip to content

Commit 7f59deb

Browse files
SNOW-2396077: Support dtype parameter in get_dummies. (#3879)
The "dtype" parameter controls the values of the indicator variables, e.g. dtype=int means that we use 1 and 0 instead of True and False, respectively. Signed-off-by: sfc-gh-mvashishtha <mahesh.vashishtha@snowflake.com> Co-authored-by: Hazem Elmeleegy <hazem.elmeleegy@snowflake.com>
1 parent b524000 commit 7f59deb

File tree

8 files changed

+301
-101
lines changed

8 files changed

+301
-101
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,9 @@
6464

6565
### Snowpark pandas API Updates
6666

67+
#### New Features
68+
- Added support for the `dtype` parameter of `pd.get_dummies`
69+
6770
#### Improvements
6871

6972
- Improved performance of `Series.to_snowflake` and `pd.to_snowflake(series)` for large data by uploading data via a parquet file. You can control the dataset size at which Snowpark pandas switches to parquet with the variable `modin.config.PandasToSnowflakeParquetThresholdBytes`.

docs/source/modin/supported/general_supported.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,8 @@ Data manipulations
3232
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
3333
| ``from_dummies`` | N | | |
3434
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
35-
| ``get_dummies`` | P | ``sparse`` is ignored | ``Y`` if params ``dummy_na``, ``drop_first`` |
36-
| | | | and ``dtype`` are default, otherwise ``N`` |
35+
| ``get_dummies`` | P | ``sparse`` is ignored | ``Y`` if params ``dummy_na`` and ``drop_first`` |
36+
| | | | are default, otherwise ``N`` |
3737
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
3838
| ``json_normalize`` | Y | | |
3939
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+

src/snowflake/snowpark/modin/plugin/_internal/get_dummies_utils.py

Lines changed: 53 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,18 @@
33
#
44

55
from collections.abc import Hashable
6+
from typing import Any
7+
8+
from pandas.api.types import (
9+
is_bool_dtype,
10+
is_datetime64_any_dtype,
11+
is_float_dtype,
12+
is_integer_dtype,
13+
is_object_dtype,
14+
is_timedelta64_dtype,
15+
is_string_dtype,
16+
)
17+
import pandas as native_pd
618

719
from snowflake.snowpark.functions import (
820
col,
@@ -37,6 +49,7 @@ def single_get_dummies_pivot(
3749
pivot_column_snowflake_quoted_identifier: str,
3850
columns_to_keep_snowflake_quoted_identifiers: list[str],
3951
columns_to_keep_pandas_labels: list[Hashable],
52+
dummy_false: Any,
4053
) -> InternalFrame:
4154
"""
4255
Helper function for get dummies to perform a single pivot on the encoded column.
@@ -51,6 +64,7 @@ def single_get_dummies_pivot(
5164
internal_frame to keep as the data column of final result internal frame.
5265
columns_to_keep_pandas_labels: The pandas label in the internal_frame to keep as the
5366
data_column of final result internal frame.
67+
dummy_false: The scalar value representing that a particular column value is not present.
5468
5569
Note: columns_to_keep_snowflake_quoted_identifiers must be the same length as columns_to_keep_pandas_labels
5670
Returns:
@@ -93,7 +107,7 @@ def single_get_dummies_pivot(
93107
columns_snowflake_quoted_identifier
94108
)
95109
# Perform pivot on the pivot column with dummy lit true column as value column.
96-
# With the above example, the result of pivot will be:
110+
# With the above example, the result of pivot will be (assuming dtype is bool):
97111
#
98112
# C a b
99113
# 0 1 True False
@@ -102,7 +116,7 @@ def single_get_dummies_pivot(
102116
pivoted_ordered_dataframe = ordered_dataframe.pivot(
103117
col(str(pivot_column_snowflake_quoted_identifier)),
104118
None,
105-
0,
119+
pandas_lit(dummy_false),
106120
min_(lit_true_column_snowflake_quoted_identifier),
107121
)
108122
pivoted_ordered_dataframe = pivoted_ordered_dataframe.sort(
@@ -179,11 +193,41 @@ def single_get_dummies_pivot(
179193
)
180194

181195

196+
def _get_dummies_true_and_false_values(dtype: Any) -> tuple[Any, Any]:
197+
"""
198+
Get the indicator values representing whether a column is equal to a particular value.
199+
200+
Args:
201+
dtype: The dtype of the indicator column.
202+
203+
Returns:
204+
A tuple of the indicator values. The first value represents that the
205+
value is present, and the second value represents that the value is not
206+
present.
207+
"""
208+
if is_object_dtype(dtype):
209+
raise ValueError("dtype=object is not a valid dtype for get_dummies")
210+
if is_string_dtype(dtype):
211+
return ("1", "")
212+
if is_bool_dtype(dtype) or dtype is None:
213+
return (True, False)
214+
if is_integer_dtype(dtype):
215+
return (1, 0)
216+
if is_float_dtype(dtype):
217+
return (1.0, 0.0)
218+
if is_datetime64_any_dtype(dtype):
219+
return (native_pd.Timestamp(1), native_pd.Timestamp(0))
220+
if is_timedelta64_dtype(dtype):
221+
ErrorMessage.not_implemented_for_timedelta(method="get_dummies")
222+
raise TypeError(f"data type '{dtype}' not understood")
223+
224+
182225
def get_dummies_helper(
183226
internal_frame: InternalFrame,
184227
columns: list[Hashable],
185228
prefixes: list[Hashable],
186229
prefix_sep: str,
230+
dtype: Any,
187231
dummy_row_pos_mode: bool = False,
188232
) -> InternalFrame:
189233
"""
@@ -222,11 +266,12 @@ def get_dummies_helper(
222266
f"get_dummies with duplicated columns {pandas_label}"
223267
)
224268

225-
# append a lit true column as value column for pivot
269+
dummy_true, dummy_false = _get_dummies_true_and_false_values(dtype)
270+
271+
# the dummy column is appended as the last data column of the new_internal_frame
226272
new_internal_frame = internal_frame.ensure_row_position_column(
227273
dummy_row_pos_mode
228-
).append_column(LIT_TRUE_COLUMN_PANDAS_LABEL, pandas_lit(True))
229-
# the dummy column is appended as the last data column of the new_internal_frame
274+
).append_column(LIT_TRUE_COLUMN_PANDAS_LABEL, pandas_lit(dummy_true))
230275
row_position_column_snowflake_quoted_identifier = (
231276
new_internal_frame.row_position_snowflake_quoted_identifier
232277
)
@@ -266,7 +311,7 @@ def get_dummies_helper(
266311

267312
# Do the first pivot with the first column and keep all remaining columns.
268313
# With the example given above, the first pivot is performed on column A, and we will
269-
# get the following result:
314+
# get the following result (assuming dtype is int):
270315
# C A_a A_b
271316
# 0 1 1 0
272317
# 1 2 0 1
@@ -278,6 +323,7 @@ def get_dummies_helper(
278323
pivot_column_snowflake_quoted_identifier=grouped_quoted_identifiers[0][0],
279324
columns_to_keep_snowflake_quoted_identifiers=remaining_data_column_snowflake_quoted_identifiers,
280325
columns_to_keep_pandas_labels=remaining_data_column_pandas_labels,
326+
dummy_false=dummy_false,
281327
)
282328

283329
# Perform pivot on rest columns and join on the row position column to form the final result.
@@ -294,6 +340,7 @@ def get_dummies_helper(
294340
pivot_column_snowflake_quoted_identifier=grouped_quoted_identifiers[i][0],
295341
columns_to_keep_snowflake_quoted_identifiers=[],
296342
columns_to_keep_pandas_labels=[],
343+
dummy_false=dummy_false,
297344
)
298345
result_internal_frame = join_utils.join(
299346
result_internal_frame,

src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7003,10 +7003,6 @@ def groupby_pct_change(
70037003
unsupported_conditions=[
70047004
("dummy_na", True),
70057005
("drop_first", True),
7006-
(
7007-
lambda args: args.get("dtype") is not None,
7008-
"get_dummies with non-default dtype parameter is not supported yet in Snowpark pandas.",
7009-
),
70107006
]
70117007
),
70127008
)
@@ -7049,9 +7045,9 @@ def get_dummies(
70497045
"""
70507046
self._raise_not_implemented_error_for_timedelta()
70517047

7052-
if dummy_na is True or drop_first is True or dtype is not None:
7048+
if dummy_na is True or drop_first is True:
70537049
ErrorMessage.not_implemented(
7054-
"get_dummies with non-default dummy_na, drop_first, and dtype parameters"
7050+
"get_dummies with non-default dummy_na or drop_first parameters"
70557051
+ " is not supported yet in Snowpark pandas."
70567052
)
70577053
if columns is None:
@@ -7095,6 +7091,7 @@ def get_dummies(
70957091
columns=columns,
70967092
prefixes=prefix,
70977093
prefix_sep=prefix_sep,
7094+
dtype=dtype,
70987095
)
70997096
query_compiler = SnowflakeQueryCompiler(result_internal_frame)
71007097

tests/integ/modin/strings/test_get_dummies_dataframe.py renamed to tests/integ/modin/frame/test_get_dummies.py

Lines changed: 91 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,15 @@
99
import pytest
1010

1111
import snowflake.snowpark.modin.plugin # noqa: F401
12+
from pytest import param
1213
from snowflake.snowpark._internal.utils import (
1314
TempObjectType,
1415
random_name_for_temp_object,
1516
)
16-
from tests.integ.modin.utils import assert_snowpark_pandas_equal_to_pandas
17+
from tests.integ.modin.utils import (
18+
assert_snowpark_pandas_equal_to_pandas,
19+
eval_snowpark_pandas_result,
20+
)
1721
from tests.integ.utils.sql_counter import sql_count_checker
1822

1923

@@ -246,25 +250,98 @@ def test_get_dummies_pandas_after_read_snowflake(session):
246250
assert_snowpark_pandas_equal_to_pandas(snow_get_dummies, pandas_get_dummies)
247251

248252

249-
@sql_count_checker(query_count=0)
250-
def test_get_dummies_pandas_negative():
251-
252-
pandas_df = native_pd.DataFrame(
253-
{"A": ["a", "b", "a"], "B": ["b", "a", "c"], "C": [1, 2, 3]}
253+
class TestDtypeParameter:
254+
@pytest.mark.parametrize(
255+
"dtype",
256+
[
257+
np.int64,
258+
int,
259+
"int",
260+
float,
261+
np.float64,
262+
"float64",
263+
str,
264+
"str",
265+
np.str_,
266+
bool,
267+
"bool",
268+
np.bool_,
269+
"datetime64[ns]",
270+
param(
271+
"timedelta64[ns]",
272+
marks=pytest.mark.xfail(strict=True, raises=NotImplementedError),
273+
),
274+
None,
275+
],
254276
)
277+
@sql_count_checker(query_count=1)
278+
def test_valid_dtype(self, dtype):
279+
pandas_df = native_pd.DataFrame({"A": ["a", "b", "a"]})
280+
snow_df = pd.DataFrame(pandas_df)
281+
# note that we're using the default check_dtype=True to check that we
282+
# are producing the correct dtypes.
283+
assert_snowpark_pandas_equal_to_pandas(
284+
pd.get_dummies(snow_df, dtype=dtype),
285+
native_pd.get_dummies(pandas_df, dtype=dtype),
286+
)
255287

256-
snow_df = pd.DataFrame(pandas_df)
288+
@sql_count_checker(query_count=1)
289+
def test_valid_dtype_argument_int32(self):
290+
"""Test int32 separately because Snowpark pandas always produces int64 for integers."""
291+
pandas_df = native_pd.DataFrame({"A": ["a", "b", "a"]})
292+
snow_df = pd.DataFrame(pandas_df)
293+
snow_result = pd.get_dummies(snow_df, dtype=np.int32)
294+
pandas_result = native_pd.get_dummies(pandas_df, dtype=np.int32)
295+
# note that we're using the default check_dtype=True to check that we
296+
# are producing the correct dtypes.
297+
assert_snowpark_pandas_equal_to_pandas(
298+
snow_result, pandas_result.astype(np.int64)
299+
)
257300

258-
with pytest.raises(NotImplementedError):
259-
pd.get_dummies(
260-
snow_df,
261-
prefix=["col1", "col2"],
262-
dummy_na=True,
263-
drop_first=True,
264-
dtype=np.int32,
301+
@sql_count_checker(query_count=0)
302+
def test_invalid_dtype_argument(self):
303+
eval_snowpark_pandas_result(
304+
pd,
305+
native_pd,
306+
lambda module: module.get_dummies(
307+
module.DataFrame({"A": ["a", "b", "a"]}), dtype="invalid_dtype"
308+
),
309+
expect_exception=True,
310+
expect_exception_type=TypeError,
311+
expect_exception_match=re.escape(
312+
"data type 'invalid_dtype' not understood"
313+
),
314+
)
315+
316+
@sql_count_checker(query_count=0)
317+
@pytest.mark.parametrize("dtype", ["object", np.dtype("object")])
318+
def test_invalid_dtype_argument_object(self, dtype):
319+
eval_snowpark_pandas_result(
320+
pd,
321+
native_pd,
322+
lambda module: module.get_dummies(
323+
module.DataFrame({"A": ["a", "b", "a"]}), dtype=dtype
324+
),
325+
expect_exception=True,
326+
expect_exception_type=ValueError,
327+
expect_exception_match=re.escape(
328+
"dtype=object is not a valid dtype for get_dummies"
329+
),
265330
)
266331

267332

333+
@sql_count_checker(query_count=0)
334+
def test_dummy_na_negative():
335+
with pytest.raises(NotImplementedError):
336+
pd.get_dummies(pd.DataFrame(["a", None]), dummy_na=True)
337+
338+
339+
@sql_count_checker(query_count=0)
340+
def test_drop_first_negative():
341+
with pytest.raises(NotImplementedError):
342+
pd.get_dummies(pd.DataFrame(["a", "b"]), drop_first=True)
343+
344+
268345
@sql_count_checker(query_count=0)
269346
def test_get_dummies_pandas_negative_duplicated_columns():
270347
pandas_df = native_pd.DataFrame(

tests/integ/modin/hybrid/test_switch_operations.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -884,7 +884,6 @@ def test_auto_switch_supported_post_op_switch_point_series(method, kwargs):
884884
"get_dummies",
885885
{"drop_first": True},
886886
),
887-
("get_dummies", {"dtype": int}),
888887
],
889888
)
890889
def test_auto_switch_unsupported_top_level_functions(method, kwargs):
@@ -1039,11 +1038,6 @@ def test_auto_switch_unsupported_series(method, kwargs):
10391038
{"drop_first": True},
10401039
"drop_first = True is not supported",
10411040
),
1042-
(
1043-
"get_dummies",
1044-
{"dtype": int},
1045-
"get_dummies with non-default dtype parameter is not supported yet in Snowpark pandas.",
1046-
),
10471041
],
10481042
)
10491043
@sql_count_checker(query_count=0)

0 commit comments

Comments
 (0)