Merge branch 'refs/heads/main' into feature/aherrera/SNOW-2432059-StringAndBinary-part1

sfc-gh-aherreraaguilar · sfc-gh-aherreraaguilar · commit 2a5afac081b4 · 2025-10-20T11:35:07.000-06:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -87,6 +87,7 @@
 
 #### New Features
 - Added support for the `dtypes` parameter of `pd.get_dummies`
+- Added support for `nunique` in `df.pivot_table`, `df.agg` and other places where aggregate functions can be used.
 
 #### Improvements
 
diff --git a/docs/source/modin/supported/agg_supp.rst b/docs/source/modin/supported/agg_supp.rst
@@ -38,6 +38,9 @@ methods ``pd.pivot_table``, ``DataFrame.pivot_table``, and ``pd.crosstab``.
 | ``median``                  | ``Y`` for ``axis=0``.               | ``Y``                            | ``Y``                                      | ``Y``                                   | ``Y``                                   |
 |                             | ``N`` for  ``axis=1``.              |                                  |                                            |                                         |                                         |
 +-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+-----------------------------------------+
+| ``nunique``                 | ``Y`` for ``axis=0``.               | ``Y``                            | ``Y``                                      | ``Y``                                   | ``Y``                                   |
+|                             | ``N`` for  ``axis=1``.              |                                  |                                            |                                         |                                         |
++-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+-----------------------------------------+
 | ``size``                    | ``Y`` for ``axis=0``.               | ``Y``                            | ``Y``                                      | ``Y``                                   | ``N``                                   |
 |                             | ``N`` for  ``axis=1``.              |                                  |                                            |                                         |                                         |
 +-----------------------------+-------------------------------------+----------------------------------+--------------------------------------------+-----------------------------------------+-----------------------------------------+
diff --git a/src/snowflake/snowpark/modin/plugin/_internal/aggregation_utils.py b/src/snowflake/snowpark/modin/plugin/_internal/aggregation_utils.py
@@ -578,6 +578,11 @@ def _create_pandas_to_snowpark_pandas_aggregation_map(
             preserves_snowpark_pandas_types=False,
             supported_in_pivot=True,
         ),
+        "nunique": _SnowparkPandasAggregation(
+            axis_0_aggregation=count_distinct,
+            preserves_snowpark_pandas_types=False,
+            supported_in_pivot=True,
+        ),
         **_create_pandas_to_snowpark_pandas_aggregation_map(
             (len, "size"),
             _SnowparkPandasAggregation(
@@ -719,11 +724,6 @@ def _create_pandas_to_snowpark_pandas_aggregation_map(
             preserves_snowpark_pandas_types=True,
             supported_in_pivot=False,
         ),
-        "nunique": _SnowparkPandasAggregation(
-            axis_0_aggregation=count_distinct,
-            preserves_snowpark_pandas_types=False,
-            supported_in_pivot=False,
-        ),
     }
 )
 
diff --git a/src/snowflake/snowpark/modin/plugin/_internal/pivot_utils.py b/src/snowflake/snowpark/modin/plugin/_internal/pivot_utils.py
@@ -22,6 +22,7 @@
     min as min_,
     object_construct,
     sum as sum_,
+    count_distinct,
 )
 from snowflake.snowpark.modin.plugin._internal.aggregation_utils import (
     get_pandas_aggr_func_name,
@@ -768,7 +769,7 @@ def prepare_pivot_aggregation_for_handling_missing_and_null_values(
       bar  | 0.0   | Nan   | 0.0 | Nan
       foo  | 1.0   | 1.0   | 0.0 | 1.0
 
-    To match pandas behavior, we do an upfront group-by aggregation for count and sum to get the correct
+    To match pandas behavior, we do an upfront group-by aggregation for count, nunique and sum to get the correct
     values for all null values via snowflake query:
 
     select a, b, coalesce(sum(C), 0) as sum_c, count(C) as cnt_c from df_small_data group by a, b;
@@ -792,16 +793,21 @@ def prepare_pivot_aggregation_for_handling_missing_and_null_values(
         Snowpark dataframe that has done an pre-pivot aggregation needed for matching pandas pivot behavior as
         described earlier.
     """
-    if snowpark_aggr_func in [sum_, count]:
-        agg_expr = (
-            coalesce(sum_(aggr_snowflake_quoted_identifier), pandas_lit(0)).as_(
+    if snowpark_aggr_func in [sum_, count, count_distinct]:
+        if snowpark_aggr_func == sum_:
+            agg_expr = coalesce(
+                sum_(aggr_snowflake_quoted_identifier), pandas_lit(0)
+            ).as_(aggr_snowflake_quoted_identifier)
+        elif snowpark_aggr_func == count:
+            agg_expr = count(aggr_snowflake_quoted_identifier).as_(
                 aggr_snowflake_quoted_identifier
             )
-            if snowpark_aggr_func == sum_
-            else count(aggr_snowflake_quoted_identifier).as_(
+        elif snowpark_aggr_func == count_distinct:
+            agg_expr = count_distinct(aggr_snowflake_quoted_identifier).as_(
                 aggr_snowflake_quoted_identifier
             )
-        )
+        else:
+            raise NotImplementedError("Aggregate function not supported for pivot")
         pre_pivot_ordered_dataframe = pivot_ordered_dataframe.group_by(
             grouping_snowflake_quoted_identifiers, agg_expr
         )
diff --git a/tests/integ/modin/frame/test_aggregate.py b/tests/integ/modin/frame/test_aggregate.py
@@ -65,6 +65,7 @@ def native_df_multiindex() -> native_pd.DataFrame:
             3,
         ),
         (lambda df: df.aggregate({"A": ["count", "max"], "B": [max, "min"]}), 2),
+        (lambda df: df.aggregate("nunique"), 0),
         (
             lambda df: df.aggregate(
                 x=pd.NamedAgg("A", "max"), y=("B", "min"), c=("A", "count")
diff --git a/tests/integ/modin/groupby/test_groupby_basic_agg.py b/tests/integ/modin/groupby/test_groupby_basic_agg.py
@@ -413,6 +413,7 @@ def test_groupby_agg_with_int_dtypes(int_to_decimal_float_agg_method) -> None:
         np.min,
         min,
         sum,
+        "nunique",
         np.std,
         "var",
         {"col2": "sum"},
diff --git a/tests/integ/modin/groupby/test_groupby_series.py b/tests/integ/modin/groupby/test_groupby_series.py
@@ -49,6 +49,7 @@ def test_groupby_series_count_with_nan():
         np.median,
         np.std,
         "var",
+        "nunique",
         [np.var],
         ["sum", np.std],
         ["sum", np.median, sum],
diff --git a/tests/integ/modin/pivot/test_pivot_table_dropna.py b/tests/integ/modin/pivot/test_pivot_table_dropna.py
@@ -107,7 +107,7 @@ def test_pivot_table_single_all_aggfuncs_dropna_and_null_data(
     df_data_with_nulls_2,
     values,
 ):
-    expected_join_count = 10 if len(values) > 1 else 5
+    expected_join_count = 12 if len(values) > 1 else 6
     with SqlCounter(query_count=1, join_count=expected_join_count):
         pivot_table_test_helper(
             df_data_with_nulls_2,
@@ -116,7 +116,7 @@ def test_pivot_table_single_all_aggfuncs_dropna_and_null_data(
                 "columns": ["C"],
                 "values": values,
                 "dropna": False,
-                "aggfunc": ["count", "sum", "min", "max", "mean"],
+                "aggfunc": ["count", "sum", "min", "max", "mean", "nunique"],
             },
         )
 
diff --git a/tests/integ/modin/pivot/test_pivot_table_margins.py b/tests/integ/modin/pivot/test_pivot_table_margins.py
@@ -134,7 +134,7 @@ def test_pivot_table_multiple_columns_values_with_margins(
         ),
     ],
 )
-@sql_count_checker(query_count=1, join_count=5, union_count=1)
+@sql_count_checker(query_count=1, join_count=6, union_count=1)
 def test_pivot_table_multiple_pivot_values_null_data_with_margins(
     df_data_with_nulls, index, fill_value
 ):
@@ -144,7 +144,7 @@ def test_pivot_table_multiple_pivot_values_null_data_with_margins(
             "index": index,
             "columns": "C",
             "values": "F",
-            "aggfunc": ["count", "sum", "mean"],
+            "aggfunc": ["count", "sum", "mean", "nunique"],
             "dropna": False,
             "fill_value": fill_value,
             "margins": True,
diff --git a/tests/integ/modin/pivot/test_pivot_table_multiple.py b/tests/integ/modin/pivot/test_pivot_table_multiple.py
@@ -38,14 +38,14 @@ def test_pivot_table_no_index_single_column_multiple_values(df_data):
     )
 
 
-@sql_count_checker(query_count=1, union_count=1, join_count=2)
+@sql_count_checker(query_count=1, union_count=1, join_count=4)
 def test_pivot_table_no_index_single_column_multiple_values_multiple_aggr_func(df_data):
     pivot_table_test_helper(
         df_data,
         {
             "columns": "B",
             "values": ["D", "E"],
-            "aggfunc": ["mean", "max"],
+            "aggfunc": ["mean", "max", "nunique"],
         },
     )
 
@@ -119,7 +119,7 @@ def test_pivot_table_single_index_multiple_column_single_value(
     )
 
 
-@pytest.mark.parametrize("aggfunc", ["count", "sum", "min", "max", "mean"])
+@pytest.mark.parametrize("aggfunc", ["count", "sum", "min", "max", "mean", "nunique"])
 @pytest.mark.parametrize("values", ["D", ["D"]])
 @sql_count_checker(query_count=1)
 def test_pivot_table_no_index_multiple_column_single_value(df_data, aggfunc, values):
diff --git a/tests/integ/modin/pivot/test_pivot_table_negative.py b/tests/integ/modin/pivot/test_pivot_table_negative.py
@@ -168,7 +168,6 @@ class Baz:
         (np.any, "np.any"),
         ("size", "'size'"),
         (len, "<built-in function len>"),
-        ("nunique", "'nunique'"),
         ("idxmax", "'idxmax'"),
         ("idxmin", "'idxmin'"),
     ],
diff --git a/tests/integ/modin/pivot/test_pivot_table_single.py b/tests/integ/modin/pivot/test_pivot_table_single.py
@@ -54,6 +54,7 @@ def test_pivot_table_no_index_single_column_single_value(df_data):
         np.std,
         "var",
         np.var,
+        "nunique",
     ),
 )
 @sql_count_checker(query_count=1)
@@ -77,6 +78,7 @@ def test_pivot_table_single_index_single_column_single_value(df_data, aggfunc):
         "min",
         "max",
         "mean",
+        "nunique",
     ],
 )
 @sql_count_checker(query_count=1)
@@ -95,6 +97,7 @@ def test_pivot_table_multi_index_single_column_single_value(df_data, aggfunc):
         "min",
         "max",
         "mean",
+        "nunique",
     ],
 )
 @sql_count_checker(query_count=1)
@@ -201,6 +204,7 @@ def test_pivot_table_with_duplicate_values(
     [
         "count",
         "sum",
+        "nunique",
     ],
 )
 @pytest.mark.parametrize(
@@ -228,6 +232,7 @@ def test_pivot_table_with_sum_and_count_null_and_empty_values_matching_behavior(
     [
         "count",
         "sum",
+        "nunique",
     ],
 )
 def test_pivot_table_with_sum_and_count_null_and_empty_values_matching_behavior_skipped(
diff --git a/tests/integ/modin/series/test_aggregate.py b/tests/integ/modin/series/test_aggregate.py
@@ -55,6 +55,7 @@ def validate_scalar_result(res1, res2):
             1,
         ),  # Test order of index is correct.
         (lambda df: df.aggregate(["min", np.max, "count"]), False, True, 2),
+        (lambda df: df.aggregate("nunique"), True, False, 0),
         (lambda df: df.aggregate(x="min", y=np.max, z="count"), False, True, 2),
         (lambda df: df.aggregate(min), True, False, 0),
         (lambda df: df.max(), True, False, 0),

Original file line number	Diff line number	Diff line change
`@@ -65,6 +65,7 @@ def native_df_multiindex() -> native_pd.DataFrame:`
`65`	`65`	`3,`
`66`	`66`	`),`
`67`	`67`	`(lambda df: df.aggregate({"A": ["count", "max"], "B": [max, "min"]}), 2),`
	`68`	`+ (lambda df: df.aggregate("nunique"), 0),`
`68`	`69`	`(`
`69`	`70`	`lambda df: df.aggregate(`
`70`	`71`	`x=pd.NamedAgg("A", "max"), y=("B", "min"), c=("A", "count")`
Original file line number	Diff line number	Diff line change
`@@ -38,14 +38,14 @@ def test_pivot_table_no_index_single_column_multiple_values(df_data):`
`38`	`38`	`)`
`39`	`39`
`40`	`40`
`41`		`-@sql_count_checker(query_count=1, union_count=1, join_count=2)`
	`41`	`+@sql_count_checker(query_count=1, union_count=1, join_count=4)`
`42`	`42`	`def test_pivot_table_no_index_single_column_multiple_values_multiple_aggr_func(df_data):`
`43`	`43`	`pivot_table_test_helper(`
`44`	`44`	`df_data,`
`45`	`45`	`{`
`46`	`46`	`"columns": "B",`
`47`	`47`	`"values": ["D", "E"],`
`48`		`- "aggfunc": ["mean", "max"],`
	`48`	`+ "aggfunc": ["mean", "max", "nunique"],`
`49`	`49`	`},`
`50`	`50`	`)`
`51`	`51`
`@@ -119,7 +119,7 @@ def test_pivot_table_single_index_multiple_column_single_value(`
`119`	`119`	`)`
`120`	`120`
`121`	`121`
`122`		`-@pytest.mark.parametrize("aggfunc", ["count", "sum", "min", "max", "mean"])`
	`122`	`+@pytest.mark.parametrize("aggfunc", ["count", "sum", "min", "max", "mean", "nunique"])`
`123`	`123`	`@pytest.mark.parametrize("values", ["D", ["D"]])`
`124`	`124`	`@sql_count_checker(query_count=1)`
`125`	`125`	`def test_pivot_table_no_index_multiple_column_single_value(df_data, aggfunc, values):`