SNOW-2435290: Add support for groupby.agg/min/max/count/sum/mean/median/std/var in faster pandas

sfc-gh-helmeleegy · sfc-gh-helmeleegy · commit b44816cf3a98 · 2025-10-16T15:46:28.000-07:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -122,6 +122,16 @@
   - `drop`
   - `invert`
   - `duplicated`
+  - `groupby.agg`
+  - `groupby.min`
+  - `groupby.max`
+  - `groupby.count`
+  - `groupby.sum`
+  - `groupby.mean`
+  - `groupby.median`
+  - `groupby.std`
+  - `groupby.var`
+
 - Reuse row count from the relaxed query compiler in `get_axis_len`.
 
 #### Bug Fixes
diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py
@@ -4501,6 +4501,50 @@ def groupby_agg(
         numeric_only: bool = False,
         is_series_groupby: bool = False,
         drop: bool = False,
+    ) -> "SnowflakeQueryCompiler":
+        """
+        Wrapper around _groupby_agg_internal to be supported in faster pandas.
+        """
+        relaxed_query_compiler = None
+        if self._relaxed_query_compiler is not None:
+            relaxed_query_compiler = self._relaxed_query_compiler._groupby_agg_internal(
+                by=by,
+                agg_func=agg_func,
+                axis=axis,
+                groupby_kwargs=groupby_kwargs,
+                agg_args=agg_args,
+                agg_kwargs=agg_kwargs,
+                how=how,
+                numeric_only=numeric_only,
+                is_series_groupby=is_series_groupby,
+                drop=drop,
+            )
+        qc = self._groupby_agg_internal(
+            by=by,
+            agg_func=agg_func,
+            axis=axis,
+            groupby_kwargs=groupby_kwargs,
+            agg_args=agg_args,
+            agg_kwargs=agg_kwargs,
+            how=how,
+            numeric_only=numeric_only,
+            is_series_groupby=is_series_groupby,
+            drop=drop,
+        )
+        return self._maybe_set_relaxed_qc(qc, relaxed_query_compiler)
+
+    def _groupby_agg_internal(
+        self,
+        by: Any,
+        agg_func: AggFuncType,
+        axis: int,
+        groupby_kwargs: dict[str, Any],
+        agg_args: Any,
+        agg_kwargs: dict[str, Any],
+        how: str = "axis_wise",
+        numeric_only: bool = False,
+        is_series_groupby: bool = False,
+        drop: bool = False,
     ) -> "SnowflakeQueryCompiler":
         """
         compute groupby with aggregation functions.
diff --git a/tests/integ/modin/test_faster_pandas.py b/tests/integ/modin/test_faster_pandas.py
@@ -252,6 +252,61 @@ def test_duplicated(session):
     assert_series_equal(snow_result, native_result)
 
 
+@pytest.mark.parametrize(
+    "func",
+    [
+        "min",
+        "max",
+        "count",
+        "sum",
+        "mean",
+        "median",
+        "std",
+        "var",
+    ],
+)
+@sql_count_checker(query_count=6)
+def test_groupby_agg(session, func):
+    # write a 3-row frame (two rows with A=2, one row with A=3) to a temp table
+    table_name = Utils.random_name_for_temp_object(TempObjectType.TABLE)
+    session.create_dataframe(
+        native_pd.DataFrame([[2, 12], [2, 11], [3, 13]], columns=["A", "B"])
+    ).write.save_as_table(table_name, table_type="temp")
+
+    # apply `func` four ways: whole-frame and column-"B" groupby, each via the named method and agg([func])
+    df = pd.read_snowflake(table_name)
+    snow_result1 = getattr(df.groupby("A"), func)()
+    snow_result2 = df.groupby("A").agg([func])
+    snow_result3 = getattr(df.groupby("A")["B"], func)()
+    snow_result4 = df.groupby("A")["B"].agg([func])
+
+    # verify that the input dataframe has a relaxed query compiler with _dummy_row_pos_mode enabled
+    assert df._query_compiler._relaxed_query_compiler is not None
+    assert df._query_compiler._relaxed_query_compiler._dummy_row_pos_mode is True
+    # verify the same for results 1 and 2; NOTE(review): results 3 and 4 are not checked - confirm intent
+    assert snow_result1._query_compiler._relaxed_query_compiler is not None
+    assert (
+        snow_result1._query_compiler._relaxed_query_compiler._dummy_row_pos_mode is True
+    )
+    assert snow_result2._query_compiler._relaxed_query_compiler is not None
+    assert (
+        snow_result2._query_compiler._relaxed_query_compiler._dummy_row_pos_mode is True
+    )
+
+    # compute the expected results by repeating the same four calls on the frame returned by to_pandas()
+    native_df = df.to_pandas()
+    native_result1 = getattr(native_df.groupby("A"), func)()
+    native_result2 = native_df.groupby("A").agg([func])
+    native_result3 = getattr(native_df.groupby("A")["B"], func)()
+    native_result4 = native_df.groupby("A")["B"].agg([func])
+
+    # compare each snow result with its native counterpart, ignoring dtype differences (check_dtype=False)
+    assert_frame_equal(snow_result1, native_result1, check_dtype=False)
+    assert_frame_equal(snow_result2, native_result2, check_dtype=False)
+    assert_series_equal(snow_result3, native_result3, check_dtype=False)
+    assert_frame_equal(snow_result4, native_result4, check_dtype=False)
+
+
 @sql_count_checker(query_count=3)
 def test_invert(session):
     # create tables