SNOW-2185699: Support filtering after grouping by and aggregation (#3547)

sfc-gh-jdu · web-flow · commit 6da287ed32d4 · 2025-07-22T16:10:37.000-07:00
diff --git a/src/snowflake/snowpark/_internal/analyzer/analyzer.py b/src/snowflake/snowpark/_internal/analyzer/analyzer.py
@@ -1030,6 +1030,7 @@ def do_resolve_with_resolved_children(
                 self.analyze(
                     logical_plan.condition, df_aliased_col_name_to_real_col_name
                 ),
+                logical_plan.is_having,
                 resolved_children[logical_plan.child],
                 logical_plan,
             )
@@ -1082,6 +1083,7 @@ def do_resolve_with_resolved_children(
                     self.analyze(x, df_aliased_col_name_to_real_col_name)
                     for x in logical_plan.order
                 ],
+                logical_plan.is_order_by_append,
                 resolved_children[logical_plan.child],
                 logical_plan,
             )
diff --git a/src/snowflake/snowpark/_internal/analyzer/analyzer_utils.py b/src/snowflake/snowpark/_internal/analyzer/analyzer_utils.py
@@ -206,6 +206,7 @@
 UUID_COMMENT = "-- {}"
 MODEL = "MODEL"
 EXCLAMATION_MARK = "!"
+HAVING = " HAVING "
 
 TEMPORARY_STRING_SET = frozenset(["temporary", "temp"])
 
@@ -530,14 +531,17 @@ def project_statement(
 
 
 def filter_statement(
-    condition: str, child: str, child_uuid: Optional[str] = None
+    condition: str, is_having: bool, child: str, child_uuid: Optional[str] = None
 ) -> str:
-    return (
-        project_statement([], child, child_uuid=child_uuid)
-        + NEW_LINE
-        + WHERE
-        + condition
-    )
+    if is_having:
+        return child + NEW_LINE + HAVING + condition
+    else:
+        return (
+            project_statement([], child, child_uuid=child_uuid)
+            + NEW_LINE
+            + WHERE
+            + condition
+        )
 
 
 def sample_statement(
@@ -648,10 +652,17 @@ def aggregate_statement(
 
 
 def sort_statement(
-    order: List[str], child: str, child_uuid: Optional[str] = None
+    order: List[str],
+    is_order_by_append: bool,
+    child: str,
+    child_uuid: Optional[str] = None,
 ) -> str:
     return (
-        project_statement([], child, child_uuid=child_uuid)
+        (
+            child
+            if is_order_by_append
+            else project_statement([], child, child_uuid=child_uuid)
+        )
         + NEW_LINE
         + ORDER_BY
         + NEW_LINE
@@ -736,7 +747,7 @@ def values_statement(output: List[Attribute], data: List[Row]) -> str:
 
 def empty_values_statement(output: List[Attribute]) -> str:
     data = [Row(*[None] * len(output))]
-    return filter_statement(UNSAT_FILTER, values_statement(output, data))
+    return filter_statement(UNSAT_FILTER, False, values_statement(output, data))
 
 
 def set_operator_statement(left: str, right: str, operator: str) -> str:
diff --git a/src/snowflake/snowpark/_internal/analyzer/snowflake_plan.py b/src/snowflake/snowpark/_internal/analyzer/snowflake_plan.py
@@ -1078,12 +1078,14 @@ def aggregate(
     def filter(
         self,
         condition: str,
+        is_having: bool,
         child: SnowflakePlan,
         source_plan: Optional[LogicalPlan],
     ) -> SnowflakePlan:
         return self.build(
             lambda x: filter_statement(
                 condition,
+                is_having,
                 x,
                 child_uuid=(
                     child.uuid
@@ -1135,12 +1137,14 @@ def sample_by(
     def sort(
         self,
         order: List[str],
+        is_order_by_append: bool,
         child: SnowflakePlan,
         source_plan: Optional[LogicalPlan],
     ) -> SnowflakePlan:
         return self.build(
             lambda x: sort_statement(
                 order,
+                is_order_by_append,
                 x,
                 child_uuid=(
                     child.uuid
diff --git a/src/snowflake/snowpark/_internal/analyzer/unary_plan_node.py b/src/snowflake/snowpark/_internal/analyzer/unary_plan_node.py
@@ -88,9 +88,15 @@ def individual_node_complexity(self) -> Dict[PlanNodeCategory, int]:
 
 
 class Sort(UnaryNode):
-    def __init__(self, order: List[SortOrder], child: LogicalPlan) -> None:
+    def __init__(
+        self,
+        order: List[SortOrder],
+        child: LogicalPlan,
+        is_order_by_append: bool = False,
+    ) -> None:
         super().__init__(child)
         self.order = order
+        self.is_order_by_append = is_order_by_append
 
     @property
     def individual_node_complexity(self) -> Dict[PlanNodeCategory, int]:
@@ -242,13 +248,16 @@ def individual_node_complexity(self) -> Dict[PlanNodeCategory, int]:
 
 
 class Filter(UnaryNode):
-    def __init__(self, condition: Expression, child: LogicalPlan) -> None:
+    def __init__(
+        self, condition: Expression, child: LogicalPlan, is_having: bool = False
+    ) -> None:
         super().__init__(child)
         self.condition = condition
+        self.is_having = is_having
 
     @property
     def individual_node_complexity(self) -> Dict[PlanNodeCategory, int]:
-        # child WHERE condition
+        # child WHERE condition, or child HAVING condition when is_having is set
         return sum_node_complexities(
             {PlanNodeCategory.FILTER: 1},
             self.condition.cumulative_node_complexity,
diff --git a/src/snowflake/snowpark/dataframe.py b/src/snowflake/snowpark/dataframe.py
@@ -616,6 +616,7 @@ def __init__(
 
         self._statement_params = None
         self.is_cached: bool = is_cached  #: Whether the dataframe is cached.
+        self._is_grouped_by_and_aggregated = False
 
         # Whether all columns are VARIANT data type,
         # which support querying nested fields via dot notations
@@ -1964,18 +1965,41 @@ def filter(
             else:
                 stmt = _ast_stmt
 
-        if self._select_statement:
+        # In snowpark_connect_compatible mode, a filter on an aggregated DataFrame is
+        # appended to the aggregation query as HAVING rather than as a nested WHERE.
+        if (
+            context._is_snowpark_connect_compatible_mode
+            and self._is_grouped_by_and_aggregated
+        ):
+            having_plan = Filter(filter_col_expr, self._plan, is_having=True)
+            if self._select_statement:
+                df = self._with_plan(
+                    self._session._analyzer.create_select_statement(
+                        from_=self._session._analyzer.create_select_snowflake_plan(
+                            having_plan, analyzer=self._session._analyzer
+                        ),
+                        analyzer=self._session._analyzer,
+                    ),
+                    _ast_stmt=stmt,
+                )
+            else:
+                df = self._with_plan(having_plan, _ast_stmt=stmt)
+            df._is_grouped_by_and_aggregated = True
+            return df
+        else:
+            if self._select_statement:
+                return self._with_plan(
+                    self._select_statement.filter(filter_col_expr),
+                    _ast_stmt=stmt,
+                )
             return self._with_plan(
-                self._select_statement.filter(filter_col_expr),
+                Filter(
+                    filter_col_expr,
+                    self._plan,
+                    is_having=False,
+                ),
                 _ast_stmt=stmt,
             )
-        return self._with_plan(
-            Filter(
-                filter_col_expr,
-                self._plan,
-            ),
-            _ast_stmt=stmt,
-        )
 
     @df_api_usage
     @publicapi
@@ -2105,16 +2129,40 @@ def sort(
                     SortOrder(exprs[idx], orders[idx] if orders else Ascending())
                 )
 
-        df = (
-            self._with_plan(self._select_statement.sort(sort_exprs))
-            if self._select_statement
-            else self._with_plan(Sort(sort_exprs, self._plan))
-        )
+        # In snowpark_connect_compatible mode, sorting an aggregated DataFrame appends
+        # ORDER BY directly to the aggregation query instead of nesting it in a SELECT.
+        if (
+            context._is_snowpark_connect_compatible_mode
+            and self._is_grouped_by_and_aggregated
+        ):
+            sort_plan = Sort(sort_exprs, self._plan, is_order_by_append=True)
+            if self._select_statement:
+                df = self._with_plan(
+                    self._session._analyzer.create_select_statement(
+                        from_=self._session._analyzer.create_select_snowflake_plan(
+                            sort_plan, analyzer=self._session._analyzer
+                        ),
+                        analyzer=self._session._analyzer,
+                    ),
+                    _ast_stmt=stmt,
+                )
+            else:
+                df = self._with_plan(sort_plan, _ast_stmt=stmt)
+            df._is_grouped_by_and_aggregated = True
+            return df
+        else:
+            df = (
+                self._with_plan(self._select_statement.sort(sort_exprs))
+                if self._select_statement
+                else self._with_plan(
+                    Sort(sort_exprs, self._plan, is_order_by_append=False)
+                )
+            )
 
-        if _emit_ast:
-            df._ast_id = stmt.uid
+            if _emit_ast:
+                df._ast_id = stmt.uid
 
-        return df
+            return df
 
     @experimental(version="1.5.0")
     @publicapi
diff --git a/src/snowflake/snowpark/relational_grouped_dataframe.py b/src/snowflake/snowpark/relational_grouped_dataframe.py
@@ -324,6 +324,7 @@ def agg(
                     agg_exprs.append(_str_to_expr(e[1], _emit_ast)(col_expr))
 
         df = self._to_df(agg_exprs, _emit_ast=False)
+        df._is_grouped_by_and_aggregated = True
 
         if _emit_ast:
             df._ast_id = stmt.uid
@@ -514,6 +515,7 @@ def end_partition(
             ),
             _emit_ast=False,
         )
+        df._is_grouped_by_and_aggregated = True
 
         if _emit_ast:
             stmt = working_dataframe._session._ast_batch.bind()
@@ -692,6 +694,7 @@ def count(self, _emit_ast: bool = True) -> DataFrame:
             ],
             _emit_ast=False,
         )
+        df._is_grouped_by_and_aggregated = True
 
         # TODO: count seems similar to mean, min, .... Can we unify implementation here?
         if _emit_ast:
@@ -727,6 +730,7 @@ def _function(
             )._expression
             agg_exprs.append(expr)
         df = self._to_df(agg_exprs)
+        df._is_grouped_by_and_aggregated = True
 
         if _emit_ast:
             stmt = self._dataframe._session._ast_batch.bind()
diff --git a/tests/integ/test_df_aggregate.py b/tests/integ/test_df_aggregate.py
diff --git a/tests/unit/test_analyzer_util_suite.py b/tests/unit/test_analyzer_util_suite.py