Loosen flattening rules for sort and filter

sfc-gh-yixie · sfc-gh-yixie · commit d92ea24541aa · 2025-10-22T14:06:46.000-07:00
diff --git a/src/snowflake/snowpark/_internal/analyzer/select_statement.py b/src/snowflake/snowpark/_internal/analyzer/select_statement.py
@@ -20,6 +20,7 @@
     Sequence,
     Set,
     Union,
+    Literal,
 )
 
 import snowflake.snowpark._internal.utils
@@ -1362,7 +1363,7 @@ def select(self, cols: List[Expression]) -> "SelectStatement":
         ):
             # TODO: Clean up, this entire if case is parameter protection
             can_be_flattened = False
-        elif (self.where or self.order_by or self.limit_) and has_data_generator_exp(
+        elif (self.where or self.order_by or self.limit_) and has_data_generator_or_window_function_exp(
             cols
         ):
             can_be_flattened = False
@@ -1453,9 +1454,9 @@ def filter(self, col: Expression) -> "SelectStatement":
         can_be_flattened = (
             (not self.flatten_disabled)
             and can_clause_dependent_columns_flatten(
-                derive_dependent_columns(col), self.column_states
+                derive_dependent_columns(col), self.column_states, "filter"
             )
-            and not has_data_generator_exp(self.projection)
+            and not has_data_generator_or_window_function_exp(self.projection)
             and not (self.order_by and self.limit_ is not None)
         )
         if can_be_flattened:
@@ -1490,7 +1491,7 @@ def sort(self, cols: List[Expression]) -> "SelectStatement":
             and (not self.limit_)
             and (not self.offset)
             and can_clause_dependent_columns_flatten(
-                derive_dependent_columns(*cols), self.column_states
+                derive_dependent_columns(*cols), self.column_states, "sort"
             )
             and not has_data_generator_exp(self.projection)
         )
@@ -1529,7 +1530,7 @@ def distinct(self) -> "SelectStatement":
             # .order_by(col1).select(col2).distinct() cannot be flattened because
             # SELECT DISTINCT B FROM TABLE ORDER BY A is not valid SQL
             and (not (self.order_by and self.has_projection))
-            and not has_data_generator_exp(self.projection)
+            and not has_data_generator_or_window_function_exp(self.projection)
         )
         if can_be_flattened:
             new = copy(self)
@@ -2020,7 +2021,10 @@ def can_projection_dependent_columns_be_flattened(
 def can_clause_dependent_columns_flatten(
     dependent_columns: Optional[AbstractSet[str]],
     subquery_column_states: ColumnStateDict,
+    clause: Literal["filter", "sort"],
 ) -> bool:
+    if clause not in ["filter", "sort"]:
+        raise ValueError(f"Invalid clause called in can_clause_dependent_columns_flatten: {clause}")
     if dependent_columns == COLUMN_DEPENDENCY_DOLLAR:
         return False
     elif (
@@ -2034,15 +2038,7 @@ def can_clause_dependent_columns_flatten(
         for dc in dependent_columns:
             dc_state = subquery_column_states.get(dc)
             if dc_state:
-                if dc_state.change_state == ColumnChangeState.CHANGED_EXP:
-                    return False
-                elif dc_state.change_state == ColumnChangeState.NEW:
-                    # Most of the time this can be flattened. But if a new column uses window function and this column
-                    # is used in a clause, the sql doesn't work in Snowflake.
-                    # For instance `select a, rank() over(order by b) as d from test_table where d = 1` doesn't work.
-                    # But `select a, b as d from test_table where d = 1` works
-                    # We can inspect whether the referenced new column uses window function. Here we are being
-                    # conservative for now to not flatten the SQL.
+                if dc_state.change_state == ColumnChangeState.CHANGED_EXP and clause == "filter":
                     return False
     return True
 
@@ -2264,8 +2260,6 @@ def has_data_generator_exp(expressions: Optional[List["Expression"]]) -> bool:
     if expressions is None:
         return False
     for exp in expressions:
-        if isinstance(exp, WindowExpression):
-            return True
         if isinstance(exp, FunctionExpression) and (
             exp.is_data_generator
             or exp.name.lower() in SEQUENCE_DEPENDENT_DATA_GENERATION
@@ -2275,3 +2269,18 @@ def has_data_generator_exp(expressions: Optional[List["Expression"]]) -> bool:
         if exp is not None and has_data_generator_exp(exp.children):
             return True
     return False
+
+
+def has_window_function_exp(expressions: Optional[List["Expression"]]) -> bool:
+    if expressions is None:
+        return False
+    for exp in expressions:
+        if isinstance(exp, WindowExpression):
+            return True
+        if exp is not None and has_window_function_exp(exp.children):
+            return True
+    return False
+
+
+def has_data_generator_or_window_function_exp(expressions: Optional[List["Expression"]]) -> bool:
+    return has_data_generator_exp(expressions) or has_window_function_exp(expressions)
diff --git a/tests/integ/test_simplifier_suite.py b/tests/integ/test_simplifier_suite.py
@@ -9,7 +9,7 @@
 
 import pytest
 
-from snowflake.snowpark import Row
+from snowflake.snowpark import Row, Window
 from snowflake.snowpark._internal.analyzer.select_statement import (
     SET_EXCEPT,
     SET_INTERSECT,
@@ -30,6 +30,7 @@
     sum as sum_,
     table_function,
     udtf,
+    rank,
 )
 from tests.utils import TestData, Utils
 
@@ -754,21 +755,34 @@ def test_order_by(setup_reduce_cast, session, simplifier_table):
         f'SELECT "A", "B" FROM {simplifier_table} ORDER BY "A" ASC NULLS FIRST, "B" ASC NULLS FIRST'
     )
 
-    # no flatten because c is a new column
+    # flatten if a new column is used in the order by clause
     df3 = df.select("a", "b", (col("a") - col("b")).as_("c")).sort("a", "b", "c")
     assert Utils.normalize_sql(df3.queries["queries"][-1]) == Utils.normalize_sql(
-        f'SELECT * FROM ( SELECT "A", "B", ("A" - "B") AS "C" FROM {simplifier_table} ) ORDER BY "A" ASC NULLS FIRST, "B" ASC NULLS FIRST, "C" ASC NULLS FIRST'
+        f'SELECT "A", "B", ("A" - "B") AS "C" FROM {simplifier_table} ORDER BY "A" ASC NULLS FIRST, "B" ASC NULLS FIRST, "C" ASC NULLS FIRST'
     )
 
-    # no flatten because a and be are changed
+    # still flatten even though a and b are changed, because they are only used in the order by clause
     df4 = df.select((col("a") + 1).as_("a"), ((col("b") + 1).as_("b"))).sort("a", "b")
     assert Utils.normalize_sql(df4.queries["queries"][-1]) == Utils.normalize_sql(
-        f'SELECT * FROM ( SELECT ("A" + 1{integer_literal_postfix}) AS "A", ("B" + 1{integer_literal_postfix}) AS "B" FROM {simplifier_table} ) ORDER BY "A" ASC NULLS FIRST, "B" ASC NULLS FIRST'
+        f'SELECT ("A" + 1{integer_literal_postfix}) AS "A", ("B" + 1{integer_literal_postfix}) AS "B" FROM {simplifier_table} ORDER BY "A" ASC NULLS FIRST, "B" ASC NULLS FIRST'
     )
 
-    # subquery has sql text so unable to figure out same-level dependency, so assuming d depends on c. No flatten.
-    df5 = df.select("a", "b", lit(3).as_("c"), sql_expr("1 + 1 as d")).sort("a", "b")
+    # still flatten if a window function is used in the projection
+    df5 = df.select("a", "b", rank().over(Window.order_by("b")).alias("c")).sort("a", "b")
     assert Utils.normalize_sql(df5.queries["queries"][-1]) == Utils.normalize_sql(
+        f'SELECT "A", "B", rank() OVER (ORDER BY "B" ASC NULLS FIRST) AS "C" FROM {simplifier_table} ORDER BY "A" ASC NULLS FIRST, "B" ASC NULLS FIRST'
+    )
+
+
+    # No flatten if a data generator is used in the projection
+    df6 = df.select("a", "b", seq1().alias("c")).sort("a", "b")
+    assert Utils.normalize_sql(df6.queries["queries"][-1]) == Utils.normalize_sql(
+        f'SELECT * FROM ( SELECT "A", "B", seq1(0) AS "C" FROM {simplifier_table}) ORDER BY "A" ASC NULLS FIRST, "B" ASC NULLS FIRST'
+    )
+
+    # subquery has sql text so unable to figure out if a data generator is used in the projection. No flatten.
+    df7 = df.select("a", "b", lit(3).as_("c"), sql_expr("1 + 1 as d")).sort("a", "b")
+    assert Utils.normalize_sql(df7.queries["queries"][-1]) == Utils.normalize_sql(
         f'SELECT * FROM ( SELECT "A", "B", 3 :: INT AS "C", 1 + 1 as d FROM ( SELECT * FROM {simplifier_table} ) ) ORDER BY "A" ASC NULLS FIRST, "B" ASC NULLS FIRST'
     )
 
@@ -790,33 +804,57 @@ def test_filter(setup_reduce_cast, session, simplifier_table):
     assert Utils.normalize_sql(df2.queries["queries"][-1]) == Utils.normalize_sql(
         f'SELECT "A", "B" FROM {simplifier_table} WHERE (("A" > 1{integer_literal_postfix}) AND ("B" > 2{integer_literal_postfix}))'
     )
-
-    # no flatten because c is a new column
+    
+    # flatten if a regular new column is in the projection but is not used in the filter clause
     df3 = df.select("a", "b", (col("a") - col("b")).as_("c")).filter(
-        (col("a") > 1) & (col("b") > 2) & (col("c") < 1)
+        (col("a") > 1) & (col("b") > 2)
     )
     assert Utils.normalize_sql(df3.queries["queries"][-1]) == Utils.normalize_sql(
-        f'SELECT * FROM ( SELECT "A", "B", ("A" - "B") AS "C" FROM {simplifier_table} ) WHERE ((("A" > 1{integer_literal_postfix}) AND ("B" > 2{integer_literal_postfix})) AND ("C" < 1{integer_literal_postfix}))'
+        f'SELECT "A", "B", ("A" - "B") AS "C" FROM {simplifier_table} WHERE (("A" > 1{integer_literal_postfix}) AND ("B" > 2{integer_literal_postfix}))'
+    )
+
+    # flatten if a regular new column is used in the filter clause
+    df4 = df.select("a", "b", (col("a") - col("b")).as_("c")).filter(
+        (col("a") > 1) & (col("b") > 2) & (col("c") < 1)
+    )
+    assert Utils.normalize_sql(df4.queries["queries"][-1]) == Utils.normalize_sql(
+        f'SELECT "A", "B", ("A" - "B") AS "C" FROM {simplifier_table} WHERE ((("A" > 1{integer_literal_postfix}) AND ("B" > 2{integer_literal_postfix})) AND ("C" < 1{integer_literal_postfix}))'
+    )
+
+    # no flatten if a window function is used in the projection
+    df5 = df.select("a", "b", rank().over(Window.order_by("b")).alias("c")).filter(
+        (col("a") > 1) & (col("b") > 2) & (col("c") < 1)
+    )
+    assert Utils.normalize_sql(df5.queries["queries"][-1]) == Utils.normalize_sql(
+        f'SELECT * FROM ( SELECT "A", "B", rank() OVER (ORDER BY "B" ASC NULLS FIRST) AS "C" FROM {simplifier_table} ) WHERE ((("A" > 1{integer_literal_postfix}) AND ("B" > 2{integer_literal_postfix})) AND ("C" < 1{integer_literal_postfix}))'
+    )
+
+    # no flatten if a data generator is used in the projection
+    df6 = df.select("a", "b", seq1().alias("c")).filter(
+        (col("a") > 1) & (col("b") > 2) & (col("c") < 1)
+    )
+    assert Utils.normalize_sql(df6.queries["queries"][-1]) == Utils.normalize_sql(
+        f'SELECT * FROM ( SELECT "A", "B", seq1(0) AS "C" FROM {simplifier_table} ) WHERE ((("A" > 1{integer_literal_postfix}) AND ("B" > 2{integer_literal_postfix})) AND ("C" < 1{integer_literal_postfix}))'
     )
 
     # no flatten because a and be are changed
-    df4 = df.select((col("a") + 1).as_("a"), (col("b") + 1).as_("b")).filter(
+    df7 = df.select((col("a") + 1).as_("a"), (col("b") + 1).as_("b")).filter(
         (col("a") > 1) & (col("b") > 2)
     )
-    assert Utils.normalize_sql(df4.queries["queries"][-1]) == Utils.normalize_sql(
+    assert Utils.normalize_sql(df7.queries["queries"][-1]) == Utils.normalize_sql(
         f'SELECT * FROM ( SELECT ("A" + 1{integer_literal_postfix}) AS "A", ("B" + 1{integer_literal_postfix}) AS "B" FROM {simplifier_table} ) WHERE (("A" > 1{integer_literal_postfix}) AND ("B" > 2{integer_literal_postfix}))'
     )
 
-    df5 = df4.select("a")
-    assert Utils.normalize_sql(df5.queries["queries"][-1]) == Utils.normalize_sql(
+    df8 = df7.select("a")
+    assert Utils.normalize_sql(df8.queries["queries"][-1]) == Utils.normalize_sql(
         f'SELECT "A" FROM ( SELECT ("A" + 1{integer_literal_postfix}) AS "A", ("B" + 1{integer_literal_postfix}) AS "B" FROM {simplifier_table} ) WHERE (("A" > 1{integer_literal_postfix}) AND ("B" > 2{integer_literal_postfix}))'
     )
 
     # subquery has sql text so unable to figure out same-level dependency, so assuming d depends on c. No flatten.
-    df6 = df.select("a", "b", lit(3).as_("c"), sql_expr("1 + 1 as d")).filter(
+    df9 = df.select("a", "b", lit(3).as_("c"), sql_expr("1 + 1 as d")).filter(
         col("a") > 1
     )
-    assert Utils.normalize_sql(df6.queries["queries"][-1]) == Utils.normalize_sql(
+    assert Utils.normalize_sql(df9.queries["queries"][-1]) == Utils.normalize_sql(
         f'SELECT * FROM ( SELECT "A", "B", 3 :: INT AS "C", 1 + 1 as d FROM ( SELECT * FROM {simplifier_table} ) ) WHERE ("A" > 1{integer_literal_postfix})'
     )