fix tests

sfc-gh-aling · sfc-gh-aling · commit a78670a974ee · 2025-10-31T12:12:59.000-07:00
diff --git a/src/snowflake/snowpark/_internal/analyzer/select_statement.py b/src/snowflake/snowpark/_internal/analyzer/select_statement.py
@@ -1480,6 +1480,7 @@ def select(self, cols: List[Expression]) -> "SelectStatement":
         return new
 
     def filter(self, col: Expression) -> "SelectStatement":
+        self._session._retrieve_aggregation_function_list()
         can_be_flattened = (
             (not self.flatten_disabled)
             and can_clause_dependent_columns_flatten(
@@ -1527,6 +1528,9 @@ def sort(self, cols: List[Expression]) -> "SelectStatement":
                 derive_dependent_columns(*cols), self.column_states, "sort"
             )
             and not has_data_generator_exp(self.projection)
+            # Unlike filter, we do not check for aggregation functions here:
+            # when an aggregation function is in the projection, ORDER BY is
+            # evaluated after the aggregation, so row info is not part of the calculation.
         )
         if can_be_flattened:
             new = copy(self)
diff --git a/src/snowflake/snowpark/context.py b/src/snowflake/snowpark/context.py
@@ -31,7 +31,10 @@
 
 # This is an internal-only global flag, used to determine whether the api code which will be executed is compatible with snowflake.snowpark_connect
 _is_snowpark_connect_compatible_mode = False
-_aggregation_function_set = set()
+_aggregation_function_set = (
+    set()
+)  # lower cased names of aggregation functions, used in sql simplification
+_aggregation_function_set_lock = threading.RLock()
 
 # Following are internal-only global flags, used to enable development features.
 _enable_dataframe_trace_on_error = False
diff --git a/src/snowflake/snowpark/session.py b/src/snowflake/snowpark/session.py
@@ -521,20 +521,6 @@ def create(self) -> "Session":
                 _add_session(session)
             else:
                 session = self._create_internal(self._options.get("connection"))
-                if context._is_snowpark_connect_compatible_mode:
-                    for sql in [
-                        """select function_name from information_schema.functions where is_aggregate = 'YES'""",
-                        """show functions ->> select "name" from $1 where "is_aggregate" = 'Y'""",
-                    ]:
-                        try:
-                            context._aggregation_function_set.update(
-                                {r[0] for r in session.sql(sql).collect()}
-                            )
-                        except BaseException as e:
-                            _logger.debug(
-                                "Unable to get aggregation functions from the database: %s",
-                                e,
-                            )
 
             if self._app_name:
                 if self._format_json:
@@ -4939,6 +4925,31 @@ def _execute_sproc_internal(
             # Note the collect is implicit within the stored procedure call, so should not emit_ast here.
             return df.collect(statement_params=statement_params, _emit_ast=False)[0][0]
 
+    def _retrieve_aggregation_function_list(self) -> None:
+        """Cache lower-cased aggregation function names in ``context`` for the sql simplifier."""
+        if (
+            not context._is_snowpark_connect_compatible_mode
+            or context._aggregation_function_set
+        ):
+            return
+
+        retrieved_set = set()
+        for sql in [
+            """select function_name from information_schema.functions where is_aggregate = 'YES'""",
+            """show functions ->> select "name" from $1 where "is_aggregate" = 'Y'""",
+        ]:
+            try:
+                retrieved_set.update({r[0].lower() for r in self.sql(sql).collect()})
+            except Exception as e:  # let KeyboardInterrupt/SystemExit through
+                _logger.debug(
+                    "Unable to get aggregation functions from the database: %s",
+                    e,
+                )
+
+        # NOTE(review): an empty result leaves the set empty, so the next call re-queries.
+        with context._aggregation_function_set_lock:
+            context._aggregation_function_set.update(retrieved_set)
+
     def directory(self, stage_name: str, _emit_ast: bool = True) -> DataFrame:
         """
         Returns a DataFrame representing the results of a directory table query on the specified stage.
diff --git a/tests/integ/test_query_line_intervals.py b/tests/integ/test_query_line_intervals.py
@@ -57,8 +57,9 @@ def generate_test_data(session, sql_simplifier_enabled):
     }
 
 
+@pytest.mark.parametrize("snowpark_connect_compatible_mode", [True, False])
 @pytest.mark.parametrize(
-    "op,sql_simplifier,line_to_expected_sql",
+    "op,sql_simplifier,line_to_expected_sql,snowpark_connect_compatible_mode_sql",
     [
         (
             lambda data: data["df1"].union(data["df2"]),
@@ -68,10 +69,14 @@ def generate_test_data(session, sql_simplifier_enabled):
                 6: 'SELECT $1 AS "_1", $2 AS "_2", $3 AS "_3" FROM VALUES (1 :: INT, \'A\' :: STRING, 100 :: INT), (2 :: INT, \'B\' :: STRING, 200 :: INT)',
                 10: 'SELECT "_1" AS "ID", "_2" AS "NAME", "_3" AS "VALUE" FROM ( SELECT $1 AS "_1", $2 AS "_2", $3 AS "_3" FROM VALUES (3 :: INT, \'C\' :: STRING, 300 :: INT), (4 :: INT, \'D\' :: STRING, 400 :: INT) )',
             },
+            None,
         ),
         (
             lambda data: data["df1"].filter(data["df1"].value > 150),
             True,
+            {
+                8: 'SELECT $1 AS "_1", $2 AS "_2", $3 AS "_3" FROM VALUES (1 :: INT, \'A\' :: STRING, 100 :: INT), (2 :: INT, \'B\' :: STRING, 200 :: INT)'
+            },
             {
                 8: """SELECT "_1" AS "ID", "_2" AS "NAME", "_3" AS "VALUE" FROM (SELECT $1 AS "_1", $2 AS "_2", $3 AS "_3" FROM VALUES (1 :: INT, 'A' :: STRING, 100 :: INT), (2 :: INT, 'B' :: STRING, 200 :: INT)) WHERE ("VALUE" > 150)""",
             },
@@ -83,6 +88,7 @@ def generate_test_data(session, sql_simplifier_enabled):
                 1: 'SELECT "_1" AS "ID", "_2" AS "NAME" FROM ( SELECT $1 AS "_1", $2 AS "_2", $3 AS "_3" FROM VALUES (1 :: INT, \'A\' :: STRING, 100 :: INT), (2 :: INT, \'B\' :: STRING, 200 :: INT) )',
                 4: 'SELECT $1 AS "_1", $2 AS "_2", $3 AS "_3" FROM  VALUES (1 :: INT, \'A\' :: STRING, 100 :: INT), (2 :: INT, \'B\' :: STRING, 200 :: INT)',
             },
+            None,
         ),
         (
             lambda data: data["df1"].pivot(F.col("name")).sum(F.col("value")),
@@ -92,12 +98,26 @@ def generate_test_data(session, sql_simplifier_enabled):
                 6: 'SELECT $1 AS "_1", $2 AS "_2", $3 AS "_3" FROM VALUES (1 :: INT, \'A\' :: STRING, 100 :: INT), (2 :: INT, \'B\' :: STRING, 200 :: INT)',
                 9: 'SELECT * FROM ( SELECT "_1" AS "ID", "_2" AS "NAME", "_3" AS "VALUE" FROM ( SELECT $1 AS "_1", $2 AS "_2", $3 AS "_3" FROM VALUES (1 :: INT, \'A\' :: STRING, 100 :: INT), (2 :: INT, \'B\' :: STRING, 200 :: INT) ) ) PIVOT ( sum("VALUE") FOR "NAME" IN ( ANY ) )',
             },
+            None,
         ),
     ],
 )
 def test_get_plan_from_line_numbers_sql_content(
-    session, op, sql_simplifier, line_to_expected_sql
+    session,
+    op,
+    sql_simplifier,
+    line_to_expected_sql,
+    snowpark_connect_compatible_mode_sql,
+    snowpark_connect_compatible_mode,
+    monkeypatch,
 ):
+    if snowpark_connect_compatible_mode:
+        import snowflake.snowpark.context as context
+
+        monkeypatch.setattr(context, "_is_snowpark_connect_compatible_mode", True)
+        line_to_expected_sql = (
+            snowpark_connect_compatible_mode_sql or line_to_expected_sql
+        )
     session.sql_simplifier_enabled = sql_simplifier
     df = op(generate_test_data(session, sql_simplifier))
 
diff --git a/tests/integ/test_simplifier_suite.py b/tests/integ/test_simplifier_suite.py