
Commit 2d88271

add test
1 parent 4d0874b commit 2d88271

File tree: 3 files changed (+54 -6)


src/snowflake/snowpark/_internal/compiler/cte_utils.py

Lines changed: 3 additions & 1 deletion
@@ -20,6 +20,8 @@
 if TYPE_CHECKING:
     from snowflake.snowpark._internal.compiler.utils import TreeNode  # pragma: no cover
 
+HASH_LENGTH = 10
+
 
 def find_duplicate_subtrees(
     root: "TreeNode", propagate_complexity_hist: bool = False
@@ -272,7 +274,7 @@ def stringify(d):
     string = f"{string}#{stringify(node.df_aliased_col_name_to_real_col_name)}"
 
     try:
-        return hashlib.sha256(string.encode()).hexdigest()[:10]
+        return hashlib.sha256(string.encode()).hexdigest()[:HASH_LENGTH]
     except Exception as ex:
         logging.warning(f"Encode SnowflakePlan ID failed: {ex}")
         return None
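For context, the value being truncated here is a SHA-256 hex digest of the stringified plan, so HASH_LENGTH just names the previously hard-coded prefix length. A minimal standalone sketch of that truncation (encode_plan_string is an illustrative name, not the module's actual function):

import hashlib

HASH_LENGTH = 10

def encode_plan_string(plan_string: str) -> str:
    # Hash the stringified plan and keep the first HASH_LENGTH hex characters,
    # mirroring the truncation this commit replaces the literal 10 with.
    return hashlib.sha256(plan_string.encode()).hexdigest()[:HASH_LENGTH]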

src/snowflake/snowpark/_internal/compiler/repeated_subquery_elimination.py

Lines changed: 12 additions & 5 deletions
@@ -10,7 +10,10 @@
     LogicalPlan,
     WithQueryBlock,
 )
-from snowflake.snowpark._internal.compiler.cte_utils import find_duplicate_subtrees
+from snowflake.snowpark._internal.compiler.cte_utils import (
+    find_duplicate_subtrees,
+    HASH_LENGTH,
+)
 from snowflake.snowpark._internal.compiler.query_generator import QueryGenerator
 from snowflake.snowpark._internal.compiler.utils import (
     TreeNode,
@@ -22,6 +25,7 @@
     TempObjectType,
     random_name_for_temp_object,
 )
+import snowflake.snowpark.context as context
 
 
 class RepeatedSubqueryEliminationResult:
@@ -165,11 +169,14 @@ def _update_parents(
                     node.encoded_node_id_with_query
                 ]
             else:
-                if self._query_generator.session.reduce_describe_query_enabled:
+                if (
+                    self._query_generator.session.reduce_describe_query_enabled
+                    and context._is_snowpark_connect_compatible_mode
+                ):
                     # create a deterministic name using the first 16 chars of encoded_node_id_with_query (SHA256 hash)
-                    # This ensures the same node always gets the same CTE name.
-                    # it helps when DataFrame.queries is called multiple times, they will get the same CTE name.
-                    cte_name = f"{TEMP_OBJECT_NAME_PREFIX}{TempObjectType.CTE.value}_{node.encoded_node_id_with_query[:16].upper()}"
+                    # It helps when DataFrame.queries is called multiple times.
+                    # Consistent CTE names returned, reducing the number of describe queries from cached_analyze_attributes calls.
+                    cte_name = f"{TEMP_OBJECT_NAME_PREFIX}{TempObjectType.CTE.value}_{node.encoded_node_id_with_query[:HASH_LENGTH].upper()}"
                 else:
                     cte_name = random_name_for_temp_object(TempObjectType.CTE)
                 with_block = WithQueryBlock(name=cte_name, child=node)  # type: ignore
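In plain terms, the deterministic branch derives the CTE name from the plan hash instead of a random suffix, and it only runs when both reduce_describe_query_enabled and the Snowpark Connect compatibility flag are set; otherwise the name still comes from random_name_for_temp_object. A rough sketch of the naming, with the constants written out as illustrative values rather than the library's actual definitions:

# Illustrative constants; the real ones live in snowflake.snowpark._internal.utils.
TEMP_OBJECT_NAME_PREFIX = "SNOWPARK_TEMP_"
CTE_VALUE = "CTE"
HASH_LENGTH = 10

def deterministic_cte_name(encoded_node_id_with_query: str) -> str:
    # The same plan hash always yields the same name, so repeated calls to
    # DataFrame.queries emit identical SQL and the schema cache can be reused.
    return (
        f"{TEMP_OBJECT_NAME_PREFIX}{CTE_VALUE}_"
        f"{encoded_node_id_with_query[:HASH_LENGTH].upper()}"
    )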

tests/integ/test_cte.py

Lines changed: 39 additions & 0 deletions
@@ -4,6 +4,7 @@
 
 import re
 import tracemalloc
+from unittest import mock
 
 import pytest
 
@@ -32,6 +33,7 @@
     StringType,
     TimestampType,
 )
+import snowflake.snowpark.context as context
 from tests.integ.scala.test_dataframe_reader_suite import get_reader
 from tests.integ.utils.sql_counter import SqlCounter, sql_count_checker
 from tests.utils import IS_IN_STORED_PROC_LOCALFS, TestFiles, Utils
@@ -1313,3 +1315,40 @@ def test_table_select_cte(session):
         union_count=1,
         join_count=0,
     )
+
+
+@pytest.mark.parametrize(
+    "reduce_describe_enabled,expected_describe_counts",
+    [
+        (True, [1, 0]),  # With caching: first call misses, second call hits cache
+        (False, [1, 1]),  # Without caching: both calls issue describe queries
+    ],
+)
+def test_dataframe_queries_with_cte_reuses_schema_cache(
+    session, reduce_describe_enabled, expected_describe_counts
+):
+    """Test that calling dataframe.queries (not same dataframe but same operation) multiple times with CTE optimization
+    does not issue extra DESCRIBE queries when reduce_describe_query_enabled is True.
+
+    This tests the deterministic CTE naming feature: when CTE optimization is enabled
+    and reduce_describe_query is enabled, repeated calls to df.queries should produce
+    identical SQL (with same CTE names), allowing the schema cache to hit.
+    """
+
+    def create_cte_dataframe():
+        """Create a DataFrame that triggers CTE optimization (same df used twice)."""
+        df = session.create_dataframe([[1, 2], [3, 4]], schema=["a", "b"])
+        return df.union_all(df)
+
+    def access_queries_and_schema(df):
+        """Access both queries and schema properties."""
+        _ = df.queries
+        _ = df.schema
+
+    with mock.patch.object(
+        session, "_reduce_describe_query_enabled", reduce_describe_enabled
+    ), mock.patch.object(context, "_is_snowpark_connect_compatible_mode", True):
+        for expected_describe_count in expected_describe_counts:
+            df_union = create_cte_dataframe()
+            with SqlCounter(query_count=0, describe_count=expected_describe_count):
+                access_queries_and_schema(df_union)
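Outside the test harness, the same behavior can be sanity-checked by comparing the SQL generated for two identical builds. A hedged sketch, assuming a live session with CTE optimization on, reduce_describe_query enabled, and the Snowpark Connect compatibility flag set (the conditions the diff above requires):

def build_union(session):
    # Same construction as the test: a self-union that triggers CTE optimization.
    df = session.create_dataframe([[1, 2], [3, 4]], schema=["a", "b"])
    return df.union_all(df)

first_sql = build_union(session).queries["queries"][-1]
second_sql = build_union(session).queries["queries"][-1]
# With deterministic CTE names the two SQL strings match exactly, which is
# what lets the second schema lookup hit the cache instead of re-describing.
assert first_sql == second_sql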
