
Commit 2d88271

add test
1 parent 4d0874b commit 2d88271

File tree: 3 files changed (+54 -6)


src/snowflake/snowpark/_internal/compiler/cte_utils.py

Lines changed: 3 additions & 1 deletion
@@ -20,6 +20,8 @@
 if TYPE_CHECKING:
     from snowflake.snowpark._internal.compiler.utils import TreeNode  # pragma: no cover
 
+HASH_LENGTH = 10
+
 
 def find_duplicate_subtrees(
     root: "TreeNode", propagate_complexity_hist: bool = False
@@ -272,7 +274,7 @@ def stringify(d):
     string = f"{string}#{stringify(node.df_aliased_col_name_to_real_col_name)}"
 
     try:
-        return hashlib.sha256(string.encode()).hexdigest()[:10]
+        return hashlib.sha256(string.encode()).hexdigest()[:HASH_LENGTH]
     except Exception as ex:
         logging.warning(f"Encode SnowflakePlan ID failed: {ex}")
         return None
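For context, the value being truncated here is a SHA-256 hex digest of the stringified plan, so HASH_LENGTH just names the previously hard-coded prefix length. A minimal standalone sketch of that truncation (encode_plan_string is an illustrative name, not the module's actual function):

import hashlib

HASH_LENGTH = 10

def encode_plan_string(plan_string: str) -> str:
    # Hash the stringified plan and keep the first HASH_LENGTH hex characters,
    # mirroring the truncation this commit replaces the literal 10 with.
    return hashlib.sha256(plan_string.encode()).hexdigest()[:HASH_LENGTH]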

src/snowflake/snowpark/_internal/compiler/repeated_subquery_elimination.py

Lines changed: 12 additions & 5 deletions
@@ -10,7 +10,10 @@
     LogicalPlan,
     WithQueryBlock,
 )
-from snowflake.snowpark._internal.compiler.cte_utils import find_duplicate_subtrees
+from snowflake.snowpark._internal.compiler.cte_utils import (
+    find_duplicate_subtrees,
+    HASH_LENGTH,
+)
 from snowflake.snowpark._internal.compiler.query_generator import QueryGenerator
 from snowflake.snowpark._internal.compiler.utils import (
     TreeNode,
@@ -22,6 +25,7 @@
     TempObjectType,
     random_name_for_temp_object,
 )
+import snowflake.snowpark.context as context
 
 
 class RepeatedSubqueryEliminationResult:
@@ -165,11 +169,14 @@ def _update_parents(
                     node.encoded_node_id_with_query
                 ]
             else:
-                if self._query_generator.session.reduce_describe_query_enabled:
+                if (
+                    self._query_generator.session.reduce_describe_query_enabled
+                    and context._is_snowpark_connect_compatible_mode
+                ):
                     # create a deterministic name using the first 16 chars of encoded_node_id_with_query (SHA256 hash)
-                    # This ensures the same node always gets the same CTE name.
-                    # it helps when DataFrame.queries is called multiple times, they will get the same CTE name.
-                    cte_name = f"{TEMP_OBJECT_NAME_PREFIX}{TempObjectType.CTE.value}_{node.encoded_node_id_with_query[:16].upper()}"
+                    # It helps when DataFrame.queries is called multiple times.
+                    # Consistent CTE names returned, reducing the number of describe queries from cached_analyze_attributes calls.
+                    cte_name = f"{TEMP_OBJECT_NAME_PREFIX}{TempObjectType.CTE.value}_{node.encoded_node_id_with_query[:HASH_LENGTH].upper()}"
                 else:
                     cte_name = random_name_for_temp_object(TempObjectType.CTE)
                 with_block = WithQueryBlock(name=cte_name, child=node)  # type: ignore
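In plain terms, the deterministic branch derives the CTE name from the plan hash instead of a random suffix, and it only runs when both reduce_describe_query_enabled and the Snowpark Connect compatibility flag are set; otherwise the name still comes from random_name_for_temp_object. A rough sketch of the naming, with the constants written out as illustrative values rather than the library's actual definitions:

# Illustrative constants; the real ones live in snowflake.snowpark._internal.utils.
TEMP_OBJECT_NAME_PREFIX = "SNOWPARK_TEMP_"
CTE_VALUE = "CTE"
HASH_LENGTH = 10

def deterministic_cte_name(encoded_node_id_with_query: str) -> str:
    # The same plan hash always yields the same name, so repeated calls to
    # DataFrame.queries emit identical SQL and the schema cache can be reused.
    return (
        f"{TEMP_OBJECT_NAME_PREFIX}{CTE_VALUE}_"
        f"{encoded_node_id_with_query[:HASH_LENGTH].upper()}"
    )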

tests/integ/test_cte.py

Lines changed: 39 additions & 0 deletions
@@ -4,6 +4,7 @@
 
 import re
 import tracemalloc
+from unittest import mock
 
 import pytest
 
@@ -32,6 +33,7 @@
     StringType,
     TimestampType,
 )
+import snowflake.snowpark.context as context
 from tests.integ.scala.test_dataframe_reader_suite import get_reader
 from tests.integ.utils.sql_counter import SqlCounter, sql_count_checker
 from tests.utils import IS_IN_STORED_PROC_LOCALFS, TestFiles, Utils
@@ -1313,3 +1315,40 @@ def test_table_select_cte(session):
         union_count=1,
         join_count=0,
     )
+
+
+@pytest.mark.parametrize(
+    "reduce_describe_enabled,expected_describe_counts",
+    [
+        (True, [1, 0]),  # With caching: first call misses, second call hits cache
+        (False, [1, 1]),  # Without caching: both calls issue describe queries
+    ],
+)
+def test_dataframe_queries_with_cte_reuses_schema_cache(
+    session, reduce_describe_enabled, expected_describe_counts
+):
+    """Test that calling dataframe.queries (not same dataframe but same operation) multiple times with CTE optimization
+    does not issue extra DESCRIBE queries when reduce_describe_query_enabled is True.
+
+    This tests the deterministic CTE naming feature: when CTE optimization is enabled
+    and reduce_describe_query is enabled, repeated calls to df.queries should produce
+    identical SQL (with same CTE names), allowing the schema cache to hit.
+    """
+
+    def create_cte_dataframe():
+        """Create a DataFrame that triggers CTE optimization (same df used twice)."""
+        df = session.create_dataframe([[1, 2], [3, 4]], schema=["a", "b"])
+        return df.union_all(df)
+
+    def access_queries_and_schema(df):
+        """Access both queries and schema properties."""
+        _ = df.queries
+        _ = df.schema
+
+    with mock.patch.object(
+        session, "_reduce_describe_query_enabled", reduce_describe_enabled
+    ), mock.patch.object(context, "_is_snowpark_connect_compatible_mode", True):
+        for expected_describe_count in expected_describe_counts:
+            df_union = create_cte_dataframe()
+            with SqlCounter(query_count=0, describe_count=expected_describe_count):
+                access_queries_and_schema(df_union)
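Outside the test harness, the same behavior can be sanity-checked by comparing the SQL generated for two identical builds. A hedged sketch, assuming a live session with CTE optimization on, reduce_describe_query enabled, and the Snowpark Connect compatibility flag set (the conditions the diff above requires):

def build_union(session):
    # Same construction as the test: a self-union that triggers CTE optimization.
    df = session.create_dataframe([[1, 2], [3, 4]], schema=["a", "b"])
    return df.union_all(df)

first_sql = build_union(session).queries["queries"][-1]
second_sql = build_union(session).queries["queries"][-1]
# With deterministic CTE names the two SQL strings match exactly, which is
# what lets the second schema lookup hit the cache instead of re-describing.
assert first_sql == second_sql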
