-
Notifications
You must be signed in to change notification settings - Fork 144
Expand file tree
/
Copy pathquery_generator.py
More file actions
240 lines (212 loc) · 9.83 KB
/
query_generator.py
File metadata and controls
240 lines (212 loc) · 9.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
#
# Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
#
import copy
from typing import DefaultDict, Dict, Iterable, List, NamedTuple, Optional
from snowflake.snowpark._internal.analyzer.analyzer import Analyzer
from snowflake.snowpark._internal.analyzer.expression import Attribute
from snowflake.snowpark._internal.analyzer.select_statement import (
SelectSnowflakePlan,
Selectable,
)
from snowflake.snowpark._internal.analyzer.snowflake_plan import (
PlanQueryType,
Query,
SnowflakePlan,
SnowflakePlanBuilder,
)
from snowflake.snowpark._internal.analyzer.snowflake_plan_node import (
CopyIntoLocationNode,
LogicalPlan,
SnowflakeCreateTable,
TableCreationSource,
WithQueryBlock,
)
from snowflake.snowpark._internal.analyzer.table_merge_expression import (
TableDelete,
TableMerge,
TableUpdate,
)
from snowflake.snowpark._internal.analyzer.unary_plan_node import CreateViewCommand
from snowflake.snowpark.session import Session
class SnowflakeCreateTablePlanInfo(NamedTuple):
    """
    Cached information that can be used to resolve the plan for a
    SnowflakeCreateTable node.
    """

    # Name of the table the cached information belongs to; QueryGenerator
    # compares it against the table name of the node being resolved.
    table_name: Iterable[str]
    # Attributes of the child plan, forwarded as ``child_attributes`` when the
    # create-table plan is rebuilt.
    child_attributes: List[Attribute]
class QueryGenerator(Analyzer):
    """
    Query generation class that is used to re-build the sql query for given
    logical plans during the compilation stage.

    Note that this query generator only rebuilds the sql query, not the schema
    queries.
    """

    def __init__(
        self,
        session: Session,
        snowflake_create_table_plan_info: Optional[SnowflakeCreateTablePlanInfo] = None,
    ) -> None:
        """
        Parameters
        ----------
        session
            Session forwarded to the parent Analyzer and to the plan builder.
        snowflake_create_table_plan_info
            Optional cached table name and child attributes that are used when
            a SnowflakeCreateTable node is resolved.
        """
        super().__init__(session)
        # overwrite the plan_builder initiated in the super to skip the building
        # of schema query
        self.plan_builder = SnowflakePlanBuilder(self.session, skip_schema_query=True)
        # cached information that can be used to resolve the SnowflakeCreateTable node
        self._snowflake_create_table_plan_info: Optional[
            SnowflakeCreateTablePlanInfo
        ] = snowflake_create_table_plan_info
        # Records the definition of all the with query blocks encountered during
        # the code generation. This information will be used to generate the
        # final query of a SnowflakePlan with the correct CTE definition.
        # NOTE: the dict used here is an ordered dict, all with query block
        # definitions are recorded in the order in which the with query blocks
        # are visited. The order is important to make sure the dependency
        # between the CTE definitions is satisfied.
        self.resolved_with_query_block: Dict[str, Query] = {}

    def to_selectable(self, plan: LogicalPlan) -> Selectable:
        """Given a LogicalPlan, convert it to a Selectable."""
        if isinstance(plan, Selectable):
            return plan

        snowflake_plan = self.resolve(plan)
        selectable = SelectSnowflakePlan(snowflake_plan, analyzer=self)
        # carry the replacement flag of the resolved plan over to its wrapper
        selectable._is_valid_for_replacement = snowflake_plan._is_valid_for_replacement
        return selectable

    def generate_queries(
        self, logical_plans: List[LogicalPlan]
    ) -> Dict[PlanQueryType, List[Query]]:
        """
        Generate final queries for the given set of logical plans.

        Returns
        -------
        A dict with two entries: ``PlanQueryType.QUERIES`` and
        ``PlanQueryType.POST_ACTIONS``. Each maps to the list of queries
        collected across all given plans, with duplicates removed and the
        first-seen order preserved.
        """
        # imported locally, as in every other use of this helper in the class
        from snowflake.snowpark._internal.compiler.utils import (
            get_snowflake_plan_queries,
        )

        # generate queries for each logical plan; every plan is resolved before
        # any final query is assembled, so self.resolved_with_query_block holds
        # the with query blocks recorded while resolving all of the plans
        snowflake_plans = [self.resolve(logical_plan) for logical_plan in logical_plans]

        # merge all results into final set of queries
        queries: List[Query] = []
        post_actions: List[Query] = []
        for snowflake_plan in snowflake_plans:
            plan_queries = get_snowflake_plan_queries(
                snowflake_plan, self.resolved_with_query_block
            )
            # we deduplicate the queries and post actions generated across the
            # logical plans because it is possible for large query breakdown to
            # partition original plan into multiple plans that may contain the
            # same nodes which generate the same queries and post actions.
            for query in plan_queries[PlanQueryType.QUERIES]:
                if query not in queries:
                    queries.append(query)
            for action in plan_queries[PlanQueryType.POST_ACTIONS]:
                if action not in post_actions:
                    post_actions.append(action)

        return {
            PlanQueryType.QUERIES: queries,
            PlanQueryType.POST_ACTIONS: post_actions,
        }

    def _copy_child_with_final_queries(
        self, resolved_child: SnowflakePlan
    ) -> SnowflakePlan:
        """
        Return a shallow copy of ``resolved_child`` whose ``queries`` are the
        final queries computed by ``get_snowflake_plan_queries`` with the with
        query blocks recorded so far. The plan passed in is left untouched.
        """
        from snowflake.snowpark._internal.compiler.utils import (
            get_snowflake_plan_queries,
        )

        copied_resolved_child = copy.copy(resolved_child)
        final_queries = get_snowflake_plan_queries(
            copied_resolved_child, self.resolved_with_query_block
        )
        copied_resolved_child.queries = final_queries[PlanQueryType.QUERIES]
        return copied_resolved_child

    def do_resolve_with_resolved_children(
        self,
        logical_plan: LogicalPlan,
        resolved_children: Dict[LogicalPlan, SnowflakePlan],
        df_aliased_col_name_to_real_col_name: DefaultDict[str, Dict[str, str]],
    ) -> SnowflakePlan:
        """
        Resolve ``logical_plan`` given its already resolved children.

        SnowflakeCreateTable, the view/update/delete/merge/copy-into nodes,
        Selectable and WithQueryBlock are handled here; every other node is
        delegated to the parent Analyzer. The returned plan always has
        ``_is_valid_for_replacement`` set to True.

        Note that for CreateViewCommand, TableUpdate, TableDelete, TableMerge
        and CopyIntoLocationNode the entry of the first child in
        ``resolved_children`` is replaced in place by a copy that carries the
        final queries.

        Raises
        ------
        AssertionError
            For a SnowflakeCreateTable whose creation source is not
            LARGE_QUERY_BREAKDOWN, when no create table plan info is cached or
            the cached table name differs from the table name of the node.
        """
        if isinstance(logical_plan, SnowflakeCreateTable):
            # overwrite the SnowflakeCreateTable resolving, because the child
            # attribute will be pulled directly from the cache
            resolved_child = resolved_children[logical_plan.children[0]]

            # when creating a table during query compilation stage, if the
            # table being created is the same as the one that is cached, we
            # pull the child attributes directly from the cache. Otherwise, we
            # use the child attributes as None. This will be for the case when
            # table creation source is temp table from large query breakdown.
            child_attributes = None
            if (
                logical_plan.creation_source
                != TableCreationSource.LARGE_QUERY_BREAKDOWN
            ):
                assert self._snowflake_create_table_plan_info is not None
                assert (
                    self._snowflake_create_table_plan_info.table_name
                    == logical_plan.table_name
                )
                child_attributes = (
                    self._snowflake_create_table_plan_info.child_attributes
                )

            # update the resolved child; done before the arguments of
            # save_as_table are evaluated, as in the original statement order
            copied_resolved_child = self._copy_child_with_final_queries(
                resolved_child
            )

            resolved_plan = self.plan_builder.save_as_table(
                table_name=logical_plan.table_name,
                column_names=logical_plan.column_names,
                mode=logical_plan.mode,
                table_type=logical_plan.table_type,
                clustering_keys=[
                    self.analyze(x, df_aliased_col_name_to_real_col_name)
                    for x in logical_plan.clustering_exprs
                ],
                comment=logical_plan.comment,
                enable_schema_evolution=logical_plan.enable_schema_evolution,
                data_retention_time=logical_plan.data_retention_time,
                max_data_extension_time=logical_plan.max_data_extension_time,
                change_tracking=logical_plan.change_tracking,
                copy_grants=logical_plan.copy_grants,
                child=copied_resolved_child,
                source_plan=logical_plan,
                use_scoped_temp_objects=self.session._use_scoped_temp_objects,
                creation_source=logical_plan.creation_source,
                child_attributes=child_attributes,
                iceberg_config=logical_plan.iceberg_config,
                table_exists=logical_plan.table_exists,
            )

        elif isinstance(
            logical_plan,
            (
                CreateViewCommand,
                TableUpdate,
                TableDelete,
                TableMerge,
                CopyIntoLocationNode,
            ),
        ):
            # for CreateViewCommand, TableUpdate, TableDelete, TableMerge and
            # CopyIntoLocationNode, the with definition must be generated before
            # create, update, delete, merge and copy into query.
            resolved_child = resolved_children[logical_plan.children[0]]
            resolved_children[
                logical_plan.children[0]
            ] = self._copy_child_with_final_queries(resolved_child)
            resolved_plan = super().do_resolve_with_resolved_children(
                logical_plan, resolved_children, df_aliased_col_name_to_real_col_name
            )

        elif isinstance(logical_plan, Selectable):
            # overwrite the Selectable resolving to make sure we are not
            # triggering any schema query build
            resolved_plan = logical_plan.get_snowflake_plan(skip_schema_query=True)

        elif isinstance(logical_plan, WithQueryBlock):
            resolved_child = resolved_children[logical_plan.children[0]]
            # record the CTE definition of the current block or update the query
            # when the child is re-resolved during optimization stage.
            self.resolved_with_query_block[logical_plan.name] = resolved_child.queries[
                -1
            ]

            resolved_plan = self.plan_builder.with_query_block(
                logical_plan,
                resolved_child,
                logical_plan,
            )

        else:
            resolved_plan = super().do_resolve_with_resolved_children(
                logical_plan, resolved_children, df_aliased_col_name_to_real_col_name
            )

        # applied to the result of every branch above
        resolved_plan._is_valid_for_replacement = True

        return resolved_plan