Skip to content

Commit 04218cc

Browse files
SNOW-2432262: Remove an intermediate column in sample(). (#3898)
The `session.generator()` call used by sample(replace=True) currently produces an extra, unused column representing row position. That column requires a window function, so removing it may improve performance, in addition to making the code clearer.

Signed-off-by: sfc-gh-mvashishtha <[email protected]>
1 parent bf0524d commit 04218cc

File tree

1 file changed

+11
-15
lines changed

1 file changed

+11
-15
lines changed

src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py

Lines changed: 11 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -15834,11 +15834,12 @@ def sample(
1583415834
assert n is not None or frac is not None
1583515835
frame = self._modin_frame
1583615836
if replace:
15837-
snowflake_quoted_identifiers = generate_snowflake_quoted_identifiers_helper(
15838-
pandas_labels=[
15839-
ROW_POSITION_COLUMN_LABEL,
15840-
SAMPLED_ROW_POSITION_COLUMN_LABEL,
15841-
]
15837+
sampled_row_position_identifier = (
15838+
generate_snowflake_quoted_identifiers_helper(
15839+
pandas_labels=[
15840+
SAMPLED_ROW_POSITION_COLUMN_LABEL,
15841+
]
15842+
)[0]
1584215843
)
1584315844

1584415845
pre_sampling_rowcount = self.get_axis_len(axis=0)
@@ -15848,30 +15849,25 @@ def sample(
1584815849
assert frac is not None
1584915850
post_sampling_rowcount = round(frac * pre_sampling_rowcount)
1585015851

15851-
row_position_col = (
15852-
row_number()
15853-
.over(Window.order_by(pandas_lit(1)))
15854-
.as_(snowflake_quoted_identifiers[0])
15855-
)
15856-
1585715852
sampled_row_position_col = uniform(
1585815853
0, pre_sampling_rowcount - 1, random()
15859-
).as_(snowflake_quoted_identifiers[1])
15854+
).as_(sampled_row_position_identifier)
1586015855

1586115856
sampled_row_positions_snowpark_frame = pd.session.generator(
15862-
row_position_col,
1586315857
sampled_row_position_col,
1586415858
rowcount=post_sampling_rowcount,
1586515859
)
1586615860

1586715861
sampled_row_positions_odf = OrderedDataFrame(
1586815862
dataframe_ref=DataFrameReference(sampled_row_positions_snowpark_frame),
15869-
projected_column_snowflake_quoted_identifiers=snowflake_quoted_identifiers,
15863+
projected_column_snowflake_quoted_identifiers=[
15864+
sampled_row_position_identifier
15865+
],
1587015866
)
1587115867
sampled_odf = cache_result(
1587215868
sampled_row_positions_odf.join(
1587315869
right=self._modin_frame.ordered_dataframe,
15874-
left_on_cols=[snowflake_quoted_identifiers[1]],
15870+
left_on_cols=[sampled_row_position_identifier],
1587515871
right_on_cols=[
1587615872
self._modin_frame.ordered_dataframe.row_position_snowflake_quoted_identifier
1587715873
],

0 commit comments

Comments
 (0)