SNOW-2432262: Remove an intermediate column in sample(). (#3898)

sfc-gh-mvashishtha · web-flow · commit 04218cc2324a · 2025-10-15T13:54:21.000-07:00
The generator expression for sample(replace=True) currently has an extra, unused column representing row position. The column requires a window function, so removing it may improve performance, in addition to making the code clearer.

Signed-off-by: sfc-gh-mvashishtha &lt;mahesh.vashishtha@snowflake.com&gt;
diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py
@@ -15834,11 +15834,12 @@ def sample(
         assert n is not None or frac is not None
         frame = self._modin_frame
         if replace:
-            snowflake_quoted_identifiers = generate_snowflake_quoted_identifiers_helper(
-                pandas_labels=[
-                    ROW_POSITION_COLUMN_LABEL,
-                    SAMPLED_ROW_POSITION_COLUMN_LABEL,
-                ]
+            sampled_row_position_identifier = (
+                generate_snowflake_quoted_identifiers_helper(
+                    pandas_labels=[
+                        SAMPLED_ROW_POSITION_COLUMN_LABEL,
+                    ]
+                )[0]
             )
 
             pre_sampling_rowcount = self.get_axis_len(axis=0)
@@ -15848,30 +15849,25 @@ def sample(
                 assert frac is not None
                 post_sampling_rowcount = round(frac * pre_sampling_rowcount)
 
-            row_position_col = (
-                row_number()
-                .over(Window.order_by(pandas_lit(1)))
-                .as_(snowflake_quoted_identifiers[0])
-            )
-
             sampled_row_position_col = uniform(
                 0, pre_sampling_rowcount - 1, random()
-            ).as_(snowflake_quoted_identifiers[1])
+            ).as_(sampled_row_position_identifier)
 
             sampled_row_positions_snowpark_frame = pd.session.generator(
-                row_position_col,
                 sampled_row_position_col,
                 rowcount=post_sampling_rowcount,
             )
 
             sampled_row_positions_odf = OrderedDataFrame(
                 dataframe_ref=DataFrameReference(sampled_row_positions_snowpark_frame),
-                projected_column_snowflake_quoted_identifiers=snowflake_quoted_identifiers,
+                projected_column_snowflake_quoted_identifiers=[
+                    sampled_row_position_identifier
+                ],
             )
             sampled_odf = cache_result(
                 sampled_row_positions_odf.join(
                     right=self._modin_frame.ordered_dataframe,
-                    left_on_cols=[snowflake_quoted_identifiers[1]],
+                    left_on_cols=[sampled_row_position_identifier],
                     right_on_cols=[
                         self._modin_frame.ordered_dataframe.row_position_snowflake_quoted_identifier
                     ],