Skip to content

Commit fb6c3fe

Browse files
SNOW-2230971: Re-introduce the support of repr, joins, loc, reset_index, and binary ops in Faster pandas (Two-QCs) (#3687)
1 parent 287a5c6 commit fb6c3fe

File tree

16 files changed

+811
-143
lines changed

16 files changed

+811
-143
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,9 @@
3737
- Eliminate duplicate parameter check queries for casing status when retrieving the session.
3838
- Retrieve dataframe row counts through object metadata to avoid a COUNT(\*) query (performance)
3939
- Added support for applying Snowflake Cortex function `Complete`.
40+
- Introduce faster pandas: Improved performance by deferring row position computation.
41+
- The following operations are currently supported and can benefit from the optimization: `read_snowflake`, `repr`, `loc`, `reset_index`, `merge`, and binary operations.
42+
- If a lazy object (e.g., DataFrame or Series) depends on a mix of supported and unsupported operations, the optimization will not be used.
4043

4144
#### Dependency Updates
4245

src/snowflake/snowpark/modin/plugin/_internal/concat_utils.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -373,7 +373,9 @@ def _select_columns(
373373
)
374374

375375

376-
def add_global_ordering_columns(frame: InternalFrame, position: int) -> InternalFrame:
376+
def add_global_ordering_columns(
377+
frame: InternalFrame, position: int, dummy_row_pos_mode: bool = False
378+
) -> InternalFrame:
377379
"""
378380
To create global ordering for concat (axis=0) operation we first ensure a
379381
row position column for local ordering within the frame. Then add another
@@ -388,7 +390,7 @@ def add_global_ordering_columns(frame: InternalFrame, position: int) -> Internal
388390
A new frame with updated ordering columns.
389391
390392
"""
391-
frame = frame.ensure_row_position_column()
393+
frame = frame.ensure_row_position_column(dummy_row_pos_mode)
392394
ordered_dataframe = frame.ordered_dataframe.sort(
393395
[OrderingColumn(frame.row_position_snowflake_quoted_identifier)]
394396
)

src/snowflake/snowpark/modin/plugin/_internal/cut_utils.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,7 @@ def compute_bin_indices(
152152
cuts_frame: InternalFrame,
153153
n_cuts: int,
154154
right: bool = True,
155+
dummy_row_pos_mode: bool = False,
155156
) -> InternalFrame:
156157
"""
157158
Given a frame of cuts, i.e. borders of bins (strictly increasing) compute for the data in values_frame the index of the bin they fall into.
@@ -183,7 +184,7 @@ def compute_bin_indices(
183184
# within OrderedDataFrame yet, we use the Snowpark layer directly. This should have no negative
184185
# consequences when it comes to building lazy graphs, as both cut and qcut are materializing operations.
185186

186-
cuts_frame = cuts_frame.ensure_row_position_column()
187+
cuts_frame = cuts_frame.ensure_row_position_column(dummy_row_pos_mode)
187188
# perform asof join to find the closest to the cut frame data.
188189
asof_result = join(
189190
values_frame,

src/snowflake/snowpark/modin/plugin/_internal/frame.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -888,15 +888,19 @@ def to_pandas(
888888
###########################################################################
889889
# START: Internal Frame mutation APIs.
890890
# APIs that create a new InternalFrame instance should only be added below
891-
def ensure_row_position_column(self) -> "InternalFrame":
891+
def ensure_row_position_column(
892+
self, dummy_row_pos_mode: bool = False
893+
) -> "InternalFrame":
892894
"""
893895
Ensure row position column is computed for given internal frame.
894896
895897
Returns:
896898
A new InternalFrame instance with computed virtual index.
897899
"""
898900
return InternalFrame.create(
899-
ordered_dataframe=self.ordered_dataframe.ensure_row_position_column(),
901+
ordered_dataframe=self.ordered_dataframe.ensure_row_position_column(
902+
dummy_row_pos_mode
903+
),
900904
data_column_pandas_labels=self.data_column_pandas_labels,
901905
data_column_snowflake_quoted_identifiers=self.data_column_snowflake_quoted_identifiers,
902906
data_column_pandas_index_names=self.data_column_pandas_index_names,
@@ -1350,7 +1354,9 @@ def select_active_columns(self) -> "InternalFrame":
13501354
)
13511355

13521356
def strip_duplicates(
1353-
self: "InternalFrame", quoted_identifiers: list[str]
1357+
self: "InternalFrame",
1358+
quoted_identifiers: list[str],
1359+
dummy_row_pos_mode: bool = False,
13541360
) -> "InternalFrame":
13551361
"""
13561362
When assigning frames via index operations, for duplicates only the last entry is used, as entries are repeatedly overwritten.
@@ -1364,7 +1370,7 @@ def strip_duplicates(
13641370
new internal frame with unique index.
13651371
"""
13661372

1367-
frame = self.ensure_row_position_column()
1373+
frame = self.ensure_row_position_column(dummy_row_pos_mode)
13681374

13691375
# To remove the duplicates, first compute via windowing over index columns the value of the last row position.
13701376
# with this join then select only the relevant rows. Note that an EXISTS subquery doesn't work here because
@@ -1400,12 +1406,15 @@ def strip_duplicates(
14001406
left_on_cols=[frame.row_position_snowflake_quoted_identifier],
14011407
right_on_cols=[relevant_last_value_row_positions_quoted_identifier],
14021408
how="inner",
1409+
dummy_row_pos_mode=dummy_row_pos_mode,
14031410
)
14041411

14051412
# Because we reuse row position to select the relevant columns, we need to
14061413
# generate a new row position column here so locational indexing after this operation
14071414
# continues to work correctly.
1408-
new_ordered_dataframe = joined_ordered_dataframe.ensure_row_position_column()
1415+
new_ordered_dataframe = joined_ordered_dataframe.ensure_row_position_column(
1416+
dummy_row_pos_mode
1417+
)
14091418
return InternalFrame.create(
14101419
ordered_dataframe=new_ordered_dataframe,
14111420
data_column_pandas_labels=frame.data_column_pandas_labels,

src/snowflake/snowpark/modin/plugin/_internal/generator_utils.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ def generate_regular_range(
7979

8080
def _create_qc_from_snowpark_dataframe(
8181
sp_df: DataFrame,
82+
dummy_row_pos_mode: bool = False,
8283
) -> "snowflake_query_compiler.SnowflakeQueryCompiler":
8384
"""
8485
Create a Snowflake query compiler from a Snowpark DataFrame, assuming the DataFrame only contains one column.
@@ -89,7 +90,9 @@ def _create_qc_from_snowpark_dataframe(
8990
Returns:
9091
A Snowflake query compiler
9192
"""
92-
odf = OrderedDataFrame(DataFrameReference(sp_df)).ensure_row_position_column()
93+
odf = OrderedDataFrame(DataFrameReference(sp_df)).ensure_row_position_column(
94+
dummy_row_pos_mode
95+
)
9396

9497
from snowflake.snowpark.modin.plugin.compiler.snowflake_query_compiler import (
9598
SnowflakeQueryCompiler,

src/snowflake/snowpark/modin/plugin/_internal/get_dummies_utils.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,7 @@ def get_dummies_helper(
184184
columns: list[Hashable],
185185
prefixes: list[Hashable],
186186
prefix_sep: str,
187+
dummy_row_pos_mode: bool = False,
187188
) -> InternalFrame:
188189
"""
189190
Helper function for get dummies to perform encoding on given columns
@@ -222,9 +223,9 @@ def get_dummies_helper(
222223
)
223224

224225
# append a lit true column as value column for pivot
225-
new_internal_frame = internal_frame.ensure_row_position_column().append_column(
226-
LIT_TRUE_COLUMN_PANDAS_LABEL, pandas_lit(True)
227-
)
226+
new_internal_frame = internal_frame.ensure_row_position_column(
227+
dummy_row_pos_mode
228+
).append_column(LIT_TRUE_COLUMN_PANDAS_LABEL, pandas_lit(True))
228229
# the dummy column is appended as the last data column of the new_internal_frame
229230
row_position_column_snowflake_quoted_identifier = (
230231
new_internal_frame.row_position_snowflake_quoted_identifier

0 commit comments

Comments
 (0)