Skip to content

Commit 856ad27

Browse files
SNOW-2432713: Ensure that dummy_row_pos_mode passed to OrderedDataFrame.join() matches SnowflakeQueryCompiler._dummy_row_pos_mode (#3901)
1 parent 975da52 commit 856ad27

16 files changed: +215 additions, -73 deletions (lines changed)

src/snowflake/snowpark/modin/plugin/_internal/align_utils.py

Lines changed: 21 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,10 @@
1313

1414

1515
def align_axis_0_left(
16-
frame: InternalFrame, other_frame: InternalFrame, join: str
16+
frame: InternalFrame,
17+
other_frame: InternalFrame,
18+
join: str,
19+
dummy_row_pos_mode: bool,
1720
) -> InternalFrame:
1821
"""
1922
Gets the left align results.
@@ -27,7 +30,9 @@ def align_axis_0_left(
2730
New InternalFrame representing aligned left frame.
2831
"""
2932
if join == "right":
30-
left_result, left_column_mapper = align_on_index(other_frame, frame, how="left")
33+
left_result, left_column_mapper = align_on_index(
34+
other_frame, frame, dummy_row_pos_mode, how="left"
35+
)
3136
left_frame_data_ids = left_column_mapper.map_right_quoted_identifiers(
3237
frame.data_column_snowflake_quoted_identifiers
3338
)
@@ -36,7 +41,9 @@ def align_axis_0_left(
3641
left_frame_data_ids + left_index_ids
3742
)
3843
else:
39-
left_result, left_column_mapper = align_on_index(frame, other_frame, how=join)
44+
left_result, left_column_mapper = align_on_index(
45+
frame, other_frame, dummy_row_pos_mode, how=join
46+
)
4047
left_frame_data_ids = left_column_mapper.map_left_quoted_identifiers(
4148
frame.data_column_snowflake_quoted_identifiers
4249
)
@@ -58,7 +65,10 @@ def align_axis_0_left(
5865

5966

6067
def align_axis_0_right(
61-
frame: InternalFrame, other_frame: InternalFrame, join: str
68+
frame: InternalFrame,
69+
other_frame: InternalFrame,
70+
join: str,
71+
dummy_row_pos_mode: bool,
6272
) -> InternalFrame:
6373
"""
6474
Gets the right align results.
@@ -72,7 +82,9 @@ def align_axis_0_right(
7282
New InternalFrame representing aligned right frame.
7383
"""
7484
if join == "left":
75-
right_result, right_column_mapper = align_on_index(frame, other_frame, how=join)
85+
right_result, right_column_mapper = align_on_index(
86+
frame, other_frame, dummy_row_pos_mode, how=join
87+
)
7688
right_frame_data_ids = right_column_mapper.map_right_quoted_identifiers(
7789
other_frame.data_column_snowflake_quoted_identifiers
7890
)
@@ -82,7 +94,7 @@ def align_axis_0_right(
8294
)
8395
elif join == "right":
8496
right_result, right_column_mapper = align_on_index(
85-
other_frame, frame, how="left"
97+
other_frame, frame, dummy_row_pos_mode, how="left"
8698
)
8799
right_frame_data_ids = right_column_mapper.map_left_quoted_identifiers(
88100
other_frame.data_column_snowflake_quoted_identifiers
@@ -92,7 +104,9 @@ def align_axis_0_right(
92104
right_frame_data_ids + right_index_ids
93105
)
94106
else:
95-
right_result, right_column_mapper = align_on_index(other_frame, frame, how=join)
107+
right_result, right_column_mapper = align_on_index(
108+
other_frame, frame, dummy_row_pos_mode, how=join
109+
)
96110
right_frame_data_ids = right_column_mapper.map_left_quoted_identifiers(
97111
other_frame.data_column_snowflake_quoted_identifiers
98112
)

src/snowflake/snowpark/modin/plugin/_internal/cumulative_utils.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -86,6 +86,7 @@ def get_groupby_cumagg_frame_axis0(
8686
groupby_kwargs: dict[str, Any],
8787
cumagg_func: Callable,
8888
cumagg_func_name: str,
89+
dummy_row_pos_mode: bool,
8990
ascending: bool = True,
9091
) -> InternalFrame:
9192
"""
@@ -124,7 +125,7 @@ def get_groupby_cumagg_frame_axis0(
124125
)
125126

126127
qc, by_list = resample_and_extract_groupby_column_pandas_labels(
127-
query_compiler, by, level
128+
query_compiler, by, level, dummy_row_pos_mode
128129
)
129130

130131
if numeric_only:

src/snowflake/snowpark/modin/plugin/_internal/cut_utils.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -151,8 +151,8 @@ def compute_bin_indices(
151151
values_frame: InternalFrame,
152152
cuts_frame: InternalFrame,
153153
n_cuts: int,
154+
dummy_row_pos_mode: bool,
154155
right: bool = True,
155-
dummy_row_pos_mode: bool = False,
156156
) -> InternalFrame:
157157
"""
158158
Given a frame of cuts, i.e. borders of bins (strictly increasing) compute for the data in values_frame the index of the bin they fall into.
@@ -190,6 +190,7 @@ def compute_bin_indices(
190190
values_frame,
191191
cuts_frame,
192192
how="asof",
193+
dummy_row_pos_mode=dummy_row_pos_mode,
193194
left_match_col=values_frame.data_column_snowflake_quoted_identifiers[0],
194195
right_match_col=cuts_frame.data_column_snowflake_quoted_identifiers[0],
195196
match_comparator=MatchComparator.LESS_THAN_OR_EQUAL_TO

src/snowflake/snowpark/modin/plugin/_internal/frame.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1356,7 +1356,7 @@ def select_active_columns(self) -> "InternalFrame":
13561356
def strip_duplicates(
13571357
self: "InternalFrame",
13581358
quoted_identifiers: list[str],
1359-
dummy_row_pos_mode: bool = False,
1359+
dummy_row_pos_mode: bool,
13601360
) -> "InternalFrame":
13611361
"""
13621362
When assigning frames via index operations for duplicates only the last entry is used, as entries are repeatedly overwritten.
@@ -1403,10 +1403,10 @@ def strip_duplicates(
14031403

14041404
joined_ordered_dataframe = frame.ordered_dataframe.join(
14051405
right=relevant_last_value_row_positions,
1406+
dummy_row_pos_mode=dummy_row_pos_mode,
14061407
left_on_cols=[frame.row_position_snowflake_quoted_identifier],
14071408
right_on_cols=[relevant_last_value_row_positions_quoted_identifier],
14081409
how="inner",
1409-
dummy_row_pos_mode=dummy_row_pos_mode,
14101410
)
14111411

14121412
# Because we reuse row position to select the relevant columns, we need to

src/snowflake/snowpark/modin/plugin/_internal/get_dummies_utils.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -228,7 +228,7 @@ def get_dummies_helper(
228228
prefixes: list[Hashable],
229229
prefix_sep: str,
230230
dtype: Any,
231-
dummy_row_pos_mode: bool = False,
231+
dummy_row_pos_mode: bool,
232232
) -> InternalFrame:
233233
"""
234234
Helper function for get dummies to perform encoding on given columns
@@ -348,6 +348,7 @@ def get_dummies_helper(
348348
left_on=result_internal_frame.index_column_snowflake_quoted_identifiers,
349349
right_on=pivoted_internal_frame.index_column_snowflake_quoted_identifiers,
350350
how="inner",
351+
dummy_row_pos_mode=dummy_row_pos_mode,
351352
).result_frame
352353

353354
# optimization: keep the original row position column as the result ordered frame

src/snowflake/snowpark/modin/plugin/_internal/groupby_utils.py

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -471,6 +471,7 @@ def resample_and_extract_groupby_column_pandas_labels(
471471
query_compiler: "snowflake_query_compiler.SnowflakeQueryCompiler",
472472
by: Any,
473473
level: Optional[IndexLabel],
474+
dummy_row_pos_mode: bool,
474475
*,
475476
skip_resample: bool = False,
476477
) -> tuple[
@@ -614,6 +615,7 @@ def find_resample_columns(
614615
right=expected_resample_bins_frame,
615616
# Perform an outer join to preserve additional index columns.
616617
how="outer",
618+
dummy_row_pos_mode=dummy_row_pos_mode,
617619
# identifier might get mangled by binning operation; look it up again
618620
left_on=binned_frame.get_snowflake_quoted_identifiers_group_by_pandas_labels(
619621
[original_label]
@@ -658,6 +660,7 @@ def get_frame_with_groupby_columns_as_index(
658660
by: Any,
659661
level: Optional[Union[Hashable, int]],
660662
dropna: bool,
663+
dummy_row_pos_mode: bool,
661664
) -> Optional["snowflake_query_compiler.SnowflakeQueryCompiler"]:
662665
"""
663666
Returns a new dataframe with the following properties:
@@ -753,7 +756,7 @@ def get_frame_with_groupby_columns_as_index(
753756
)
754757

755758
query_compiler, by_list = resample_and_extract_groupby_column_pandas_labels(
756-
query_compiler, by, level
759+
query_compiler, by, level, dummy_row_pos_mode
757760
)
758761

759762
if by_list is None:
@@ -918,6 +921,7 @@ def fill_missing_groupby_resample_bins_for_frame(
918921
by_list: list,
919922
orig_datetime_index_col_label: str,
920923
datetime_index_col_identifier: str,
924+
dummy_row_pos_mode: bool,
921925
) -> InternalFrame:
922926
"""
923927
Returns a new InternalFrame created using 2 rules.
@@ -1028,6 +1032,7 @@ def fill_missing_groupby_resample_bins_for_frame(
10281032
frame,
10291033
multi_expected_resample_bins_snowpark_frame,
10301034
how="right",
1035+
dummy_row_pos_mode=dummy_row_pos_mode,
10311036
left_on=frame.index_column_snowflake_quoted_identifiers,
10321037
right_on=multi_expected_resample_bins_snowpark_frame.index_column_snowflake_quoted_identifiers,
10331038
sort=False,

src/snowflake/snowpark/modin/plugin/_internal/indexing_utils.py

Lines changed: 19 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -219,7 +219,7 @@ def validate_out_of_bound(key_max: Any, key_min: Any, axis_len: int) -> None:
219219
def get_frame_by_row_pos_frame(
220220
internal_frame: InternalFrame,
221221
key: InternalFrame,
222-
dummy_row_pos_mode: bool = False,
222+
dummy_row_pos_mode: bool,
223223
) -> InternalFrame:
224224
"""
225225
Select rows from this internal_frame by row positions in the key frame
@@ -269,7 +269,7 @@ def get_frame_by_row_pos_frame(
269269
def _get_frame_by_row_pos_boolean_frame(
270270
internal_frame: InternalFrame,
271271
key: InternalFrame,
272-
dummy_row_pos_mode: bool = False,
272+
dummy_row_pos_mode: bool,
273273
) -> InternalFrame:
274274
"""
275275
Select rows using the boolean frame positional key. The two frames will be inner joined on their row position column
@@ -320,7 +320,7 @@ def _get_frame_by_row_pos_boolean_frame(
320320
def _get_frame_by_row_pos_int_frame(
321321
internal_frame: InternalFrame,
322322
key: InternalFrame,
323-
dummy_row_pos_mode: bool = False,
323+
dummy_row_pos_mode: bool,
324324
) -> InternalFrame:
325325
"""
326326
Select rows using the int frame positional key. The two frames will be inner joined on the internal_frame's row
@@ -374,7 +374,7 @@ def _get_frame_by_row_pos_int_frame(
374374
def _get_adjusted_key_frame_by_row_pos_int_frame(
375375
internal_frame: InternalFrame,
376376
key: InternalFrame,
377-
dummy_row_pos_mode: bool = False,
377+
dummy_row_pos_mode: bool,
378378
) -> InternalFrame:
379379
"""
380380
Return the key frame with any negative row positions adjusted by the internal frame. For example, if the original
@@ -1100,7 +1100,7 @@ def get_index_frame_by_row_label_slice(
11001100
def get_frame_by_row_label(
11011101
internal_frame: InternalFrame,
11021102
key: Union[InternalFrame, slice, tuple],
1103-
dummy_row_pos_mode: bool = False,
1103+
dummy_row_pos_mode: bool,
11041104
) -> InternalFrame:
11051105
"""
11061106
Select rows by labels in the key.
@@ -1472,7 +1472,7 @@ def generate_bound_column(
14721472
def _get_frame_by_row_label_boolean_frame(
14731473
internal_frame: InternalFrame,
14741474
key: InternalFrame,
1475-
dummy_row_pos_mode: bool = False,
1475+
dummy_row_pos_mode: bool,
14761476
) -> InternalFrame:
14771477
"""
14781478
Select rows with boolean frame key. Here, if the frame and key's index are aligned, then the join is on their row
@@ -1490,6 +1490,7 @@ def _get_frame_by_row_label_boolean_frame(
14901490
joined_frame, result_column_mapper = align_on_index(
14911491
internal_frame,
14921492
key,
1493+
dummy_row_pos_mode,
14931494
"coalesce",
14941495
dummy_row_pos_mode,
14951496
)
@@ -1525,7 +1526,7 @@ def _get_frame_by_row_label_boolean_frame(
15251526
def _get_frame_by_row_label_non_boolean_frame(
15261527
internal_frame: InternalFrame,
15271528
key: InternalFrame,
1528-
dummy_row_pos_mode: bool = False,
1529+
dummy_row_pos_mode: bool,
15291530
) -> InternalFrame:
15301531
"""
15311532
Select rows where its index is equal to the index in the key value.
@@ -1644,7 +1645,7 @@ def _set_2d_labels_helper_for_frame_item(
16441645
matching_item_rows_by_label: bool,
16451646
col_info: LocSetColInfo,
16461647
index_is_bool_indexer: bool,
1647-
dummy_row_pos_mode: bool = False,
1648+
dummy_row_pos_mode: bool,
16481649
) -> InternalFrame:
16491650
"""
16501651
This set 2d label helper method handles df[index, columns] = item where index is a non-boolean indexer and item is a
@@ -1679,12 +1680,14 @@ def _set_2d_labels_helper_for_frame_item(
16791680
result_frame = align_on_index(
16801681
internal_frame,
16811682
index,
1683+
dummy_row_pos_mode,
16821684
"coalesce",
16831685
).result_frame
16841686

16851687
return align_on_index(
16861688
result_frame,
16871689
item,
1690+
dummy_row_pos_mode,
16881691
"coalesce",
16891692
).result_frame
16901693

@@ -1694,6 +1697,7 @@ def _set_2d_labels_helper_for_frame_item(
16941697
return align_on_index(
16951698
internal_frame,
16961699
item,
1700+
dummy_row_pos_mode,
16971701
"coalesce",
16981702
).result_frame
16991703
else:
@@ -1852,7 +1856,7 @@ def _set_2d_labels_helper_for_non_frame_item(
18521856
internal_frame: InternalFrame,
18531857
index: Union[slice, Scalar, InternalFrame],
18541858
index_is_bool_indexer: bool,
1855-
dummy_row_pos_mode: bool = False,
1859+
dummy_row_pos_mode: bool,
18561860
) -> InternalFrame:
18571861
"""
18581862
The helper method for the case where item is not an internal frame
@@ -1894,6 +1898,7 @@ def _set_2d_labels_helper_for_non_frame_item(
18941898
return align_on_index(
18951899
internal_frame,
18961900
index,
1901+
dummy_row_pos_mode,
18971902
"coalesce",
18981903
).result_frame
18991904
elif isinstance(index, InternalFrame):
@@ -1924,7 +1929,7 @@ def _set_2d_labels_helper_for_single_column_wise_item(
19241929
item_data_column_pandas_labels: list[Hashable],
19251930
index_is_bool_indexer: bool,
19261931
enforce_match_item_by_row_labels: bool,
1927-
dummy_row_pos_mode: bool = False,
1932+
dummy_row_pos_mode: bool,
19281933
) -> InternalFrame:
19291934
"""
19301935
# If it's a single column with an item list, then we set the item values column-wise, for example,
@@ -2140,7 +2145,7 @@ def set_frame_2d_labels(
21402145
index_is_bool_indexer: bool,
21412146
deduplicate_columns: bool,
21422147
frame_is_df_and_item_is_series: bool,
2143-
dummy_row_pos_mode: bool = False,
2148+
dummy_row_pos_mode: bool,
21442149
) -> InternalFrame:
21452150
"""
21462151
Helper function to handle the general loc set functionality. The general idea here is to join the key from ``index``
@@ -2355,6 +2360,7 @@ def set_frame_2d_labels(
23552360
item_data_column_pandas_labels,
23562361
index_is_bool_indexer,
23572362
enforce_match_item_by_row_labels,
2363+
dummy_row_pos_mode,
23582364
)
23592365
# we convert bool indexer to non-bool one above so set it to False now
23602366
index_is_bool_indexer = False
@@ -2608,7 +2614,7 @@ def set_frame_2d_positional(
26082614
set_as_coords: bool,
26092615
item: Union[InternalFrame, Scalar],
26102616
is_item_series: bool,
2611-
dummy_row_pos_mode: bool = False,
2617+
dummy_row_pos_mode: bool,
26122618
) -> InternalFrame:
26132619
"""
26142620
Helper function to handle the general (worst case) 2-join case where index (aka row_key) and item are both frames.
@@ -2837,7 +2843,7 @@ def set_frame_2d_positional(
28372843
def get_kv_frame_from_index_and_item_frames(
28382844
index: InternalFrame,
28392845
item: InternalFrame,
2840-
dummy_row_pos_mode: bool = False,
2846+
dummy_row_pos_mode: bool,
28412847
) -> InternalFrame:
28422848
"""
28432849
Return the key-value frame from the key (index) and item (values) frames by aligning on the row positions.

0 commit comments

Comments
 (0)