Skip to content

Commit 5068c1b

Browse files
SNOW-2432713: Ensure that dummy_row_pos_mode passed to OrderedDataFrame.join() matches SnowflakeQueryCompiler._dummy_row_pos_mode
1 parent: 04218cc · commit: 5068c1b

File tree

15 files changed: +212 additions, -71 deletions

src/snowflake/snowpark/modin/plugin/_internal/align_utils.py

Lines changed: 21 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,10 @@
1313

1414

1515
def align_axis_0_left(
16-
frame: InternalFrame, other_frame: InternalFrame, join: str
16+
frame: InternalFrame,
17+
other_frame: InternalFrame,
18+
join: str,
19+
dummy_row_pos_mode: bool,
1720
) -> InternalFrame:
1821
"""
1922
Gets the left align results.
@@ -27,7 +30,9 @@ def align_axis_0_left(
2730
New InternalFrame representing aligned left frame.
2831
"""
2932
if join == "right":
30-
left_result, left_column_mapper = align_on_index(other_frame, frame, how="left")
33+
left_result, left_column_mapper = align_on_index(
34+
other_frame, frame, dummy_row_pos_mode, how="left"
35+
)
3136
left_frame_data_ids = left_column_mapper.map_right_quoted_identifiers(
3237
frame.data_column_snowflake_quoted_identifiers
3338
)
@@ -36,7 +41,9 @@ def align_axis_0_left(
3641
left_frame_data_ids + left_index_ids
3742
)
3843
else:
39-
left_result, left_column_mapper = align_on_index(frame, other_frame, how=join)
44+
left_result, left_column_mapper = align_on_index(
45+
frame, other_frame, dummy_row_pos_mode, how=join
46+
)
4047
left_frame_data_ids = left_column_mapper.map_left_quoted_identifiers(
4148
frame.data_column_snowflake_quoted_identifiers
4249
)
@@ -58,7 +65,10 @@ def align_axis_0_left(
5865

5966

6067
def align_axis_0_right(
61-
frame: InternalFrame, other_frame: InternalFrame, join: str
68+
frame: InternalFrame,
69+
other_frame: InternalFrame,
70+
join: str,
71+
dummy_row_pos_mode: bool,
6272
) -> InternalFrame:
6373
"""
6474
Gets the right align results.
@@ -72,7 +82,9 @@ def align_axis_0_right(
7282
New InternalFrame representing aligned right frame.
7383
"""
7484
if join == "left":
75-
right_result, right_column_mapper = align_on_index(frame, other_frame, how=join)
85+
right_result, right_column_mapper = align_on_index(
86+
frame, other_frame, dummy_row_pos_mode, how=join
87+
)
7688
right_frame_data_ids = right_column_mapper.map_right_quoted_identifiers(
7789
other_frame.data_column_snowflake_quoted_identifiers
7890
)
@@ -82,7 +94,7 @@ def align_axis_0_right(
8294
)
8395
elif join == "right":
8496
right_result, right_column_mapper = align_on_index(
85-
other_frame, frame, how="left"
97+
other_frame, frame, dummy_row_pos_mode, how="left"
8698
)
8799
right_frame_data_ids = right_column_mapper.map_left_quoted_identifiers(
88100
other_frame.data_column_snowflake_quoted_identifiers
@@ -92,7 +104,9 @@ def align_axis_0_right(
92104
right_frame_data_ids + right_index_ids
93105
)
94106
else:
95-
right_result, right_column_mapper = align_on_index(other_frame, frame, how=join)
107+
right_result, right_column_mapper = align_on_index(
108+
other_frame, frame, dummy_row_pos_mode, how=join
109+
)
96110
right_frame_data_ids = right_column_mapper.map_left_quoted_identifiers(
97111
other_frame.data_column_snowflake_quoted_identifiers
98112
)

src/snowflake/snowpark/modin/plugin/_internal/cumulative_utils.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -86,6 +86,7 @@ def get_groupby_cumagg_frame_axis0(
8686
groupby_kwargs: dict[str, Any],
8787
cumagg_func: Callable,
8888
cumagg_func_name: str,
89+
dummy_row_pos_mode: bool,
8990
ascending: bool = True,
9091
) -> InternalFrame:
9192
"""
@@ -124,7 +125,7 @@ def get_groupby_cumagg_frame_axis0(
124125
)
125126

126127
qc, by_list = resample_and_extract_groupby_column_pandas_labels(
127-
query_compiler, by, level
128+
query_compiler, by, level, dummy_row_pos_mode
128129
)
129130

130131
if numeric_only:

src/snowflake/snowpark/modin/plugin/_internal/cut_utils.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -151,8 +151,8 @@ def compute_bin_indices(
151151
values_frame: InternalFrame,
152152
cuts_frame: InternalFrame,
153153
n_cuts: int,
154+
dummy_row_pos_mode: bool,
154155
right: bool = True,
155-
dummy_row_pos_mode: bool = False,
156156
) -> InternalFrame:
157157
"""
158158
Given a frame of cuts, i.e. borders of bins (strictly increasing) compute for the data in values_frame the index of the bin they fall into.
@@ -190,6 +190,7 @@ def compute_bin_indices(
190190
values_frame,
191191
cuts_frame,
192192
how="asof",
193+
dummy_row_pos_mode=dummy_row_pos_mode,
193194
left_match_col=values_frame.data_column_snowflake_quoted_identifiers[0],
194195
right_match_col=cuts_frame.data_column_snowflake_quoted_identifiers[0],
195196
match_comparator=MatchComparator.LESS_THAN_OR_EQUAL_TO

src/snowflake/snowpark/modin/plugin/_internal/frame.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1356,7 +1356,7 @@ def select_active_columns(self) -> "InternalFrame":
13561356
def strip_duplicates(
13571357
self: "InternalFrame",
13581358
quoted_identifiers: list[str],
1359-
dummy_row_pos_mode: bool = False,
1359+
dummy_row_pos_mode: bool,
13601360
) -> "InternalFrame":
13611361
"""
13621362
When assigning frames via index operations for duplicates only the last entry is used, as entries are repeatedly overwritten.
@@ -1403,10 +1403,10 @@ def strip_duplicates(
14031403

14041404
joined_ordered_dataframe = frame.ordered_dataframe.join(
14051405
right=relevant_last_value_row_positions,
1406+
dummy_row_pos_mode=dummy_row_pos_mode,
14061407
left_on_cols=[frame.row_position_snowflake_quoted_identifier],
14071408
right_on_cols=[relevant_last_value_row_positions_quoted_identifier],
14081409
how="inner",
1409-
dummy_row_pos_mode=dummy_row_pos_mode,
14101410
)
14111411

14121412
# Because we reuse row position to select the relevant columns, we need to

src/snowflake/snowpark/modin/plugin/_internal/get_dummies_utils.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -228,7 +228,7 @@ def get_dummies_helper(
228228
prefixes: list[Hashable],
229229
prefix_sep: str,
230230
dtype: Any,
231-
dummy_row_pos_mode: bool = False,
231+
dummy_row_pos_mode: bool,
232232
) -> InternalFrame:
233233
"""
234234
Helper function for get dummies to perform encoding on given columns
@@ -348,6 +348,7 @@ def get_dummies_helper(
348348
left_on=result_internal_frame.index_column_snowflake_quoted_identifiers,
349349
right_on=pivoted_internal_frame.index_column_snowflake_quoted_identifiers,
350350
how="inner",
351+
dummy_row_pos_mode=dummy_row_pos_mode,
351352
).result_frame
352353

353354
# optimization: keep the original row position column as the result ordered frame

src/snowflake/snowpark/modin/plugin/_internal/groupby_utils.py

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -471,6 +471,7 @@ def resample_and_extract_groupby_column_pandas_labels(
471471
query_compiler: "snowflake_query_compiler.SnowflakeQueryCompiler",
472472
by: Any,
473473
level: Optional[IndexLabel],
474+
dummy_row_pos_mode: bool,
474475
*,
475476
skip_resample: bool = False,
476477
) -> tuple[
@@ -614,6 +615,7 @@ def find_resample_columns(
614615
right=expected_resample_bins_frame,
615616
# Perform an outer join to preserve additional index columns.
616617
how="outer",
618+
dummy_row_pos_mode=dummy_row_pos_mode,
617619
# identifier might get mangled by binning operation; look it up again
618620
left_on=binned_frame.get_snowflake_quoted_identifiers_group_by_pandas_labels(
619621
[original_label]
@@ -658,6 +660,7 @@ def get_frame_with_groupby_columns_as_index(
658660
by: Any,
659661
level: Optional[Union[Hashable, int]],
660662
dropna: bool,
663+
dummy_row_pos_mode: bool,
661664
) -> Optional["snowflake_query_compiler.SnowflakeQueryCompiler"]:
662665
"""
663666
Returns a new dataframe with the following properties:
@@ -753,7 +756,7 @@ def get_frame_with_groupby_columns_as_index(
753756
)
754757

755758
query_compiler, by_list = resample_and_extract_groupby_column_pandas_labels(
756-
query_compiler, by, level
759+
query_compiler, by, level, dummy_row_pos_mode
757760
)
758761

759762
if by_list is None:
@@ -918,6 +921,7 @@ def fill_missing_groupby_resample_bins_for_frame(
918921
by_list: list,
919922
orig_datetime_index_col_label: str,
920923
datetime_index_col_identifier: str,
924+
dummy_row_pos_mode: bool,
921925
) -> InternalFrame:
922926
"""
923927
Returns a new InternalFrame created using 2 rules.
@@ -1028,6 +1032,7 @@ def fill_missing_groupby_resample_bins_for_frame(
10281032
frame,
10291033
multi_expected_resample_bins_snowpark_frame,
10301034
how="right",
1035+
dummy_row_pos_mode=dummy_row_pos_mode,
10311036
left_on=frame.index_column_snowflake_quoted_identifiers,
10321037
right_on=multi_expected_resample_bins_snowpark_frame.index_column_snowflake_quoted_identifiers,
10331038
sort=False,

src/snowflake/snowpark/modin/plugin/_internal/indexing_utils.py

Lines changed: 21 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -269,7 +269,7 @@ def get_frame_by_row_pos_frame(
269269
def _get_frame_by_row_pos_boolean_frame(
270270
internal_frame: InternalFrame,
271271
key: InternalFrame,
272-
dummy_row_pos_mode: bool = False,
272+
dummy_row_pos_mode: bool,
273273
) -> InternalFrame:
274274
"""
275275
Select rows using the boolean frame positional key. The two frames will be inner joined on their row position column
@@ -320,7 +320,7 @@ def _get_frame_by_row_pos_boolean_frame(
320320
def _get_frame_by_row_pos_int_frame(
321321
internal_frame: InternalFrame,
322322
key: InternalFrame,
323-
dummy_row_pos_mode: bool = False,
323+
dummy_row_pos_mode: bool,
324324
) -> InternalFrame:
325325
"""
326326
Select rows using the int frame positional key. The two frames will be inner joined on the internal_frame's row
@@ -374,7 +374,7 @@ def _get_frame_by_row_pos_int_frame(
374374
def _get_adjusted_key_frame_by_row_pos_int_frame(
375375
internal_frame: InternalFrame,
376376
key: InternalFrame,
377-
dummy_row_pos_mode: bool = False,
377+
dummy_row_pos_mode: bool,
378378
) -> InternalFrame:
379379
"""
380380
Return the key frame with any negative row positions adjusted by the internal frame. For example, if the original
@@ -1100,7 +1100,7 @@ def get_index_frame_by_row_label_slice(
11001100
def get_frame_by_row_label(
11011101
internal_frame: InternalFrame,
11021102
key: Union[InternalFrame, slice, tuple],
1103-
dummy_row_pos_mode: bool = False,
1103+
dummy_row_pos_mode: bool,
11041104
) -> InternalFrame:
11051105
"""
11061106
Select rows by labels in the key.
@@ -1130,7 +1130,9 @@ def get_frame_by_row_label(
11301130

11311131
# boolean indexer
11321132
if isinstance(key_datatype, BooleanType):
1133-
return _get_frame_by_row_label_boolean_frame(internal_frame, key)
1133+
return _get_frame_by_row_label_boolean_frame(
1134+
internal_frame, key, dummy_row_pos_mode
1135+
)
11341136

11351137
return _get_frame_by_row_label_non_boolean_frame(
11361138
internal_frame, key, dummy_row_pos_mode
@@ -1470,6 +1472,7 @@ def generate_bound_column(
14701472
def _get_frame_by_row_label_boolean_frame(
14711473
internal_frame: InternalFrame,
14721474
key: InternalFrame,
1475+
dummy_row_pos_mode: bool,
14731476
) -> InternalFrame:
14741477
"""
14751478
Select rows with boolean frame key. Here, if the frame and key's index are aligned, then the join is on their row
@@ -1487,6 +1490,7 @@ def _get_frame_by_row_label_boolean_frame(
14871490
joined_frame, result_column_mapper = align_on_index(
14881491
internal_frame,
14891492
key,
1493+
dummy_row_pos_mode,
14901494
"coalesce",
14911495
)
14921496

@@ -1521,7 +1525,7 @@ def _get_frame_by_row_label_boolean_frame(
15211525
def _get_frame_by_row_label_non_boolean_frame(
15221526
internal_frame: InternalFrame,
15231527
key: InternalFrame,
1524-
dummy_row_pos_mode: bool = False,
1528+
dummy_row_pos_mode: bool,
15251529
) -> InternalFrame:
15261530
"""
15271531
Select rows where its index is equal to the index in the key value.
@@ -1640,7 +1644,7 @@ def _set_2d_labels_helper_for_frame_item(
16401644
matching_item_rows_by_label: bool,
16411645
col_info: LocSetColInfo,
16421646
index_is_bool_indexer: bool,
1643-
dummy_row_pos_mode: bool = False,
1647+
dummy_row_pos_mode: bool,
16441648
) -> InternalFrame:
16451649
"""
16461650
This set 2d label helper method handles df[index, columns] = item where index is a non-boolean indexer and item is a
@@ -1675,12 +1679,14 @@ def _set_2d_labels_helper_for_frame_item(
16751679
result_frame = align_on_index(
16761680
internal_frame,
16771681
index,
1682+
dummy_row_pos_mode,
16781683
"coalesce",
16791684
).result_frame
16801685

16811686
return align_on_index(
16821687
result_frame,
16831688
item,
1689+
dummy_row_pos_mode,
16841690
"coalesce",
16851691
).result_frame
16861692

@@ -1690,6 +1696,7 @@ def _set_2d_labels_helper_for_frame_item(
16901696
return align_on_index(
16911697
internal_frame,
16921698
item,
1699+
dummy_row_pos_mode,
16931700
"coalesce",
16941701
).result_frame
16951702
else:
@@ -1848,7 +1855,7 @@ def _set_2d_labels_helper_for_non_frame_item(
18481855
internal_frame: InternalFrame,
18491856
index: Union[slice, Scalar, InternalFrame],
18501857
index_is_bool_indexer: bool,
1851-
dummy_row_pos_mode: bool = False,
1858+
dummy_row_pos_mode: bool,
18521859
) -> InternalFrame:
18531860
"""
18541861
The helper method for the case where item is not an internal frame
@@ -1890,6 +1897,7 @@ def _set_2d_labels_helper_for_non_frame_item(
18901897
return align_on_index(
18911898
internal_frame,
18921899
index,
1900+
dummy_row_pos_mode,
18931901
"coalesce",
18941902
).result_frame
18951903
elif isinstance(index, InternalFrame):
@@ -1920,7 +1928,7 @@ def _set_2d_labels_helper_for_single_column_wise_item(
19201928
item_data_column_pandas_labels: list[Hashable],
19211929
index_is_bool_indexer: bool,
19221930
enforce_match_item_by_row_labels: bool,
1923-
dummy_row_pos_mode: bool = False,
1931+
dummy_row_pos_mode: bool,
19241932
) -> InternalFrame:
19251933
"""
19261934
# If it's a single column with an item list, then we set the item values column-wise, for example,
@@ -2136,7 +2144,7 @@ def set_frame_2d_labels(
21362144
index_is_bool_indexer: bool,
21372145
deduplicate_columns: bool,
21382146
frame_is_df_and_item_is_series: bool,
2139-
dummy_row_pos_mode: bool = False,
2147+
dummy_row_pos_mode: bool,
21402148
) -> InternalFrame:
21412149
"""
21422150
Helper function to handle the general loc set functionality. The general idea here is to join the key from ``index``
@@ -2351,6 +2359,7 @@ def set_frame_2d_labels(
23512359
item_data_column_pandas_labels,
23522360
index_is_bool_indexer,
23532361
enforce_match_item_by_row_labels,
2362+
dummy_row_pos_mode,
23542363
)
23552364
# we convert bool indexer to non-bool one above so set it to False now
23562365
index_is_bool_indexer = False
@@ -2604,7 +2613,7 @@ def set_frame_2d_positional(
26042613
set_as_coords: bool,
26052614
item: Union[InternalFrame, Scalar],
26062615
is_item_series: bool,
2607-
dummy_row_pos_mode: bool = False,
2616+
dummy_row_pos_mode: bool,
26082617
) -> InternalFrame:
26092618
"""
26102619
Helper function to handle the general (worst case) 2-join case where index (aka row_key) and item are both frames.
@@ -2833,7 +2842,7 @@ def set_frame_2d_positional(
28332842
def get_kv_frame_from_index_and_item_frames(
28342843
index: InternalFrame,
28352844
item: InternalFrame,
2836-
dummy_row_pos_mode: bool = False,
2845+
dummy_row_pos_mode: bool,
28372846
) -> InternalFrame:
28382847
"""
28392848
Return the key-value frame from the key (index) and item (values) frames by aligning on the row positions.

0 commit comments

Comments (0)