Skip to content

Commit 5a95eb4

Browse files
SNOW-2439665: Add support for drop_duplicates
1 parent aad530a commit 5a95eb4

File tree

4 files changed, with 51 additions and 3 deletions.

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,7 @@
143143
- `groupby.median`
144144
- `groupby.std`
145145
- `groupby.var`
146+
- `drop_duplicates`
146147
- Reuse row count from the relaxed query compiler in `get_axis_len`.
147148

148149
#### Bug Fixes

src/snowflake/snowpark/modin/plugin/_internal/indexing_utils.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1130,7 +1130,9 @@ def get_frame_by_row_label(
11301130

11311131
# boolean indexer
11321132
if isinstance(key_datatype, BooleanType):
1133-
return _get_frame_by_row_label_boolean_frame(internal_frame, key)
1133+
return _get_frame_by_row_label_boolean_frame(
1134+
internal_frame, key, dummy_row_pos_mode
1135+
)
11341136

11351137
return _get_frame_by_row_label_non_boolean_frame(
11361138
internal_frame, key, dummy_row_pos_mode
@@ -1470,6 +1472,7 @@ def generate_bound_column(
14701472
def _get_frame_by_row_label_boolean_frame(
14711473
internal_frame: InternalFrame,
14721474
key: InternalFrame,
1475+
dummy_row_pos_mode: bool = False,
14731476
) -> InternalFrame:
14741477
"""
14751478
Select rows with boolean frame key. Here, if the frame and key's index are aligned, then the join is on their row
@@ -1488,6 +1491,7 @@ def _get_frame_by_row_label_boolean_frame(
14881491
internal_frame,
14891492
key,
14901493
"coalesce",
1494+
dummy_row_pos_mode,
14911495
)
14921496

14931497
key_bool_val_col = col(

src/snowflake/snowpark/modin/plugin/_internal/ordered_dataframe.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1257,17 +1257,31 @@ def join(
12571257

12581258
left = left.ensure_row_position_column(dummy_row_pos_mode=True)
12591259
if len(left_on_cols) == 1 and ROW_POSITION_COLUMN_LABEL in left_on_cols[0]:
1260+
left_row_position_snowflake_quoted_identifier = (
1261+
left.row_position_snowflake_quoted_identifier
1262+
)
12601263
left.row_position_snowflake_quoted_identifier = None
1261-
left = left.ensure_row_position_column(dummy_row_pos_mode=False)
1264+
new_left = left.ensure_row_position_column(dummy_row_pos_mode=False)
1265+
left.row_position_snowflake_quoted_identifier = (
1266+
left_row_position_snowflake_quoted_identifier
1267+
)
1268+
left = new_left
12621269
assert left.row_position_snowflake_quoted_identifier is not None
12631270
left_on_cols = [left.row_position_snowflake_quoted_identifier]
12641271
right = right.ensure_row_position_column(dummy_row_pos_mode=True)
12651272
if (
12661273
len(right_on_cols) == 1
12671274
and ROW_POSITION_COLUMN_LABEL in right_on_cols[0]
12681275
):
1276+
right_row_position_snowflake_quoted_identifier = (
1277+
right.row_position_snowflake_quoted_identifier
1278+
)
12691279
right.row_position_snowflake_quoted_identifier = None
1270-
right = right.ensure_row_position_column(dummy_row_pos_mode=False)
1280+
new_right = right.ensure_row_position_column(dummy_row_pos_mode=False)
1281+
right.row_position_snowflake_quoted_identifier = (
1282+
right_row_position_snowflake_quoted_identifier
1283+
)
1284+
right = new_right
12711285
assert right.row_position_snowflake_quoted_identifier is not None
12721286
right_on_cols = [right.row_position_snowflake_quoted_identifier]
12731287

tests/integ/modin/test_faster_pandas.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,35 @@ def test_drop(session):
279279
assert_frame_equal(snow_result, native_result)
280280

281281

282+
@sql_count_checker(query_count=3, join_count=2)
283+
def test_drop_duplicates(session):
284+
# create tables
285+
table_name = Utils.random_name_for_temp_object(TempObjectType.TABLE)
286+
session.create_dataframe(
287+
native_pd.DataFrame([[2, 12], [2, 12], [3, 13]], columns=["A", "B"])
288+
).write.save_as_table(table_name, table_type="temp")
289+
290+
# create snow dataframes
291+
df = pd.read_snowflake(table_name)
292+
snow_result = df.drop_duplicates()
293+
294+
# verify that the input dataframe has a populated relaxed query compiler
295+
assert df._query_compiler._relaxed_query_compiler is not None
296+
assert df._query_compiler._relaxed_query_compiler._dummy_row_pos_mode is True
297+
# verify that the output dataframe also has a populated relaxed query compiler
298+
assert snow_result._query_compiler._relaxed_query_compiler is not None
299+
assert (
300+
snow_result._query_compiler._relaxed_query_compiler._dummy_row_pos_mode is True
301+
)
302+
303+
# create pandas dataframes
304+
native_df = df.to_pandas()
305+
native_result = native_df.drop_duplicates()
306+
307+
# compare results
308+
assert_frame_equal(snow_result, native_result)
309+
310+
282311
@sql_count_checker(query_count=3, join_count=1)
283312
def test_duplicated(session):
284313
# create tables

0 commit comments

Comments
 (0)