Skip to content

Commit db1b20f

Browse files
SNOW-2391351: Avoid joins for drop_duplicates when keep!=False in faster pandas (#3964)
1 parent a6d1b88 commit db1b20f

File tree

4 files changed

+35
-5
lines changed

4 files changed

+35
-5
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,7 @@
124124
- `cummin`
125125
- `cummax`
126126
- Make faster pandas disabled by default (opt-in instead of opt-out).
127+
- Improve performance of `drop_duplicates` by avoiding joins when `keep!=False` in faster pandas.
127128

128129
## 1.42.0 (2025-10-28)
129130

src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18682,6 +18682,31 @@ def add_substring(
1868218682
# Returning the query compiler with updated columns and index.
1868318683
return SnowflakeQueryCompiler(result_frame)
1868418684

18685+
def drop_duplicates(self) -> "SnowflakeQueryCompiler":
18686+
"""
18687+
Wrapper around _drop_duplicates_internal to be supported in faster pandas.
18688+
"""
18689+
relaxed_query_compiler = None
18690+
if self._relaxed_query_compiler is not None:
18691+
relaxed_query_compiler = (
18692+
self._relaxed_query_compiler._drop_duplicates_internal()
18693+
)
18694+
qc = self._drop_duplicates_internal()
18695+
return self._maybe_set_relaxed_qc(qc, relaxed_query_compiler)
18696+
18697+
def _drop_duplicates_internal(self) -> "SnowflakeQueryCompiler":
18698+
"""
18699+
Return a DataFrame or Series after dropping the duplicate rows.
18700+
"""
18701+
return self.groupby_agg(
18702+
by=self._modin_frame.data_column_pandas_labels,
18703+
agg_func={},
18704+
axis=0,
18705+
groupby_kwargs={"sort": False, "as_index": False, "dropna": False},
18706+
agg_args=[],
18707+
agg_kwargs={},
18708+
)
18709+
1868518710
def duplicated(
1868618711
self,
1868718712
subset: Union[Hashable, Sequence[Hashable]] = None,

src/snowflake/snowpark/modin/plugin/extensions/base_overrides.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1529,8 +1529,12 @@ def drop_duplicates(
15291529
df = self[subset]
15301530
else:
15311531
df = self
1532-
duplicated = df.duplicated(keep=keep)
1533-
result = self[~duplicated]
1532+
if pd.session.dummy_row_pos_optimization_enabled and keep in ["first", "last"]:
1533+
result_qc = df._query_compiler.drop_duplicates()
1534+
result = self.__constructor__(query_compiler=result_qc)
1535+
else:
1536+
duplicated = df.duplicated(keep=keep)
1537+
result = self[~duplicated]
15341538
if ignore_index:
15351539
result.index = pandas.RangeIndex(stop=len(result))
15361540
if inplace:

tests/integ/modin/test_faster_pandas.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -407,15 +407,15 @@ def test_drop(session):
407407
assert_frame_equal(snow_result, native_result)
408408

409409

410-
@sql_count_checker(query_count=3, join_count=2)
410+
@sql_count_checker(query_count=3)
411411
def test_drop_duplicates(session):
412412
with session_parameter_override(
413413
session, "dummy_row_pos_optimization_enabled", True
414414
):
415415
# create tables
416416
table_name = Utils.random_name_for_temp_object(TempObjectType.TABLE)
417417
session.create_dataframe(
418-
native_pd.DataFrame([[2, 12], [2, 12], [3, 13]], columns=["A", "B"])
418+
native_pd.DataFrame([[2, 12], [3, 13], [2, 12]], columns=["A", "B"])
419419
).write.save_as_table(table_name, table_type="temp")
420420

421421
# create snow dataframes
@@ -437,7 +437,7 @@ def test_drop_duplicates(session):
437437
native_result = native_df.drop_duplicates()
438438

439439
# compare results
440-
assert_frame_equal(snow_result, native_result)
440+
assert_frame_equal(snow_result, native_result, check_index_type=False)
441441

442442

443443
@sql_count_checker(query_count=3, join_count=1)

0 commit comments

Comments
 (0)