SNOW-2391351: Avoid joins for drop_duplicates when keep!=False in faster pandas (#3964)

sfc-gh-helmeleegy · web-flow · commit db1b20f187f9 · 2025-11-02T16:49:23.000-08:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -124,6 +124,7 @@
   - `cummin`
   - `cummax`
 - Make faster pandas disabled by default (opt-in instead of opt-out).
+- Improve performance of `drop_duplicates` by avoiding joins when `keep!=False` in faster pandas.
 
 ## 1.42.0 (2025-10-28)
 
diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py
@@ -18682,6 +18682,31 @@ def add_substring(
         # Returning the query compiler with updated columns and index.
         return SnowflakeQueryCompiler(result_frame)
 
+    def drop_duplicates(self) -> "SnowflakeQueryCompiler":
+        """
+        Wrapper around _drop_duplicates_internal to be supported in faster pandas.
+        """
+        relaxed_query_compiler = None
+        if self._relaxed_query_compiler is not None:
+            relaxed_query_compiler = (
+                self._relaxed_query_compiler._drop_duplicates_internal()
+            )
+        qc = self._drop_duplicates_internal()
+        return self._maybe_set_relaxed_qc(qc, relaxed_query_compiler)
+
+    def _drop_duplicates_internal(self) -> "SnowflakeQueryCompiler":
+        """
+        Return a DataFrame or Series after dropping the duplicate rows.
+        """
+        return self.groupby_agg(
+            by=self._modin_frame.data_column_pandas_labels,
+            agg_func={},
+            axis=0,
+            groupby_kwargs={"sort": False, "as_index": False, "dropna": False},
+            agg_args=[],
+            agg_kwargs={},
+        )
+
     def duplicated(
         self,
         subset: Union[Hashable, Sequence[Hashable]] = None,
diff --git a/src/snowflake/snowpark/modin/plugin/extensions/base_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/base_overrides.py
@@ -1529,8 +1529,12 @@ def drop_duplicates(
         df = self[subset]
     else:
         df = self
-    duplicated = df.duplicated(keep=keep)
-    result = self[~duplicated]
+    if pd.session.dummy_row_pos_optimization_enabled and keep in ["first", "last"]:
+        result_qc = df._query_compiler.drop_duplicates()
+        result = self.__constructor__(query_compiler=result_qc)
+    else:
+        duplicated = df.duplicated(keep=keep)
+        result = self[~duplicated]
     if ignore_index:
         result.index = pandas.RangeIndex(stop=len(result))
     if inplace:
diff --git a/tests/integ/modin/test_faster_pandas.py b/tests/integ/modin/test_faster_pandas.py
@@ -407,15 +407,15 @@ def test_drop(session):
         assert_frame_equal(snow_result, native_result)
 
 
-@sql_count_checker(query_count=3, join_count=2)
+@sql_count_checker(query_count=3)
 def test_drop_duplicates(session):
     with session_parameter_override(
         session, "dummy_row_pos_optimization_enabled", True
     ):
         # create tables
         table_name = Utils.random_name_for_temp_object(TempObjectType.TABLE)
         session.create_dataframe(
-            native_pd.DataFrame([[2, 12], [2, 12], [3, 13]], columns=["A", "B"])
+            native_pd.DataFrame([[2, 12], [3, 13], [2, 12]], columns=["A", "B"])
         ).write.save_as_table(table_name, table_type="temp")
 
         # create snow dataframes
@@ -437,7 +437,7 @@ def test_drop_duplicates(session):
         native_result = native_df.drop_duplicates()
 
         # compare results
-        assert_frame_equal(snow_result, native_result)
+        assert_frame_equal(snow_result, native_result, check_index_type=False)
 
 
 @sql_count_checker(query_count=3, join_count=1)