SNOW-2432963: Add support for duplicated in faster pandas

sfc-gh-helmeleegy · sfc-gh-helmeleegy · commit 6def605b2ac8 · 2025-10-15T16:33:34.000-07:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -112,6 +112,7 @@
   - `loc` (setting columns)
   - `to_datetime`
   - `drop`
+  - `duplicated`
 - Reuse row count from the relaxed query compiler in `get_axis_len`.
 
 #### Bug Fixes
diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py
@@ -17024,6 +17024,26 @@ def duplicated(
         self,
         subset: Union[Hashable, Sequence[Hashable]] = None,
         keep: DropKeep = "first",
+    ) -> "SnowflakeQueryCompiler":
+        """
+        Faster pandas wrapper; also runs _duplicated_internal on the relaxed qc if set.
+        """
+        relaxed_query_compiler = None
+        if self._relaxed_query_compiler is not None:
+            relaxed_query_compiler = self._relaxed_query_compiler._duplicated_internal(
+                subset=subset,
+                keep=keep,
+            )
+        qc = self._duplicated_internal(
+            subset=subset,
+            keep=keep,
+        )
+        return self._maybe_set_relaxed_qc(qc, relaxed_query_compiler)
+
+    def _duplicated_internal(
+        self,
+        subset: Union[Hashable, Sequence[Hashable]] = None,
+        keep: DropKeep = "first",
     ) -> "SnowflakeQueryCompiler":
         """
         Return boolean Series denoting duplicate rows.
diff --git a/tests/integ/modin/test_faster_pandas.py b/tests/integ/modin/test_faster_pandas.py
@@ -223,6 +223,35 @@ def test_drop(session):
     assert_frame_equal(snow_result, native_result)
 
 
+@sql_count_checker(query_count=3, join_count=1)
+def test_duplicated(session):
+    # create a temp table in which the first two rows are identical
+    table_name = Utils.random_name_for_temp_object(TempObjectType.TABLE)
+    session.create_dataframe(
+        native_pd.DataFrame([[2, 12], [2, 12], [3, 13]], columns=["A", "B"])
+    ).write.save_as_table(table_name, table_type="temp")
+
+    # create snow dataframes
+    df = pd.read_snowflake(table_name)
+    snow_result = df.duplicated()
+
+    # verify that the input dataframe has a populated relaxed query compiler
+    assert df._query_compiler._relaxed_query_compiler is not None
+    assert df._query_compiler._relaxed_query_compiler._dummy_row_pos_mode is True
+    # verify that the output series also has a populated relaxed query compiler
+    assert snow_result._query_compiler._relaxed_query_compiler is not None
+    assert (
+        snow_result._query_compiler._relaxed_query_compiler._dummy_row_pos_mode is True
+    )
+
+    # create pandas dataframes
+    native_df = df.to_pandas()
+    native_result = native_df.duplicated()
+
+    # compare results
+    assert_series_equal(snow_result, native_result)
+
+
 @pytest.mark.parametrize("func", ["isna", "isnull", "notna", "notnull"])
 @sql_count_checker(query_count=3)
 def test_isna_notna(session, func):