SNOW-1893939: Add support for Index.drop_duplicates (#2923)

sfc-gh-amaheshwary · web-flow · commit 8db5109f56a0 · 2025-01-27T09:59:14.000-08:00
1. Which Jira issue is this PR addressing? Make sure that there is an accompanying issue to your PR. Fixes [SNOW-1893939](https://snowflakecomputing.atlassian.net/browse/SNOW-1893939) 2. Fill out the following pre-review checklist: - [x] I am adding a new automated test(s) to verify correctness of my new code - [ ] If this test skips Local Testing mode, I'm requesting review from @snowflakedb/local-testing - [ ] I am adding new logging messages - [ ] I am adding a new telemetry message - [ ] I am adding new credentials - [ ] I am adding a new dependency - [ ] If this is a new feature/behavior, I'm adding the Local Testing parity changes. - [ ] I acknowledge that I have ensured my changes to be thread-safe. Follow the link for more information: [Thread-safe Developer Guidelines](https://github.com/snowflakedb/snowpark-python/blob/main/CONTRIBUTING.md#thread-safe-development) 3. Please describe how your code solves the related issue. Added support for Index.drop_duplicates. [SNOW-1893939]: https://snowflakecomputing.atlassian.net/browse/SNOW-1893939?atlOrigin=eyJpIjoiNWRkNTljNzYxNjVmNDY3MDlhMDU5Y2ZhYzA5YTRkZjUiLCJwIjoiZ2l0aHViLWNvbS1KU1cifQ
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -99,6 +99,7 @@
 - Added support for `expand=True` in `Series.str.split`.
 - Added support for `DataFrame.pop` and `Series.pop`.
 - Added support for `first` and `last` in `DataFrameGroupBy.agg` and `SeriesGroupBy.agg`.
+- Added support for `Index.drop_duplicates`.
 
 #### Bug Fixes
 
diff --git a/docs/source/modin/indexing.rst b/docs/source/modin/indexing.rst
@@ -53,6 +53,7 @@ Index
     Index.argmin
     Index.argmax
     Index.copy
+    Index.drop_duplicates
     Index.equals
     Index.identical
     Index.is_boolean
diff --git a/docs/source/modin/supported/index_supported.rst b/docs/source/modin/supported/index_supported.rst
@@ -79,7 +79,7 @@ Methods
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``drop``                    | N                               |                                  |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
-| ``drop_duplicates``         | N                               |                                  |                                                    |
+| ``drop_duplicates``         | Y                               |                                  |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
 | ``duplicated``              | N                               |                                  |                                                    |
 +-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
diff --git a/src/snowflake/snowpark/modin/plugin/extensions/base_overrides.py b/src/snowflake/snowpark/modin/plugin/extensions/base_overrides.py
@@ -1607,6 +1607,8 @@ def drop_duplicates(
     """
     Return `BasePandasDataset` with duplicate rows removed.
     """
+    if keep not in ("first", "last", False):
+        raise ValueError('keep must be either "first", "last" or False')
     inplace = validate_bool_kwarg(inplace, "inplace")
     ignore_index = kwargs.get("ignore_index", False)
     subset = kwargs.get("subset", None)
diff --git a/src/snowflake/snowpark/modin/plugin/extensions/index.py b/src/snowflake/snowpark/modin/plugin/extensions/index.py
@@ -533,10 +533,10 @@ def drop(
         # TODO: SNOW-1458146 implement drop
         pass  # pragma: no cover
 
-    @index_not_implemented()
-    def drop_duplicates(self) -> None:
-        # TODO: SNOW-1458147 implement drop_duplicates
-        pass  # pragma: no cover
+    def drop_duplicates(self, keep="first") -> "Index":
+        if keep not in ("first", "last", False):  # validate before to_series()
+            raise ValueError('keep must be either "first", "last" or False')
+        return self.__constructor__(self.to_series().drop_duplicates(keep=keep))
 
     @index_not_implemented()
     def duplicated(self, keep: Literal["first", "last", False] = "first") -> np.ndarray:
diff --git a/tests/integ/modin/index/test_drop_duplicates.py b/tests/integ/modin/index/test_drop_duplicates.py
@@ -0,0 +1,76 @@
+#
+# Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
+#
+
+import modin.pandas as pd
+import numpy as np
+import pandas as native_pd
+import pytest
+
+import snowflake.snowpark.modin.plugin  # noqa: F401
+from tests.integ.modin.utils import assert_index_equal
+from tests.integ.utils.sql_counter import sql_count_checker
+
+
+@pytest.mark.parametrize("keep", ["first", "last", False])
+@sql_count_checker(query_count=1, join_count=2)
+def test_drop_duplicates(keep):
+    """Snowpark pandas matches native pandas for each supported ``keep`` value."""
+    native_index = native_pd.Index(["a", "b", "b", "c", "a"], name="name")
+    snow_index = pd.Index(native_index)
+
+    actual = snow_index.drop_duplicates(keep=keep)
+    expected = native_index.drop_duplicates(keep=keep)
+    assert_index_equal(actual, expected)
+
+
+@pytest.mark.parametrize("keep", ["first", "last", False])
+@sql_count_checker(query_count=1, join_count=2)
+def test_drop_duplicates_on_empty_index(keep):
+    """Dropping duplicates from an empty index matches native pandas."""
+    native_index = native_pd.Index([], name="name")
+    snow_index = pd.Index(native_index)
+
+    actual = snow_index.drop_duplicates(keep=keep)
+    expected = native_index.drop_duplicates(keep=keep)
+    assert_index_equal(actual, expected)
+
+
+@pytest.mark.parametrize(
+    "keep, expected",
+    [
+        ("first", native_pd.Index([np.nan, 3])),
+        ("last", native_pd.Index([3, np.nan])),
+        (False, native_pd.Index([], dtype="float64")),
+    ],
+)
+@sql_count_checker(query_count=1, join_count=2)
+def test_drop_duplicates_nan_none(keep, expected):
+    """Check ``keep`` handling when the index mixes ``np.nan`` and ``None``.
+
+    Snowpark pandas treats np.nan and None the same; the expected indexes are
+    spelled out in the parametrization rather than computed with native pandas.
+    """
+    snow_index = pd.Index([np.nan, 3, 3, None, np.nan], dtype=object)
+
+    assert_index_equal(snow_index.drop_duplicates(keep=keep), expected)
+
+
+@sql_count_checker(query_count=1, join_count=2)
+def test_drop_duplicates_default_keep():
+    """Omitting ``keep`` matches native pandas on an index with duplicates."""
+    # Non-empty on purpose: on an empty index every ``keep`` gives the same
+    # result, so the default would not actually be exercised.
+    pandas_idx = native_pd.Index(["a", "b", "b", "c", "a"], name="name")
+    snow_idx = pd.Index(pandas_idx)
+
+    assert_index_equal(snow_idx.drop_duplicates(), pandas_idx.drop_duplicates())
+
+
+@sql_count_checker(query_count=0, join_count=0)
+def test_drop_duplicates_invalid_keep():
+    """An unsupported ``keep`` raises ValueError without issuing any query."""
+    index = pd.Index(["a", "b", "b", "c", "a"], name="name")
+    expected_message = 'keep must be either "first", "last" or False'
+    with pytest.raises(ValueError, match=expected_message):
+        index.drop_duplicates(keep="invalid")
diff --git a/tests/integ/modin/series/test_drop_duplicates.py b/tests/integ/modin/series/test_drop_duplicates.py
@@ -73,3 +73,12 @@ def test_drop_duplicates_post_sort_values():
         check_dtype=False,
         check_index_type=False,
     )
+
+
+@sql_count_checker(query_count=0, join_count=0)
+def test_drop_duplicates_invalid_keep():
+    """An unsupported ``keep`` raises ValueError without issuing any query."""
+    series = pd.Series(["a", "b", "b", "c", "a"], name="name")
+    expected_message = 'keep must be either "first", "last" or False'
+    with pytest.raises(ValueError, match=expected_message):
+        series.drop_duplicates(keep="invalid")
diff --git a/tests/integ/modin/test_unimplemented.py b/tests/integ/modin/test_unimplemented.py
@@ -160,7 +160,6 @@ def test_unsupported_str_methods(func, func_name, caplog) -> None:
     lambda idx: idx.nbytes(),
     lambda idx: idx.memory_usage(),
     lambda idx: idx.delete(),
-    lambda idx: idx.drop_duplicates(),
     lambda idx: idx.factorize(),
     lambda idx: idx.insert(),
     lambda idx: idx.is_(),