Skip to content

Commit 8db5109

Browse files
SNOW-1893939: Add support for Index.drop_duplicates (#2923)
1. Which Jira issue is this PR addressing? Make sure that there is an accompanying issue to your PR.

   Fixes [SNOW-1893939](https://snowflakecomputing.atlassian.net/browse/SNOW-1893939)

2. Fill out the following pre-review checklist:

   - [x] I am adding a new automated test(s) to verify correctness of my new code
   - [ ] If this test skips Local Testing mode, I'm requesting review from @snowflakedb/local-testing
   - [ ] I am adding new logging messages
   - [ ] I am adding a new telemetry message
   - [ ] I am adding new credentials
   - [ ] I am adding a new dependency
   - [ ] If this is a new feature/behavior, I'm adding the Local Testing parity changes.
   - [ ] I acknowledge that I have ensured my changes to be thread-safe. Follow the link for more information: [Thread-safe Developer Guidelines](https://github.com/snowflakedb/snowpark-python/blob/main/CONTRIBUTING.md#thread-safe-development)

3. Please describe how your code solves the related issue.

   Added support for Index.drop_duplicates.

[SNOW-1893939]: https://snowflakecomputing.atlassian.net/browse/SNOW-1893939?atlOrigin=eyJpIjoiNWRkNTljNzYxNjVmNDY3MDlhMDU5Y2ZhYzA5YTRkZjUiLCJwIjoiZ2l0aHViLWNvbS1KU1cifQ
1 parent e1aa089 commit 8db5109

File tree

8 files changed

+94
-6
lines changed

8 files changed

+94
-6
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@
9999
- Added support for `expand=True` in `Series.str.split`.
100100
- Added support for `DataFrame.pop` and `Series.pop`.
101101
- Added support for `first` and `last` in `DataFrameGroupBy.agg` and `SeriesGroupBy.agg`.
102+
- Added support for `Index.drop_duplicates`.
102103

103104
#### Bug Fixes
104105

docs/source/modin/indexing.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ Index
5353
Index.argmin
5454
Index.argmax
5555
Index.copy
56+
Index.drop_duplicates
5657
Index.equals
5758
Index.identical
5859
Index.is_boolean

docs/source/modin/supported/index_supported.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ Methods
7979
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
8080
| ``drop`` | N | | |
8181
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
82-
| ``drop_duplicates`` | N | | |
82+
| ``drop_duplicates`` | Y | | |
8383
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
8484
| ``duplicated`` | N | | |
8585
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+

src/snowflake/snowpark/modin/plugin/extensions/base_overrides.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1607,6 +1607,8 @@ def drop_duplicates(
16071607
"""
16081608
Return `BasePandasDataset` with duplicate rows removed.
16091609
"""
1610+
if keep not in ("first", "last", False):
1611+
raise ValueError('keep must be either "first", "last" or False')
16101612
inplace = validate_bool_kwarg(inplace, "inplace")
16111613
ignore_index = kwargs.get("ignore_index", False)
16121614
subset = kwargs.get("subset", None)

src/snowflake/snowpark/modin/plugin/extensions/index.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -533,10 +533,10 @@ def drop(
533533
# TODO: SNOW-1458146 implement drop
534534
pass # pragma: no cover
535535

536-
@index_not_implemented()
537-
def drop_duplicates(self) -> None:
538-
# TODO: SNOW-1458147 implement drop_duplicates
539-
pass # pragma: no cover
def drop_duplicates(
    self, keep: Literal["first", "last", False] = "first"
) -> "Index":
    """
    Return Index with duplicate values removed.

    Parameters
    ----------
    keep : {"first", "last", False}, default "first"
        Which occurrence of a duplicated value to retain:

        - "first": keep the first occurrence of each value.
        - "last": keep the last occurrence of each value.
        - False: drop every value that occurs more than once.

    Returns
    -------
    Index
        A new index built from the de-duplicated values.

    Raises
    ------
    ValueError
        If ``keep`` is not one of "first", "last" or False.
    """
    # Validate up front so an invalid ``keep`` is rejected before the index
    # is converted to a Series.
    if keep not in ("first", "last", False):
        raise ValueError('keep must be either "first", "last" or False')
    # Delegate the de-duplication to Series.drop_duplicates and rebuild an
    # index from the result.
    return self.__constructor__(self.to_series().drop_duplicates(keep=keep))
540540

541541
@index_not_implemented()
542542
def duplicated(self, keep: Literal["first", "last", False] = "first") -> np.ndarray:
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
#
2+
# Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
3+
#
4+
5+
import modin.pandas as pd
6+
import numpy as np
7+
import pandas as native_pd
8+
import pytest
9+
10+
import snowflake.snowpark.modin.plugin # noqa: F401
11+
from tests.integ.modin.utils import assert_index_equal
12+
from tests.integ.utils.sql_counter import sql_count_checker
13+
14+
@pytest.mark.parametrize("keep", ["first", "last", False])
@sql_count_checker(query_count=1, join_count=2)
def test_drop_duplicates(keep):
    """Compare Index.drop_duplicates against native pandas for each ``keep``."""
    native_index = native_pd.Index(["a", "b", "b", "c", "a"], name="name")
    snow_index = pd.Index(native_index)

    actual = snow_index.drop_duplicates(keep=keep)
    expected = native_index.drop_duplicates(keep=keep)
    assert_index_equal(actual, expected)
25+
26+
@pytest.mark.parametrize("keep", ["first", "last", False])
@sql_count_checker(query_count=1, join_count=2)
def test_drop_duplicates_on_empty_index(keep):
    """Compare Index.drop_duplicates against native pandas on an empty index."""
    native_index = native_pd.Index([], name="name")
    snow_index = pd.Index(native_index)

    actual = snow_index.drop_duplicates(keep=keep)
    expected = native_index.drop_duplicates(keep=keep)
    assert_index_equal(actual, expected)
37+
38+
@pytest.mark.parametrize(
    "keep, expected",
    [
        pytest.param("first", native_pd.Index([np.nan, 3])),
        pytest.param("last", native_pd.Index([3, np.nan])),
        pytest.param(False, native_pd.Index([], dtype="float64")),
    ],
)
@sql_count_checker(query_count=1, join_count=2)
def test_drop_duplicates_nan_none(keep, expected):
    """Check de-duplication of an index mixing ``np.nan`` and ``None``.

    Note that Snowpark pandas treats np.nan and None the same, so the
    expected results are spelled out explicitly rather than computed with
    native pandas.
    """
    snow_index = pd.Index([np.nan, 3, 3, None, np.nan], dtype=object)

    assert_index_equal(snow_index.drop_duplicates(keep=keep), expected)
57+
58+
@sql_count_checker(query_count=1, join_count=2)
def test_drop_duplicates_default_keep():
    """Calling ``drop_duplicates()`` without ``keep`` matches native pandas.

    The index deliberately contains duplicates: on an empty index every
    ``keep`` mode yields the same result, so the default would go unchecked.
    """
    pandas_idx = native_pd.Index(["a", "b", "b", "c", "a"], name="name")
    snow_idx = pd.Index(pandas_idx)

    assert_index_equal(
        snow_idx.drop_duplicates(),
        pandas_idx.drop_duplicates(),
    )
68+
69+
@sql_count_checker(query_count=0, join_count=0)
def test_drop_duplicates_invalid_keep():
    """An unsupported ``keep`` value raises ValueError without issuing queries."""
    expected_message = 'keep must be either "first", "last" or False'
    snow_index = pd.Index(["a", "b", "b", "c", "a"], name="name")

    with pytest.raises(ValueError, match=expected_message):
        snow_index.drop_duplicates(keep="invalid")

tests/integ/modin/series/test_drop_duplicates.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,3 +73,12 @@ def test_drop_duplicates_post_sort_values():
7373
check_dtype=False,
7474
check_index_type=False,
7575
)
76+
77+
@sql_count_checker(query_count=0, join_count=0)
def test_drop_duplicates_invalid_keep():
    """An unsupported ``keep`` value raises ValueError without issuing queries."""
    expected_message = 'keep must be either "first", "last" or False'
    snow_series = pd.Series(["a", "b", "b", "c", "a"], name="name")

    with pytest.raises(ValueError, match=expected_message):
        snow_series.drop_duplicates(keep="invalid")

tests/integ/modin/test_unimplemented.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,6 @@ def test_unsupported_str_methods(func, func_name, caplog) -> None:
160160
lambda idx: idx.nbytes(),
161161
lambda idx: idx.memory_usage(),
162162
lambda idx: idx.delete(),
163-
lambda idx: idx.drop_duplicates(),
164163
lambda idx: idx.factorize(),
165164
lambda idx: idx.insert(),
166165
lambda idx: idx.is_(),

0 commit comments

Comments
 (0)