Skip to content
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@
- Added support for `expand=True` in `Series.str.split`.
- Added support for `DataFrame.pop` and `Series.pop`.
- Added support for `first` and `last` in `DataFrameGroupBy.agg` and `SeriesGroupBy.agg`.
- Added support for `Index.drop_duplicates`.

#### Bug Fixes

Expand Down
1 change: 1 addition & 0 deletions docs/source/modin/indexing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ Index
Index.argmin
Index.argmax
Index.copy
Index.drop_duplicates
Index.equals
Index.identical
Index.is_boolean
Expand Down
2 changes: 1 addition & 1 deletion docs/source/modin/supported/index_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ Methods
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``drop`` | N | | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``drop_duplicates`` | N | | |
| ``drop_duplicates`` | Y | | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
| ``duplicated`` | N | | |
+-----------------------------+---------------------------------+----------------------------------+----------------------------------------------------+
Expand Down
8 changes: 4 additions & 4 deletions src/snowflake/snowpark/modin/plugin/extensions/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -533,10 +533,10 @@ def drop(
# TODO: SNOW-1458146 implement drop
pass # pragma: no cover

def drop_duplicates(
    self, keep: Literal["first", "last", False] = "first"
) -> "Index":
    """
    Return a new index with duplicate values removed.

    Parameters
    ----------
    keep : {"first", "last", False}, default "first"
        Which occurrence of a duplicated value to retain, following the
        pandas ``drop_duplicates`` convention:

        - ``"first"``: drop duplicates except for the first occurrence.
        - ``"last"``: drop duplicates except for the last occurrence.
        - ``False``: drop every value that occurs more than once.

    Returns
    -------
    Index
        A new index built via ``self.__constructor__`` from the
        de-duplicated values.

    Raises
    ------
    ValueError
        If ``keep`` is not one of ``"first"``, ``"last"`` or ``False``.
    """
    # Validate eagerly so that an invalid argument fails before any work is
    # delegated to the Series implementation.
    if keep not in ("first", "last", False):
        raise ValueError('keep must be either "first", "last" or False')
    # Reuse the Series implementation and wrap the result back into an index.
    deduplicated = self.to_series().drop_duplicates(keep=keep)
    return self.__constructor__(deduplicated)

@index_not_implemented()
def duplicated(self, keep: Literal["first", "last", False] = "first") -> np.ndarray:
Expand Down
75 changes: 75 additions & 0 deletions tests/integ/modin/index/test_drop_duplicates.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#
# Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
#

import modin.pandas as pd
import numpy as np
import pandas as native_pd
import pytest

import snowflake.snowpark.modin.plugin # noqa: F401
from tests.integ.modin.utils import assert_index_equal
from tests.integ.utils.sql_counter import sql_count_checker


@pytest.mark.parametrize("keep", ["first", "last", False])
@sql_count_checker(query_count=1, join_count=2)
def test_drop_duplicates(keep):
    """Compare ``Index.drop_duplicates`` with native pandas for each ``keep``."""
    native_index = native_pd.Index(["a", "b", "b", "c", "a"], name="name")
    snow_index = pd.Index(native_index)

    # Evaluate the Snowpark result first, then the native pandas reference.
    actual = snow_index.drop_duplicates(keep=keep)
    expected = native_index.drop_duplicates(keep=keep)
    assert_index_equal(actual, expected)


@pytest.mark.parametrize("keep", ["first", "last", False])
@sql_count_checker(query_count=1, join_count=2)
def test_drop_duplicates_on_empty_index(keep):
    """Compare ``Index.drop_duplicates`` with native pandas on an empty index."""
    native_index = native_pd.Index([], name="name")
    snow_index = pd.Index(native_index)

    # Evaluate the Snowpark result first, then the native pandas reference.
    actual = snow_index.drop_duplicates(keep=keep)
    expected = native_index.drop_duplicates(keep=keep)
    assert_index_equal(actual, expected)


@pytest.mark.parametrize(
    "keep, expected",
    [
        ("first", native_pd.Index([np.nan, 3])),
        ("last", native_pd.Index([3, np.nan])),
        (False, native_pd.Index([], dtype="float64")),
    ],
)
@sql_count_checker(query_count=1, join_count=2)
def test_drop_duplicates_nan_none(keep, expected):
    """Check ``Index.drop_duplicates`` on an index mixing ``np.nan`` and ``None``.

    Snowpark pandas treats ``np.nan`` and ``None`` as the same value, so the
    expected results are spelled out explicitly instead of being computed
    with native pandas.
    """
    snow_index = pd.Index([np.nan, 3, 3, None, np.nan], dtype=object)

    assert_index_equal(snow_index.drop_duplicates(keep=keep), expected)


@sql_count_checker(query_count=1, join_count=2)
def test_drop_duplicates_default_keep():
    """Check that omitting ``keep`` matches native pandas.

    The index contains duplicates on purpose: on an empty index every value
    of ``keep`` gives the same (empty) result, so the default could not be
    told apart from any other setting.
    """
    native_index = native_pd.Index(["a", "b", "b", "c", "a"], name="name")
    snow_index = pd.Index(native_index)

    assert_index_equal(
        snow_index.drop_duplicates(),
        native_index.drop_duplicates(),
    )


@sql_count_checker(query_count=0, join_count=0)
def test_drop_duplicates_invalid_keep():
    """Check that an unsupported ``keep`` raises ``ValueError`` without running a query."""
    snow_index = pd.Index(["a", "b", "b", "c", "a"], name="name")
    # ``match`` is applied with ``re.search`` to the exception message itself;
    # the expected text contains no regex metacharacters, so it is used as-is.
    # This avoids inspecting the string form of the ExceptionInfo wrapper and
    # guarantees the message check always runs.
    with pytest.raises(
        ValueError, match='keep must be either "first", "last" or False'
    ):
        snow_index.drop_duplicates(keep="invalid")
Loading