Skip to content

Commit 93131b0

Browse files
Correct support for drop_duplicates (#466)
* Correct support for `drop_duplicates`
* Resolves #464
* No longer rely directly on index values
* Corrects issue introduced in #433
* Add tests
* Adding test case validating order after a sort
* Resolves #465
* Lint
* Address comments
1 parent ff8960e commit 93131b0

File tree

2 files changed

+95
-2
lines changed

2 files changed

+95
-2
lines changed

modin/pandas/dataframe.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1277,8 +1277,8 @@ def drop_duplicates(self, subset=None, keep="first", inplace=False):
12771277
"""
12781278
inplace = validate_bool_kwarg(inplace, "inplace")
12791279
duplicates = self.duplicated(subset=subset, keep=keep)
1280-
indices, = duplicates.nonzero()
1281-
return self.drop(indices, inplace=inplace)
1280+
indices, = duplicates.values.nonzero()
1281+
return self.drop(index=self.index[indices], inplace=inplace)
12821282

12831283
def duplicated(self, subset=None, keep="first"):
12841284
return self._default_to_pandas(

modin/pandas/test/test_dataframe.py

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1486,6 +1486,99 @@ def test_drop_duplicates(data):
14861486
df_equals(modin_df, pandas.DataFrame({"A": [], "B": [], "C": []}))
14871487

14881488

1489+
def test_drop_duplicates_with_missing_index_values():
    """Compare modin and pandas ``drop_duplicates`` on a frame with a gapped index.

    The integer index labels jump from 15 to 20 and from 27 to 32, so a row's
    label differs from its position. Both libraries sort by ``id`` and ``time``
    and then drop duplicate ``id`` rows; the results must be equal.
    """
    # Labels 4-15, 20-27 and 32-41: thirty labels, one per data row.
    index_labels = list(range(4, 16)) + list(range(20, 28)) + list(range(32, 42))
    rows = [
        ["3", 1279213398000.0, 88.0],
        ["3", 1279204682000.0, 88.0],
        ["0", 1245772835000.0, 448.0],
        ["0", 1270564258000.0, 32.0],
        ["0", 1267106669000.0, 118.0],
        ["7", 1300621123000.0, 5.0],
        ["0", 1251130752000.0, 957.0],
        ["0", 1311683506000.0, 62.0],
        ["9", 1283692698000.0, 89.0],
        ["9", 1270234253000.0, 64.0],
        ["0", 1285088818000.0, 50.0],
        ["0", 1218212725000.0, 695.0],
        ["2", 1383933968000.0, 348.0],
        ["0", 1368227625000.0, 257.0],
        ["1", 1454514093000.0, 446.0],
        ["1", 1428497427000.0, 134.0],
        ["1", 1459184936000.0, 568.0],
        ["1", 1502293302000.0, 599.0],
        ["1", 1491833358000.0, 829.0],
        ["1", 1485431534000.0, 806.0],
        ["8", 1351800505000.0, 101.0],
        ["0", 1357247721000.0, 916.0],
        ["0", 1335804423000.0, 370.0],
        ["24", 1327547726000.0, 720.0],
        ["0", 1332334140000.0, 415.0],
        ["0", 1309543100000.0, 30.0],
        ["18", 1309541141000.0, 30.0],
        ["0", 1298979435000.0, 48.0],
        ["14", 1276098160000.0, 59.0],
        ["0", 1233936302000.0, 109.0],
    ]
    data = {
        "columns": ["value", "time", "id"],
        "index": index_labels,
        "data": rows,
    }

    def sort_then_dedup(frame):
        # Fresh key lists on every call, so neither library sees the other's.
        ordered = frame.sort_values(["id", "time"])
        return ordered.drop_duplicates(["id"])

    pandas_df = pandas.DataFrame(
        data["data"], index=data["index"], columns=data["columns"]
    )
    modin_df = pd.DataFrame(
        data["data"], index=data["index"], columns=data["columns"]
    )

    modin_result = sort_then_dedup(modin_df)
    pandas_result = sort_then_dedup(pandas_df)
    df_equals(modin_result, pandas_result)
1565+
1566+
1567+
def test_drop_duplicates_after_sort():
    """Compare modin and pandas ``drop_duplicates`` on a freshly sorted frame.

    The rows are deliberately out of order on ``time`` within ``value == 1``,
    so sorting reorders them before duplicate ``value`` rows are dropped.
    """
    # (value, time) pairs, expanded into one record per row.
    pairs = [(1, 2), (1, 1), (2, 1), (2, 2)]
    data = [{"value": value, "time": time} for value, time in pairs]

    def sort_then_dedup(frame):
        # Fresh key lists on every call, so neither library sees the other's.
        ordered = frame.sort_values(["value", "time"])
        return ordered.drop_duplicates(["value"])

    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    modin_result = sort_then_dedup(modin_df)
    pandas_result = sort_then_dedup(pandas_df)
    df_equals(modin_result, pandas_result)
1580+
1581+
14891582
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
14901583
@pytest.mark.parametrize("axis", axis_values, ids=axis_keys)
14911584
@pytest.mark.parametrize("how", ["any", "all"], ids=["any", "all"])

0 commit comments

Comments
 (0)