Skip to content

Commit 010902f

Browse files
Fixing issue with duplicated rows when looping through groupby object (#459)
* Fixing issue with duplicated rows when looping through groupby object
* Resolves #458
* Only gets the unique values of the index
* Add test to validate changes
* Revert change to axis=1
* Lint
* Lint
* Add fix that Resolves #460
1 parent 5f72f94 commit 010902f

File tree

2 files changed

+32
-3
lines changed

2 files changed

+32
-3
lines changed

modin/pandas/groupby.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -41,7 +41,7 @@ def __init__(
4141
self._columns = self._query_compiler.columns
4242
self._by = by
4343
# This tells us whether or not there are multiple columns/rows in the groupby
44-
self._is_multi_by = all(obj in self._df for obj in self._by)
44+
self._is_multi_by = all(obj in self._df for obj in self._by) and axis == 0
4545
self._level = level
4646
self._kwargs = {
4747
"sort": sort,
@@ -113,7 +113,7 @@ def _iter(self):
113113
k,
114114
DataFrame(
115115
query_compiler=self._query_compiler.getitem_row_array(
116-
self._index.get_indexer_for(self._index_grouped[k])
116+
self._index.get_indexer_for(self._index_grouped[k].unique())
117117
)
118118
),
119119
)
@@ -125,7 +125,7 @@ def _iter(self):
125125
k,
126126
DataFrame(
127127
query_compiler=self._query_compiler.getitem_column_array(
128-
self._index_grouped[k]
128+
self._index_grouped[k].unique()
129129
)
130130
),
131131
)

modin/pandas/test/test_groupby.py

Lines changed: 29 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -764,3 +764,32 @@ def test_groups(ray_groupby, pandas_groupby):
764764
@pytest.fixture
765765
def test_shift(ray_groupby, pandas_groupby):
766766
assert ray_groupby.groups == pandas_groupby.groups
767+
768+
769+
def test_groupby_on_index_values_with_loop():
770+
length = 2 ** 6
771+
data = {
772+
"a": np.random.randint(0, 100, size=length),
773+
"b": np.random.randint(0, 100, size=length),
774+
"c": np.random.randint(0, 100, size=length),
775+
}
776+
idx = ["g1" if i % 3 != 0 else "g2" for i in range(length)]
777+
modin_df = pd.DataFrame(data, index=idx, columns=list("aba"))
778+
pandas_df = pandas.DataFrame(data, index=idx, columns=list("aba"))
779+
modin_groupby_obj = modin_df.groupby(modin_df.index)
780+
pandas_groupby_obj = pandas_df.groupby(pandas_df.index)
781+
782+
modin_dict = {k: v for k, v in modin_groupby_obj}
783+
pandas_dict = {k: v for k, v in pandas_groupby_obj}
784+
785+
for k in modin_dict:
786+
ray_df_equals_pandas(modin_dict[k], pandas_dict[k])
787+
788+
modin_groupby_obj = modin_df.groupby(modin_df.columns, axis=1)
789+
pandas_groupby_obj = pandas_df.groupby(pandas_df.columns, axis=1)
790+
791+
modin_dict = {k: v for k, v in modin_groupby_obj}
792+
pandas_dict = {k: v for k, v in pandas_groupby_obj}
793+
794+
for k in modin_dict:
795+
ray_df_equals_pandas(modin_dict[k], pandas_dict[k])

0 commit comments

Comments
 (0)