Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.4.rst
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ Bug fixes
~~~~~~~~~
- The :class:`errors.FutureWarning` raised when passing arguments (other than ``filepath_or_buffer``) as positional in :func:`read_csv` is now raised at the correct stacklevel (:issue:`47385`)
- Bug in :meth:`DataFrame.to_sql` when ``method`` was a ``callable`` that did not return an ``int`` and would raise a ``TypeError`` (:issue:`46891`)
- Bug in :meth:`DataFrameGroupBy.value_counts` where ``subset`` had no effect (:issue:`46383`)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

will be changing issue number to #46383 in #48314

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks @simonjayhawkins! Sounds good

- Bug in :meth:`loc.__getitem__` with a list of keys causing an internal inconsistency that could lead to a disconnect between ``frame.at[x, y]`` and ``frame[y].loc[x]`` (:issue:`22372`)
- Bug in the :meth:`Series.dt.strftime` accessor returning a float instead of object dtype Series for all-NaT input, which also caused a spurious deprecation warning (:issue:`45858`)

Expand Down
21 changes: 12 additions & 9 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1805,21 +1805,24 @@ def value_counts(
name = self._selected_obj.name
keys = [] if name in in_axis_names else [self._selected_obj]
else:
if subset is not None:
subsetted = set(subset)
clashing = subsetted & set(in_axis_names)
if clashing:
raise ValueError(
f"Keys {clashing} in subset cannot be in "
"the groupby column keys"
)
else:
subsetted = set(self._selected_obj.columns)

keys = [
# Can't use .values because the column label needs to be preserved
self._selected_obj.iloc[:, idx]
for idx, name in enumerate(self._selected_obj.columns)
if name not in in_axis_names
if name not in in_axis_names and name in subsetted
]

if subset is not None:
clashing = set(subset) & set(in_axis_names)
if clashing:
raise ValueError(
f"Keys {clashing} in subset cannot be in "
"the groupby column keys"
)

groupings = list(self.grouper.groupings)
for key in keys:
grouper, _, _ = get_grouper(
Expand Down
17 changes: 17 additions & 0 deletions pandas/tests/groupby/test_frame_value_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -738,3 +738,20 @@ def test_ambiguous_grouping():
result = gb.value_counts()
expected = Series([2], index=MultiIndex.from_tuples([[1, 1]], names=[None, "a"]))
tm.assert_series_equal(result, expected)


def test_subset_overlaps_gb_key_raises():
    """Listing a groupby key column in ``subset`` raises ``ValueError``."""
    # GH 46383
    frame = DataFrame(
        {"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]},
        index=[0, 1, 1],
    )
    # "c1" is both the grouping key and the only requested subset column.
    with pytest.raises(ValueError, match="Keys {'c1'}"):
        frame.groupby("c1").value_counts(subset=["c1"])


def test_subset():
    """Only the columns named in ``subset`` contribute to the counts."""
    # GH 46383
    data = {"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}
    frame = DataFrame(data, index=[0, 1, 1])

    # Grouping by index level 0 and counting "c2" only: "c1" must not
    # appear among the levels of the resulting index.
    expected_index = MultiIndex.from_arrays(
        [[0, 1], ["x", "y"]],
        names=[None, "c2"],
    )
    expected = Series([1, 2], index=expected_index)

    result = frame.groupby(level=0).value_counts(subset=["c2"])
    tm.assert_series_equal(result, expected)