Skip to content

Commit 82175a4

Browse files
feat: Add value_counts to GroupBy classes (#1974)
1 parent fedb8f2 commit 82175a4

File tree

8 files changed

+328
-64
lines changed

8 files changed

+328
-64
lines changed

bigframes/core/block_transforms.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -355,24 +355,28 @@ def value_counts(
355355
normalize: bool = False,
356356
sort: bool = True,
357357
ascending: bool = False,
358-
dropna: bool = True,
358+
drop_na: bool = True,
359+
grouping_keys: typing.Sequence[str] = (),
359360
):
360-
block, dummy = block.create_constant(1)
361+
if grouping_keys and drop_na:
362+
# only need this if grouping_keys is involved, otherwise the drop_na in the aggregation will handle it for us
363+
block = dropna(block, columns, how="any")
361364
block, agg_ids = block.aggregate(
362-
by_column_ids=columns,
363-
aggregations=[ex.UnaryAggregation(agg_ops.count_op, ex.deref(dummy))],
364-
dropna=dropna,
365+
by_column_ids=(*grouping_keys, *columns),
366+
aggregations=[ex.NullaryAggregation(agg_ops.size_op)],
367+
dropna=drop_na and not grouping_keys,
365368
)
366369
count_id = agg_ids[0]
367370
if normalize:
368-
unbound_window = windows.unbound()
371+
unbound_window = windows.unbound(grouping_keys=tuple(grouping_keys))
369372
block, total_count_id = block.apply_window_op(
370373
count_id, agg_ops.sum_op, unbound_window
371374
)
372375
block, count_id = block.apply_binary_op(count_id, total_count_id, ops.div_op)
373376

374377
if sort:
375-
block = block.order_by(
378+
order_parts = [ordering.ascending_over(id) for id in grouping_keys]
379+
order_parts.extend(
376380
[
377381
ordering.OrderingExpression(
378382
ex.deref(count_id),
@@ -382,6 +386,7 @@ def value_counts(
382386
)
383387
]
384388
)
389+
block = block.order_by(order_parts)
385390
return block.select_column(count_id).with_column_labels(
386391
["proportion" if normalize else "count"]
387392
)

bigframes/core/groupby/dataframe_group_by.py

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
import datetime
1818
import typing
19-
from typing import Literal, Sequence, Tuple, Union
19+
from typing import Literal, Optional, Sequence, Tuple, Union
2020

2121
import bigframes_vendored.constants as constants
2222
import bigframes_vendored.pandas.core.groupby as vendored_pandas_groupby
@@ -372,6 +372,39 @@ def diff(self, periods=1) -> series.Series:
372372
)
373373
return self._apply_window_op(agg_ops.DiffOp(periods), window=window)
374374

375+
def value_counts(
376+
self,
377+
subset: Optional[Sequence[blocks.Label]] = None,
378+
normalize: bool = False,
379+
sort: bool = True,
380+
ascending: bool = False,
381+
dropna: bool = True,
382+
) -> Union[df.DataFrame, series.Series]:
383+
if subset is None:
384+
columns = self._selected_cols
385+
else:
386+
columns = [
387+
column
388+
for column in self._block.value_columns
389+
if self._block.col_id_to_label[column] in subset
390+
]
391+
block = self._block
392+
if self._dropna: # this drops null grouping columns
393+
block = block_ops.dropna(block, self._by_col_ids)
394+
block = block_ops.value_counts(
395+
block,
396+
columns,
397+
normalize=normalize,
398+
sort=sort,
399+
ascending=ascending,
400+
drop_na=dropna, # this drops null value columns
401+
grouping_keys=self._by_col_ids,
402+
)
403+
if self._as_index:
404+
return series.Series(block)
405+
else:
406+
return series.Series(block).to_frame().reset_index(drop=False)
407+
375408
@validations.requires_ordering()
376409
def rolling(
377410
self,

bigframes/core/groupby/series_group_by.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,30 @@ def agg(self, func=None) -> typing.Union[df.DataFrame, series.Series]:
244244

245245
aggregate = agg
246246

247+
def value_counts(
248+
self,
249+
normalize: bool = False,
250+
sort: bool = True,
251+
ascending: bool = False,
252+
dropna: bool = True,
253+
) -> Union[df.DataFrame, series.Series]:
254+
columns = [self._value_column]
255+
block = self._block
256+
if self._dropna: # this drops null grouping columns
257+
block = block_ops.dropna(block, self._by_col_ids)
258+
block = block_ops.value_counts(
259+
block,
260+
columns,
261+
normalize=normalize,
262+
sort=sort,
263+
ascending=ascending,
264+
drop_na=dropna, # this drops null value columns
265+
grouping_keys=self._by_col_ids,
266+
)
267+
# TODO: once as_index=False is supported, return DataFrame instead by resetting index
268+
# with .to_frame().reset_index(drop=False)
269+
return series.Series(block)
270+
247271
@validations.requires_ordering()
248272
def cumsum(self, *args, **kwargs) -> series.Series:
249273
return self._apply_window_op(

bigframes/core/indexes/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -489,7 +489,7 @@ def value_counts(
489489
self._block.index_columns,
490490
normalize=normalize,
491491
ascending=ascending,
492-
dropna=dropna,
492+
drop_na=dropna,
493493
)
494494
import bigframes.series as series
495495

bigframes/dataframe.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2475,7 +2475,7 @@ def value_counts(
24752475
normalize=normalize,
24762476
sort=sort,
24772477
ascending=ascending,
2478-
dropna=dropna,
2478+
drop_na=dropna,
24792479
)
24802480
return bigframes.series.Series(block)
24812481

bigframes/series.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1631,7 +1631,7 @@ def value_counts(
16311631
[self._value_column],
16321632
normalize=normalize,
16331633
ascending=ascending,
1634-
dropna=dropna,
1634+
drop_na=dropna,
16351635
)
16361636
return Series(block)
16371637

tests/system/small/test_groupby.py

Lines changed: 130 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -582,6 +582,101 @@ def test_dataframe_groupby_nonnumeric_with_mean():
582582
)
583583

584584

585+
@pytest.mark.parametrize(
586+
("subset", "normalize", "ascending", "dropna", "as_index"),
587+
[
588+
(None, True, True, True, True),
589+
(["int64_too", "int64_col"], False, False, False, False),
590+
],
591+
)
592+
def test_dataframe_groupby_value_counts(
593+
scalars_df_index,
594+
scalars_pandas_df_index,
595+
subset,
596+
normalize,
597+
ascending,
598+
dropna,
599+
as_index,
600+
):
601+
if pd.__version__.startswith("1."):
602+
pytest.skip("pandas 1.x produces different column labels.")
603+
col_names = ["float64_col", "int64_col", "bool_col", "int64_too"]
604+
bf_result = (
605+
scalars_df_index[col_names]
606+
.groupby("bool_col", as_index=as_index)
607+
.value_counts(
608+
subset=subset, normalize=normalize, ascending=ascending, dropna=dropna
609+
)
610+
.to_pandas()
611+
)
612+
pd_result = (
613+
scalars_pandas_df_index[col_names]
614+
.groupby("bool_col", as_index=as_index)
615+
.value_counts(
616+
subset=subset, normalize=normalize, ascending=ascending, dropna=dropna
617+
)
618+
)
619+
620+
if as_index:
621+
pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False)
622+
else:
623+
pd_result.index = pd_result.index.astype("Int64")
624+
pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False)
625+
626+
627+
@pytest.mark.parametrize(
628+
("numeric_only", "min_count"),
629+
[
630+
(False, 4),
631+
(True, 0),
632+
],
633+
)
634+
def test_dataframe_groupby_first(
635+
scalars_df_index, scalars_pandas_df_index, numeric_only, min_count
636+
):
637+
# min_count seems to not work properly on older pandas
638+
pytest.importorskip("pandas", minversion="2.0.0")
639+
# bytes, dates not handling min_count properly in pandas
640+
bf_result = (
641+
scalars_df_index.drop(columns=["bytes_col", "date_col"])
642+
.groupby(scalars_df_index.int64_col % 2)
643+
.first(numeric_only=numeric_only, min_count=min_count)
644+
).to_pandas()
645+
pd_result = (
646+
scalars_pandas_df_index.drop(columns=["bytes_col", "date_col"])
647+
.groupby(scalars_pandas_df_index.int64_col % 2)
648+
.first(numeric_only=numeric_only, min_count=min_count)
649+
)
650+
pd.testing.assert_frame_equal(
651+
pd_result,
652+
bf_result,
653+
)
654+
655+
656+
@pytest.mark.parametrize(
657+
("numeric_only", "min_count"),
658+
[
659+
(True, 2),
660+
(False, -1),
661+
],
662+
)
663+
def test_dataframe_groupby_last(
664+
scalars_df_index, scalars_pandas_df_index, numeric_only, min_count
665+
):
666+
bf_result = (
667+
scalars_df_index.groupby(scalars_df_index.int64_col % 2).last(
668+
numeric_only=numeric_only, min_count=min_count
669+
)
670+
).to_pandas()
671+
pd_result = scalars_pandas_df_index.groupby(
672+
scalars_pandas_df_index.int64_col % 2
673+
).last(numeric_only=numeric_only, min_count=min_count)
674+
pd.testing.assert_frame_equal(
675+
pd_result,
676+
bf_result,
677+
)
678+
679+
585680
# ==============
586681
# Series.groupby
587682
# ==============
@@ -770,6 +865,41 @@ def test_series_groupby_quantile(scalars_df_index, scalars_pandas_df_index, q):
770865
)
771866

772867

868+
@pytest.mark.parametrize(
869+
("normalize", "ascending", "dropna"),
870+
[
871+
(
872+
True,
873+
True,
874+
True,
875+
),
876+
(
877+
False,
878+
False,
879+
False,
880+
),
881+
],
882+
)
883+
def test_series_groupby_value_counts(
884+
scalars_df_index,
885+
scalars_pandas_df_index,
886+
normalize,
887+
ascending,
888+
dropna,
889+
):
890+
if pd.__version__.startswith("1."):
891+
pytest.skip("pandas 1.x produces different column labels.")
892+
bf_result = (
893+
scalars_df_index.groupby("bool_col")["string_col"]
894+
.value_counts(normalize=normalize, ascending=ascending, dropna=dropna)
895+
.to_pandas()
896+
)
897+
pd_result = scalars_pandas_df_index.groupby("bool_col")["string_col"].value_counts(
898+
normalize=normalize, ascending=ascending, dropna=dropna
899+
)
900+
pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False)
901+
902+
773903
@pytest.mark.parametrize(
774904
("numeric_only", "min_count"),
775905
[
@@ -813,56 +943,3 @@ def test_series_groupby_last(
813943
numeric_only=numeric_only, min_count=min_count
814944
)
815945
pd.testing.assert_series_equal(pd_result, bf_result)
816-
817-
818-
@pytest.mark.parametrize(
819-
("numeric_only", "min_count"),
820-
[
821-
(False, 4),
822-
(True, 0),
823-
],
824-
)
825-
def test_dataframe_groupby_first(
826-
scalars_df_index, scalars_pandas_df_index, numeric_only, min_count
827-
):
828-
# min_count seems to not work properly on older pandas
829-
pytest.importorskip("pandas", minversion="2.0.0")
830-
# bytes, dates not handling min_count properly in pandas
831-
bf_result = (
832-
scalars_df_index.drop(columns=["bytes_col", "date_col"])
833-
.groupby(scalars_df_index.int64_col % 2)
834-
.first(numeric_only=numeric_only, min_count=min_count)
835-
).to_pandas()
836-
pd_result = (
837-
scalars_pandas_df_index.drop(columns=["bytes_col", "date_col"])
838-
.groupby(scalars_pandas_df_index.int64_col % 2)
839-
.first(numeric_only=numeric_only, min_count=min_count)
840-
)
841-
pd.testing.assert_frame_equal(
842-
pd_result,
843-
bf_result,
844-
)
845-
846-
847-
@pytest.mark.parametrize(
848-
("numeric_only", "min_count"),
849-
[
850-
(True, 2),
851-
(False, -1),
852-
],
853-
)
854-
def test_dataframe_groupby_last(
855-
scalars_df_index, scalars_pandas_df_index, numeric_only, min_count
856-
):
857-
bf_result = (
858-
scalars_df_index.groupby(scalars_df_index.int64_col % 2).last(
859-
numeric_only=numeric_only, min_count=min_count
860-
)
861-
).to_pandas()
862-
pd_result = scalars_pandas_df_index.groupby(
863-
scalars_pandas_df_index.int64_col % 2
864-
).last(numeric_only=numeric_only, min_count=min_count)
865-
pd.testing.assert_frame_equal(
866-
pd_result,
867-
bf_result,
868-
)

0 commit comments

Comments
 (0)