Skip to content

Commit 82175a4

Browse files
feat: Add value_counts to GroupBy classes (#1974)
1 parent fedb8f2 commit 82175a4

File tree

8 files changed

+328
-64
lines changed

8 files changed

+328
-64
lines changed

bigframes/core/block_transforms.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -355,24 +355,28 @@ def value_counts(
355355
normalize: bool = False,
356356
sort: bool = True,
357357
ascending: bool = False,
358-
dropna: bool = True,
358+
drop_na: bool = True,
359+
grouping_keys: typing.Sequence[str] = (),
359360
):
360-
block, dummy = block.create_constant(1)
361+
if grouping_keys and drop_na:
362+
# only need this if grouping_keys is involved, otherwise the drop_na in the aggregation will handle it for us
363+
block = dropna(block, columns, how="any")
361364
block, agg_ids = block.aggregate(
362-
by_column_ids=columns,
363-
aggregations=[ex.UnaryAggregation(agg_ops.count_op, ex.deref(dummy))],
364-
dropna=dropna,
365+
by_column_ids=(*grouping_keys, *columns),
366+
aggregations=[ex.NullaryAggregation(agg_ops.size_op)],
367+
dropna=drop_na and not grouping_keys,
365368
)
366369
count_id = agg_ids[0]
367370
if normalize:
368-
unbound_window = windows.unbound()
371+
unbound_window = windows.unbound(grouping_keys=tuple(grouping_keys))
369372
block, total_count_id = block.apply_window_op(
370373
count_id, agg_ops.sum_op, unbound_window
371374
)
372375
block, count_id = block.apply_binary_op(count_id, total_count_id, ops.div_op)
373376

374377
if sort:
375-
block = block.order_by(
378+
order_parts = [ordering.ascending_over(id) for id in grouping_keys]
379+
order_parts.extend(
376380
[
377381
ordering.OrderingExpression(
378382
ex.deref(count_id),
@@ -382,6 +386,7 @@ def value_counts(
382386
)
383387
]
384388
)
389+
block = block.order_by(order_parts)
385390
return block.select_column(count_id).with_column_labels(
386391
["proportion" if normalize else "count"]
387392
)

bigframes/core/groupby/dataframe_group_by.py

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
import datetime
1818
import typing
19-
from typing import Literal, Sequence, Tuple, Union
19+
from typing import Literal, Optional, Sequence, Tuple, Union
2020

2121
import bigframes_vendored.constants as constants
2222
import bigframes_vendored.pandas.core.groupby as vendored_pandas_groupby
@@ -372,6 +372,39 @@ def diff(self, periods=1) -> series.Series:
372372
)
373373
return self._apply_window_op(agg_ops.DiffOp(periods), window=window)
374374

375+
def value_counts(
376+
self,
377+
subset: Optional[Sequence[blocks.Label]] = None,
378+
normalize: bool = False,
379+
sort: bool = True,
380+
ascending: bool = False,
381+
dropna: bool = True,
382+
) -> Union[df.DataFrame, series.Series]:
383+
if subset is None:
384+
columns = self._selected_cols
385+
else:
386+
columns = [
387+
column
388+
for column in self._block.value_columns
389+
if self._block.col_id_to_label[column] in subset
390+
]
391+
block = self._block
392+
if self._dropna: # this drops null grouping columns
393+
block = block_ops.dropna(block, self._by_col_ids)
394+
block = block_ops.value_counts(
395+
block,
396+
columns,
397+
normalize=normalize,
398+
sort=sort,
399+
ascending=ascending,
400+
drop_na=dropna, # this drops null value columns
401+
grouping_keys=self._by_col_ids,
402+
)
403+
if self._as_index:
404+
return series.Series(block)
405+
else:
406+
return series.Series(block).to_frame().reset_index(drop=False)
407+
375408
@validations.requires_ordering()
376409
def rolling(
377410
self,

bigframes/core/groupby/series_group_by.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,30 @@ def agg(self, func=None) -> typing.Union[df.DataFrame, series.Series]:
244244

245245
aggregate = agg
246246

247+
def value_counts(
248+
self,
249+
normalize: bool = False,
250+
sort: bool = True,
251+
ascending: bool = False,
252+
dropna: bool = True,
253+
) -> Union[df.DataFrame, series.Series]:
254+
columns = [self._value_column]
255+
block = self._block
256+
if self._dropna: # this drops null grouping columns
257+
block = block_ops.dropna(block, self._by_col_ids)
258+
block = block_ops.value_counts(
259+
block,
260+
columns,
261+
normalize=normalize,
262+
sort=sort,
263+
ascending=ascending,
264+
drop_na=dropna, # this drops null value columns
265+
grouping_keys=self._by_col_ids,
266+
)
267+
# TODO: once as_index=False is supported, return DataFrame instead by resetting index
268+
# with .to_frame().reset_index(drop=False)
269+
return series.Series(block)
270+
247271
@validations.requires_ordering()
248272
def cumsum(self, *args, **kwargs) -> series.Series:
249273
return self._apply_window_op(

bigframes/core/indexes/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -489,7 +489,7 @@ def value_counts(
489489
self._block.index_columns,
490490
normalize=normalize,
491491
ascending=ascending,
492-
dropna=dropna,
492+
drop_na=dropna,
493493
)
494494
import bigframes.series as series
495495

bigframes/dataframe.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2475,7 +2475,7 @@ def value_counts(
24752475
normalize=normalize,
24762476
sort=sort,
24772477
ascending=ascending,
2478-
dropna=dropna,
2478+
drop_na=dropna,
24792479
)
24802480
return bigframes.series.Series(block)
24812481

bigframes/series.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1631,7 +1631,7 @@ def value_counts(
16311631
[self._value_column],
16321632
normalize=normalize,
16331633
ascending=ascending,
1634-
dropna=dropna,
1634+
drop_na=dropna,
16351635
)
16361636
return Series(block)
16371637

tests/system/small/test_groupby.py

Lines changed: 130 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -582,6 +582,101 @@ def test_dataframe_groupby_nonnumeric_with_mean():
582582
)
583583

584584

585+
@pytest.mark.parametrize(
586+
("subset", "normalize", "ascending", "dropna", "as_index"),
587+
[
588+
(None, True, True, True, True),
589+
(["int64_too", "int64_col"], False, False, False, False),
590+
],
591+
)
592+
def test_dataframe_groupby_value_counts(
593+
scalars_df_index,
594+
scalars_pandas_df_index,
595+
subset,
596+
normalize,
597+
ascending,
598+
dropna,
599+
as_index,
600+
):
601+
if pd.__version__.startswith("1."):
602+
pytest.skip("pandas 1.x produces different column labels.")
603+
col_names = ["float64_col", "int64_col", "bool_col", "int64_too"]
604+
bf_result = (
605+
scalars_df_index[col_names]
606+
.groupby("bool_col", as_index=as_index)
607+
.value_counts(
608+
subset=subset, normalize=normalize, ascending=ascending, dropna=dropna
609+
)
610+
.to_pandas()
611+
)
612+
pd_result = (
613+
scalars_pandas_df_index[col_names]
614+
.groupby("bool_col", as_index=as_index)
615+
.value_counts(
616+
subset=subset, normalize=normalize, ascending=ascending, dropna=dropna
617+
)
618+
)
619+
620+
if as_index:
621+
pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False)
622+
else:
623+
pd_result.index = pd_result.index.astype("Int64")
624+
pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False)
625+
626+
627+
@pytest.mark.parametrize(
628+
("numeric_only", "min_count"),
629+
[
630+
(False, 4),
631+
(True, 0),
632+
],
633+
)
634+
def test_dataframe_groupby_first(
635+
scalars_df_index, scalars_pandas_df_index, numeric_only, min_count
636+
):
637+
# min_count seems to not work properly on older pandas
638+
pytest.importorskip("pandas", minversion="2.0.0")
639+
# bytes, dates not handling min_count properly in pandas
640+
bf_result = (
641+
scalars_df_index.drop(columns=["bytes_col", "date_col"])
642+
.groupby(scalars_df_index.int64_col % 2)
643+
.first(numeric_only=numeric_only, min_count=min_count)
644+
).to_pandas()
645+
pd_result = (
646+
scalars_pandas_df_index.drop(columns=["bytes_col", "date_col"])
647+
.groupby(scalars_pandas_df_index.int64_col % 2)
648+
.first(numeric_only=numeric_only, min_count=min_count)
649+
)
650+
pd.testing.assert_frame_equal(
651+
pd_result,
652+
bf_result,
653+
)
654+
655+
656+
@pytest.mark.parametrize(
657+
("numeric_only", "min_count"),
658+
[
659+
(True, 2),
660+
(False, -1),
661+
],
662+
)
663+
def test_dataframe_groupby_last(
664+
scalars_df_index, scalars_pandas_df_index, numeric_only, min_count
665+
):
666+
bf_result = (
667+
scalars_df_index.groupby(scalars_df_index.int64_col % 2).last(
668+
numeric_only=numeric_only, min_count=min_count
669+
)
670+
).to_pandas()
671+
pd_result = scalars_pandas_df_index.groupby(
672+
scalars_pandas_df_index.int64_col % 2
673+
).last(numeric_only=numeric_only, min_count=min_count)
674+
pd.testing.assert_frame_equal(
675+
pd_result,
676+
bf_result,
677+
)
678+
679+
585680
# ==============
586681
# Series.groupby
587682
# ==============
@@ -770,6 +865,41 @@ def test_series_groupby_quantile(scalars_df_index, scalars_pandas_df_index, q):
770865
)
771866

772867

868+
@pytest.mark.parametrize(
869+
("normalize", "ascending", "dropna"),
870+
[
871+
(
872+
True,
873+
True,
874+
True,
875+
),
876+
(
877+
False,
878+
False,
879+
False,
880+
),
881+
],
882+
)
883+
def test_series_groupby_value_counts(
884+
scalars_df_index,
885+
scalars_pandas_df_index,
886+
normalize,
887+
ascending,
888+
dropna,
889+
):
890+
if pd.__version__.startswith("1."):
891+
pytest.skip("pandas 1.x produces different column labels.")
892+
bf_result = (
893+
scalars_df_index.groupby("bool_col")["string_col"]
894+
.value_counts(normalize=normalize, ascending=ascending, dropna=dropna)
895+
.to_pandas()
896+
)
897+
pd_result = scalars_pandas_df_index.groupby("bool_col")["string_col"].value_counts(
898+
normalize=normalize, ascending=ascending, dropna=dropna
899+
)
900+
pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False)
901+
902+
773903
@pytest.mark.parametrize(
774904
("numeric_only", "min_count"),
775905
[
@@ -813,56 +943,3 @@ def test_series_groupby_last(
813943
numeric_only=numeric_only, min_count=min_count
814944
)
815945
pd.testing.assert_series_equal(pd_result, bf_result)
816-
817-
818-
@pytest.mark.parametrize(
819-
("numeric_only", "min_count"),
820-
[
821-
(False, 4),
822-
(True, 0),
823-
],
824-
)
825-
def test_dataframe_groupby_first(
826-
scalars_df_index, scalars_pandas_df_index, numeric_only, min_count
827-
):
828-
# min_count seems to not work properly on older pandas
829-
pytest.importorskip("pandas", minversion="2.0.0")
830-
# bytes, dates not handling min_count properly in pandas
831-
bf_result = (
832-
scalars_df_index.drop(columns=["bytes_col", "date_col"])
833-
.groupby(scalars_df_index.int64_col % 2)
834-
.first(numeric_only=numeric_only, min_count=min_count)
835-
).to_pandas()
836-
pd_result = (
837-
scalars_pandas_df_index.drop(columns=["bytes_col", "date_col"])
838-
.groupby(scalars_pandas_df_index.int64_col % 2)
839-
.first(numeric_only=numeric_only, min_count=min_count)
840-
)
841-
pd.testing.assert_frame_equal(
842-
pd_result,
843-
bf_result,
844-
)
845-
846-
847-
@pytest.mark.parametrize(
848-
("numeric_only", "min_count"),
849-
[
850-
(True, 2),
851-
(False, -1),
852-
],
853-
)
854-
def test_dataframe_groupby_last(
855-
scalars_df_index, scalars_pandas_df_index, numeric_only, min_count
856-
):
857-
bf_result = (
858-
scalars_df_index.groupby(scalars_df_index.int64_col % 2).last(
859-
numeric_only=numeric_only, min_count=min_count
860-
)
861-
).to_pandas()
862-
pd_result = scalars_pandas_df_index.groupby(
863-
scalars_pandas_df_index.int64_col % 2
864-
).last(numeric_only=numeric_only, min_count=min_count)
865-
pd.testing.assert_frame_equal(
866-
pd_result,
867-
bf_result,
868-
)

0 commit comments

Comments
 (0)