feat: add groupby head API (#791)

Genesis929 · web-flow · commit 44202bc3541d · 2024-06-17T11:57:27.000-07:00
* feat: add groupby head API

* update annotations

* update order
diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py
@@ -1384,6 +1384,49 @@ def _normalize_expression(
             raise ValueError("Unexpected number of value columns.")
         return expr.select_columns([*index_columns, *value_columns])
 
+    def grouped_head(
+        self,
+        by_column_ids: typing.Sequence[str],
+        value_columns: typing.Sequence[str],
+        n: int,
+    ):
+        """Keep at most ``n`` leading rows of every group.
+
+        Rows are ranked inside their group with a cumulative-rows window and
+        only those whose rank is below ``n + 1`` survive the filter.
+
+        Args:
+            by_column_ids: Ids of the columns that define the groups.
+            value_columns: Ids of the columns kept in the result. Must not be
+                empty; the first id is also the input of the rank window op.
+            n: Number of rows to keep per group.
+
+        Returns:
+            The filtered block narrowed to ``value_columns``.
+
+        Raises:
+            ValueError: If ``value_columns`` is empty.
+        """
+        # Fail fast with a clear message: indexing ``value_columns[0]`` below
+        # would otherwise surface as a bare IndexError.
+        if not value_columns:
+            raise ValueError("grouped_head requires at least one value column.")
+
+        window_spec = window_specs.cumulative_rows(grouping_keys=tuple(by_column_ids))
+
+        block, rank_id = self.apply_window_op(
+            value_columns[0],
+            agg_ops.rank_op,
+            window_spec=window_spec,
+        )
+
+        # NOTE(review): a negative ``n`` presumably keeps nothing here, whereas
+        # pandas' GroupBy.head treats it as "all but the last |n| rows" -- confirm.
+        keep_row = ops.lt_op.as_expr(rank_id, ex.const(n + 1))
+        block, keep_id = block.project_expr(keep_row)
+        block = block.filter_by_id(keep_id)
+        return block.select_columns(value_columns)
+
     def slice(
         self,
         start: typing.Optional[int] = None,
diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py
@@ -104,6 +104,25 @@ def __getitem__(
                 dropna=self._dropna,
             )
 
+    def head(self, n: int = 5) -> df.DataFrame:
+        """Return the first ``n`` rows of each group as a DataFrame.
+
+        If ``dropna`` is set on this groupby, ``block_ops.dropna`` is applied
+        over the grouping columns (``how="any"``) before taking the head.
+        """
+        source_block = (
+            block_ops.dropna(self._block, self._by_col_ids, how="any")
+            if self._dropna
+            else self._block
+        )
+        # Every value column of the original block is requested in the output.
+        head_block = source_block.grouped_head(
+            by_column_ids=self._by_col_ids,
+            value_columns=self._block.value_columns,
+            n=n,
+        )
+        return df.DataFrame(head_block)
+
+
     def size(self) -> typing.Union[df.DataFrame, series.Series]:
         agg_block, _ = self._block.aggregate_size(
             by_column_ids=self._by_col_ids,
@@ -498,6 +510,23 @@ def __init__(
         self._value_name = value_name
         self._dropna = dropna  # Applies to aggregations but not windowing
 
+    def head(self, n: int = 5) -> series.Series:
+        """Return the first ``n`` rows of each group as a Series.
+
+        If ``dropna`` is set on this groupby, ``block_ops.dropna`` is applied
+        over the grouping columns (``how="any"``) before taking the head.
+        """
+        if self._dropna:
+            source_block = block_ops.dropna(self._block, self._by_col_ids, how="any")
+        else:
+            source_block = self._block
+        head_block = source_block.grouped_head(
+            by_column_ids=self._by_col_ids,
+            value_columns=[self._value_column],
+            n=n,
+        )
+        return series.Series(head_block)
+
+
     def all(self) -> series.Series:
         return self._aggregate(agg_ops.all_op)
 
diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py
@@ -53,6 +53,15 @@ def test_dataframe_groupby_numeric_aggregate(
     pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False)
 
 
+def test_dataframe_groupby_head(scalars_df_index, scalars_pandas_df_index):
+    """DataFrameGroupBy.head(2) matches pandas when grouping by ``bool_col``."""
+    col_names = ["int64_too", "float64_col", "int64_col", "bool_col", "string_col"]
+    group_key = "bool_col"
+    expected = scalars_pandas_df_index[col_names].groupby(group_key).head(2)
+    actual = scalars_df_index[col_names].groupby(group_key).head(2).to_pandas()
+    pd.testing.assert_frame_equal(expected, actual, check_dtype=False)
+
+
 def test_dataframe_groupby_median(scalars_df_index, scalars_pandas_df_index):
     col_names = ["int64_too", "float64_col", "int64_col", "bool_col", "string_col"]
     bf_result = (
@@ -442,6 +449,18 @@ def test_series_groupby_agg_list(scalars_df_index, scalars_pandas_df_index):
     )
 
 
+@pytest.mark.parametrize("dropna", [True, False])
+def test_series_groupby_head(scalars_df_index, scalars_pandas_df_index, dropna):
+    """SeriesGroupBy.head(1) matches pandas for both ``dropna`` settings."""
+    bf_grouped = scalars_df_index.groupby("bool_col", dropna=dropna)
+    pd_grouped = scalars_pandas_df_index.groupby("bool_col", dropna=dropna)
+
+    bf_result = bf_grouped["int64_too"].head(1).to_pandas()
+    pd_result = pd_grouped["int64_too"].head(1)
+
+    pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False)
+
+
 def test_series_groupby_kurt(scalars_df_index, scalars_pandas_df_index):
     bf_result = (
         scalars_df_index["int64_too"]