Skip to content

Commit 3b53092

Browse files
refactor: Generalize Block.aggregate to non-unary aggregates (#1304)
1 parent c8e7b8f commit 3b53092

File tree

4 files changed

+84
-71
lines changed

4 files changed

+84
-71
lines changed

bigframes/core/block_transforms.py

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -129,12 +129,16 @@ def quantile(
129129
window_spec=window,
130130
)
131131
quantile_cols.append(quantile_col)
132-
block, results = block.aggregate(
132+
block, _ = block.aggregate(
133133
grouping_column_ids,
134-
tuple((col, agg_ops.AnyValueOp()) for col in quantile_cols),
134+
tuple(
135+
ex.UnaryAggregation(agg_ops.AnyValueOp(), ex.deref(col))
136+
for col in quantile_cols
137+
),
138+
column_labels=pd.Index(labels),
135139
dropna=dropna,
136140
)
137-
return block.select_columns(results).with_column_labels(labels)
141+
return block
138142

139143

140144
def interpolate(block: blocks.Block, method: str = "linear") -> blocks.Block:
@@ -355,7 +359,7 @@ def value_counts(
355359
block, dummy = block.create_constant(1)
356360
block, agg_ids = block.aggregate(
357361
by_column_ids=columns,
358-
aggregations=[(dummy, agg_ops.count_op)],
362+
aggregations=[ex.UnaryAggregation(agg_ops.count_op, ex.deref(dummy))],
359363
dropna=dropna,
360364
)
361365
count_id = agg_ids[0]
@@ -589,9 +593,18 @@ def skew(
589593
# counts, moment3 for each column
590594
aggregations = []
591595
for i, col in enumerate(original_columns):
592-
count_agg = (col, agg_ops.count_op)
593-
moment3_agg = (delta3_ids[i], agg_ops.mean_op)
594-
variance_agg = (col, agg_ops.PopVarOp())
596+
count_agg = ex.UnaryAggregation(
597+
agg_ops.count_op,
598+
ex.deref(col),
599+
)
600+
moment3_agg = ex.UnaryAggregation(
601+
agg_ops.mean_op,
602+
ex.deref(delta3_ids[i]),
603+
)
604+
variance_agg = ex.UnaryAggregation(
605+
agg_ops.PopVarOp(),
606+
ex.deref(col),
607+
)
595608
aggregations.extend([count_agg, moment3_agg, variance_agg])
596609

597610
block, agg_ids = block.aggregate(
@@ -631,9 +644,9 @@ def kurt(
631644
# counts, moment4 for each column
632645
aggregations = []
633646
for i, col in enumerate(original_columns):
634-
count_agg = (col, agg_ops.count_op)
635-
moment4_agg = (delta4_ids[i], agg_ops.mean_op)
636-
variance_agg = (col, agg_ops.PopVarOp())
647+
count_agg = ex.UnaryAggregation(agg_ops.count_op, ex.deref(col))
648+
moment4_agg = ex.UnaryAggregation(agg_ops.mean_op, ex.deref(delta4_ids[i]))
649+
variance_agg = ex.UnaryAggregation(agg_ops.PopVarOp(), ex.deref(col))
637650
aggregations.extend([count_agg, moment4_agg, variance_agg])
638651

639652
block, agg_ids = block.aggregate(

bigframes/core/blocks.py

Lines changed: 13 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1174,38 +1174,31 @@ def remap_f(x):
11741174
def aggregate(
11751175
self,
11761176
by_column_ids: typing.Sequence[str] = (),
1177-
aggregations: typing.Sequence[
1178-
typing.Tuple[
1179-
str, typing.Union[agg_ops.UnaryAggregateOp, agg_ops.NullaryAggregateOp]
1180-
]
1181-
] = (),
1177+
aggregations: typing.Sequence[ex.Aggregation] = (),
1178+
column_labels: Optional[pd.Index] = None,
11821179
*,
11831180
dropna: bool = True,
11841181
) -> typing.Tuple[Block, typing.Sequence[str]]:
11851182
"""
1186-
Apply aggregations to the block. Callers responsible for setting index column(s) after.
1183+
Apply aggregations to the block.
11871184
Arguments:
11881185
by_column_id: column id of the aggregation key, this is preserved through the transform and used as index.
11891186
aggregations: input_column_id, operation tuples
1190-
as_index: if True, grouping keys will be index columns in result, otherwise they will be non-index columns.
11911187
dropna: whether null keys should be dropped
11921188
"""
1189+
if column_labels is None:
1190+
column_labels = pd.Index(range(len(aggregations)))
1191+
11931192
agg_specs = [
11941193
(
1195-
ex.UnaryAggregation(operation, ex.deref(input_id))
1196-
if isinstance(operation, agg_ops.UnaryAggregateOp)
1197-
else ex.NullaryAggregation(operation),
1194+
aggregation,
11981195
guid.generate_guid(),
11991196
)
1200-
for input_id, operation in aggregations
1197+
for aggregation in aggregations
12011198
]
12021199
output_col_ids = [agg_spec[1] for agg_spec in agg_specs]
12031200
result_expr = self.expr.aggregate(agg_specs, by_column_ids, dropna=dropna)
12041201

1205-
aggregate_labels = self._get_labels_for_columns(
1206-
[agg[0] for agg in aggregations]
1207-
)
1208-
12091202
names: typing.List[Label] = []
12101203
if len(by_column_ids) == 0:
12111204
result_expr, label_id = result_expr.create_constant(0, pd.Int64Dtype())
@@ -1223,7 +1216,7 @@ def aggregate(
12231216
Block(
12241217
result_expr,
12251218
index_columns=index_columns,
1226-
column_labels=aggregate_labels,
1219+
column_labels=column_labels,
12271220
index_labels=names,
12281221
),
12291222
output_col_ids,
@@ -1561,7 +1554,10 @@ def pivot(
15611554
column_ids.append(masked_id)
15621555

15631556
block = block.select_columns(column_ids)
1564-
aggregations = [(col_id, agg_ops.AnyValueOp()) for col_id in column_ids]
1557+
aggregations = [
1558+
ex.UnaryAggregation(agg_ops.AnyValueOp(), ex.deref(col_id))
1559+
for col_id in column_ids
1560+
]
15651561
result_block, _ = block.aggregate(
15661562
by_column_ids=self.index_columns,
15671563
aggregations=aggregations,

bigframes/core/groupby/__init__.py

Lines changed: 43 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
from __future__ import annotations
1616

1717
import typing
18-
from typing import Sequence, Union
18+
from typing import Sequence, Tuple, Union
1919

2020
import bigframes_vendored.constants as constants
2121
import bigframes_vendored.pandas.core.groupby as vendored_pandas_groupby
@@ -26,6 +26,7 @@
2626
import bigframes.core as core
2727
import bigframes.core.block_transforms as block_ops
2828
import bigframes.core.blocks as blocks
29+
import bigframes.core.expression
2930
import bigframes.core.ordering as order
3031
import bigframes.core.utils as utils
3132
import bigframes.core.validations as validations
@@ -334,24 +335,19 @@ def agg(self, func=None, **kwargs) -> typing.Union[df.DataFrame, series.Series]:
334335
return self._agg_named(**kwargs)
335336

336337
def _agg_string(self, func: str) -> df.DataFrame:
337-
aggregations = [
338-
(col_id, agg_ops.lookup_agg_func(func))
339-
for col_id in self._aggregated_columns()
340-
]
338+
ids, labels = self._aggregated_columns()
339+
aggregations = [agg(col_id, agg_ops.lookup_agg_func(func)) for col_id in ids]
341340
agg_block, _ = self._block.aggregate(
342341
by_column_ids=self._by_col_ids,
343342
aggregations=aggregations,
344343
dropna=self._dropna,
344+
column_labels=labels,
345345
)
346346
dataframe = df.DataFrame(agg_block)
347347
return dataframe if self._as_index else self._convert_index(dataframe)
348348

349349
def _agg_dict(self, func: typing.Mapping) -> df.DataFrame:
350-
aggregations: typing.List[
351-
typing.Tuple[
352-
str, typing.Union[agg_ops.UnaryAggregateOp, agg_ops.NullaryAggregateOp]
353-
]
354-
] = []
350+
aggregations: typing.List[bigframes.core.expression.Aggregation] = []
355351
column_labels = []
356352

357353
want_aggfunc_level = any(utils.is_list_like(aggs) for aggs in func.values())
@@ -362,7 +358,7 @@ def _agg_dict(self, func: typing.Mapping) -> df.DataFrame:
362358
funcs_for_id if utils.is_list_like(funcs_for_id) else [funcs_for_id]
363359
)
364360
for f in func_list:
365-
aggregations.append((col_id, agg_ops.lookup_agg_func(f)))
361+
aggregations.append(agg(col_id, agg_ops.lookup_agg_func(f)))
366362
column_labels.append(label)
367363
agg_block, _ = self._block.aggregate(
368364
by_column_ids=self._by_col_ids,
@@ -373,7 +369,10 @@ def _agg_dict(self, func: typing.Mapping) -> df.DataFrame:
373369
agg_block = agg_block.with_column_labels(
374370
utils.combine_indices(
375371
pd.Index(column_labels),
376-
pd.Index(agg[1].name for agg in aggregations),
372+
pd.Index(
373+
typing.cast(agg_ops.AggregateOp, agg.op).name
374+
for agg in aggregations
375+
),
377376
)
378377
)
379378
else:
@@ -382,34 +381,21 @@ def _agg_dict(self, func: typing.Mapping) -> df.DataFrame:
382381
return dataframe if self._as_index else self._convert_index(dataframe)
383382

384383
def _agg_list(self, func: typing.Sequence) -> df.DataFrame:
384+
ids, labels = self._aggregated_columns()
385385
aggregations = [
386-
(col_id, agg_ops.lookup_agg_func(f))
387-
for col_id in self._aggregated_columns()
388-
for f in func
386+
agg(col_id, agg_ops.lookup_agg_func(f)) for col_id in ids for f in func
389387
]
390388

391389
if self._block.column_labels.nlevels > 1:
392390
# Restructure MultiIndex for proper format: (idx1, idx2, func)
393391
# rather than ((idx1, idx2), func).
394-
aggregated_columns = pd.MultiIndex.from_tuples(
395-
[
396-
self._block.col_id_to_label[col_id]
397-
for col_id in self._aggregated_columns()
398-
],
399-
names=[*self._block.column_labels.names],
400-
).to_frame(index=False)
401-
402392
column_labels = [
403-
tuple(col_id) + (f,)
404-
for col_id in aggregated_columns.to_numpy()
405-
for f in func
406-
]
407-
else:
408-
column_labels = [
409-
(self._block.col_id_to_label[col_id], f)
410-
for col_id in self._aggregated_columns()
393+
tuple(label) + (f,)
394+
for label in labels.to_frame(index=False).to_numpy()
411395
for f in func
412396
]
397+
else: # Single-level index
398+
column_labels = [(label, f) for label in labels for f in func]
413399

414400
agg_block, _ = self._block.aggregate(
415401
by_column_ids=self._by_col_ids,
@@ -435,7 +421,7 @@ def _agg_named(self, **kwargs) -> df.DataFrame:
435421
if not isinstance(v, tuple) or (len(v) != 2):
436422
raise TypeError("kwargs values must be 2-tuples of column, aggfunc")
437423
col_id = self._resolve_label(v[0])
438-
aggregations.append((col_id, agg_ops.lookup_agg_func(v[1])))
424+
aggregations.append(agg(col_id, agg_ops.lookup_agg_func(v[1])))
439425
column_labels.append(k)
440426
agg_block, _ = self._block.aggregate(
441427
by_column_ids=self._by_col_ids,
@@ -470,15 +456,19 @@ def _raise_on_non_numeric(self, op: str):
470456
)
471457
return self
472458

473-
def _aggregated_columns(self, numeric_only: bool = False) -> typing.Sequence[str]:
459+
def _aggregated_columns(
460+
self, numeric_only: bool = False
461+
) -> Tuple[typing.Sequence[str], pd.Index]:
474462
valid_agg_cols: list[str] = []
475-
for col_id in self._selected_cols:
463+
offsets: list[int] = []
464+
for i, col_id in enumerate(self._block.value_columns):
476465
is_numeric = (
477466
self._column_type(col_id) in dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE
478467
)
479-
if is_numeric or not numeric_only:
468+
if (col_id in self._selected_cols) and (is_numeric or not numeric_only):
469+
offsets.append(i)
480470
valid_agg_cols.append(col_id)
481-
return valid_agg_cols
471+
return valid_agg_cols, self._block.column_labels.take(offsets)
482472

483473
def _column_type(self, col_id: str) -> dtypes.Dtype:
484474
col_offset = self._block.value_columns.index(col_id)
@@ -488,11 +478,12 @@ def _column_type(self, col_id: str) -> dtypes.Dtype:
488478
def _aggregate_all(
489479
self, aggregate_op: agg_ops.UnaryAggregateOp, numeric_only: bool = False
490480
) -> df.DataFrame:
491-
aggregated_col_ids = self._aggregated_columns(numeric_only=numeric_only)
492-
aggregations = [(col_id, aggregate_op) for col_id in aggregated_col_ids]
481+
aggregated_col_ids, labels = self._aggregated_columns(numeric_only=numeric_only)
482+
aggregations = [agg(col_id, aggregate_op) for col_id in aggregated_col_ids]
493483
result_block, _ = self._block.aggregate(
494484
by_column_ids=self._by_col_ids,
495485
aggregations=aggregations,
486+
column_labels=labels,
496487
dropna=self._dropna,
497488
)
498489
dataframe = df.DataFrame(result_block)
@@ -508,7 +499,7 @@ def _apply_window_op(
508499
window_spec = window or window_specs.cumulative_rows(
509500
grouping_keys=tuple(self._by_col_ids)
510501
)
511-
columns = self._aggregated_columns(numeric_only=numeric_only)
502+
columns, _ = self._aggregated_columns(numeric_only=numeric_only)
512503
block, result_ids = self._block.multi_apply_window_op(
513504
columns, op, window_spec=window_spec
514505
)
@@ -639,11 +630,11 @@ def prod(self, *args) -> series.Series:
639630
def agg(self, func=None) -> typing.Union[df.DataFrame, series.Series]:
640631
column_names: list[str] = []
641632
if isinstance(func, str):
642-
aggregations = [(self._value_column, agg_ops.lookup_agg_func(func))]
633+
aggregations = [agg(self._value_column, agg_ops.lookup_agg_func(func))]
643634
column_names = [func]
644635
elif utils.is_list_like(func):
645636
aggregations = [
646-
(self._value_column, agg_ops.lookup_agg_func(f)) for f in func
637+
agg(self._value_column, agg_ops.lookup_agg_func(f)) for f in func
647638
]
648639
column_names = list(func)
649640
else:
@@ -756,7 +747,7 @@ def expanding(self, min_periods: int = 1) -> windows.Window:
756747
def _aggregate(self, aggregate_op: agg_ops.UnaryAggregateOp) -> series.Series:
757748
result_block, _ = self._block.aggregate(
758749
self._by_col_ids,
759-
((self._value_column, aggregate_op),),
750+
(agg(self._value_column, aggregate_op),),
760751
dropna=self._dropna,
761752
)
762753

@@ -781,3 +772,13 @@ def _apply_window_op(
781772
window_spec=window_spec,
782773
)
783774
return series.Series(block.select_column(result_id))
775+
776+
777+
def agg(input: str, op: agg_ops.AggregateOp) -> bigframes.core.expression.Aggregation:
778+
if isinstance(op, agg_ops.UnaryAggregateOp):
779+
return bigframes.core.expression.UnaryAggregation(
780+
op, bigframes.core.expression.deref(input)
781+
)
782+
else:
783+
assert isinstance(op, agg_ops.NullaryAggregateOp)
784+
return bigframes.core.expression.NullaryAggregation(op)

bigframes/series.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1062,7 +1062,9 @@ def mode(self) -> Series:
10621062
# Approach: Count each value, return each value for which count(x) == max(counts))
10631063
block, agg_ids = block.aggregate(
10641064
by_column_ids=[self._value_column],
1065-
aggregations=((self._value_column, agg_ops.count_op),),
1065+
aggregations=(
1066+
ex.UnaryAggregation(agg_ops.count_op, ex.deref(self._value_column)),
1067+
),
10661068
)
10671069
value_count_col_id = agg_ids[0]
10681070
block, max_value_count_col_id = block.apply_window_op(
@@ -1675,7 +1677,8 @@ def unique(self, keep_order=True) -> Series:
16751677
return self.drop_duplicates()
16761678
block, result = self._block.aggregate(
16771679
[self._value_column],
1678-
[(self._value_column, agg_ops.AnyValueOp())],
1680+
[ex.UnaryAggregation(agg_ops.AnyValueOp(), ex.deref(self._value_column))],
1681+
column_labels=self._block.column_labels,
16791682
dropna=False,
16801683
)
16811684
return Series(block.select_columns(result).reset_index())

0 commit comments

Comments (0)