feat(DRAFT): Simple cases working?

dangotbanned · dangotbanned · commit 767261c44e17 · 2025-09-19T17:17:44.000Z
Borrowing some ideas from #2528, #2680
diff --git a/narwhals/_plan/arrow/dataframe.py b/narwhals/_plan/arrow/dataframe.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any, Literal, overload
+from typing import TYPE_CHECKING, Any, Literal, cast, overload
 
 import pyarrow as pa  # ignore-banned-import
 import pyarrow.compute as pc  # ignore-banned-import
@@ -106,6 +106,14 @@ def drop(self, columns: Sequence[str]) -> Self:
         to_drop = list(columns)
         return self._with_native(self.native.drop(to_drop))
 
+    def rename(self, mapping: Mapping[str, str]) -> Self:
+        """Return a new frame with columns renamed according to `mapping`.
+
+        Arguments:
+            mapping: Pairs of `{old_name: new_name}`.
+
+        Returns:
+            A new frame wrapping the renamed native table.
+        """
+        names: dict[str, str] | list[str]
+        if fn.BACKEND_VERSION >= (17,):
+            # NOTE: The mapping form of `rename_columns` is gated on `pyarrow>=17` here.
+            # `pyarrow` dispatches on `list`/`tuple`/`dict` instances, so hand it a
+            # real `dict` rather than `cast`-ing, which would let any other
+            # `Mapping` through only to fail at runtime.
+            names = dict(mapping)
+        else:  # pragma: no cover
+            # Fallback: spell out every name in column order.
+            # Columns that are not in `mapping` keep their current name.
+            names = [mapping.get(c, c) for c in self.columns]
+        return self._with_native(self.native.rename_columns(names))
+
     # NOTE: Use instead of `with_columns` for trivial cases
     def _with_columns(self, exprs: Iterable[Expr | Scalar], /) -> Self:
         native = self.native
diff --git a/narwhals/_plan/arrow/group_by.py b/narwhals/_plan/arrow/group_by.py
@@ -1,31 +1,183 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Literal
 
 import pyarrow as pa  # ignore-banned-import
+import pyarrow.compute as pc  # ignore-banned-import
 
+from narwhals._plan import expressions as ir
+from narwhals._plan.expressions import aggregation as agg
 from narwhals._plan.protocols import DataFrameGroupBy
+from narwhals._utils import Implementation, requires
 
 if TYPE_CHECKING:
-    from collections.abc import Iterator
+    from collections.abc import Iterator, Mapping
 
-    from typing_extensions import Self
+    from typing_extensions import Self, TypeAlias
 
+    from narwhals._arrow.typing import (  # type: ignore[attr-defined]
+        AggregateOptions,
+        Aggregation,
+    )
+    from narwhals._compliant.typing import NarwhalsAggregation as _NarwhalsAggregation
     from narwhals._plan.arrow.dataframe import ArrowDataFrame
     from narwhals._plan.expressions import NamedIR
     from narwhals._plan.typing import Seq
 
+    NarwhalsAggregation: TypeAlias = Literal[_NarwhalsAggregation, "first", "last"]
+    InputName: TypeAlias = str
+    NativeName: TypeAlias = str
+    OutputName: TypeAlias = str
+    NativeAggSpec: TypeAlias = tuple[InputName, Aggregation, AggregateOptions | None]
+    RenameSpec: TypeAlias = tuple[NativeName, OutputName]
 
-class ArrowGroupBy(DataFrameGroupBy["ArrowDataFrame"]):
-    """What narwhals is doing.
 
-    - Keys are handled only at compliant
-       - `ParseKeysGroupBy` does weird stuff
-       - But has a fast path for all `str` keys
-    - Aggs are handled in both levels
-      - Some compliant have more restrictions
-    """
+# Version of the `pyarrow` backend, compared against tuples such as `(14, 0)`
+# below to gate version-dependent behavior.
+BACKEND_VERSION = Implementation.PYARROW._backend_version()
+
+
+# TODO @dangotbanned: Missing `nw.col("a").len()`
+# Lookup from aggregation node type to the name of the native grouped aggregation.
+# Queried with `type(expr)`, so only these exact classes match.
+SUPPORTED_AGG: Mapping[type[agg.AggExpr], Aggregation] = {
+    agg.Sum: "sum",
+    agg.Mean: "mean",
+    agg.Median: "approximate_median",
+    agg.Max: "max",
+    agg.Min: "min",
+    agg.Std: "stddev",
+    agg.Var: "variance",
+    agg.Count: "count",
+    agg.NUnique: "count_distinct",
+    agg.First: "first",
+    agg.Last: "last",
+}
+
+
+# NOTE(review): Not consulted by the code visible here - `ArrowAggExpr.to_native`
+# raises `NotImplementedError` for `ir.Len` instead of using this table.
+SUPPORTED_IR: Mapping[type[ir.Len], Aggregation] = {ir.Len: "count"}
+# Lookup from boolean function type to the name of the native grouped aggregation.
+# Indexed with `type(expr.function)` in `ArrowAggExpr._parse_function_expr`.
+SUPPORTED_FUNCTION: Mapping[type[ir.boolean.BooleanFunction], Aggregation] = {
+    ir.boolean.All: "all",
+    ir.boolean.Any: "any",
+}
+
+# Kept for reference only: nothing in the code visible here reads this tuple.
+REMAINING: tuple[Aggregation, ...] = (
+    "count_all",  # Count the number of rows in each group
+    "distinct",  # Keep the distinct values in each group
+    "first_last",  # Compute the first and last of values in each group
+    "list",  # List all values in each group
+    "min_max",  # Compute the minimum and maximum of values in each group
+    "one",  # Get one value from each group
+    "product",  # Compute the product of values in each group
+    "tdigest",  # Compute approximate quantiles of values in each group
+)
+"""Available [native aggs] we haven't used (excluding `first`, `last`)
+
+[native aggs]: https://arrow.apache.org/docs/python/compute.html#grouped-aggregations
+"""
+
+
+# Kept for reference only: nothing in the code visible here reads this tuple.
+REQUIRES_PYARROW_20: tuple[
+    Literal["kurtosis"], Literal["pivot_wider"], Literal["skew"]
+] = (
+    "kurtosis",  # Compute the kurtosis of values in each group
+    "pivot_wider",  # Pivot values according to a pivot key column
+    "skew",  # Compute the skewness of values in each group
+)
+"""https://arrow.apache.org/docs/20.0/python/compute.html#grouped-aggregations"""
+
+
+def _ensure_single_thread(
+    grouped: pa.TableGroupBy, expr: ir.OrderableAggExpr, /
+) -> pa.TableGroupBy:
+    """Return a group-by with threading disabled, as first/last require.
+
+    Arguments:
+        grouped: The native group-by built so far.
+        expr: The aggregation that triggered the check (only used in the error message).
+
+    Returns:
+        `grouped` unchanged when it already has threading disabled, otherwise a
+        new `pa.TableGroupBy` over the same table and keys with `use_threads=False`.
+
+    Raises:
+        NotImplementedError: If the backend version is below `(14, 0)`.
+    """
+    if BACKEND_VERSION < (14, 0):  # pragma: no cover
+        msg = (
+            f"Using `{expr!r}` in a `group_by().agg(...)` context is only available in 'pyarrow>=14.0.0', "
+            f"found version {requires._unparse_version(BACKEND_VERSION)!r}.\n\n"
+            f"See https://github.com/apache/arrow/issues/36709"
+        )
+        raise NotImplementedError(msg)
+    if not grouped._use_threads:
+        # Nothing to do, threading is already off
+        return grouped
+    # NOTE: Stubs say `_table` is a method, but at runtime it is a property
+    return pa.TableGroupBy(grouped._table, grouped.keys, use_threads=False)  # type: ignore[arg-type]
+
 
+def group_by_error(
+    expr: ArrowAggExpr,
+    reason: Literal[
+        "too complex",
+        "unsupported aggregation",
+        "unsupported function",
+        "unsupported expression",
+    ],
+) -> NotImplementedError:
+    """Build the error for an expression that cannot be aggregated natively.
+
+    The exception is returned rather than raised, so call sites write
+    `raise group_by_error(...)`.
+
+    Arguments:
+        expr: The wrapper whose `named_ir` is shown in the message.
+        reason: Short description of why the expression was rejected.
+    """
+    headline = (
+        "Non-trivial complex aggregation found"
+        if reason == "too complex"
+        else reason.title()
+    )
+    return NotImplementedError(f"{headline} in 'pyarrow.Table':\n\n{expr.named_ir!r}")
+
+
+class ArrowAggExpr:
+    """Translate one `NamedIR` into a native grouped-aggregation spec.
+
+    Only trivial cases are accepted: a supported aggregation or boolean
+    function applied directly to a single column. Everything else is rejected
+    with the `NotImplementedError` built by `group_by_error`.
+    """
+
+    def __init__(self, named_ir: NamedIR, /) -> None:
+        self.named_ir: NamedIR = named_ir
+
+    @property
+    def output_name(self) -> OutputName:
+        """Name the aggregated column should end up with."""
+        return self.named_ir.name
+
+    def _parse_agg_expr(
+        self, expr: agg.AggExpr, grouped: pa.TableGroupBy
+    ) -> tuple[InputName, Aggregation, AggregateOptions | None, pa.TableGroupBy]:
+        """Resolve an aggregation node into `(input, native agg, options, grouped)`.
+
+        `grouped` is passed through because first/last may need to swap it
+        for a single-threaded group-by.
+        """
+        agg_name = SUPPORTED_AGG.get(type(expr))
+        if not agg_name:
+            raise group_by_error(self, "unsupported aggregation")
+        option: AggregateOptions | None = None
+        if isinstance(expr, (agg.Std, agg.Var)):
+            # NOTE: Only branch which needs an instance (for `ddof`)
+            option = pc.VarianceOptions(ddof=expr.ddof)
+        elif isinstance(expr, agg.NUnique):
+            option = pc.CountOptions(mode="all")
+        elif isinstance(expr, agg.Count):
+            option = pc.CountOptions(mode="only_valid")
+        elif isinstance(expr, (agg.First, agg.Last)):
+            option = pc.ScalarAggregateOptions(skip_nulls=False)
+            # NOTE: Only branch which needs access to `pa.TableGroupBy`
+            grouped = _ensure_single_thread(grouped, expr)
+        # Checked last, so options (and the threading check) are resolved first
+        if not isinstance(expr.expr, ir.Column):
+            raise group_by_error(self, "too complex")
+        return expr.expr.name, agg_name, option, grouped
+
+    def _parse_function_expr(self, expr: ir.FunctionExpr) -> NativeAggSpec:
+        """Resolve an `all`/`any` function over a single column into a native spec."""
+        function = expr.function
+        if not isinstance(function, (ir.boolean.All, ir.boolean.Any)):
+            raise group_by_error(self, "unsupported function")
+        agg_name = SUPPORTED_FUNCTION[type(function)]
+        option = pc.ScalarAggregateOptions(min_count=0)
+        inputs = expr.input
+        if len(inputs) != 1 or not isinstance(inputs[0], ir.Column):
+            raise group_by_error(self, "too complex")
+        return inputs[0].name, agg_name, option
+
+    def _rename_spec(self, input_name: InputName, agg_name: Aggregation, /) -> RenameSpec:
+        """Pair the name `pyarrow` generates with the name we want instead."""
+        # `pyarrow` auto-generates the first element,
+        # which gets overwritten later with the second
+        native_name = f"{input_name}_{agg_name}"
+        return native_name, self.output_name
+
+    def to_native(
+        self, grouped: pa.TableGroupBy
+    ) -> tuple[pa.TableGroupBy, NativeAggSpec, RenameSpec]:
+        """Convert the wrapped expression for use in a native aggregate call.
+
+        Returns:
+            The (possibly replaced) group-by, the native aggregation spec and
+            the `(native name, output name)` pair used for renaming afterwards.
+
+        Raises:
+            NotImplementedError: For `ir.Len`, and for anything that is not a
+                trivial supported aggregation or function.
+        """
+        node = self.named_ir.expr
+        if isinstance(node, agg.AggExpr):
+            input_name, agg_name, option, grouped = self._parse_agg_expr(node, grouped)
+        elif isinstance(node, ir.Len):
+            msg = "Need to investigate https://github.com/narwhals-dev/narwhals/blob/0fb045536f5b56b978f354f8178b292301e9598c/narwhals/_arrow/group_by.py#L132-L141"
+            raise NotImplementedError(msg)
+        elif isinstance(node, ir.FunctionExpr):
+            input_name, agg_name, option = self._parse_function_expr(node)
+        else:
+            raise group_by_error(self, "unsupported expression")
+        rename = self._rename_spec(input_name, agg_name)
+        return grouped, (input_name, agg_name, option), rename
+
+
+class ArrowGroupBy(DataFrameGroupBy["ArrowDataFrame"]):
     _df: ArrowDataFrame
     _grouped: pa.TableGroupBy
     _keys: Seq[NamedIR]
@@ -52,4 +204,11 @@ def __iter__(self) -> Iterator[tuple[Any, ArrowDataFrame]]:
         raise NotImplementedError
 
     def agg(self, irs: Seq[NamedIR]) -> ArrowDataFrame:
-        raise NotImplementedError
+        """Aggregate every group with the native specs derived from `irs`.
+
+        Each expression is converted by `ArrowAggExpr.to_native`; the columns
+        `pyarrow` names automatically are then renamed to the requested outputs.
+        """
+        grouped = self._grouped
+        native_specs: list[NativeAggSpec] = []
+        rename_pairs: list[RenameSpec] = []
+        for named_ir in irs:
+            # NOTE: `to_native` can hand back a replacement group-by,
+            # so it has to be threaded through every iteration
+            grouped, spec, pair = ArrowAggExpr(named_ir).to_native(grouped)
+            native_specs.append(spec)
+            rename_pairs.append(pair)
+        # NOTE(review): two aggs sharing a native name collapse to one entry in `dict(...)` - confirm intended
+        aggregated = self.compliant._with_native(grouped.aggregate(native_specs))
+        return aggregated.rename(dict(rename_pairs))
diff --git a/narwhals/_plan/group_by.py b/narwhals/_plan/group_by.py
@@ -67,14 +67,7 @@ def agg(self, *aggs: OneOrIterable[IntoExpr], **named_aggs: IntoExpr) -> DataFra
         else:  # noqa: RET506
             # If not, we can just use the resolved key names as a fast-path
             grouped = compliant_gb.by_names(compliant, resolved.keys_names)
-        msg = fmt_group_by_error(
-            "`GroupBy.agg` needs a `CompliantGroupBy.agg` to dispatch to",
-            resolved.keys,
-            resolved.aggs,
-            resolved.result_schema,
-        )
-        raise NotImplementedError(msg)
-        return grouped.agg(resolved.aggs)
+        return self._frame._from_compliant(grouped.agg(resolved.aggs))
 
 
 class _TempGroupByStuff(NamedTuple):