refactor: Implement PandasLikeGroupBy?

dangotbanned · dangotbanned · commit 96376621b7ba · 2025-03-19T21:11:41.000Z
- Having a hard time working out what is going on here
- All I've changed is what the refs are named
diff --git a/narwhals/_arrow/group_by.py b/narwhals/_arrow/group_by.py
@@ -45,6 +45,7 @@ def __init__(
         self,
         compliant_frame: ArrowDataFrame,
         keys: Sequence[str],
+        /,
         *,
         drop_null_keys: bool,
     ) -> None:
diff --git a/narwhals/_compliant/group_by.py b/narwhals/_compliant/group_by.py
@@ -36,6 +36,7 @@ def __init__(
         self,
         compliant_frame: CompliantFrameT_co,
         keys: Sequence[str],
+        /,
         *,
         drop_null_keys: bool,
     ) -> None: ...
diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py
@@ -550,7 +550,11 @@ def over(
         order_by: Sequence[str] | None,
     ) -> Self:
         # pandas is a required dependency of dask so it's safe to import this
-        from narwhals._pandas_like.group_by import AGGREGATIONS_TO_PANDAS_EQUIVALENT
+        from narwhals._pandas_like.group_by import PandasLikeGroupBy
+
+        AGGREGATIONS_TO_PANDAS_EQUIVALENT = (  # noqa: N806
+            PandasLikeGroupBy._NARWHALS_TO_NATIVE_AGGREGATIONS
+        )
 
         if not partition_by:
             assert order_by is not None  # help type checkers  # noqa: S101
diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py
@@ -580,11 +580,7 @@ def collect(
     def group_by(self: Self, *keys: str, drop_null_keys: bool) -> PandasLikeGroupBy:
         from narwhals._pandas_like.group_by import PandasLikeGroupBy
 
-        return PandasLikeGroupBy(
-            self,
-            list(keys),
-            drop_null_keys=drop_null_keys,
-        )
+        return PandasLikeGroupBy(self, keys, drop_null_keys=drop_null_keys)
 
     def join(
         self: Self,
diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py
@@ -11,7 +11,7 @@
 from narwhals._expression_parsing import ExprKind
 from narwhals._expression_parsing import evaluate_output_names_and_aliases
 from narwhals._expression_parsing import is_elementary_expression
-from narwhals._pandas_like.group_by import AGGREGATIONS_TO_PANDAS_EQUIVALENT
+from narwhals._pandas_like.group_by import PandasLikeGroupBy
 from narwhals._pandas_like.series import PandasLikeSeries
 from narwhals.exceptions import ColumnNotFoundError
 from narwhals.utils import generate_temporary_column_name
@@ -223,6 +223,9 @@ def func(df: PandasLikeDataFrame) -> Sequence[PandasLikeSeries]:
             )
             raise NotImplementedError(msg)
         else:
+            AGGREGATIONS_TO_PANDAS_EQUIVALENT = (  # noqa: N806
+                PandasLikeGroupBy._NARWHALS_TO_NATIVE_AGGREGATIONS
+            )
             function_name: str = re.sub(r"(\w+->)", "", self._function_name)
             pandas_function_name = WINDOW_FUNCTIONS_TO_PANDAS_EQUIVALENT.get(
                 function_name,
diff --git a/narwhals/_pandas_like/group_by.py b/narwhals/_pandas_like/group_by.py
@@ -5,8 +5,12 @@
 import warnings
 from typing import TYPE_CHECKING
 from typing import Any
+from typing import ClassVar
 from typing import Iterator
+from typing import Mapping
+from typing import Sequence
 
+from narwhals._compliant import EagerGroupBy
 from narwhals._expression_parsing import evaluate_output_names_and_aliases
 from narwhals._expression_parsing import is_elementary_expression
 from narwhals._pandas_like.utils import horizontal_concat
@@ -22,39 +26,44 @@
     from narwhals._pandas_like.dataframe import PandasLikeDataFrame
     from narwhals._pandas_like.expr import PandasLikeExpr
 
-AGGREGATIONS_TO_PANDAS_EQUIVALENT = {
-    "sum": "sum",
-    "mean": "mean",
-    "median": "median",
-    "max": "max",
-    "min": "min",
-    "std": "std",
-    "var": "var",
-    "len": "size",
-    "n_unique": "nunique",
-    "count": "count",
-}
 
+class PandasLikeGroupBy(EagerGroupBy["PandasLikeDataFrame", "PandasLikeExpr"]):
+    _NARWHALS_TO_NATIVE_AGGREGATIONS: ClassVar[Mapping[str, Any]] = {
+        "sum": "sum",
+        "mean": "mean",
+        "median": "median",
+        "max": "max",
+        "min": "min",
+        "std": "std",
+        "var": "var",
+        "len": "size",
+        "n_unique": "nunique",
+        "count": "count",
+    }
 
-class PandasLikeGroupBy:
     def __init__(
-        self: Self, df: PandasLikeDataFrame, keys: list[str], *, drop_null_keys: bool
+        self: Self,
+        df: PandasLikeDataFrame,
+        keys: Sequence[str],
+        /,
+        *,
+        drop_null_keys: bool,
     ) -> None:
-        self._df = df
-        self._keys = keys
+        self._compliant_frame = df
+        self._keys: list[str] = list(keys)
         # Drop index to avoid potential collisions:
         # https://github.com/narwhals-dev/narwhals/issues/1907.
-        if set(df._native_frame.index.names).intersection(df.columns):
-            native_frame = df._native_frame.reset_index(drop=True)
+        if set(df.native.index.names).intersection(df.columns):
+            native_frame = df.native.reset_index(drop=True)
         else:
-            native_frame = df._native_frame
+            native_frame = df.native
         if (
-            self._df._implementation is Implementation.PANDAS
-            and self._df._backend_version < (1, 1)
+            self.compliant._implementation is Implementation.PANDAS
+            and self.compliant._backend_version < (1, 1)
         ):  # pragma: no cover
             if (
                 not drop_null_keys
-                and self._df.simple_select(*self._keys)._native_frame.isna().any().any()
+                and self.compliant.simple_select(*self._keys).native.isna().any().any()
             ):
                 msg = "Grouping by null values is not supported in pandas < 1.1.0"
                 raise NotImplementedError(msg)
@@ -74,19 +83,21 @@ def __init__(
             )
 
     def agg(self: Self, *exprs: PandasLikeExpr) -> PandasLikeDataFrame:  # noqa: PLR0915
-        implementation = self._df._implementation
-        backend_version = self._df._backend_version
+        implementation = self.compliant._implementation
+        backend_version = self.compliant._backend_version
         new_names: list[str] = self._keys.copy()
 
         all_aggs_are_simple = True
         for expr in exprs:
-            _, aliases = evaluate_output_names_and_aliases(expr, self._df, self._keys)
+            _, aliases = evaluate_output_names_and_aliases(
+                expr, self.compliant, self._keys
+            )
             new_names.extend(aliases)
 
             if not (
                 is_elementary_expression(expr)
                 and re.sub(r"(\w+->)", "", expr._function_name)
-                in AGGREGATIONS_TO_PANDAS_EQUIVALENT
+                in self._NARWHALS_TO_NATIVE_AGGREGATIONS
             ):
                 all_aggs_are_simple = False
 
@@ -111,11 +122,11 @@ def agg(self: Self, *exprs: PandasLikeExpr) -> PandasLikeDataFrame:  # noqa: PLR
         if all_aggs_are_simple:
             for expr in exprs:
                 output_names, aliases = evaluate_output_names_and_aliases(
-                    expr, self._df, self._keys
+                    expr, self.compliant, self._keys
                 )
                 if expr._depth == 0:
                     # e.g. agg(nw.len()) # noqa: ERA001
-                    function_name = AGGREGATIONS_TO_PANDAS_EQUIVALENT.get(
+                    function_name = self._NARWHALS_TO_NATIVE_AGGREGATIONS.get(
                         expr._function_name, expr._function_name
                     )
                     simple_aggs_functions.add(function_name)
@@ -128,7 +139,7 @@ def agg(self: Self, *exprs: PandasLikeExpr) -> PandasLikeDataFrame:  # noqa: PLR
 
                 # e.g. agg(nw.mean('a')) # noqa: ERA001
                 function_name = re.sub(r"(\w+->)", "", expr._function_name)
-                function_name = AGGREGATIONS_TO_PANDAS_EQUIVALENT.get(
+                function_name = self._NARWHALS_TO_NATIVE_AGGREGATIONS.get(
                     function_name, function_name
                 )
 
@@ -247,17 +258,17 @@ def agg(self: Self, *exprs: PandasLikeExpr) -> PandasLikeDataFrame:  # noqa: PLR
                 )
             else:
                 # No aggregation provided
-                result = self._df.__native_namespace__().DataFrame(
+                result = self.compliant.__native_namespace__().DataFrame(
                     list(self._grouped.groups.keys()), columns=self._keys
                 )
             # Keep inplace=True to avoid making a redundant copy.
             # This may need updating, depending on https://github.com/pandas-dev/pandas/pull/51466/files
             result.reset_index(inplace=True)  # noqa: PD002
-            return self._df._from_native_frame(
+            return self.compliant._from_native_frame(
                 select_columns_by_name(result, new_names, backend_version, implementation)
             )
 
-        if self._df._native_frame.empty:
+        if self.compliant.native.empty:
             # Don't even attempt this, it's way too inconsistent across pandas versions.
             msg = (
                 "No results for group-by aggregation.\n\n"
@@ -285,9 +296,9 @@ def func(df: Any) -> Any:
             out_group = []
             out_names = []
             for expr in exprs:
-                results_keys = expr(self._df._from_native_frame(df))
+                results_keys = expr(self.compliant._from_native_frame(df))
                 for result_keys in results_keys:
-                    out_group.append(result_keys._native_series.iloc[0])
+                    out_group.append(result_keys.native.iloc[0])
                     out_names.append(result_keys.name)
             return native_series_from_iterable(
                 out_group,
@@ -305,7 +316,7 @@ def func(df: Any) -> Any:
         # This may need updating, depending on https://github.com/pandas-dev/pandas/pull/51466/files
         result_complex.reset_index(inplace=True)  # noqa: PD002
 
-        return self._df._from_native_frame(
+        return self.compliant._from_native_frame(
             select_columns_by_name(
                 result_complex, new_names, backend_version, implementation
             )
@@ -319,4 +330,4 @@ def __iter__(self: Self) -> Iterator[tuple[Any, PandasLikeDataFrame]]:
                 category=FutureWarning,
             )
             for key, group in self._grouped:
-                yield (key, self._df._from_native_frame(group))
+                yield (key, self.compliant._from_native_frame(group))