1486 add summarise method to groupby (#1488)

samukweku · samuel.oranyeli · web-flow · commit 9e7cb25d5f72 · 2025-07-27T20:29:30.000+10:00
* add support for groupby.summarise

* add tests

* remove irrelevant functions

---------

Co-authored-by: samuel.oranyeli <samuel.oranyeli@grow.inc>
diff --git a/janitor/functions/summarise.py b/janitor/functions/summarise.py
@@ -14,9 +14,10 @@
 from janitor.functions.select import get_index_labels
 
 
+@pf.register_groupby_method
 @pf.register_dataframe_method
 def summarise(
-    df: pd.DataFrame,
+    df: pd.DataFrame | DataFrameGroupBy,
     *args: tuple[dict | tuple],
     by: Any = None,
 ) -> pd.DataFrame:
@@ -107,6 +108,8 @@ def summarise(
     Arguments supported in `pd.DataFrame.groupby`
     can also be passed to `by` via a dictionary.
 
+    If `df` is a `DataFrameGroupBy` object, `by` is ignored.
+
     Examples:
         >>> import pandas as pd
         >>> import janitor
@@ -160,7 +163,7 @@ def summarise(
         103202            4.0
 
     Args:
-        df: A pandas DataFrame.
+        df: A pandas DataFrame or DataFrameGroupBy object.
         args: Either a dictionary or a tuple.
         by: Column(s) to group by.
 
@@ -171,8 +174,10 @@ def summarise(
         A pandas DataFrame with aggregated columns.
 
     """  # noqa: E501
-
-    if by is not None:
+    if isinstance(df, DataFrameGroupBy):
+        by = df
+        df = df.obj
+    elif by is not None:
         # it is assumed that by is created from df
         # onus is on user to ensure that
         if isinstance(by, DataFrameGroupBy):
@@ -233,7 +238,7 @@ def _aggfunc(arg, df, by):
         val = df
     else:
         val = by
-    outcome = _process_maybe_callable(func=arg, obj=val)
+    outcome = apply_if_callable(maybe_callable=arg, obj=val)
     if isinstance(outcome, pd.Series):
         if not outcome.name:
             raise ValueError("Ensure the pandas Series object has a name")
@@ -270,10 +275,11 @@ def _(arg, df, by):
             if len(aggfunc) != 2:
                 raise ValueError("the tuple has to be a length of 2")
             column, func = aggfunc
-            column_ = _handle_tuple_groupby_selection(by=by, column=column)
-            column = _apply_func_to_obj(aggfunc=func, obj=val[column_])
-            if isinstance(column, pd.DataFrame) and column.shape[-1] == 1:
+            column = val.agg({column: func})
+            try:
                 column = column.squeeze()
+            except AttributeError:
+                pass
             column = _convert_obj_to_named_series(
                 obj=column,
                 column_name=column_name,
@@ -285,54 +291,20 @@ def _(arg, df, by):
                     f"instead got {type(column)}"
                 )
         else:
-            column_ = _handle_tuple_groupby_selection(
-                by=by, column=column_name
-            )
-            column = _apply_func_to_obj(aggfunc=aggfunc, obj=val[column_])
+            column = val.agg({column_name: aggfunc})
+            try:
+                column = column.squeeze()
+            except AttributeError:
+                pass
             column = _convert_obj_to_named_series(
                 obj=column,
                 column_name=column_name,
                 function=aggfunc,
             )
-        column = _rename_column_in_by(
-            column=column, column_name=column_name, by=by
-        )
         contents.append(column)
     return contents
 
 
-def _process_maybe_callable(func: callable, obj):
-    """Function to handle callables"""
-    try:
-        column = obj.agg(func)
-    except:  # noqa: E722
-        column = apply_if_callable(maybe_callable=func, obj=obj)
-    return column
-
-
-def _process_maybe_string(func: str, obj):
-    """Function to handle pandas string functions"""
-    # treat as a pandas approved string function
-    # https://pandas.pydata.org/docs/user_guide/groupby.html#built-in-aggregation-methods
-    return obj.agg(func)
-
-
-def _apply_func_to_obj(aggfunc, obj):
-    """Handle str/callables within a dictionary"""
-    if isinstance(aggfunc, str):
-        return _process_maybe_string(func=aggfunc, obj=obj)
-    return _process_maybe_callable(func=aggfunc, obj=obj)
-
-
-def _handle_tuple_groupby_selection(by: Any, column: Any):
-    """
-    Properly handle a tuple column selection in the presence of a groupby
-    """
-    if (by is not None) and isinstance(column, tuple):
-        return [column]
-    return column
-
-
 def _convert_obj_to_named_series(obj, function: Any, column_name: Any):
     if isinstance(obj, pd.Series):
         obj.name = column_name
@@ -344,12 +316,3 @@ def _convert_obj_to_named_series(obj, function: Any, column_name: Any):
     else:
         function_name = function.__name__
     return pd.Series(data=obj, index=[function_name], name=column_name)
-
-
-def _rename_column_in_by(column, column_name, by):
-    if by is None:
-        return column
-    elif isinstance(column, pd.DataFrame) and is_scalar(column_name):
-        columns = pd.MultiIndex.from_product([[column_name], column.columns])
-        column.columns = columns
-    return column
diff --git a/tests/functions/test_summarise.py b/tests/functions/test_summarise.py
@@ -77,6 +77,14 @@ def test_summarise_by_callable_grp(df_summarise):
     assert_frame_equal(actual, expected)
 
 
+def test_summarise_by_callable_grp_grouped(df_summarise):
+    """Test output for a callable"""
+    grp = df_summarise.groupby("combine_id")
+    actual = grp.summarise(lambda df: df.sum())
+    expected = grp.sum()
+    assert_frame_equal(actual, expected)
+
+
 def test_summarise_dict_df_str(df_summarise):
     """Test output for a dictionary"""
     actual = df_summarise.summarise({"avg_run": "mean"})
@@ -164,6 +172,13 @@ def test_summarise_by_tuple(df_summarise):
     assert_frame_equal(actual, expected)
 
 
+def test_summarise_by_tuple_grouped(df_summarise):
+    """Test output for a tuple"""
+    actual = df_summarise.groupby("combine_id").summarise(("avg_run", "mean"))
+    expected = df_summarise.groupby("combine_id").agg({"avg_run": "mean"})
+    assert_frame_equal(actual, expected)
+
+
 def test_summarise_tuple_df_callable(df_summarise):
     """Test output for a tuple"""
     actual = df_summarise.summarise(("avg_run", lambda df: df.sum()))
@@ -180,24 +195,12 @@ def test_summarise_tuple_by_callable(df_summarise):
     assert_frame_equal(actual, expected)
 
 
-def test_summarise_tuple_by_callable_dataframe(df_summarise):
-    """Test output for a tuple"""
-    actual = df_summarise.summarise(
-        ("avg_run", lambda df: df.agg(["sum", "mean"])), by="combine_id"
-    )
-    expected = df_summarise.groupby("combine_id").agg(
-        {"avg_run": ["sum", "mean"]}
-    )
-    assert_frame_equal(actual, expected)
-
-
-def test_summarise_tuple_grouped_object(df_summarise):
+def test_summarise_tuple_by_callable_grouped(df_summarise):
     """Test output for a tuple"""
-    grp = df_summarise.groupby("combine_id")
-    actual = df_summarise.summarise(
-        ("avg_run", lambda df: df.agg(["sum", "mean"])), by=grp
+    actual = df_summarise.groupby("combine_id").summarise(
+        ("avg_run", lambda df: df.sum())
     )
-    expected = grp.agg({"avg_run": ["sum", "mean"]})
+    expected = df_summarise.groupby("combine_id").agg({"avg_run": "sum"})
     assert_frame_equal(actual, expected)
 
 
@@ -273,6 +276,26 @@ def test_summarise_MI_different_levels_tuple(dfmi):
     assert_frame_equal(actual, expected)
 
 
+def test_summarise_MI_different_levels_tuple_grouped(dfmi):
+    """Test summarise on a MultiIndex"""
+    actual = dfmi.groupby(level="A").summarise(
+        {("a", "bar"): "sum", ("rar",): (("a", "foo"), "mean")},
+        ("b", "min"),
+    )
+    actual.columns = ["A", "B", "C", "D"]
+    grp = dfmi.groupby(level="A")
+    expected = grp.agg(
+        {
+            ("a", "bar"): "sum",
+            ("a", "foo"): "mean",
+            ("b", "bah"): "min",
+            ("b", "foo"): "min",
+        }
+    )
+    expected.columns = ["A", "B", "C", "D"]
+    assert_frame_equal(actual, expected)
+
+
 def test_summarise_MI_different_levels_dataframe(dfmi):
     """raise if dictionary value is a tuple and the returned aggregate is a DataFrame"""
     with pytest.raises(TypeError, match="Expected a pandas Series object;.+"):