[ENH] faster groupby top k (#1101)

samukweku · ericmjl · web-flow · commit 6cd5ef2ac508 · 2022-05-30T07:57:53.000-04:00
* skeleton code

* fix tests, docstrings, examples

* changelog

* minor edits

* docstrings for tests

* allow by to be a list

* updates based on feedback

Co-authored-by: Eric Ma <ericmjl@users.noreply.github.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,7 @@
 -   [ENH] Allow column selection/renaming within conditional_join. #1102 @samukweku.
 -   [ENH] New decorator `deprecated_kwargs` for breaking API. #1103 @Zeroto521
 -   [ENH] Extend select_columns to support non-string columns. #1105 @samukweku
+-   [ENH] Performance improvement for groupby_topk. #1093 @samukweku
 
 ## [v0.23.1] - 2022-05-03
 
diff --git a/janitor/functions/groupby_topk.py b/janitor/functions/groupby_topk.py
@@ -1,99 +1,124 @@
 """Implementation of the `groupby_topk` function"""
-from typing import Dict, Hashable
+from typing import Hashable, Union
 import pandas_flavor as pf
 import pandas as pd
 
 from janitor.utils import check_column
+from janitor.utils import check, deprecated_alias
 
 
 @pf.register_dataframe_method
+@deprecated_alias(groupby_column_name="by", sort_column_name="column")
 def groupby_topk(
     df: pd.DataFrame,
-    groupby_column_name: Hashable,
-    sort_column_name: Hashable,
+    by: Union[list, Hashable],
+    column: Hashable,
     k: int,
-    sort_values_kwargs: Dict = None,
+    dropna: bool = True,
+    ascending: bool = True,
+    ignore_index: bool = True,
 ) -> pd.DataFrame:
     """
     Return top `k` rows from a groupby of a set of columns.
 
-    Returns a DataFrame that has the top `k` values grouped by `groupby_column_name`
-    and sorted by `sort_column_name`.
-    Additional parameters to the sorting (such as `ascending=True`)
-    can be passed using `sort_values_kwargs`.
+    Returns a DataFrame that has the top `k` values per `column`,
+    grouped by `by`. Under the hood it uses `nlargest`/`nsmallest`
+    for numeric columns, which avoids sorting the entire dataframe
+    and is usually more performant. For non-numeric columns,
+    `pd.Series.sort_values` is used.
+    No sorting is done to the `by` column(s); the order is maintained
+    in the final output.
 
-    List of all sort_values() parameters can be found
-    [here](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sort_values.html).
 
     Example:
 
         >>> import pandas as pd
         >>> import janitor
-        >>> df = pd.DataFrame({
-        ...     "age": [20, 23, 22, 43, 21],
-        ...     "id": [1, 4, 6, 2, 5],
-        ...     "result": ["pass", "pass", "fail", "pass", "fail"]
-        ... })
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "age": [20, 23, 22, 43, 21],
+        ...         "id": [1, 4, 6, 2, 5],
+        ...         "result": ["pass", "pass", "fail", "pass", "fail"],
+        ...     }
+        ... )
         >>> df
            age  id result
         0   20   1   pass
         1   23   4   pass
         2   22   6   fail
         3   43   2   pass
         4   21   5   fail
-        >>> df.groupby_topk('result', 'age', 3) # Ascending top 3
-        ... # doctest: +NORMALIZE_WHITESPACE
-                  age  id result
-        result
-        fail   4   21   5   fail
-               2   22   6   fail
-        pass   0   20   1   pass
-               1   23   4   pass
-               3   43   2   pass
-        >>> df.groupby_topk('result', 'age', 2, {'ascending':False}) # Descending top 2
-        ... # doctest: +NORMALIZE_WHITESPACE
-                  age  id result
-        result
-        fail   2   22   6   fail
-               4   21   5   fail
-        pass   3   43   2   pass
-               1   23   4   pass
+
+    Ascending top 3:
+
+        >>> df.groupby_topk(by="result", column="age", k=3)
+           age  id result
+        0   20   1   pass
+        1   23   4   pass
+        2   43   2   pass
+        3   21   5   fail
+        4   22   6   fail
+
+    Descending top 2:
+
+        >>> df.groupby_topk(
+        ...     by="result", column="age", k=2, ascending=False, ignore_index=False
+        ... )
+           age  id result
+        3   43   2   pass
+        1   23   4   pass
+        2   22   6   fail
+        4   21   5   fail
 
 
     :param df: A pandas DataFrame.
-    :param groupby_column_name: Column name to group input DataFrame `df` by.
-    :param sort_column_name: Name of the column to sort along the
-        input DataFrame `df`.
-    :param k: Number of top rows to return from each group after sorting.
-    :param sort_values_kwargs: Arguments to be passed to sort_values function.
-    :returns: A pandas DataFrame with top `k` rows that are grouped by
-        `groupby_column_name` column with each group sorted along the
-        column `sort_column_name`.
+    :param by: Column name(s) to group input DataFrame `df` by.
+    :param column: Name of the column that determines `k` rows
+        to return.
+    :param k: Number of top rows to return for each group.
+    :param dropna: If `True`, and `NA` values exist in `by`, the `NA`
+        values are not used in the groupby computation to get the relevant
+        `k` rows. If `False`, and `NA` values exist in `by`, then the `NA`
+        values are used in the groupby computation to get the relevant
+        `k` rows. The default is `True`.
+    :param ascending: Default is `True`. If `True`, the smallest top `k` rows,
+        determined by `column` are returned; if `False`, the largest top `k` rows,
+        determined by `column` are returned.
+    :param ignore_index: Default `True`. If `True`,
+        the original index is ignored. If `False`, the original index
+        for the top `k` rows is retained.
+    :returns: A pandas DataFrame with top `k` rows per `column`, grouped by `by`.
     :raises ValueError: if `k` is less than 1.
-    :raises ValueError: if `groupby_column_name` not in DataFrame `df`.
-    :raises ValueError: if `sort_column_name` not in DataFrame `df`.
-    :raises KeyError: if `inplace:True` is present in `sort_values_kwargs`.
     """  # noqa: E501
 
-    # Convert the default sort_values_kwargs from None to empty Dict
-    sort_values_kwargs = sort_values_kwargs or {}
+    if isinstance(by, Hashable):
+        by = [by]
 
-    # Check if groupby_column_name and sort_column_name exists in the DataFrame
-    check_column(df, [groupby_column_name, sort_column_name])
+    check("by", by, [Hashable, list])
+
+    check_column(df, [column])
+    check_column(df, by)
 
-    # Check if k is greater than 0.
     if k < 1:
         raise ValueError(
-            "Numbers of rows per group to be returned must be greater than 0."
+            "Numbers of rows per group "
+            "to be returned must be greater than 0."
         )
 
-    # Check if inplace:True in sort values kwargs because it returns None
-    if (
-        "inplace" in sort_values_kwargs.keys()
-        and sort_values_kwargs["inplace"]
-    ):
-        raise KeyError("Cannot use `inplace=True` in `sort_values_kwargs`.")
+    indices = df.groupby(by=by, dropna=dropna, sort=False, observed=True)
+    indices = indices[column]
+
+    try:
+        if ascending:
+            indices = indices.nsmallest(n=k)
+        else:
+            indices = indices.nlargest(n=k)
+    except TypeError:
+        indices = indices.apply(
+            lambda d: d.sort_values(ascending=ascending).head(k)
+        )
 
-    return df.groupby(groupby_column_name).apply(
-        lambda d: d.sort_values(sort_column_name, **sort_values_kwargs).head(k)
-    )
+    indices = indices.index.get_level_values(-1)
+    if ignore_index:
+        return df.loc[indices].reset_index(drop=True)
+    return df.loc[indices]
diff --git a/tests/functions/test_groupby_topk.py b/tests/functions/test_groupby_topk.py
@@ -5,6 +5,7 @@
 
 @pytest.fixture
 def df():
+    """fixture for groupby_topk"""
     return pd.DataFrame(
         [
             {"age": 22, "major": "science", "ID": 145, "result": "pass"},
@@ -17,44 +18,75 @@ def df():
     )
 
 
+def test_dtype_by(df):
+    """Check dtype for by."""
+    with pytest.raises(TypeError):
+        df.groupby_topk(by={"result"}, column="age", k=2)
+
+
 def test_ascending_groupby_k_2(df):
     """Test ascending group by, k=2"""
-    expected = df.groupby("result").apply(
-        lambda d: d.sort_values("age").head(2)
+    expected = (
+        df.groupby("result", sort=False)
+        .apply(lambda d: d.sort_values("age").head(2))
+        .droplevel(0)
+    )
+    assert_frame_equal(
+        df.groupby_topk("result", "age", 2, ignore_index=False), expected
+    )
+
+
+def test_ascending_groupby_non_numeric(df):
+    """Test output for non-numeric column"""
+    expected = (
+        df.groupby("result", sort=False)
+        .apply(lambda d: d.sort_values("major").head(2))
+        .droplevel(0)
+    )
+    assert_frame_equal(
+        df.groupby_topk("result", "major", 2, ignore_index=False), expected
     )
-    assert_frame_equal(df.groupby_topk("result", "age", 2), expected)
 
 
 def test_descending_groupby_k_3(df):
     """Test descending group by, k=3"""
-    expected = df.groupby("result").apply(
-        lambda d: d.sort_values("age", ascending=False).head(3)
+    expected = (
+        df.groupby("result", sort=False)
+        .apply(lambda d: d.sort_values("age", ascending=False).head(3))
+        .droplevel(0)
+        .reset_index(drop=True)
     )
     assert_frame_equal(
-        df.groupby_topk("result", "age", 3, {"ascending": False}), expected
+        df.groupby_topk("result", "age", 3, ascending=False), expected
     )
 
 
 def test_wrong_groupby_column_name(df):
     """Raise Value Error if wrong groupby column name is provided."""
-    with pytest.raises(ValueError):
+    with pytest.raises(
+        ValueError, match="RESULT not present in dataframe columns!"
+    ):
         df.groupby_topk("RESULT", "age", 3)
 
 
 def test_wrong_sort_column_name(df):
     """Raise Value Error if wrong sort column name is provided."""
-    with pytest.raises(ValueError):
+    with pytest.raises(
+        ValueError, match="Age not present in dataframe columns!"
+    ):
         df.groupby_topk("result", "Age", 3)
 
 
 def test_negative_k(df):
     """Raises Value Error if k is less than 1 (negative or 0)."""
-    with pytest.raises(ValueError):
+    with pytest.raises(
+        ValueError,
+        match="Numbers of rows per group.+",
+    ):
         df.groupby_topk("result", "age", -2)
-    with pytest.raises(ValueError):
-        df.groupby_topk("result", "age", 0)
 
 
+@pytest.mark.xfail(reason="sort_value_kwargs parameter deprecated.")
 def test_inplace(df):
     """Raise Key Error if inplace is True in sort_values_kwargs"""
     with pytest.raises(KeyError):