Skip to content

Commit 258baca

Browse files
committed
use only tuples
1 parent 23f1bda commit 258baca

File tree

3 files changed

+96
-122
lines changed

3 files changed

+96
-122
lines changed

janitor/functions/summarize.py

Lines changed: 56 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@
66
from janitor.utils import check
77
from pandas.api.types import is_scalar
88

9-
from janitor.functions.utils import _select_index, SD
10-
from collections import Counter
9+
from janitor.functions.utils import SD, _process_SD
10+
from itertools import product
1111

1212

1313
@pf.register_dataframe_method
@@ -17,7 +17,7 @@ def summarize(
1717
by: Any = None,
1818
) -> pd.DataFrame:
1919
"""
20-
Reduction operation on columns via a dictionary or a tuple.
20+
Reduction operation on columns via a tuple.
2121
2222
It is a wrapper around `pd.DataFrame.agg`,
2323
with added flexibility for multiple columns.
@@ -28,8 +28,7 @@ def summarize(
2828
for the entire dataframe,
2929
or a row per group, if `by` is present.
3030
31-
If the variable argument is a tuple,
32-
it has to be of the form `(columns, func, names_glue)`;
31+
The argument should be of the form `(columns, func, names_glue)`;
3332
the `names_glue` argument is optional.
3433
`columns` can be selected with the
3534
[`select_columns`][janitor.functions.select.select_columns]
@@ -48,7 +47,7 @@ def summarize(
4847
of passing tuples to the `summarize` function.
4948
5049
51-
Example - Summarize with a dictionary:
50+
Example:
5251
5352
>>> import pandas as pd
5453
>>> import numpy as np
@@ -62,9 +61,7 @@ def summarize(
6261
... 'combine_id': [100200, 100200, 101200, 101200, 102201, 103202],
6362
... 'category': ['heats', 'heats', 'finals', 'finals', 'heats', 'finals']}
6463
>>> df = pd.DataFrame(data)
65-
>>> (df
66-
... .summarize({"avg_run":"mean"}, by=['combine_id', 'category'])
67-
... )
64+
>>> df.summarize(("avg_run", "mean"), by=['combine_id', 'category'])
6865
avg_run
6966
combine_id category
7067
100200 heats 3.5
@@ -74,18 +71,18 @@ def summarize(
7471
7572
Summarize with a new column name:
7673
77-
>>> df.summarize({"avg_run_2":df.avg_run.mean()})
74+
>>> df.summarize(("avg_run", "mean", "avg_run_2"))
7875
avg_run_2
7976
0 2.833333
80-
>>> df.summarize({"avg_run_2":lambda f: f.avg_run.mean()}, by=['combine_id', 'category'])
77+
>>> df.summarize(("avg_run", "mean", "avg_run_2"), by=['combine_id', 'category'])
8178
avg_run_2
8279
combine_id category
8380
100200 heats 3.5
8481
101200 finals 2.0
8582
102201 heats 2.0
8683
103202 finals 4.0
8784
88-
Summarize with a tuple:
85+
Summarize with the placeholders in `names_glue`:
8986
9087
>>> cols = jn.SD(columns="avg*", func="mean", names_glue="{_col}_{_fn}")
9188
>>> df.summarize(cols)
@@ -100,14 +97,15 @@ def summarize(
10097
103202 finals 4.0 4.0 4.0
10198
10299
:param df: A pandas DataFrame.
103-
:param args: Either a dictionary or a tuple.
100+
:param args: A tuple.
104101
:param by: Column(s) to group by.
105-
:raises ValueError: If a tuple is passed and the length is not 3.
102+
:raises ValueError: If the tuple size is less than 2 or greater than 3.
106103
:returns: A pandas DataFrame with summarized columns.
107104
""" # noqa: E501
108105

106+
args_to_process = []
109107
for num, arg in enumerate(args):
110-
check(f"Argument {num} in the summarize function", arg, [dict, tuple])
108+
check(f"Argument {num} in the summarize function", arg, [tuple])
111109
if isinstance(arg, tuple):
112110
if len(arg) < 2:
113111
raise ValueError(
@@ -119,28 +117,31 @@ def summarize(
119117
f"Argument {num} should have a maximum length of 3, "
120118
f"instead got {len(arg)}"
121119
)
122-
_, func, names = SD(*arg)
123-
check(
124-
f"The function (position 1 in the tuple) for argument {num} ",
125-
func,
126-
[str, callable, list, tuple],
127-
)
128-
if isinstance(func, (list, tuple)):
129-
for number, funcn in enumerate(func):
130-
check(
131-
f"Entry {number} in the function sequence "
132-
f"for argument {num}",
133-
funcn,
134-
[str, callable],
135-
)
136-
137-
if names:
120+
entry = SD(*arg)
121+
func = entry.func
122+
names = entry.names_glue
123+
check(
124+
f"The function (position 1 in the tuple) for argument {num} ",
125+
func,
126+
[str, callable, list, tuple],
127+
)
128+
if isinstance(func, (list, tuple)):
129+
for number, funcn in enumerate(func):
138130
check(
139-
f"The names (position 2 in the tuple) for argument {num} ",
140-
names,
141-
[str],
131+
f"Entry {number} in the function sequence "
132+
f"for argument {num}",
133+
funcn,
134+
[str, callable],
142135
)
143136

137+
if names:
138+
check(
139+
f"The names (position 2 in the tuple) for argument {num} ",
140+
names,
141+
[str],
142+
)
143+
args_to_process.append(entry)
144+
144145
by_is_true = by is not None
145146
grp = None
146147
if by_is_true and isinstance(by, dict):
@@ -150,70 +151,25 @@ def summarize(
150151

151152
aggs = {}
152153

153-
for arg in args:
154-
if isinstance(arg, dict):
155-
for col, func in arg.items():
156-
val = grp if by_is_true else df
157-
if isinstance(func, str):
158-
outcome = val[col].agg(func)
159-
elif is_scalar(func):
160-
outcome = func
161-
else:
162-
try:
163-
outcome = val.agg(func)
164-
except (ValueError, AttributeError):
165-
outcome = func(val)
166-
aggs[col] = outcome
167-
else:
168-
columns, func, names = SD(*arg)
169-
columns = _select_index([columns], df, axis="columns")
170-
columns = df.columns[columns]
171-
if not isinstance(func, (list, tuple)):
172-
func = [func]
173-
func_names = [
174-
funcn.__name__ if callable(funcn) else funcn for funcn in func
175-
]
176-
counts = None
177-
dupes = set()
178-
if len(func) > 1:
179-
counts = Counter(func_names)
180-
counts = {key: 0 for key, value in counts.items() if value > 1}
181-
# deal with duplicate function names
182-
if counts:
183-
func_list = []
184-
for funcn in func_names:
185-
if funcn in counts:
186-
if names:
187-
name = f"{funcn}{counts[funcn]}"
188-
else:
189-
name = f"{counts[funcn]}"
190-
dupes.add(name)
191-
func_list.append(name)
192-
counts[funcn] += 1
193-
else:
194-
func_list.append(funcn)
195-
func_names = func_list
196-
counts = None
197-
func_names = tuple(zip(func_names, func))
198-
for col in columns:
199-
val = grp[col] if by_is_true else df[col]
200-
for name, funcn in func_names:
201-
if names:
202-
name = names.format(_col=col, _fn=name)
203-
elif name in dupes:
204-
name = f"{col}{name}"
205-
else:
206-
name = col
207-
if isinstance(funcn, str):
208-
outcome = val.agg(funcn)
209-
else:
210-
try:
211-
outcome = val.agg(funcn)
212-
except (ValueError, AttributeError):
213-
outcome = funcn(val)
214-
aggs[name] = outcome
215-
aggs = {
216-
col: [outcome] if is_scalar(outcome) else outcome
217-
for col, outcome in aggs.items()
218-
}
154+
for arg in args_to_process:
155+
columns, names, func_names_and_func, dupes = _process_SD(df, arg)
156+
for col, (name, funcn) in product(columns, func_names_and_func):
157+
val = grp[col] if by_is_true else df[col]
158+
if names:
159+
name = names.format(_col=col, _fn=name)
160+
elif name in dupes:
161+
name = f"{col}{name}"
162+
else:
163+
name = col
164+
if isinstance(funcn, str):
165+
outcome = val.agg(funcn)
166+
else:
167+
try:
168+
outcome = val.agg(funcn)
169+
except (ValueError, AttributeError):
170+
outcome = funcn(val)
171+
if is_scalar(outcome):
172+
outcome = [outcome]
173+
aggs[name] = outcome
174+
219175
return pd.DataFrame(aggs, copy=False)

janitor/functions/utils.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from pandas.core.dtypes.generic import ABCPandasArray, ABCExtensionArray
2020
from pandas.core.common import is_bool_indexer
2121
from dataclasses import dataclass
22+
from collections import Counter
2223

2324
import pandas as pd
2425
from janitor.utils import check, _expand_grid
@@ -625,3 +626,41 @@ class SD(NamedTuple):
625626
columns: Any
626627
func: Optional[Union[str, Callable, list, tuple]]
627628
names_glue: Optional[str] = None
629+
630+
631+
def _process_SD(df, arg):
    """
    Process an `SD` entry for use in `mutate` or `summarize`.

    Resolves the column selection of `arg` against `df`, normalizes
    `arg.func` into a sequence, and derives a label for every function
    (the function's `__name__` for callables, the value itself otherwise).
    Labels that occur more than once are disambiguated with a running
    counter so that each (column, function) pair can get a unique
    output name.

    :param df: A pandas DataFrame.
    :param arg: An `SD` named tuple with the fields
        `columns`, `func` and `names_glue`.
    :returns: A 4-tuple `(columns, names, func_names_and_func, dupes)`:
        `columns` is the selected subset of `df.columns`;
        `names` is `arg.names_glue` unchanged;
        `func_names_and_func` is a tuple of `(label, function)` pairs;
        `dupes` is the set of labels that were generated
        to disambiguate repeated function names.
    """
    names = arg.names_glue

    # NOTE(review): `_select_index` is defined elsewhere in this module;
    # presumably it returns an indexer usable on `df.columns` - confirm.
    selection = _select_index([arg.columns], df, axis="columns")
    columns = df.columns[selection]

    func = arg.func
    if not isinstance(func, (list, tuple)):
        func = [func]
    func_names = [
        funcn.__name__ if callable(funcn) else funcn for funcn in func
    ]

    dupes = set()
    counts = {}
    if len(func) > 1:
        # Track only the labels that appear more than once; the value is
        # the next suffix to hand out for that label.
        counts = {
            key: 0 for key, value in Counter(func_names).items() if value > 1
        }

    # deal with duplicate function names
    if counts:
        labels = []
        for funcn in func_names:
            if funcn not in counts:
                labels.append(funcn)
                continue
            position = counts[funcn]
            # With a names_glue the label keeps the function name so that
            # `{_fn}` stays readable; without it only the counter is kept,
            # and the caller prefixes the column name.
            name = f"{funcn}{position}" if names else f"{position}"
            dupes.add(name)
            labels.append(name)
            counts[funcn] = position + 1
        func_names = labels

    # Materialize the pairs: a bare `zip` is a one-shot iterator and would
    # be empty on a second pass (e.g. when a caller loops over the pairs
    # once per column).
    return columns, names, tuple(zip(func_names, func)), dupes

tests/functions/test_summarize.py

Lines changed: 1 addition & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -6,27 +6,9 @@
66
from pandas.api.types import is_numeric_dtype
77

88

9-
@pytest.mark.functions
10-
def test_dict_args_error(dataframe):
11-
"""Raise if arg is not a dict/tuple"""
12-
with pytest.raises(
13-
TypeError, match="Argument 0 in the summarize function.+"
14-
):
15-
dataframe.summarize(1)
16-
17-
189
func = lambda grp: grp.Revenue.sum() / grp.Quantity.sum() # noqa: E731
1910

2011

21-
@pytest.mark.functions
22-
def test_dict_agg_error(dataframe):
23-
"""
24-
Raise if func triggers an attributeerror/valueerror
25-
"""
26-
with pytest.raises(AttributeError):
27-
dataframe.summarize({"a": func})
28-
29-
3012
@pytest.mark.functions
3113
def test_tuple_agg_error(dataframe):
3214
"""
@@ -84,12 +66,9 @@ def test_tuple_func_seq_error(dataframe):
8466

8567

8668
args = [
87-
{"a": "sum"},
88-
{"a": pd.Series([1, 2, 3] * 3).sum()},
89-
{"a": lambda df: df.a.sum()},
69+
("a", lambda df: df.sum()),
9070
("a", "sum"),
9171
("a", np.sum),
92-
{"a": lambda f: np.sum(f.a)},
9372
]
9473

9574

0 commit comments

Comments
 (0)