Skip to content

Commit 9f7e1b9

Browse files
committed
setup SD like an expression
1 parent c478a6e commit 9f7e1b9

File tree

3 files changed

+253
-237
lines changed

3 files changed

+253
-237
lines changed

janitor/functions/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,8 @@
6868
from .shuffle import shuffle
6969
from .sort_column_value_order import sort_column_value_order
7070
from .sort_naturally import sort_naturally
71-
from .summarize import summarize
71+
72+
# from .summarize import summarize
7273
from .take_first import take_first
7374
from .then import then
7475
from .to_datetime import to_datetime

janitor/functions/summarize.py

Lines changed: 186 additions & 184 deletions
Original file line numberDiff line numberDiff line change
@@ -1,184 +1,186 @@
1-
"""Alternative function to pd.agg for summarizing data."""
2-
from typing import Any
3-
import pandas as pd
4-
import pandas_flavor as pf
5-
6-
from janitor.utils import check
7-
from pandas.api.types import is_scalar
8-
9-
from janitor.functions.utils import SD, _process_SD
10-
from itertools import product
11-
12-
13-
@pf.register_dataframe_method
14-
def summarize(
15-
df: pd.DataFrame,
16-
*args,
17-
by: Any = None,
18-
) -> pd.DataFrame:
19-
"""
20-
21-
!!! info "New in version 0.25.0"
22-
23-
!!! note
24-
25-
Before reaching for `summarize`, try `pd.DataFrame.agg`.
26-
27-
Reduction operation on columns via a tuple.
28-
29-
It is a wrapper around `pd.DataFrame.agg`,
30-
with added flexibility for multiple columns.
31-
32-
The argument should be of the form `(columns, func, names_glue)`;
33-
the `names_glue` argument is optional.
34-
`columns` can be selected with the
35-
[`select_columns`][janitor.functions.select.select_columns]
36-
syntax for flexibility.
37-
The function `func` should be a string
38-
(which is dispatched to `pd.Series.agg`),
39-
or a callable, or a list/tuple of strings/callables.
40-
The function is called on each column in `columns`.
41-
42-
The `names_glue` argument allows for renaming, especially for
43-
multiple columns or multiple functions.
44-
The placeholders for `names_glue` are `_col`, which represents
45-
the column name, and `_fn` which represents the function name.
46-
Under the hood, it uses python's `str.format` method.
47-
48-
`janitor.SD` offers a more explicit form
49-
of passing tuples to the `summarize` function.
50-
51-
`by` accepts a label, labels, mapping or function.
52-
Arguments supported in `pd.DataFrame.groupby`
53-
can also be passed to `by` via a dictionary.
54-
55-
56-
Example:
57-
58-
>>> import pandas as pd
59-
>>> import numpy as np
60-
>>> import janitor as jn
61-
>>> data = {'avg_jump': [3, 4, 1, 2, 3, 4],
62-
... 'avg_run': [3, 4, 1, 3, 2, 4],
63-
... 'avg_swim': [2, 1, 2, 2, 3, 4],
64-
... 'combine_id': [100200, 100200,
65-
... 101200, 101200,
66-
... 102201, 103202],
67-
... 'category': ['heats', 'heats',
68-
... 'finals', 'finals',
69-
... 'heats', 'finals']}
70-
>>> df = pd.DataFrame(data)
71-
>>> df.summarize(("avg_run", "mean"), by=['combine_id', 'category'])
72-
avg_run
73-
combine_id category
74-
100200 heats 3.5
75-
101200 finals 2.0
76-
102201 heats 2.0
77-
103202 finals 4.0
78-
79-
Summarize with a new column name:
80-
81-
>>> df.summarize(("avg_run", "mean", "avg_run_2"))
82-
avg_run_2
83-
0 2.833333
84-
>>> df.summarize(("avg_run", "mean", "avg_run_2"), by=['combine_id', 'category'])
85-
avg_run_2
86-
combine_id category
87-
100200 heats 3.5
88-
101200 finals 2.0
89-
102201 heats 2.0
90-
103202 finals 4.0
91-
92-
Summarize with the placeholders in `names_glue`:
93-
94-
>>> cols = jn.SD(columns="avg*", func="mean", names_glue="{_col}_{_fn}")
95-
>>> df.summarize(cols)
96-
avg_jump_mean avg_run_mean avg_swim_mean
97-
0 2.833333 2.833333 2.333333
98-
>>> df.summarize(cols, by=['combine_id', 'category'])
99-
avg_jump_mean avg_run_mean avg_swim_mean
100-
combine_id category
101-
100200 heats 3.5 3.5 1.5
102-
101200 finals 1.5 2.0 2.0
103-
102201 heats 3.0 2.0 3.0
104-
103202 finals 4.0 4.0 4.0
105-
106-
:param df: A pandas DataFrame.
107-
:param args: One or more tuples of the form `(columns, func, names_glue)`; `names_glue` is optional.
108-
:param by: Column(s) to group by.
109-
:raises ValueError: If the tuple size is less than 2 or greater than 3.
110-
:returns: A pandas DataFrame with summarized columns.
111-
""" # noqa: E501
112-
113-
args_to_process = []
114-
for num, arg in enumerate(args):
115-
check(f"Argument {num} in the summarize function", arg, [tuple])
116-
if len(arg) < 2:
117-
raise ValueError(
118-
f"Argument {num} should have a minimum length of 2, "
119-
f"instead got {len(arg)}"
120-
)
121-
if len(arg) > 3:
122-
raise ValueError(
123-
f"Argument {num} should have a maximum length of 3, "
124-
f"instead got {len(arg)}"
125-
)
126-
entry = SD(*arg)
127-
func = entry.func
128-
names = entry.names_glue
129-
check(
130-
f"The function (position 1 in the tuple) for argument {num} ",
131-
func,
132-
[str, callable, list, tuple],
133-
)
134-
if isinstance(func, (list, tuple)):
135-
for number, funcn in enumerate(func):
136-
check(
137-
f"Entry {number} in the function sequence "
138-
f"for argument {num}",
139-
funcn,
140-
[str, callable],
141-
)
142-
143-
if names:
144-
check(
145-
f"The names (position 2 in the tuple) for argument {num} ",
146-
names,
147-
[str],
148-
)
149-
args_to_process.append(entry)
150-
151-
by_is_true = by is not None
152-
grp = None
153-
if by_is_true and isinstance(by, dict):
154-
grp = df.groupby(**by)
155-
elif by_is_true:
156-
grp = df.groupby(by)
157-
158-
aggs = {}
159-
160-
for arg in args_to_process:
161-
columns, names, func_names_and_func, dupes = _process_SD(df, arg)
162-
for col, (name, funcn) in product(columns, func_names_and_func):
163-
val = grp[col] if by_is_true else df[col]
164-
if names:
165-
name = names.format(_col=col, _fn=name)
166-
elif name in dupes:
167-
name = f"{col}{name}"
168-
else:
169-
name = col
170-
if isinstance(funcn, str):
171-
outcome = val.agg(funcn)
172-
else:
173-
try:
174-
outcome = val.agg(funcn)
175-
except (ValueError, AttributeError):
176-
outcome = funcn(val)
177-
if isinstance(outcome, pd.DataFrame):
178-
outcome.columns = f"{name}_" + outcome.columns
179-
aggs.update(outcome)
180-
else:
181-
if is_scalar(outcome):
182-
outcome = [outcome]
183-
aggs[name] = outcome
184-
return pd.DataFrame(aggs, copy=False)
1+
# """Alternative function to pd.agg for summarizing data."""
2+
# from typing import Any
3+
# import pandas as pd
4+
# import pandas_flavor as pf
5+
6+
# from janitor.utils import check
7+
# from pandas.api.types import is_scalar
8+
9+
# from janitor.functions.utils import SD, _process_SD
10+
# from itertools import product
11+
12+
13+
# @pf.register_dataframe_method
14+
# def summarize(
15+
# df: pd.DataFrame,
16+
# *args,
17+
# by: Any = None,
18+
# ) -> pd.DataFrame:
19+
# """
20+
21+
# !!! info "New in version 0.25.0"
22+
23+
# !!! note
24+
25+
# Before reaching for `summarize`, try `pd.DataFrame.agg`.
26+
27+
# Reduction operation on columns via a tuple.
28+
29+
# It is a wrapper around `pd.DataFrame.agg`,
30+
# with added flexibility for multiple columns.
31+
32+
# The argument should be of the form `(columns, func, names_glue)`;
33+
# the `names_glue` argument is optional.
34+
# `columns` can be selected with the
35+
# [`select_columns`][janitor.functions.select.select_columns]
36+
# syntax for flexibility.
37+
# The function `func` should be a string
38+
# (which is dispatched to `pd.Series.agg`),
39+
# or a callable, or a list/tuple of strings/callables.
40+
# The function is called on each column in `columns`.
41+
42+
# The `names_glue` argument allows for renaming, especially for
43+
# multiple columns or multiple functions.
44+
# The placeholders for `names_glue` are `_col`, which represents
45+
# the column name, and `_fn` which represents the function name.
46+
# Under the hood, it uses python's `str.format` method.
47+
48+
# `janitor.SD` offers a more explicit form
49+
# of passing tuples to the `summarize` function.
50+
51+
# `by` accepts a label, labels, mapping or function.
52+
# Arguments supported in `pd.DataFrame.groupby`
53+
# can also be passed to `by` via a dictionary.
54+
55+
56+
# Example:
57+
58+
# >>> import pandas as pd
59+
# >>> import numpy as np
60+
# >>> import janitor as jn
61+
# >>> data = {'avg_jump': [3, 4, 1, 2, 3, 4],
62+
# ... 'avg_run': [3, 4, 1, 3, 2, 4],
63+
# ... 'avg_swim': [2, 1, 2, 2, 3, 4],
64+
# ... 'combine_id': [100200, 100200,
65+
# ... 101200, 101200,
66+
# ... 102201, 103202],
67+
# ... 'category': ['heats', 'heats',
68+
# ... 'finals', 'finals',
69+
# ... 'heats', 'finals']}
70+
# >>> df = pd.DataFrame(data)
71+
# >>> df.summarize(("avg_run", "mean"), by=['combine_id', 'category'])
72+
# avg_run
73+
# combine_id category
74+
# 100200 heats 3.5
75+
# 101200 finals 2.0
76+
# 102201 heats 2.0
77+
# 103202 finals 4.0
78+
79+
# Summarize with a new column name:
80+
81+
# >>> df.summarize(("avg_run", "mean", "avg_run_2"))
82+
# avg_run_2
83+
# 0 2.833333
84+
# >>> df.summarize(("avg_run", "mean", "avg_run_2"),
85+
# ... by=['combine_id', 'category'])
86+
# avg_run_2
87+
# combine_id category
88+
# 100200 heats 3.5
89+
# 101200 finals 2.0
90+
# 102201 heats 2.0
91+
# 103202 finals 4.0
92+
93+
# Summarize with the placeholders in `names_glue`:
94+
95+
# >>> cols = jn.SD(columns="avg*", func="mean",
96+
# ... names_glue="{_col}_{_fn}")
97+
# >>> df.summarize(cols)
98+
# avg_jump_mean avg_run_mean avg_swim_mean
99+
# 0 2.833333 2.833333 2.333333
100+
# >>> df.summarize(cols, by=['combine_id', 'category'])
101+
# avg_jump_mean avg_run_mean avg_swim_mean
102+
# combine_id category
103+
# 100200 heats 3.5 3.5 1.5
104+
# 101200 finals 1.5 2.0 2.0
105+
# 102201 heats 3.0 2.0 3.0
106+
# 103202 finals 4.0 4.0 4.0
107+
108+
# :param df: A pandas DataFrame.
109+
# :param args: One or more tuples of the form `(columns, func, names_glue)`; `names_glue` is optional.
110+
# :param by: Column(s) to group by.
111+
# :raises ValueError: If the tuple size is less than 2 or greater than 3.
112+
# :returns: A pandas DataFrame with summarized columns.
113+
# """ # noqa: E501
114+
115+
# args_to_process = []
116+
# for num, arg in enumerate(args):
117+
# check(f"Argument {num} in the summarize function", arg, [tuple])
118+
# if len(arg) < 2:
119+
# raise ValueError(
120+
# f"Argument {num} should have a minimum length of 2, "
121+
# f"instead got {len(arg)}"
122+
# )
123+
# if len(arg) > 3:
124+
# raise ValueError(
125+
# f"Argument {num} should have a maximum length of 3, "
126+
# f"instead got {len(arg)}"
127+
# )
128+
# entry = SD(*arg)
129+
# func = entry.func
130+
# names = entry.names_glue
131+
# check(
132+
# f"The function (position 1 in the tuple) for argument {num} ",
133+
# func,
134+
# [str, callable, list, tuple],
135+
# )
136+
# if isinstance(func, (list, tuple)):
137+
# for number, funcn in enumerate(func):
138+
# check(
139+
# f"Entry {number} in the function sequence "
140+
# f"for argument {num}",
141+
# funcn,
142+
# [str, callable],
143+
# )
144+
145+
# if names:
146+
# check(
147+
# f"The names (position 2 in the tuple) for argument {num} ",
148+
# names,
149+
# [str],
150+
# )
151+
# args_to_process.append(entry)
152+
153+
# by_is_true = by is not None
154+
# grp = None
155+
# if by_is_true and isinstance(by, dict):
156+
# grp = df.groupby(**by)
157+
# elif by_is_true:
158+
# grp = df.groupby(by)
159+
160+
# aggs = {}
161+
162+
# for arg in args_to_process:
163+
# columns, names, func_names_and_func, dupes = _process_SD(df, arg)
164+
# for col, (name, funcn) in product(columns, func_names_and_func):
165+
# val = grp[col] if by_is_true else df[col]
166+
# if names:
167+
# name = names.format(_col=col, _fn=name)
168+
# elif name in dupes:
169+
# name = f"{col}{name}"
170+
# else:
171+
# name = col
172+
# if isinstance(funcn, str):
173+
# outcome = val.agg(funcn)
174+
# else:
175+
# try:
176+
# outcome = val.agg(funcn)
177+
# except (ValueError, AttributeError):
178+
# outcome = funcn(val)
179+
# if isinstance(outcome, pd.DataFrame):
180+
# outcome.columns = f"{name}_" + outcome.columns
181+
# aggs.update(outcome)
182+
# else:
183+
# if is_scalar(outcome):
184+
# outcome = [outcome]
185+
# aggs[name] = outcome
186+
# return pd.DataFrame(aggs, copy=False)

0 commit comments

Comments
 (0)