Skip to content

Commit c209f7c

Browse files
committed
changes to code - use normal class for SD; strictly SD - no tuples/dict
1 parent 9f7e1b9 commit c209f7c

File tree

4 files changed

+263
-290
lines changed

4 files changed

+263
-290
lines changed

janitor/functions/__init__.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,8 +68,7 @@
6868
from .shuffle import shuffle
6969
from .sort_column_value_order import sort_column_value_order
7070
from .sort_naturally import sort_naturally
71-
72-
# from .summarize import summarize
71+
from .summarize import summarize
7372
from .take_first import take_first
7473
from .then import then
7574
from .to_datetime import to_datetime

janitor/functions/summarize.py

Lines changed: 157 additions & 186 deletions
Original file line numberDiff line numberDiff line change
@@ -1,186 +1,157 @@
1-
# """Alternative function to pd.agg for summarizing data."""
2-
# from typing import Any
3-
# import pandas as pd
4-
# import pandas_flavor as pf
5-
6-
# from janitor.utils import check
7-
# from pandas.api.types import is_scalar
8-
9-
# from janitor.functions.utils import SD, _process_SD
10-
# from itertools import product
11-
12-
13-
# @pf.register_dataframe_method
14-
# def summarize(
15-
# df: pd.DataFrame,
16-
# *args,
17-
# by: Any = None,
18-
# ) -> pd.DataFrame:
19-
# """
20-
21-
# !!! info "New in version 0.25.0"
22-
23-
# !!!note
24-
25-
# Before reaching for `summarize`, try `pd.DataFrame.agg`.
26-
27-
# Reduction operation on columns via a tuple.
28-
29-
# It is a wrapper around `pd.DataFrame.agg`,
30-
# with added flexibility for multiple columns.
31-
32-
# The argument should be of the form `(columns, func, names_glue)`;
33-
# the `names_glue` argument is optional.
34-
# `columns` can be selected with the
35-
# [`select_columns`][janitor.functions.select.select_columns]
36-
# syntax for flexibility.
37-
# The function `func` should be a string
38-
# (which is dispatched to `pd.Series.agg`),
39-
# or a callable, or a list/tuple of strings/callables.
40-
# The function is called on each column in `columns`.
41-
42-
# The `names_glue` argument allows for renaming, especially for
43-
# multiple columns or multiple functions.
44-
# The placeholders for `names_glue` are `_col`, which represents
45-
# the column name, and `_fn` which represents the function name.
46-
# Under the hood, it uses python's `str.format` method.
47-
48-
# `janitor.SD` offers a more explicit form
49-
# of passing tuples to the `summarize` function.
50-
51-
# `by` accepts a label, labels, mapping or function.
52-
# Arguments supported in `pd.DataFrame.groupby`
53-
# can also be passed to `by` via a dictionary.
54-
55-
56-
# Example:
57-
58-
# >>> import pandas as pd
59-
# >>> import numpy as np
60-
# >>> import janitor as jn
61-
# >>> data = {'avg_jump': [3, 4, 1, 2, 3, 4],
62-
# ... 'avg_run': [3, 4, 1, 3, 2, 4],
63-
# ... 'avg_swim': [2, 1, 2, 2, 3, 4],
64-
# ... 'combine_id': [100200, 100200,
65-
# ... 101200, 101200,
66-
# ... 102201, 103202],
67-
# ... 'category': ['heats', 'heats',
68-
# ... 'finals', 'finals',
69-
# ... 'heats', 'finals']}
70-
# >>> df = pd.DataFrame(data)
71-
# >>> df.summarize(("avg_run", "mean"), by=['combine_id', 'category'])
72-
# avg_run
73-
# combine_id category
74-
# 100200 heats 3.5
75-
# 101200 finals 2.0
76-
# 102201 heats 2.0
77-
# 103202 finals 4.0
78-
79-
# Summarize with a new column name:
80-
81-
# >>> df.summarize(("avg_run", "mean", "avg_run_2"))
82-
# avg_run_2
83-
# 0 2.833333
84-
# >>> df.summarize(("avg_run", "mean", "avg_run_2"),
85-
# by=['combine_id', 'category'])
86-
# avg_run_2
87-
# combine_id category
88-
# 100200 heats 3.5
89-
# 101200 finals 2.0
90-
# 102201 heats 2.0
91-
# 103202 finals 4.0
92-
93-
# Summarize with the placeholders in `names_glue`:
94-
95-
# >>> cols = jn.SD(columns="avg*", func="mean",
96-
# names_glue="{_col}_{_fn}")
97-
# >>> df.summarize(cols)
98-
# avg_jump_mean avg_run_mean avg_swim_mean
99-
# 0 2.833333 2.833333 2.333333
100-
# >>> df.summarize(cols, by=['combine_id', 'category'])
101-
# avg_jump_mean avg_run_mean avg_swim_mean
102-
# combine_id category
103-
# 100200 heats 3.5 3.5 1.5
104-
# 101200 finals 1.5 2.0 2.0
105-
# 102201 heats 3.0 2.0 3.0
106-
# 103202 finals 4.0 4.0 4.0
107-
108-
# :param df: A pandas DataFrame.
109-
# :param args: A tuple.
110-
# :param by: Column(s) to group by.
111-
# :raises ValueError: If the tuple size is less than 2.
112-
# :returns: A pandas DataFrame with summarized columns.
113-
# """ # noqa: E501
114-
115-
# args_to_process = []
116-
# for num, arg in enumerate(args):
117-
# check(f"Argument {num} in the summarize function", arg, [tuple])
118-
# if len(arg) < 2:
119-
# raise ValueError(
120-
# f"Argument {num} should have a minimum length of 2, "
121-
# f"instead got {len(arg)}"
122-
# )
123-
# if len(arg) > 3:
124-
# raise ValueError(
125-
# f"Argument {num} should have a maximum length of 3, "
126-
# f"instead got {len(arg)}"
127-
# )
128-
# entry = SD(*arg)
129-
# func = entry.func
130-
# names = entry.names_glue
131-
# check(
132-
# f"The function (position 1 in the tuple) for argument {num} ",
133-
# func,
134-
# [str, callable, list, tuple],
135-
# )
136-
# if isinstance(func, (list, tuple)):
137-
# for number, funcn in enumerate(func):
138-
# check(
139-
# f"Entry {number} in the function sequence "
140-
# f"for argument {num}",
141-
# funcn,
142-
# [str, callable],
143-
# )
144-
145-
# if names:
146-
# check(
147-
# f"The names (position 2 in the tuple) for argument {num} ",
148-
# names,
149-
# [str],
150-
# )
151-
# args_to_process.append(entry)
152-
153-
# by_is_true = by is not None
154-
# grp = None
155-
# if by_is_true and isinstance(by, dict):
156-
# grp = df.groupby(**by)
157-
# elif by_is_true:
158-
# grp = df.groupby(by)
159-
160-
# aggs = {}
161-
162-
# for arg in args_to_process:
163-
# columns, names, func_names_and_func, dupes = _process_SD(df, arg)
164-
# for col, (name, funcn) in product(columns, func_names_and_func):
165-
# val = grp[col] if by_is_true else df[col]
166-
# if names:
167-
# name = names.format(_col=col, _fn=name)
168-
# elif name in dupes:
169-
# name = f"{col}{name}"
170-
# else:
171-
# name = col
172-
# if isinstance(funcn, str):
173-
# outcome = val.agg(funcn)
174-
# else:
175-
# try:
176-
# outcome = val.agg(funcn)
177-
# except (ValueError, AttributeError):
178-
# outcome = funcn(val)
179-
# if isinstance(outcome, pd.DataFrame):
180-
# outcome.columns = f"{name}_" + outcome.columns
181-
# aggs.update(outcome)
182-
# else:
183-
# if is_scalar(outcome):
184-
# outcome = [outcome]
185-
# aggs[name] = outcome
186-
# return pd.DataFrame(aggs, copy=False)
1+
"""Alternative function to pd.agg for summarizing data."""
2+
from typing import Any
3+
import pandas as pd
4+
import pandas_flavor as pf
5+
6+
from janitor.utils import check
7+
from pandas.api.types import is_scalar
8+
9+
from janitor.functions.utils import SD, _process_SD
10+
from itertools import product
11+
12+
13+
@pf.register_dataframe_method
def summarize(
    df: pd.DataFrame,
    *args,
    by: Any = None,
) -> pd.DataFrame:
    """

    !!! info "New in version 0.25.0"

    !!!note

        Before reaching for `summarize`, try `pd.DataFrame.agg`.

    Reduction operation on columns via the `janitor.SD` class.

    It is a wrapper around `pd.DataFrame.agg`,
    with added flexibility for multiple columns.

    Every positional argument must be an instance of `janitor.SD`;
    tuples and dictionaries are not accepted.
    `janitor.SD` allows for flexible column selection with the
    [`select_columns`][janitor.functions.select.select_columns]
    syntax.
    The function `func`, added via the `janitor.SD.add_fns` method,
    should be a string (which is dispatched to `pd.Series.agg`),
    or a callable, or a list/tuple of strings/callables.
    The function is called on each column in `columns`.
    Additional parameters can be passed as keyword arguments in the
    `add_fns` method for `janitor.SD`.

    The optional `janitor.SD` `names_glue` argument
    (passed via the `janitor.SD.rename` method) allows for renaming.
    For single columns, simply pass the new column name.
    For multiple columns, use the `names_glue` specification -
    the placeholders for `names_glue` are `_col`, which represents
    the column name, and `_fn` which represents the function name.
    Under the hood, it uses python's `str.format` method.

    `by` accepts a label, labels, mapping or function.
    Arguments supported in `pd.DataFrame.groupby`
    can also be passed to `by` via a dictionary.


    Example:

        >>> import pandas as pd
        >>> import numpy as np
        >>> import janitor as jn
        >>> from janitor import SD
        >>> data = {'avg_jump': [3, 4, 1, 2, 3, 4],
        ...         'avg_run': [3, 4, 1, 3, 2, 4],
        ...         'avg_swim': [2, 1, 2, 2, 3, 4],
        ...         'combine_id': [100200, 100200,
        ...                        101200, 101200,
        ...                        102201, 103202],
        ...         'category': ['heats', 'heats',
        ...                      'finals', 'finals',
        ...                      'heats', 'finals']}
        >>> df = pd.DataFrame(data)
        >>> arg = SD("avg_run").add_fns("mean")
        >>> df.summarize(arg, by=['combine_id', 'category'])
                             avg_run
        combine_id category
        100200     heats         3.5
        101200     finals        2.0
        102201     heats         2.0
        103202     finals        4.0

    Summarize with a new column name:

        >>> arg = SD("avg_run").add_fns("mean").rename("avg_run_2")
        >>> df.summarize(arg)
           avg_run_2
        0   2.833333
        >>> df.summarize(arg, by=['combine_id', 'category'])
                             avg_run_2
        combine_id category
        100200     heats           3.5
        101200     finals          2.0
        102201     heats           2.0
        103202     finals          4.0

    Summarize with the placeholders in `names_glue`:

        >>> cols = SD("avg*").add_fns("mean").rename("{_col}_{_fn}")
        >>> df.summarize(cols)
           avg_jump_mean  avg_run_mean  avg_swim_mean
        0       2.833333      2.833333       2.333333
        >>> df.summarize(cols, by=['combine_id', 'category'])
                             avg_jump_mean  avg_run_mean  avg_swim_mean
        combine_id category
        100200     heats               3.5           3.5            1.5
        101200     finals              1.5           2.0            2.0
        102201     heats               3.0           2.0            3.0
        103202     finals              4.0           4.0            4.0

    :param df: A pandas DataFrame.
    :param args: instance(s) of the `janitor.SD` class.
    :param by: Column(s) to group by.
    :raises ValueError: If a function is not provided for any of the arguments.
    :returns: A pandas DataFrame with summarized columns.
    """  # noqa: E501

    # Validate everything up front so no aggregation work is done
    # before a malformed argument is reported.
    for num, arg in enumerate(args):
        check(f"Argument {num} in the summarize function", arg, [SD])
        if arg.func is None:
            raise ValueError(f"Kindly provide a function for Argument {num}")

    # A dictionary `by` is unpacked as keyword arguments to `groupby`;
    # any other non-None value is handed to `groupby` positionally.
    by_is_true = by is not None
    grp = None
    if by_is_true and isinstance(by, dict):
        grp = df.groupby(**by)
    elif by_is_true:
        grp = df.groupby(by)

    # Output column label -> aggregated values, in insertion order.
    aggs = {}

    for arg in args:
        # NOTE(review): `_process_SD` is defined elsewhere; from its use here
        # it yields the selected columns, the optional rename template, an
        # iterable of (function name, (function, kwargs)) pairs, and a
        # collection of function names that need disambiguating - confirm
        # against its definition.
        columns, names, func_names_and_func, dupes = _process_SD(df, arg)
        for col, (name, (funcn, kwargs)) in product(
            columns, func_names_and_func
        ):
            val = grp[col] if by_is_true else df[col]
            if names:
                name = names.format(_col=col, _fn=name)
            elif name in dupes:
                name = f"{col}{name}"
            else:
                name = col
            if isinstance(funcn, str):
                outcome = val.agg(funcn, **kwargs)
            else:
                try:
                    outcome = val.agg(funcn, **kwargs)
                except (ValueError, AttributeError):
                    # `agg` rejected the callable; apply it directly to the
                    # Series / grouped Series instead.
                    outcome = funcn(val, **kwargs)
            if isinstance(outcome, pd.DataFrame):
                # `add_prefix` returns a new frame, so the object handed back
                # by the user's callable is never mutated, and non-string
                # column labels are formatted instead of raising a TypeError.
                outcome = outcome.add_prefix(f"{name}_")
                aggs.update(outcome)
            else:
                if is_scalar(outcome):
                    # Wrap scalars so the DataFrame constructor gets a
                    # one-row column rather than a bare value.
                    outcome = [outcome]
                aggs[name] = outcome
    return pd.DataFrame(aggs, copy=False)

0 commit comments

Comments
 (0)