|
1 |
| -# """Alternative function to pd.agg for summarizing data.""" |
2 |
| -# from typing import Any |
3 |
| -# import pandas as pd |
4 |
| -# import pandas_flavor as pf |
5 |
| - |
6 |
| -# from janitor.utils import check |
7 |
| -# from pandas.api.types import is_scalar |
8 |
| - |
9 |
| -# from janitor.functions.utils import SD, _process_SD |
10 |
| -# from itertools import product |
11 |
| - |
12 |
| - |
13 |
| -# @pf.register_dataframe_method |
14 |
| -# def summarize( |
15 |
| -# df: pd.DataFrame, |
16 |
| -# *args, |
17 |
| -# by: Any = None, |
18 |
| -# ) -> pd.DataFrame: |
19 |
| -# """ |
20 |
| - |
21 |
| -# !!! info "New in version 0.25.0" |
22 |
| - |
23 |
| -# !!!note |
24 |
| - |
25 |
| -# Before reaching for `summarize`, try `pd.DataFrame.agg`. |
26 |
| - |
27 |
| -# Reduction operation on columns via a tuple. |
28 |
| - |
29 |
| -# It is a wrapper around `pd.DataFrame.agg`, |
30 |
| -# with added flexibility for multiple columns. |
31 |
| - |
32 |
| -# The argument should be of the form `(columns, func, names_glue)`; |
33 |
| -# the `names_glue` argument is optional. |
34 |
| -# `columns` can be selected with the |
35 |
| -# [`select_columns`][janitor.functions.select.select_columns] |
36 |
| -# syntax for flexibility. |
37 |
| -# The function `func` should be a string |
38 |
| -# (which is dispatched to `pd.Series.agg`), |
39 |
| -# or a callable, or a list/tuple of strings/callables. |
40 |
| -# The function is called on each column in `columns`. |
41 |
| - |
42 |
| -# The `names_glue` argument allows for renaming, especially for |
43 |
| -# multiple columns or multiple functions. |
44 |
| -# The placeholders for `names_glue` are `_col`, which represents |
45 |
| -# the column name, and `_fn` which represents the function name. |
46 |
| -# Under the hood, it uses python's `str.format` method. |
47 |
| - |
48 |
| -# `janitor.SD` offers a more explicit form |
49 |
| -# of passing tuples to the `summarize` function. |
50 |
| - |
51 |
| -# `by` accepts a label, labels, mapping or function. |
52 |
| -# Arguments supported in `pd.DataFrame.groupby` |
53 |
| -# can also be passed to `by` via a dictionary. |
54 |
| - |
55 |
| - |
56 |
| -# Example: |
57 |
| - |
58 |
| -# >>> import pandas as pd |
59 |
| -# >>> import numpy as np |
60 |
| -# >>> import janitor as jn |
61 |
| -# >>> data = {'avg_jump': [3, 4, 1, 2, 3, 4], |
62 |
| -# ... 'avg_run': [3, 4, 1, 3, 2, 4], |
63 |
| -# ... 'avg_swim': [2, 1, 2, 2, 3, 4], |
64 |
| -# ... 'combine_id': [100200, 100200, |
65 |
| -# ... 101200, 101200, |
66 |
| -# ... 102201, 103202], |
67 |
| -# ... 'category': ['heats', 'heats', |
68 |
| -# ... 'finals', 'finals', |
69 |
| -# ... 'heats', 'finals']} |
70 |
| -# >>> df = pd.DataFrame(data) |
71 |
| -# >>> df.summarize(("avg_run", "mean"), by=['combine_id', 'category']) |
72 |
| -# avg_run |
73 |
| -# combine_id category |
74 |
| -# 100200 heats 3.5 |
75 |
| -# 101200 finals 2.0 |
76 |
| -# 102201 heats 2.0 |
77 |
| -# 103202 finals 4.0 |
78 |
| - |
79 |
| -# Summarize with a new column name: |
80 |
| - |
81 |
| -# >>> df.summarize(("avg_run", "mean", "avg_run_2")) |
82 |
| -# avg_run_2 |
83 |
| -# 0 2.833333 |
84 |
| -# >>> df.summarize(("avg_run", "mean", "avg_run_2"), |
85 |
| -# by=['combine_id', 'category']) |
86 |
| -# avg_run_2 |
87 |
| -# combine_id category |
88 |
| -# 100200 heats 3.5 |
89 |
| -# 101200 finals 2.0 |
90 |
| -# 102201 heats 2.0 |
91 |
| -# 103202 finals 4.0 |
92 |
| - |
93 |
| -# Summarize with the placeholders in `names_glue`: |
94 |
| - |
95 |
| -# >>> cols = jn.SD(columns="avg*", func="mean", |
96 |
| -# names_glue="{_col}_{_fn}") |
97 |
| -# >>> df.summarize(cols) |
98 |
| -# avg_jump_mean avg_run_mean avg_swim_mean |
99 |
| -# 0 2.833333 2.833333 2.333333 |
100 |
| -# >>> df.summarize(cols, by=['combine_id', 'category']) |
101 |
| -# avg_jump_mean avg_run_mean avg_swim_mean |
102 |
| -# combine_id category |
103 |
| -# 100200 heats 3.5 3.5 1.5 |
104 |
| -# 101200 finals 1.5 2.0 2.0 |
105 |
| -# 102201 heats 3.0 2.0 3.0 |
106 |
| -# 103202 finals 4.0 4.0 4.0 |
107 |
| - |
108 |
| -# :param df: A pandas DataFrame. |
109 |
| -# :param args: A tuple. |
110 |
| -# :param by: Column(s) to group by. |
111 |
| -# :raises ValueError: If the tuple size is less than 2. |
112 |
| -# :returns: A pandas DataFrame with summarized columns. |
113 |
| -# """ # noqa: E501 |
114 |
| - |
115 |
| -# args_to_process = [] |
116 |
| -# for num, arg in enumerate(args): |
117 |
| -# check(f"Argument {num} in the summarize function", arg, [tuple]) |
118 |
| -# if len(arg) < 2: |
119 |
| -# raise ValueError( |
120 |
| -# f"Argument {num} should have a minimum length of 2, " |
121 |
| -# f"instead got {len(arg)}" |
122 |
| -# ) |
123 |
| -# if len(arg) > 3: |
124 |
| -# raise ValueError( |
125 |
| -# f"Argument {num} should have a maximum length of 3, " |
126 |
| -# f"instead got {len(arg)}" |
127 |
| -# ) |
128 |
| -# entry = SD(*arg) |
129 |
| -# func = entry.func |
130 |
| -# names = entry.names_glue |
131 |
| -# check( |
132 |
| -# f"The function (position 1 in the tuple) for argument {num} ", |
133 |
| -# func, |
134 |
| -# [str, callable, list, tuple], |
135 |
| -# ) |
136 |
| -# if isinstance(func, (list, tuple)): |
137 |
| -# for number, funcn in enumerate(func): |
138 |
| -# check( |
139 |
| -# f"Entry {number} in the function sequence " |
140 |
| -# f"for argument {num}", |
141 |
| -# funcn, |
142 |
| -# [str, callable], |
143 |
| -# ) |
144 |
| - |
145 |
| -# if names: |
146 |
| -# check( |
147 |
| -# f"The names (position 2 in the tuple) for argument {num} ", |
148 |
| -# names, |
149 |
| -# [str], |
150 |
| -# ) |
151 |
| -# args_to_process.append(entry) |
152 |
| - |
153 |
| -# by_is_true = by is not None |
154 |
| -# grp = None |
155 |
| -# if by_is_true and isinstance(by, dict): |
156 |
| -# grp = df.groupby(**by) |
157 |
| -# elif by_is_true: |
158 |
| -# grp = df.groupby(by) |
159 |
| - |
160 |
| -# aggs = {} |
161 |
| - |
162 |
| -# for arg in args_to_process: |
163 |
| -# columns, names, func_names_and_func, dupes = _process_SD(df, arg) |
164 |
| -# for col, (name, funcn) in product(columns, func_names_and_func): |
165 |
| -# val = grp[col] if by_is_true else df[col] |
166 |
| -# if names: |
167 |
| -# name = names.format(_col=col, _fn=name) |
168 |
| -# elif name in dupes: |
169 |
| -# name = f"{col}{name}" |
170 |
| -# else: |
171 |
| -# name = col |
172 |
| -# if isinstance(funcn, str): |
173 |
| -# outcome = val.agg(funcn) |
174 |
| -# else: |
175 |
| -# try: |
176 |
| -# outcome = val.agg(funcn) |
177 |
| -# except (ValueError, AttributeError): |
178 |
| -# outcome = funcn(val) |
179 |
| -# if isinstance(outcome, pd.DataFrame): |
180 |
| -# outcome.columns = f"{name}_" + outcome.columns |
181 |
| -# aggs.update(outcome) |
182 |
| -# else: |
183 |
| -# if is_scalar(outcome): |
184 |
| -# outcome = [outcome] |
185 |
| -# aggs[name] = outcome |
186 |
| -# return pd.DataFrame(aggs, copy=False) |
| 1 | +"""Alternative function to pd.agg for summarizing data.""" |
| 2 | +from typing import Any |
| 3 | +import pandas as pd |
| 4 | +import pandas_flavor as pf |
| 5 | + |
| 6 | +from janitor.utils import check |
| 7 | +from pandas.api.types import is_scalar |
| 8 | + |
| 9 | +from janitor.functions.utils import SD, _process_SD |
| 10 | +from itertools import product |
| 11 | + |
| 12 | + |
@pf.register_dataframe_method
def summarize(
    df: pd.DataFrame,
    *args,
    by: Any = None,
) -> pd.DataFrame:
    """

    !!! info "New in version 0.25.0"

    !!!note

        Before reaching for `summarize`, try `pd.DataFrame.agg`.

    Reduction operation on columns via the `janitor.SD` class.

    It is a wrapper around `pd.DataFrame.agg`,
    with added flexibility for multiple columns.

    Every positional argument must be an instance of `janitor.SD`,
    built as `SD(columns).add_fns(func).rename(names_glue)`;
    the `rename` step is optional.
    `janitor.SD` allows for flexible column selection with the
    [`select_columns`][janitor.functions.select.select_columns]
    syntax.
    The function `func`, added via the `janitor.SD.add_fns` method,
    should be a string (which is dispatched to `pd.Series.agg`),
    or a callable, or a list/tuple of strings/callables.
    The function is called on each column in `columns`.
    Additional parameters can be passed as keyword arguments in the
    `add_fns` method for `janitor.SD`.

    The optional `janitor.SD` `names_glue` argument
    (passed via the `janitor.SD.rename` method) allows for renaming.
    For single columns, simply pass the new column name.
    For multiple columns, use the `names_glue` specification -
    the placeholders for `names_glue` are `_col`, which represents
    the column name, and `_fn` which represents the function name.
    Under the hood, it uses python's `str.format` method.

    If a function returns a DataFrame, each of its columns is added
    to the result, with the output name and an underscore as a prefix.

    `by` accepts a label, labels, mapping or function.
    Arguments supported in `pd.DataFrame.groupby`
    can also be passed to `by` via a dictionary.


    Example:

        >>> import pandas as pd
        >>> import janitor as jn
        >>> from janitor import SD
        >>> data = {'avg_jump': [3, 4, 1, 2, 3, 4],
        ...         'avg_run': [3, 4, 1, 3, 2, 4],
        ...         'avg_swim': [2, 1, 2, 2, 3, 4],
        ...         'combine_id': [100200, 100200,
        ...                        101200, 101200,
        ...                        102201, 103202],
        ...         'category': ['heats', 'heats',
        ...                      'finals', 'finals',
        ...                      'heats', 'finals']}
        >>> df = pd.DataFrame(data)
        >>> arg = SD("avg_run").add_fns("mean")
        >>> df.summarize(arg, by=['combine_id', 'category'])
                             avg_run
        combine_id category
        100200     heats         3.5
        101200     finals        2.0
        102201     heats         2.0
        103202     finals        4.0

    Summarize with a new column name:

        >>> arg = SD("avg_run").add_fns("mean").rename("avg_run_2")
        >>> df.summarize(arg)
           avg_run_2
        0   2.833333
        >>> df.summarize(arg, by=['combine_id', 'category'])
                             avg_run_2
        combine_id category
        100200     heats           3.5
        101200     finals          2.0
        102201     heats           2.0
        103202     finals          4.0

    Summarize with the placeholders in `names_glue`:

        >>> cols = SD("avg*").add_fns("mean").rename("{_col}_{_fn}")
        >>> df.summarize(cols)
           avg_jump_mean  avg_run_mean  avg_swim_mean
        0       2.833333      2.833333       2.333333
        >>> df.summarize(cols, by=['combine_id', 'category'])
                             avg_jump_mean  avg_run_mean  avg_swim_mean
        combine_id category
        100200     heats               3.5           3.5            1.5
        101200     finals              1.5           2.0            2.0
        102201     heats               3.0           2.0            3.0
        103202     finals              4.0           4.0            4.0

    :param df: A pandas DataFrame.
    :param args: instance(s) of the `janitor.SD` class.
    :param by: Column(s) to group by.
    :raises ValueError: If a function is not provided for any of the arguments.
    :returns: A pandas DataFrame with summarized columns.
    """  # noqa: E501

    # Validate everything up front, so that no aggregation work is done
    # before a malformed argument is reported.
    for num, arg in enumerate(args):
        check(f"Argument {num} in the summarize function", arg, [SD])
        if arg.func is None:
            raise ValueError(f"Kindly provide a function for Argument {num}")

    # A dictionary is unpacked as keyword arguments to `groupby`;
    # anything else is handed over as the grouping key itself.
    grp = None
    if by is not None:
        grp = df.groupby(**by) if isinstance(by, dict) else df.groupby(by)

    aggs = {}

    for arg in args:
        columns, names, func_names_and_func, dupes = _process_SD(df, arg)
        for col, (name, (funcn, kwargs)) in product(
            columns, func_names_and_func
        ):
            val = df[col] if grp is None else grp[col]

            # Resolve the label of the output column.
            if names:
                name = names.format(_col=col, _fn=name)
            elif name in dupes:
                name = f"{col}{name}"
            else:
                name = col

            try:
                outcome = val.agg(funcn, **kwargs)
            except (ValueError, AttributeError):
                # Only callables get a second chance via a direct call;
                # a failing string aggregation is a genuine error.
                if isinstance(funcn, str):
                    raise
                outcome = funcn(val, **kwargs)

            if isinstance(outcome, pd.DataFrame):
                # `add_prefix` returns a new frame, so the object handed
                # back by the user's function is not modified in place,
                # and it also copes with non-string column labels.
                aggs.update(outcome.add_prefix(f"{name}_"))
            else:
                if is_scalar(outcome):
                    # Wrap scalars, since a dictionary of bare scalars
                    # cannot be turned into a DataFrame without an index.
                    outcome = [outcome]
                aggs[name] = outcome
    return pd.DataFrame(aggs, copy=False)
0 commit comments