|
1 |
| -"""Alternative function to pd.agg for summarizing data.""" |
2 |
| -from typing import Any |
3 |
| -import pandas as pd |
4 |
| -import pandas_flavor as pf |
5 |
| - |
6 |
| -from janitor.utils import check |
7 |
| -from pandas.api.types import is_scalar |
8 |
| - |
9 |
| -from janitor.functions.utils import SD, _process_SD |
10 |
| -from itertools import product |
11 |
| - |
12 |
| - |
13 |
@pf.register_dataframe_method
def summarize(
    df: pd.DataFrame,
    *args,
    by: Any = None,
) -> pd.DataFrame:
    """
    Reduce columns to summary values, optionally per group.

    !!! info "New in version 0.25.0"

    !!!note

        Before reaching for `summarize`, try `pd.DataFrame.agg`.

    Reduction operation on columns via a tuple.

    It is a wrapper around `pd.DataFrame.agg`,
    with added flexibility for multiple columns.

    The argument should be of the form `(columns, func, names_glue)`;
    the `names_glue` argument is optional.
    `columns` can be selected with the
    [`select_columns`][janitor.functions.select.select_columns]
    syntax for flexibility.
    The function `func` should be a string
    (which is dispatched to `pd.Series.agg`),
    or a callable, or a list/tuple of strings/callables.
    The function is called on each column in `columns`.

    The `names_glue` argument allows for renaming, especially for
    multiple columns or multiple functions.
    The placeholders for `names_glue` are `_col`, which represents
    the column name, and `_fn` which represents the function name.
    Under the hood, it uses python's `str.format` method.

    `janitor.SD` offers a more explicit form
    of passing tuples to the `summarize` function.

    `by` accepts a label, labels, mapping or function.
    Arguments supported in `pd.DataFrame.groupby`
    can also be passed to `by` via a dictionary.

    If a function returns a DataFrame for a column, each of its columns
    is added to the result, prefixed with the output name and an underscore.

    Example:

        >>> import pandas as pd
        >>> import numpy as np
        >>> import janitor as jn
        >>> data = {'avg_jump': [3, 4, 1, 2, 3, 4],
        ...         'avg_run': [3, 4, 1, 3, 2, 4],
        ...         'avg_swim': [2, 1, 2, 2, 3, 4],
        ...         'combine_id': [100200, 100200,
        ...                        101200, 101200,
        ...                        102201, 103202],
        ...         'category': ['heats', 'heats',
        ...                      'finals', 'finals',
        ...                      'heats', 'finals']}
        >>> df = pd.DataFrame(data)
        >>> df.summarize(("avg_run", "mean"), by=['combine_id', 'category'])
                             avg_run
        combine_id category
        100200     heats         3.5
        101200     finals        2.0
        102201     heats         2.0
        103202     finals        4.0

        Summarize with a new column name:

        >>> df.summarize(("avg_run", "mean", "avg_run_2"))
           avg_run_2
        0   2.833333
        >>> df.summarize(("avg_run", "mean", "avg_run_2"), by=['combine_id', 'category'])
                             avg_run_2
        combine_id category
        100200     heats           3.5
        101200     finals          2.0
        102201     heats           2.0
        103202     finals          4.0

        Summarize with the placeholders in `names_glue`:

        >>> cols = jn.SD(columns="avg*", func="mean", names_glue="{_col}_{_fn}")
        >>> df.summarize(cols)
           avg_jump_mean  avg_run_mean  avg_swim_mean
        0       2.833333      2.833333       2.333333
        >>> df.summarize(cols, by=['combine_id', 'category'])
                             avg_jump_mean  avg_run_mean  avg_swim_mean
        combine_id category
        100200     heats               3.5           3.5            1.5
        101200     finals              1.5           2.0            2.0
        102201     heats               3.0           2.0            3.0
        103202     finals              4.0           4.0            4.0

    :param df: A pandas DataFrame.
    :param args: One or more tuples of the form
        `(columns, func)` or `(columns, func, names_glue)`.
    :param by: Column(s) to group by. A dictionary is unpacked
        as keyword arguments to `pd.DataFrame.groupby`.
    :raises ValueError: If a tuple has fewer than 2
        or more than 3 entries.
    :returns: A pandas DataFrame with summarized columns.
    """  # noqa: E501

    # Validate every positional argument up front, so nothing is
    # aggregated if any one of them is malformed.
    args_to_process = []
    for num, arg in enumerate(args):
        check(f"Argument {num} in the summarize function", arg, [tuple])
        if len(arg) < 2:
            raise ValueError(
                f"Argument {num} should have a minimum length of 2, "
                f"instead got {len(arg)}"
            )
        if len(arg) > 3:
            raise ValueError(
                f"Argument {num} should have a maximum length of 3, "
                f"instead got {len(arg)}"
            )
        entry = SD(*arg)
        func = entry.func
        names = entry.names_glue
        check(
            f"The function (position 1 in the tuple) for argument {num} ",
            func,
            [str, callable, list, tuple],
        )
        if isinstance(func, (list, tuple)):
            for number, funcn in enumerate(func):
                check(
                    f"Entry {number} in the function sequence "
                    f"for argument {num}",
                    funcn,
                    [str, callable],
                )

        if names:
            check(
                f"The names (position 2 in the tuple) for argument {num} ",
                names,
                [str],
            )
        args_to_process.append(entry)

    # Build the groupby object once; `None` means "aggregate the whole frame".
    if by is None:
        grp = None
    elif isinstance(by, dict):
        grp = df.groupby(**by)
    else:
        grp = df.groupby(by)

    # Maps output column label -> aggregated values, in insertion order.
    aggs = {}

    for arg in args_to_process:
        columns, names, func_names_and_func, dupes = _process_SD(df, arg)
        for col, (fn_name, funcn) in product(columns, func_names_and_func):
            val = df[col] if grp is None else grp[col]

            # Resolve the output label: an explicit glue template wins,
            # then a col+function label for names listed in `dupes`,
            # otherwise the column name itself.
            if names:
                label = names.format(_col=col, _fn=fn_name)
            elif fn_name in dupes:
                label = f"{col}{fn_name}"
            else:
                label = col

            if isinstance(funcn, str):
                outcome = val.agg(funcn)
            else:
                # Prefer `.agg`; if it rejects the callable, apply the
                # callable directly to the column / grouped column.
                try:
                    outcome = val.agg(funcn)
                except (ValueError, AttributeError):
                    outcome = funcn(val)

            if isinstance(outcome, pd.DataFrame):
                # `add_prefix` returns a new frame (the function's result
                # is not mutated) and also copes with non-string labels.
                aggs.update(outcome.add_prefix(f"{label}_"))
            else:
                if is_scalar(outcome):
                    # Wrap scalars so the final frame has a single row.
                    outcome = [outcome]
                aggs[label] = outcome

    return pd.DataFrame(aggs, copy=False)
| 1 | +# """Alternative function to pd.agg for summarizing data.""" |
| 2 | +# from typing import Any |
| 3 | +# import pandas as pd |
| 4 | +# import pandas_flavor as pf |
| 5 | + |
| 6 | +# from janitor.utils import check |
| 7 | +# from pandas.api.types import is_scalar |
| 8 | + |
| 9 | +# from janitor.functions.utils import SD, _process_SD |
| 10 | +# from itertools import product |
| 11 | + |
| 12 | + |
| 13 | +# @pf.register_dataframe_method |
| 14 | +# def summarize( |
| 15 | +# df: pd.DataFrame, |
| 16 | +# *args, |
| 17 | +# by: Any = None, |
| 18 | +# ) -> pd.DataFrame: |
| 19 | +# """ |
| 20 | + |
| 21 | +# !!! info "New in version 0.25.0" |
| 22 | + |
| 23 | +# !!!note |
| 24 | + |
| 25 | +# Before reaching for `summarize`, try `pd.DataFrame.agg`. |
| 26 | + |
| 27 | +# Reduction operation on columns via a tuple. |
| 28 | + |
| 29 | +# It is a wrapper around `pd.DataFrame.agg`, |
| 30 | +# with added flexibility for multiple columns. |
| 31 | + |
| 32 | +# The argument should be of the form `(columns, func, names_glue)`; |
| 33 | +# the `names_glue` argument is optional. |
| 34 | +# `columns` can be selected with the |
| 35 | +# [`select_columns`][janitor.functions.select.select_columns] |
| 36 | +# syntax for flexibility. |
| 37 | +# The function `func` should be a string |
| 38 | +# (which is dispatched to `pd.Series.agg`), |
| 39 | +# or a callable, or a list/tuple of strings/callables. |
| 40 | +# The function is called on each column in `columns`. |
| 41 | + |
| 42 | +# The `names_glue` argument allows for renaming, especially for |
| 43 | +# multiple columns or multiple functions. |
| 44 | +# The placeholders for `names_glue` are `_col`, which represents |
| 45 | +# the column name, and `_fn` which represents the function name. |
| 46 | +# Under the hood, it uses python's `str.format` method. |
| 47 | + |
| 48 | +# `janitor.SD` offers a more explicit form |
| 49 | +# of passing tuples to the `summarize` function. |
| 50 | + |
| 51 | +# `by` accepts a label, labels, mapping or function. |
| 52 | +# Arguments supported in `pd.DataFrame.groupby` |
| 53 | +# can also be passed to `by` via a dictionary. |
| 54 | + |
| 55 | + |
| 56 | +# Example: |
| 57 | + |
| 58 | +# >>> import pandas as pd |
| 59 | +# >>> import numpy as np |
| 60 | +# >>> import janitor as jn |
| 61 | +# >>> data = {'avg_jump': [3, 4, 1, 2, 3, 4], |
| 62 | +# ... 'avg_run': [3, 4, 1, 3, 2, 4], |
| 63 | +# ... 'avg_swim': [2, 1, 2, 2, 3, 4], |
| 64 | +# ... 'combine_id': [100200, 100200, |
| 65 | +# ... 101200, 101200, |
| 66 | +# ... 102201, 103202], |
| 67 | +# ... 'category': ['heats', 'heats', |
| 68 | +# ... 'finals', 'finals', |
| 69 | +# ... 'heats', 'finals']} |
| 70 | +# >>> df = pd.DataFrame(data) |
| 71 | +# >>> df.summarize(("avg_run", "mean"), by=['combine_id', 'category']) |
| 72 | +# avg_run |
| 73 | +# combine_id category |
| 74 | +# 100200 heats 3.5 |
| 75 | +# 101200 finals 2.0 |
| 76 | +# 102201 heats 2.0 |
| 77 | +# 103202 finals 4.0 |
| 78 | + |
| 79 | +# Summarize with a new column name: |
| 80 | + |
| 81 | +# >>> df.summarize(("avg_run", "mean", "avg_run_2")) |
| 82 | +# avg_run_2 |
| 83 | +# 0 2.833333 |
| 84 | +# >>> df.summarize(("avg_run", "mean", "avg_run_2"), |
| 85 | +# ...              by=['combine_id', 'category']) |
| 86 | +# avg_run_2 |
| 87 | +# combine_id category |
| 88 | +# 100200 heats 3.5 |
| 89 | +# 101200 finals 2.0 |
| 90 | +# 102201 heats 2.0 |
| 91 | +# 103202 finals 4.0 |
| 92 | + |
| 93 | +# Summarize with the placeholders in `names_glue`: |
| 94 | + |
| 95 | +# >>> cols = jn.SD(columns="avg*", func="mean", |
| 96 | +# ...              names_glue="{_col}_{_fn}") |
| 97 | +# >>> df.summarize(cols) |
| 98 | +# avg_jump_mean avg_run_mean avg_swim_mean |
| 99 | +# 0 2.833333 2.833333 2.333333 |
| 100 | +# >>> df.summarize(cols, by=['combine_id', 'category']) |
| 101 | +# avg_jump_mean avg_run_mean avg_swim_mean |
| 102 | +# combine_id category |
| 103 | +# 100200 heats 3.5 3.5 1.5 |
| 104 | +# 101200 finals 1.5 2.0 2.0 |
| 105 | +# 102201 heats 3.0 2.0 3.0 |
| 106 | +# 103202 finals 4.0 4.0 4.0 |
| 107 | + |
| 108 | +# :param df: A pandas DataFrame. |
| 109 | +# :param args: A tuple. |
| 110 | +# :param by: Column(s) to group by. |
| 111 | +# :raises ValueError: If the tuple size is less than 2. |
| 112 | +# :returns: A pandas DataFrame with summarized columns. |
| 113 | +# """ # noqa: E501 |
| 114 | + |
| 115 | +# args_to_process = [] |
| 116 | +# for num, arg in enumerate(args): |
| 117 | +# check(f"Argument {num} in the summarize function", arg, [tuple]) |
| 118 | +# if len(arg) < 2: |
| 119 | +# raise ValueError( |
| 120 | +# f"Argument {num} should have a minimum length of 2, " |
| 121 | +# f"instead got {len(arg)}" |
| 122 | +# ) |
| 123 | +# if len(arg) > 3: |
| 124 | +# raise ValueError( |
| 125 | +# f"Argument {num} should have a maximum length of 3, " |
| 126 | +# f"instead got {len(arg)}" |
| 127 | +# ) |
| 128 | +# entry = SD(*arg) |
| 129 | +# func = entry.func |
| 130 | +# names = entry.names_glue |
| 131 | +# check( |
| 132 | +# f"The function (position 1 in the tuple) for argument {num} ", |
| 133 | +# func, |
| 134 | +# [str, callable, list, tuple], |
| 135 | +# ) |
| 136 | +# if isinstance(func, (list, tuple)): |
| 137 | +# for number, funcn in enumerate(func): |
| 138 | +# check( |
| 139 | +# f"Entry {number} in the function sequence " |
| 140 | +# f"for argument {num}", |
| 141 | +# funcn, |
| 142 | +# [str, callable], |
| 143 | +# ) |
| 144 | + |
| 145 | +# if names: |
| 146 | +# check( |
| 147 | +# f"The names (position 2 in the tuple) for argument {num} ", |
| 148 | +# names, |
| 149 | +# [str], |
| 150 | +# ) |
| 151 | +# args_to_process.append(entry) |
| 152 | + |
| 153 | +# by_is_true = by is not None |
| 154 | +# grp = None |
| 155 | +# if by_is_true and isinstance(by, dict): |
| 156 | +# grp = df.groupby(**by) |
| 157 | +# elif by_is_true: |
| 158 | +# grp = df.groupby(by) |
| 159 | + |
| 160 | +# aggs = {} |
| 161 | + |
| 162 | +# for arg in args_to_process: |
| 163 | +# columns, names, func_names_and_func, dupes = _process_SD(df, arg) |
| 164 | +# for col, (name, funcn) in product(columns, func_names_and_func): |
| 165 | +# val = grp[col] if by_is_true else df[col] |
| 166 | +# if names: |
| 167 | +# name = names.format(_col=col, _fn=name) |
| 168 | +# elif name in dupes: |
| 169 | +# name = f"{col}{name}" |
| 170 | +# else: |
| 171 | +# name = col |
| 172 | +# if isinstance(funcn, str): |
| 173 | +# outcome = val.agg(funcn) |
| 174 | +# else: |
| 175 | +# try: |
| 176 | +# outcome = val.agg(funcn) |
| 177 | +# except (ValueError, AttributeError): |
| 178 | +# outcome = funcn(val) |
| 179 | +# if isinstance(outcome, pd.DataFrame): |
| 180 | +# outcome.columns = f"{name}_" + outcome.columns |
| 181 | +# aggs.update(outcome) |
| 182 | +# else: |
| 183 | +# if is_scalar(outcome): |
| 184 | +# outcome = [outcome] |
| 185 | +# aggs[name] = outcome |
| 186 | +# return pd.DataFrame(aggs, copy=False) |
0 commit comments