6
6
from janitor .utils import check
7
7
from pandas .api .types import is_scalar
8
8
9
- from janitor .functions .utils import _select_index , SD
10
- from collections import Counter
9
+ from janitor .functions .utils import SD , _process_SD
10
+ from itertools import product
11
11
12
12
13
13
@pf .register_dataframe_method
@@ -17,7 +17,7 @@ def summarize(
17
17
by : Any = None ,
18
18
) -> pd .DataFrame :
19
19
"""
20
- Reduction operation on columns via a dictionary or a tuple.
20
+ Reduction operation on columns via a tuple.
21
21
22
22
It is a wrapper around `pd.DataFrame.agg`,
23
23
with added flexibility for multiple columns.
@@ -28,8 +28,7 @@ def summarize(
28
28
for the entire dataframe,
29
29
or a row per group, if `by` is present.
30
30
31
- If the variable argument is a tuple,
32
- it has to be of the form `(columns, func, names_glue)`;
31
+ The argument should be of the form `(columns, func, names_glue)`;
33
32
the `names_glue` argument is optional.
34
33
`columns` can be selected with the
35
34
[`select_columns`][janitor.functions.select.select_columns]
@@ -48,7 +47,7 @@ def summarize(
48
47
of passing tuples to the `summarize` function.
49
48
50
49
51
- Example - Summarize with a dictionary :
50
+ Example:
52
51
53
52
>>> import pandas as pd
54
53
>>> import numpy as np
@@ -62,9 +61,7 @@ def summarize(
62
61
... 'combine_id': [100200, 100200, 101200, 101200, 102201, 103202],
63
62
... 'category': ['heats', 'heats', 'finals', 'finals', 'heats', 'finals']}
64
63
>>> df = pd.DataFrame(data)
65
- >>> (df
66
- ... .summarize({"avg_run":"mean"}, by=['combine_id', 'category'])
67
- ... )
64
+ >>> df.summarize(("avg_run", "mean"), by=['combine_id', 'category'])
68
65
avg_run
69
66
combine_id category
70
67
100200 heats 3.5
@@ -74,18 +71,18 @@ def summarize(
74
71
75
72
Summarize with a new column name:
76
73
77
- >>> df.summarize({"avg_run_2":df. avg_run. mean()} )
74
+ >>> df.summarize(("avg_run", "mean", "avg_run_2"))
78
75
avg_run_2
79
76
0 2.833333
80
- >>> df.summarize({"avg_run_2":lambda f: f.avg_run. mean()} , by=['combine_id', 'category'])
77
+ >>> df.summarize(("avg_run", "mean", "avg_run_2"), by=['combine_id', 'category'])
81
78
avg_run_2
82
79
combine_id category
83
80
100200 heats 3.5
84
81
101200 finals 2.0
85
82
102201 heats 2.0
86
83
103202 finals 4.0
87
84
88
- Summarize with a tuple :
85
+ Summarize with the placeholders in `names_glue`:
89
86
90
87
>>> cols = jn.SD(columns="avg*", func="mean", names_glue="{_col}_{_fn}")
91
88
>>> df.summarize(cols)
@@ -100,14 +97,15 @@ def summarize(
100
97
103202 finals 4.0 4.0 4.0
101
98
102
99
:param df: A pandas DataFrame.
103
- :param args: Either a dictionary or a tuple.
100
+ :param args: A tuple.
104
101
:param by: Column(s) to group by.
105
- :raises ValueError: If a tuple is passed and the length is not 3 .
102
+ :raises ValueError: If the tuple size is less than 2 or greater than 3.
106
103
:returns: A pandas DataFrame with summarized columns.
107
104
""" # noqa: E501
108
105
106
+ args_to_process = []
109
107
for num , arg in enumerate (args ):
110
- check (f"Argument { num } in the summarize function" , arg , [dict , tuple ])
108
+ check (f"Argument { num } in the summarize function" , arg , [tuple ])
111
109
if isinstance (arg , tuple ):
112
110
if len (arg ) < 2 :
113
111
raise ValueError (
@@ -119,28 +117,31 @@ def summarize(
119
117
f"Argument { num } should have a maximum length of 3, "
120
118
f"instead got { len (arg )} "
121
119
)
122
- _ , func , names = SD (* arg )
123
- check (
124
- f"The function (position 1 in the tuple) for argument { num } " ,
125
- func ,
126
- [str , callable , list , tuple ],
127
- )
128
- if isinstance (func , (list , tuple )):
129
- for number , funcn in enumerate (func ):
130
- check (
131
- f"Entry { number } in the function sequence "
132
- f"for argument { num } " ,
133
- funcn ,
134
- [str , callable ],
135
- )
136
-
137
- if names :
120
+ entry = SD (* arg )
121
+ func = entry .func
122
+ names = entry .names_glue
123
+ check (
124
+ f"The function (position 1 in the tuple) for argument { num } " ,
125
+ func ,
126
+ [str , callable , list , tuple ],
127
+ )
128
+ if isinstance (func , (list , tuple )):
129
+ for number , funcn in enumerate (func ):
138
130
check (
139
- f"The names (position 2 in the tuple) for argument { num } " ,
140
- names ,
141
- [str ],
131
+ f"Entry { number } in the function sequence "
132
+ f"for argument { num } " ,
133
+ funcn ,
134
+ [str , callable ],
142
135
)
143
136
137
+ if names :
138
+ check (
139
+ f"The names (position 2 in the tuple) for argument { num } " ,
140
+ names ,
141
+ [str ],
142
+ )
143
+ args_to_process .append (entry )
144
+
144
145
by_is_true = by is not None
145
146
grp = None
146
147
if by_is_true and isinstance (by , dict ):
@@ -150,70 +151,25 @@ def summarize(
150
151
151
152
aggs = {}
152
153
153
- for arg in args :
154
- if isinstance (arg , dict ):
155
- for col , func in arg .items ():
156
- val = grp if by_is_true else df
157
- if isinstance (func , str ):
158
- outcome = val [col ].agg (func )
159
- elif is_scalar (func ):
160
- outcome = func
161
- else :
162
- try :
163
- outcome = val .agg (func )
164
- except (ValueError , AttributeError ):
165
- outcome = func (val )
166
- aggs [col ] = outcome
167
- else :
168
- columns , func , names = SD (* arg )
169
- columns = _select_index ([columns ], df , axis = "columns" )
170
- columns = df .columns [columns ]
171
- if not isinstance (func , (list , tuple )):
172
- func = [func ]
173
- func_names = [
174
- funcn .__name__ if callable (funcn ) else funcn for funcn in func
175
- ]
176
- counts = None
177
- dupes = set ()
178
- if len (func ) > 1 :
179
- counts = Counter (func_names )
180
- counts = {key : 0 for key , value in counts .items () if value > 1 }
181
- # deal with duplicate function names
182
- if counts :
183
- func_list = []
184
- for funcn in func_names :
185
- if funcn in counts :
186
- if names :
187
- name = f"{ funcn } { counts [funcn ]} "
188
- else :
189
- name = f"{ counts [funcn ]} "
190
- dupes .add (name )
191
- func_list .append (name )
192
- counts [funcn ] += 1
193
- else :
194
- func_list .append (funcn )
195
- func_names = func_list
196
- counts = None
197
- func_names = tuple (zip (func_names , func ))
198
- for col in columns :
199
- val = grp [col ] if by_is_true else df [col ]
200
- for name , funcn in func_names :
201
- if names :
202
- name = names .format (_col = col , _fn = name )
203
- elif name in dupes :
204
- name = f"{ col } { name } "
205
- else :
206
- name = col
207
- if isinstance (funcn , str ):
208
- outcome = val .agg (funcn )
209
- else :
210
- try :
211
- outcome = val .agg (funcn )
212
- except (ValueError , AttributeError ):
213
- outcome = funcn (val )
214
- aggs [name ] = outcome
215
- aggs = {
216
- col : [outcome ] if is_scalar (outcome ) else outcome
217
- for col , outcome in aggs .items ()
218
- }
154
+ for arg in args_to_process :
155
+ columns , names , func_names_and_func , dupes = _process_SD (df , arg )
156
+ for col , (name , funcn ) in product (columns , func_names_and_func ):
157
+ val = grp [col ] if by_is_true else df [col ]
158
+ if names :
159
+ name = names .format (_col = col , _fn = name )
160
+ elif name in dupes :
161
+ name = f"{ col } { name } "
162
+ else :
163
+ name = col
164
+ if isinstance (funcn , str ):
165
+ outcome = val .agg (funcn )
166
+ else :
167
+ try :
168
+ outcome = val .agg (funcn )
169
+ except (ValueError , AttributeError ):
170
+ outcome = funcn (val )
171
+ if is_scalar (outcome ):
172
+ outcome = [outcome ]
173
+ aggs [name ] = outcome
174
+
219
175
return pd .DataFrame (aggs , copy = False )
0 commit comments