Skip to content

Commit 8db6af6

Browse files
authored
[stdlib-candidate] add aggregate (#991)
# Description Add `aggregate`, a command that operates on the output of `group-by --to-table` to help aggregate to do quick inspections. # Related - nushell/nushell#14316 (comment) - nushell/nushell#2607 - nushell/nushell#14337 # Examples ```nushell open ~/Downloads/movies.csv | group-by Lead_Studio Genre --to-table | aggregate Worldwide_Gross # | first 4 # | to md ``` |Lead_Studio|Genre|count|Worldwide_Gross_min|Worldwide_Gross_avg|Worldwide_Gross_max|Worldwide_Gross_sum| |-|-|-|-|-|-|-| |The Weinstein Company|Comedy|1|19.62|19.62|19.62|19.62| |The Weinstein Company|Drama|1|8.26|8.26|8.26|8.26| |Independent|Comedy|7|14.31|57.01|205.3|399.07| |Independent|Romance|7|0.03|149.82142857142858|702.17|1048.75| --- ```nushell open ~/Downloads/movies.csv | group-by Lead_Studio Genre --to-table | aggregate Worldwide_Gross --ops {avg: {math avg}, std: {math stddev}} # | first 4 # | to md ``` |Lead_Studio|Genre|count|Worldwide_Gross_avg|Worldwide_Gross_std| |-|-|-|-|-| |The Weinstein Company|Comedy|1|19.62|0| |The Weinstein Company|Drama|1|8.26|0| |Independent|Comedy|7|57.01|66.1709932134704| |Independent|Romance|7|149.82142857142858|229.79475832816996| --- ```nushell open ~/Downloads/movies.csv | group-by Lead_Studio Genre --to-table | aggregate Worldwide_Gross Audience_score_% --ops {avg: {math avg}} # | first 4 # | to md ``` |Lead_Studio|Genre|count|Worldwide_Gross_avg|Audience_score_%_avg| |-|-|-|-|-| |The Weinstein Company|Comedy|1|19.62|52| |The Weinstein Company|Drama|1|8.26|84| |Independent|Comedy|7|57.01|60.142857142857146| |Independent|Romance|7|149.82142857142858|59.857142857142854|
1 parent a83a40d commit 8db6af6

File tree

2 files changed

+321
-0
lines changed

2 files changed

+321
-0
lines changed
Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
# Build the operations `aggregate` applies when `--ops` is not given.
#
# Returns a record whose keys (`min`, `avg`, `max`, `sum`) name each operation
# and whose values are closures running the matching `math` command on their
# pipeline input.
def aggregate-default-ops [] {
    {
        min: {|| math min}
        avg: {|| math avg}
        max: {|| math max}
        sum: {|| math sum}
    }
}
9+
10+
# Name the output column for one (column, operation) pair.
#
# The members of `col` are joined with "." and `op_name` is appended after an
# underscore, e.g. `Worldwide_Gross` + `avg` gives `Worldwide_Gross_avg`.
def aggregate-col-name [col: cell-path, op_name: string]: [nothing -> string] {
    let dotted_path = $col | split cell-path | get value | str join "."
    $"($dotted_path)_($op_name)"
}
13+
14+
# Read `col` from the piped-in table, replacing any lookup failure with a
# custom "Cannot find column" error.
#
# - `col`: cell path to read from the input
# - `opts.span`: span the error label points at
# - `opts.items`: when true, the path shown in the message is prefixed with `items`
def get-item-with-error [
    col: cell-path,
    opts: record<span: record<start: int, end: int>, items: bool>
]: [table -> any] {
    try {
        get $col
    } catch {
        let shown_path = if not $opts.items {
            $col
        } else {
            # Put an `items` member in front of the path before rendering it.
            let members = $col | split cell-path
            $members | prepend {value: items, optional: false} | into cell-path
        }
        let label = {text: "value originates here", span: $opts.span}
        error make {msg: $"Cannot find column '($shown_path)'", label: $label}
    }
}
38+
39+
# Raise the "input must be a table" error, with its label pointing at `span`.
#
# The help text reminds the user about the `--to-table` flag of `group-by`.
def "error not-a-table" [span: record<start: int, end:int>] {
    let label = {text: "from here", span: $span}
    let hint = "Are you using `group-by`? Make sure to use its `--to-table` flag."
    error make {msg: "input must be a table", label: $label, help: $hint}
}
49+
50+
# Run aggregate operations on output of `group-by --to-table`.
#
# Every column in `columns` is piped through every closure in `--ops`; each
# result lands in a `<column>_<op>` column, next to a leading `count` column
# holding the number of rows in the group. Input whose first row has no `items`
# column is aggregated as one single group. An empty `--ops` record yields only
# the `count` column.
#
# # Example
#
# - group files by type and extension, and get stats about their sizes
# ```nushell
# >_ ls | group-by type { get name | path parse | get extension } --to-table | aggregate size
# ```
#
# - group data by multiple columns, and run custom aggregate operations
# ```nushell
# >_ open movies.csv
#    | group-by Lead_Studio Genre --to-table
#    | aggregate Worldwide_Gross Profitability --ops {avg: {math avg}, std: {math stddev}}
# ```
#
# - run aggregate operations without grouping the input
# ```nushell
# >_ open movies.csv | aggregate Year
# ```
export def main [
    --ops: record, # default = {min: {math min}, avg: {math avg}, max: {math max}, sum: {math sum}}
    ...columns: cell-path, # columns to perform aggregations on
]: [
    table -> table<count: int>
] {
    let IN = $in
    let md = metadata $in

    # Whatever `first` rejects, and any first row that is not a record, is
    # reported as "input must be a table".
    let first = try { $IN | first } catch { error not-a-table $md.span }
    if not (($first | describe) starts-with record) {
        error not-a-table $md.span
    }

    # An `items` column in the first row marks the input as grouped.
    let grouped = "items" in $first

    # Ungrouped input is wrapped into a single group so both cases share the
    # same code path below.
    let IN = if $grouped {
        $IN
    } else {
        [{items: $IN}]
    }

    let agg_ops = $ops | default (aggregate-default-ops)

    let results = $IN
        | update items {|group|
            let column_results = $columns
                | each {|col| # col: cell-path
                    let column = $group.items | get-item-with-error $col {span: $md.span, items: $grouped}
                    # One single-field record per operation, folded into one record
                    # per column. Folding from `{}` keeps an empty `--ops` record
                    # from making `reduce` fail on empty input.
                    $agg_ops | items {|op_name, op| # op_name: string, op: closure
                        $column | do $op | wrap (aggregate-col-name $col $op_name)
                    }
                    | reduce --fold {} {|it| merge $it}
                }

            # Manually propagate errors
            for r in $column_results {
                if ($r | describe) == error {
                    return $r
                }
            }

            $column_results
            | reduce --fold {} {|it| merge $it}
            | insert count ($group.items | length)
            | roll right # put count as the first column
        }

    # Manually propagate errors
    for r in $results {
        if ($r.items | describe) == error {
            return $r.items
        }
    }

    # Spread each group's aggregate record into columns of the group's row.
    $results | flatten items
}
Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
use std/assert
2+
use ../std-rfc/aggregate
3+
4+
const movies = [
5+
[ Film, Genre, Lead_Studio, Audience_score_%, Profitability, Rotten_Tomatoes_%, Worldwide_Gross, Year ];
6+
[ "Youth in Revolt", Comedy, "The Weinstein Company", 52, 1.09, 68, 19.62, 2010 ],
7+
[ "You Will Meet a Tall Dark Stranger", Comedy, Independent, 35, 1.211818182, 43, 26.66, 2010 ],
8+
[ "When in Rome", Comedy, Disney, 44, 0, 15, 43.04, 2010 ],
9+
[ "What Happens in Vegas", Comedy, Fox, 72, 6.267647029, 28, 219.37, 2008 ],
10+
[ "Water For Elephants", Drama, "20th Century Fox", 72, 3.081421053, 60, 117.09, 2011 ],
11+
[ WALL-E, Animation, Disney, 89, 2.896019067, 96, 521.28, 2008 ],
12+
[ Waitress, Romance, Independent, 67, 11.0897415, 89, 22.18, 2007 ],
13+
[ "Waiting For Forever", Romance, Independent, 53, 0.005, 6, 0.03, 2011 ],
14+
[ "Valentine's Day", Comedy, "Warner Bros.", 54, 4.184038462, 17, 217.57, 2010 ],
15+
[ "Tyler Perry's Why Did I get Married", Romance, Independent, 47, 3.7241924, 46, 55.86, 2007 ],
16+
[ "Twilight: Breaking Dawn", Romance, Independent, 68, 6.383363636, 26, 702.17, 2011 ],
17+
[ Twilight, Romance, Summit, 82, 10.18002703, 49, 376.66, 2008 ],
18+
[ "The Ugly Truth", Comedy, Independent, 68, 5.402631579, 14, 205.3, 2009 ],
19+
[ "The Twilight Saga: New Moon", Drama, Summit, 78, 14.1964, 27, 709.82, 2009 ],
20+
[ "The Time Traveler's Wife", Drama, Paramount, 65, 2.598205128, 38, 101.33, 2009 ],
21+
[ "The Proposal", Comedy, Disney, 74, 7.8675, 43, 314.7, 2009 ],
22+
[ "The Invention of Lying", Comedy, "Warner Bros.", 47, 1.751351351, 56, 32.4, 2009 ],
23+
[ "The Heartbreak Kid", Comedy, Paramount, 41, 2.129444167, 30, 127.77, 2007 ],
24+
[ "The Duchess", Drama, Paramount, 68, 3.207850222, 60, 43.31, 2008 ],
25+
[ "The Curious Case of Benjamin Button", Fantasy, "Warner Bros.", 81, 1.78394375, 73, 285.43, 2008 ],
26+
[ "The Back-up Plan", Comedy, CBS, 47, 2.202571429, 20, 77.09, 2010 ],
27+
[ Tangled, Animation, Disney, 88, 1.365692308, 89, 355.01, 2010 ],
28+
[ "Something Borrowed", Romance, Independent, 48, 1.719514286, 15, 60.18, 2011 ],
29+
[ "She's Out of My League", Comedy, Paramount, 60, 2.4405, 57, 48.81, 2010 ],
30+
[ "Sex and the City Two", Comedy, "Warner Bros.", 49, 2.8835, 15, 288.35, 2010 ],
31+
[ "Sex and the City 2", Comedy, "Warner Bros.", 49, 2.8835, 15, 288.35, 2010 ],
32+
[ "Sex and the City", Comedy, "Warner Bros.", 81, 7.221795791, 49, 415.25, 2008 ],
33+
[ "Remember Me", Drama, Summit, 70, 3.49125, 28, 55.86, 2010 ],
34+
[ "Rachel Getting Married", Drama, Independent, 61, 1.384166667, 85, 16.61, 2008 ],
35+
[ Penelope, Comedy, Summit, 74, 1.382799733, 52, 20.74, 2008 ],
36+
[ "P.S. I Love You", Romance, Independent, 82, 5.103116833, 21, 153.09, 2007 ],
37+
[ "Over Her Dead Body", Comedy, "New Line", 47, 2.071, 15, 20.71, 2008 ],
38+
[ "Our Family Wedding", Comedy, Independent, 49, 0, 14, 21.37, 2010 ],
39+
[ "One Day", Romance, Independent, 54, 3.682733333, 37, 55.24, 2011 ],
40+
[ "Not Easily Broken", Drama, Independent, 66, 2.14, 34, 10.7, 2009 ],
41+
[ "No Reservations", Comedy, "Warner Bros.", 64, 3.307180357, 39, 92.6, 2007 ],
42+
[ "Nick and Norah's Infinite Playlist", Comedy, Sony, 67, 3.3527293, 73, 33.53, 2008 ],
43+
[ "New Year's Eve", Romance, "Warner Bros.", 48, 2.536428571, 8, 142.04, 2011 ],
44+
[ "My Week with Marilyn", Drama, "The Weinstein Company", 84, 0.8258, 83, 8.26, 2011 ],
45+
[ "Music and Lyrics", Romance, "Warner Bros.", 70, 3.64741055, 63, 145.9, 2007 ],
46+
[ "Monte Carlo", Romance, "20th Century Fox", 50, 1.9832, 38, 39.66, 2011 ],
47+
[ "Miss Pettigrew Lives for a Day", Comedy, Independent, 70, 0.2528949, 78, 15.17, 2008 ],
48+
[ "Midnight in Paris", Romence, Sony, 84, 8.744705882, 93, 148.66, 2011 ],
49+
[ "Marley and Me", Comedy, Fox, 77, 3.746781818, 63, 206.07, 2008 ],
50+
[ "Mamma Mia!", Comedy, Universal, 76, 9.234453864, 53, 609.47, 2008 ],
51+
[ "Mamma Mia!", Comedy, Universal, 76, 9.234453864, 53, 609.47, 2008 ],
52+
[ "Made of Honor", Comdy, Sony, 61, 2.64906835, 13, 105.96, 2008 ],
53+
[ "Love Happens", Drama, Universal, 40, 2.004444444, 18, 36.08, 2009 ],
54+
[ "Love & Other Drugs", Comedy, Fox, 55, 1.817666667, 48, 54.53, 2010 ],
55+
[ "Life as We Know It", Comedy, Independent, 62, 2.530526316, 28, 96.16, 2010 ],
56+
[ "License to Wed", Comedy, "Warner Bros.", 55, 1.9802064, 8, 69.31, 2007 ],
57+
[ "Letters to Juliet", Comedy, Summit, 62, 2.639333333, 40, 79.18, 2010 ],
58+
[ "Leap Year", Comedy, Universal, 49, 1.715263158, 21, 32.59, 2010 ],
59+
[ "Knocked Up", Comedy, Universal, 83, 6.636401848, 91, 219, 2007 ],
60+
[ Killers, Action, Lionsgate, 45, 1.245333333, 11, 93.4, 2010 ],
61+
[ "Just Wright", Comedy, Fox, 58, 1.797416667, 45, 21.57, 2010 ],
62+
[ "Jane Eyre", Romance, Universal, 77, 0, 85, 30.15, 2011 ],
63+
[ "It's Complicated", Comedy, Universal, 63, 2.642352941, 56, 224.6, 2009 ],
64+
[ "I Love You Phillip Morris", Comedy, Independent, 57, 1.34, 71, 20.1, 2010 ],
65+
[ "High School Musical 3: Senior Year", Comedy, Disney, 76, 22.91313646, 65, 252.04, 2008 ],
66+
[ "He's Just Not That Into You", Comedy, "Warner Bros.", 60, 7.1536, 42, 178.84, 2009 ],
67+
[ "Good Luck Chuck", Comedy, Lionsgate, 61, 2.36768512, 3, 59.19, 2007 ],
68+
[ "Going the Distance", Comedy, "Warner Bros.", 56, 1.3140625, 53, 42.05, 2010 ],
69+
[ "Gnomeo and Juliet", Animation, Disney, 52, 5.387972222, 56, 193.97, 2011 ],
70+
[ "Gnomeo and Juliet", Animation, Disney, 52, 5.387972222, 56, 193.97, 2011 ],
71+
[ "Ghosts of Girlfriends Past", Comedy, "Warner Bros.", 47, 2.0444, 27, 102.22, 2009 ],
72+
[ "Four Christmases", Comedy, "Warner Bros.", 52, 2.022925, 26, 161.83, 2008 ],
73+
[ Fireproof, Drama, Independent, 51, 66.934, 40, 33.47, 2008 ],
74+
[ Enchanted, Comedy, Disney, 80, 4.005737082, 93, 340.49, 2007 ],
75+
[ "Dear John", Drama, Sony, 66, 4.5988, 29, 114.97, 2010 ],
76+
[ Beginners, Comedy, Independent, 80, 4.471875, 84, 14.31, 2011 ],
77+
[ "Across the Universe", romance, Independent, 84, 0.652603178, 54, 29.37, 2007 ],
78+
[ "A Serious Man", Drama, Universal, 64, 4.382857143, 89, 30.68, 2009 ],
79+
[ "A Dangerous Method", Drama, Independent, 89, 0.44864475, 79, 8.97, 2011 ],
80+
[ "27 Dresses", Comedy, Fox, 71, 5.3436218, 40, 160.31, 2008 ],
81+
[ "(500) Days of Summer", comedy, Fox, 81, 8.096, 87, 60.72, 2009 ]
82+
]
83+
84+
#[test]
85+
# With no columns requested, `aggregate` should report only the size of each
# `Lead_Studio` group.
def count_movies_by_Lead_Studio [] {
    let grouped = $movies | group-by Lead_Studio --to-table
    let actual = $grouped | aggregate

    # Alternative way to derive the expectation (kept for reference):
    #   $grouped | insert count {get items | length} | select Lead_Studio count
    let expected = [
        {Lead_Studio: "The Weinstein Company", count: 2}
        {Lead_Studio: "Independent", count: 19}
        {Lead_Studio: "Disney", count: 8}
        {Lead_Studio: "Fox", count: 6}
        {Lead_Studio: "20th Century Fox", count: 2}
        {Lead_Studio: "Warner Bros.", count: 14}
        {Lead_Studio: "Summit", count: 5}
        {Lead_Studio: "Paramount", count: 4}
        {Lead_Studio: "CBS", count: 1}
        {Lead_Studio: "New Line", count: 1}
        {Lead_Studio: "Sony", count: 4}
        {Lead_Studio: "Universal", count: 8}
        {Lead_Studio: "Lionsgate", count: 2}
    ]

    assert equal $actual $expected
}
108+
109+
#[test]
110+
# A single custom operation (`avg`) on one column should produce one
# `Worldwide_Gross_avg` value per `Genre` group.
def average_gross_by_Genre [] {
    let grouped = $movies | group-by Genre --to-table

    # Alternative way to derive the expectation (kept for reference):
    #   $grouped | insert Worldwide_Gross_avg {get items.Worldwide_Gross | math avg} | select Genre Worldwide_Gross_avg

    # Round to 2 digits of precision to keep floating point operations consistent between platforms.
    let actual = $grouped
        | aggregate --ops {avg: {math avg}} Worldwide_Gross
        | select Genre Worldwide_Gross_avg
        | update Worldwide_Gross_avg {math round --precision 2}

    let expected = [
        {Genre: "Comedy", Worldwide_Gross_avg: 148.33}
        {Genre: "Drama", Worldwide_Gross_avg: 99.01}
        {Genre: "Animation", Worldwide_Gross_avg: 316.06}
        {Genre: "Romance", Worldwide_Gross_avg: 148.60}
        {Genre: "Fantasy", Worldwide_Gross_avg: 285.43}
        {Genre: "Romence", Worldwide_Gross_avg: 148.66}
        {Genre: "Comdy", Worldwide_Gross_avg: 105.96}
        {Genre: "Action", Worldwide_Gross_avg: 93.40}
        {Genre: "romance", Worldwide_Gross_avg: 29.37}
        {Genre: "comedy", Worldwide_Gross_avg: 60.72}
    ]

    assert equal $actual $expected
}
133+
134+
#[test]
135+
# Without `--ops`, `aggregate` should emit the `min`, `avg`, `max` and `sum`
# columns for the requested column, after the group key and `count`.
def aggregate_default_ops [] {
    let stat_columns = [Worldwide_Gross_min, Worldwide_Gross_avg, Worldwide_Gross_max, Worldwide_Gross_sum]

    # Round to 2 digits of precision to keep floating point operations consistent between platforms.
    let actual = $movies
        | group-by Genre --to-table
        | aggregate Worldwide_Gross
        | update cells --columns $stat_columns { math round --precision 2 }

    let expected = [
        [Genre, count, Worldwide_Gross_min, Worldwide_Gross_avg, Worldwide_Gross_max, Worldwide_Gross_sum];
        [Comedy, 41, 14.31, 148.33, 609.47, 6081.73]
        [Drama, 13, 8.26, 99.01, 709.82, 1287.15]
        [Animation, 4, 193.97, 316.06, 521.28, 1264.23]
        [Romance, 12, 0.03, 148.60, 702.17, 1783.16]
        [Fantasy, 1, 285.43, 285.43, 285.43, 285.43]
        [Romence, 1, 148.66, 148.66, 148.66, 148.66]
        [Comdy, 1, 105.96, 105.96, 105.96, 105.96]
        [Action, 1, 93.40, 93.40, 93.40, 93.40]
        [romance, 1, 29.37, 29.37, 29.37, 29.37]
        [comedy, 1, 60.72, 60.72, 60.72, 60.72]
    ]

    assert equal $actual $expected
}
158+
159+
#[test]
160+
# Piping `group-by` output produced without `--to-table` into `aggregate`
# should fail with the "input must be a table" message.
def throw_error_on_non-table_input [] {
    let message = try {
        # without --to-table
        $movies | group-by Genre | aggregate Worldwide_Gross
    } catch {|err|
        $err.msg
    }

    assert equal $message "input must be a table"
}
170+
171+
#[test]
172+
# Asking for a column that no group contains should surface a
# "Cannot find column" error naming the `items`-prefixed path.
def throw_error_on_non-existing_column [] {
    let grouped = $movies | group-by Genre --to-table

    # The caught error is decoded from its JSON form so that the nested cause
    # can be inspected.
    let caught = try {
        $grouped | aggregate --ops {avg: {math avg}} NotInTheDataSet
    } catch {|err|
        $err.json | from json
    }

    let inner_message = $caught.inner.0.msg
    assert equal $inner_message "Cannot find column '$.items.NotInTheDataSet'"
}
182+
183+
#[test]
184+
# Ungrouped input should be aggregated as one group: a single row holding
# `count` and the default statistics of the requested column.
def aggregate_stats_without_grouping [] {
    let stat_columns = [Year_min Year_avg Year_max Year_sum]
    let actual = $movies
        | aggregate Year
        | update cells --columns $stat_columns {math round --precision 2}

    let expected = [
        [count, Year_min, Year_avg, Year_max, Year_sum];
        [76, 2007, 2009.09, 2011, 152691]
    ]

    assert equal $actual $expected
}

0 commit comments

Comments
 (0)