@@ -12,28 +12,49 @@ use crate::CliResult;
1212static USAGE : & str = "
1313Discretize selection of columns containing continuous data into bins.
1414
15- The bins table is formatted as CSV data :
15+ The resulting bins table will be formatted thusly :
1616
17- field,value,lower_bound,upper_bound,count
17+ field - Name of the column
18+ value - Bin's label (depends on what was given to -l/--label)
19+ lower_bound - Lower bound of the bin
20+ upper_bound - Upper bound of the bin
21+ count - Number of rows falling into this bin
22+
23+ The number of bins can be chosen with the -b/--bins flag. Note that,
24+ by default, this number is an approximate goal since the command
25+ attempts to find readable boundaries for the bins and this makes it
26+ hard to respect a precise number of bins. Use the -e/--exact flag
27+ if you want to force the command to respect -b/--bins exactly.
28+
29+ Combined with `xan hist`, this command can be very useful to visualize
30+ distributions of continuous columns:
31+
32+ $ xan bins -s count data.csv | xan hist
33+
34+ Using a log scale:
35+
36+ $ xan bins -s count data.csv | xan hist --scale log
1837
1938Usage:
2039 xan bins [options] [<input>]
2140 xan bins --help
2241
2342bins options:
24- -s, --select <arg> Select a subset of columns to compute bins
25- for. See 'xan select --help' for the format
26- details.
27- -b, --bins <number> Number of bins. Will default to using various heuristics
28- to find an optimal default number if not provided.
43+ -s, --select <arg> Select a subset of columns to compute bins for. See
44+ 'xan select --help' for more detail.
45+ -b, --bins <number> Number of bins to generate. Note that without -e/--exact,
46+ this number should be considered as an approximate goal.
47+ The command by default attempts to find nice & readable boundaries
48+ for the bins and this means a precise number of bins is not
49+ always achievable.
2950 [default: 10]
3051 -H, --heuristic <name> Heuristic to use to automatically find an adequate number
3152 of bins. Must be one of `freedman-diaconis`, `sqrt` or `sturges`.
3253 --max-bins <number> Maximum number of bins to generate. Only useful when using
3354 the -H/--heuristic flag.
34- -E , --nice Whether to choose nice boundaries for the bins.
35- Might return a number of bins slightly different to
36- what was passed to -b/-- bins, as a consequence .
55+ -e , --exact Whether to make sure to return the exact number of bins
56+ provided to -b/--bins, which means the readability of the
57+ bins boundaries might suffer .
3758 -l, --label <mode> Label to choose for the bins (that will be placed in the
3859 `value` column). Mostly useful to tweak representation when
3960 piping to `xan hist`. Can be one of \" full\" , \" lower\" or \" upper\" .
@@ -64,8 +85,8 @@ struct Args {
6485 flag_bins : usize ,
6586 flag_max_bins : Option < usize > ,
6687 flag_heuristic : Option < Heuristic > ,
67- flag_label : String ,
68- flag_nice : bool ,
88+ flag_label : LabelOption ,
89+ flag_exact : bool ,
6990 flag_min : Option < f64 > ,
7091 flag_max : Option < f64 > ,
7192}
@@ -86,13 +107,6 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
86107 . no_headers ( args. flag_no_headers )
87108 . select ( args. flag_select . clone ( ) ) ;
88109
89- if ![ "full" , "upper" , "lower" ] . contains ( & args. flag_label . as_str ( ) ) {
90- Err ( format ! (
91- "unknown --label {:?}, must be one of \" full\" , \" upper\" or \" lower\" ." ,
92- args. flag_label
93- ) ) ?;
94- }
95-
96110 let mut rdr = conf. simd_reader ( ) ?;
97111 let mut wtr = Config :: new ( & args. flag_output ) . simd_writer ( ) ?;
98112
@@ -117,7 +131,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
117131 args. flag_max_bins ,
118132 args. flag_min ,
119133 args. flag_max ,
120- args. flag_nice ,
134+ args. flag_exact ,
121135 ) {
122136 None => continue ,
123137 Some ( bins) => {
@@ -146,8 +160,8 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
146160 let label_format = if bin. is_constant ( ) {
147161 lower_bound
148162 } else {
149- match args. flag_label . as_str ( ) {
150- "full" => match bins_iter. peek ( ) {
163+ match args. flag_label {
164+ LabelOption :: Full => match bins_iter. peek ( ) {
151165 None => format ! (
152166 ">= {:lower_width$} <= {:upper_width$}" ,
153167 lower_bound,
@@ -163,13 +177,12 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
163177 upper_width = max_upper_bound_width
164178 ) ,
165179 } ,
166- "upper" => upper_bound,
167- "lower" => lower_bound,
168- _ => unreachable ! ( ) ,
180+ LabelOption :: Upper => upper_bound,
181+ LabelOption :: Lower => lower_bound,
169182 }
170183 } ;
171184
172- wtr. write_record ( vec ! [
185+ wtr. write_record ( [
173186 & headers[ series. column ] ,
174187 label_format. as_bytes ( ) ,
175188 bin. lower_bound . to_string ( ) . as_bytes ( ) ,
@@ -236,6 +249,13 @@ fn compute_rectified_iqr(numbers: &[f64], stats: &SeriesStats) -> Option<f64> {
236249 }
237250}
238251
252+ #[ derive( Deserialize , Clone , Copy ) ]
253+ enum LabelOption {
254+ Full ,
255+ Lower ,
256+ Upper ,
257+ }
258+
239259#[ derive( Deserialize , Clone , Copy , Debug ) ]
240260enum Heuristic {
241261 #[ serde( rename = "freedman-diaconis" ) ]
@@ -418,7 +438,7 @@ impl Series {
418438 max_bins : Option < usize > ,
419439 min : Option < f64 > ,
420440 max : Option < f64 > ,
421- nice : bool ,
441+ exact : bool ,
422442 ) -> Option < Vec < Bin > > {
423443 if self . len ( ) < 1 {
424444 return None ;
@@ -441,7 +461,7 @@ impl Series {
441461
442462 let count = self . solve_bins_count ( count, max_bins, width, & stats) ;
443463
444- let bins = if nice {
464+ let bins = if !exact {
445465 let scale = LinearScale :: nice ( ( min, max) , ( 0.0 , 1.0 ) , count) ;
446466 let mut ticks = scale. ticks ( count) ;
447467
0 commit comments