Better xan bins

Yomguithereal · Yomguithereal · commit f5ff5edc5b47 · 2025-11-17T14:09:12.000+01:00
Fix #489
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -19,6 +19,7 @@
 * Dropping `xan shuffle -m/--in-memory` flag. Loading the file into memory is now the default. The `xan shuffle -e/--external` flag has been added if
 you want the old default behavior.
 * `xan bins` now outputs `<empty>` values instead of `<nulls>`.
+* Overhauling `xan bins`. The default is now to find nice boundaries for the bins. Use `-e/--exact` to revert to the old behavior. The default number of bins is now `10`, and the command won't use the Freedman-Diaconis rule by default. A `-H/--heuristic` flag has been added if you want to automatically select a suitable number of bins.
 
 *Features*
 
diff --git a/docs/cmd/bins.md b/docs/cmd/bins.md
@@ -4,30 +4,58 @@
 ```txt
 Discretize selection of columns containing continuous data into bins.
 
-The bins table is formatted as CSV data:
+The resulting bins table will be formatted thusly:
 
-    field,value,lower_bound,upper_bound,count
+field       - Name of the column
+value       - Bin's label (depends on what was given to -l/--label)
+lower_bound - Lower bound of the bin
+upper_bound - Upper bound of the bin
+count       - Number of rows falling into this bin
+
+The number of bins can be chosen with the -b/--bins flag. Note that,
+by default, this number is an approximate goal since the command
+attempts to find readable boundaries for the bins and this makes it
+hard to respect a precise number of bins. Use the -e/--exact flag
+if you want to force the command to respect -b/--bins exactly.
+
+Combined with `xan hist`, this command can be very useful to visualize
+distributions of continuous columns:
+
+    $ xan bins -s count data.csv | xan hist
+
+Using a log scale:
+
+    $ xan bins -s count data.csv | xan hist --scale log
 
 Usage:
     xan bins [options] [<input>]
     xan bins --help
 
 bins options:
-    -s, --select <arg>     Select a subset of columns to compute bins
-                           for. See 'xan select --help' for the format
-                           details.
-    -b, --bins <number>    Number of bins. Will default to using various heuristics
-                           to find an optimal default number if not provided.
-    -E, --nice             Whether to choose nice boundaries for the bins.
-                           Might return a number of bins slightly different to
-                           what was passed to -b/--bins, as a consequence.
-    -l, --label <mode>     Label to choose for the bins (that will be placed in the
-                           `value` column). Mostly useful to tweak representation when
-                           piping to `xan hist`. Can be one of "full", "lower" or "upper".
-                           [default: full]
-    -m, --min <min>        Override min value.
-    -M, --max <max>        Override max value.
-    -N, --no-extra         Don't include, nulls, nans and out-of-bounds counts.
+    -s, --select <arg>      Select a subset of columns to compute bins for. See
+                            'xan select --help' for more detail.
+    -b, --bins <number>     Number of bins to generate. Note that without -e/--exact,
+                            this number should be considered as an approximate goal.
+                            The command by default attempts to find nice & readable boundaries
+                            for the bins and this means a precise number of bins is not
+                            always achievable.
+                            [default: 10]
+    -H, --heuristic <name>  Heuristic to use to automatically find an adequate number
+                            of bins. Must be one of `freedman-diaconis`, `sqrt` or `sturges`.
+    --max-bins <number>     Maximum number of bins to generate. Only useful when using
+                            the -H/--heuristic flag.
+    -e, --exact             Whether to make sure to return the exact number of bins
+                            provided to -b/--bins, which means the readability of the
+                            bins boundaries might suffer.
+    -l, --label <mode>      Label to choose for the bins (that will be placed in the
+                            `value` column). Mostly useful to tweak representation when
+                            piping to `xan hist`. Can be one of "full", "lower" or "upper".
+                            [default: full]
+    -m, --min <min>         Override min value. Values lower than this min will be counted
+                            as out of bounds.
+    -M, --max <max>         Override max value. Values greater than this max will be counted
+                            as out of bounds.
+    -N, --no-extra          Don't include empty cells, nans and out of bounds counts.
 
 Common options:
     -h, --help             Display this message
diff --git a/docs/cmd/frequency.md b/docs/cmd/frequency.md
@@ -10,6 +10,10 @@ field - Name of the column
 value - Some distinct value of the column
 count - Number of rows containing this value
 
+Pipe into `xan hist` to easily visualize the result:
+
+    $ xan freq -s category data.csv | xan hist
+
 By default, there is a row for the N most frequent values for each field in the
 data. The number of returned values can be tweaked with -l/--limit or you can
 disable the limit altogether using the -A/--all flag.
diff --git a/src/cmd/bins.rs b/src/cmd/bins.rs
@@ -12,28 +12,49 @@ use crate::CliResult;
 static USAGE: &str = "
 Discretize selection of columns containing continuous data into bins.
 
-The bins table is formatted as CSV data:
+The resulting bins table will be formatted thusly:
 
-    field,value,lower_bound,upper_bound,count
+field       - Name of the column
+value       - Bin's label (depends on what was given to -l/--label)
+lower_bound - Lower bound of the bin
+upper_bound - Upper bound of the bin
+count       - Number of rows falling into this bin
+
+The number of bins can be chosen with the -b/--bins flag. Note that,
+by default, this number is an approximate goal since the command
+attempts to find readable boundaries for the bins and this makes it
+hard to respect a precise number of bins. Use the -e/--exact flag
+if you want to force the command to respect -b/--bins exactly.
+
+Combined with `xan hist`, this command can be very useful to visualize
+distributions of continuous columns:
+
+    $ xan bins -s count data.csv | xan hist
+
+Using a log scale:
+
+    $ xan bins -s count data.csv | xan hist --scale log
 
 Usage:
     xan bins [options] [<input>]
     xan bins --help
 
 bins options:
-    -s, --select <arg>      Select a subset of columns to compute bins
-                            for. See 'xan select --help' for the format
-                            details.
-    -b, --bins <number>     Number of bins. Will default to using various heuristics
-                            to find an optimal default number if not provided.
+    -s, --select <arg>      Select a subset of columns to compute bins for. See
+                            'xan select --help' for more detail.
+    -b, --bins <number>     Number of bins to generate. Note that without -e/--exact,
+                            this number should be considered as an approximate goal.
+                            The command by default attempts to find nice & readable boundaries
+                            for the bins and this means a precise number of bins is not
+                            always achievable.
                             [default: 10]
     -H, --heuristic <name>  Heuristic to use to automatically find an adequate number
                             of bins. Must be one of `freedman-diaconis`, `sqrt` or `sturges`.
     --max-bins <number>     Maximum number of bins to generate. Only useful when using
                             the -H/--heuristic flag.
-    -E, --nice              Whether to choose nice boundaries for the bins.
-                            Might return a number of bins slightly different to
-                            what was passed to -b/--bins, as a consequence.
+    -e, --exact             Whether to make sure to return the exact number of bins
+                            provided to -b/--bins, which means the readability of the
+                            bins boundaries might suffer.
     -l, --label <mode>      Label to choose for the bins (that will be placed in the
                             `value` column). Mostly useful to tweak representation when
                             piping to `xan hist`. Can be one of \"full\", \"lower\" or \"upper\".
@@ -64,8 +85,8 @@ struct Args {
     flag_bins: usize,
     flag_max_bins: Option<usize>,
     flag_heuristic: Option<Heuristic>,
-    flag_label: String,
-    flag_nice: bool,
+    flag_label: LabelOption,
+    flag_exact: bool,
     flag_min: Option<f64>,
     flag_max: Option<f64>,
 }
@@ -86,13 +107,6 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
         .no_headers(args.flag_no_headers)
         .select(args.flag_select.clone());
 
-    if !["full", "upper", "lower"].contains(&args.flag_label.as_str()) {
-        Err(format!(
-            "unknown --label {:?}, must be one of \"full\", \"upper\" or \"lower\".",
-            args.flag_label
-        ))?;
-    }
-
     let mut rdr = conf.simd_reader()?;
     let mut wtr = Config::new(&args.flag_output).simd_writer()?;
 
@@ -117,7 +131,7 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
             args.flag_max_bins,
             args.flag_min,
             args.flag_max,
-            args.flag_nice,
+            args.flag_exact,
         ) {
             None => continue,
             Some(bins) => {
@@ -146,8 +160,8 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
                     let label_format = if bin.is_constant() {
                         lower_bound
                     } else {
-                        match args.flag_label.as_str() {
-                            "full" => match bins_iter.peek() {
+                        match args.flag_label {
+                            LabelOption::Full => match bins_iter.peek() {
                                 None => format!(
                                     ">= {:lower_width$} <= {:upper_width$}",
                                     lower_bound,
@@ -163,13 +177,12 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
                                     upper_width = max_upper_bound_width
                                 ),
                             },
-                            "upper" => upper_bound,
-                            "lower" => lower_bound,
-                            _ => unreachable!(),
+                            LabelOption::Upper => upper_bound,
+                            LabelOption::Lower => lower_bound,
                         }
                     };
 
-                    wtr.write_record(vec![
+                    wtr.write_record([
                         &headers[series.column],
                         label_format.as_bytes(),
                         bin.lower_bound.to_string().as_bytes(),
@@ -236,6 +249,13 @@ fn compute_rectified_iqr(numbers: &[f64], stats: &SeriesStats) -> Option<f64> {
     }
 }
 
+#[derive(Deserialize, Clone, Copy)]
+enum LabelOption {
+    Full,
+    Lower,
+    Upper,
+}
+
 #[derive(Deserialize, Clone, Copy, Debug)]
 enum Heuristic {
     #[serde(rename = "freedman-diaconis")]
@@ -418,7 +438,7 @@ impl Series {
         max_bins: Option<usize>,
         min: Option<f64>,
         max: Option<f64>,
-        nice: bool,
+        exact: bool,
     ) -> Option<Vec<Bin>> {
         if self.len() < 1 {
             return None;
@@ -441,7 +461,7 @@ impl Series {
 
         let count = self.solve_bins_count(count, max_bins, width, &stats);
 
-        let bins = if nice {
+        let bins = if !exact {
             let scale = LinearScale::nice((min, max), (0.0, 1.0), count);
             let mut ticks = scale.ticks(count);
 
diff --git a/src/cmd/frequency.rs b/src/cmd/frequency.rs
@@ -22,6 +22,10 @@ field - Name of the column
 value - Some distinct value of the column
 count - Number of rows containing this value
 
+Pipe into `xan hist` to easily visualize the result:
+
+    $ xan freq -s category data.csv | xan hist
+
 By default, there is a row for the N most frequent values for each field in the
 data. The number of returned values can be tweaked with -l/--limit or you can
 disable the limit altogether using the -A/--all flag.