facebookresearch
diff --git a/‎CHANGELOG.md‎
Lines changed: 10 additions & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎balance/balancedf_class.py‎
Lines changed: 8 additions & 0 deletions b/‎balance/balancedf_class.py‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎balance/stats_and_plots/ascii_plots.py‎
Lines changed: 152 additions & 42 deletions b/‎balance/stats_and_plots/ascii_plots.py‎
Lines changed: 152 additions & 42 deletions
@@ -17,6 +17,16 @@
     and `]` for deficit relative to the baseline. Uses visually distinct fill
     characters (`█`, `▒`, `▐`, `░`) for better readability when comparing
     multiple datasets.
+- **ASCII plot display order: population → adjusted → sample**
+  - Comparative ASCII plots now order datasets as population, adjusted, sample
+    (instead of sample, adjusted, population) so the target distribution appears
+    first.
+- **`comparative` parameter for ASCII numeric plots**
+  - `ascii_plot_dist` (and `BalanceDF.plot(library="balance")`) accepts a new
+    `comparative` keyword (default `True`). When `True`, numeric variables use
+    the columnar comparative histogram (`ascii_comparative_hist`). Set
+    `comparative=False` to use grouped-bar histograms (`ascii_plot_hist`) for
+    numeric variables instead, matching the style used for categorical variables.
 
 ## Documentation
 
 
@@ -626,6 +626,11 @@ def plot(
         can be used for ASCII text output suitable for LLM consumption (only dist_type="hist_ascii" is supported
         with library="balance").
 
+        When using ``library="balance"``, numeric variables are rendered as
+        comparative histograms by default (showing excess/deficit vs. a
+        baseline).  Pass ``comparative=False`` to use grouped-bar histograms
+        instead (same style as categorical variables).
+
         This function is inherited as is when invoking BalanceDFCovars.plot, but some modifications are made when
         preparing the data for BalanceDFOutcomes.plot and BalanceDFWeights.plot.
 
@@ -686,6 +691,9 @@ def plot(
 
                 # ASCII text output (suitable for LLM consumption):
                 s3_null.covars().plot(library = "balance", dist_type = "hist_ascii")
+
+                # ASCII with grouped-bar histograms instead of comparative:
+                s3_null.covars().plot(library = "balance", comparative = False)
         """
         if on_linked_samples:
             dfs_to_add = self._BalanceDF_child_from_linked_samples()
 
@@ -8,7 +8,7 @@
 from __future__ import annotations
 
 import logging
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Tuple
 
 import numpy as np
 import numpy.typing as npt
@@ -27,6 +27,34 @@
 # Each dataset gets a unique character from this list.
 BAR_CHARS: List[str] = ["█", "▒", "▐", "░", "▄", "▀"]
 
+# Preferred ordering for comparative plots: population first, then adjusted,
+# then sample.  Known internal names are placed in this order; any unknown
+# names are appended at the end in their original order.
+_PREFERRED_NAME_ORDER: List[str] = ["target", "self", "adjusted", "unadjusted"]
+
+
+def _reorder_dfs_and_names(
+    dfs: List[DataFrameWithWeight],
+    names: List[str],
+) -> Tuple[List[DataFrameWithWeight], List[str]]:
+    """Reorder *dfs* and *names* to the preferred display order.
+
+    The canonical display order is: population (``target``), adjusted
+    (``self`` when ``unadjusted`` is also present), sample
+    (``unadjusted``).  Names not in the preferred list keep their
+    original relative order and are appended after the known names.
+    """
+    order_map = {name: i for i, name in enumerate(_PREFERRED_NAME_ORDER)}
+    indexed = list(enumerate(names))
+    # Stable sort: known names by preferred position, unknown names stay in
+    # their original order at the end.
+    indexed.sort(key=lambda x: (order_map.get(x[1], len(_PREFERRED_NAME_ORDER)), x[0]))
+    reordered_indices = [i for i, _ in indexed]
+    return (
+        [dfs[i] for i in reordered_indices],
+        [names[i] for i in reordered_indices],
+    )
+
 
 def _auto_n_bins(n_samples: int, n_unique: int) -> int:
     """Pick a number of bins using Sturges' rule, capped at unique values."""
@@ -39,8 +67,12 @@ def _auto_n_bins(n_samples: int, n_unique: int) -> int:
     return max(2, min(sturges, n_unique, 50))
 
 
-def _auto_bar_width(label_width: int, n_datasets: int) -> int:
-    """Pick bar_width to fit within terminal width."""
+def _auto_bar_width(label_width: int) -> int:
+    """Pick bar_width to fit within terminal width.
+
+    Used by grouped barplots and histograms where each dataset gets its own
+    line within a row (single bar per line).
+    """
     import shutil
 
     term_width = shutil.get_terminal_size((80, 24)).columns
@@ -49,6 +81,24 @@ def _auto_bar_width(label_width: int, n_datasets: int) -> int:
     return max(10, available)
 
 
+def _auto_bar_width_columnar(range_width: int, n_columns: int) -> int:
+    """Pick per-column bar_width for a columnar (side-by-side) layout.
+
+    Used by :func:`ascii_comparative_hist` where all datasets are rendered as
+    columns on the same line.  Each column needs space for the bar, a
+    percentage string (~6 chars), and inter-column separators (`` | ``, 3
+    chars each).
+    """
+    import shutil
+
+    term_width = shutil.get_terminal_size((80, 24)).columns
+    # "Range  | col1 | col2 | ..."
+    # range_width + " | " (3+1 for padding) consumed by the label column
+    available = term_width - range_width - 4
+    per_col = max(10, (available - (n_columns - 1) * 3) // n_columns - 6)
+    return per_col
+
+
 def _weighted_histogram(
     values: pd.Series,
     weights: Optional[pd.Series],
@@ -174,7 +224,8 @@ def ascii_plot_bar(
         names: Names for each DataFrame (e.g., ["self", "target"]).
         column: The categorical column name to plot.
         weighted: Whether to use weights. Defaults to True.
-        bar_width: Maximum character width for bars. Defaults to 40.
+        bar_width: Maximum character width for bars. Defaults to None,
+            which auto-detects based on terminal width.
         dist_type: Accepted for compatibility but only "hist_ascii" is supported.
             A warning is logged if any other value is passed.
         separate_categories: If True, insert a blank line between categories
@@ -243,7 +294,7 @@ def ascii_plot_bar(
     label_width = max(label_width, 8)  # minimum width for "Category"
 
     if bar_width is None:
-        bar_width = _auto_bar_width(label_width, len(legend_names))
+        bar_width = _auto_bar_width(label_width)
 
     # Build output
     lines: List[str] = []
@@ -304,8 +355,10 @@ def ascii_plot_hist(
         names: Names for each DataFrame (e.g., ["self", "target"]).
         column: The numeric column name to plot.
         weighted: Whether to use weights. Defaults to True.
-        n_bins: Number of histogram bins. Defaults to 10.
-        bar_width: Maximum character width for bars. Defaults to 40.
+        n_bins: Number of histogram bins. Defaults to None, which
+            auto-detects using Sturges' rule.
+        bar_width: Maximum character width for bars. Defaults to None,
+            which auto-detects based on terminal width.
         dist_type: Accepted for compatibility but only "hist_ascii" is supported.
             A warning is logged if any other value is passed.
 
@@ -395,7 +448,7 @@ def ascii_plot_hist(
     label_width = max(label_width, 3)  # minimum width for "Bin"
 
     if bar_width is None:
-        bar_width = _auto_bar_width(label_width, len(legend_names))
+        bar_width = _auto_bar_width(label_width)
 
     # Build output
     lines: List[str] = []
@@ -459,8 +512,10 @@ def ascii_comparative_hist(
         names: Names for each DataFrame (e.g., ["Target", "Sample"]).
         column: The numeric column name to plot.
         weighted: Whether to use weights. Defaults to True.
-        n_bins: Number of histogram bins. Defaults to 10.
-        bar_width: Maximum character width for bars. Defaults to 20.
+        n_bins: Number of histogram bins. Defaults to None, which
+            auto-detects using Sturges' rule.
+        bar_width: Maximum character width for bars. Defaults to None,
+            which auto-detects based on terminal width.
 
     Returns:
         ASCII comparative histogram text.
@@ -470,6 +525,8 @@ def ascii_comparative_hist(
 
             >>> print(ascii_comparative_hist(dfs, names=["Target", "Sample"],
             ...       column="income", n_bins=2, bar_width=20))
+            === income (numeric, comparative) ===
+            <BLANKLINE>
             Range          | Target (%)         | Sample (%)
             ---------------------------------------------------------------
             [10.00, 25.00) | █████████████ 50.0 | █████████████▒▒▒▒▒▒▒ 75.0
@@ -538,14 +595,7 @@ def ascii_comparative_hist(
     range_width = max(len(range_header), max(len(lbl) for lbl in bin_labels))
 
     if bar_width is None:
-        import shutil
-
-        term_width = shutil.get_terminal_size((80, 24)).columns
-        n_cols = len(legend_names)
-        # Each column needs: bar_width + pct string (~6) + spacing (3)
-        available = term_width - range_width - 4  # "  | " separator
-        per_col = max(10, (available - (n_cols - 1) * 3) // n_cols - 6)
-        bar_width = per_col
+        bar_width = _auto_bar_width_columnar(range_width, len(legend_names))
 
     # Baseline percentages (first dataset)
     baseline_pcts = hist_pcts[0]
@@ -597,6 +647,8 @@ def ascii_comparative_hist(
 
     # Build output
     lines: List[str] = []
+    lines.append(f"=== {column} (numeric, comparative) ===")
+    lines.append("")
 
     # Header row
     header_parts = [range_header.ljust(range_width)]
@@ -645,12 +697,27 @@ def ascii_plot_dist(
     bar_width: Optional[int] = None,
     dist_type: Optional[str] = None,
     separate_categories: bool = True,
+    comparative: bool = True,
 ) -> str:
     """Produces ASCII text comparing weighted distributions across datasets.
 
     Iterates over variables, classifying each as categorical or numeric
     (using the same logic as :func:`seaborn_plot_dist`), then delegates to
-    :func:`ascii_plot_bar` or :func:`ascii_plot_hist` respectively.
+    the appropriate plotting function.
+
+    Two display modes are available for numeric variables:
+
+    - **comparative** (``comparative=True``, the default): numeric variables
+      are rendered with :func:`ascii_comparative_hist`, a columnar layout
+      where the first dataset is the baseline and subsequent datasets show
+      excess / deficit relative to it.
+    - **grouped** (``comparative=False``): numeric variables are rendered
+      with :func:`ascii_plot_hist`, a grouped-bar layout where each dataset
+      gets its own bar per bin (the same style used for categorical
+      variables).
+
+    Categorical variables always use :func:`ascii_plot_bar` regardless of
+    this setting.
 
     The output is both printed to stdout and returned as a string.
 
@@ -662,12 +729,19 @@ def ascii_plot_dist(
         numeric_n_values_threshold: Columns with fewer unique values than this
             are treated as categorical. Defaults to 15.
         weighted: Whether to use weights. Defaults to True.
-        n_bins: Number of bins for numeric histograms. Defaults to 10.
-        bar_width: Maximum character width for the longest bar. Defaults to 40.
+        n_bins: Number of bins for numeric histograms. Defaults to None,
+            which auto-detects using Sturges' rule.
+        bar_width: Maximum character width for the longest bar. Defaults to
+            None, which auto-detects based on terminal width.
         dist_type: Accepted for compatibility but only "hist_ascii" is supported.
             A warning is logged if any other value is passed.
         separate_categories: If True, insert a blank line between categories
             in barplots for readability. Defaults to True.
+        comparative: If True (default), numeric variables use a columnar
+            comparative histogram (:func:`ascii_comparative_hist`) that
+            highlights differences relative to a baseline dataset.  If
+            False, numeric variables use a grouped-bar histogram
+            (:func:`ascii_plot_hist`) instead.
 
     Returns:
         The full ASCII output text.
@@ -693,31 +767,52 @@ def ascii_plot_dist(
             ...       numeric_n_values_threshold=0, n_bins=2, bar_width=20))
             === color (categorical) ===
             <BLANKLINE>
-            Category | sample  population
+            Category | population  sample
                      |
-            blue     | ████████████████████ (50.0%)
-                     | ▒▒▒▒▒▒▒▒▒▒ (25.0%)
+            blue     | ██████████ (25.0%)
+                     | ▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒ (50.0%)
             <BLANKLINE>
             green    | ██████████ (25.0%)
                      | ▒▒▒▒▒▒▒▒▒▒ (25.0%)
             <BLANKLINE>
-            red      | ██████████ (25.0%)
-                     | ▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒ (50.0%)
+            red      | ████████████████████ (50.0%)
+                     | ▒▒▒▒▒▒▒▒▒▒ (25.0%)
             <BLANKLINE>
-            Legend: █ sample  ▒ population
+            Legend: █ population  ▒ sample
             Bar lengths are proportional to weighted frequency within each dataset.
             <BLANKLINE>
+            === age (numeric, comparative) ===
+            <BLANKLINE>
+            Range          | population (%)     | sample (%)
+            ---------------------------------------------------------------
+            [10.00, 25.00) | █████████████ 50.0 | █████████████▒▒▒▒▒▒▒ 75.0
+            [25.00, 40.00] | █████████████ 50.0 | ███████     ] 25.0
+            ---------------------------------------------------------------
+            Total          | 100.0              | 100.0
+            <BLANKLINE>
+            Key: █ = shared with population, ▒ = excess,    ] = deficit
+
+        To use grouped-bar histograms (same style as categorical) instead
+        of comparative histograms for numeric variables, pass
+        ``comparative=False``::
+
+            >>> print(ascii_plot_dist(dfs, names=["self", "target"],
+            ...       numeric_n_values_threshold=0, n_bins=2, bar_width=20,
+            ...       comparative=False))
+            === color (categorical) ===
+            ...
             === age (numeric) ===
             <BLANKLINE>
-            Bin            | sample  population
+            Bin            | population  sample
                            |
-            [10.00, 25.00) | █████████████ (50.0%)
-                           | ▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒ (75.0%)
-            [25.00, 40.00] | █████████████ (50.0%)
-                           | ▒▒▒▒▒▒▒ (25.0%)
+            [10.00, 25.00) | ████████████████████ (75.0%)
+                           | ▒▒▒▒▒▒▒▒▒▒▒▒▒ (50.0%)
+            [25.00, 40.00] | ███████ (25.0%)
+                           | ▒▒▒▒▒▒▒▒▒▒▒▒▒ (50.0%)
             <BLANKLINE>
-            Legend: █ sample  ▒ population
+            Legend: █ population  ▒ sample
             Bar lengths are proportional to weighted frequency within each dataset.
+            <BLANKLINE>
     """
     if dist_type is not None and dist_type != "hist_ascii":
         logger.warning(
@@ -727,6 +822,9 @@ def ascii_plot_dist(
     if names is None:
         names = [f"df_{i}" for i in range(len(dfs))]
 
+    # Reorder so comparative plots show: population, adjusted, sample
+    dfs, names = _reorder_dfs_and_names(dfs, names)
+
     variables = choose_variables(*(d["df"] for d in dfs), variables=variables)
     logger.debug(f"ASCII plotting variables {variables}")
 
@@ -757,16 +855,28 @@ def ascii_plot_dist(
                 )
             )
         else:
-            output_parts.append(
-                ascii_plot_hist(
-                    dfs,
-                    names,
-                    o,
-                    weighted=weighted,
-                    n_bins=n_bins,
-                    bar_width=bar_width,
+            if comparative:
+                output_parts.append(
+                    ascii_comparative_hist(
+                        dfs,
+                        names,
+                        o,
+                        weighted=weighted,
+                        n_bins=n_bins,
+                        bar_width=bar_width,
+                    )
+                )
+            else:
+                output_parts.append(
+                    ascii_plot_hist(
+                        dfs,
+                        names,
+                        o,
+                        weighted=weighted,
+                        n_bins=n_bins,
+                        bar_width=bar_width,
+                    )
                 )
-            )
 
     result = "\n".join(output_parts)
     print(result)