Skip to content

Commit bb86743

Browse files
sahil350 authored and meta-codesync[bot] committed
Improve ASCII plots with auto-sizing, category spacing, and dedicated tutorial (#346)
Summary:
Pull Request resolved: #346

The changes in this diff:

ascii_plots.py — Core feature changes:
- Added _auto_n_bins(): auto-selects the histogram bin count using Sturges' rule, capped at the number of unique values and at a maximum of 50
- Added _auto_bar_width(): computes the bar width from the terminal size minus the label and percentage columns
- Changed the n_bins and bar_width parameters from ints with fixed defaults to Optional[int] with auto-detection defaults
- Added a separate_categories parameter to ascii_plot_bar() and ascii_plot_dist() — inserts blank lines between categories for readability (default True)
- Non-zero proportions too small to render a full bar character now show a dot (.) instead of nothing

test_ascii_plots.py — New tests:
- TestAutoNBins: tests Sturges' rule, capping at unique values, capping at 50, and edge cases
- TestAutoBarWidth: tests terminal width computation
- TestAutoDetectionIntegration: integration tests for auto-detection when calling without explicit parameters

balance_ascii_plots.ipynb — New dedicated tutorial:
- Detailed tutorial covering ASCII plotting features: barplots, histograms, comparative histograms, ascii_plot_dist, separate_categories, n_bins, bar_width, and auto-detection

balance_quickstart.ipynb — Simplified:
- Moved detailed ASCII plot examples out to the dedicated tutorial, keeping the quickstart focused

Reviewed By: talgalili
Differential Revision: D94231216
fbshipit-source-id: b1b46512bc9e9cfe41ad5158dacc9768f5a71e67
1 parent e9828da commit bb86743

File tree

4 files changed

+455
-140
lines changed

4 files changed

+455
-140
lines changed

balance/stats_and_plots/ascii_plots.py

Lines changed: 85 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,27 @@
2828
BAR_CHARS: List[str] = ["█", "▒", "▐", "░", "▄", "▀"]
2929

3030

31+
def _auto_n_bins(n_samples: int, n_unique: int) -> int:
32+
"""Pick a number of bins using Sturges' rule, capped at unique values."""
33+
import math
34+
35+
if n_samples <= 1:
36+
return 1
37+
sturges = math.ceil(math.log2(n_samples) + 1)
38+
# Don't exceed the number of unique values, and clamp to [2, 50]
39+
return max(2, min(sturges, n_unique, 50))
40+
41+
42+
def _auto_bar_width(label_width: int, n_datasets: int) -> int:
43+
"""Pick bar_width to fit within terminal width."""
44+
import shutil
45+
46+
term_width = shutil.get_terminal_size((80, 24)).columns
47+
# Each line: label_width + " | " (3) + bar + " (XX.X%)" (9)
48+
available = term_width - label_width - 3 - 9
49+
return max(10, available)
50+
51+
3152
def _weighted_histogram(
3253
values: pd.Series,
3354
weights: Optional[pd.Series],
@@ -65,7 +86,9 @@ def _render_horizontal_bars(
6586
"""Renders a group of horizontal bars for one category or bin.
6687
6788
Each dataset gets its own line with a distinct character and a percentage
68-
label at the end.
89+
label at the end. When a proportion is non-zero but too small to render
90+
even one bar character, a single dot (``.``) is shown so that the reader
91+
can distinguish "present but tiny" from "truly zero".
6992
7093
Args:
7194
label: The category label or bin range string.
@@ -86,7 +109,13 @@ def _render_horizontal_bars(
86109
bar_len = int(round((prop / max_value) * bar_width))
87110
else:
88111
bar_len = 0
89-
bar = char * bar_len
112+
if bar_len > 0:
113+
bar = char * bar_len
114+
elif prop > 0:
115+
# Non-zero proportion too small to render — show a dot
116+
bar = "."
117+
else:
118+
bar = ""
90119
if i == 0:
91120
prefix = label.ljust(label_width)
92121
else:
@@ -121,8 +150,9 @@ def ascii_plot_bar(
121150
names: List[str],
122151
column: str,
123152
weighted: bool = True,
124-
bar_width: int = 40,
153+
bar_width: Optional[int] = None,
125154
dist_type: Optional[str] = None,
155+
separate_categories: bool = True,
126156
) -> str:
127157
"""Produces an ASCII grouped barplot for a single categorical variable.
128158
@@ -147,6 +177,8 @@ def ascii_plot_bar(
147177
bar_width: Maximum character width for bars. Defaults to 40.
148178
dist_type: Accepted for compatibility but only "hist_ascii" is supported.
149179
A warning is logged if any other value is passed.
180+
separate_categories: If True, insert a blank line between categories
181+
for readability. Defaults to True.
150182
151183
Returns:
152184
ASCII barplot text for this variable.
@@ -168,8 +200,10 @@ def ascii_plot_bar(
168200
|
169201
blue | ████████████████████ (50.0%)
170202
| ▒▒▒▒▒▒▒▒▒▒ (25.0%)
203+
<BLANKLINE>
171204
green | ██████████ (25.0%)
172205
| ▒▒▒▒▒▒▒▒▒▒ (25.0%)
206+
<BLANKLINE>
173207
red | ██████████ (25.0%)
174208
| ▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒ (50.0%)
175209
<BLANKLINE>
@@ -208,6 +242,9 @@ def ascii_plot_bar(
208242
label_width = max(len(str(c)) for c in categories) if categories else 8
209243
label_width = max(label_width, 8) # minimum width for "Category"
210244

245+
if bar_width is None:
246+
bar_width = _auto_bar_width(label_width, len(legend_names))
247+
211248
# Build output
212249
lines: List[str] = []
213250
lines.append(f"=== {column} (categorical) ===")
@@ -218,7 +255,9 @@ def ascii_plot_bar(
218255
lines.append(f"{header_label} | {' '.join(legend_names)}")
219256
lines.append(f"{' ' * label_width} |")
220257

221-
for cat in categories:
258+
for ci, cat in enumerate(categories):
259+
if separate_categories and ci > 0:
260+
lines.append("")
222261
cat_data = combined[combined[column] == cat]
223262
proportions: Dict[str, float] = {}
224263
for _, row in cat_data.iterrows():
@@ -241,8 +280,8 @@ def ascii_plot_hist(
241280
names: List[str],
242281
column: str,
243282
weighted: bool = True,
244-
n_bins: int = 10,
245-
bar_width: int = 40,
283+
n_bins: Optional[int] = None,
284+
bar_width: Optional[int] = None,
246285
dist_type: Optional[str] = None,
247286
) -> str:
248287
"""Produces an ASCII histogram for a single numeric variable.
@@ -322,6 +361,9 @@ def ascii_plot_hist(
322361
if len(combined_values) == 0:
323362
return f"=== {column} (numeric) ===\n\nNo data available.\n"
324363

364+
if n_bins is None:
365+
n_bins = _auto_n_bins(len(combined_values), combined_values.nunique())
366+
325367
global_min = float(combined_values.min())
326368
global_max = float(combined_values.max())
327369

@@ -352,6 +394,9 @@ def ascii_plot_hist(
352394
label_width = max(len(lbl) for lbl in bin_labels) if bin_labels else 8
353395
label_width = max(label_width, 3) # minimum width for "Bin"
354396

397+
if bar_width is None:
398+
bar_width = _auto_bar_width(label_width, len(legend_names))
399+
355400
# Build output
356401
lines: List[str] = []
357402
lines.append(f"=== {column} (numeric) ===")
@@ -384,8 +429,8 @@ def ascii_comparative_hist(
384429
names: List[str],
385430
column: str,
386431
weighted: bool = True,
387-
n_bins: int = 10,
388-
bar_width: int = 20,
432+
n_bins: Optional[int] = None,
433+
bar_width: Optional[int] = None,
389434
) -> str:
390435
"""Produces a columnar, baseline-relative ASCII histogram.
391436
@@ -456,6 +501,9 @@ def ascii_comparative_hist(
456501
if len(combined_values) == 0:
457502
return "No data available."
458503

504+
if n_bins is None:
505+
n_bins = _auto_n_bins(len(combined_values), combined_values.nunique())
506+
459507
global_min = float(combined_values.min())
460508
global_max = float(combined_values.max())
461509

@@ -485,6 +533,20 @@ def ascii_comparative_hist(
485533
bracket_right = "]" if i == n_bins - 1 else ")"
486534
bin_labels.append(f"[{left:,.2f}, {right:,.2f}{bracket_right}")
487535

536+
# Range column width (computed early so bar_width auto-detection can use it)
537+
range_header = "Range"
538+
range_width = max(len(range_header), max(len(lbl) for lbl in bin_labels))
539+
540+
if bar_width is None:
541+
import shutil
542+
543+
term_width = shutil.get_terminal_size((80, 24)).columns
544+
n_cols = len(legend_names)
545+
# Each column needs: bar_width + pct string (~6) + spacing (3)
546+
available = term_width - range_width - 4 # " | " separator
547+
per_col = max(10, (available - (n_cols - 1) * 3) // n_cols - 6)
548+
bar_width = per_col
549+
488550
# Baseline percentages (first dataset)
489551
baseline_pcts = hist_pcts[0]
490552

@@ -533,10 +595,6 @@ def ascii_comparative_hist(
533595
max_cell_w = max(len(cell_strings[di][bi]) for bi in range(n_bins))
534596
col_widths.append(max(header_w, max_cell_w))
535597

536-
# Range column width
537-
range_header = "Range"
538-
range_width = max(len(range_header), max(len(lbl) for lbl in bin_labels))
539-
540598
# Build output
541599
lines: List[str] = []
542600

@@ -583,9 +641,10 @@ def ascii_plot_dist(
583641
variables: Optional[List[str]] = None,
584642
numeric_n_values_threshold: int = 15,
585643
weighted: bool = True,
586-
n_bins: int = 10,
587-
bar_width: int = 40,
644+
n_bins: Optional[int] = None,
645+
bar_width: Optional[int] = None,
588646
dist_type: Optional[str] = None,
647+
separate_categories: bool = True,
589648
) -> str:
590649
"""Produces ASCII text comparing weighted distributions across datasets.
591650
@@ -607,6 +666,8 @@ def ascii_plot_dist(
607666
bar_width: Maximum character width for the longest bar. Defaults to 40.
608667
dist_type: Accepted for compatibility but only "hist_ascii" is supported.
609668
A warning is logged if any other value is passed.
669+
separate_categories: If True, insert a blank line between categories
670+
in barplots for readability. Defaults to True.
610671
611672
Returns:
612673
The full ASCII output text.
@@ -636,8 +697,10 @@ def ascii_plot_dist(
636697
|
637698
blue | ████████████████████ (50.0%)
638699
| ▒▒▒▒▒▒▒▒▒▒ (25.0%)
700+
<BLANKLINE>
639701
green | ██████████ (25.0%)
640702
| ▒▒▒▒▒▒▒▒▒▒ (25.0%)
703+
<BLANKLINE>
641704
red | ██████████ (25.0%)
642705
| ▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒ (50.0%)
643706
<BLANKLINE>
@@ -684,7 +747,14 @@ def ascii_plot_dist(
684747

685748
if categorical:
686749
output_parts.append(
687-
ascii_plot_bar(dfs, names, o, weighted=weighted, bar_width=bar_width)
750+
ascii_plot_bar(
751+
dfs,
752+
names,
753+
o,
754+
weighted=weighted,
755+
bar_width=bar_width,
756+
separate_categories=separate_categories,
757+
)
688758
)
689759
else:
690760
output_parts.append(

0 commit comments

Comments
 (0)