Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 85 additions & 15 deletions balance/stats_and_plots/ascii_plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,27 @@
BAR_CHARS: List[str] = ["█", "▒", "▐", "░", "▄", "▀"]


def _auto_n_bins(n_samples: int, n_unique: int) -> int:
    """Choose a histogram bin count from the sample size via Sturges' rule.

    Sturges' rule gives ``ceil(log2(n_samples) + 1)`` bins. That figure is
    then limited by the number of distinct values and by a hard ceiling of
    50, and finally raised to a floor of 2.

    Args:
        n_samples: Total number of observations being binned.
        n_unique: Number of distinct values among those observations.

    Returns:
        ``1`` when there is at most one sample; otherwise an integer in
        ``[2, 50]``. Because the floor of 2 is applied last, the result can
        exceed ``n_unique`` when ``n_unique`` is below 2.
    """
    import math

    # log2 is undefined for 0 and a single sample needs only one bin.
    if n_samples <= 1:
        return 1

    sturges_bins = math.ceil(math.log2(n_samples) + 1)
    # Upper limits first: never more bins than distinct values, never above 50.
    capped = min(sturges_bins, n_unique, 50)
    # Lower limit last, so it wins over the caps.
    return capped if capped > 2 else 2


def _auto_bar_width(label_width: int, n_datasets: int) -> int:
    """Derive a bar width that lets one rendered line fit the terminal.

    The terminal width is queried with ``shutil.get_terminal_size`` using an
    80x24 fallback. From it are subtracted the label column, the 3-character
    ``" | "`` separator, and 9 characters reserved for the trailing
    percentage suffix.

    Args:
        label_width: Width in characters of the label column.
        n_datasets: Number of datasets being plotted. Currently unused by
            the calculation; kept as part of the signature.

    Returns:
        The remaining width in characters, but never less than 10.
    """
    import shutil

    columns = shutil.get_terminal_size(fallback=(80, 24)).columns
    # Fixed per-line overhead: label + " | " (3) + percentage suffix (9).
    overhead = label_width + 3 + 9
    remaining = columns - overhead
    # Keep bars legible even on very narrow terminals or with long labels.
    return remaining if remaining > 10 else 10


def _weighted_histogram(
values: pd.Series,
weights: Optional[pd.Series],
Expand Down Expand Up @@ -65,7 +86,9 @@ def _render_horizontal_bars(
"""Renders a group of horizontal bars for one category or bin.

Each dataset gets its own line with a distinct character and a percentage
label at the end.
label at the end. When a proportion is non-zero but too small to render
even one bar character, a single dot (``.``) is shown so that the reader
can distinguish "present but tiny" from "truly zero".

Args:
label: The category label or bin range string.
Expand All @@ -86,7 +109,13 @@ def _render_horizontal_bars(
bar_len = int(round((prop / max_value) * bar_width))
else:
bar_len = 0
bar = char * bar_len
if bar_len > 0:
bar = char * bar_len
elif prop > 0:
# Non-zero proportion too small to render — show a dot
bar = "."
else:
bar = ""
if i == 0:
prefix = label.ljust(label_width)
else:
Expand Down Expand Up @@ -121,8 +150,9 @@ def ascii_plot_bar(
names: List[str],
column: str,
weighted: bool = True,
bar_width: int = 40,
bar_width: Optional[int] = None,
dist_type: Optional[str] = None,
separate_categories: bool = True,
) -> str:
"""Produces an ASCII grouped barplot for a single categorical variable.

Expand All @@ -147,6 +177,8 @@ def ascii_plot_bar(
bar_width: Maximum character width for bars. Defaults to 40.
Copy link

Copilot AI Feb 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The docstring incorrectly states that bar_width defaults to 40. The actual default is now None, which triggers automatic detection based on terminal width. Update the docstring to reflect this change, e.g., "Defaults to None, which auto-detects based on terminal width."

Suggested change
bar_width: Maximum character width for bars. Defaults to 40.
bar_width: Maximum character width for bars. Defaults to None, which
auto-detects an appropriate width based on the terminal size.

Copilot uses AI. Check for mistakes.
dist_type: Accepted for compatibility but only "hist_ascii" is supported.
A warning is logged if any other value is passed.
separate_categories: If True, insert a blank line between categories
for readability. Defaults to True.

Returns:
ASCII barplot text for this variable.
Expand All @@ -168,8 +200,10 @@ def ascii_plot_bar(
|
blue | ████████████████████ (50.0%)
| ▒▒▒▒▒▒▒▒▒▒ (25.0%)
<BLANKLINE>
green | ██████████ (25.0%)
| ▒▒▒▒▒▒▒▒▒▒ (25.0%)
<BLANKLINE>
red | ██████████ (25.0%)
| ▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒ (50.0%)
<BLANKLINE>
Expand Down Expand Up @@ -208,6 +242,9 @@ def ascii_plot_bar(
label_width = max(len(str(c)) for c in categories) if categories else 8
label_width = max(label_width, 8) # minimum width for "Category"

if bar_width is None:
bar_width = _auto_bar_width(label_width, len(legend_names))

# Build output
lines: List[str] = []
lines.append(f"=== {column} (categorical) ===")
Expand All @@ -218,7 +255,9 @@ def ascii_plot_bar(
lines.append(f"{header_label} | {' '.join(legend_names)}")
lines.append(f"{' ' * label_width} |")

for cat in categories:
for ci, cat in enumerate(categories):
if separate_categories and ci > 0:
lines.append("")
cat_data = combined[combined[column] == cat]
proportions: Dict[str, float] = {}
for _, row in cat_data.iterrows():
Expand All @@ -241,8 +280,8 @@ def ascii_plot_hist(
names: List[str],
column: str,
weighted: bool = True,
n_bins: int = 10,
bar_width: int = 40,
n_bins: Optional[int] = None,
bar_width: Optional[int] = None,
dist_type: Optional[str] = None,
) -> str:
"""Produces an ASCII histogram for a single numeric variable.
Expand Down Expand Up @@ -322,6 +361,9 @@ def ascii_plot_hist(
if len(combined_values) == 0:
return f"=== {column} (numeric) ===\n\nNo data available.\n"

if n_bins is None:
n_bins = _auto_n_bins(len(combined_values), combined_values.nunique())

global_min = float(combined_values.min())
global_max = float(combined_values.max())

Expand Down Expand Up @@ -352,6 +394,9 @@ def ascii_plot_hist(
label_width = max(len(lbl) for lbl in bin_labels) if bin_labels else 8
label_width = max(label_width, 3) # minimum width for "Bin"

if bar_width is None:
bar_width = _auto_bar_width(label_width, len(legend_names))

# Build output
lines: List[str] = []
lines.append(f"=== {column} (numeric) ===")
Expand Down Expand Up @@ -384,8 +429,8 @@ def ascii_comparative_hist(
names: List[str],
column: str,
weighted: bool = True,
n_bins: int = 10,
bar_width: int = 20,
n_bins: Optional[int] = None,
bar_width: Optional[int] = None,
) -> str:
"""Produces a columnar, baseline-relative ASCII histogram.

Expand Down Expand Up @@ -456,6 +501,9 @@ def ascii_comparative_hist(
if len(combined_values) == 0:
return "No data available."

if n_bins is None:
n_bins = _auto_n_bins(len(combined_values), combined_values.nunique())

global_min = float(combined_values.min())
global_max = float(combined_values.max())

Expand Down Expand Up @@ -485,6 +533,20 @@ def ascii_comparative_hist(
bracket_right = "]" if i == n_bins - 1 else ")"
bin_labels.append(f"[{left:,.2f}, {right:,.2f}{bracket_right}")

# Range column width (computed early so bar_width auto-detection can use it)
range_header = "Range"
range_width = max(len(range_header), max(len(lbl) for lbl in bin_labels))

if bar_width is None:
import shutil

term_width = shutil.get_terminal_size((80, 24)).columns
n_cols = len(legend_names)
# Each column needs: bar_width + pct string (~6) + spacing (3)
available = term_width - range_width - 4 # " | " separator
per_col = max(10, (available - (n_cols - 1) * 3) // n_cols - 6)
bar_width = per_col
Comment on lines +540 to +548
Copy link

Copilot AI Feb 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The auto-detection logic for bar_width in ascii_comparative_hist is duplicated inline instead of using the _auto_bar_width helper. This creates maintenance burden and inconsistency. Consider refactoring to use a shared helper, or if the comparative histogram needs different logic due to its columnar layout, document why in a comment.

Copilot uses AI. Check for mistakes.

# Baseline percentages (first dataset)
baseline_pcts = hist_pcts[0]

Expand Down Expand Up @@ -533,10 +595,6 @@ def ascii_comparative_hist(
max_cell_w = max(len(cell_strings[di][bi]) for bi in range(n_bins))
col_widths.append(max(header_w, max_cell_w))

# Range column width
range_header = "Range"
range_width = max(len(range_header), max(len(lbl) for lbl in bin_labels))

# Build output
lines: List[str] = []

Expand Down Expand Up @@ -583,9 +641,10 @@ def ascii_plot_dist(
variables: Optional[List[str]] = None,
numeric_n_values_threshold: int = 15,
weighted: bool = True,
n_bins: int = 10,
bar_width: int = 40,
n_bins: Optional[int] = None,
bar_width: Optional[int] = None,
dist_type: Optional[str] = None,
separate_categories: bool = True,
) -> str:
"""Produces ASCII text comparing weighted distributions across datasets.

Expand All @@ -607,6 +666,8 @@ def ascii_plot_dist(
bar_width: Maximum character width for the longest bar. Defaults to None, which auto-detects a width from the terminal size.
dist_type: Accepted for compatibility but only "hist_ascii" is supported.
A warning is logged if any other value is passed.
separate_categories: If True, insert a blank line between categories
in barplots for readability. Defaults to True.

Returns:
The full ASCII output text.
Expand Down Expand Up @@ -636,8 +697,10 @@ def ascii_plot_dist(
|
blue | ████████████████████ (50.0%)
| ▒▒▒▒▒▒▒▒▒▒ (25.0%)
<BLANKLINE>
green | ██████████ (25.0%)
| ▒▒▒▒▒▒▒▒▒▒ (25.0%)
<BLANKLINE>
red | ██████████ (25.0%)
| ▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒ (50.0%)
<BLANKLINE>
Expand Down Expand Up @@ -684,7 +747,14 @@ def ascii_plot_dist(

if categorical:
output_parts.append(
ascii_plot_bar(dfs, names, o, weighted=weighted, bar_width=bar_width)
ascii_plot_bar(
dfs,
names,
o,
weighted=weighted,
bar_width=bar_width,
separate_categories=separate_categories,
)
)
else:
output_parts.append(
Expand Down
Loading