-
Notifications
You must be signed in to change notification settings - Fork 51
Add auto-detection for n_bins and bar_width in ASCII plots #346
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -28,6 +28,27 @@ | |
| BAR_CHARS: List[str] = ["█", "▒", "▐", "░", "▄", "▀"] | ||
|
|
||
|
|
||
| def _auto_n_bins(n_samples: int, n_unique: int) -> int: | ||
| """Pick a number of bins using Sturges' rule, capped at unique values.""" | ||
| import math | ||
|
|
||
| if n_samples <= 1: | ||
| return 1 | ||
| sturges = math.ceil(math.log2(n_samples) + 1) | ||
| # Don't exceed the number of unique values, and clamp to [2, 50] | ||
| return max(2, min(sturges, n_unique, 50)) | ||
|
|
||
|
|
||
| def _auto_bar_width(label_width: int, n_datasets: int) -> int: | ||
| """Pick bar_width to fit within terminal width.""" | ||
| import shutil | ||
|
|
||
| term_width = shutil.get_terminal_size((80, 24)).columns | ||
| # Each line: label_width + " | " (3) + bar + " (XX.X%)" (9) | ||
| available = term_width - label_width - 3 - 9 | ||
| return max(10, available) | ||
|
|
||
|
|
||
| def _weighted_histogram( | ||
| values: pd.Series, | ||
| weights: Optional[pd.Series], | ||
|
|
@@ -65,7 +86,9 @@ def _render_horizontal_bars( | |
| """Renders a group of horizontal bars for one category or bin. | ||
|
|
||
| Each dataset gets its own line with a distinct character and a percentage | ||
| label at the end. | ||
| label at the end. When a proportion is non-zero but too small to render | ||
| even one bar character, a single dot (``.``) is shown so that the reader | ||
| can distinguish "present but tiny" from "truly zero". | ||
|
|
||
| Args: | ||
| label: The category label or bin range string. | ||
|
|
@@ -86,7 +109,13 @@ def _render_horizontal_bars( | |
| bar_len = int(round((prop / max_value) * bar_width)) | ||
| else: | ||
| bar_len = 0 | ||
| bar = char * bar_len | ||
| if bar_len > 0: | ||
| bar = char * bar_len | ||
| elif prop > 0: | ||
| # Non-zero proportion too small to render — show a dot | ||
| bar = "." | ||
| else: | ||
| bar = "" | ||
| if i == 0: | ||
| prefix = label.ljust(label_width) | ||
| else: | ||
|
|
@@ -121,8 +150,9 @@ def ascii_plot_bar( | |
| names: List[str], | ||
| column: str, | ||
| weighted: bool = True, | ||
| bar_width: int = 40, | ||
| bar_width: Optional[int] = None, | ||
| dist_type: Optional[str] = None, | ||
| separate_categories: bool = True, | ||
| ) -> str: | ||
| """Produces an ASCII grouped barplot for a single categorical variable. | ||
|
|
||
|
|
@@ -147,6 +177,8 @@ def ascii_plot_bar( | |
| bar_width: Maximum character width for bars. Defaults to 40. | ||
| dist_type: Accepted for compatibility but only "hist_ascii" is supported. | ||
| A warning is logged if any other value is passed. | ||
| separate_categories: If True, insert a blank line between categories | ||
| for readability. Defaults to True. | ||
|
|
||
| Returns: | ||
| ASCII barplot text for this variable. | ||
|
|
@@ -168,8 +200,10 @@ def ascii_plot_bar( | |
| | | ||
| blue | ████████████████████ (50.0%) | ||
| | ▒▒▒▒▒▒▒▒▒▒ (25.0%) | ||
| <BLANKLINE> | ||
| green | ██████████ (25.0%) | ||
| | ▒▒▒▒▒▒▒▒▒▒ (25.0%) | ||
| <BLANKLINE> | ||
| red | ██████████ (25.0%) | ||
| | ▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒ (50.0%) | ||
| <BLANKLINE> | ||
|
|
@@ -208,6 +242,9 @@ def ascii_plot_bar( | |
| label_width = max(len(str(c)) for c in categories) if categories else 8 | ||
| label_width = max(label_width, 8) # minimum width for "Category" | ||
|
|
||
| if bar_width is None: | ||
| bar_width = _auto_bar_width(label_width, len(legend_names)) | ||
|
|
||
| # Build output | ||
| lines: List[str] = [] | ||
| lines.append(f"=== {column} (categorical) ===") | ||
|
|
@@ -218,7 +255,9 @@ def ascii_plot_bar( | |
| lines.append(f"{header_label} | {' '.join(legend_names)}") | ||
| lines.append(f"{' ' * label_width} |") | ||
|
|
||
| for cat in categories: | ||
| for ci, cat in enumerate(categories): | ||
| if separate_categories and ci > 0: | ||
| lines.append("") | ||
| cat_data = combined[combined[column] == cat] | ||
| proportions: Dict[str, float] = {} | ||
| for _, row in cat_data.iterrows(): | ||
|
|
@@ -241,8 +280,8 @@ def ascii_plot_hist( | |
| names: List[str], | ||
| column: str, | ||
| weighted: bool = True, | ||
| n_bins: int = 10, | ||
| bar_width: int = 40, | ||
| n_bins: Optional[int] = None, | ||
| bar_width: Optional[int] = None, | ||
| dist_type: Optional[str] = None, | ||
| ) -> str: | ||
| """Produces an ASCII histogram for a single numeric variable. | ||
|
|
@@ -322,6 +361,9 @@ def ascii_plot_hist( | |
| if len(combined_values) == 0: | ||
| return f"=== {column} (numeric) ===\n\nNo data available.\n" | ||
|
|
||
| if n_bins is None: | ||
| n_bins = _auto_n_bins(len(combined_values), combined_values.nunique()) | ||
|
|
||
| global_min = float(combined_values.min()) | ||
| global_max = float(combined_values.max()) | ||
|
|
||
|
|
@@ -352,6 +394,9 @@ def ascii_plot_hist( | |
| label_width = max(len(lbl) for lbl in bin_labels) if bin_labels else 8 | ||
| label_width = max(label_width, 3) # minimum width for "Bin" | ||
|
|
||
| if bar_width is None: | ||
| bar_width = _auto_bar_width(label_width, len(legend_names)) | ||
|
|
||
| # Build output | ||
| lines: List[str] = [] | ||
| lines.append(f"=== {column} (numeric) ===") | ||
|
|
@@ -384,8 +429,8 @@ def ascii_comparative_hist( | |
| names: List[str], | ||
| column: str, | ||
| weighted: bool = True, | ||
| n_bins: int = 10, | ||
| bar_width: int = 20, | ||
| n_bins: Optional[int] = None, | ||
| bar_width: Optional[int] = None, | ||
| ) -> str: | ||
| """Produces a columnar, baseline-relative ASCII histogram. | ||
|
|
||
|
|
@@ -456,6 +501,9 @@ def ascii_comparative_hist( | |
| if len(combined_values) == 0: | ||
| return "No data available." | ||
|
|
||
| if n_bins is None: | ||
| n_bins = _auto_n_bins(len(combined_values), combined_values.nunique()) | ||
|
|
||
| global_min = float(combined_values.min()) | ||
| global_max = float(combined_values.max()) | ||
|
|
||
|
|
@@ -485,6 +533,20 @@ def ascii_comparative_hist( | |
| bracket_right = "]" if i == n_bins - 1 else ")" | ||
| bin_labels.append(f"[{left:,.2f}, {right:,.2f}{bracket_right}") | ||
|
|
||
| # Range column width (computed early so bar_width auto-detection can use it) | ||
| range_header = "Range" | ||
| range_width = max(len(range_header), max(len(lbl) for lbl in bin_labels)) | ||
|
|
||
| if bar_width is None: | ||
| import shutil | ||
|
|
||
| term_width = shutil.get_terminal_size((80, 24)).columns | ||
| n_cols = len(legend_names) | ||
| # Each column needs: bar_width + pct string (~6) + spacing (3) | ||
| available = term_width - range_width - 4 # " | " separator | ||
| per_col = max(10, (available - (n_cols - 1) * 3) // n_cols - 6) | ||
| bar_width = per_col | ||
|
Comment on lines
+540
to
+548
|
||
|
|
||
| # Baseline percentages (first dataset) | ||
| baseline_pcts = hist_pcts[0] | ||
|
|
||
|
|
@@ -533,10 +595,6 @@ def ascii_comparative_hist( | |
| max_cell_w = max(len(cell_strings[di][bi]) for bi in range(n_bins)) | ||
| col_widths.append(max(header_w, max_cell_w)) | ||
|
|
||
| # Range column width | ||
| range_header = "Range" | ||
| range_width = max(len(range_header), max(len(lbl) for lbl in bin_labels)) | ||
|
|
||
| # Build output | ||
| lines: List[str] = [] | ||
|
|
||
|
|
@@ -583,9 +641,10 @@ def ascii_plot_dist( | |
| variables: Optional[List[str]] = None, | ||
| numeric_n_values_threshold: int = 15, | ||
| weighted: bool = True, | ||
| n_bins: int = 10, | ||
| bar_width: int = 40, | ||
| n_bins: Optional[int] = None, | ||
| bar_width: Optional[int] = None, | ||
| dist_type: Optional[str] = None, | ||
| separate_categories: bool = True, | ||
| ) -> str: | ||
| """Produces ASCII text comparing weighted distributions across datasets. | ||
|
|
||
|
|
@@ -607,6 +666,8 @@ def ascii_plot_dist( | |
| bar_width: Maximum character width for the longest bar. Defaults to 40. | ||
| dist_type: Accepted for compatibility but only "hist_ascii" is supported. | ||
| A warning is logged if any other value is passed. | ||
| separate_categories: If True, insert a blank line between categories | ||
| in barplots for readability. Defaults to True. | ||
|
|
||
| Returns: | ||
| The full ASCII output text. | ||
|
|
@@ -636,8 +697,10 @@ def ascii_plot_dist( | |
| | | ||
| blue | ████████████████████ (50.0%) | ||
| | ▒▒▒▒▒▒▒▒▒▒ (25.0%) | ||
| <BLANKLINE> | ||
| green | ██████████ (25.0%) | ||
| | ▒▒▒▒▒▒▒▒▒▒ (25.0%) | ||
| <BLANKLINE> | ||
| red | ██████████ (25.0%) | ||
| | ▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒ (50.0%) | ||
| <BLANKLINE> | ||
|
|
@@ -684,7 +747,14 @@ def ascii_plot_dist( | |
|
|
||
| if categorical: | ||
| output_parts.append( | ||
| ascii_plot_bar(dfs, names, o, weighted=weighted, bar_width=bar_width) | ||
| ascii_plot_bar( | ||
| dfs, | ||
| names, | ||
| o, | ||
| weighted=weighted, | ||
| bar_width=bar_width, | ||
| separate_categories=separate_categories, | ||
| ) | ||
| ) | ||
| else: | ||
| output_parts.append( | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The docstring incorrectly states that
bar_widthdefaults to 40. The actual default is nowNone, which triggers automatic detection based on terminal width. Update the docstring to reflect this change, e.g., "Defaults to None, which auto-detects based on terminal width."