diff --git a/AUTHORS.md b/AUTHORS.md index b6c426445..a526df9a6 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -114,3 +114,4 @@ Contributors - [@joranbeasley](https://github.com/joranbeasley) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/issues?q=is%3Aclosed+mentions%joranbeasley) -[@kianmeng](https://github.com/kianmeng) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/pull/1290#issue-1906020324) - [@lbeltrame](https://github.com/lbeltrame) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/pull/1401) +- [@Sarra99](https://github.com/Sarra99) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/issues?q=is%3Aclosed+mentions%3ASarra99) diff --git a/janitor/functions/adorn.py b/janitor/functions/adorn.py new file mode 100644 index 000000000..cc85df64f --- /dev/null +++ b/janitor/functions/adorn.py @@ -0,0 +1,251 @@ +from typing import Optional + +import pandas as pd + + +def tabyl( + df: pd.DataFrame, + col1: str, + col2: Optional[str] = None, + col3: Optional[str] = None, + show_counts: bool = True, + show_percentages: bool = False, + percentage_axis: Optional[str] = None, # 'row', 'col', or 'all' +) -> pd.DataFrame: + """ + Create a summary table similar to R's `tabyl`. + + Args: + df: Input DataFrame. + col1: Name of the first column for grouping (required). + col2: Name of the second column for grouping (optional). + col3: Name of the third column for grouping (optional). + show_counts: Whether to show raw counts in the table. + show_percentages: Whether to show percentages in the table. + percentage_axis: Axis for percentages ('row', 'col', or 'all'). + Only applies if `show_percentages` is True. + + Returns: + A DataFrame representing the summary table. + + Example : + >>> data = { + ... "Category": ["A", "A", "B", "B", "C", "C", "A", "B", "C", "A"], + ... "Subcategory": ["X", "Y", "X", "Y", "X", "Y", "X", "Y", "X", "X"], + ... "Region": ["North", "South", "East", "West", "North", + ... "South", "East", "West", "North", "East"], + ... "Value": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + ... } + >>> df = pd.DataFrame(data) + + >>> result = tabyl(df, "Category", "Subcategory", show_percentages=True, + ... percentage_axis="row") + >>> print(result) + Subcategory Category X Y + 0 A 3.0 (75.00%) 1.0 (25.00%) + 1 B 1.0 (33.33%) 2.0 (66.67%) + 2 C 2.0 (66.67%) 1.0 (33.33%) + + >>> result = tabyl(df, "Category", "Subcategory", + ... show_percentages=True, percentage_axis="col") + >>> print(result) + Subcategory Category X Y + 0 A 3.0 (50.00%) 1.0 (25.00%) + 1 B 1.0 (16.67%) 2.0 (50.00%) + 2 C 2.0 (33.33%) 1.0 (25.00%) + + """ + + if col1 not in df.columns: + raise ValueError(f"Column '{col1}' is not in the DataFrame.") + if col2 and col2 not in df.columns: + raise ValueError(f"Column '{col2}' is not in the DataFrame.") + if col3 and col3 not in df.columns: + raise ValueError(f"Column '{col3}' is not in the DataFrame.") + + # Step 1: Group and count + group_cols = [col1] + if col2: + group_cols.append(col2) + if col3: + group_cols.append(col3) + + grouped = df.groupby(group_cols).size().reset_index(name="count") + + # Step 2: Pivot for 3D (col1, col2, col3) + if col2 and col3: + pivot = grouped.pivot_table( + index=col1, + columns=[col2, col3], # Creating 2-level columns for col2 and col3 + values="count", + aggfunc="sum", + fill_value=0, + ) + elif col2: + pivot = grouped.pivot_table( + index=col1, + columns=col2, + values="count", + aggfunc="sum", + fill_value=0, + ) + else: + pivot = grouped.set_index(col1)["count"].to_frame() + + if show_percentages: + pivot = pivot.astype( + float + ) # Convert to float before calculating percentages + + if percentage_axis == "row": + percentages = pivot.div(pivot.sum(axis=1), axis=0) + elif percentage_axis == "col": + percentages = pivot.div(pivot.sum(axis=0), axis=1) + elif percentage_axis == "all": + total = pivot.values.sum() + percentages = pivot / total + else: + raise ValueError( + "`percentage_axis` must be one of 'row', 'col', or 'all'." + ) + + percentages = percentages.applymap(lambda x: f"{x:.2%}") + + if show_counts: + pivot = pivot.astype(str) + " (" + percentages + ")" + else: + pivot = percentages + + return pivot.reset_index() + + +def adorn_totals(df, col1, col2, axis=0): + """ + Adds a 'Total' row or column to a crosstab generated by tabyl. + + :param df: DataFrame used to generate the crosstab + :param col1: First column to create the crosstab + :param col2: Second column to create the crosstab + :param axis: 0 to add a 'Total' row, 1 to add a 'Total' column + :return: DataFrame with a 'Total' row/column added + + Example: + >>> data = { + ... "Category": ["A", "B", "A", "B", "A", "B", "A", "B"], + ... "Subcategory": ["X", "X", "Y", "Y", "X", "X", "Y", "Y"], + ... "Value": [1, 2, 3, 4, 5, 6, 7, 8], + ... } + >>> df = pd.DataFrame(data) + + >>> result = adorn_totals(df, "Category", "Subcategory", axis=0) + >>> print(result) + Subcategory Category X Y + 0 A 2 2 + 1 B 2 2 + Total NaN 4 4 + + >>> result = adorn_totals(df, "Category", "Subcategory", axis=1) + >>> print(result) + Subcategory Category X Y Total + 0 A 2 2 4 + 1 B 2 2 4 + + """ + + # Generate the crosstab using tabyl with the two specified columns + pivot = tabyl(df, col1, col2) + + if pivot.empty: # If the crosstab is empty, return it as-is + return pivot + + if axis == 0: # Add a 'Total' row + # Select only numeric columns and compute their sum across rows + total_row = pivot.select_dtypes(include="number").sum(axis=0) + total_row.name = "Total" # Set the name of the total row + # Concatenate the total row to the crosstab + pivot = pd.concat([pivot, total_row.to_frame().T]) + elif axis == 1: # Add a 'Total' column + # Select only numeric columns and compute their sum across columns + total_col = pivot.select_dtypes(include="number").sum(axis=1) + pivot["Total"] = total_col # Add the total column to the crosstab + else: + raise ValueError( + "The 'axis' argument must be 0 (to add a row) or 1 (to add a column)" + ) + + return pivot + + +def adorn_percentages( + df, col1, col2, axis="row", fmt=True, include_ns=False, decimal_places=1 +): + """ + Adds percentages to a crosstab generated by tabyl, with options to format + and include raw counts, and also control the behavior + of adorn_pct_formatting and adorn_ns. + + :param df: DataFrame used to generate the crosstab + :param col1: First column to create the crosstab + :param col2: Second column to create the crosstab + :param axis: 'row' to add percentages by row, 'col' for column percentages, + 'all' for global percentages + :param fmt: If True, formats percentages as strings + (e.g., "12.5%"), else returns numeric values. + :param include_ns: If True, includes raw counts alongside percentages. + :param decimal_places: Number of decimal places for the percentages + :param thousand_separator: Whether to add a thousand separator to the counts + :param percent_format: Whether to format as percentages + :return: DataFrame with percentages and optional formatting and raw counts + + """ + # Generate the crosstab using tabyl with the two specified columns + pivot = pd.pivot_table( + df, + values="Value", + index=col1, + columns=col2, + aggfunc="sum", + fill_value=0, + ) + + if pivot.empty: # If the crosstab is empty, return it as-is + return pivot + + # Separate numeric columns from the rest of the data + numeric_cols = pivot.select_dtypes(include="number") + + # Calculate the percentages based on the axis + if axis == "row": + percentages = numeric_cols.div(numeric_cols.sum(axis=1), axis=0) + elif axis == "col": + percentages = numeric_cols.div(numeric_cols.sum(axis=0), axis=1) + elif axis == "all": + total_sum = numeric_cols.sum().sum() + percentages = numeric_cols / total_sum + else: + raise ValueError("The 'axis' argument must be 'row', 'col', or 'all'.") + + # Format the percentages if requested + if fmt: + percentages = percentages.applymap( + lambda x: f"{x * 100:.{decimal_places}f}%" if pd.notnull(x) else x + ) + else: + percentages = percentages.applymap( + lambda x: f"{x:.{decimal_places}f}" if pd.notnull(x) else x + ) + + # Combine percentages with raw counts if requested (adorn_ns functionality) + if include_ns: + raw_counts = numeric_cols + percentages_with_ns = ( + percentages.astype(str) + " (" + raw_counts.astype(str) + ")" + if fmt + else percentages.astype(str) + " (" + raw_counts.astype(str) + ")" + ) + percentages = percentages_with_ns + + # Reattach the categories and the percentages to form the final DataFrame + result = pd.concat([pivot.iloc[:, :1], percentages], axis=1) + + return result diff --git a/tests/functions/test_adorn_percentages.py b/tests/functions/test_adorn_percentages.py new file mode 100644 index 000000000..afe2651ea --- /dev/null +++ b/tests/functions/test_adorn_percentages.py @@ -0,0 +1,185 @@ +import pandas as pd +import pytest + +from janitor.functions.adorn import adorn_percentages + + +@pytest.mark.functions +def test_adorn_percentages_row(): + """ + Test that adorn_percentages correctly calculates row percentages. + """ + data = { + "Category": ["A", "A", "B", "B", "C", "C", "A", "B", "C", "A"], + "Subcategory": ["X", "Y", "X", "Y", "X", "Y", "X", "Y", "X", "X"], + "Value": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + } + df = pd.DataFrame(data) + + result = adorn_percentages( + df, "Category", "Subcategory", axis="row", fmt=True, include_ns=False + ) + + assert result.shape[0] == 3 + # 3 unique categories + assert result.shape[1] > 1 + # Should have more than one column (including percentages) + assert "%" in result.iloc[0, 1] + # Check that the result contains percentages + + +@pytest.mark.functions +def test_adorn_percentages_col(): + """ + Test that adorn_percentages correctly calculates column percentages. + """ + data = { + "Category": ["A", "A", "B", "B", "C", "C", "A", "B", "C", "A"], + "Subcategory": ["X", "Y", "X", "Y", "X", "Y", "X", "Y", "X", "X"], + "Value": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + } + df = pd.DataFrame(data) + + result = adorn_percentages( + df, "Category", "Subcategory", axis="col", fmt=True, include_ns=False + ) + + assert result.shape[0] == 3 + # 3 unique categories + assert result.shape[1] > 1 + # Should have more than one column (including percentages) + assert "%" in result.iloc[0, 1] + # Check that the result contains percentages + + +@pytest.mark.functions +def test_adorn_percentages_all(): + """ + Test that adorn_percentages correctly calculates total (global) percentages. + """ + data = { + "Category": ["A", "A", "B", "B", "C", "C", "A", "B", "C", "A"], + "Subcategory": ["X", "Y", "X", "Y", "X", "Y", "X", "Y", "X", "X"], + "Value": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + } + df = pd.DataFrame(data) + + result = adorn_percentages( + df, "Category", "Subcategory", axis="all", fmt=True, include_ns=False + ) + + assert result.shape[0] == 3 + # 3 unique categories + assert result.shape[1] > 1 + # Should have more than one column (including percentages) + assert "%" in result.iloc[0, 1] + # Check that the result contains percentages + + +@pytest.mark.functions +def test_adorn_percentages_with_ns_row(): + """ + Test that adorn_percentages correctly calculates row percentages + with raw counts. + """ + data = { + "Category": ["A", "A", "B", "B", "C", "C", "A", "B", "C", "A"], + "Subcategory": ["X", "Y", "X", "Y", "X", "Y", "X", "Y", "X", "X"], + "Value": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + } + df = pd.DataFrame(data) + + result = adorn_percentages( + df, "Category", "Subcategory", axis="row", fmt=True, include_ns=True + ) + + assert result.shape[0] == 3 + # 3 unique categories + assert result.shape[1] > 1 + # Should have more than one column (including percentages and raw counts) + assert "(" in result.iloc[0, 1] + # Check that raw counts are included + + +@pytest.mark.functions +def test_adorn_percentages_with_ns_col(): + """ + Test that adorn_percentages correctly calculates column percentages + with raw counts. + """ + data = { + "Category": ["A", "A", "B", "B", "C", "C", "A", "B", "C", "A"], + "Subcategory": ["X", "Y", "X", "Y", "X", "Y", "X", "Y", "X", "X"], + "Value": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + } + df = pd.DataFrame(data) + + result = adorn_percentages( + df, "Category", "Subcategory", axis="col", fmt=True, include_ns=True + ) + + assert result.shape[0] == 3 + # 3 unique categories + assert result.shape[1] > 1 + # Should have more than one column (including percentages and raw counts) + assert "(" in result.iloc[0, 1] + # Check that raw counts are included + + +@pytest.mark.functions +def test_adorn_percentages_with_ns_all(): + """ + Test that adorn_percentages correctly calculates total (global) percentages + with raw counts. + """ + data = { + "Category": ["A", "A", "B", "B", "C", "C", "A", "B", "C", "A"], + "Subcategory": ["X", "Y", "X", "Y", "X", "Y", "X", "Y", "X", "X"], + "Value": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + } + df = pd.DataFrame(data) + + result = adorn_percentages( + df, "Category", "Subcategory", axis="all", fmt=True, include_ns=True + ) + + assert result.shape[0] == 3 + # 3 unique categories + assert result.shape[1] > 1 + # Should have more than one column (including percentages and raw counts) + assert "(" in result.iloc[0, 1] + # Check that raw counts are included + + +@pytest.mark.functions +def test_adorn_percentages_empty_pivot(): + """ + Test that adorn_percentages returns an empty DataFrame if the pivot is empty. + """ + # DataFrame sans colonnes valides pour le pivot + data = {"NonExistentColumn": [], "AnotherColumn": [], "Value": []} + df = pd.DataFrame(data) + + # Appel de la fonction avec des colonnes inexistantes + result = adorn_percentages(df, "NonExistentColumn", "AnotherColumn") + + # Vérifie que le résultat est un DataFrame vide + assert result.empty, "Expected an empty DataFrame when pivot is empty." + + +@pytest.mark.functions +def test_adorn_percentages_invalid_axis(): + """ + Test that adorn_percentages raises a ValueError for an invalid axis argument. + """ + data = { + "Category": ["A", "B"], + "Subcategory": ["X", "Y"], + "Value": [10, 20], + } + df = pd.DataFrame(data) + + with pytest.raises( + ValueError, match="The 'axis' argument must be 'row', 'col', or 'all'." + ): + adorn_percentages(df, "Category", "Subcategory", axis="invalid") diff --git a/tests/functions/test_adorn_totals.py b/tests/functions/test_adorn_totals.py new file mode 100644 index 000000000..a017d1e96 --- /dev/null +++ b/tests/functions/test_adorn_totals.py @@ -0,0 +1,97 @@ +import pandas as pd +import pytest + +from janitor.functions.adorn import adorn_totals + +# Sample data +data = { + "Category": ["A", "A", "B", "B", "C", "C", "A", "B", "C", "A"], + "Subcategory": ["X", "Y", "X", "Y", "X", "Y", "X", "Y", "X", "X"], + "Value": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], +} + +df = pd.DataFrame(data) + + +@pytest.mark.functions +def test_adorn_totals_row(): + """ + Test that adorn_totals correctly adds a 'Total' row to the crosstab. + """ + result = adorn_totals(df, "Category", "Subcategory", axis=0) + + assert ( + "Total" in result.index + ), "The 'Total' row must be present in the crosstab." + assert ( + result.loc["Total"].sum() == df["Value"].count() + ), "The sum of the 'Total' row must match the total count of the values." + + +@pytest.mark.functions +def test_adorn_totals_column(): + """ + Test that adorn_totals correctly adds a 'Total' column to the crosstab. + """ + result = adorn_totals(df, "Category", "Subcategory", axis=1) + + assert ( + "Total" in result.columns + ), "The 'Total' column must be present in the crosstab." + assert ( + result["Total"].sum() == df["Value"].count() + ), "The sum of the 'Total' column must match the total count of the values." + + +@pytest.mark.functions +def test_adorn_totals_empty_df(): + """ + Test that adorn_totals works correctly with an empty DataFrame. + """ + empty_df = pd.DataFrame(columns=["Category", "Subcategory", "Value"]) + result_row = adorn_totals(empty_df, "Category", "Subcategory", axis=0) + result_col = adorn_totals(empty_df, "Category", "Subcategory", axis=1) + + assert ( + result_row.empty + ), "The crosstab must be empty when an empty DataFrame is used." + assert ( + result_col.empty + ), "The crosstab must be empty when an empty DataFrame is used." + + +@pytest.mark.functions +def test_adorn_totals_invalid_axis(): + """ + Test that adorn_totals raises an error when an invalid axis is provided. + """ + data = { + "Category": ["A", "B", "C"], + "Subcategory": ["X", "Y", "Z"], + "Value": [1, 2, 3], + } + df = pd.DataFrame(data) + + with pytest.raises(ValueError, match="The 'axis' argument must be 0 .* 1"): + adorn_totals(df, "Category", "Subcategory", axis=2) # Invalid axis + + +@pytest.mark.functions +def test_adorn_totals_large_data(): + """ + Test that adorn_totals works correctly with a larger DataFrame. + """ + large_data = { + "Category": ["A"] * 1000 + ["B"] * 1000, + "Subcategory": ["X"] * 500 + ["Y"] * 500 + ["X"] * 500 + ["Y"] * 500, + "Value": list(range(2000)), + } + large_df = pd.DataFrame(large_data) + result = adorn_totals(large_df, "Category", "Subcategory", axis=0) + + assert ( + "Total" in result.index + ), "The 'Total' row must be present in the crosstab for a large DataFrame." + assert result.loc["Total"].sum() == len( + large_data["Value"] + ), "The sum of the 'Total' row must match the total count of the values." diff --git a/tests/functions/test_tabyl.py b/tests/functions/test_tabyl.py new file mode 100644 index 000000000..d53cc99a1 --- /dev/null +++ b/tests/functions/test_tabyl.py @@ -0,0 +1,259 @@ +import pandas as pd +import pytest + +from janitor.functions.adorn import tabyl + + +@pytest.mark.functions +def test_tabyl_basic_counts(): + """ + Test that tabyl correctly generates a crosstab with raw counts. + """ + data = { + "Category": ["A", "A", "B", "B", "C", "C", "A", "B", "C", "A"], + "Subcategory": ["X", "Y", "X", "Y", "X", "Y", "X", "Y", "X", "X"], + "Region": [ + "North", + "South", + "East", + "West", + "North", + "South", + "East", + "West", + "North", + "East", + ], + "Value": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + } + df = pd.DataFrame(data) + + result = tabyl( + df, "Category", "Subcategory", "Region", show_percentages=False + ) + + assert ( + result.shape[1] >= 5 + ), f"Expected at least 5 columns, got {result.shape[1]}" + assert ( + result.iloc[:, 1:].sum().sum() == 10 + ) # The sum of the counts should be equal to 10 + + +@pytest.mark.functions +def test_tabyl_with_percentages_row(): + """ + Test that tabyl correctly calculates percentages by row. + """ + data = { + "Category": ["A", "A", "B", "B", "C", "C", "A", "B", "C", "A"], + "Subcategory": ["X", "Y", "X", "Y", "X", "Y", "X", "Y", "X", "X"], + "Region": [ + "North", + "South", + "East", + "West", + "North", + "South", + "East", + "West", + "North", + "East", + ], + "Value": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + } + df = pd.DataFrame(data) + + result = tabyl( + df, + "Category", + "Subcategory", + "Region", + show_counts=False, + show_percentages=True, + percentage_axis="row", + ) + + result_numeric = result.applymap( + lambda x: ( + float(x.strip("%")) / 100 if isinstance(x, str) and "%" in x else x + ) + ) + assert ( + result_numeric.select_dtypes(include=["float", "int"]).min().min() >= 0 + ) + assert ( + result_numeric.select_dtypes(include=["float", "int"]).max().max() <= 1 + ) + + +@pytest.mark.functions +def test_tabyl_with_percentages_col(): + """ + Test that tabyl correctly calculates percentages by column. + """ + data = { + "Category": ["A", "A", "B", "B", "C", "C", "A", "B", "C", "A"], + "Subcategory": ["X", "Y", "X", "Y", "X", "Y", "X", "Y", "X", "X"], + "Region": [ + "North", + "South", + "East", + "West", + "North", + "South", + "East", + "West", + "North", + "East", + ], + "Value": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + } + df = pd.DataFrame(data) + + result = tabyl( + df, + "Category", + "Subcategory", + "Region", + show_counts=False, + show_percentages=True, + percentage_axis="col", + ) + + result_numeric = result.applymap( + lambda x: ( + float(x.strip("%")) / 100 if isinstance(x, str) and "%" in x else x + ) + ) + assert ( + result_numeric.select_dtypes(include=["float", "int"]).min().min() >= 0 + ) + assert ( + result_numeric.select_dtypes(include=["float", "int"]).max().max() <= 1 + ) + + +@pytest.mark.functions +def test_tabyl_with_percentages_all(): + """ + Test that tabyl correctly calculates total percentages. + """ + data = { + "Category": ["A", "A", "B", "B", "C", "C", "A", "B", "C", "A"], + "Subcategory": ["X", "Y", "X", "Y", "X", "Y", "X", "Y", "X", "X"], + "Region": [ + "North", + "South", + "East", + "West", + "North", + "South", + "East", + "West", + "North", + "East", + ], + "Value": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + } + df = pd.DataFrame(data) + + result = tabyl( + df, + "Category", + "Subcategory", + "Region", + show_counts=False, + show_percentages=True, + percentage_axis="all", + ) + + result_numeric = result.applymap( + lambda x: ( + float(x.strip("%")) / 100 if isinstance(x, str) and "%" in x else x + ) + ) + assert ( + result_numeric.select_dtypes(include=["float", "int"]).min().min() >= 0 + ) + assert ( + result_numeric.select_dtypes(include=["float", "int"]).max().max() <= 1 + ) + + +@pytest.mark.functions +def test_tabyl_missing_col1(): + """ + Test that tabyl raises an error if col1 is missing from the DataFrame. + """ + data = {"Category": ["A", "B"], "Subcategory": ["X", "Y"]} + df = pd.DataFrame(data) + + with pytest.raises( + ValueError, match="Column 'Region' is not in the DataFrame." + ): + tabyl(df, "Region") + + +@pytest.mark.functions +def test_tabyl_missing_col2(): + """ + Test that tabyl raises an error if col2 is missing from the DataFrame. + """ + data = {"Category": ["A", "B"], "Subcategory": ["X", "Y"]} + df = pd.DataFrame(data) + + with pytest.raises( + ValueError, match="Column 'Value' is not in the DataFrame." + ): + tabyl(df, "Category", "Value") + + +@pytest.mark.functions +def test_tabyl_missing_col3(): + """ + Test that tabyl raises an error if col3 is missing from the DataFrame. + """ + data = {"Category": ["A", "B"], "Subcategory": ["X", "Y"]} + df = pd.DataFrame(data) + + with pytest.raises( + ValueError, match="Column 'Region' is not in the DataFrame." + ): + tabyl(df, "Category", "Subcategory", "Region") + + +@pytest.mark.functions +def test_tabyl_single_column(): + """ + Test that tabyl works correctly with only col1 specified. + """ + data = {"Category": ["A", "B", "A", "C", "B", "A", "C"]} + df = pd.DataFrame(data) + + result = tabyl(df, "Category") + assert result.shape[0] == 3 # Three unique values in 'Category' + assert ( + result["count"].sum() == 7 + ) # Total count should match the number of rows + + +@pytest.mark.functions +def test_tabyl_invalid_percentage_axis(): + """ + Test that tabyl raises an error for invalid percentage_axis values. + """ + data = {"Category": ["A", "B"], "Subcategory": ["X", "Y"]} + df = pd.DataFrame(data) + + with pytest.raises( + ValueError, + match="`percentage_axis` must be one of 'row', 'col', or 'all'.", + ): + tabyl( + df, + "Category", + "Subcategory", + show_percentages=True, + percentage_axis="invalid", + )