From adb4ba6f9ca75513c85a36d81a8437b3aca37551 Mon Sep 17 00:00:00 2001 From: Sarra Nebli Date: Fri, 24 Jan 2025 17:11:11 +0100 Subject: [PATCH 1/8] Implementation and tests of tabyl function in Python --- janitor/functions/adorn.py | 91 +++++++++++++++++ tests/functions/test_tabyl.py | 181 ++++++++++++++++++++++++++++++++++ 2 files changed, 272 insertions(+) create mode 100644 janitor/functions/adorn.py create mode 100644 tests/functions/test_tabyl.py diff --git a/janitor/functions/adorn.py b/janitor/functions/adorn.py new file mode 100644 index 000000000..a2f00e1c5 --- /dev/null +++ b/janitor/functions/adorn.py @@ -0,0 +1,91 @@ +from typing import Optional + +import pandas as pd + + +def tabyl( + df: pd.DataFrame, + col1: str, + col2: Optional[str] = None, + col3: Optional[str] = None, + show_counts: bool = True, + show_percentages: bool = False, + percentage_axis: Optional[str] = None, # 'row', 'col', or 'all' +) -> pd.DataFrame: + """ + Create a summary table similar to R's `tabyl`. + + Args: + df: Input DataFrame. + col1: Name of the first column for grouping (required). + col2: Name of the second column for grouping (optional). + col3: Name of the third column for grouping (optional). + show_counts: Whether to show raw counts in the table. + show_percentages: Whether to show percentages in the table. + percentage_axis: Axis for percentages ('row', 'col', or 'all'). + Only applies if `show_percentages` is True. + + Returns: + A DataFrame representing the summary table. 
+ """ + if col1 not in df.columns: + raise ValueError(f"Column '{col1}' is not in the DataFrame.") + if col2 and col2 not in df.columns: + raise ValueError(f"Column '{col2}' is not in the DataFrame.") + if col3 and col3 not in df.columns: + raise ValueError(f"Column '{col3}' is not in the DataFrame.") + + # Step 1: Group and count + group_cols = [col1] + if col2: + group_cols.append(col2) + if col3: + group_cols.append(col3) + + grouped = df.groupby(group_cols).size().reset_index(name="count") + + # Step 2: Pivot for 3D (col1, col2, col3) + if col2 and col3: + pivot = grouped.pivot_table( + index=col1, + columns=[col2, col3], # Creating 2-level columns for col2 and col3 + values="count", + aggfunc="sum", + fill_value=0, + ) + elif col2: + pivot = grouped.pivot_table( + index=col1, + columns=col2, + values="count", + aggfunc="sum", + fill_value=0, + ) + else: + pivot = grouped.set_index(col1)["count"].to_frame() + + if show_percentages: + pivot = pivot.astype( + float + ) # Convert to float before calculating percentages + + if percentage_axis == "row": + percentages = pivot.div(pivot.sum(axis=1), axis=0) + elif percentage_axis == "col": + percentages = pivot.div(pivot.sum(axis=0), axis=1) + elif percentage_axis == "all": + total = pivot.values.sum() + percentages = pivot / total + else: + raise ValueError( + "`percentage_axis` must be one of 'row', 'col', or 'all'." 
+ ) + + percentages = percentages.applymap(lambda x: f"{x:.2%}") + + if show_counts: + pivot = pivot.astype(str) + " (" + percentages + ")" + else: + pivot = percentages + + return pivot.reset_index() diff --git a/tests/functions/test_tabyl.py b/tests/functions/test_tabyl.py new file mode 100644 index 000000000..07326305a --- /dev/null +++ b/tests/functions/test_tabyl.py @@ -0,0 +1,181 @@ +import pandas as pd +import pytest + +from janitor.functions.adorn import tabyl + + +@pytest.mark.functions +def test_tabyl_basic_counts(): + """ + Test que tabyl génère correctement un tableau croisé avec des comptes bruts. + """ + data = { + "Category": ["A", "A", "B", "B", "C", "C", "A", "B", "C", "A"], + "Subcategory": ["X", "Y", "X", "Y", "X", "Y", "X", "Y", "X", "X"], + "Region": [ + "North", + "South", + "East", + "West", + "North", + "South", + "East", + "West", + "North", + "East", + ], + "Value": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + } + df = pd.DataFrame(data) + + result = tabyl( + df, "Category", "Subcategory", "Region", show_percentages=False + ) + + assert ( + result.shape[1] >= 5 + ), f"Expected at least 5 columns, got {result.shape[1]}" + assert ( + result.iloc[:, 1:].sum().sum() == 10 + ) # La somme des comptes doit être égale à 10 + + +@pytest.mark.functions +def test_tabyl_with_percentages_row(): + """ + Test que tabyl calcule correctement les pourcentages par ligne. 
+ """ + data = { + "Category": ["A", "A", "B", "B", "C", "C", "A", "B", "C", "A"], + "Subcategory": ["X", "Y", "X", "Y", "X", "Y", "X", "Y", "X", "X"], + "Region": [ + "North", + "South", + "East", + "West", + "North", + "South", + "East", + "West", + "North", + "East", + ], + "Value": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + } + df = pd.DataFrame(data) + + result = tabyl( + df, + "Category", + "Subcategory", + "Region", + show_counts=False, + show_percentages=True, + percentage_axis="row", + ) + + result_numeric = result.applymap( + lambda x: ( + float(x.strip("%")) / 100 if isinstance(x, str) and "%" in x else x + ) + ) + assert ( + result_numeric.select_dtypes(include=["float", "int"]).min().min() >= 0 + ) + assert ( + result_numeric.select_dtypes(include=["float", "int"]).max().max() <= 1 + ) + + +@pytest.mark.functions +def test_tabyl_with_percentages_col(): + """ + Test que tabyl calcule correctement les pourcentages par colonne. + """ + data = { + "Category": ["A", "A", "B", "B", "C", "C", "A", "B", "C", "A"], + "Subcategory": ["X", "Y", "X", "Y", "X", "Y", "X", "Y", "X", "X"], + "Region": [ + "North", + "South", + "East", + "West", + "North", + "South", + "East", + "West", + "North", + "East", + ], + "Value": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + } + df = pd.DataFrame(data) + + result = tabyl( + df, + "Category", + "Subcategory", + "Region", + show_counts=False, + show_percentages=True, + percentage_axis="col", + ) + + result_numeric = result.applymap( + lambda x: ( + float(x.strip("%")) / 100 if isinstance(x, str) and "%" in x else x + ) + ) + assert ( + result_numeric.select_dtypes(include=["float", "int"]).min().min() >= 0 + ) + assert ( + result_numeric.select_dtypes(include=["float", "int"]).max().max() <= 1 + ) + + +@pytest.mark.functions +def test_tabyl_with_percentages_all(): + """ + Test que tabyl calcule correctement les pourcentages totaux. 
+ """ + data = { + "Category": ["A", "A", "B", "B", "C", "C", "A", "B", "C", "A"], + "Subcategory": ["X", "Y", "X", "Y", "X", "Y", "X", "Y", "X", "X"], + "Region": [ + "North", + "South", + "East", + "West", + "North", + "South", + "East", + "West", + "North", + "East", + ], + "Value": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + } + df = pd.DataFrame(data) + + result = tabyl( + df, + "Category", + "Subcategory", + "Region", + show_counts=False, + show_percentages=True, + percentage_axis="all", + ) + + result_numeric = result.applymap( + lambda x: ( + float(x.strip("%")) / 100 if isinstance(x, str) and "%" in x else x + ) + ) + assert ( + result_numeric.select_dtypes(include=["float", "int"]).min().min() >= 0 + ) + assert ( + result_numeric.select_dtypes(include=["float", "int"]).max().max() <= 1 + ) From b296674cfb9d2d7bf89e42f4328c6b8a60ec78fa Mon Sep 17 00:00:00 2001 From: Sarra Nebli Date: Fri, 24 Jan 2025 21:22:56 +0100 Subject: [PATCH 2/8] Implementation of adorn_totals and adorn_percentages with tests --- janitor/functions/adorn.py | 91 +++++++++++++ tests/functions/test_adorn_percentages.py | 151 ++++++++++++++++++++++ tests/functions/test_adorn_totals.py | 97 ++++++++++++++ 3 files changed, 339 insertions(+) create mode 100644 tests/functions/test_adorn_percentages.py create mode 100644 tests/functions/test_adorn_totals.py diff --git a/janitor/functions/adorn.py b/janitor/functions/adorn.py index a2f00e1c5..df85d59d7 100644 --- a/janitor/functions/adorn.py +++ b/janitor/functions/adorn.py @@ -89,3 +89,94 @@ def tabyl( pivot = percentages return pivot.reset_index() + + +def adorn_totals(df, col1, col2, axis=0): + """ + Adds a 'Total' row or column to a crosstab generated by tabyl. 
+ + :param df: DataFrame used to generate the crosstab + :param col1: First column to create the crosstab + :param col2: Second column to create the crosstab + :param axis: 0 to add a 'Total' row, 1 to add a 'Total' column + :return: DataFrame with a 'Total' row/column added + """ + # Generate the crosstab using tabyl with the two specified columns + pivot = tabyl(df, col1, col2) + + if pivot.empty: # If the crosstab is empty, return it as-is + return pivot + + if axis == 0: # Add a 'Total' row + # Select only numeric columns and compute their sum across rows + total_row = pivot.select_dtypes(include="number").sum(axis=0) + total_row.name = "Total" # Set the name of the total row + # Concatenate the total row to the crosstab + pivot = pd.concat([pivot, total_row.to_frame().T]) + elif axis == 1: # Add a 'Total' column + # Select only numeric columns and compute their sum across columns + total_col = pivot.select_dtypes(include="number").sum(axis=1) + pivot["Total"] = total_col # Add the total column to the crosstab + else: + raise ValueError( + "The 'axis' argument must be 0 (to add a row) or 1 (to add a column)" + ) + + return pivot + + +def adorn_percentages(df, col1, col2, axis="row", fmt=True, include_ns=False): + """ + Adds percentages to a crosstab generated by tabyl, with options to format + and include raw counts. + + :param df: DataFrame used to generate the crosstab + :param col1: First column to create the crosstab + :param col2: Second column to create the crosstab + :param axis: 'row' to add percentages by row, 'col' + for column percentages, 'all' for global percentages + :param fmt: If True, formats percentages as strings + (e.g., "12.5%"), else returns numeric values. + :param include_ns: If True, includes raw counts alongside percentages. 
+ :return: DataFrame with percentages added + """ + # Generate the crosstab using tabyl with the two specified columns + pivot = tabyl(df, col1, col2) + + if pivot.empty: # If the crosstab is empty, return it as-is + return pivot + + # Separate numeric columns from the rest of the data + numeric_cols = pivot.select_dtypes(include="number") + + # Calculate the percentages based on the axis + if axis == "row": + percentages = numeric_cols.div(numeric_cols.sum(axis=1), axis=0) + elif axis == "col": + percentages = numeric_cols.div(numeric_cols.sum(axis=0), axis=1) + elif axis == "all": + total_sum = numeric_cols.sum().sum() + percentages = numeric_cols / total_sum + else: + raise ValueError("The 'axis' argument must be 'row', 'col', or 'all'.") + + # Format the percentages if requested + if fmt: + percentages = percentages.applymap( + lambda x: f"{x * 100:.1f}%" if pd.notnull(x) else x + ) + + # Combine percentages with raw counts if requested + if include_ns: + raw_counts = numeric_cols + percentages_with_ns = ( + percentages.astype(str) + " (" + raw_counts.astype(str) + ")" + if fmt + else (percentages, raw_counts) + ) + percentages = percentages_with_ns + + # Reattach the categories and the percentages to form the final DataFrame + result = pd.concat([pivot.iloc[:, :1], percentages], axis=1) + + return result diff --git a/tests/functions/test_adorn_percentages.py b/tests/functions/test_adorn_percentages.py new file mode 100644 index 000000000..2841b8704 --- /dev/null +++ b/tests/functions/test_adorn_percentages.py @@ -0,0 +1,151 @@ +import pandas as pd +import pytest + +from janitor.functions.adorn import adorn_percentages + + +@pytest.mark.functions +def test_adorn_percentages_row(): + """ + Test that adorn_percentages correctly calculates row percentages. 
+ """ + data = { + "Category": ["A", "A", "B", "B", "C", "C", "A", "B", "C", "A"], + "Subcategory": ["X", "Y", "X", "Y", "X", "Y", "X", "Y", "X", "X"], + "Value": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + } + df = pd.DataFrame(data) + + result = adorn_percentages( + df, "Category", "Subcategory", axis="row", fmt=True + ) + + assert result.shape[0] == 3 + # 3 unique categories + assert result.shape[1] > 1 + # Should have more than one column (including percentages) + assert "%" in result.iloc[0, 1] + # Check that the result contains percentages + + +@pytest.mark.functions +def test_adorn_percentages_col(): + """ + Test that adorn_percentages correctly calculates column percentages. + """ + data = { + "Category": ["A", "A", "B", "B", "C", "C", "A", "B", "C", "A"], + "Subcategory": ["X", "Y", "X", "Y", "X", "Y", "X", "Y", "X", "X"], + "Value": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + } + df = pd.DataFrame(data) + + result = adorn_percentages( + df, "Category", "Subcategory", axis="col", fmt=True + ) + + assert result.shape[0] == 3 + # 3 unique categories + assert result.shape[1] > 1 + # Should have more than one column (including percentages) + assert "%" in result.iloc[0, 1] + # Check that the result contains percentages + + +@pytest.mark.functions +def test_adorn_percentages_all(): + """ + Test that adorn_percentages correctly calculates total (global) percentages. 
+ """ + data = { + "Category": ["A", "A", "B", "B", "C", "C", "A", "B", "C", "A"], + "Subcategory": ["X", "Y", "X", "Y", "X", "Y", "X", "Y", "X", "X"], + "Value": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + } + df = pd.DataFrame(data) + + result = adorn_percentages( + df, "Category", "Subcategory", axis="all", fmt=True + ) + + assert result.shape[0] == 3 + # 3 unique categories + assert result.shape[1] > 1 + # Should have more than one column (including percentages) + assert "%" in result.iloc[0, 1] + # Check that the result contains percentages + + +@pytest.mark.functions +def test_adorn_percentages_with_ns_row(): + """ + Test that adorn_percentages correctly calculates row percentages + with raw counts. + """ + data = { + "Category": ["A", "A", "B", "B", "C", "C", "A", "B", "C", "A"], + "Subcategory": ["X", "Y", "X", "Y", "X", "Y", "X", "Y", "X", "X"], + "Value": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + } + df = pd.DataFrame(data) + + result = adorn_percentages( + df, "Category", "Subcategory", axis="row", fmt=True, include_ns=True + ) + + assert result.shape[0] == 3 + # 3 unique categories + assert result.shape[1] > 1 + # Should have more than one column (including percentages and raw counts) + assert "(" in result.iloc[0, 1] + # Check that the raw counts are included + + +@pytest.mark.functions +def test_adorn_percentages_with_ns_col(): + """ + Test that adorn_percentages correctly calculates + column percentages with raw counts. 
+ """ + data = { + "Category": ["A", "A", "B", "B", "C", "C", "A", "B", "C", "A"], + "Subcategory": ["X", "Y", "X", "Y", "X", "Y", "X", "Y", "X", "X"], + "Value": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + } + df = pd.DataFrame(data) + + result = adorn_percentages( + df, "Category", "Subcategory", axis="col", fmt=True, include_ns=True + ) + + assert result.shape[0] == 3 + # 3 unique categories + assert result.shape[1] > 1 + # Should have more than one column (including percentages and raw counts) + assert "(" in result.iloc[0, 1] + # Check that the raw counts are included + + +@pytest.mark.functions +def test_adorn_percentages_with_ns_all(): + """ + Test that adorn_percentages correctly + calculates total (global) percentages with raw counts. + """ + data = { + "Category": ["A", "A", "B", "B", "C", "C", "A", "B", "C", "A"], + "Subcategory": ["X", "Y", "X", "Y", "X", "Y", "X", "Y", "X", "X"], + "Value": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + } + df = pd.DataFrame(data) + + result = adorn_percentages( + df, "Category", "Subcategory", axis="all", fmt=True, include_ns=True + ) + + assert result.shape[0] == 3 + # 3 unique categories + assert result.shape[1] > 1 + # Should have more than one column (including percentages and raw counts) + assert "(" in result.iloc[0, 1] + # Check that the raw counts are included diff --git a/tests/functions/test_adorn_totals.py b/tests/functions/test_adorn_totals.py new file mode 100644 index 000000000..ce8374797 --- /dev/null +++ b/tests/functions/test_adorn_totals.py @@ -0,0 +1,97 @@ +import pandas as pd +import pytest + +from janitor.functions.adorn import adorn_totals + +# Données d'exemple +data = { + "Category": ["A", "A", "B", "B", "C", "C", "A", "B", "C", "A"], + "Subcategory": ["X", "Y", "X", "Y", "X", "Y", "X", "Y", "X", "X"], + "Value": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], +} + +df = pd.DataFrame(data) + + +@pytest.mark.functions +def test_adorn_totals_row(): + """ + Test que adorn_totals ajoute correctement une ligne 'Total' au tableau 
croisé. + """ + result = adorn_totals(df, "Category", "Subcategory", axis=0) + + assert ( + "Total" in result.index + ), "La ligne 'Total' doit être présente dans le tableau." + assert ( + result.loc["Total"].sum() == df["Value"].count() + ), "La somme de la ligne 'Total' doit correspondre au total des comptes." + + +@pytest.mark.functions +def test_adorn_totals_column(): + """ + Test que adorn_totals ajoute correctement une colonne 'Total' au tableau croisé. + """ + result = adorn_totals(df, "Category", "Subcategory", axis=1) + + assert ( + "Total" in result.columns + ), "La colonne 'Total' doit être présente dans le tableau." + assert ( + result["Total"].sum() == df["Value"].count() + ), "La somme de la colonne 'Total' doit correspondre au total des comptes." + + +@pytest.mark.functions +def test_adorn_totals_empty_df(): + """ + Test que adorn_totals fonctionne correctement avec un DataFrame vide. + """ + empty_df = pd.DataFrame(columns=["Category", "Subcategory", "Value"]) + result_row = adorn_totals(empty_df, "Category", "Subcategory", axis=0) + result_col = adorn_totals(empty_df, "Category", "Subcategory", axis=1) + + assert ( + result_row.empty + ), "Le tableau croisé doit être vide lorsqu'un DataFrame vide est utilisé." + assert ( + result_col.empty + ), "Le tableau croisé doit être vide lorsqu'un DataFrame vide est utilisé." + + +@pytest.mark.functions +def test_adorn_totals_invalid_axis(): + """ + Test that adorn_totals raises an error when an invalid axis is provided. + """ + data = { + "Category": ["A", "B", "C"], + "Subcategory": ["X", "Y", "Z"], + "Value": [1, 2, 3], + } + df = pd.DataFrame(data) + + with pytest.raises(ValueError, match="The 'axis' argument must be 0 .* 1"): + adorn_totals(df, "Category", "Subcategory", axis=2) # Invalid axis + + +@pytest.mark.functions +def test_adorn_totals_large_data(): + """ + Test que adorn_totals fonctionne correctement avec un DataFrame plus grand. 
+ """ + large_data = { + "Category": ["A"] * 1000 + ["B"] * 1000, + "Subcategory": ["X"] * 500 + ["Y"] * 500 + ["X"] * 500 + ["Y"] * 500, + "Value": list(range(2000)), + } + large_df = pd.DataFrame(large_data) + result = adorn_totals(large_df, "Category", "Subcategory", axis=0) + + assert ( + "Total" in result.index + ), "La ligne 'Total' doit être présente dans le tableau pour un grand DataFrame." + assert result.loc["Total"].sum() == len( + large_data["Value"] + ), "La somme de la ligne 'Total' doit correspondre au total des comptes." From d7a475f95aac73c4238528d03b7481ea313791bd Mon Sep 17 00:00:00 2001 From: Sarra Nebli Date: Fri, 24 Jan 2025 22:18:47 +0100 Subject: [PATCH 3/8] Adding adorn_pct_formatting and adorn_ns in adorn_percentages with tests --- janitor/functions/adorn.py | 110 ++++++++++++++++++++-- tests/functions/test_adorn_percentages.py | 20 ++-- 2 files changed, 110 insertions(+), 20 deletions(-) diff --git a/janitor/functions/adorn.py b/janitor/functions/adorn.py index df85d59d7..7f68d102e 100644 --- a/janitor/functions/adorn.py +++ b/janitor/functions/adorn.py @@ -27,7 +27,21 @@ def tabyl( Returns: A DataFrame representing the summary table. + + Example Output: + | Category | X | Y | + |----------|---------|---------| + | A | 50.0% (2) | 50.0% (2) | + | B | 66.7% (2) | 33.3% (1) | + + In this example: + - The table shows counts and row-wise percentages for the `Category` and + `Subcategory` columns. + - The first row for Category 'A' shows 50.0% for X and 50.0% for Y, + with counts in parentheses. + - The second row for Category 'B' shows 66.7% for X and 33.3% for Y. 
""" + if col1 not in df.columns: raise ValueError(f"Column '{col1}' is not in the DataFrame.") if col2 and col2 not in df.columns: @@ -100,6 +114,35 @@ def adorn_totals(df, col1, col2, axis=0): :param col2: Second column to create the crosstab :param axis: 0 to add a 'Total' row, 1 to add a 'Total' column :return: DataFrame with a 'Total' row/column added + + Example: + # Sample DataFrame + data = { + "Category": ["A", "B", "A", "B", "A", "B", "A", "B"], + "Subcategory": ["X", "X", "Y", "Y", "X", "X", "Y", "Y"], + "Value": [1, 2, 3, 4, 5, 6, 7, 8], + } + df = pd.DataFrame(data) + + # Generate crosstab and add a 'Total' row + result = adorn_totals(df, col1="Category", col2="Subcategory", axis=0) + print(result) + + # Output will look like this: + # Category X Y Total + # A 6 12 18 + # B 6 12 18 + # Total 12 24 36 + + # Generate crosstab and add a 'Total' column + result = adorn_totals(df, col1="Category", col2="Subcategory", axis=1) + print(result) + + # Output will look like this: + # Category X Y Total + # A 3 3 6 + # B 3 3 6 + # Total 6 6 12 """ # Generate the crosstab using tabyl with the two specified columns pivot = tabyl(df, col1, col2) @@ -125,23 +168,66 @@ def adorn_totals(df, col1, col2, axis=0): return pivot -def adorn_percentages(df, col1, col2, axis="row", fmt=True, include_ns=False): +def adorn_percentages( + df, col1, col2, axis="row", fmt=True, include_ns=False, decimal_places=1 +): """ Adds percentages to a crosstab generated by tabyl, with options to format - and include raw counts. + and include raw counts, and also control the behavior + of adorn_pct_formatting and adorn_ns. 
:param df: DataFrame used to generate the crosstab :param col1: First column to create the crosstab :param col2: Second column to create the crosstab - :param axis: 'row' to add percentages by row, 'col' - for column percentages, 'all' for global percentages + :param axis: 'row' to add percentages by row, 'col' for column percentages, + 'all' for global percentages :param fmt: If True, formats percentages as strings - (e.g., "12.5%"), else returns numeric values. + (e.g., "12.5%"), else returns numeric values. :param include_ns: If True, includes raw counts alongside percentages. - :return: DataFrame with percentages added + :param decimal_places: Number of decimal places for the percentages + :param thousand_separator: Whether to add a thousand separator to the counts + :param percent_format: Whether to format as percentages + :return: DataFrame with percentages and optional formatting and raw counts + + Example: + # Sample DataFrame + data = { + "Category": ["A", "B", "A", "B", "A", "B"], + "Subcategory": ["X", "X", "Y", "Y", "X", "X"], + "Value": [1, 2, 3, 4, 5, 6], + } + df = pd.DataFrame(data) + + # Add percentages by row + result = adorn_percentages(df, col1="Category", + col2="Subcategory", axis="row", fmt=True, include_ns=True) + print(result) + + # Output might look like this: + # Category X Y + # A 20% (1) 80% (4) + # B 33.33% (2) 66.67% (4) + + # Add percentages by column + result = adorn_percentages(df, col1="Category", + col2="Subcategory", axis="col", fmt=True) + print(result) + + # Output might look like this: + # Category X Y + # A 33.33% (1) 50% (3) + # B 66.67% (2) 50% (3) + """ # Generate the crosstab using tabyl with the two specified columns - pivot = tabyl(df, col1, col2) + pivot = pd.pivot_table( + df, + values="Value", + index=col1, + columns=col2, + aggfunc="sum", + fill_value=0, + ) if pivot.empty: # If the crosstab is empty, return it as-is return pivot @@ -163,16 +249,20 @@ def adorn_percentages(df, col1, col2, axis="row", fmt=True, 
include_ns=False): # Format the percentages if requested if fmt: percentages = percentages.applymap( - lambda x: f"{x * 100:.1f}%" if pd.notnull(x) else x + lambda x: f"{x * 100:.{decimal_places}f}%" if pd.notnull(x) else x + ) + else: + percentages = percentages.applymap( + lambda x: f"{x:.{decimal_places}f}" if pd.notnull(x) else x ) - # Combine percentages with raw counts if requested + # Combine percentages with raw counts if requested (adorn_ns functionality) if include_ns: raw_counts = numeric_cols percentages_with_ns = ( percentages.astype(str) + " (" + raw_counts.astype(str) + ")" if fmt - else (percentages, raw_counts) + else percentages.astype(str) + " (" + raw_counts.astype(str) + ")" ) percentages = percentages_with_ns diff --git a/tests/functions/test_adorn_percentages.py b/tests/functions/test_adorn_percentages.py index 2841b8704..08c348668 100644 --- a/tests/functions/test_adorn_percentages.py +++ b/tests/functions/test_adorn_percentages.py @@ -17,7 +17,7 @@ def test_adorn_percentages_row(): df = pd.DataFrame(data) result = adorn_percentages( - df, "Category", "Subcategory", axis="row", fmt=True + df, "Category", "Subcategory", axis="row", fmt=True, include_ns=False ) assert result.shape[0] == 3 @@ -41,7 +41,7 @@ def test_adorn_percentages_col(): df = pd.DataFrame(data) result = adorn_percentages( - df, "Category", "Subcategory", axis="col", fmt=True + df, "Category", "Subcategory", axis="col", fmt=True, include_ns=False ) assert result.shape[0] == 3 @@ -65,7 +65,7 @@ def test_adorn_percentages_all(): df = pd.DataFrame(data) result = adorn_percentages( - df, "Category", "Subcategory", axis="all", fmt=True + df, "Category", "Subcategory", axis="all", fmt=True, include_ns=False ) assert result.shape[0] == 3 @@ -98,14 +98,14 @@ def test_adorn_percentages_with_ns_row(): assert result.shape[1] > 1 # Should have more than one column (including percentages and raw counts) assert "(" in result.iloc[0, 1] - # Check that the raw counts are included + # Check 
that raw counts are included @pytest.mark.functions def test_adorn_percentages_with_ns_col(): """ - Test that adorn_percentages correctly calculates - column percentages with raw counts. + Test that adorn_percentages correctly calculates column percentages + with raw counts. """ data = { "Category": ["A", "A", "B", "B", "C", "C", "A", "B", "C", "A"], @@ -123,14 +123,14 @@ def test_adorn_percentages_with_ns_col(): assert result.shape[1] > 1 # Should have more than one column (including percentages and raw counts) assert "(" in result.iloc[0, 1] - # Check that the raw counts are included + # Check that raw counts are included @pytest.mark.functions def test_adorn_percentages_with_ns_all(): """ - Test that adorn_percentages correctly - calculates total (global) percentages with raw counts. + Test that adorn_percentages correctly calculates total (global) percentages + with raw counts. """ data = { "Category": ["A", "A", "B", "B", "C", "C", "A", "B", "C", "A"], @@ -148,4 +148,4 @@ def test_adorn_percentages_with_ns_all(): assert result.shape[1] > 1 # Should have more than one column (including percentages and raw counts) assert "(" in result.iloc[0, 1] - # Check that the raw counts are included + # Check that raw counts are included From 9ae1878fabd6690ff56de6edee9f4df1f5563f4c Mon Sep 17 00:00:00 2001 From: Sarra Nebli Date: Fri, 24 Jan 2025 22:31:00 +0100 Subject: [PATCH 4/8] Examples of adorn_functions --- janitor/functions/adorn.py | 152 +++++++++++++++++++------------------ 1 file changed, 79 insertions(+), 73 deletions(-) diff --git a/janitor/functions/adorn.py b/janitor/functions/adorn.py index 7f68d102e..9ec97bb23 100644 --- a/janitor/functions/adorn.py +++ b/janitor/functions/adorn.py @@ -28,12 +28,29 @@ def tabyl( Returns: A DataFrame representing the summary table. - Example Output: + >>> data = { + ... "Category": ["A", "B", "A", "B", "A", "B"], + ... "Subcategory": ["X", "X", "Y", "Y", "X", "X"], + ... "Value": [1, 2, 3, 4, 5, 6], + ... 
} + >>> df = pd.DataFrame(data) + + >>> result = adorn_percentages(df, col1="Category", + col2="Subcategory", axis="row", fmt=True, include_ns=True) + >>> print(result) | Category | X | Y | |----------|---------|---------| | A | 50.0% (2) | 50.0% (2) | | B | 66.7% (2) | 33.3% (1) | + >>> result = adorn_percentages(df, col1="Category", + col2="Subcategory", axis="col", fmt=True) + >>> print(result) + | Category | X | Y | + |----------|---------|---------| + | A | 33.3% (1) | 50.0% (3) | + | B | 66.7% (2) | 50.0% (3) | + In this example: - The table shows counts and row-wise percentages for the `Category` and `Subcategory` columns. @@ -116,33 +133,26 @@ def adorn_totals(df, col1, col2, axis=0): :return: DataFrame with a 'Total' row/column added Example: - # Sample DataFrame - data = { - "Category": ["A", "B", "A", "B", "A", "B", "A", "B"], - "Subcategory": ["X", "X", "Y", "Y", "X", "X", "Y", "Y"], - "Value": [1, 2, 3, 4, 5, 6, 7, 8], - } - df = pd.DataFrame(data) - - # Generate crosstab and add a 'Total' row - result = adorn_totals(df, col1="Category", col2="Subcategory", axis=0) - print(result) - - # Output will look like this: - # Category X Y Total - # A 6 12 18 - # B 6 12 18 - # Total 12 24 36 - - # Generate crosstab and add a 'Total' column - result = adorn_totals(df, col1="Category", col2="Subcategory", axis=1) - print(result) - - # Output will look like this: - # Category X Y Total - # A 3 3 6 - # B 3 3 6 - # Total 6 6 12 + >>> data = { + ... "Category": ["A", "B", "A", "B", "A", "B", "A", "B"], + ... "Subcategory": ["X", "X", "Y", "Y", "X", "X", "Y", "Y"], + ... "Value": [1, 2, 3, 4, 5, 6, 7, 8], + ... 
} + >>> df = pd.DataFrame(data) + + >>> result = adorn_totals(df, col1="Category", col2="Subcategory", axis=0) + >>> print(result) + Category X Y Total + A 6 12 18 + B 6 12 18 + Total 12 24 36 + + >>> result = adorn_totals(df, col1="Category", col2="Subcategory", axis=1) + >>> print(result) + Category X Y Total + A 3 3 6 + B 3 3 6 + Total 6 6 12 """ # Generate the crosstab using tabyl with the two specified columns pivot = tabyl(df, col1, col2) @@ -172,51 +182,47 @@ def adorn_percentages( df, col1, col2, axis="row", fmt=True, include_ns=False, decimal_places=1 ): """ - Adds percentages to a crosstab generated by tabyl, with options to format - and include raw counts, and also control the behavior - of adorn_pct_formatting and adorn_ns. - - :param df: DataFrame used to generate the crosstab - :param col1: First column to create the crosstab - :param col2: Second column to create the crosstab - :param axis: 'row' to add percentages by row, 'col' for column percentages, - 'all' for global percentages - :param fmt: If True, formats percentages as strings - (e.g., "12.5%"), else returns numeric values. - :param include_ns: If True, includes raw counts alongside percentages. 
- :param decimal_places: Number of decimal places for the percentages - :param thousand_separator: Whether to add a thousand separator to the counts - :param percent_format: Whether to format as percentages - :return: DataFrame with percentages and optional formatting and raw counts - - Example: - # Sample DataFrame - data = { - "Category": ["A", "B", "A", "B", "A", "B"], - "Subcategory": ["X", "X", "Y", "Y", "X", "X"], - "Value": [1, 2, 3, 4, 5, 6], - } - df = pd.DataFrame(data) - - # Add percentages by row - result = adorn_percentages(df, col1="Category", - col2="Subcategory", axis="row", fmt=True, include_ns=True) - print(result) - - # Output might look like this: - # Category X Y - # A 20% (1) 80% (4) - # B 33.33% (2) 66.67% (4) - - # Add percentages by column - result = adorn_percentages(df, col1="Category", - col2="Subcategory", axis="col", fmt=True) - print(result) - - # Output might look like this: - # Category X Y - # A 33.33% (1) 50% (3) - # B 66.67% (2) 50% (3) + Adds percentages to a crosstab generated by tabyl, with options to format + and include raw counts, and also control the behavior + of adorn_pct_formatting and adorn_ns. + + :param df: DataFrame used to generate the crosstab + :param col1: First column to create the crosstab + :param col2: Second column to create the crosstab + :param axis: 'row' to add percentages by row, 'col' for column percentages, + 'all' for global percentages + :param fmt: If True, formats percentages as strings + (e.g., "12.5%"), else returns numeric values. + :param include_ns: If True, includes raw counts alongside percentages. + :param decimal_places: Number of decimal places for the percentages + :param thousand_separator: Whether to add a thousand separator to the counts + :param percent_format: Whether to format as percentages + :return: DataFrame with percentages and optional formatting and raw counts + + Example: + Example: + >>> data = { + ... "Category": ["A", "B", "A", "B", "A", "B"], + ... 
"Subcategory": ["X", "X", "Y", "Y", "X", "X"], + ... "Value": [1, 2, 3, 4, 5, 6], + ... } + >>> df = pd.DataFrame(data) + + >>> result = adorn_percentages(df, col1="Category", + col2="Subcategory", axis="row", fmt=True, include_ns=True) + + >>> print(result) + Category X Y + A 20% (1) 80% (4) + B 33.33% (2) 66.67% (4) + + >>> result = adorn_percentages(df, col1="Category", + col2="Subcategory", axis="col", fmt=True) + + >>> print(result) + Category X Y + A 33.33% (1) 50% (3) + B 66.67% (2) 50% (3) """ # Generate the crosstab using tabyl with the two specified columns From 2f30d408fd4db1406bbf63eb4416c327e243cf59 Mon Sep 17 00:00:00 2001 From: Sarra Nebli Date: Fri, 24 Jan 2025 22:40:06 +0100 Subject: [PATCH 5/8] Fixing documentation --- tests/functions/test_adorn_totals.py | 26 +++++++++++++------------- tests/functions/test_tabyl.py | 10 +++++----- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/tests/functions/test_adorn_totals.py b/tests/functions/test_adorn_totals.py index ce8374797..a017d1e96 100644 --- a/tests/functions/test_adorn_totals.py +++ b/tests/functions/test_adorn_totals.py @@ -3,7 +3,7 @@ from janitor.functions.adorn import adorn_totals -# Données d'exemple +# Sample data data = { "Category": ["A", "A", "B", "B", "C", "C", "A", "B", "C", "A"], "Subcategory": ["X", "Y", "X", "Y", "X", "Y", "X", "Y", "X", "X"], @@ -16,37 +16,37 @@ @pytest.mark.functions def test_adorn_totals_row(): """ - Test que adorn_totals ajoute correctement une ligne 'Total' au tableau croisé. + Test that adorn_totals correctly adds a 'Total' row to the crosstab. """ result = adorn_totals(df, "Category", "Subcategory", axis=0) assert ( "Total" in result.index - ), "La ligne 'Total' doit être présente dans le tableau." + ), "The 'Total' row must be present in the crosstab." assert ( result.loc["Total"].sum() == df["Value"].count() - ), "La somme de la ligne 'Total' doit correspondre au total des comptes." 
+ ), "The sum of the 'Total' row must match the total count of the values." @pytest.mark.functions def test_adorn_totals_column(): """ - Test que adorn_totals ajoute correctement une colonne 'Total' au tableau croisé. + Test that adorn_totals correctly adds a 'Total' column to the crosstab. """ result = adorn_totals(df, "Category", "Subcategory", axis=1) assert ( "Total" in result.columns - ), "La colonne 'Total' doit être présente dans le tableau." + ), "The 'Total' column must be present in the crosstab." assert ( result["Total"].sum() == df["Value"].count() - ), "La somme de la colonne 'Total' doit correspondre au total des comptes." + ), "The sum of the 'Total' column must match the total count of the values." @pytest.mark.functions def test_adorn_totals_empty_df(): """ - Test que adorn_totals fonctionne correctement avec un DataFrame vide. + Test that adorn_totals works correctly with an empty DataFrame. """ empty_df = pd.DataFrame(columns=["Category", "Subcategory", "Value"]) result_row = adorn_totals(empty_df, "Category", "Subcategory", axis=0) @@ -54,10 +54,10 @@ def test_adorn_totals_empty_df(): assert ( result_row.empty - ), "Le tableau croisé doit être vide lorsqu'un DataFrame vide est utilisé." + ), "The crosstab must be empty when an empty DataFrame is used." assert ( result_col.empty - ), "Le tableau croisé doit être vide lorsqu'un DataFrame vide est utilisé." + ), "The crosstab must be empty when an empty DataFrame is used." @pytest.mark.functions @@ -79,7 +79,7 @@ def test_adorn_totals_invalid_axis(): @pytest.mark.functions def test_adorn_totals_large_data(): """ - Test que adorn_totals fonctionne correctement avec un DataFrame plus grand. + Test that adorn_totals works correctly with a larger DataFrame. """ large_data = { "Category": ["A"] * 1000 + ["B"] * 1000, @@ -91,7 +91,7 @@ def test_adorn_totals_large_data(): assert ( "Total" in result.index - ), "La ligne 'Total' doit être présente dans le tableau pour un grand DataFrame." 
+ ), "The 'Total' row must be present in the crosstab for a large DataFrame." assert result.loc["Total"].sum() == len( large_data["Value"] - ), "La somme de la ligne 'Total' doit correspondre au total des comptes." + ), "The sum of the 'Total' row must match the total count of the values." diff --git a/tests/functions/test_tabyl.py b/tests/functions/test_tabyl.py index 07326305a..3b6dde80f 100644 --- a/tests/functions/test_tabyl.py +++ b/tests/functions/test_tabyl.py @@ -7,7 +7,7 @@ @pytest.mark.functions def test_tabyl_basic_counts(): """ - Test que tabyl génère correctement un tableau croisé avec des comptes bruts. + Test that tabyl correctly generates a crosstab with raw counts. """ data = { "Category": ["A", "A", "B", "B", "C", "C", "A", "B", "C", "A"], @@ -37,13 +37,13 @@ def test_tabyl_basic_counts(): ), f"Expected at least 5 columns, got {result.shape[1]}" assert ( result.iloc[:, 1:].sum().sum() == 10 - ) # La somme des comptes doit être égale à 10 + ) # The sum of the counts should be equal to 10 @pytest.mark.functions def test_tabyl_with_percentages_row(): """ - Test que tabyl calcule correctement les pourcentages par ligne. + Test that tabyl correctly calculates percentages by row. """ data = { "Category": ["A", "A", "B", "B", "C", "C", "A", "B", "C", "A"], @@ -90,7 +90,7 @@ def test_tabyl_with_percentages_row(): @pytest.mark.functions def test_tabyl_with_percentages_col(): """ - Test que tabyl calcule correctement les pourcentages par colonne. + Test that tabyl correctly calculates percentages by column. """ data = { "Category": ["A", "A", "B", "B", "C", "C", "A", "B", "C", "A"], @@ -137,7 +137,7 @@ def test_tabyl_with_percentages_col(): @pytest.mark.functions def test_tabyl_with_percentages_all(): """ - Test que tabyl calcule correctement les pourcentages totaux. + Test that tabyl correctly calculates total percentages. 
""" data = { "Category": ["A", "A", "B", "B", "C", "C", "A", "B", "C", "A"], From 7ba6ac6be1a5ab259d0b2374e6d742cffd9ed1d8 Mon Sep 17 00:00:00 2001 From: Sarra Nebli Date: Fri, 24 Jan 2025 22:46:58 +0100 Subject: [PATCH 6/8] update AUTHORS.md file --- AUTHORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS.md b/AUTHORS.md index b6c426445..a526df9a6 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -114,3 +114,4 @@ Contributors - [@joranbeasley](https://github.com/joranbeasley) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/issues?q=is%3Aclosed+mentions%joranbeasley) -[@kianmeng](https://github.com/kianmeng) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/pull/1290#issue-1906020324) - [@lbeltrame](https://github.com/lbeltrame) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/pull/1401) +- [@Sarra99](https://github.com/Sarra99) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/issues?q=is%3Aclosed+mentions%3ASarra99) From 35d9300a2a501b0622d170b42ed6c87fa69e9ce7 Mon Sep 17 00:00:00 2001 From: Sarra Nebli Date: Fri, 24 Jan 2025 23:09:49 +0100 Subject: [PATCH 7/8] Fixing CI errors --- janitor/functions/adorn.py | 139 +++++++++++++++---------------------- 1 file changed, 56 insertions(+), 83 deletions(-) diff --git a/janitor/functions/adorn.py b/janitor/functions/adorn.py index 9ec97bb23..cc85df64f 100644 --- a/janitor/functions/adorn.py +++ b/janitor/functions/adorn.py @@ -28,35 +28,32 @@ def tabyl( Returns: A DataFrame representing the summary table. + Example : >>> data = { - ... "Category": ["A", "B", "A", "B", "A", "B"], - ... "Subcategory": ["X", "X", "Y", "Y", "X", "X"], - ... "Value": [1, 2, 3, 4, 5, 6], + ... "Category": ["A", "A", "B", "B", "C", "C", "A", "B", "C", "A"], + ... "Subcategory": ["X", "Y", "X", "Y", "X", "Y", "X", "Y", "X", "X"], + ... "Region": ["North", "South", "East", "West", "North", + ... "South", "East", "West", "North", "East"], + ... "Value": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], ... 
} >>> df = pd.DataFrame(data) - >>> result = adorn_percentages(df, col1="Category", - col2="Subcategory", axis="row", fmt=True, include_ns=True) + >>> result = tabyl(df, "Category", "Subcategory", show_percentages=True, + ... percentage_axis="row") >>> print(result) - | Category | X | Y | - |----------|---------|---------| - | A | 50.0% (2) | 50.0% (2) | - | B | 66.7% (2) | 33.3% (1) | + Subcategory Category X Y + 0 A 3.0 (75.00%) 1.0 (25.00%) + 1 B 1.0 (33.33%) 2.0 (66.67%) + 2 C 2.0 (66.67%) 1.0 (33.33%) - >>> result = adorn_percentages(df, col1="Category", - col2="Subcategory", axis="col", fmt=True) + >>> result = tabyl(df, "Category", "Subcategory", + ... show_percentages=True, percentage_axis="col") >>> print(result) - | Category | X | Y | - |----------|---------|---------| - | A | 33.3% (1) | 50.0% (3) | - | B | 66.7% (2) | 50.0% (3) | - - In this example: - - The table shows counts and row-wise percentages for the `Category` and - `Subcategory` columns. - - The first row for Category 'A' shows 50.0% for X and 50.0% for Y, - with counts in parentheses. - - The second row for Category 'B' shows 66.7% for X and 33.3% for Y. + Subcategory Category X Y + 0 A 3.0 (50.00%) 1.0 (25.00%) + 1 B 1.0 (16.67%) 2.0 (50.00%) + 2 C 2.0 (33.33%) 1.0 (25.00%) + """ if col1 not in df.columns: @@ -132,28 +129,29 @@ def adorn_totals(df, col1, col2, axis=0): :param axis: 0 to add a 'Total' row, 1 to add a 'Total' column :return: DataFrame with a 'Total' row/column added - Example: - >>> data = { - ... "Category": ["A", "B", "A", "B", "A", "B", "A", "B"], - ... "Subcategory": ["X", "X", "Y", "Y", "X", "X", "Y", "Y"], - ... "Value": [1, 2, 3, 4, 5, 6, 7, 8], - ... } - >>> df = pd.DataFrame(data) - - >>> result = adorn_totals(df, col1="Category", col2="Subcategory", axis=0) - >>> print(result) - Category X Y Total - A 6 12 18 - B 6 12 18 - Total 12 24 36 + Example: + >>> data = { + ... "Category": ["A", "B", "A", "B", "A", "B", "A", "B"], + ... 
"Subcategory": ["X", "X", "Y", "Y", "X", "X", "Y", "Y"], + ... "Value": [1, 2, 3, 4, 5, 6, 7, 8], + ... } + >>> df = pd.DataFrame(data) + + >>> result = adorn_totals(df, "Category", "Subcategory", axis=0) + >>> print(result) + Subcategory Category X Y + 0 A 2 2 + 1 B 2 2 + Total NaN 4 4 + + >>> result = adorn_totals(df, "Category", "Subcategory", axis=1) + >>> print(result) + Subcategory Category X Y Total + 0 A 2 2 4 + 1 B 2 2 4 - >>> result = adorn_totals(df, col1="Category", col2="Subcategory", axis=1) - >>> print(result) - Category X Y Total - A 3 3 6 - B 3 3 6 - Total 6 6 12 """ + # Generate the crosstab using tabyl with the two specified columns pivot = tabyl(df, col1, col2) @@ -182,47 +180,22 @@ def adorn_percentages( df, col1, col2, axis="row", fmt=True, include_ns=False, decimal_places=1 ): """ - Adds percentages to a crosstab generated by tabyl, with options to format - and include raw counts, and also control the behavior - of adorn_pct_formatting and adorn_ns. - - :param df: DataFrame used to generate the crosstab - :param col1: First column to create the crosstab - :param col2: Second column to create the crosstab - :param axis: 'row' to add percentages by row, 'col' for column percentages, - 'all' for global percentages - :param fmt: If True, formats percentages as strings - (e.g., "12.5%"), else returns numeric values. - :param include_ns: If True, includes raw counts alongside percentages. - :param decimal_places: Number of decimal places for the percentages - :param thousand_separator: Whether to add a thousand separator to the counts - :param percent_format: Whether to format as percentages - :return: DataFrame with percentages and optional formatting and raw counts - - Example: - Example: - >>> data = { - ... "Category": ["A", "B", "A", "B", "A", "B"], - ... "Subcategory": ["X", "X", "Y", "Y", "X", "X"], - ... "Value": [1, 2, 3, 4, 5, 6], - ... 
} - >>> df = pd.DataFrame(data) - - >>> result = adorn_percentages(df, col1="Category", - col2="Subcategory", axis="row", fmt=True, include_ns=True) - - >>> print(result) - Category X Y - A 20% (1) 80% (4) - B 33.33% (2) 66.67% (4) - - >>> result = adorn_percentages(df, col1="Category", - col2="Subcategory", axis="col", fmt=True) - - >>> print(result) - Category X Y - A 33.33% (1) 50% (3) - B 66.67% (2) 50% (3) + Adds percentages to a crosstab generated by tabyl, with options to format + and include raw counts, and also control the behavior + of adorn_pct_formatting and adorn_ns. + + :param df: DataFrame used to generate the crosstab + :param col1: First column to create the crosstab + :param col2: Second column to create the crosstab + :param axis: 'row' to add percentages by row, 'col' for column percentages, + 'all' for global percentages + :param fmt: If True, formats percentages as strings + (e.g., "12.5%"), else returns numeric values. + :param include_ns: If True, includes raw counts alongside percentages. 
+ :param decimal_places: Number of decimal places for the percentages + :param thousand_separator: Whether to add a thousand separator to the counts + :param percent_format: Whether to format as percentages + :return: DataFrame with percentages and optional formatting and raw counts """ # Generate the crosstab using tabyl with the two specified columns From add19e9bea969fcdbb9fdc12ccc87e96149b6660 Mon Sep 17 00:00:00 2001 From: Sarra Nebli Date: Sun, 26 Jan 2025 21:18:34 +0100 Subject: [PATCH 8/8] Fixing code coverage --- tests/functions/test_adorn_percentages.py | 34 ++++++++++ tests/functions/test_tabyl.py | 78 +++++++++++++++++++++++ 2 files changed, 112 insertions(+) diff --git a/tests/functions/test_adorn_percentages.py b/tests/functions/test_adorn_percentages.py index 08c348668..afe2651ea 100644 --- a/tests/functions/test_adorn_percentages.py +++ b/tests/functions/test_adorn_percentages.py @@ -149,3 +149,37 @@ def test_adorn_percentages_with_ns_all(): # Should have more than one column (including percentages and raw counts) assert "(" in result.iloc[0, 1] # Check that raw counts are included + + +@pytest.mark.functions +def test_adorn_percentages_empty_pivot(): + """ + Test that adorn_percentages returns an empty DataFrame if the pivot is empty. + """ + # DataFrame whose columns exist but contain no rows, so the pivot is empty + data = {"NonExistentColumn": [], "AnotherColumn": [], "Value": []} + df = pd.DataFrame(data) + + # Call the function on the empty (but present) columns + result = adorn_percentages(df, "NonExistentColumn", "AnotherColumn") + + # Check that the result is an empty DataFrame + assert result.empty, "Expected an empty DataFrame when pivot is empty." + + +@pytest.mark.functions +def test_adorn_percentages_invalid_axis(): + """ + Test that adorn_percentages raises a ValueError for an invalid axis argument. 
+ """ + data = { + "Category": ["A", "B"], + "Subcategory": ["X", "Y"], + "Value": [10, 20], + } + df = pd.DataFrame(data) + + with pytest.raises( + ValueError, match="The 'axis' argument must be 'row', 'col', or 'all'." + ): + adorn_percentages(df, "Category", "Subcategory", axis="invalid") diff --git a/tests/functions/test_tabyl.py b/tests/functions/test_tabyl.py index 3b6dde80f..d53cc99a1 100644 --- a/tests/functions/test_tabyl.py +++ b/tests/functions/test_tabyl.py @@ -179,3 +179,81 @@ def test_tabyl_with_percentages_all(): assert ( result_numeric.select_dtypes(include=["float", "int"]).max().max() <= 1 ) + + +@pytest.mark.functions +def test_tabyl_missing_col1(): + """ + Test that tabyl raises an error if col1 is missing from the DataFrame. + """ + data = {"Category": ["A", "B"], "Subcategory": ["X", "Y"]} + df = pd.DataFrame(data) + + with pytest.raises( + ValueError, match="Column 'Region' is not in the DataFrame." + ): + tabyl(df, "Region") + + +@pytest.mark.functions +def test_tabyl_missing_col2(): + """ + Test that tabyl raises an error if col2 is missing from the DataFrame. + """ + data = {"Category": ["A", "B"], "Subcategory": ["X", "Y"]} + df = pd.DataFrame(data) + + with pytest.raises( + ValueError, match="Column 'Value' is not in the DataFrame." + ): + tabyl(df, "Category", "Value") + + +@pytest.mark.functions +def test_tabyl_missing_col3(): + """ + Test that tabyl raises an error if col3 is missing from the DataFrame. + """ + data = {"Category": ["A", "B"], "Subcategory": ["X", "Y"]} + df = pd.DataFrame(data) + + with pytest.raises( + ValueError, match="Column 'Region' is not in the DataFrame." + ): + tabyl(df, "Category", "Subcategory", "Region") + + +@pytest.mark.functions +def test_tabyl_single_column(): + """ + Test that tabyl works correctly with only col1 specified. 
+ """ + data = {"Category": ["A", "B", "A", "C", "B", "A", "C"]} + df = pd.DataFrame(data) + + result = tabyl(df, "Category") + assert result.shape[0] == 3 # Three unique values in 'Category' + assert ( + result["count"].sum() == 7 + ) # Total count should match the number of rows + + +@pytest.mark.functions +def test_tabyl_invalid_percentage_axis(): + """ + Test that tabyl raises an error for invalid percentage_axis values. + """ + data = {"Category": ["A", "B"], "Subcategory": ["X", "Y"]} + df = pd.DataFrame(data) + + with pytest.raises( + ValueError, + match="`percentage_axis` must be one of 'row', 'col', or 'all'.", + ): + tabyl( + df, + "Category", + "Subcategory", + show_percentages=True, + percentage_axis="invalid", + )