diff --git a/janitor/functions/pivot.py b/janitor/functions/pivot.py index 7da32317e..f82be0542 100644 --- a/janitor/functions/pivot.py +++ b/janitor/functions/pivot.py @@ -11,7 +11,7 @@ import numpy as np import pandas as pd import pandas_flavor as pf -from pandas.api.types import is_extension_array_dtype, is_scalar +from pandas.api.types import is_extension_array_dtype from pandas.core.dtypes.concat import concat_compat from janitor.functions.select import ( @@ -2110,11 +2110,13 @@ def _computations_pivot_wider( names_expand, index_expand, ) - - out = df.pivot( # noqa: PD010 - index=index, columns=names_from, values=values_from - ) - + if index is None: + # avoids the KeyError issue that arises from passing index=None + out = df.pivot(columns=names_from, values=values_from) # noqa: PD010 + else: + out = df.pivot( # noqa: PD010 + index=index, columns=names_from, values=values_from + ) indexer = out.index if index_expand and index: any_categoricals = (indexer.get_level_values(name) for name in index) @@ -2227,43 +2229,51 @@ def _data_checks_pivot_wider( "pivot_wider() is missing 1 required argument: 'names_from'" ) names_from = get_index_labels([names_from], df, axis="columns") - - if (values_from is None) and (index is not None): - index_ = get_index_labels([index], df, axis="columns") - values_from_ = df.columns.difference(names_from).difference(index_) - index_ = None - elif values_from is None: - values_from_ = df.columns.difference(names_from) + try: + # hack to align with pd.pivot + # this means the user passed a scalar + # that exists in the columns, + # and does not require dynamic evaluation (via the janitor.select function) + # if, over time, the below check becomes unwieldy + # or we have to support too many edge cases, + # then we can abort this step altogether, + # and inform users to + # set flatten_levels to False + # and manage the output afterwards + _ = df.columns.get_loc(values_from) + except (pd.errors.InvalidIndexError, KeyError, TypeError): + values_from_ = None else: - values_from_ = get_index_labels([values_from], df, axis="columns") - - if index is None: + values_from_ = values_from + if (values_from is None) and (index is not None): + index = get_index_labels([index], df, axis="columns") + values_from = df.columns.difference(names_from).difference(index) + elif (values_from is not None) and (index is None): + values_from = get_index_labels([values_from], df, axis="columns") index = df.columns.difference(names_from).difference(values_from) - if index.empty: - index = None - else: - index = list(index) - else: + elif (values_from is None) and (index is None): + values_from = df.columns.difference(names_from) + index = pd.Index([]) + elif (values_from is not None) and (index is not None): + values_from = get_index_labels([values_from], df, axis="columns") index = get_index_labels([index], df, axis="columns") + if index.empty: + index = None + else: index = list(index) - names_from = list(names_from) - if is_scalar(values_from) and (values_from is not None): - if values_from == values_from_[0]: - pass + if values_from_: + values_from = values_from_ else: - values_from = list(values_from_) + values_from = list(values_from) + names_from = list(names_from) check("flatten_levels", flatten_levels, [bool]) - if names_sep is not None: check("names_sep", names_sep, [str]) - if names_glue is not None: check("names_glue", names_glue, [str]) - check("reset_index", reset_index, [bool]) check("names_expand", names_expand, [bool]) check("index_expand", index_expand, [bool]) - return ( df, index, diff --git a/tests/functions/test_pivot_wider.py b/tests/functions/test_pivot_wider.py index 2b81ca956..33141fd4a 100644 --- a/tests/functions/test_pivot_wider.py +++ b/tests/functions/test_pivot_wider.py @@ -92,9 +92,9 @@ def test_names_glue_wrong_label1(df_checks_output): KeyError, match="'variabl' is not a column label in names_from." ): df_checks_output.pivot_wider( - ["geoid", "name"], - "variable", - "estimate", + index=["geoid", "name"], + names_from="variable", + values_from="estimate", names_glue="{variabl}_estimate", ) @@ -309,6 +309,32 @@ def test_no_index_names_from_order(): assert_frame_equal(result, expected_output) +def test_no_index_names_from_order2(): + """Test output if no `index` is supplied and column order is maintained.""" + df_in = pd.DataFrame( + { + "gender": ["Male", "Female", "Female", "Male", "Male"], + "contVar": [22379, 24523, 23421, 23831, 29234], + }, + index=[0, 0, 1, 1, 2], + ) + + expected_output = pd.DataFrame( + { + "Male": [22379.0, 23831.0, 29234.0], + "Female": [24523.0, 23421.0, np.nan], + } + ) + + result = ( + df_in.pivot_wider(names_from="gender", values_from="contVar") + .loc[:, ["Male", "Female"]] + .rename_axis(index=None) + ) + + assert_frame_equal(result, expected_output) + + def test_index_names(): """Test output if index is supplied.""" df = pd.DataFrame( @@ -673,3 +699,35 @@ def test_values_from_is_None_index_is_not_None(): ) actual = df.pivot(index=["subject", "date"], columns="strength") assert_frame_equal(expected, actual) + + +def test_values_from_is_None_index_is_None(): + """ + Test output if + only names_from is provided, + """ + # https://github.com/pyjanitor-devs/pyjanitor/issues/1509 + df = pd.DataFrame( + { + "subject": [1, 1, 1, 2, 2, 2, 2], + "pills": [4, 4, 2, 1, 1, 1, 3], + "date": [ + "10/10/2012", + "10/11/2012", + "10/12/2012", + "1/6/2014", + "1/7/2014", + "1/7/2014", + "1/8/2014", + ], + "strength": [250, 250, 500, 1000, 250, 500, 250], + } + ) + + expected = df.pivot_wider( + names_from="strength", flatten_levels=False + ).sort_index(axis="columns") + expected["subject"] = expected["subject"].apply(pd.to_numeric) + expected["pills"] = expected["pills"].apply(pd.to_numeric) + actual = df.pivot(columns="strength").sort_index(axis="columns") + assert_frame_equal(expected, actual)