Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 40 additions & 30 deletions janitor/functions/pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import numpy as np
import pandas as pd
import pandas_flavor as pf
from pandas.api.types import is_extension_array_dtype, is_scalar
from pandas.api.types import is_extension_array_dtype
from pandas.core.dtypes.concat import concat_compat

from janitor.functions.select import (
Expand Down Expand Up @@ -2110,11 +2110,13 @@ def _computations_pivot_wider(
names_expand,
index_expand,
)

out = df.pivot( # noqa: PD010
index=index, columns=names_from, values=values_from
)

if index is None:
# avoids the KeyError issue that arises from passing index=None
out = df.pivot(columns=names_from, values=values_from) # noqa: PD010
else:
out = df.pivot( # noqa: PD010
index=index, columns=names_from, values=values_from
)
indexer = out.index
if index_expand and index:
any_categoricals = (indexer.get_level_values(name) for name in index)
Expand Down Expand Up @@ -2227,43 +2229,51 @@ def _data_checks_pivot_wider(
"pivot_wider() is missing 1 required argument: 'names_from'"
)
names_from = get_index_labels([names_from], df, axis="columns")

if (values_from is None) and (index is not None):
index_ = get_index_labels([index], df, axis="columns")
values_from_ = df.columns.difference(names_from).difference(index_)
index_ = None
elif values_from is None:
values_from_ = df.columns.difference(names_from)
try:
# hack to align with pd.pivot
# this means the user passed a scalar
# that exists in the columns,
# and does not require dynamic evaluation (via the janitor.select function)
# if, over time, the below check becomes unwieldy
# or we have to support too many edge cases,
# then we can abort this step altogether,
# and inform users to
# set flatten_levels to False
# and manage the output afterwards
_ = df.columns.get_loc(values_from)
except (pd.errors.InvalidIndexError, KeyError, TypeError):
values_from_ = None
else:
values_from_ = get_index_labels([values_from], df, axis="columns")

if index is None:
values_from_ = values_from
if (values_from is None) and (index is not None):
index = get_index_labels([index], df, axis="columns")
values_from = df.columns.difference(names_from).difference(index)
elif (values_from is not None) and (index is None):
values_from = get_index_labels([values_from], df, axis="columns")
index = df.columns.difference(names_from).difference(values_from)
if index.empty:
index = None
else:
index = list(index)
else:
elif (values_from is None) and (index is None):
values_from = df.columns.difference(names_from)
index = pd.Index([])
elif (values_from is not None) and (index is not None):
values_from = get_index_labels([values_from], df, axis="columns")
index = get_index_labels([index], df, axis="columns")
if index.empty:
index = None
else:
index = list(index)
names_from = list(names_from)
if is_scalar(values_from) and (values_from is not None):
if values_from == values_from_[0]:
pass
if values_from_:
values_from = values_from_
else:
values_from = list(values_from_)
values_from = list(values_from)
names_from = list(names_from)
check("flatten_levels", flatten_levels, [bool])

if names_sep is not None:
check("names_sep", names_sep, [str])

if names_glue is not None:
check("names_glue", names_glue, [str])

check("reset_index", reset_index, [bool])
check("names_expand", names_expand, [bool])
check("index_expand", index_expand, [bool])

return (
df,
index,
Expand Down
64 changes: 61 additions & 3 deletions tests/functions/test_pivot_wider.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,9 +92,9 @@ def test_names_glue_wrong_label1(df_checks_output):
KeyError, match="'variabl' is not a column label in names_from."
):
df_checks_output.pivot_wider(
["geoid", "name"],
"variable",
"estimate",
index=["geoid", "name"],
names_from="variable",
values_from="estimate",
names_glue="{variabl}_estimate",
)

Expand Down Expand Up @@ -309,6 +309,32 @@ def test_no_index_names_from_order():
assert_frame_equal(result, expected_output)


def test_no_index_names_from_order2():
    """Check the output when no `index` is passed and row labels repeat.

    The input frame is built with the row labels ``[0, 0, 1, 1, 2]``; the
    pivoted columns are picked in ``Male``, ``Female`` order before the
    comparison.
    """
    genders = ["Male", "Female", "Female", "Male", "Male"]
    readings = [22379, 24523, 23421, 23831, 29234]
    source = pd.DataFrame(
        {"gender": genders, "contVar": readings},
        index=[0, 0, 1, 1, 2],
    )

    wanted = pd.DataFrame(
        {
            "Male": [22379.0, 23831.0, 29234.0],
            "Female": [24523.0, 23421.0, np.nan],
        }
    )

    widened = source.pivot_wider(names_from="gender", values_from="contVar")
    # Pin the column order and clear the index name before comparing.
    widened = widened.loc[:, ["Male", "Female"]]
    widened = widened.rename_axis(index=None)

    assert_frame_equal(widened, wanted)


def test_index_names():
"""Test output if index is supplied."""
df = pd.DataFrame(
Expand Down Expand Up @@ -673,3 +699,35 @@ def test_values_from_is_None_index_is_not_None():
)
actual = df.pivot(index=["subject", "date"], columns="strength")
assert_frame_equal(expected, actual)


def test_values_from_is_None_index_is_None():
    """
    Test output if only `names_from` is provided
    (neither `index` nor `values_from` is passed).

    The unflattened `pivot_wider` result is compared against
    `DataFrame.pivot` called with `columns` alone.
    """
    # https://github.com/pyjanitor-devs/pyjanitor/issues/1509
    records = {
        "subject": [1, 1, 1, 2, 2, 2, 2],
        "pills": [4, 4, 2, 1, 1, 1, 3],
        "date": [
            "10/10/2012",
            "10/11/2012",
            "10/12/2012",
            "1/6/2014",
            "1/7/2014",
            "1/7/2014",
            "1/8/2014",
        ],
        "strength": [250, 250, 500, 1000, 250, 500, 250],
    }
    df = pd.DataFrame(records)

    from_janitor = df.pivot_wider(names_from="strength", flatten_levels=False)
    from_janitor = from_janitor.sort_index(axis="columns")
    # Convert the two value groups to numeric, one group at a time.
    for label in ("subject", "pills"):
        from_janitor[label] = from_janitor[label].apply(pd.to_numeric)

    from_pandas = df.pivot(columns="strength").sort_index(axis="columns")
    assert_frame_equal(from_janitor, from_pandas)
Loading