-
-
Notifications
You must be signed in to change notification settings - Fork 19.1k
pivot_table very slow on Categorical data; how about an observed keyword argument? #24923 #24953
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 23 commits
d1554c2
0662fa3
6121313
9f93ab9
ebe5972
416e9c8
5c62063
8663be2
a1e3afe
22637a3
088f277
672847b
d97a077
9de99fa
9a9569f
c8e085d
2516386
0efeed8
13168d2
8518833
58a8f6e
12b8fac
09af30b
6df9e6d
a23b5d0
8d50e85
3d39dff
ee696d9
12c0f82
f586e42
cf7e8f5
a3bcf1a
bb7cfef
5921646
3c1720c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5714,6 +5714,12 @@ def pivot(self, index=None, columns=None, values=None): | |
margins_name : string, default 'All' | ||
Name of the row / column that will contain the totals | ||
when margins is True. | ||
observed : boolean, default False | ||
This only applies if any of the groupers are Categoricals. | ||
If True: only show observed values for categorical groupers. | ||
If False: show all values for categorical groupers. | ||
|
||
.. versionchanged :: 0.25.0 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. versionadded |
||
|
||
Returns | ||
------- | ||
|
@@ -5804,12 +5810,12 @@ def pivot(self, index=None, columns=None, values=None): | |
@Appender(_shared_docs['pivot_table']) | ||
def pivot_table(self, values=None, index=None, columns=None, | ||
aggfunc='mean', fill_value=None, margins=False, | ||
dropna=True, margins_name='All'): | ||
dropna=True, margins_name='All', observed=False): | ||
from pandas.core.reshape.pivot import pivot_table | ||
return pivot_table(self, values=values, index=index, columns=columns, | ||
aggfunc=aggfunc, fill_value=fill_value, | ||
margins=margins, dropna=dropna, | ||
margins_name=margins_name) | ||
margins_name=margins_name, observed=observed) | ||
|
||
def stack(self, level=-1, dropna=True): | ||
""" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,6 +2,7 @@ | |
|
||
from collections import OrderedDict | ||
from datetime import date, datetime, timedelta | ||
import time | ||
|
||
import numpy as np | ||
import pytest | ||
|
@@ -38,18 +39,18 @@ def setup_method(self, method): | |
'E': np.random.randn(11), | ||
'F': np.random.randn(11)}) | ||
|
||
def test_pivot_table(self): | ||
def test_pivot_table(self, observed): | ||
benjaminr marked this conversation as resolved.
Show resolved
Hide resolved
|
||
index = ['A', 'B'] | ||
columns = 'C' | ||
table = pivot_table(self.data, values='D', | ||
index=index, columns=columns) | ||
index=index, columns=columns, observed=observed) | ||
|
||
table2 = self.data.pivot_table( | ||
values='D', index=index, columns=columns) | ||
values='D', index=index, columns=columns, observed=observed) | ||
tm.assert_frame_equal(table, table2) | ||
|
||
# this works | ||
pivot_table(self.data, values='D', index=index) | ||
pivot_table(self.data, values='D', index=index, observed=observed) | ||
|
||
if len(index) > 1: | ||
assert table.index.names == tuple(index) | ||
|
@@ -65,6 +66,48 @@ def test_pivot_table(self): | |
index + [columns])['D'].agg(np.mean).unstack() | ||
tm.assert_frame_equal(table, expected) | ||
|
||
def test_pivot_table_categorical_observed_equal(self, observed): | ||
# issue #24923 | ||
df = pd.DataFrame({'col1': list('abcde'), | ||
benjaminr marked this conversation as resolved.
Show resolved
Hide resolved
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
'col2': list('fghij'), | ||
'col3': [1, 2, 3, 4, 5]}) | ||
|
||
df.col1 = df.col1.astype('category') | ||
df.col2 = df.col1.astype('category') | ||
|
||
expected = df.pivot_table(index='col1', values='col3', | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
columns='col2', aggfunc=np.sum, | ||
fill_value=0) | ||
benjaminr marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
result = df.pivot_table(index='col1', values='col3', | ||
columns='col2', aggfunc=np.sum, | ||
fill_value=0, observed=observed) | ||
|
||
tm.assert_frame_equal(result, expected) | ||
|
||
def test_pivot_table_categorical_observed_speed(self): | ||
# issue #24923 | ||
df = pd.DataFrame({'col1': list('abcde'), | ||
'col2': list('fghij'), | ||
'col3': [1, 2, 3, 4, 5]}) | ||
|
||
df.col1 = df.col1.astype('category') | ||
df.col2 = df.col1.astype('category') | ||
|
||
start_time_observed_false = time.time() | ||
|
||
df.pivot_table(index='col1', values='col3', | ||
columns='col2', aggfunc=np.sum, | ||
fill_value=0, observed=False) | ||
total_time_observed_false = time.time() - start_time_observed_false | ||
|
||
start_time_observed_true = time.time() | ||
df.pivot_table(index='col1', values='col3', | ||
columns='col2', aggfunc=np.sum, | ||
fill_value=0, observed=True) | ||
total_time_observed_true = time.time() - start_time_observed_true | ||
|
||
assert total_time_observed_true < total_time_observed_false | ||
|
||
def test_pivot_table_nocols(self): | ||
df = DataFrame({'rows': ['a', 'b', 'c'], | ||
'cols': ['x', 'y', 'z'], | ||
|
Uh oh!
There was an error while loading. Please reload this page.