-
-
Notifications
You must be signed in to change notification settings - Fork 19.1k
PERF: SparseDataFrame._init_dict uses intermediary dict, not DataFrame #16883
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
e785961
caf3a36
31d9b28
b55b1a2
7053de5
83d8140
e0b468f
f41b490
0a98ac9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,7 +12,10 @@ | |
|
||
from pandas.core.dtypes.missing import isnull, notnull | ||
from pandas.core.dtypes.cast import maybe_upcast, find_common_type | ||
from pandas.core.dtypes.common import _ensure_platform_int, is_scipy_sparse | ||
from pandas.core.dtypes.common import ( | ||
_ensure_platform_int, is_scipy_sparse, | ||
is_float, | ||
) | ||
|
||
from pandas.core.common import _try_sort | ||
from pandas.compat.numpy import function as nv | ||
|
@@ -143,7 +146,7 @@ def _init_dict(self, data, index, columns, dtype=None): | |
sp_maker = lambda x: SparseArray(x, kind=self._default_kind, | ||
fill_value=self._default_fill_value, | ||
copy=True, dtype=dtype) | ||
sdict = DataFrame() | ||
sdict = {} | ||
for k, v in compat.iteritems(data): | ||
if isinstance(v, Series): | ||
# Force alignment, no copy necessary | ||
|
@@ -159,15 +162,12 @@ def _init_dict(self, data, index, columns, dtype=None): | |
v = [v.get(i, nan) for i in index] | ||
|
||
v = sp_maker(v) | ||
sdict[k] = v | ||
sdict[_nan_to_np_nan(k)] = v | ||
|
||
# TODO: figure out how to handle this case, all nan's? | ||
# add in any other columns we want to have (completeness) | ||
nan_vec = np.empty(len(index)) | ||
nan_vec.fill(nan) | ||
for c in columns: | ||
if c not in sdict: | ||
sdict[c] = sp_maker(nan_vec) | ||
nan_arr = sp_maker(np.full(len(index), np.nan)) | ||
sdict.update((c, nan_arr) for c in columns if c not in sdict) | ||
|
||
return to_manager(sdict, columns, index) | ||
|
||
|
@@ -846,6 +846,13 @@ def applymap(self, func): | |
return self.apply(lambda x: lmap(func, x)) | ||
|
||
|
||
def _nan_to_np_nan(value): | ||
"""Normalize nan values to singleton np.NaN object so that when NaNs are | ||
|
||
used as dict keys, getitem works. | ||
""" | ||
return np.nan if is_float(value) and isnull(value) else value | ||
|
||
|
||
def to_manager(sdf, columns, index): | ||
""" create and return the block manager from a dataframe of series, | ||
columns, index | ||
|
@@ -855,7 +862,7 @@ def to_manager(sdf, columns, index): | |
axes = [_ensure_index(columns), _ensure_index(index)] | ||
|
||
return create_block_manager_from_arrays( | ||
[sdf[c] for c in columns], columns, axes) | ||
[sdf[_nan_to_np_nan(c)] for c in columns], columns, axes) | ||
|
||
|
||
def stack_sparse_frame(frame): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe I'm being pedantic, but you can't really "fix" performance, only improve it. 😄