From 5fe65260614b0bbe5dbbee93b85ac52310f74da4 Mon Sep 17 00:00:00 2001 From: jreback Date: Sat, 1 Jun 2013 15:27:48 -0400 Subject: [PATCH 1/2] PERF: (GH3733) where block splitting now done in at most 2 blocks rather than item-by-item; add vb_suite tests for masking --- pandas/core/internals.py | 56 +++++++++++++++++++++----------------- pandas/tests/test_frame.py | 10 +++++++ vb_suite/frame_methods.py | 15 ++++++++++ vb_suite/suite.py | 40 +++++++++++++-------------- 4 files changed, 76 insertions(+), 45 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 8b711f5e077ce..1ee2f7a44ae74 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -558,42 +558,48 @@ def func(c,v,o): result.fill(np.nan) return result - def create_block(result, items, transpose=True): + # see if we can operate on the entire block, or need item-by-item + result = func(cond,values,other) + if self._can_hold_na: + if not isinstance(result, np.ndarray): raise TypeError('Could not compare [%s] with block values' % repr(other)) - if transpose and is_transposed: + if is_transposed: result = result.T # try to cast if requested if try_cast: result = self._try_cast_result(result) - return make_block(result, items, self.ref_items) + return make_block(result, self.items, self.ref_items) - # see if we can operate on the entire block, or need item-by-item - if not self._can_hold_na: - axis = cond.ndim-1 - result_blocks = [] - for item in self.items: - loc = self.items.get_loc(item) - item = self.items.take([loc]) - v = values.take([loc],axis=axis) - c = cond.take([loc],axis=axis) - o = other.take([loc],axis=axis) if hasattr(other,'shape') else other - - result = func(c,v,o) - if len(result) == 1: - result = np.repeat(result,self.shape[1:]) - - result = _block_shape(result,ndim=self.ndim,shape=self.shape[1:]) - result_blocks.append(create_block(result, item, transpose=False)) - - return result_blocks - else: - result = func(cond,values,other) - return 
create_block(result, self.items) + # might need to separate out blocks + axis = cond.ndim-1 + cond = cond.swapaxes(axis,0) + mask = np.array([ cond[i].all() for i in enumerate(range(cond.shape[0]))],dtype=bool) + result_blocks = [] + + # can do the mask=true as a single block + if mask.any(): + items = self.items[mask] + locs = self.items.get_indexer(items) + slices = [slice(None)] * cond.ndim + slices[axis] = locs + r = self._try_cast_result(result[slices]) + result_blocks.append(make_block(r.T, items, self.ref_items)) + + # and mask=false as a single block + if (~mask).any(): + items = self.items[~mask] + locs = self.items.get_indexer(items) + slices = [slice(None)] * cond.ndim + slices[axis] = locs + r = self._try_cast_result(result[slices]) + result_blocks.append(make_block(r.T, items, self.ref_items)) + + return result_blocks class NumericBlock(Block): is_numeric = True diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index fa6579ca61358..db9efce368876 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -271,6 +271,16 @@ def test_getitem_boolean_casting(self): expected = Series({'float64': 6, 'int32' : 1, 'int64' : 1}) assert_series_equal(result, expected) + # where dtype conversions + # GH 3733 + df = DataFrame(data = np.random.randn(1000, 500)) + df = df.where(df > 0) # create nans + bools = df > 0 + mask = isnull(df) + expected = bools.astype(float).mask(mask) + result = bools.mask(mask) + assert_frame_equal(result,expected) + def test_getitem_boolean_list(self): df = DataFrame(np.arange(12).reshape(3, 4)) diff --git a/vb_suite/frame_methods.py b/vb_suite/frame_methods.py index 7745450e5c03b..122851bf91a26 100644 --- a/vb_suite/frame_methods.py +++ b/vb_suite/frame_methods.py @@ -177,3 +177,18 @@ def f(K=500): """ frame_xs_col = Benchmark('df.xs(50000,axis = 1)', setup) + +## masking +setup = common_setup + """ +data = np.random.randn(1000, 500) +df = DataFrame(data) +df = df.where(df > 0) # create nans +bools = 
df > 0 +mask = isnull(df) +""" + +mask_bools = Benchmark('bools.mask(mask)', setup, + start_date=datetime(2013,1,1)) + +mask_floats = Benchmark('bools.astype(float).mask(mask)', setup, + start_date=datetime(2013,1,1)) diff --git a/vb_suite/suite.py b/vb_suite/suite.py index 905c4371837cc..339ba8e6d43b9 100644 --- a/vb_suite/suite.py +++ b/vb_suite/suite.py @@ -3,27 +3,27 @@ import os -modules = ['attrs_caching', - 'binary_ops', - 'ctors', - 'frame_ctor', +modules = [#'attrs_caching', + #'binary_ops', + #'ctors', + #'frame_ctor', 'frame_methods', - 'groupby', - 'index_object', - 'indexing', - 'io_bench', - 'hdfstore_bench', - 'join_merge', - 'miscellaneous', - 'panel_ctor', - 'parser', - 'reindex', - 'replace', - 'sparse', - 'reshape', - 'stat_ops', - 'timeseries'] - + #'groupby', + #'index_object', + #'indexing', + #'io_bench', + #'hdfstore_bench', + #'join_merge', + #'miscellaneous', + #'panel_ctor', + #'parser', + #'reindex', + #'replace', + #'sparse', + #'reshape', + #'stat_ops', + #'timeseries'] +] by_module = {} benchmarks = [] From 8e2d4900621e7f14b42e630d1a7c5d9575b53b24 Mon Sep 17 00:00:00 2001 From: jreback Date: Sat, 1 Jun 2013 16:26:31 -0400 Subject: [PATCH 2/2] BUG: _interleave now returns an lcd of int64 or 2 x the intsize if uints are seen --- RELEASE.rst | 4 ++++ pandas/core/internals.py | 43 +++++++++++++++++++++----------------- pandas/tests/test_frame.py | 22 ++++++++++++------- vb_suite/suite.py | 40 +++++++++++++++++------------------ 4 files changed, 62 insertions(+), 47 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 35741f7eb008f..4573b45ccaf16 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -76,6 +76,7 @@ pandas 0.11.1 GH3572_). This happens before any drawing takes place which elimnates any spurious plots from showing up. - Added Faq section on repr display options, to help users customize their setup. 
+ - ``where`` operations that result in block splitting are much faster (GH3733_) **API Changes** @@ -116,6 +117,8 @@ pandas 0.11.1 - the ``method`` and ``axis`` arguments of ``DataFrame.replace()`` are deprecated - Implement ``__nonzero__`` for ``NDFrame`` objects (GH3691_, GH3696_) + - ``as_matrix`` with mixed signed and unsigned dtypes will result in 2 x the lcd of the unsigned + as an int, maxing with ``int64``, to avoid precision issues (GH3733_) **Bug Fixes** @@ -273,6 +276,7 @@ pandas 0.11.1 .. _GH3691: https://github.com/pydata/pandas/issues/3691 .. _GH3696: https://github.com/pydata/pandas/issues/3696 .. _GH3667: https://github.com/pydata/pandas/issues/3667 +.. _GH3733: https://github.com/pydata/pandas/issues/3733 pandas 0.11.0 ============= diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 1ee2f7a44ae74..af1543dad0314 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -579,25 +579,15 @@ def func(c,v,o): axis = cond.ndim-1 cond = cond.swapaxes(axis,0) mask = np.array([ cond[i].all() for i in enumerate(range(cond.shape[0]))],dtype=bool) - result_blocks = [] - # can do the mask=true as a single block - if mask.any(): - items = self.items[mask] - locs = self.items.get_indexer(items) - slices = [slice(None)] * cond.ndim - slices[axis] = locs - r = self._try_cast_result(result[slices]) - result_blocks.append(make_block(r.T, items, self.ref_items)) - - # and mask=false as a single block - if (~mask).any(): - items = self.items[~mask] - locs = self.items.get_indexer(items) - slices = [slice(None)] * cond.ndim - slices[axis] = locs - r = self._try_cast_result(result[slices]) - result_blocks.append(make_block(r.T, items, self.ref_items)) + result_blocks = [] + for m in [mask, ~mask]: + if m.any(): + items = self.items[m] + slices = [slice(None)] * cond.ndim + slices[axis] = self.items.get_indexer(items) + r = self._try_cast_result(result[slices]) + result_blocks.append(make_block(r.T, items, self.ref_items)) return 
result_blocks @@ -2435,7 +2425,22 @@ def _lcd_dtype(l): elif have_bool: return np.dtype(bool) elif have_int and not have_float and not have_complex: - return _lcd_dtype(counts[IntBlock]) + + # if we are mixing unsigned and signed, then return + # the next biggest int type (if we can) + lcd = _lcd_dtype(counts[IntBlock]) + kinds = set([ i.dtype.kind for i in counts[IntBlock] ]) + if len(kinds) == 1: + return lcd + + if lcd == 'uint64' or lcd == 'int64': + return np.dtype('int64') + + # return 1 bigger on the itemsize if unsigned + if lcd.kind == 'u': + return np.dtype('int%s' % (lcd.itemsize*8*2)) + return lcd + elif have_dt64 and not have_float and not have_complex: return np.dtype('M8[ns]') elif have_complex: diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index db9efce368876..8964b21756439 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -273,7 +273,7 @@ def test_getitem_boolean_casting(self): # where dtype conversions # GH 3733 - df = DataFrame(data = np.random.randn(1000, 500)) + df = DataFrame(data = np.random.randn(100, 50)) df = df.where(df > 0) # create nans bools = df > 0 mask = isnull(df) @@ -7578,8 +7578,10 @@ def test_where(self): def _safe_add(df): # only add to the numeric items - return DataFrame(dict([ (c,s+1) if issubclass(s.dtype.type, (np.integer,np.floating)) else (c,s) for c, s in df.iteritems() ])) - + def is_ok(s): + return issubclass(s.dtype.type, (np.integer,np.floating)) and s.dtype != 'uint8' + return DataFrame(dict([ (c,s+1) if is_ok(s) else (c,s) for c, s in df.iteritems() ])) + def _check_get(df, cond, check_dtypes = True): other1 = _safe_add(df) rs = df.where(cond, other1) @@ -7615,7 +7617,7 @@ def _check_get(df, cond, check_dtypes = True): def _check_align(df, cond, other, check_dtypes = True): rs = df.where(cond, other) for i, k in enumerate(rs.columns): - v = rs[k] + result = rs[k] d = df[k].values c = cond[k].reindex(df[k].index).fillna(False).values @@ -7623,12 +7625,16 @@ def 
_check_align(df, cond, other, check_dtypes = True): o = other else: if isinstance(other,np.ndarray): - o = Series(other[:,i],index=v.index).values + o = Series(other[:,i],index=result.index).values else: o = other[k].values new_values = d if c.all() else np.where(c, d, o) - assert_series_equal(v, Series(new_values,index=v.index)) + expected = Series(new_values,index=result.index) + + # since we can't always have the correct numpy dtype + # as numpy doesn't know how to downcast, don't check + assert_series_equal(result, expected, check_dtype=False) # dtypes # can't check dtype when other is an ndarray @@ -9904,14 +9910,14 @@ def test_as_matrix_lcd(self): self.assert_(values.dtype == np.float16) values = self.mixed_int.as_matrix(['A','B','C','D']) - self.assert_(values.dtype == np.uint64) + self.assert_(values.dtype == np.int64) values = self.mixed_int.as_matrix(['A','D']) self.assert_(values.dtype == np.int64) # guess all ints are cast to uints.... values = self.mixed_int.as_matrix(['A','B','C']) - self.assert_(values.dtype == np.uint64) + self.assert_(values.dtype == np.int64) values = self.mixed_int.as_matrix(['A','C']) self.assert_(values.dtype == np.int32) diff --git a/vb_suite/suite.py b/vb_suite/suite.py index 339ba8e6d43b9..905c4371837cc 100644 --- a/vb_suite/suite.py +++ b/vb_suite/suite.py @@ -3,27 +3,27 @@ import os -modules = [#'attrs_caching', - #'binary_ops', - #'ctors', - #'frame_ctor', +modules = ['attrs_caching', + 'binary_ops', + 'ctors', + 'frame_ctor', 'frame_methods', - #'groupby', - #'index_object', - #'indexing', - #'io_bench', - #'hdfstore_bench', - #'join_merge', - #'miscellaneous', - #'panel_ctor', - #'parser', - #'reindex', - #'replace', - #'sparse', - #'reshape', - #'stat_ops', - #'timeseries'] -] + 'groupby', + 'index_object', + 'indexing', + 'io_bench', + 'hdfstore_bench', + 'join_merge', + 'miscellaneous', + 'panel_ctor', + 'parser', + 'reindex', + 'replace', + 'sparse', + 'reshape', + 'stat_ops', + 'timeseries'] + by_module = {} 
benchmarks = []