From 5fe65260614b0bbe5dbbee93b85ac52310f74da4 Mon Sep 17 00:00:00 2001
From: jreback <jeff@reback.net>
Date: Sat, 1 Jun 2013 15:27:48 -0400
Subject: [PATCH 1/2] PERF: (GH3733) where block splitting now done in at most
 2 blocks rather than

      item-by-item; add vb_suite tests for masking
---
 pandas/core/internals.py   | 56 +++++++++++++++++++++-----------------
 pandas/tests/test_frame.py | 10 +++++++
 vb_suite/frame_methods.py  | 15 ++++++++++
 vb_suite/suite.py          | 40 +++++++++++++--------------
 4 files changed, 76 insertions(+), 45 deletions(-)

diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index 8b711f5e077ce..1ee2f7a44ae74 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -558,42 +558,48 @@ def func(c,v,o):
                     result.fill(np.nan)
                     return result
 
-        def create_block(result, items, transpose=True):
+        # see if we can operate on the entire block, or need to split it into blocks
+        result = func(cond,values,other)
+        if self._can_hold_na:
+
             if not isinstance(result, np.ndarray):
                 raise TypeError('Could not compare [%s] with block values'
                                 % repr(other))
 
-            if transpose and is_transposed:
+            if is_transposed:
                 result = result.T
 
             # try to cast if requested
             if try_cast:
                 result = self._try_cast_result(result)
 
-            return make_block(result, items, self.ref_items)
+            return make_block(result, self.items, self.ref_items)
 
-        # see if we can operate on the entire block, or need item-by-item
-        if not self._can_hold_na:
-            axis = cond.ndim-1
-            result_blocks = []
-            for item in self.items:
-                loc  = self.items.get_loc(item)
-                item = self.items.take([loc])
-                v    = values.take([loc],axis=axis)
-                c    = cond.take([loc],axis=axis)
-                o    = other.take([loc],axis=axis) if hasattr(other,'shape') else other
-
-                result = func(c,v,o)
-                if len(result) == 1:
-                    result = np.repeat(result,self.shape[1:])
-
-                result = _block_shape(result,ndim=self.ndim,shape=self.shape[1:])
-                result_blocks.append(create_block(result, item, transpose=False))
-
-            return result_blocks
-        else:
-            result = func(cond,values,other)
-            return create_block(result, self.items)
+        # might need to separate out blocks
+        axis = cond.ndim-1
+        cond = cond.swapaxes(axis,0)
+        mask = np.array([ cond[i].all() for i in enumerate(range(cond.shape[0]))],dtype=bool)
+        result_blocks = []
+
+        # can do the mask=true as a single block
+        if mask.any():
+            items = self.items[mask]
+            locs  = self.items.get_indexer(items)
+            slices = [slice(None)] * cond.ndim
+            slices[axis] = locs
+            r = self._try_cast_result(result[slices])
+            result_blocks.append(make_block(r.T, items, self.ref_items))
+
+        # and mask=false as a single block
+        if (~mask).any():
+            items = self.items[~mask]
+            locs  = self.items.get_indexer(items)
+            slices = [slice(None)] * cond.ndim
+            slices[axis] = locs
+            r = self._try_cast_result(result[slices])
+            result_blocks.append(make_block(r.T, items, self.ref_items))
+
+        return result_blocks
 
 class NumericBlock(Block):
     is_numeric = True
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
index fa6579ca61358..db9efce368876 100644
--- a/pandas/tests/test_frame.py
+++ b/pandas/tests/test_frame.py
@@ -271,6 +271,16 @@ def test_getitem_boolean_casting(self):
         expected = Series({'float64': 6, 'int32' : 1, 'int64' : 1})
         assert_series_equal(result, expected)
 
+        # where dtype conversions
+        # GH 3733
+        df = DataFrame(data = np.random.randn(1000, 500))
+        df = df.where(df > 0) # create nans
+        bools = df > 0
+        mask = isnull(df)
+        expected = bools.astype(float).mask(mask)
+        result = bools.mask(mask)
+        assert_frame_equal(result,expected)
+
     def test_getitem_boolean_list(self):
         df = DataFrame(np.arange(12).reshape(3, 4))
 
diff --git a/vb_suite/frame_methods.py b/vb_suite/frame_methods.py
index 7745450e5c03b..122851bf91a26 100644
--- a/vb_suite/frame_methods.py
+++ b/vb_suite/frame_methods.py
@@ -177,3 +177,18 @@ def f(K=500):
 """
 
 frame_xs_col = Benchmark('df.xs(50000,axis = 1)', setup)
+
+## masking
+setup = common_setup + """
+data = np.random.randn(1000, 500)
+df = DataFrame(data)
+df = df.where(df > 0) # create nans
+bools = df > 0
+mask = isnull(df)
+"""
+
+mask_bools = Benchmark('bools.mask(mask)', setup,
+                         start_date=datetime(2013,1,1))
+
+mask_floats  = Benchmark('bools.astype(float).mask(mask)', setup,
+                         start_date=datetime(2013,1,1))
diff --git a/vb_suite/suite.py b/vb_suite/suite.py
index 905c4371837cc..339ba8e6d43b9 100644
--- a/vb_suite/suite.py
+++ b/vb_suite/suite.py
@@ -3,27 +3,27 @@
 
 import os
 
-modules = ['attrs_caching',
-           'binary_ops',
-           'ctors',
-           'frame_ctor',
+modules = [#'attrs_caching',
+           #'binary_ops',
+           #'ctors',
+           #'frame_ctor',
            'frame_methods',
-           'groupby',
-           'index_object',
-           'indexing',
-           'io_bench',
-           'hdfstore_bench',
-           'join_merge',
-           'miscellaneous',
-           'panel_ctor',
-           'parser',
-           'reindex',
-           'replace',
-           'sparse',
-           'reshape',
-           'stat_ops',
-           'timeseries']
-
+           #'groupby',
+           #'index_object',
+           #'indexing',
+           #'io_bench',
+           #'hdfstore_bench',
+           #'join_merge',
+           #'miscellaneous',
+           #'panel_ctor',
+           #'parser',
+           #'reindex',
+           #'replace',
+           #'sparse',
+           #'reshape',
+           #'stat_ops',
+           #'timeseries']
+]
 by_module = {}
 benchmarks = []
 

From 8e2d4900621e7f14b42e630d1a7c5d9575b53b24 Mon Sep 17 00:00:00 2001
From: jreback <jeff@reback.net>
Date: Sat, 1 Jun 2013 16:26:31 -0400
Subject: [PATCH 2/2] BUG: _interleave now returns an lcd of int64 or 2 x the
 intsize if

     uints are seen
---
 RELEASE.rst                |  4 ++++
 pandas/core/internals.py   | 43 +++++++++++++++++++++-----------------
 pandas/tests/test_frame.py | 22 ++++++++++++-------
 vb_suite/suite.py          | 40 +++++++++++++++++------------------
 4 files changed, 62 insertions(+), 47 deletions(-)

diff --git a/RELEASE.rst b/RELEASE.rst
index 35741f7eb008f..4573b45ccaf16 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -76,6 +76,7 @@ pandas 0.11.1
     GH3572_). This happens before any drawing takes place which elimnates any
     spurious plots from showing up.
   - Added Faq section on repr display options, to help users customize their setup.
+  - ``where`` operations that result in block splitting are much faster (GH3733_)
 
 **API Changes**
 
@@ -116,6 +117,8 @@ pandas 0.11.1
   - the ``method`` and ``axis`` arguments of ``DataFrame.replace()`` are
     deprecated
   - Implement ``__nonzero__`` for ``NDFrame`` objects (GH3691_, GH3696_)
+  - ``as_matrix`` with mixed signed and unsigned integer dtypes now returns a signed integer dtype
+    twice the width of the unsigned lcd, capped at ``int64``, to avoid precision issues (GH3733_)
 
 **Bug Fixes**
 
@@ -273,6 +276,7 @@ pandas 0.11.1
 .. _GH3691: https://github.com/pydata/pandas/issues/3691
 .. _GH3696: https://github.com/pydata/pandas/issues/3696
 .. _GH3667: https://github.com/pydata/pandas/issues/3667
+.. _GH3733: https://github.com/pydata/pandas/issues/3733
 
 pandas 0.11.0
 =============
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index 1ee2f7a44ae74..af1543dad0314 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -579,25 +579,15 @@ def func(c,v,o):
         axis = cond.ndim-1
         cond = cond.swapaxes(axis,0)
         mask = np.array([ cond[i].all() for i in enumerate(range(cond.shape[0]))],dtype=bool)
-        result_blocks = []
 
-        # can do the mask=true as a single block
-        if mask.any():
-            items = self.items[mask]
-            locs  = self.items.get_indexer(items)
-            slices = [slice(None)] * cond.ndim
-            slices[axis] = locs
-            r = self._try_cast_result(result[slices])
-            result_blocks.append(make_block(r.T, items, self.ref_items))
-
-        # and mask=false as a single block
-        if (~mask).any():
-            items = self.items[~mask]
-            locs  = self.items.get_indexer(items)
-            slices = [slice(None)] * cond.ndim
-            slices[axis] = locs
-            r = self._try_cast_result(result[slices])
-            result_blocks.append(make_block(r.T, items, self.ref_items))
+        result_blocks = []
+        for m in [mask, ~mask]:
+            if m.any():
+                items = self.items[m]
+                slices = [slice(None)] * cond.ndim
+                slices[axis] = self.items.get_indexer(items)
+                r = self._try_cast_result(result[slices])
+                result_blocks.append(make_block(r.T, items, self.ref_items))
 
         return result_blocks
 
@@ -2435,7 +2425,22 @@ def _lcd_dtype(l):
     elif have_bool:
         return np.dtype(bool)
     elif have_int and not have_float and not have_complex:
-        return _lcd_dtype(counts[IntBlock])
+
+        # if we are mixing unsigned and signed, then return
+        # the next biggest int type (if we can)
+        lcd = _lcd_dtype(counts[IntBlock])
+        kinds = set([ i.dtype.kind for i in counts[IntBlock] ])
+        if len(kinds) == 1:
+            return lcd
+
+        if lcd == 'uint64' or lcd == 'int64':
+            return np.dtype('int64')
+
+        # return 1 bigger on the itemsize if unsigned
+        if lcd.kind == 'u':
+            return np.dtype('int%s' % (lcd.itemsize*8*2))
+        return lcd
+    
     elif have_dt64 and not have_float and not have_complex:
         return np.dtype('M8[ns]')
     elif have_complex:
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
index db9efce368876..8964b21756439 100644
--- a/pandas/tests/test_frame.py
+++ b/pandas/tests/test_frame.py
@@ -273,7 +273,7 @@ def test_getitem_boolean_casting(self):
 
         # where dtype conversions
         # GH 3733
-        df = DataFrame(data = np.random.randn(1000, 500))
+        df = DataFrame(data = np.random.randn(100, 50))
         df = df.where(df > 0) # create nans
         bools = df > 0
         mask = isnull(df)
@@ -7578,8 +7578,10 @@ def test_where(self):
 
         def _safe_add(df):
             # only add to the numeric items
-            return DataFrame(dict([ (c,s+1) if issubclass(s.dtype.type, (np.integer,np.floating)) else (c,s) for c, s in df.iteritems() ]))
-
+            def is_ok(s):
+                return issubclass(s.dtype.type, (np.integer,np.floating)) and s.dtype != 'uint8'
+            return DataFrame(dict([ (c,s+1) if is_ok(s) else (c,s) for c, s in df.iteritems() ]))
+        
         def _check_get(df, cond, check_dtypes = True):
             other1 = _safe_add(df)
             rs = df.where(cond, other1)
@@ -7615,7 +7617,7 @@ def _check_get(df, cond, check_dtypes = True):
         def _check_align(df, cond, other, check_dtypes = True):
             rs = df.where(cond, other)
             for i, k in enumerate(rs.columns):
-                v = rs[k]
+                result = rs[k]
                 d = df[k].values
                 c = cond[k].reindex(df[k].index).fillna(False).values
 
@@ -7623,12 +7625,16 @@ def _check_align(df, cond, other, check_dtypes = True):
                     o = other
                 else:
                     if isinstance(other,np.ndarray):
-                        o = Series(other[:,i],index=v.index).values
+                        o = Series(other[:,i],index=result.index).values
                     else:
                         o = other[k].values
 
                 new_values = d if c.all() else np.where(c, d, o)
-                assert_series_equal(v, Series(new_values,index=v.index))
+                expected = Series(new_values,index=result.index)
+
+                # since we can't always have the correct numpy dtype
+                # as numpy doesn't know how to downcast, don't check
+                assert_series_equal(result, expected, check_dtype=False)
 
             # dtypes
             # can't check dtype when other is an ndarray
@@ -9904,14 +9910,14 @@ def test_as_matrix_lcd(self):
         self.assert_(values.dtype == np.float16)
 
         values = self.mixed_int.as_matrix(['A','B','C','D'])
-        self.assert_(values.dtype == np.uint64)
+        self.assert_(values.dtype == np.int64)
 
         values = self.mixed_int.as_matrix(['A','D'])
         self.assert_(values.dtype == np.int64)
 
         # guess all ints are cast to uints....
         values = self.mixed_int.as_matrix(['A','B','C'])
-        self.assert_(values.dtype == np.uint64)
+        self.assert_(values.dtype == np.int64)
 
         values = self.mixed_int.as_matrix(['A','C'])
         self.assert_(values.dtype == np.int32)
diff --git a/vb_suite/suite.py b/vb_suite/suite.py
index 339ba8e6d43b9..905c4371837cc 100644
--- a/vb_suite/suite.py
+++ b/vb_suite/suite.py
@@ -3,27 +3,27 @@
 
 import os
 
-modules = [#'attrs_caching',
-           #'binary_ops',
-           #'ctors',
-           #'frame_ctor',
+modules = ['attrs_caching',
+           'binary_ops',
+           'ctors',
+           'frame_ctor',
            'frame_methods',
-           #'groupby',
-           #'index_object',
-           #'indexing',
-           #'io_bench',
-           #'hdfstore_bench',
-           #'join_merge',
-           #'miscellaneous',
-           #'panel_ctor',
-           #'parser',
-           #'reindex',
-           #'replace',
-           #'sparse',
-           #'reshape',
-           #'stat_ops',
-           #'timeseries']
-]
+           'groupby',
+           'index_object',
+           'indexing',
+           'io_bench',
+           'hdfstore_bench',
+           'join_merge',
+           'miscellaneous',
+           'panel_ctor',
+           'parser',
+           'reindex',
+           'replace',
+           'sparse',
+           'reshape',
+           'stat_ops',
+           'timeseries']
+
 by_module = {}
 benchmarks = []