From bed7ced0426f1f0f48144397d89154f4052af9e1 Mon Sep 17 00:00:00 2001 From: jreback Date: Mon, 3 Jun 2013 09:22:45 -0400 Subject: [PATCH 1/2] BUG: (GH3740) Groupby transform with item-by-item not upcasting correctly --- RELEASE.rst | 2 ++ pandas/core/groupby.py | 12 +++++++++--- pandas/tests/test_groupby.py | 9 +++++++++ 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index c59a53c7f6c69..bbfc9fb948ef4 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -196,6 +196,7 @@ pandas 0.11.1 - ``DataFrame.to_html`` and ``DataFrame.to_latex`` now accept a path for their first argument (GH3702_) - Fix file tokenization error with \r delimiter and quoted fields (GH3453_) + - Groupby transform with item-by-item not upcasting correctly (GH3740_) .. _GH3164: https://github.com/pydata/pandas/issues/3164 .. _GH2786: https://github.com/pydata/pandas/issues/2786 @@ -278,6 +279,7 @@ pandas 0.11.1 .. _GH3696: https://github.com/pydata/pandas/issues/3696 .. _GH3667: https://github.com/pydata/pandas/issues/3667 .. _GH3733: https://github.com/pydata/pandas/issues/3733 +.. 
_GH3740: https://github.com/pydata/pandas/issues/3740 pandas 0.11.0 ============= diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index d409adfd71158..2032f23030aeb 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1532,6 +1532,8 @@ def transform(self, func, *args, **kwargs): transformed : Series """ result = self.obj.copy() + if hasattr(result,'values'): + result = result.values if isinstance(func, basestring): wrapper = lambda x: getattr(x, func)(*args, **kwargs) @@ -1541,11 +1543,15 @@ def transform(self, func, *args, **kwargs): for name, group in self: object.__setattr__(group, 'name', name) res = wrapper(group) - # result[group.index] = res indexer = self.obj.index.get_indexer(group.index) - np.put(result, indexer, res) + if hasattr(res,'values'): + res = res.values - return result + # need to do a safe put here, as the dtype may be different + # this needs to be an ndarray + result,_ = com._maybe_upcast_indexer(result, indexer, res) + + return self.obj.__class__(result,index=self.obj.index,name=self.obj.name) class NDFrameGroupBy(GroupBy): diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index c56fca49cce48..852f0109058ee 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -616,6 +616,15 @@ def f(x): assert_series_equal(agged, expected, check_dtype=False) self.assert_(issubclass(agged.dtype.type, np.dtype(dtype).type)) + def test_groupby_transform_with_int(self): + + # GH 3740, make sure that we might upcast on item-by-item transform + + df = DataFrame(dict(A = [1,1,1,2,2,2], B = 1, C = [1,2,3,1,2,3], D = 'foo')) + result = df.groupby('A').transform(lambda x: (x-x.mean())/x.std()) + expected = DataFrame(dict(B = np.nan, C = [-1,0,1,-1,0,1])) + assert_frame_equal(result,expected) + def test_indices_concatenation_order(self): # GH 2808 From af77e0e4a107493774212387fdd2edb8562d28a9 Mon Sep 17 00:00:00 2001 From: jreback Date: Mon, 3 Jun 2013 12:08:26 -0400 Subject: [PATCH 2/2] 
BUG: ensure float into function of series transform when item-by-item; downcast result if needed --- pandas/core/groupby.py | 5 +++++ pandas/tests/test_groupby.py | 24 ++++++++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 2032f23030aeb..64606a6e644f9 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1534,6 +1534,7 @@ def transform(self, func, *args, **kwargs): result = self.obj.copy() if hasattr(result,'values'): result = result.values + dtype = result.dtype if isinstance(func, basestring): wrapper = lambda x: getattr(x, func)(*args, **kwargs) @@ -1541,6 +1542,8 @@ def transform(self, func, *args, **kwargs): wrapper = lambda x: func(x, *args, **kwargs) for name, group in self: + + group = com.ensure_float(group) object.__setattr__(group, 'name', name) res = wrapper(group) indexer = self.obj.index.get_indexer(group.index) @@ -1551,6 +1554,8 @@ def transform(self, func, *args, **kwargs): # this needs to be an ndarray result,_ = com._maybe_upcast_indexer(result, indexer, res) + # downcast if we can (and need) + result = _possibly_downcast_to_dtype(result, dtype) return self.obj.__class__(result,index=self.obj.index,name=self.obj.name) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 852f0109058ee..cf62b16a9dd2a 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -620,11 +620,35 @@ def test_groupby_transform_with_int(self): # GH 3740, make sure that we might upcast on item-by-item transform + # floats + df = DataFrame(dict(A = [1,1,1,2,2,2], B = Series(1,dtype='float64'), C = Series([1,2,3,1,2,3],dtype='float64'), D = 'foo')) + result = df.groupby('A').transform(lambda x: (x-x.mean())/x.std()) + expected = DataFrame(dict(B = np.nan, C = Series([-1,0,1,-1,0,1],dtype='float64'))) + assert_frame_equal(result,expected) + + # int case df = DataFrame(dict(A = [1,1,1,2,2,2], B = 1, C = [1,2,3,1,2,3], D = 'foo')) result = 
df.groupby('A').transform(lambda x: (x-x.mean())/x.std()) expected = DataFrame(dict(B = np.nan, C = [-1,0,1,-1,0,1])) assert_frame_equal(result,expected) + # int that needs float conversion + s = Series([2,3,4,10,5,-1]) + df = DataFrame(dict(A = [1,1,1,2,2,2], B = 1, C = s, D = 'foo')) + result = df.groupby('A').transform(lambda x: (x-x.mean())/x.std()) + + s1 = s.iloc[0:3] + s1 = (s1-s1.mean())/s1.std() + s2 = s.iloc[3:6] + s2 = (s2-s2.mean())/s2.std() + expected = DataFrame(dict(B = np.nan, C = concat([s1,s2]))) + assert_frame_equal(result,expected) + + # int downcasting + result = df.groupby('A').transform(lambda x: x*2/2) + expected = DataFrame(dict(B = 1, C = [2,3,4,10,5,-1])) + assert_frame_equal(result,expected) + def test_indices_concatenation_order(self): # GH 2808