From 97327895574db469eb2de518604061d746192579 Mon Sep 17 00:00:00 2001 From: Chang She Date: Tue, 18 Sep 2012 12:11:36 -0400 Subject: [PATCH 1/3] BUG: apply across non-unique indices. Still failing for sparse --- pandas/core/frame.py | 32 +++++++++++++++++++------------- pandas/sparse/frame.py | 3 ++- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index bcfe645d5f14c..501b6ead37b71 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3745,21 +3745,22 @@ def _apply_standard(self, func, axis, ignore_failures=False): pass if axis == 0: - series_gen = ((c, self[c]) for c in self.columns) + series_gen = (self.icol(i) for i in range(len(self.columns))) res_index = self.columns res_columns = self.index elif axis == 1: res_index = self.index res_columns = self.columns - series_gen = ((i, Series(v, self.columns, name=i)) - for i, v in izip(self.index, self.values)) + series_gen = (self.irow(i) for i in range(len(self.index))) + keys = [] results = {} if ignore_failures: successes = [] - for i, (k, v) in enumerate(series_gen): + for i, v in enumerate(series_gen): try: - results[k] = func(v) + results[i] = func(v) + keys.append(v.name) successes.append(i) except Exception: pass @@ -3768,32 +3769,37 @@ def _apply_standard(self, func, axis, ignore_failures=False): res_index = res_index.take(successes) else: try: - for k, v in series_gen: - results[k] = func(v) + for i, v in enumerate(series_gen): + results[i] = func(v) + keys.append(v.name) except Exception, e: try: if hasattr(e, 'args'): + k = res_index[i] e.args = e.args + ('occurred at index %s' % str(k),) except NameError: # pragma: no cover # no k defined yet pass raise - if len(results) > 0 and _is_sequence(results.values()[0]): - if not isinstance(results.values()[0], Series): + if len(results) > 0 and _is_sequence(results[0]): + if not isinstance(results[0], Series): index = res_columns else: index = None - result = self._constructor(data=results, index=index, - columns=res_index) + result = self._constructor(data=results, index=index) + result._set_columns(res_index) if axis == 1: result = result.T + result = result.convert_objects() - return result.convert_objects() + return result else: - return Series(results, index=res_index) + s = Series(results) + s.index = res_index + return s def _apply_broadcast(self, func, axis): if axis == 0: diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py index e89df3e8ed131..c26a37852ea42 100644 --- a/pandas/sparse/frame.py +++ b/pandas/sparse/frame.py @@ -392,7 +392,8 @@ def as_matrix(self, columns=None): if len(columns) == 0: return np.zeros((len(self.index), 0), dtype=float) - return np.array([self[col].values for col in columns]).T + return np.array([self.icol(i).values + for i in range(len(self.columns))]).T values = property(as_matrix) From 546920e002b5d773112249a8200e5ae62b9c15b2 Mon Sep 17 00:00:00 2001 From: Chang She Date: Tue, 18 Sep 2012 15:46:55 -0400 Subject: [PATCH 2/3] BUG: apply non-uq with tests #1878 --- pandas/core/frame.py | 7 +++++-- pandas/sparse/tests/test_sparse.py | 13 +++++++++++++ pandas/tests/test_frame.py | 9 +++++++++ 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 501b6ead37b71..5a793e271fa06 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3751,7 +3751,9 @@ def _apply_standard(self, func, axis, ignore_failures=False): elif axis == 1: res_index = self.index res_columns = self.columns - series_gen = (self.irow(i) for i in range(len(self.index))) + series_gen = (Series(self.values[i], index=res_columns, + name=res_index[i]) + for i in range(len(res_index))) keys = [] results = {} @@ -3789,7 +3791,8 @@ def _apply_standard(self, func, axis, ignore_failures=False): index = None result = self._constructor(data=results, index=index) - result._set_columns(res_index) + result.rename(columns=dict(zip(range(len(res_index)), res_index)), + inplace=True) if axis == 1: result = result.T diff --git a/pandas/sparse/tests/test_sparse.py b/pandas/sparse/tests/test_sparse.py index fd2eace9ec033..c3df935d79792 100644 --- a/pandas/sparse/tests/test_sparse.py +++ b/pandas/sparse/tests/test_sparse.py @@ -1057,6 +1057,19 @@ def test_apply(self): self.assert_(self.empty.apply(np.sqrt) is self.empty) + def test_apply_nonuq(self): + df_orig = DataFrame([[1,2,3], [4,5,6], [7,8,9]], index=['a','a','c']) + df = df_orig.to_sparse() + rs = df.apply(lambda s: s[0], axis=1) + xp = Series([1., 4., 7.], ['a', 'a', 'c']) + assert_series_equal(rs, xp) + + #df.T breaks + df = df_orig.T.to_sparse() + rs = df.apply(lambda s: s[0], axis=0) + #no non-unique columns supported in sparse yet + #assert_series_equal(rs, xp) + def test_applymap(self): # just test that it works result = self.frame.applymap(lambda x: x * 2) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 12fd35ecad02f..9f8179c5cbb3c 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -5134,6 +5134,15 @@ def test_apply(self): expected = Series(np.nan, index=self.frame.index) assert_series_equal(result, expected) + def test_apply_standard_nonunique(self): + df = DataFrame([[1,2,3], [4,5,6], [7,8,9]], index=['a','a','c']) + rs = df.apply(lambda s: s[0], axis=1) + xp = Series([1, 4, 7], ['a', 'a', 'c']) + assert_series_equal(rs, xp) + + rs = df.T.apply(lambda s: s[0], axis=0) + assert_series_equal(rs, xp) + def test_apply_broadcast(self): broadcasted = self.frame.apply(np.mean, broadcast=True) agged = self.frame.apply(np.mean) From 2f1a6871dd904b18ca23767872eb8bf8d8cb83dd Mon Sep 17 00:00:00 2001 From: Chang She Date: Tue, 18 Sep 2012 15:54:12 -0400 Subject: [PATCH 3/3] Only call DataFrame.values once in _apply_standard --- pandas/core/frame.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5a793e271fa06..88dec0c583b2c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3751,7 +3751,8 @@ def _apply_standard(self, func, axis, ignore_failures=False): elif axis == 1: res_index = self.index res_columns = self.columns - series_gen = (Series(self.values[i], index=res_columns, + values = self.values + series_gen = (Series(values[i], index=res_columns, name=res_index[i]) for i in range(len(res_index)))