diff --git a/doc/source/whatsnew/v0.16.2.txt b/doc/source/whatsnew/v0.16.2.txt index ddfe6fa0b2f74..c59f2317431d5 100644 --- a/doc/source/whatsnew/v0.16.2.txt +++ b/doc/source/whatsnew/v0.16.2.txt @@ -57,7 +57,7 @@ Bug Fixes - Bug where read_hdf store.select modifies the passed columns list when multi-indexed (:issue:`7212`) - Bug in ``Categorical`` repr with ``display.width`` of ``None`` in Python 3 (:issue:`10087`) - +- Bug where some of the nan funcs do not have consistent return dtypes (:issue:`10251`) - Bug in groupby.apply aggregation for Categorical not preserving categories (:issue:`10138`) - Bug in ``mean()`` where integer dtypes can overflow (:issue:`10172`) - Bug where Panel.from_dict does not set dtype when specified (:issue:`10058`) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index c64c50f791edf..c70fb6339517d 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -244,7 +244,10 @@ def nanall(values, axis=None, skipna=True): @bottleneck_switch(zero_value=0) def nansum(values, axis=None, skipna=True): values, mask, dtype, dtype_max = _get_values(values, skipna, 0) - the_sum = values.sum(axis, dtype=dtype_max) + dtype_sum = dtype_max + if is_float_dtype(dtype): + dtype_sum = dtype + the_sum = values.sum(axis, dtype=dtype_sum) the_sum = _maybe_null_out(the_sum, axis, mask) return _wrap_results(the_sum, dtype) @@ -288,7 +291,7 @@ def get_median(x): return np.nan return algos.median(_values_from_object(x[mask])) - if values.dtype != np.float64: + if not is_float_dtype(values): values = values.astype('f8') values[mask] = np.nan @@ -317,10 +320,10 @@ def get_median(x): return _wrap_results(get_median(values) if notempty else np.nan, dtype) -def _get_counts_nanvar(mask, axis, ddof): - count = _get_counts(mask, axis) - - d = count-ddof +def _get_counts_nanvar(mask, axis, ddof, dtype=float): + dtype = _get_dtype(dtype) + count = _get_counts(mask, axis, dtype=dtype) + d = count - dtype.type(ddof) # always return NaN, never inf if np.isscalar(count): @@ -341,7 +344,10 @@ def _nanvar(values, axis=None, skipna=True, ddof=1): if is_any_int_dtype(values): values = values.astype('f8') - count, d = _get_counts_nanvar(mask, axis, ddof) + if is_float_dtype(values): + count, d = _get_counts_nanvar(mask, axis, ddof, values.dtype) + else: + count, d = _get_counts_nanvar(mask, axis, ddof) if skipna: values = values.copy() @@ -349,7 +355,8 @@ def _nanvar(values, axis=None, skipna=True, ddof=1): X = _ensure_numeric(values.sum(axis)) XX = _ensure_numeric((values ** 2).sum(axis)) - return np.fabs((XX - X ** 2 / count) / d) + result = np.fabs((XX - X * X / count) / d) + return result @disallow('M8') @bottleneck_switch(ddof=1) @@ -375,9 +382,9 @@ def nansem(values, axis=None, skipna=True, ddof=1): mask = isnull(values) if not is_float_dtype(values.dtype): values = values.astype('f8') - count, _ = _get_counts_nanvar(mask, axis, ddof) + count, _ = _get_counts_nanvar(mask, axis, ddof, values.dtype) - return np.sqrt(var)/np.sqrt(count) + return np.sqrt(var) / np.sqrt(count) @bottleneck_switch() @@ -469,23 +476,25 @@ def nanskew(values, axis=None, skipna=True): mask = isnull(values) if not is_float_dtype(values.dtype): values = values.astype('f8') - - count = _get_counts(mask, axis) + count = _get_counts(mask, axis) + else: + count = _get_counts(mask, axis, dtype=values.dtype) if skipna: values = values.copy() np.putmask(values, mask, 0) + typ = values.dtype.type A = values.sum(axis) / count - B = (values ** 2).sum(axis) / count - A ** 2 - C = (values ** 3).sum(axis) / count - A ** 3 - 3 * A * B + B = (values ** 2).sum(axis) / count - A ** typ(2) + C = (values ** 3).sum(axis) / count - A ** typ(3) - typ(3) * A * B # floating point error B = _zero_out_fperr(B) C = _zero_out_fperr(C) - result = ((np.sqrt((count ** 2 - count)) * C) / - ((count - 2) * np.sqrt(B) ** 3)) + result = ((np.sqrt(count * count - count) * C) / + ((count - typ(2)) * np.sqrt(B) ** typ(3))) if isinstance(result, np.ndarray): result = np.where(B == 0, 0, result) @@ -504,17 +513,19 @@ def nankurt(values, axis=None, skipna=True): mask = isnull(values) if not is_float_dtype(values.dtype): values = values.astype('f8') - - count = _get_counts(mask, axis) + count = _get_counts(mask, axis) + else: + count = _get_counts(mask, axis, dtype=values.dtype) if skipna: values = values.copy() np.putmask(values, mask, 0) + typ = values.dtype.type A = values.sum(axis) / count - B = (values ** 2).sum(axis) / count - A ** 2 - C = (values ** 3).sum(axis) / count - A ** 3 - 3 * A * B - D = (values ** 4).sum(axis) / count - A ** 4 - 6 * B * A * A - 4 * C * A + B = (values ** 2).sum(axis) / count - A ** typ(2) + C = (values ** 3).sum(axis) / count - A ** typ(3) - typ(3) * A * B + D = (values ** 4).sum(axis) / count - A ** typ(4) - typ(6) * B * A * A - typ(4) * C * A B = _zero_out_fperr(B) D = _zero_out_fperr(D) @@ -526,8 +537,8 @@ def nankurt(values, axis=None, skipna=True): if B == 0: return 0 - result = (((count * count - 1.) * D / (B * B) - 3 * ((count - 1.) ** 2)) / - ((count - 2.) * (count - 3.))) + result = (((count * count - typ(1)) * D / (B * B) - typ(3) * ((count - typ(1)) ** typ(2))) / + ((count - typ(2)) * (count - typ(3)))) if isinstance(result, np.ndarray): result = np.where(B == 0, 0, result) @@ -598,7 +609,7 @@ def _zero_out_fperr(arg): if isinstance(arg, np.ndarray): return np.where(np.abs(arg) < 1e-14, 0, arg) else: - return 0 if np.abs(arg) < 1e-14 else arg + return arg.dtype.type(0) if np.abs(arg) < 1e-14 else arg @disallow('M8','m8') diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index 1adb8a5d9217c..951a693d22280 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -4,7 +4,7 @@ from functools import partial import numpy as np - +from pandas import Series from pandas.core.common import isnull, is_integer_dtype import pandas.core.nanops as nanops import pandas.util.testing as tm @@ -327,7 +327,6 @@ def test_nanmean_overflow(self): # GH 10155 # In the previous implementation mean can overflow for int dtypes, it # is now consistent with numpy - from pandas import Series # numpy < 1.9.0 is not computing this correctly from distutils.version import LooseVersion @@ -340,14 +339,19 @@ def test_nanmean_overflow(self): self.assertEqual(result, np_result) self.assertTrue(result.dtype == np.float64) - # check returned dtype - for dtype in [np.int16, np.int32, np.int64, np.float16, np.float32, np.float64]: + def test_returned_dtype(self): + for dtype in [np.int16, np.int32, np.int64, np.float32, np.float64, np.float128]: s = Series(range(10), dtype=dtype) - result = s.mean() - if is_integer_dtype(dtype): - self.assertTrue(result.dtype == np.float64) - else: - self.assertTrue(result.dtype == dtype) + group_a = ['mean', 'std', 'var', 'skew', 'kurt'] + group_b = ['min', 'max'] + for method in group_a + group_b: + result = getattr(s, method)() + if is_integer_dtype(dtype) and method in group_a: + self.assertTrue(result.dtype == np.float64, + "return dtype expected from %s is np.float64, got %s instead" % (method, result.dtype)) + else: + self.assertTrue(result.dtype == dtype, + "return dtype expected from %s is %s, got %s instead" % (method, dtype, result.dtype)) def test_nanmedian(self): self.check_funs(nanops.nanmedian, np.median, diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index eb583f17f3ace..b2591c7537ad1 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -528,7 +528,6 @@ def test_nansum_buglet(self): assert_almost_equal(result, 1) def test_overflow(self): - # GH 6915 # overflowing on the smaller int dtypes for dtype in ['int32','int64']: @@ -551,25 +550,25 @@ def test_overflow(self): result = s.max() self.assertEqual(int(result),v[-1]) - for dtype in ['float32','float64']: - v = np.arange(5000000,dtype=dtype) + for dtype in ['float32', 'float64']: + v = np.arange(5000000, dtype=dtype) s = Series(v) # no bottleneck result = s.sum(skipna=False) - self.assertTrue(np.allclose(float(result),v.sum(dtype='float64'))) + self.assertEqual(result, v.sum(dtype=dtype)) result = s.min(skipna=False) - self.assertTrue(np.allclose(float(result),0.0)) + self.assertTrue(np.allclose(float(result), 0.0)) result = s.max(skipna=False) - self.assertTrue(np.allclose(float(result),v[-1])) + self.assertTrue(np.allclose(float(result), v[-1])) # use bottleneck if available result = s.sum() - self.assertTrue(np.allclose(float(result),v.sum(dtype='float64'))) + self.assertEqual(result, v.sum(dtype=dtype)) result = s.min() - self.assertTrue(np.allclose(float(result),0.0)) + self.assertTrue(np.allclose(float(result), 0.0)) result = s.max() - self.assertTrue(np.allclose(float(result),v[-1])) + self.assertTrue(np.allclose(float(result), v[-1])) class SafeForSparse(object): pass