diff --git a/doc/source/release.rst b/doc/source/release.rst index 0298eda2c78ab..aea6280a490d6 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -37,6 +37,27 @@ analysis / manipulation tool available in any language. * Binary installers on PyPI: http://pypi.python.org/pypi/pandas * Documentation: http://pandas.pydata.org +pandas 0.22.0 +------------- + +**Release date:** December 29, 2017 + +This is a major release from 0.21.1 and includes a single, API-breaking change. +We recommend that all users upgrade to this version after carefully reading the +release note. + +The only changes are: + +- The sum of an empty or all-*NA* ``Series`` is now ``0`` +- The product of an empty or all-*NA* ``Series`` is now ``1`` +- We've added a ``min_count`` parameter to ``.sum()`` and ``.prod()`` controlling + the minimum number of valid values for the result to be valid. If fewer than + ``min_count`` non-*NA* values are present, the result is *NA*. The default is + ``0``. To return ``NaN``, the 0.21 behavior, use ``min_count=1``. + +See the :ref:`v0.22.0 Whatsnew <whatsnew_0220>` overview for further explanation +of all the places in the library this affects. + pandas 0.21.1 ------------- diff --git a/doc/source/whatsnew.rst b/doc/source/whatsnew.rst index 8f779e01a6be5..64cbe0b050a61 100644 --- a/doc/source/whatsnew.rst +++ b/doc/source/whatsnew.rst @@ -18,6 +18,8 @@ What's New These are new features and improvements of note in each release. +.. include:: whatsnew/v0.22.0.txt + .. include:: whatsnew/v0.21.1.txt .. include:: whatsnew/v0.21.0.txt diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 53b052a955b45..da4acd99e3873 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -1,156 +1,220 @@ .. _whatsnew_0220: -v0.22.0 ------- +v0.22.0 (December 29, 2017) +--------------------------- -This is a major release from 0.21.1 and includes a number of API changes, -deprecations, new features, enhancements, and performance improvements along -with a large number of bug fixes. We recommend that all users upgrade to this -version. +This is a major release from 0.21.1 and includes a single, API-breaking change. +We recommend that all users upgrade to this version after carefully reading the +release note (singular!). -.. _whatsnew_0220.enhancements: +.. _whatsnew_0220.api_breaking: -New features -~~~~~~~~~~~~ +Backwards incompatible API changes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- -- -- +Pandas 0.22.0 changes the handling of empty and all-*NA* sums and products. The +summary is that: -.. _whatsnew_0220.enhancements.other: +* The sum of an empty or all-*NA* ``Series`` is now ``0`` +* The product of an empty or all-*NA* ``Series`` is now ``1`` +* We've added a ``min_count`` parameter to ``.sum()`` and ``.prod()`` controlling + the minimum number of valid values for the result to be valid. If fewer than + ``min_count`` non-*NA* values are present, the result is *NA*. The default is + ``0``. To return ``NaN``, the 0.21 behavior, use ``min_count=1``. -Other Enhancements -^^^^^^^^^^^^^^^^^^ +Some background: In pandas 0.21, we fixed a long-standing inconsistency +in the return value of all-*NA* series depending on whether or not bottleneck +was installed. See :ref:`whatsnew_0210.api_breaking.bottleneck`. At the same +time, we changed the sum and prod of an empty ``Series`` to also be ``NaN``. -- -- -- +Based on feedback, we've partially reverted those changes. -.. 
_whatsnew_0220.api_breaking: +Arithmetic Operations +^^^^^^^^^^^^^^^^^^^^^ -Backwards incompatible API changes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The default sum for empty or all-*NA* ``Series`` is now ``0``. -- -- -- +*pandas 0.21.x* -.. _whatsnew_0220.api: +.. code-block:: ipython -Other API Changes -^^^^^^^^^^^^^^^^^ + In [1]: pd.Series([]).sum() + Out[1]: nan -- -- -- + In [2]: pd.Series([np.nan]).sum() + Out[2]: nan -.. _whatsnew_0220.deprecations: +*pandas 0.22.0* -Deprecations -~~~~~~~~~~~~ +.. ipython:: python -- -- -- + pd.Series([]).sum() + pd.Series([np.nan]).sum() -.. _whatsnew_0220.prior_deprecations: +The default behavior is the same as pandas 0.20.3 with bottleneck installed. It +also matches the behavior of NumPy's ``np.nansum`` on empty and all-*NA* arrays. -Removal of prior version deprecations/changes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +To have the sum of an empty series return ``NaN`` (the default behavior of +pandas 0.20.3 without bottleneck, or pandas 0.21.x), use the ``min_count`` +keyword. -- -- -- +.. ipython:: python -.. _whatsnew_0220.performance: + pd.Series([]).sum(min_count=1) -Performance Improvements -~~~~~~~~~~~~~~~~~~~~~~~~ +Thanks to the ``skipna`` parameter, ``.sum`` on an all-*NA* +series is conceptually the same as ``.sum`` on an empty one with +``skipna=True`` (the default). -- -- -- +.. ipython:: python -.. _whatsnew_0220.docs: + pd.Series([np.nan]).sum(min_count=1) # skipna=True by default -Documentation Changes -~~~~~~~~~~~~~~~~~~~~~ +The ``min_count`` parameter refers to the minimum number of *non-null* values +required for a non-NA sum or product. -- -- -- +:meth:`Series.prod` has been updated to behave the same as :meth:`Series.sum`, +returning ``1`` instead of ``NaN``. -.. _whatsnew_0220.bug_fixes: +.. ipython:: python -Bug Fixes -~~~~~~~~~ + pd.Series([]).prod() + pd.Series([np.nan]).prod() + pd.Series([]).prod(min_count=1) -Conversion -^^^^^^^^^^ +These changes affect :meth:`DataFrame.sum` and :meth:`DataFrame.prod` as well. +Finally, a few less obvious places in pandas are affected by this change. -- -- -- +Grouping by a Categorical +^^^^^^^^^^^^^^^^^^^^^^^^^ -Indexing -^^^^^^^^ +Grouping by a ``Categorical`` and summing now returns ``0`` instead of +``NaN`` for categories with no observations. The product now returns ``1`` +instead of ``NaN``. + +*pandas 0.21.x* + +.. code-block:: ipython -- -- -- + In [8]: grouper = pd.Categorical(['a', 'a'], categories=['a', 'b']) -I/O -^^^ + In [9]: pd.Series([1, 2]).groupby(grouper).sum() + Out[9]: + a 3.0 + b NaN + dtype: float64 -- -- -- +*pandas 0.22.0* -Plotting +.. ipython:: python + + grouper = pd.Categorical(['a', 'a'], categories=['a', 'b']) + pd.Series([1, 2]).groupby(grouper).sum() + +To restore the 0.21 behavior of returning ``NaN`` for unobserved groups, +use ``min_count>=1``. + +.. ipython:: python + + pd.Series([1, 2]).groupby(grouper).sum(min_count=1) + +Resample ^^^^^^^^ -- -- -- +The sum and product of all-*NA* bins have changed from ``NaN`` to ``0`` for +sum and ``1`` for product. + +*pandas 0.21.x* + +.. code-block:: ipython + + In [11]: s = pd.Series([1, 1, np.nan, np.nan], + ...: index=pd.date_range('2017', periods=4)) + ...: s + Out[11]: + 2017-01-01 1.0 + 2017-01-02 1.0 + 2017-01-03 NaN + 2017-01-04 NaN + Freq: D, dtype: float64 + + In [12]: s.resample('2d').sum() + Out[12]: + 2017-01-01 2.0 + 2017-01-03 NaN + Freq: 2D, dtype: float64 + +*pandas 0.22.0* + +.. 
ipython:: python + + s = pd.Series([1, 1, np.nan, np.nan], + index=pd.date_range('2017', periods=4)) + s.resample('2d').sum() + +To restore the 0.21 behavior of returning ``NaN``, use ``min_count>=1``. + +.. ipython:: python + + s.resample('2d').sum(min_count=1) + +In particular, upsampling and taking the sum or product is affected, as +upsampling introduces missing values even if the original series was +entirely valid. + +*pandas 0.21.x* + +.. code-block:: ipython + + In [14]: idx = pd.DatetimeIndex(['2017-01-01', '2017-01-02']) + + In [15]: pd.Series([1, 2], index=idx).resample('12H').sum() + Out[15]: + 2017-01-01 00:00:00 1.0 + 2017-01-01 12:00:00 NaN + 2017-01-02 00:00:00 2.0 + Freq: 12H, dtype: float64 + +*pandas 0.22.0* + +.. ipython:: python + + idx = pd.DatetimeIndex(['2017-01-01', '2017-01-02']) + pd.Series([1, 2], index=idx).resample("12H").sum() + +Once again, the ``min_count`` keyword is available to restore the 0.21 behavior. -Groupby/Resample/Rolling -^^^^^^^^^^^^^^^^^^^^^^^^ +.. ipython:: python -- -- -- + pd.Series([1, 2], index=idx).resample("12H").sum(min_count=1) -Sparse -^^^^^^ +Rolling and Expanding +^^^^^^^^^^^^^^^^^^^^^ -- -- -- +Rolling and expanding already have a ``min_periods`` keyword that behaves +similarly to ``min_count``. The only case that changes is when doing a rolling +or expanding sum with ``min_periods=0``. Previously this returned ``NaN`` +when fewer than ``min_periods`` non-*NA* values were in the window. Now it +returns ``0``. -Reshaping -^^^^^^^^^ +*pandas 0.21.1* -- -- -- +.. code-block:: ipython -Numeric -^^^^^^^ + In [17]: s = pd.Series([np.nan, np.nan]) -- -- -- + In [18]: s.rolling(2, min_periods=0).sum() + Out[18]: + 0 NaN + 1 NaN + dtype: float64 -Categorical -^^^^^^^^^^^ +*pandas 0.22.0* -- -- -- +.. ipython:: python -Other -^^^^^ + s = pd.Series([np.nan, np.nan]) + s.rolling(2, min_periods=0).sum() -- -- -- +The default behavior of ``min_periods=None``, implying that ``min_periods`` +equals the window size, is unchanged.
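(Editorial note on the code changes that follow: every change in this patch implements one uniform rule — a reduction is non-*NA* only when at least ``min_count`` non-*NA* values contribute to it, and with the default ``min_count=0`` an empty reduction falls back to the operation's identity, ``0`` for sum and ``1`` for prod. The sketch below is illustrative only; ``reduce_with_min_count`` is a hypothetical helper, not a pandas API. The real implementation lives in ``nanops._maybe_null_out`` and the Cython groupby kernels changed below.)

.. code-block:: python

    import numpy as np

    def reduce_with_min_count(values, op, identity, min_count=0):
        # Illustrative sketch of the min_count rule described in the
        # release note above.
        values = np.asarray(values, dtype='float64')
        valid = values[~np.isnan(values)]
        if valid.size < min_count:
            # Fewer than min_count non-NA values -> result is NA.
            return np.nan
        # An empty reduction (with min_count == 0) falls back to the
        # operation's identity: 0 for sum, 1 for prod.
        return op(valid) if valid.size else identity

    reduce_with_min_count([], np.sum, identity=0.0)            # 0.0
    reduce_with_min_count([np.nan], np.prod, identity=1.0)     # 1.0
    reduce_with_min_count([np.nan], np.sum, 0.0, min_count=1)  # nan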
diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index d38b677df321c..14d47398ac1df 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -36,7 +36,8 @@ def get_dispatch(dtypes): def group_add_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, ndarray[int64_t] counts, ndarray[{{c_type}}, ndim=2] values, - ndarray[int64_t] labels): + ndarray[int64_t] labels, + Py_ssize_t min_count=0): """ Only aggregates on axis=0 """ @@ -88,7 +89,7 @@ def group_add_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] == 0: + if nobs[i, j] < min_count: out[i, j] = NAN else: out[i, j] = sumx[i, j] @@ -99,7 +100,8 @@ def group_add_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, def group_prod_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, ndarray[int64_t] counts, ndarray[{{c_type}}, ndim=2] values, - ndarray[int64_t] labels): + ndarray[int64_t] labels, + Py_ssize_t min_count=0): """ Only aggregates on axis=0 """ @@ -147,7 +149,7 @@ def group_prod_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, for i in range(ncounts): for j in range(K): - if nobs[i, j] == 0: + if nobs[i, j] < min_count: out[i, j] = NAN else: out[i, j] = prodx[i, j] @@ -159,12 +161,15 @@ def group_prod_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, def group_var_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, ndarray[int64_t] counts, ndarray[{{dest_type2}}, ndim=2] values, - ndarray[int64_t] labels): + ndarray[int64_t] labels, + Py_ssize_t min_count=-1): cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) {{dest_type2}} val, ct, oldmean ndarray[{{dest_type2}}, ndim=2] nobs, mean + assert min_count == -1, "'min_count' only used in add and prod" + if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") @@ -208,12 +213,15 @@ def group_var_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, def group_mean_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, ndarray[int64_t] counts, ndarray[{{dest_type2}}, ndim=2] values, - ndarray[int64_t] labels): + ndarray[int64_t] labels, + Py_ssize_t min_count=-1): cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) {{dest_type2}} val, count ndarray[{{dest_type2}}, ndim=2] sumx, nobs + assert min_count == -1, "'min_count' only used in add and prod" + if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") @@ -263,7 +271,8 @@ def group_mean_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, def group_ohlc_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, ndarray[int64_t] counts, ndarray[{{dest_type2}}, ndim=2] values, - ndarray[int64_t] labels): + ndarray[int64_t] labels, + Py_ssize_t min_count=-1): """ Only aggregates on axis=0 """ @@ -272,6 +281,8 @@ def group_ohlc_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, {{dest_type2}} val, count Py_ssize_t ngroups = len(counts) + assert min_count == -1, "'min_count' only used in add and prod" + if len(labels) == 0: return @@ -332,7 +343,8 @@ def get_dispatch(dtypes): def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, ndarray[int64_t] counts, ndarray[{{c_type}}, ndim=2] values, - ndarray[int64_t] labels): + ndarray[int64_t] labels, + Py_ssize_t min_count=-1): """ Only aggregates on axis=0 """ @@ -342,6 +354,8 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, ndarray[{{dest_type2}}, ndim=2] resx ndarray[int64_t, ndim=2] nobs + assert min_count == -1, "'min_count' only used in add and prod" + if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") @@ -382,7 +396,8 
@@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, ndarray[int64_t] counts, ndarray[{{c_type}}, ndim=2] values, - ndarray[int64_t] labels, int64_t rank): + ndarray[int64_t] labels, int64_t rank, + Py_ssize_t min_count=-1): """ Only aggregates on axis=0 """ @@ -392,6 +407,8 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, ndarray[{{dest_type2}}, ndim=2] resx ndarray[int64_t, ndim=2] nobs + assert min_count == -1, "'min_count' only used in add and prod" + if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") @@ -455,7 +472,8 @@ def get_dispatch(dtypes): def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, ndarray[int64_t] counts, ndarray[{{dest_type2}}, ndim=2] values, - ndarray[int64_t] labels): + ndarray[int64_t] labels, + Py_ssize_t min_count=-1): """ Only aggregates on axis=0 """ @@ -464,6 +482,8 @@ def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, {{dest_type2}} val, count ndarray[{{dest_type2}}, ndim=2] maxx, nobs + assert min_count == -1, "'min_count' only used in add and prod" + if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") @@ -526,7 +546,8 @@ def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, ndarray[int64_t] counts, ndarray[{{dest_type2}}, ndim=2] values, - ndarray[int64_t] labels): + ndarray[int64_t] labels, + Py_ssize_t min_count=-1): """ Only aggregates on axis=0 """ @@ -535,6 +556,8 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, {{dest_type2}} val, count ndarray[{{dest_type2}}, ndim=2] minx, nobs + assert min_count == -1, "'min_count' only used in add and prod" + if not len(values) == len(labels): raise AssertionError("len(index) != len(labels)") @@ -686,7 +709,8 @@ def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, def group_median_float64(ndarray[float64_t, ndim=2] out, ndarray[int64_t] counts, ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): + ndarray[int64_t] labels, + Py_ssize_t min_count=-1): """ Only aggregates on axis=0 """ @@ -695,6 +719,9 @@ def group_median_float64(ndarray[float64_t, ndim=2] out, ndarray[int64_t] _counts ndarray data float64_t* ptr + + assert min_count == -1, "'min_count' only used in add and prod" + ngroups = len(counts) N, K = ( values).shape diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index a1c4ddbc8d0b0..3a7a6d54d3851 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -225,14 +225,16 @@ cdef class VariableWindowIndexer(WindowIndexer): right_closed: bint right endpoint closedness True if the right endpoint is closed, False if open - + floor: optional + unit for flooring ``minp`` (its lower bound) """ def __init__(self, ndarray input, int64_t win, int64_t minp, - bint left_closed, bint right_closed, ndarray index): + bint left_closed, bint right_closed, ndarray index, + object floor=None): self.is_variable = 1 self.N = len(index) - self.minp = _check_minp(win, minp, self.N) + self.minp = _check_minp(win, minp, self.N, floor=floor) self.start = np.empty(self.N, dtype='int64') self.start.fill(-1) @@ -347,7 +349,7 @@ def get_window_indexer(input, win, minp, index, closed, if index is not None: indexer = VariableWindowIndexer(input, win, minp, left_closed, - right_closed, index) + right_closed, index, floor) elif use_mock: indexer = MockFixedWindowIndexer(input, win, minp, left_closed, right_closed, index, floor) @@ -446,7 +448,7 @@ def 
roll_sum(ndarray[double_t] input, int64_t win, int64_t minp, object index, object closed): cdef: double val, prev_x, sum_x = 0 - int64_t s, e + int64_t s, e, range_endpoint int64_t nobs = 0, i, j, N bint is_variable ndarray[int64_t] start, end @@ -454,7 +456,8 @@ def roll_sum(ndarray[double_t] input, int64_t win, int64_t minp, start, end, N, win, minp, is_variable = get_window_indexer(input, win, minp, index, - closed) + closed, + floor=0) output = np.empty(N, dtype=float) # for performance we are going to iterate @@ -494,13 +497,15 @@ def roll_sum(ndarray[double_t] input, int64_t win, int64_t minp, # fixed window + range_endpoint = int_max(minp, 1) - 1 + with nogil: - for i in range(0, minp - 1): + for i in range(0, range_endpoint): add_sum(input[i], &nobs, &sum_x) output[i] = NaN - for i in range(minp - 1, N): + for i in range(range_endpoint, N): val = input[i] add_sum(val, &nobs, &sum_x) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 58d86251a4a62..9a2a763cf6def 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6921,7 +6921,8 @@ def _add_numeric_operations(cls): @Substitution(outname='mad', desc="Return the mean absolute deviation of the values " "for the requested axis", - name1=name, name2=name2, axis_descr=axis_descr) + name1=name, name2=name2, axis_descr=axis_descr, + min_count='', examples='') @Appender(_num_doc) def mad(self, axis=None, skipna=None, level=None): if skipna is None: @@ -6962,7 +6963,8 @@ def mad(self, axis=None, skipna=None, level=None): @Substitution(outname='compounded', desc="Return the compound percentage of the values for " "the requested axis", name1=name, name2=name2, - axis_descr=axis_descr) + axis_descr=axis_descr, + min_count='', examples='') @Appender(_num_doc) def compound(self, axis=None, skipna=None, level=None): if skipna is None: @@ -6986,10 +6988,10 @@ def compound(self, axis=None, skipna=None, level=None): lambda y, axis: np.maximum.accumulate(y, axis), "max", -np.inf, np.nan) - cls.sum = _make_stat_function( + cls.sum = _make_min_count_stat_function( cls, 'sum', name, name2, axis_descr, 'Return the sum of the values for the requested axis', - nanops.nansum) + nanops.nansum, _sum_examples) cls.mean = _make_stat_function( cls, 'mean', name, name2, axis_descr, 'Return the mean of the values for the requested axis', @@ -7005,10 +7007,10 @@ def compound(self, axis=None, skipna=None, level=None): "by N-1\n", nanops.nankurt) cls.kurtosis = cls.kurt - cls.prod = _make_stat_function( + cls.prod = _make_min_count_stat_function( cls, 'prod', name, name2, axis_descr, 'Return the product of the values for the requested axis', - nanops.nanprod) + nanops.nanprod, _prod_examples) cls.product = cls.prod cls.median = _make_stat_function( cls, 'median', name, name2, axis_descr, @@ -7139,10 +7141,13 @@ def _doc_parms(cls): numeric_only : boolean, default None Include only float, int, boolean columns. If None, will attempt to use everything, then use only numeric data. Not implemented for Series. +%(min_count)s\ Returns ------- -%(outname)s : %(name1)s or %(name2)s (if level specified)\n""" +%(outname)s : %(name1)s or %(name2)s (if level specified) + +%(examples)s""" _num_ddof_doc = """ @@ -7210,9 +7215,92 @@ def _doc_parms(cls): """ +_sum_examples = """\ +Examples +-------- +By default, the sum of an empty or all-NA Series is ``0``. + +>>> pd.Series([]).sum() # min_count=0 is the default +0.0 + +This can be controlled with the ``min_count`` parameter. 
For example, if +you'd like the sum of an empty series to be NaN, pass ``min_count=1``. + +>>> pd.Series([]).sum(min_count=1) +nan + +Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and +empty series identically. + +>>> pd.Series([np.nan]).sum() +0.0 + +>>> pd.Series([np.nan]).sum(min_count=1) +nan +""" + +_prod_examples = """\ +Examples +-------- +By default, the product of an empty or all-NA Series is ``1``. + +>>> pd.Series([]).prod() +1.0 + +This can be controlled with the ``min_count`` parameter. + +>>> pd.Series([]).prod(min_count=1) +nan + +Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and +empty series identically. + +>>> pd.Series([np.nan]).prod() +1.0 + +>>> pd.Series([np.nan]).prod(min_count=1) +nan +""" + + +_min_count_stub = """\ +min_count : int, default 0 + The required number of valid values to perform the operation. If fewer than + ``min_count`` non-NA values are present the result will be NA. + + .. versionadded :: 0.22.0 + + Added with the default being 0. This means the sum of an all-NA + or empty series is ``0``, and the product is ``1``. +""" + + +def _make_min_count_stat_function(cls, name, name1, name2, axis_descr, desc, + f, examples): + @Substitution(outname=name, desc=desc, name1=name1, name2=name2, + axis_descr=axis_descr, min_count=_min_count_stub, + examples=examples) + @Appender(_num_doc) + def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None, + min_count=0, + **kwargs): + nv.validate_stat_func(tuple(), kwargs, fname=name) + if skipna is None: + skipna = True + if axis is None: + axis = self._stat_axis_number + if level is not None: + return self._agg_by_level(name, axis=axis, level=level, + skipna=skipna, min_count=min_count) + return self._reduce(f, name, axis=axis, skipna=skipna, + numeric_only=numeric_only, min_count=min_count) + + return set_function_name(stat_func, name, cls) + + def _make_stat_function(cls, name, name1, name2, axis_descr, desc, f): @Substitution(outname=name, desc=desc, name1=name1, name2=name2, - axis_descr=axis_descr) + axis_descr=axis_descr, min_count='', examples='') @Appender(_num_doc) def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 5931f6e009dab..aef5ff7ba64d3 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -908,7 +908,8 @@ def _cython_transform(self, how, numeric_only=True): return self._wrap_transformed_output(output, names) - def _cython_agg_general(self, how, alt=None, numeric_only=True): + def _cython_agg_general(self, how, alt=None, numeric_only=True, + min_count=-1): output = {} for name, obj in self._iterate_slices(): is_numeric = is_numeric_dtype(obj.dtype) @@ -916,7 +917,8 @@ def _cython_agg_general(self, how, alt=None, numeric_only=True): continue try: - result, names = self.grouper.aggregate(obj.values, how) + result, names = self.grouper.aggregate(obj.values, how, + min_count=min_count) except AssertionError as e: raise GroupByError(str(e)) output[name] = self._try_cast(result, obj) @@ -1223,7 +1225,8 @@ def _add_numeric_operations(cls): """ add numeric operations to the GroupBy generically """ def groupby_function(name, alias, npfunc, - numeric_only=True, _convert=False): + numeric_only=True, _convert=False, + min_count=-1): _local_template = "Compute %(f)s of group values" @@ -1233,6 +1236,8 @@ def groupby_function(name, alias, npfunc, def f(self, **kwargs): if 'numeric_only' not in kwargs: kwargs['numeric_only'] = numeric_only + if 'min_count' not in 
kwargs: + kwargs['min_count'] = min_count self._set_group_selection() try: return self._cython_agg_general( @@ -1280,8 +1285,8 @@ def last(x): else: return last(x) - cls.sum = groupby_function('sum', 'add', np.sum) - cls.prod = groupby_function('prod', 'prod', np.prod) + cls.sum = groupby_function('sum', 'add', np.sum, min_count=0) + cls.prod = groupby_function('prod', 'prod', np.prod, min_count=0) cls.min = groupby_function('min', 'min', np.min, numeric_only=False) cls.max = groupby_function('max', 'max', np.max, numeric_only=False) cls.first = groupby_function('first', 'first', first_compat, @@ -2107,7 +2112,7 @@ def get_group_levels(self): 'var': 'group_var', 'first': { 'name': 'group_nth', - 'f': lambda func, a, b, c, d: func(a, b, c, d, 1) + 'f': lambda func, a, b, c, d, e: func(a, b, c, d, 1, -1) }, 'last': 'group_last', 'ohlc': 'group_ohlc', @@ -2177,7 +2182,7 @@ def wrapper(*args, **kwargs): (how, dtype_str)) return func, dtype_str - def _cython_operation(self, kind, values, how, axis): + def _cython_operation(self, kind, values, how, axis, min_count=-1): assert kind in ['transform', 'aggregate'] # can we do this operation with our cython functions @@ -2262,11 +2267,12 @@ def _cython_operation(self, kind, values, how, axis): counts = np.zeros(self.ngroups, dtype=np.int64) result = self._aggregate( result, counts, values, labels, func, is_numeric, - is_datetimelike) + is_datetimelike, min_count) elif kind == 'transform': result = _maybe_fill(np.empty_like(values, dtype=out_dtype), fill_value=np.nan) + # TODO: min_count result = self._transform( result, values, labels, func, is_numeric, is_datetimelike) @@ -2303,14 +2309,15 @@ def _cython_operation(self, kind, values, how, axis): return result, names - def aggregate(self, values, how, axis=0): - return self._cython_operation('aggregate', values, how, axis) + def aggregate(self, values, how, axis=0, min_count=-1): + return self._cython_operation('aggregate', values, how, axis, + min_count=min_count) def transform(self, values, how, axis=0): return self._cython_operation('transform', values, how, axis) def _aggregate(self, result, counts, values, comp_ids, agg_func, - is_numeric, is_datetimelike): + is_numeric, is_datetimelike, min_count=-1): if values.ndim > 3: # punting for now raise NotImplementedError("number of dimensions is currently " @@ -2319,9 +2326,10 @@ def _aggregate(self, result, counts, values, comp_ids, agg_func, for i, chunk in enumerate(values.transpose(2, 0, 1)): chunk = chunk.squeeze() - agg_func(result[:, :, i], counts, chunk, comp_ids) + agg_func(result[:, :, i], counts, chunk, comp_ids, + min_count) else: - agg_func(result, counts, values, comp_ids) + agg_func(result, counts, values, comp_ids, min_count) return result @@ -3595,9 +3603,10 @@ def _iterate_slices(self): continue yield val, slicer(val) - def _cython_agg_general(self, how, alt=None, numeric_only=True): + def _cython_agg_general(self, how, alt=None, numeric_only=True, + min_count=-1): new_items, new_blocks = self._cython_agg_blocks( - how, alt=alt, numeric_only=numeric_only) + how, alt=alt, numeric_only=numeric_only, min_count=min_count) return self._wrap_agged_blocks(new_items, new_blocks) def _wrap_agged_blocks(self, items, blocks): @@ -3623,7 +3632,8 @@ def _wrap_agged_blocks(self, items, blocks): _block_agg_axis = 0 - def _cython_agg_blocks(self, how, alt=None, numeric_only=True): + def _cython_agg_blocks(self, how, alt=None, numeric_only=True, + min_count=-1): # TODO: the actual managing of mgr_locs is a PITA # here, it should happen via 
BlockManager.combine @@ -3640,7 +3650,7 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True): locs = block.mgr_locs.as_array try: result, _ = self.grouper.aggregate( - block.values, how, axis=agg_axis) + block.values, how, axis=agg_axis, min_count=min_count) except NotImplementedError: # generally if we have numeric_only=False # and non-applicable functions diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index e1c09947ac0b4..d1a355021f388 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -107,21 +107,14 @@ def f(values, axis=None, skipna=True, **kwds): if k not in kwds: kwds[k] = v try: - if values.size == 0: - - # we either return np.nan or pd.NaT - if is_numeric_dtype(values): - values = values.astype('float64') - fill_value = na_value_for_dtype(values.dtype) - - if values.ndim == 1: - return fill_value - else: - result_shape = (values.shape[:axis] + - values.shape[axis + 1:]) - result = np.empty(result_shape, dtype=values.dtype) - result.fill(fill_value) - return result + if values.size == 0 and kwds.get('min_count') is None: + # We are empty, returning NA for our type + # Only applies for the default `min_count` of None + # since that affects how empty arrays are handled. + # TODO(GH-18976) update all the nanops methods to + # correctly handle empty inputs and remove this check. + # It *may* just be `var` + return _na_for_min_count(values, axis) if (_USE_BOTTLENECK and skipna and _bn_ok_dtype(values.dtype, bn_name)): @@ -292,6 +285,36 @@ def _wrap_results(result, dtype): return result +def _na_for_min_count(values, axis): + """Return the missing value for `values` + + Parameters + ---------- + values : ndarray + axis : int or None + axis for the reduction + + Returns + ------- + result : scalar or ndarray + For 1-D values, returns a scalar of the correct missing type. + For 2-D values, returns a 1-D array where each element is missing. 
+ """ + # we either return np.nan or pd.NaT + if is_numeric_dtype(values): + values = values.astype('float64') + fill_value = na_value_for_dtype(values.dtype) + + if values.ndim == 1: + return fill_value + else: + result_shape = (values.shape[:axis] + + values.shape[axis + 1:]) + result = np.empty(result_shape, dtype=values.dtype) + result.fill(fill_value) + return result + + def nanany(values, axis=None, skipna=True): values, mask, dtype, _ = _get_values(values, skipna, False, copy=skipna) return values.any(axis) @@ -304,7 +327,7 @@ def nanall(values, axis=None, skipna=True): @disallow('M8') @bottleneck_switch() -def nansum(values, axis=None, skipna=True): +def nansum(values, axis=None, skipna=True, min_count=0): values, mask, dtype, dtype_max = _get_values(values, skipna, 0) dtype_sum = dtype_max if is_float_dtype(dtype): @@ -312,7 +335,7 @@ def nansum(values, axis=None, skipna=True): elif is_timedelta64_dtype(dtype): dtype_sum = np.float64 the_sum = values.sum(axis, dtype=dtype_sum) - the_sum = _maybe_null_out(the_sum, axis, mask) + the_sum = _maybe_null_out(the_sum, axis, mask, min_count=min_count) return _wrap_results(the_sum, dtype) @@ -641,13 +664,13 @@ def nankurt(values, axis=None, skipna=True): @disallow('M8', 'm8') -def nanprod(values, axis=None, skipna=True): +def nanprod(values, axis=None, skipna=True, min_count=0): mask = isna(values) if skipna and not is_any_int_dtype(values): values = values.copy() values[mask] = 1 result = values.prod(axis) - return _maybe_null_out(result, axis, mask) + return _maybe_null_out(result, axis, mask, min_count=min_count) def _maybe_arg_null_out(result, axis, mask, skipna): @@ -683,9 +706,9 @@ def _get_counts(mask, axis, dtype=float): return np.array(count, dtype=dtype) -def _maybe_null_out(result, axis, mask): +def _maybe_null_out(result, axis, mask, min_count=1): if axis is not None and getattr(result, 'ndim', False): - null_mask = (mask.shape[axis] - mask.sum(axis)) == 0 + null_mask = (mask.shape[axis] - mask.sum(axis) - min_count) < 0 if np.any(null_mask): if is_numeric_dtype(result): if np.iscomplexobj(result): @@ -698,7 +721,7 @@ def _maybe_null_out(result, axis, mask): result[null_mask] = None elif result is not tslib.NaT: null_mask = mask.size - mask.sum() - if null_mask == 0: + if null_mask < min_count: result = np.nan return result diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 1adb3a078cca3..db1d3d4c5e31b 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -601,9 +601,20 @@ def size(self): Resampler._deprecated_valids += dir(Resampler) + +# downsample methods +for method in ['sum', 'prod']: + + def f(self, _method=method, min_count=0, *args, **kwargs): + nv.validate_resampler_func(_method, args, kwargs) + return self._downsample(_method, min_count=min_count) + f.__doc__ = getattr(GroupBy, method).__doc__ + setattr(Resampler, method, f) + + # downsample methods -for method in ['min', 'max', 'first', 'last', 'sum', 'mean', 'sem', - 'median', 'prod', 'ohlc']: +for method in ['min', 'max', 'first', 'last', 'mean', 'sem', + 'median', 'ohlc']: def f(self, _method=method, *args, **kwargs): nv.validate_resampler_func(_method, args, kwargs) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 1bac4037e99c9..97ab0deb50d50 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -440,7 +440,8 @@ def test_nunique(self): Series({0: 1, 1: 3, 2: 2})) def test_sum(self): - self._check_stat_op('sum', np.sum, has_numeric_only=True) + 
self._check_stat_op('sum', np.sum, has_numeric_only=True, + skipna_alternative=np.nansum) # mixed types (with upcasting happening) self._check_stat_op('sum', np.sum, @@ -716,7 +717,8 @@ def alt(x): def _check_stat_op(self, name, alternative, frame=None, has_skipna=True, has_numeric_only=False, check_dtype=True, - check_dates=False, check_less_precise=False): + check_dates=False, check_less_precise=False, + skipna_alternative=None): if frame is None: frame = self.frame # set some NAs @@ -737,15 +739,11 @@ def _check_stat_op(self, name, alternative, frame=None, has_skipna=True, assert len(result) if has_skipna: - def skipna_wrapper(x): - nona = x.dropna() - if len(nona) == 0: - return np.nan - return alternative(nona) - def wrapper(x): return alternative(x.values) + skipna_wrapper = tm._make_skipna_wrapper(alternative, + skipna_alternative) result0 = f(axis=0, skipna=False) result1 = f(axis=1, skipna=False) tm.assert_series_equal(result0, frame.apply(wrapper), @@ -797,8 +795,11 @@ def wrapper(x): r0 = getattr(all_na, name)(axis=0) r1 = getattr(all_na, name)(axis=1) if name in ['sum', 'prod']: - assert np.isnan(r0).all() - assert np.isnan(r1).all() + unit = int(name == 'prod') + expected = pd.Series(unit, index=r0.index, dtype=r0.dtype) + tm.assert_series_equal(r0, expected) + expected = pd.Series(unit, index=r1.index, dtype=r1.dtype) + tm.assert_series_equal(r1, expected) def test_mode(self): df = pd.DataFrame({"A": [12, 12, 11, 12, 19, 11], @@ -936,6 +937,66 @@ def test_sum_corner(self): assert len(axis0) == 0 assert len(axis1) == 0 + @pytest.mark.parametrize('method, unit', [ + ('sum', 0), + ('prod', 1), + ]) + def test_sum_prod_nanops(self, method, unit): + idx = ['a', 'b', 'c'] + df = pd.DataFrame({"a": [unit, unit], + "b": [unit, np.nan], + "c": [np.nan, np.nan]}) + # The default + result = getattr(df, method)() + expected = pd.Series([unit, unit, unit], index=idx, dtype='float64') + tm.assert_series_equal(result, expected) + + # min_count=1 + result = getattr(df, method)(min_count=1) + expected = pd.Series([unit, unit, np.nan], index=idx) + tm.assert_series_equal(result, expected) + + # min_count=0 + result = getattr(df, method)(min_count=0) + expected = pd.Series([unit, unit, unit], index=idx, dtype='float64') + tm.assert_series_equal(result, expected) + + result = getattr(df.iloc[1:], method)(min_count=1) + expected = pd.Series([unit, np.nan, np.nan], index=idx) + tm.assert_series_equal(result, expected) + + # min_count > 1 + df = pd.DataFrame({"A": [unit] * 10, "B": [unit] * 5 + [np.nan] * 5}) + result = getattr(df, method)(min_count=5) + expected = pd.Series([unit, unit], index=['A', 'B'], dtype='float64') + tm.assert_series_equal(result, expected) + + result = getattr(df, method)(min_count=6) + expected = pd.Series([unit, np.nan], index=['A', 'B']) + tm.assert_series_equal(result, expected) + + def test_sum_nanops_timedelta(self): + # prod isn't defined on timedeltas + idx = ['a', 'b', 'c'] + df = pd.DataFrame({"a": [0, 0], + "b": [0, np.nan], + "c": [np.nan, np.nan]}) + + df2 = df.apply(pd.to_timedelta) + + # 0 by default + result = df2.sum() + expected = pd.Series([0, 0, 0], dtype='m8[ns]', index=idx) + tm.assert_series_equal(result, expected) + + # min_count=0 + result = df2.sum(min_count=0) + tm.assert_series_equal(result, expected) + + # min_count=1 + result = df2.sum(min_count=1) + expected = pd.Series([0, 0, np.nan], dtype='m8[ns]', index=idx) + tm.assert_series_equal(result, expected) + def test_sum_object(self): values = self.frame.values.astype(int) frame = DataFrame(values, index=self.frame.index, diff --git 
a/pandas/tests/groupby/test_aggregate.py b/pandas/tests/groupby/test_aggregate.py index 913d3bcc09869..ad1a322fdaae9 100644 --- a/pandas/tests/groupby/test_aggregate.py +++ b/pandas/tests/groupby/test_aggregate.py @@ -809,26 +809,60 @@ def test__cython_agg_general(self): exc.args += ('operation: %s' % op, ) raise - def test_cython_agg_empty_buckets(self): - ops = [('mean', np.mean), - ('median', lambda x: np.median(x) if len(x) > 0 else np.nan), - ('var', lambda x: np.var(x, ddof=1)), - ('add', lambda x: np.sum(x) if len(x) > 0 else np.nan), - ('prod', np.prod), - ('min', np.min), - ('max', np.max), ] - + @pytest.mark.parametrize('op, targop', [ + ('mean', np.mean), + ('median', lambda x: np.median(x) if len(x) > 0 else np.nan), + ('var', lambda x: np.var(x, ddof=1)), + ('min', np.min), + ('max', np.max), ] + ) + def test_cython_agg_empty_buckets(self, op, targop): df = pd.DataFrame([11, 12, 13]) grps = range(0, 55, 5) - for op, targop in ops: - result = df.groupby(pd.cut(df[0], grps))._cython_agg_general(op) - expected = df.groupby(pd.cut(df[0], grps)).agg(lambda x: targop(x)) - try: - tm.assert_frame_equal(result, expected) - except BaseException as exc: - exc.args += ('operation: %s' % op,) - raise + # calling _cython_agg_general directly, instead of via the user API + # which sets different values for min_count, so do that here. + result = df.groupby(pd.cut(df[0], grps))._cython_agg_general(op) + expected = df.groupby(pd.cut(df[0], grps)).agg(lambda x: targop(x)) + try: + tm.assert_frame_equal(result, expected) + except BaseException as exc: + exc.args += ('operation: %s' % op,) + raise + + def test_cython_agg_empty_buckets_nanops(self): + # GH-18869 can't call nanops on empty groups, so hardcode expected + # for these + df = pd.DataFrame([11, 12, 13], columns=['a']) + grps = range(0, 25, 5) + # add / sum + result = df.groupby(pd.cut(df['a'], grps))._cython_agg_general('add') + intervals = pd.interval_range(0, 20, freq=5) + expected = pd.DataFrame( + {"a": [0, 0, 36, 0]}, + index=pd.CategoricalIndex(intervals, name='a', ordered=True)) + tm.assert_frame_equal(result, expected) + + # prod + result = df.groupby(pd.cut(df['a'], grps))._cython_agg_general('prod') + expected = pd.DataFrame( + {"a": [1, 1, 1716, 1]}, + index=pd.CategoricalIndex(intervals, name='a', ordered=True)) + tm.assert_frame_equal(result, expected) + + @pytest.mark.xfail(reason="GH-18869: agg func not called on empty groups.") + def test_agg_category_nansum(self): + categories = ['a', 'b', 'c'] + df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'], + categories=categories), + 'B': [1, 2, 3]}) + result = df.groupby("A").B.agg(np.nansum) + expected = pd.Series([3, 3, 0], + index=pd.CategoricalIndex(['a', 'b', 'c'], + categories=categories, + name='A'), + name='B') + tm.assert_series_equal(result, expected) def test_agg_over_numpy_arrays(self): # GH 3788 diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index fdc03acd3e931..d4f35aa8755d1 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -17,6 +17,142 @@ class TestGroupByCategorical(MixIn): + def test_groupby(self): + + cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], + categories=["a", "b", "c", "d"], ordered=True) + data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats}) + + exp_index = CategoricalIndex(list('abcd'), name='b', ordered=True) + expected = DataFrame({'a': [1, 2, 4, np.nan]}, index=exp_index) + result = data.groupby("b").mean() + 
tm.assert_frame_equal(result, expected) + + raw_cat1 = Categorical(["a", "a", "b", "b"], + categories=["a", "b", "z"], ordered=True) + raw_cat2 = Categorical(["c", "d", "c", "d"], + categories=["c", "d", "y"], ordered=True) + df = DataFrame({"A": raw_cat1, "B": raw_cat2, "values": [1, 2, 3, 4]}) + + # single grouper + gb = df.groupby("A") + exp_idx = CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True) + expected = DataFrame({'values': Series([3, 7, 0], index=exp_idx)}) + result = gb.sum() + tm.assert_frame_equal(result, expected) + + # multiple groupers + gb = df.groupby(['A', 'B']) + exp_index = pd.MultiIndex.from_product( + [Categorical(["a", "b", "z"], ordered=True), + Categorical(["c", "d", "y"], ordered=True)], + names=['A', 'B']) + expected = DataFrame({'values': [1, 2, np.nan, 3, 4, np.nan, + np.nan, np.nan, np.nan]}, + index=exp_index) + result = gb.sum() + tm.assert_frame_equal(result, expected) + + # multiple groupers with a non-cat + df = df.copy() + df['C'] = ['foo', 'bar'] * 2 + gb = df.groupby(['A', 'B', 'C']) + exp_index = pd.MultiIndex.from_product( + [Categorical(["a", "b", "z"], ordered=True), + Categorical(["c", "d", "y"], ordered=True), + ['foo', 'bar']], + names=['A', 'B', 'C']) + expected = DataFrame({'values': Series( + np.nan, index=exp_index)}).sort_index() + expected.iloc[[1, 2, 7, 8], 0] = [1, 2, 3, 4] + result = gb.sum() + tm.assert_frame_equal(result, expected) + + # GH 8623 + x = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'], + [1, 'John P. Doe']], + columns=['person_id', 'person_name']) + x['person_name'] = Categorical(x.person_name) + + g = x.groupby(['person_id']) + result = g.transform(lambda x: x) + tm.assert_frame_equal(result, x[['person_name']]) + + result = x.drop_duplicates('person_name') + expected = x.iloc[[0, 1]] + tm.assert_frame_equal(result, expected) + + def f(x): + return x.drop_duplicates('person_name').iloc[0] + + result = g.apply(f) + expected = x.iloc[[0, 1]].copy() + expected.index = Index([1, 2], name='person_id') + expected['person_name'] = expected['person_name'].astype('object') + tm.assert_frame_equal(result, expected) + + # GH 9921 + # Monotonic + df = DataFrame({"a": [5, 15, 25]}) + c = pd.cut(df.a, bins=[0, 10, 20, 30, 40]) + + result = df.a.groupby(c).transform(sum) + tm.assert_series_equal(result, df['a']) + + tm.assert_series_equal( + df.a.groupby(c).transform(lambda xs: np.sum(xs)), df['a']) + tm.assert_frame_equal(df.groupby(c).transform(sum), df[['a']]) + tm.assert_frame_equal( + df.groupby(c).transform(lambda xs: np.max(xs)), df[['a']]) + + # Filter + tm.assert_series_equal(df.a.groupby(c).filter(np.all), df['a']) + tm.assert_frame_equal(df.groupby(c).filter(np.all), df) + + # Non-monotonic + df = DataFrame({"a": [5, 15, 25, -5]}) + c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40]) + + result = df.a.groupby(c).transform(sum) + tm.assert_series_equal(result, df['a']) + + tm.assert_series_equal( + df.a.groupby(c).transform(lambda xs: np.sum(xs)), df['a']) + tm.assert_frame_equal(df.groupby(c).transform(sum), df[['a']]) + tm.assert_frame_equal( + df.groupby(c).transform(lambda xs: np.sum(xs)), df[['a']]) + + # GH 9603 + df = DataFrame({'a': [1, 0, 0, 0]}) + c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list('abcd'))) + result = df.groupby(c).apply(len) + + exp_index = CategoricalIndex( + c.values.categories, ordered=c.values.ordered) + expected = Series([1, 0, 0, 0], index=exp_index) + expected.index.name = 'a' + tm.assert_series_equal(result, expected) + + def test_groupby_sort(self): + + # 
http://stackoverflow.com/questions/23814368/sorting-pandas-categorical-labels-after-groupby + # This should result in a properly sorted Series so that the plot + # has a sorted x axis + # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar') + + df = DataFrame({'value': np.random.randint(0, 10000, 100)}) + labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] + cat_labels = Categorical(labels, labels) + + df = df.sort_values(by=['value'], ascending=True) + df['value_group'] = pd.cut(df.value, range(0, 10500, 500), + right=False, labels=cat_labels) + + res = df.groupby(['value_group'])['value_group'].count() + exp = res[sorted(res.index, key=lambda x: float(x.split()[0]))] + exp.index = CategoricalIndex(exp.index, name=exp.index.name) + tm.assert_series_equal(res, exp) + def test_level_groupby_get_group(self): # GH15155 df = DataFrame(data=np.arange(2, 22, 2), @@ -526,3 +662,53 @@ def test_groupby_categorical_two_columns(self): "C3": [nan, nan, nan, nan, 10, 100, nan, nan, nan, nan, 200, 34]}, index=idx) tm.assert_frame_equal(res, exp) + + def test_empty_sum(self): + # https://github.com/pandas-dev/pandas/issues/18678 + df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'], + categories=['a', 'b', 'c']), + 'B': [1, 2, 1]}) + expected_idx = pd.CategoricalIndex(['a', 'b', 'c'], name='A') + + # 0 by default + result = df.groupby("A").B.sum() + expected = pd.Series([3, 1, 0], expected_idx, name='B') + tm.assert_series_equal(result, expected) + + # min_count=0 + result = df.groupby("A").B.sum(min_count=0) + expected = pd.Series([3, 1, 0], expected_idx, name='B') + tm.assert_series_equal(result, expected) + + # min_count=1 + result = df.groupby("A").B.sum(min_count=1) + expected = pd.Series([3, 1, np.nan], expected_idx, name='B') + tm.assert_series_equal(result, expected) + + # min_count>1 + result = df.groupby("A").B.sum(min_count=2) + expected = pd.Series([3, np.nan, np.nan], expected_idx, name='B') + tm.assert_series_equal(result, expected) + + def test_empty_prod(self): + # https://github.com/pandas-dev/pandas/issues/18678 + df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'], + categories=['a', 'b', 'c']), + 'B': [1, 2, 1]}) + + expected_idx = pd.CategoricalIndex(['a', 'b', 'c'], name='A') + + # 1 by default + result = df.groupby("A").B.prod() + expected = pd.Series([2, 1, 1], expected_idx, name='B') + tm.assert_series_equal(result, expected) + + # min_count=0 + result = df.groupby("A").B.prod(min_count=0) + expected = pd.Series([2, 1, 1], expected_idx, name='B') + tm.assert_series_equal(result, expected) + + # min_count=1 + result = df.groupby("A").B.prod(min_count=1) + expected = pd.Series([2, 1, np.nan], expected_idx, name='B') + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 675f8d6413b2a..7a5581c897231 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -3850,7 +3850,7 @@ def h(df, arg3): # Assert the results here index = pd.Index(['A', 'B', 'C'], name='group') - expected = pd.Series([-79.5160891089, -78.4839108911, None], + expected = pd.Series([-79.5160891089, -78.4839108911, -80], index=index) assert_series_equal(expected, result) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index c8503b16a0e16..d359bfa5351a9 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -41,12 +41,11 @@ def 
test_groupby_with_timegrouper(self): df = df.set_index(['Date']) expected = DataFrame( - {'Quantity': np.nan}, + {'Quantity': 0}, index=date_range('20130901 13:00:00', '20131205 13:00:00', freq='5D', name='Date', closed='left')) - expected.iloc[[0, 6, 18], 0] = np.array( - [24., 6., 9.], dtype='float64') + expected.iloc[[0, 6, 18], 0] = np.array([24, 6, 9], dtype='int64') result1 = df.resample('5D') .sum() assert_frame_equal(result1, expected) @@ -245,6 +244,8 @@ def test_timegrouper_with_reg_groups(self): result = df.groupby([pd.Grouper(freq='1M', key='Date')]).sum() assert_frame_equal(result, expected) + @pytest.mark.parametrize('freq', ['D', 'M', 'A', 'Q-APR']) + def test_timegrouper_with_reg_groups_freq(self, freq): # GH 6764 multiple grouping with/without sort df = DataFrame({ 'date': pd.to_datetime([ @@ -258,20 +259,24 @@ def test_timegrouper_with_reg_groups(self): 'cost1': [12, 15, 10, 24, 39, 1, 0, 90, 45, 34, 1, 12] }).set_index('date') - for freq in ['D', 'M', 'A', 'Q-APR']: - expected = df.groupby('user_id')[ - 'whole_cost'].resample( - freq).sum().dropna().reorder_levels( - ['date', 'user_id']).sort_index().astype('int64') - expected.name = 'whole_cost' - - result1 = df.sort_index().groupby([pd.Grouper(freq=freq), - 'user_id'])['whole_cost'].sum() - assert_series_equal(result1, expected) - - result2 = df.groupby([pd.Grouper(freq=freq), 'user_id'])[ - 'whole_cost'].sum() - assert_series_equal(result2, expected) + expected = ( + df.groupby('user_id')['whole_cost'] + .resample(freq) + .sum(min_count=1) # XXX + .dropna() + .reorder_levels(['date', 'user_id']) + .sort_index() + .astype('int64') + ) + expected.name = 'whole_cost' + + result1 = df.sort_index().groupby([pd.Grouper(freq=freq), + 'user_id'])['whole_cost'].sum() + assert_series_equal(result1, expected) + + result2 = df.groupby([pd.Grouper(freq=freq), 'user_id'])[ + 'whole_cost'].sum() + assert_series_equal(result2, expected) def test_timegrouper_get_group(self): # GH 6914 diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 2ee404ab5fe0d..d6db2ab83098b 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -28,40 +28,124 @@ class TestSeriesAnalytics(TestData): @pytest.mark.parametrize("use_bottleneck", [True, False]) - @pytest.mark.parametrize("method", ["sum", "prod"]) - def test_empty(self, method, use_bottleneck): - + @pytest.mark.parametrize("method, unit", [ + ("sum", 0.0), + ("prod", 1.0) + ]) + def test_empty(self, method, unit, use_bottleneck): with pd.option_context("use_bottleneck", use_bottleneck): - # GH 9422 - # treat all missing as NaN + # GH 9422 / 18921 + # Entirely empty s = Series([]) + # unit by default result = getattr(s, method)() + assert result == unit + + # Explicit + result = getattr(s, method)(min_count=0) + assert result == unit + + result = getattr(s, method)(min_count=1) assert isna(result) + # Skipna, default result = getattr(s, method)(skipna=True) + assert result == unit + + # Skipna, explicit + result = getattr(s, method)(skipna=True, min_count=0) + assert result == unit + + result = getattr(s, method)(skipna=True, min_count=1) assert isna(result) + # All-NA s = Series([np.nan]) + # unit by default result = getattr(s, method)() + assert result == unit + + # Explicit + result = getattr(s, method)(min_count=0) + assert result == unit + + result = getattr(s, method)(min_count=1) assert isna(result) + # Skipna, default result = getattr(s, method)(skipna=True) + assert result == unit + + # Skipna, explicit + result = getattr(s, 
method)(skipna=True, min_count=0) + assert result == unit + + result = getattr(s, method)(skipna=True, min_count=1) assert isna(result) + # Mix of valid, empty s = Series([np.nan, 1]) + # Default result = getattr(s, method)() assert result == 1.0 - s = Series([np.nan, 1]) + # Explicit + result = getattr(s, method)(min_count=0) + assert result == 1.0 + + result = getattr(s, method)(min_count=1) + assert result == 1.0 + + # Skipna result = getattr(s, method)(skipna=True) assert result == 1.0 + result = getattr(s, method)(skipna=True, min_count=0) + assert result == 1.0 + + result = getattr(s, method)(skipna=True, min_count=1) + assert result == 1.0 + # GH #844 (changed in 9422) df = DataFrame(np.empty((10, 0))) - assert (df.sum(1).isnull()).all() + assert (getattr(df, method)(1) == unit).all() + + s = pd.Series([1]) + result = getattr(s, method)(min_count=2) + assert isna(result) + + s = pd.Series([np.nan]) + result = getattr(s, method)(min_count=2) + assert isna(result) + + s = pd.Series([np.nan, 1]) + result = getattr(s, method)(min_count=2) + assert isna(result) + + @pytest.mark.parametrize('method, unit', [ + ('sum', 0.0), + ('prod', 1.0), + ]) + def test_empty_multi(self, method, unit): + s = pd.Series([1, np.nan, np.nan, np.nan], + index=pd.MultiIndex.from_product([('a', 'b'), (0, 1)])) + # 1 / 0 by default + result = getattr(s, method)(level=0) + expected = pd.Series([1, unit], index=['a', 'b']) + tm.assert_series_equal(result, expected) + + # min_count=0 + result = getattr(s, method)(level=0, min_count=0) + expected = pd.Series([1, unit], index=['a', 'b']) + tm.assert_series_equal(result, expected) + + # min_count=1 + result = getattr(s, method)(level=0, min_count=1) + expected = pd.Series([1, np.nan], index=['a', 'b']) + tm.assert_series_equal(result, expected) @pytest.mark.parametrize( - "method", ['sum', 'mean', 'median', 'std', 'var']) + "method", ['mean', 'median', 'std', 'var']) def test_ops_consistency_on_empty(self, method): # GH 7869 @@ -109,7 +193,7 @@ def test_sum_overflow(self, use_bottleneck): assert np.allclose(float(result), v[-1]) def test_sum(self): - self._check_stat_op('sum', np.sum, check_allna=True) + self._check_stat_op('sum', np.sum, check_allna=False) def test_sum_inf(self): s = Series(np.random.randn(10)) diff --git a/pandas/tests/series/test_quantile.py b/pandas/tests/series/test_quantile.py index cf5e3fe4f29b0..255367523a3d8 100644 --- a/pandas/tests/series/test_quantile.py +++ b/pandas/tests/series/test_quantile.py @@ -38,7 +38,7 @@ def test_quantile(self): # GH7661 result = Series([np.timedelta64('NaT')]).sum() - assert result is pd.NaT + assert result == pd.Timedelta(0) msg = 'percentiles should all be in the interval \\[0, 1\\]' for invalid in [-1, 2, [0.5, -1], [0.5, 2]]: diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 6366aae8ccdf6..48c1622aa0c4e 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -3163,18 +3163,6 @@ def test_info(self): buf = compat.StringIO() df2.info(buf=buf) - def test_groupby_sort(self): - - # http://stackoverflow.com/questions/23814368/sorting-pandas-categorical-labels-after-groupby - # This should result in a properly sorted Series so that the plot - # has a sorted x axis - # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar') - - res = self.cat.groupby(['value_group'])['value_group'].count() - exp = res[sorted(res.index, key=lambda x: float(x.split()[0]))] - exp.index = pd.CategoricalIndex(exp.index, name=exp.index.name) - 
tm.assert_series_equal(res, exp) - def test_min_max(self): # unordered cats have no min/max cat = Series(Categorical(["a", "b", "c", "d"], ordered=False)) @@ -3294,123 +3282,6 @@ def test_value_counts_with_nan(self): res = s.value_counts(dropna=False, sort=False) tm.assert_series_equal(res, exp) - def test_groupby(self): - - cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], - categories=["a", "b", "c", "d"], ordered=True) - data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats}) - - exp_index = pd.CategoricalIndex(['a', 'b', 'c', 'd'], name='b', - ordered=True) - expected = DataFrame({'a': [1, 2, 4, np.nan]}, index=exp_index) - result = data.groupby("b").mean() - tm.assert_frame_equal(result, expected) - - raw_cat1 = Categorical(["a", "a", "b", "b"], - categories=["a", "b", "z"], ordered=True) - raw_cat2 = Categorical(["c", "d", "c", "d"], - categories=["c", "d", "y"], ordered=True) - df = DataFrame({"A": raw_cat1, "B": raw_cat2, "values": [1, 2, 3, 4]}) - - # single grouper - gb = df.groupby("A") - exp_idx = pd.CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True) - expected = DataFrame({'values': Series([3, 7, np.nan], index=exp_idx)}) - result = gb.sum() - tm.assert_frame_equal(result, expected) - - # multiple groupers - gb = df.groupby(['A', 'B']) - exp_index = pd.MultiIndex.from_product( - [Categorical(["a", "b", "z"], ordered=True), - Categorical(["c", "d", "y"], ordered=True)], - names=['A', 'B']) - expected = DataFrame({'values': [1, 2, np.nan, 3, 4, np.nan, - np.nan, np.nan, np.nan]}, - index=exp_index) - result = gb.sum() - tm.assert_frame_equal(result, expected) - - # multiple groupers with a non-cat - df = df.copy() - df['C'] = ['foo', 'bar'] * 2 - gb = df.groupby(['A', 'B', 'C']) - exp_index = pd.MultiIndex.from_product( - [Categorical(["a", "b", "z"], ordered=True), - Categorical(["c", "d", "y"], ordered=True), - ['foo', 'bar']], - names=['A', 'B', 'C']) - expected = DataFrame({'values': Series( - np.nan, index=exp_index)}).sort_index() - expected.iloc[[1, 2, 7, 8], 0] = [1, 2, 3, 4] - result = gb.sum() - tm.assert_frame_equal(result, expected) - - # GH 8623 - x = pd.DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'], - [1, 'John P. 
Doe']], - columns=['person_id', 'person_name']) - x['person_name'] = pd.Categorical(x.person_name) - - g = x.groupby(['person_id']) - result = g.transform(lambda x: x) - tm.assert_frame_equal(result, x[['person_name']]) - - result = x.drop_duplicates('person_name') - expected = x.iloc[[0, 1]] - tm.assert_frame_equal(result, expected) - - def f(x): - return x.drop_duplicates('person_name').iloc[0] - - result = g.apply(f) - expected = x.iloc[[0, 1]].copy() - expected.index = Index([1, 2], name='person_id') - expected['person_name'] = expected['person_name'].astype('object') - tm.assert_frame_equal(result, expected) - - # GH 9921 - # Monotonic - df = DataFrame({"a": [5, 15, 25]}) - c = pd.cut(df.a, bins=[0, 10, 20, 30, 40]) - - result = df.a.groupby(c).transform(sum) - tm.assert_series_equal(result, df['a']) - - tm.assert_series_equal( - df.a.groupby(c).transform(lambda xs: np.sum(xs)), df['a']) - tm.assert_frame_equal(df.groupby(c).transform(sum), df[['a']]) - tm.assert_frame_equal( - df.groupby(c).transform(lambda xs: np.max(xs)), df[['a']]) - - # Filter - tm.assert_series_equal(df.a.groupby(c).filter(np.all), df['a']) - tm.assert_frame_equal(df.groupby(c).filter(np.all), df) - - # Non-monotonic - df = DataFrame({"a": [5, 15, 25, -5]}) - c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40]) - - result = df.a.groupby(c).transform(sum) - tm.assert_series_equal(result, df['a']) - - tm.assert_series_equal( - df.a.groupby(c).transform(lambda xs: np.sum(xs)), df['a']) - tm.assert_frame_equal(df.groupby(c).transform(sum), df[['a']]) - tm.assert_frame_equal( - df.groupby(c).transform(lambda xs: np.sum(xs)), df[['a']]) - - # GH 9603 - df = pd.DataFrame({'a': [1, 0, 0, 0]}) - c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=pd.Categorical(list('abcd'))) - result = df.groupby(c).apply(len) - - exp_index = pd.CategoricalIndex(c.values.categories, - ordered=c.values.ordered) - expected = pd.Series([1, 0, 0, 0], index=exp_index) - expected.index.name = 'a' - tm.assert_series_equal(result, expected) - def test_pivot_table(self): raw_cat1 = Categorical(["a", "a", "b", "b"], diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py index 6d2607962dfb0..aebc9cd3deaac 100644 --- a/pandas/tests/test_expressions.py +++ b/pandas/tests/test_expressions.py @@ -73,17 +73,11 @@ def teardown_method(self, method): def run_arithmetic(self, df, other, assert_func, check_dtype=False, test_flex=True): expr._MIN_ELEMENTS = 0 - operations = ['add', 'sub', 'mul', 'mod', 'truediv', 'floordiv', 'pow'] + operations = ['add', 'sub', 'mul', 'mod', 'truediv', 'floordiv'] if not compat.PY3: operations.append('div') for arith in operations: - # numpy >= 1.11 doesn't handle integers - # raised to integer powers - # https://github.com/pandas-dev/pandas/issues/15363 - if arith == 'pow' and not _np_version_under1p11: - continue - operator_name = arith if arith == 'div': operator_name = 'truediv' diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index 9305504f8d5e3..5d56088193d30 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- from __future__ import division, print_function +from distutils.version import LooseVersion from functools import partial import pytest @@ -181,12 +182,17 @@ def _coerce_tds(targ, res): check_dtype=check_dtype) def check_fun_data(self, testfunc, targfunc, testarval, targarval, - targarnanval, check_dtype=True, **kwargs): + targarnanval, check_dtype=True, empty_targfunc=None, + **kwargs): for axis in 
         for axis in list(range(targarval.ndim)) + [None]:
             for skipna in [False, True]:
                 targartempval = targarval if skipna else targarnanval
-                try:
+                if skipna and empty_targfunc and isna(targartempval).all():
+                    targ = empty_targfunc(targartempval, axis=axis, **kwargs)
+                else:
                     targ = targfunc(targartempval, axis=axis, **kwargs)
+
+                try:
                     res = testfunc(testarval, axis=axis, skipna=skipna,
                                    **kwargs)
                     self.check_results(targ, res, axis,
@@ -218,10 +224,11 @@ def check_fun_data(self, testfunc, targfunc, testarval, targarval,
             except ValueError:
                 return
         self.check_fun_data(testfunc, targfunc, testarval2, targarval2,
-                            targarnanval2, check_dtype=check_dtype, **kwargs)
+                            targarnanval2, check_dtype=check_dtype,
+                            empty_targfunc=empty_targfunc, **kwargs)
 
     def check_fun(self, testfunc, targfunc, testar, targar=None,
-                  targarnan=None, **kwargs):
+                  targarnan=None, empty_targfunc=None, **kwargs):
         if targar is None:
             targar = testar
         if targarnan is None:
@@ -231,7 +238,8 @@ def check_fun(self, testfunc, targfunc, testar, targar=None,
             targarnanval = getattr(self, targarnan)
         try:
             self.check_fun_data(testfunc, targfunc, testarval, targarval,
-                                targarnanval, **kwargs)
+                                targarnanval, empty_targfunc=empty_targfunc,
+                                **kwargs)
         except BaseException as exc:
             exc.args += ('testar: %s' % testar, 'targar: %s' % targar,
                          'targarnan: %s' % targarnan)
@@ -328,7 +336,8 @@ def test_nanall(self):
 
     def test_nansum(self):
         self.check_funs(nanops.nansum, np.sum, allow_str=False,
-                        allow_date=False, allow_tdelta=True, check_dtype=False)
+                        allow_date=False, allow_tdelta=True, check_dtype=False,
+                        empty_targfunc=np.nansum)
 
     def test_nanmean(self):
         self.check_funs(nanops.nanmean, np.mean, allow_complex=False,
@@ -461,8 +470,12 @@ def test_nankurt(self):
                         allow_tdelta=False)
 
     def test_nanprod(self):
+        if LooseVersion(np.__version__) < LooseVersion("1.10.0"):
+            raise pytest.skip("np.nanprod added in 1.10.0")
+
         self.check_funs(nanops.nanprod, np.prod, allow_str=False,
-                        allow_date=False, allow_tdelta=False)
+                        allow_date=False, allow_tdelta=False,
+                        empty_targfunc=np.nanprod)
 
     def check_nancorr_nancov_2d(self, checkfun, targ0, targ1, **kwargs):
         res00 = checkfun(self.arr_float_2d, self.arr_float1_2d, **kwargs)
diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py
index 33fb6f1108bf2..7e442fcc2fc8b 100644
--- a/pandas/tests/test_panel.py
+++ b/pandas/tests/test_panel.py
@@ -3,6 +3,7 @@
 
 from warnings import catch_warnings
 from datetime import datetime
+from distutils.version import LooseVersion
 import operator
 import pytest
@@ -10,7 +11,6 @@
 
 import pandas as pd
 from pandas.core.dtypes.common import is_float_dtype
-from pandas.core.dtypes.missing import remove_na_arraylike
 from pandas import (Series, DataFrame, Index, date_range, isna, notna,
                     pivot, MultiIndex)
 from pandas.core.nanops import nanall, nanany
@@ -83,13 +83,16 @@ def test_count(self):
         self._check_stat_op('count', f, obj=self.panel, has_skipna=False)
 
     def test_sum(self):
-        self._check_stat_op('sum', np.sum)
+        self._check_stat_op('sum', np.sum, skipna_alternative=np.nansum)
 
     def test_mean(self):
         self._check_stat_op('mean', np.mean)
 
     def test_prod(self):
-        self._check_stat_op('prod', np.prod)
+        if LooseVersion(np.__version__) < LooseVersion("1.10.0"):
+            raise pytest.skip("np.nanprod added in 1.10.0")
+
+        self._check_stat_op('prod', np.prod, skipna_alternative=np.nanprod)
 
     def test_median(self):
         def wrapper(x):
@@ -142,7 +145,8 @@ def alt(x):
 
         self._check_stat_op('sem', alt)
 
-    def _check_stat_op(self, name, alternative, obj=None, has_skipna=True):
+    def _check_stat_op(self, name, alternative, obj=None, has_skipna=True,
+                       skipna_alternative=None):
         if obj is None:
             obj = self.panel
@@ -154,11 +158,8 @@ def _check_stat_op(self, name, alternative, obj=None, has_skipna=True):
 
         if has_skipna:
-            def skipna_wrapper(x):
-                nona = remove_na_arraylike(x)
-                if len(nona) == 0:
-                    return np.nan
-                return alternative(nona)
+            skipna_wrapper = tm._make_skipna_wrapper(alternative,
+                                                     skipna_alternative)
 
             def wrapper(x):
                 return alternative(np.asarray(x))
diff --git a/pandas/tests/test_panel4d.py b/pandas/tests/test_panel4d.py
index c0e8770dff8b8..ef19f11499e00 100644
--- a/pandas/tests/test_panel4d.py
+++ b/pandas/tests/test_panel4d.py
@@ -4,11 +4,11 @@
 import operator
 import pytest
 from warnings import catch_warnings
+from distutils.version import LooseVersion
 
 import numpy as np
 from pandas import Series, Index, isna, notna
 from pandas.core.dtypes.common import is_float_dtype
-from pandas.core.dtypes.missing import remove_na_arraylike
 from pandas.core.panel import Panel
 from pandas.core.panel4d import Panel4D
 from pandas.tseries.offsets import BDay
@@ -37,13 +37,16 @@ def test_count(self):
         self._check_stat_op('count', f, obj=self.panel4d, has_skipna=False)
 
     def test_sum(self):
-        self._check_stat_op('sum', np.sum)
+        self._check_stat_op('sum', np.sum, skipna_alternative=np.nansum)
 
     def test_mean(self):
         self._check_stat_op('mean', np.mean)
 
     def test_prod(self):
-        self._check_stat_op('prod', np.prod)
+        if LooseVersion(np.__version__) < LooseVersion("1.10.0"):
+            raise pytest.skip("np.nanprod added in 1.10.0")
+
+        self._check_stat_op('prod', np.prod, skipna_alternative=np.nanprod)
 
     def test_median(self):
         def wrapper(x):
@@ -106,7 +109,8 @@ def alt(x):
 
         # self._check_stat_op('skew', alt)
 
-    def _check_stat_op(self, name, alternative, obj=None, has_skipna=True):
+    def _check_stat_op(self, name, alternative, obj=None, has_skipna=True,
+                       skipna_alternative=None):
         if obj is None:
             obj = self.panel4d
@@ -117,11 +121,9 @@ def _check_stat_op(self, name, alternative, obj=None, has_skipna=True):
         f = getattr(obj, name)
 
         if has_skipna:
-            def skipna_wrapper(x):
-                nona = remove_na_arraylike(x)
-                if len(nona) == 0:
-                    return np.nan
-                return alternative(nona)
+
+            skipna_wrapper = tm._make_skipna_wrapper(alternative,
+                                                     skipna_alternative)
 
             def wrapper(x):
                 return alternative(np.asarray(x))
diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py
index e64bf2217e717..04e702644913f 100644
--- a/pandas/tests/test_resample.py
+++ b/pandas/tests/test_resample.py
@@ -4,6 +4,7 @@
 from datetime import datetime, timedelta
 from functools import partial
 from textwrap import dedent
+from operator import methodcaller
 
 import pytz
 import pytest
@@ -3377,8 +3378,45 @@ def test_aggregate_normal(self):
         assert_frame_equal(expected, dt_result)
     """
 
-    def test_aggregate_with_nat(self):
+    @pytest.mark.parametrize('method, unit', [
+        ('sum', 0),
+        ('prod', 1),
+    ])
+    def test_resample_entirely_nat_window(self, method, unit):
+        s = pd.Series([0] * 2 + [np.nan] * 2,
+                      index=pd.date_range('2017', periods=4))
+        # 0 / 1 by default
+        result = methodcaller(method)(s.resample("2d"))
+        expected = pd.Series([0.0, unit],
+                             index=pd.to_datetime(['2017-01-01',
+                                                   '2017-01-03']))
+        tm.assert_series_equal(result, expected)
+
+        # min_count=0
+        result = methodcaller(method, min_count=0)(s.resample("2d"))
+        expected = pd.Series([0.0, unit],
+                             index=pd.to_datetime(['2017-01-01',
+                                                   '2017-01-03']))
+        tm.assert_series_equal(result, expected)
+
+        # min_count=1
+        result = methodcaller(method, min_count=1)(s.resample("2d"))
+        expected = pd.Series([0.0, np.nan],
+                             index=pd.to_datetime(['2017-01-01',
+                                                   '2017-01-03']))
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize('func, fill_value', [
+        ('min', np.nan),
+        ('max', np.nan),
+        ('sum', 0),
+        ('prod', 1),
+        ('count', 0),
+    ])
+    def test_aggregate_with_nat(self, func, fill_value):
         # check TimeGrouper's aggregation is identical as normal groupby
+        # if NaT is included, 'var', 'std', 'mean', 'first', 'last'
+        # and 'nth' don't work yet
 
         n = 20
         data = np.random.randn(n, 4).astype('int64')
@@ -3392,42 +3430,42 @@ def test_aggregate_with_nat(self):
         normal_grouped = normal_df.groupby('key')
         dt_grouped = dt_df.groupby(TimeGrouper(key='key', freq='D'))
 
-        for func in ['min', 'max', 'sum', 'prod']:
-            normal_result = getattr(normal_grouped, func)()
-            dt_result = getattr(dt_grouped, func)()
-            pad = DataFrame([[np.nan, np.nan, np.nan, np.nan]], index=[3],
-                            columns=['A', 'B', 'C', 'D'])
-            expected = normal_result.append(pad)
-            expected = expected.sort_index()
-            expected.index = date_range(start='2013-01-01', freq='D',
-                                        periods=5, name='key')
-            assert_frame_equal(expected, dt_result)
+        normal_result = getattr(normal_grouped, func)()
+        dt_result = getattr(dt_grouped, func)()
 
-        for func in ['count']:
-            normal_result = getattr(normal_grouped, func)()
-            pad = DataFrame([[0, 0, 0, 0]], index=[3],
-                            columns=['A', 'B', 'C', 'D'])
-            expected = normal_result.append(pad)
-            expected = expected.sort_index()
-            expected.index = date_range(start='2013-01-01', freq='D',
-                                        periods=5, name='key')
-            dt_result = getattr(dt_grouped, func)()
-            assert_frame_equal(expected, dt_result)
+        pad = DataFrame([[fill_value] * 4], index=[3],
+                        columns=['A', 'B', 'C', 'D'])
+        expected = normal_result.append(pad)
+        expected = expected.sort_index()
+        expected.index = date_range(start='2013-01-01', freq='D',
+                                    periods=5, name='key')
+        assert_frame_equal(expected, dt_result)
+        assert dt_result.index.name == 'key'
 
-        for func in ['size']:
-            normal_result = getattr(normal_grouped, func)()
-            pad = Series([0], index=[3])
-            expected = normal_result.append(pad)
-            expected = expected.sort_index()
-            expected.index = date_range(start='2013-01-01', freq='D',
-                                        periods=5, name='key')
-            dt_result = getattr(dt_grouped, func)()
-            assert_series_equal(expected, dt_result)
-            # GH 9925
-            assert dt_result.index.name == 'key'
+    def test_aggregate_with_nat_size(self):
+        # GH 9925
+        n = 20
+        data = np.random.randn(n, 4).astype('int64')
+        normal_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
+        normal_df['key'] = [1, 2, np.nan, 4, 5] * 4
+
+        dt_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
+        dt_df['key'] = [datetime(2013, 1, 1), datetime(2013, 1, 2), pd.NaT,
+                        datetime(2013, 1, 4), datetime(2013, 1, 5)] * 4
+
+        normal_grouped = normal_df.groupby('key')
+        dt_grouped = dt_df.groupby(TimeGrouper(key='key', freq='D'))
 
-        # if NaT is included, 'var', 'std', 'mean', 'first','last'
-        # and 'nth' doesn't work yet
+        normal_result = normal_grouped.size()
+        dt_result = dt_grouped.size()
+
+        pad = Series([0], index=[3])
+        expected = normal_result.append(pad)
+        expected = expected.sort_index()
+        expected.index = date_range(start='2013-01-01', freq='D',
+                                    periods=5, name='key')
+        assert_series_equal(expected, dt_result)
+        assert dt_result.index.name == 'key'
 
     def test_repr(self):
         # GH18203
@@ -3436,3 +3474,34 @@ def test_repr(self):
                     "closed='left', label='left', how='mean', "
                     "convention='e', base=0)")
         assert result == expected
+
+    @pytest.mark.parametrize('method, unit', [
+        ('sum', 0),
+        ('prod', 1),
+    ])
+    def test_upsample_sum(self, method, unit):
+        s = pd.Series(1, index=pd.date_range("2017", periods=2, freq="H"))
+        resampled = s.resample("30T")
+        index = pd.to_datetime(['2017-01-01T00:00:00',
+                                '2017-01-01T00:30:00',
+                                '2017-01-01T01:00:00'])
+
+        # 0 / 1 by default
+        result = methodcaller(method)(resampled)
+        expected = pd.Series([1, unit, 1], index=index)
+        tm.assert_series_equal(result, expected)
+
+        # min_count=0
+        result = methodcaller(method, min_count=0)(resampled)
+        expected = pd.Series([1, unit, 1], index=index)
+        tm.assert_series_equal(result, expected)
+
+        # min_count=1
+        result = methodcaller(method, min_count=1)(resampled)
+        expected = pd.Series([1, np.nan, 1], index=index)
+        tm.assert_series_equal(result, expected)
+
+        # min_count>1
+        result = methodcaller(method, min_count=2)(resampled)
+        expected = pd.Series([np.nan, np.nan, np.nan], index=index)
+        tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py
index 35ae4ad4d5db4..e65de10c51300 100644
--- a/pandas/tests/test_window.py
+++ b/pandas/tests/test_window.py
@@ -475,6 +475,28 @@ def tests_empty_df_rolling(self, roller):
         result = DataFrame(index=pd.DatetimeIndex([])).rolling(roller).sum()
         tm.assert_frame_equal(result, expected)
 
+    def test_missing_minp_zero(self):
+        # https://github.com/pandas-dev/pandas/pull/18921
+        # minp=0
+        x = pd.Series([np.nan])
+        result = x.rolling(1, min_periods=0).sum()
+        expected = pd.Series([0.0])
+        tm.assert_series_equal(result, expected)
+
+        # minp=1
+        result = x.rolling(1, min_periods=1).sum()
+        expected = pd.Series([np.nan])
+        tm.assert_series_equal(result, expected)
+
+    def test_missing_minp_zero_variable(self):
+        # https://github.com/pandas-dev/pandas/pull/18921
+        x = pd.Series([np.nan] * 4,
+                      index=pd.DatetimeIndex(['2017-01-01', '2017-01-04',
+                                              '2017-01-06', '2017-01-07']))
+        result = x.rolling(pd.Timedelta("2d"), min_periods=0).sum()
+        expected = pd.Series(0.0, index=x.index)
+        tm.assert_series_equal(result, expected)
+
     def test_multi_index_names(self):
 
         # GH 16789, 16825
@@ -548,6 +570,19 @@ def test_empty_df_expanding(self, expander):
                     index=pd.DatetimeIndex([])).expanding(expander).sum()
         tm.assert_frame_equal(result, expected)
 
+    def test_missing_minp_zero(self):
+        # https://github.com/pandas-dev/pandas/pull/18921
+        # minp=0
+        x = pd.Series([np.nan])
+        result = x.expanding(min_periods=0).sum()
+        expected = pd.Series([0.0])
+        tm.assert_series_equal(result, expected)
+
+        # minp=1
+        result = x.expanding(min_periods=1).sum()
+        expected = pd.Series([np.nan])
+        tm.assert_series_equal(result, expected)
+
 
 class TestEWM(Base):
 
@@ -864,7 +899,8 @@ def test_centered_axis_validation(self):
                             .rolling(window=3, center=True, axis=2).mean())
 
     def test_rolling_sum(self):
-        self._check_moment_func(mom.rolling_sum, np.sum, name='sum')
+        self._check_moment_func(mom.rolling_sum, np.nansum, name='sum',
+                                zero_min_periods_equal=False)
 
     def test_rolling_count(self):
         counter = lambda x: np.isfinite(x).astype(float).sum()
@@ -1349,14 +1385,18 @@ def test_fperr_robustness(self):
 
     def _check_moment_func(self, f, static_comp, name=None, window=50,
                            has_min_periods=True, has_center=True,
                            has_time_rule=True, preserve_nan=True,
-                           fill_value=None, test_stable=False, **kwargs):
+                           fill_value=None, test_stable=False,
+                           zero_min_periods_equal=True,
+                           **kwargs):
 
         with warnings.catch_warnings(record=True):
             self._check_ndarray(f, static_comp, window=window,
                                 has_min_periods=has_min_periods,
                                 preserve_nan=preserve_nan,
                                 has_center=has_center, fill_value=fill_value,
-                                test_stable=test_stable, **kwargs)
+                                test_stable=test_stable,
+                                zero_min_periods_equal=zero_min_periods_equal,
+                                **kwargs)
 
         with warnings.catch_warnings(record=True):
             self._check_structures(f, static_comp,
@@ -1375,7 +1415,8 @@ def _check_moment_func(self, f, static_comp, name=None, window=50,
 
     def _check_ndarray(self, f, static_comp, window=50, has_min_periods=True,
                        preserve_nan=True, has_center=True, fill_value=None,
-                       test_stable=False, test_window=True, **kwargs):
+                       test_stable=False, test_window=True,
+                       zero_min_periods_equal=True, **kwargs):
 
         def get_result(arr, window, min_periods=None, center=False):
             return f(arr, window, min_periods=min_periods, center=center,
                      **kwargs)
@@ -1408,10 +1449,11 @@ def get_result(arr, window, min_periods=None, center=False):
             assert isna(result[3])
             assert notna(result[4])
 
-            # min_periods=0
-            result0 = get_result(arr, 20, min_periods=0)
-            result1 = get_result(arr, 20, min_periods=1)
-            tm.assert_almost_equal(result0, result1)
+            if zero_min_periods_equal:
+                # min_periods=0 should be equivalent to min_periods=1
+                result0 = get_result(arr, 20, min_periods=0)
+                result1 = get_result(arr, 20, min_periods=1)
+                tm.assert_almost_equal(result0, result1)
         else:
             result = get_result(arr, 50)
             tm.assert_almost_equal(result[-1], static_comp(arr[10:-10]))
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
index dec67bbea854f..b6fc9c78d6476 100644
--- a/pandas/util/testing.py
+++ b/pandas/util/testing.py
@@ -2861,3 +2861,31 @@ def setTZ(tz):
             yield
         finally:
             setTZ(orig_tz)
+
+
+def _make_skipna_wrapper(alternative, skipna_alternative=None):
+    """Create a function for calling on an array.
+
+    Parameters
+    ----------
+    alternative : function
+        The function to be called on the array with no NaNs.
+        Only used when 'skipna_alternative' is None.
+    skipna_alternative : function, default None
+        The function to be called on the original array, including NaNs.
+
+    Returns
+    -------
+    skipna_wrapper : function
+    """
+    if skipna_alternative:
+        def skipna_wrapper(x):
+            return skipna_alternative(x.values)
+    else:
+        def skipna_wrapper(x):
+            nona = x.dropna()
+            if len(nona) == 0:
+                return np.nan
+            return alternative(nona)
+
+    return skipna_wrapper
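
Note: ``test_upsample_sum`` above pins down the ``min_count`` semantics
described in the v0.22.0 release note. A minimal sketch of the behavior it
asserts (expected values copied from the test itself; assumes pandas >=
0.22.0):

    import pandas as pd

    s = pd.Series(1, index=pd.date_range("2017", periods=2, freq="H"))
    resampled = s.resample("30T")  # bins at 00:00, 00:30 (empty), 01:00

    resampled.sum()             # [1.0, 0.0, 1.0]: the empty bin sums to 0
    resampled.sum(min_count=1)  # [1.0, nan, 1.0]: needs >= 1 valid value per bin
    resampled.sum(min_count=2)  # [nan, nan, nan]: no bin has 2 valid values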