Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ New features
Other Enhancements
^^^^^^^^^^^^^^^^^^
- :func:`to_datetime` now supports the ``%Z`` and ``%z`` directive when passed into ``format`` (:issue:`13486`)
- :func:`Series.mode` and :func:`DataFrame.mode` now support the ``dropna`` parameter which can be used to specify whether NaN/NaT values should be considered (:issue:`17534`)
-
-

Expand Down
12 changes: 6 additions & 6 deletions pandas/_libs/hashtable_func_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,8 @@ def ismember_{{dtype}}({{scalar}}[:] arr, {{scalar}}[:] values, bint hasnans=0):
{{py:

# dtype, ctype, table_type, npy_dtype
dtypes = [('int64', 'int64_t', 'int64', 'int64'),
dtypes = [('float64', 'float64_t', 'float64', 'float64'),
('int64', 'int64_t', 'int64', 'int64'),
('uint64', 'uint64_t', 'uint64', 'uint64'),
('object', 'object', 'pymap', 'object_')]
}}
Expand All @@ -302,11 +303,11 @@ dtypes = [('int64', 'int64_t', 'int64', 'int64'),
{{if dtype == 'object'}}


def mode_{{dtype}}(ndarray[{{ctype}}] values):
def mode_{{dtype}}(ndarray[{{ctype}}] values, bint dropna):
{{else}}


def mode_{{dtype}}({{ctype}}[:] values):
def mode_{{dtype}}({{ctype}}[:] values, bint dropna):
{{endif}}
cdef:
int count, max_count = 1
Expand All @@ -317,9 +318,9 @@ def mode_{{dtype}}({{ctype}}[:] values):

table = kh_init_{{table_type}}()
{{if dtype == 'object'}}
build_count_table_{{dtype}}(values, table, 1)
build_count_table_{{dtype}}(values, table, dropna)
{{else}}
build_count_table_{{dtype}}(values, table, 0)
build_count_table_{{dtype}}(values, table, dropna)
{{endif}}

modes = np.empty(table.n_buckets, dtype=np.{{npy_dtype}})
Expand All @@ -329,7 +330,6 @@ def mode_{{dtype}}({{ctype}}[:] values):
for k in range(table.n_buckets):
if kh_exist_{{table_type}}(table, k):
count = table.vals[k]

if count == max_count:
j += 1
elif count > max_count:
Expand Down
26 changes: 14 additions & 12 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@
is_bool_dtype, needs_i8_conversion,
is_datetimetz,
is_datetime64_any_dtype, is_datetime64tz_dtype,
is_timedelta64_dtype, is_interval_dtype,
is_scalar, is_list_like,
is_timedelta64_dtype, is_datetimelike,
is_interval_dtype, is_scalar, is_list_like,
_ensure_platform_int, _ensure_object,
_ensure_float64, _ensure_uint64,
_ensure_int64)
Expand Down Expand Up @@ -798,14 +798,18 @@ def duplicated(values, keep='first'):
return f(values, keep=keep)


def mode(values):
def mode(values, dropna=True):
"""
Returns the mode(s) of an array.

Parameters
----------
values : array-like
Array over which to compute the mode(s).
dropna : boolean, default True
Don't consider counts of NaN/NaT.

.. versionadded:: 0.24.0

Returns
-------
Expand All @@ -818,20 +822,18 @@ def mode(values):

# categorical is a fast-path
if is_categorical_dtype(values):

if isinstance(values, Series):
return Series(values.values.mode(), name=values.name)
return values.mode()
return Series(values.values.mode(dropna=dropna), name=values.name)
return values.mode(dropna=dropna)

values, dtype, ndtype = _ensure_data(values)
if dropna and is_datetimelike(values):
mask = values.isnull()
values = values[~mask]

# TODO: this should support float64
if ndtype not in ['int64', 'uint64', 'object']:
ndtype = 'object'
values = _ensure_object(values)
values, dtype, ndtype = _ensure_data(values)

f = getattr(htable, "mode_{dtype}".format(dtype=ndtype))
result = f(values)
result = f(values, dropna=dropna)
try:
result = np.sort(result)
except TypeError as e:
Expand Down
16 changes: 13 additions & 3 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -2118,20 +2118,30 @@ def max(self, numeric_only=None, **kwargs):
else:
return self.categories[pointer]

def mode(self):
def mode(self, dropna=True):
"""
Returns the mode(s) of the Categorical.
Always returns `Categorical` even if only one value.
Parameters
----------
dropna : boolean, default True
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add versionadded (anytime you add the param)

Don't consider counts of NaN/NaT.
.. versionadded:: 0.24.0
Returns
-------
modes : `Categorical` (sorted)
"""

import pandas._libs.hashtable as htable
good = self._codes != -1
values = sorted(htable.mode_int64(_ensure_int64(self._codes[good])))
values = self._codes
if dropna:
good = self._codes != -1
values = self._codes[good]
values = sorted(htable.mode_int64(_ensure_int64(values), dropna))
result = self._constructor(values=values, categories=self.categories,
ordered=self.ordered, fastpath=True)
return result
Expand Down
8 changes: 6 additions & 2 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -7039,7 +7039,7 @@ def _get_agg_axis(self, axis_num):
else:
raise ValueError('Axis must be 0 or 1 (got %r)' % axis_num)

def mode(self, axis=0, numeric_only=False):
def mode(self, axis=0, numeric_only=False, dropna=True):
"""
Gets the mode(s) of each element along the axis selected. Adds a row
for each mode per label, fills in gaps with nan.
Expand All @@ -7057,6 +7057,10 @@ def mode(self, axis=0, numeric_only=False):
* 1 or 'columns' : get mode of each row
numeric_only : boolean, default False
If True, only apply to numeric columns.
dropna : boolean, default True
Don't consider counts of NaN/NaT.

.. versionadded:: 0.24.0

Returns
-------
Expand All @@ -7073,7 +7077,7 @@ def mode(self, axis=0, numeric_only=False):
data = self if not numeric_only else self._get_numeric_data()

def f(s):
return s.mode()
return s.mode(dropna=dropna)

return data.apply(f, axis=axis)

Expand Down
11 changes: 9 additions & 2 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1431,17 +1431,24 @@ def count(self, level=None):
return self._constructor(out, index=lev,
dtype='int64').__finalize__(self)

def mode(self):
def mode(self, dropna=True):
"""Return the mode(s) of the dataset.

Always returns Series even if only one value is returned.

Parameters
----------
dropna : boolean, default True
Don't consider counts of NaN/NaT.

.. versionadded:: 0.24.0

Returns
-------
modes : Series (sorted)
"""
# TODO: Add option for bins like value_counts()
return algorithms.mode(self)
return algorithms.mode(self, dropna=dropna)

def unique(self):
"""
Expand Down
120 changes: 70 additions & 50 deletions pandas/tests/frame/test_analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@
from pandas.compat import lrange, product, PY35
from pandas import (compat, isna, notna, DataFrame, Series,
MultiIndex, date_range, Timestamp, Categorical,
_np_version_under1p12, _np_version_under1p15)
_np_version_under1p12, _np_version_under1p15,
to_datetime, to_timedelta)
import pandas as pd
import pandas.core.nanops as nanops
import pandas.core.algorithms as algorithms
import pandas.io.formats.printing as printing

import pandas.util.testing as tm
import pandas.util._test_decorators as td
Expand Down Expand Up @@ -840,54 +840,74 @@ def wrapper(x):
expected = pd.Series(unit, index=r1.index, dtype=r1.dtype)
tm.assert_series_equal(r1, expected)

def test_mode(self):
# Checks DataFrame.mode() on single columns, on column subsets and on
# the whole frame, comparing against hand-written expected frames.
df = pd.DataFrame({"A": [12, 12, 11, 12, 19, 11],
"B": [10, 10, 10, np.nan, 3, 4],
"C": [8, 8, 8, 9, 9, 9],
"D": np.arange(6, dtype='int64'),
"E": [8, 8, 1, 1, 3, 3]})
# A: 12 occurs three times, more than any other value.
tm.assert_frame_equal(df[["A"]].mode(),
pd.DataFrame({"A": [12]}))
# D: every value occurs exactly once, so all six values are expected.
expected = pd.Series([0, 1, 2, 3, 4, 5], dtype='int64', name='D').\
to_frame()
tm.assert_frame_equal(df[["D"]].mode(), expected)
# E: three-way tie (1, 3 and 8 each occur twice); expected in
# ascending order.
expected = pd.Series([1, 3, 8], dtype='int64', name='E').to_frame()
tm.assert_frame_equal(df[["E"]].mode(), expected)
# B contains one NaN; the expected mode for B is the float 10.
tm.assert_frame_equal(df[["A", "B"]].mode(),
pd.DataFrame({"A": [12], "B": [10.]}))
# Whole frame: the expected result has as many rows as the column with
# the most modes (D, six rows); shorter columns are filled with NaN.
tm.assert_frame_equal(df.mode(),
pd.DataFrame({"A": [12, np.nan, np.nan, np.nan,
np.nan, np.nan],
"B": [10, np.nan, np.nan, np.nan,
np.nan, np.nan],
"C": [8, 9, np.nan, np.nan, np.nan,
np.nan],
"D": [0, 1, 2, 3, 4, 5],
"E": [1, 3, 8, np.nan, np.nan,
np.nan]}))

# outputs in sorted order
# C is reversed in place first, so 9 now precedes 8 in the input while
# the expected modes below are still listed as 8, 9.
df["C"] = list(reversed(df["C"]))
printing.pprint_thing(df["C"])
printing.pprint_thing(df["C"].mode())
a, b = (df[["A", "B", "C"]].mode(),
pd.DataFrame({"A": [12, np.nan],
"B": [10, np.nan],
"C": [8, 9]}))
printing.pprint_thing(a)
printing.pprint_thing(b)
tm.assert_frame_equal(a, b)
# should work with heterogeneous types
# (int64, datetime and string columns; every value is unique, so the
# expected frame holds the same values as the input frame)
df = pd.DataFrame({"A": np.arange(6, dtype='int64'),
"B": pd.date_range('2011', periods=6),
"C": list('abcdef')})
exp = pd.DataFrame({"A": pd.Series(np.arange(6, dtype='int64'),
dtype=df["A"].dtype),
"B": pd.Series(pd.date_range('2011', periods=6),
dtype=df["B"].dtype),
"C": pd.Series(list('abcdef'),
dtype=df["C"].dtype)})
tm.assert_frame_equal(df.mode(), exp)
@pytest.mark.parametrize("dropna, expected", [
# Cases 1-2: columns A-G, each expected to yield a single mode.
# With dropna=True the expected modes are non-missing values.
(True, {'A': [12],
'B': [10.0],
'C': [1.0],
'D': ['a'],
'E': Categorical(['a'], categories=['a']),
'F': to_datetime(['2000-1-2']),
'G': to_timedelta(['1 days'])}),
# With dropna=False, columns C-G (where NaN/NaT is the most frequent
# entry in the input) are expected to report the missing value.
(False, {'A': [12],
'B': [10.0],
'C': [np.nan],
'D': np.array([np.nan], dtype=object),
'E': Categorical([np.nan], categories=['a']),
'F': [pd.NaT],
'G': to_timedelta([pd.NaT])}),
# Cases 3-4: columns H-N. N has four distinct values, so the expected
# frame has four rows and the other columns are filled with NaN/NaT.
(True, {'H': [8, 9, np.nan, np.nan],
'I': [8, 9, np.nan, np.nan],
'J': [1, np.nan, np.nan, np.nan],
'K': Categorical(['a', np.nan, np.nan, np.nan],
categories=['a']),
'L': to_datetime(['2000-1-2', 'NaT', 'NaT', 'NaT']),
'M': to_timedelta(['1 days', 'nan', 'nan', 'nan']),
'N': [0, 1, 2, 3]}),
# In the input, K, L and M each hold two missing and two non-missing
# entries; the dropna=False expectation for them lists the missing
# value first.
(False, {'H': [8, 9, np.nan, np.nan],
'I': [8, 9, np.nan, np.nan],
'J': [1, np.nan, np.nan, np.nan],
'K': Categorical([np.nan, 'a', np.nan, np.nan],
categories=['a']),
'L': to_datetime(['NaT', '2000-1-2', 'NaT', 'NaT']),
'M': to_timedelta(['nan', '1 days', 'nan', 'nan']),
'N': [0, 1, 2, 3]})
])
def test_mode_dropna(self, dropna, expected):
# Checks DataFrame.mode(dropna=...) against the parametrized expected
# columns. `expected` is a dict mapping column label -> expected modes.

# One fixture frame serves all cases: numeric (A-C, H-J, N), object (D),
# categorical (E, K), datetime (F, L) and timedelta (G, M) columns.
df = DataFrame({"A": [12, 12, 19, 11],
"B": [10, 10, np.nan, 3],
"C": [1, np.nan, np.nan, np.nan],
"D": [np.nan, np.nan, 'a', np.nan],
"E": Categorical([np.nan, np.nan, 'a', np.nan]),
"F": to_datetime(['NaT', '2000-1-2', 'NaT', 'NaT']),
"G": to_timedelta(['1 days', 'nan', 'nan', 'nan']),
"H": [8, 8, 9, 9],
"I": [9, 9, 8, 8],
"J": [1, 1, np.nan, np.nan],
"K": Categorical(['a', np.nan, 'a', np.nan]),
"L": to_datetime(['2000-1-2', '2000-1-2',
'NaT', 'NaT']),
"M": to_timedelta(['1 days', 'nan',
'1 days', 'nan']),
"N": np.arange(4, dtype='int64')})

# Only the columns named in `expected` are selected, in sorted label
# order, before calling mode().
result = df[sorted(list(expected.keys()))].mode(dropna=dropna)
expected = DataFrame(expected)
tm.assert_frame_equal(result, expected)

@pytest.mark.skipif(not compat.PY3, reason="only PY3")
def test_mode_sortwarning(self):
# Check for the warning that is raised when the mode
# results cannot be sorted

# Column A mixes NaN and str values, two of each, so with dropna=False
# both are expected back as modes.
df = DataFrame({"A": [np.nan, np.nan, 'a', 'a']})
expected = DataFrame({'A': ['a', np.nan]})

# A UserWarning is expected; its stack level is not checked.
with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
result = df.mode(dropna=False)
# Sort by 'A' and reset the index before comparing with `expected`.
result = result.sort_values(by='A').reset_index(drop=True)

tm.assert_frame_equal(result, expected)

def test_operators_timedelta64(self):
from datetime import timedelta
Expand Down
Loading