Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -970,6 +970,7 @@ Datetimelike
- Bug in comparison between objects with pyarrow date dtype and ``timestamp[pyarrow]`` or ``np.datetime64`` dtype failing to consider these as non-comparable (:issue:`62157`)
- Bug in constructing arrays with :class:`ArrowDtype` with ``timestamp`` type incorrectly allowing ``Decimal("NaN")`` (:issue:`61773`)
- Bug in constructing arrays with a timezone-aware :class:`ArrowDtype` from timezone-naive datetime objects incorrectly treating those as UTC times instead of wall times like :class:`DatetimeTZDtype` (:issue:`61775`)
- Bug in :meth:`DatetimeIndex.value_counts` and :meth:`TimedeltaIndex.value_counts` not retaining the index ``freq`` in the result when ``sort=False`` (:issue:`33830`)
- Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`)

Timedelta
Expand Down
30 changes: 30 additions & 0 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -937,6 +937,36 @@ def value_counts_internal(
if normalize:
result = result / counts.sum()

# freq patching for DatetimeIndex, TimedeltaIndex
try:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there any way to avoid this type of post-processing i.e. have the result.index already be constructed with the correct frequency? Usually this type of fix is indicative of some operation not fully working as expected

from pandas import (
DatetimeIndex,
TimedeltaIndex,
)

if (
bins is None
and not sort
and isinstance(values, (DatetimeIndex, TimedeltaIndex))
and values.freq is not None
and isinstance(result.index, (DatetimeIndex, TimedeltaIndex))
and len(result.index) == len(values)
and result.index.equals(values)
):
base_freq = values.freq
# Rebuild the index with the original freq; name preserved.
if isinstance(result.index, DatetimeIndex):
result.index = DatetimeIndex(
result.index._data, freq=base_freq, name=result.index.name
)
else: # TimedeltaIndex
result.index = TimedeltaIndex(
result.index._data, freq=base_freq, name=result.index.name
)
except Exception:
# If freq patching fails, does not affect value_counts
pass

return result


Expand Down
150 changes: 150 additions & 0 deletions pandas/tests/base/test_value_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,3 +339,153 @@ def test_value_counts_object_inference_deprecated():
exp = dti.value_counts()
exp.index = exp.index.astype(object)
tm.assert_series_equal(res, exp)


def _vc_make_index(kind: str, periods=5, freq="D"):
if kind == "dt":
return pd.date_range("2016-01-01", periods=periods, freq=freq)
if kind == "td":
return pd.timedelta_range(Timedelta(0), periods=periods, freq=freq)
raise ValueError("kind must be 'dt' or 'td'")


@pytest.mark.parametrize(
    "kind,freq,normalize",
    [
        ("dt", "D", False),
        ("dt", "D", True),
        ("td", "D", False),
        ("td", "D", True),
        ("td", Timedelta(hours=1), False),
        ("td", Timedelta(hours=1), True),
    ],
)
def test_value_counts_freq_preserved_datetimelike_no_sort(kind, freq, normalize):
    # With all-unique labels and sort=False the result index should keep
    # the original freq; with normalize the counts are then uniform.
    src = _vc_make_index(kind, periods=5, freq=freq)
    result = src.value_counts(sort=False, normalize=normalize)
    assert result.index.freq == src.freq
    if normalize:
        assert np.isclose(result.values, 1 / len(src)).all()


@pytest.mark.parametrize(
    "kind,freq",
    [
        ("dt", "D"),
        ("td", "D"),
        ("td", Timedelta(hours=1)),
    ],
)
def test_value_counts_freq_drops_datetimelike_when_sorted(kind, freq):
    # The default sort=True reorders labels by count, so the original
    # freq cannot be retained on the result index.
    src = _vc_make_index(kind, periods=5, freq=freq)
    result = src.value_counts()
    assert result.index.freq is None


@pytest.mark.parametrize(
    "kind,freq",
    [
        ("dt", "D"),
        ("td", "D"),
        ("td", Timedelta(hours=1)),
    ],
)
def test_value_counts_freq_drops_datetimelike_with_duplicates(kind, freq):
    # A repeated label means the values are no longer a regular sequence,
    # so no freq should survive on the result index.
    regular = _vc_make_index(kind, periods=5, freq=freq)
    with_dup = regular.insert(1, regular[1])
    result = with_dup.value_counts(sort=False)
    assert result.index.freq is None


@pytest.mark.parametrize(
    "kind,freq",
    [
        ("dt", "D"),
        ("td", "D"),
        ("td", Timedelta(hours=1)),
    ],
)
def test_value_counts_freq_drops_datetimelike_with_gap(kind, freq):
    # Deleting an interior element breaks contiguity, so the result index
    # must not claim the original freq.
    regular = _vc_make_index(kind, periods=5, freq=freq)
    gapped = regular.delete(2)
    result = gapped.value_counts(sort=False)
    assert result.index.freq is None


@pytest.mark.parametrize(
    "kind,freq,dropna,expect_hasnans",
    [
        ("dt", "D", False, True),  # keep NaT
        ("dt", "D", True, False),  # drop NaT
        ("td", "D", False, True),
        ("td", "D", True, False),
        ("td", Timedelta(hours=1), False, True),
        ("td", Timedelta(hours=1), True, False),
    ],
)
def test_value_counts_freq_drops_datetimelike_with_nat(
    kind, freq, dropna, expect_hasnans
):
    # An inserted NaT breaks regularity, so freq must not be retained even
    # with sort=False; hasnans tracks whether NaT survived dropna.
    regular = _vc_make_index(kind, periods=3, freq=freq)
    with_nat = regular.insert(1, pd.NaT)
    result = with_nat.value_counts(dropna=dropna, sort=False)
    assert result.index.freq is None
    assert result.index.hasnans is expect_hasnans


@pytest.mark.parametrize(
    "freq,start,periods,sort",
    [
        ("D", "2016-01-01", 5, False),
        ("D", "2016-01-01", 5, True),
        ("M", "2016-01", 6, False),  # MonthEnd
        ("M", "2016-01", 6, True),
        ("Q-DEC", "2016Q1", 4, False),  # QuarterEnd (Dec anchored)
        ("Q-DEC", "2016Q1", 4, True),
        ("Y-DEC", "2014", 3, False),  # YearEnd (Dec anchored)
        ("Y-DEC", "2014", 3, True),
    ],
)
def test_value_counts_period_freq_preserved_sort_and_nosort(freq, start, periods, sort):
    # PeriodIndex carries freq in its dtype, so value_counts keeps it
    # regardless of sorting.
    src = pd.period_range(start=start, periods=periods, freq=freq)
    result = src.value_counts(sort=sort)
    assert isinstance(result.index, pd.PeriodIndex)
    assert result.index.dtype == src.dtype
    assert result.index.freq == src.freq


def test_value_counts_period_freq_preserved_with_duplicates():
    # A duplicated label does not disturb the period dtype/freq of the
    # result, since freq lives in the dtype rather than in the ordering.
    src = pd.period_range("2016-01", periods=5, freq="M")
    with_dup = src.insert(1, src[1])
    result = with_dup.value_counts(sort=False)
    assert isinstance(result.index, pd.PeriodIndex)
    assert result.index.dtype == src.dtype
    assert result.index.freq == src.freq


def test_value_counts_period_freq_preserved_with_gap():
    # Removing an element leaves a gap, but period freq is dtype-level and
    # so is still preserved by value_counts.
    src = pd.period_range("2016-01", periods=5, freq="M")
    gapped = src.delete(2)
    result = gapped.value_counts(sort=False)
    assert isinstance(result.index, pd.PeriodIndex)
    assert result.index.dtype == src.dtype
    assert result.index.freq == src.freq


def test_value_counts_period_freq_preserved_with_normalize():
    # Normalizing changes the values, not the index: period dtype/freq
    # stay intact and unique labels give uniform relative frequencies.
    src = pd.period_range("2016-01", periods=4, freq="M")
    result = src.value_counts(normalize=True, sort=False)
    assert isinstance(result.index, pd.PeriodIndex)
    assert result.index.dtype == src.dtype
    assert result.index.freq == src.freq
    assert np.isclose(result.values, 1 / len(src)).all()


def test_value_counts_period_freq_preserved_with_nat_dropna_true():
    # dropna=True removes the inserted NaT; the remaining labels keep the
    # period dtype and hence the freq.
    src = pd.period_range("2016-01", periods=5, freq="M")
    with_nat = src.insert(1, pd.NaT)
    result = with_nat.value_counts(dropna=True, sort=False)
    assert not result.index.hasnans
    assert isinstance(result.index, pd.PeriodIndex)
    assert result.index.dtype == src.dtype
    assert result.index.freq == src.freq
Loading