diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 448ceffdaa1eb..777da289ff1bb 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -971,6 +971,7 @@ Datetimelike
 - Bug in comparison between objects with pyarrow date dtype and ``timestamp[pyarrow]`` or ``np.datetime64`` dtype failing to consider these as non-comparable (:issue:`62157`)
 - Bug in constructing arrays with :class:`ArrowDtype` with ``timestamp`` type incorrectly allowing ``Decimal("NaN")`` (:issue:`61773`)
 - Bug in constructing arrays with a timezone-aware :class:`ArrowDtype` from timezone-naive datetime objects incorrectly treating those as UTC times instead of wall times like :class:`DatetimeTZDtype` (:issue:`61775`)
+- Bug in :meth:`Index.value_counts` dropping the ``freq`` of a :class:`DatetimeIndex` or :class:`TimedeltaIndex` from the result when ``sort=False`` (:issue:`33830`)
 - Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`)
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index bbca78459ca75..57f24abac0acd 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -937,6 +937,36 @@ def value_counts_internal(
     if normalize:
         result = result / counts.sum()
 
+    # GH#33830: preserve the original freq for DatetimeIndex/TimedeltaIndex results
+    try:
+        from pandas import (
+            DatetimeIndex,
+            TimedeltaIndex,
+        )
+
+        if (
+            bins is None
+            and not sort
+            and isinstance(values, (DatetimeIndex, TimedeltaIndex))
+            and values.freq is not None
+            and isinstance(result.index, (DatetimeIndex, TimedeltaIndex))
+            and len(result.index) == len(values)
+            and result.index.equals(values)
+        ):
+            base_freq = values.freq
+            # Rebuild the index with the original freq, preserving the name.
+            if isinstance(result.index, DatetimeIndex):
+                result.index = DatetimeIndex(
+                    result.index._data, freq=base_freq, name=result.index.name
+                )
+            else:  # TimedeltaIndex
+                result.index = TimedeltaIndex(
+                    result.index._data, freq=base_freq, name=result.index.name
+                )
+    except Exception:
+        # If freq patching fails, leave the value_counts result unchanged.
+        pass
+
     return result
diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py
index bcb31829a201f..5e102c15f04f9 100644
--- a/pandas/tests/base/test_value_counts.py
+++ b/pandas/tests/base/test_value_counts.py
@@ -339,3 +339,153 @@ def test_value_counts_object_inference_deprecated():
     exp = dti.value_counts()
     exp.index = exp.index.astype(object)
     tm.assert_series_equal(res, exp)
+
+
+def _vc_make_index(kind: str, periods=5, freq="D"):
+    if kind == "dt":
+        return pd.date_range("2016-01-01", periods=periods, freq=freq)
+    if kind == "td":
+        return pd.timedelta_range(Timedelta(0), periods=periods, freq=freq)
+    raise ValueError("kind must be 'dt' or 'td'")
+
+
+@pytest.mark.parametrize(
+    "kind,freq,normalize",
+    [
+        ("dt", "D", False),
+        ("dt", "D", True),
+        ("td", "D", False),
+        ("td", "D", True),
+        ("td", Timedelta(hours=1), False),
+        ("td", Timedelta(hours=1), True),
+    ],
+)
+def test_value_counts_freq_preserved_datetimelike_no_sort(kind, freq, normalize):
+    idx = _vc_make_index(kind, periods=5, freq=freq)
+    vc = idx.value_counts(sort=False, normalize=normalize)
+    assert vc.index.freq == idx.freq
+    if normalize:
+        assert np.isclose(vc.values, 1 / len(idx)).all()
+
+
+@pytest.mark.parametrize(
+    "kind,freq",
+    [
+        ("dt", "D"),
+        ("td", "D"),
+        ("td", Timedelta(hours=1)),
+    ],
+)
+def test_value_counts_freq_drops_datetimelike_when_sorted(kind, freq):
+    idx = _vc_make_index(kind, periods=5, freq=freq)
+    vc = idx.value_counts()  # default sort=True (reorders)
+    assert vc.index.freq is None
+
+
+@pytest.mark.parametrize(
+    "kind,freq",
+    [
+        ("dt", "D"),
+        ("td", "D"),
+        ("td", Timedelta(hours=1)),
+    ],
+)
+def test_value_counts_freq_drops_datetimelike_with_duplicates(kind, freq):
+    base = _vc_make_index(kind, periods=5, freq=freq)
+    obj = base.insert(1, base[1])  # duplicate one label
+    vc = obj.value_counts(sort=False)
+    assert vc.index.freq is None
+
+
+@pytest.mark.parametrize(
+    "kind,freq",
+    [
+        ("dt", "D"),
+        ("td", "D"),
+        ("td", Timedelta(hours=1)),
+    ],
+)
+def test_value_counts_freq_drops_datetimelike_with_gap(kind, freq):
+    base = _vc_make_index(kind, periods=5, freq=freq)
+    obj = base.delete(2)  # remove one step to break contiguity
+    vc = obj.value_counts(sort=False)
+    assert vc.index.freq is None
+
+
+@pytest.mark.parametrize(
+    "kind,freq,dropna,expect_hasnans",
+    [
+        ("dt", "D", False, True),  # keep NaT
+        ("dt", "D", True, False),  # drop NaT
+        ("td", "D", False, True),
+        ("td", "D", True, False),
+        ("td", Timedelta(hours=1), False, True),
+        ("td", Timedelta(hours=1), True, False),
+    ],
+)
+def test_value_counts_freq_drops_datetimelike_with_nat(
+    kind, freq, dropna, expect_hasnans
+):
+    base = _vc_make_index(kind, periods=3, freq=freq)
+    obj = base.insert(1, pd.NaT)
+    vc = obj.value_counts(dropna=dropna, sort=False)
+    assert vc.index.freq is None
+    assert vc.index.hasnans is expect_hasnans
+
+
+@pytest.mark.parametrize(
+    "freq,start,periods,sort",
+    [
+        ("D", "2016-01-01", 5, False),
+        ("D", "2016-01-01", 5, True),
+        ("M", "2016-01", 6, False),  # MonthEnd
+        ("M", "2016-01", 6, True),
+        ("Q-DEC", "2016Q1", 4, False),  # QuarterEnd (Dec anchored)
+        ("Q-DEC", "2016Q1", 4, True),
+        ("Y-DEC", "2014", 3, False),  # YearEnd (Dec anchored)
+        ("Y-DEC", "2014", 3, True),
+    ],
+)
+def test_value_counts_period_freq_preserved_sort_and_nosort(freq, start, periods, sort):
+    pi = pd.period_range(start=start, periods=periods, freq=freq)
+    vc = pi.value_counts(sort=sort)
+    assert isinstance(vc.index, pd.PeriodIndex)
+    assert vc.index.dtype == pi.dtype
+    assert vc.index.freq == pi.freq
+
+
+def test_value_counts_period_freq_preserved_with_duplicates():
+    pi = pd.period_range("2016-01", periods=5, freq="M")
+    obj = pi.insert(1, pi[1])  # duplicate one label
+    vc = obj.value_counts(sort=False)
+    assert isinstance(vc.index, pd.PeriodIndex)
+    assert vc.index.dtype == pi.dtype
+    assert vc.index.freq == pi.freq
+
+
+def test_value_counts_period_freq_preserved_with_gap():
+    pi = pd.period_range("2016-01", periods=5, freq="M")
+    obj = pi.delete(2)  # remove one element
+    vc = obj.value_counts(sort=False)
+    assert isinstance(vc.index, pd.PeriodIndex)
+    assert vc.index.dtype == pi.dtype
+    assert vc.index.freq == pi.freq
+
+
+def test_value_counts_period_freq_preserved_with_normalize():
+    pi = pd.period_range("2016-01", periods=4, freq="M")
+    vc = pi.value_counts(normalize=True, sort=False)
+    assert isinstance(vc.index, pd.PeriodIndex)
+    assert vc.index.dtype == pi.dtype
+    assert vc.index.freq == pi.freq
+    assert np.isclose(vc.values, 1 / len(pi)).all()
+
+
+def test_value_counts_period_freq_preserved_with_nat_dropna_true():
+    pi = pd.period_range("2016-01", periods=5, freq="M")
+    obj = pi.insert(1, pd.NaT)
+    vc = obj.value_counts(dropna=True, sort=False)
+    assert not vc.index.hasnans
+    assert isinstance(vc.index, pd.PeriodIndex)
+    assert vc.index.dtype == pi.dtype
+    assert vc.index.freq == pi.freq
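
A minimal usage sketch of the behavior this patch targets (illustration only, not part of the diff; it assumes the patched value_counts_internal above is applied):

    import pandas as pd

    # Regular daily index with a well-defined freq and no duplicates/gaps.
    dti = pd.date_range("2016-01-01", periods=5, freq="D")

    vc = dti.value_counts(sort=False)
    print(vc.index.freq)         # with the patch applied: <Day> (freq preserved)

    vc_sorted = dti.value_counts()   # default sort=True
    print(vc_sorted.index.freq)      # None (freq still dropped when sorted)

As the tests above check, the freq is only carried over when the result index is identical to the original index (no sorting, bins, duplicates, gaps, or NaT); in every other case the behavior is unchanged.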