Skip to content

Commit a23edf1

Browse files
committed
ENH: Add first and last aggregations to Rolling and Expanding
1 parent 9501650 commit a23edf1

File tree

11 files changed

+451
-1
lines changed

11 files changed

+451
-1
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ Other enhancements
5454
- :meth:`Series.cummin` and :meth:`Series.cummax` now support :class:`CategoricalDtype` (:issue:`52335`)
5555
- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
5656
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
57+
- :class:`Rolling` and :class:`Expanding` now support aggregations ``first`` and ``last`` (:issue:`33155`)
5758
- :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`)
5859
- :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`)
5960
- :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`)

pandas/_libs/window/aggregations.pyi

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,18 @@ def roll_min(
6060
end: np.ndarray, # np.ndarray[np.int64]
6161
minp: int, # int64_t
6262
) -> np.ndarray: ... # np.ndarray[float]
63+
# Type stub for the rolling "first" kernel. It takes the values array, the
# start/end index arrays and the minimum observation count ``minp``, and
# returns an ndarray.
def roll_first(
64+
values: np.ndarray, # np.ndarray[np.float64]
65+
start: np.ndarray, # np.ndarray[np.int64]
66+
end: np.ndarray, # np.ndarray[np.int64]
67+
minp: int, # int64_t
68+
) -> np.ndarray: ... # np.ndarray[float]
69+
# Type stub for the rolling "last" kernel. It takes the values array, the
# start/end index arrays and the minimum observation count ``minp``, and
# returns an ndarray.
def roll_last(
70+
values: np.ndarray, # np.ndarray[np.float64]
71+
start: np.ndarray, # np.ndarray[np.int64]
72+
end: np.ndarray, # np.ndarray[np.int64]
73+
minp: int, # int64_t
74+
) -> np.ndarray: ... # np.ndarray[float]
6375
def roll_quantile(
6476
values: np.ndarray, # const float64_t[:]
6577
start: np.ndarray, # np.ndarray[np.int64]

pandas/_libs/window/aggregations.pyx

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1133,6 +1133,89 @@ cdef _roll_min_max(ndarray[float64_t] values,
11331133

11341134
return output
11351135

1136+
# ----------------------------------------------------------------------
1137+
# Rolling first, last
1138+
1139+
1140+
# Python-visible entry point for the rolling "first" aggregation; forwards
# its arguments unchanged to the shared kernel with is_first=1.
def roll_first(const float64_t[:] values, ndarray[int64_t] start,
1141+
ndarray[int64_t] end, int64_t minp) -> np.ndarray:
1142+
return _roll_first_last(values, start, end, minp, is_first=1)
1143+
1144+
1145+
# Python-visible entry point for the rolling "last" aggregation; forwards
# its arguments unchanged to the shared kernel with is_first=0.
def roll_last(const float64_t[:] values, ndarray[int64_t] start,
1146+
ndarray[int64_t] end, int64_t minp) -> np.ndarray:
1147+
return _roll_first_last(values, start, end, minp, is_first=0)
1148+
1149+
1150+
# Shared kernel behind roll_first / roll_last.
# For each window i, covering values[start[i]:end[i]], it tracks:
#   fl_idx - index of the first (is_first=1) or last (is_first=0) non-NaN value
#   nobs   - number of non-NaN values in the window
# The window's result is values[fl_idx] when nobs >= minp and fl_idx lies
# inside the window; otherwise it is NaN.
cdef _roll_first_last(const float64_t[:] values, ndarray[int64_t] start,
1151+
ndarray[int64_t] end, int64_t minp, bint is_first):
1152+
cdef:
1153+
Py_ssize_t i, j, fl_idx
1154+
bint is_monotonic_increasing_bounds
1155+
int64_t nobs = 0, N = len(start), s, e
1156+
float64_t val, res
1157+
ndarray[float64_t] output
1158+
1159+
is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
1160+
start, end
1161+
)
1162+
1163+
output = np.empty(N, dtype=np.float64)
1164+
1165+
# The longest window has length zero (assuming end >= start elementwise),
# so there is nothing to aggregate and every result is NaN.
if (end - start).max() == 0:
1166+
output[:] = NaN
1167+
return output
1168+
1169+
with nogil:
1170+
for i in range(0, N):
1171+
s = start[i]
1172+
e = end[i]
1173+
1174+
# Full rescan of [s, e): first window, non-monotonic bounds, or a window
# that does not overlap the previous one.
if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:
1175+
fl_idx = -1
1176+
nobs = 0
1177+
for j in range(s, e):
1178+
val = values[j]
1179+
# val == val is False only for NaN, so NaN entries are skipped.
if val == val:
1180+
# "last" always moves to the newest non-NaN index; "first" only takes
# one while it holds no index inside the window (fl_idx < s, which
# includes the -1 sentinel).
if not is_first or fl_idx < s:
1181+
fl_idx = j
1182+
nobs += 1
1183+
else:
# Overlapping windows with monotonic bounds: update the running state
# from the previous window instead of rescanning everything.
1184+
# handle deletes
1185+
for j in range(start[i - 1], s):
1186+
val = values[j]
1187+
if val == val:
1188+
nobs -= 1
1189+
1190+
# update fl_idx if out of range, if first
1191+
if is_first and fl_idx < s:
1192+
fl_idx = -1
1193+
# Look for a replacement among the elements shared with the previous
# window; the newly added elements are handled below.
for j in range(s, end[i - 1]):
1194+
val = values[j]
1195+
if val == val:
1196+
fl_idx = j
1197+
break
1198+
1199+
# handle adds
1200+
for j in range(end[i - 1], e):
1201+
val = values[j]
1202+
if val == val:
1203+
if not is_first or fl_idx < s:
1204+
fl_idx = j
1205+
nobs += 1
1206+
1207+
# fl_idx < s means no non-NaN value lies inside the current window.
if nobs >= minp and fl_idx >= s:
1208+
res = values[fl_idx]
1209+
else:
1210+
res = NaN
1211+
1212+
output[i] = res
1213+
1214+
# With non-monotonic bounds the next iteration rescans from scratch (see
# the branch condition above), so the running count is reset here as well.
if not is_monotonic_increasing_bounds:
1215+
nobs = 0
1216+
1217+
return output
1218+
11361219

11371220
cdef enum InterpolationType:
11381221
LINEAR,

pandas/core/window/expanding.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -664,6 +664,64 @@ def skew(self, numeric_only: bool = False):
664664
def kurt(self, numeric_only: bool = False):
665665
return super().kurt(numeric_only=numeric_only)
666666

667+
@doc(
668+
template_header,
669+
create_section_header("Parameters"),
670+
kwargs_numeric_only,
671+
create_section_header("Returns"),
672+
template_returns,
673+
create_section_header("Examples"),
674+
dedent(
675+
"""
676+
The example below will show an expanding calculation with a window size of
677+
three.
678+
679+
>>> s = pd.Series(range(5))
680+
>>> s.expanding(3).first()
681+
0 NaN
682+
1 NaN
683+
2 0.0
684+
3 0.0
685+
4 0.0
686+
dtype: float64
687+
"""
688+
).replace("\n", "", 1),
689+
window_method="expanding",
690+
aggregation_description="First (left-most) element of the window",
691+
agg_method="first",
692+
)
693+
def first(self, numeric_only: bool = False):
694+
return super().first(numeric_only=numeric_only)
695+
696+
@doc(
697+
template_header,
698+
create_section_header("Parameters"),
699+
kwargs_numeric_only,
700+
create_section_header("Returns"),
701+
template_returns,
702+
create_section_header("Examples"),
703+
dedent(
704+
"""
705+
The example below will show an expanding calculation with a window size of
706+
three.
707+
708+
>>> s = pd.Series(range(5))
709+
>>> s.expanding(3).last()
710+
0 NaN
711+
1 NaN
712+
2 2.0
713+
3 3.0
714+
4 4.0
715+
dtype: float64
716+
"""
717+
).replace("\n", "", 1),
718+
window_method="expanding",
719+
aggregation_description="Last (right-most) element of the window",
720+
agg_method="last",
721+
)
722+
def last(self, numeric_only: bool = False):
723+
return super().last(numeric_only=numeric_only)
724+
667725
@doc(
668726
template_header,
669727
create_section_header("Parameters"),

pandas/core/window/rolling.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1705,6 +1705,22 @@ def kurt(self, numeric_only: bool = False):
17051705
numeric_only=numeric_only,
17061706
)
17071707

1708+
# Base implementation of the "first" window aggregation: hands the
# ``roll_first`` kernel to ``self._apply`` under the name "first".
def first(self, numeric_only: bool = False):
1709+
window_func = window_aggregations.roll_first
1710+
return self._apply(
1711+
window_func,
1712+
name="first",
1713+
numeric_only=numeric_only,
1714+
)
1715+
1716+
# Base implementation of the "last" window aggregation: hands the
# ``roll_last`` kernel to ``self._apply`` under the name "last".
def last(self, numeric_only: bool = False):
1717+
window_func = window_aggregations.roll_last
1718+
return self._apply(
1719+
window_func,
1720+
name="last",
1721+
numeric_only=numeric_only,
1722+
)
1723+
17081724
def quantile(
17091725
self,
17101726
q: float,
@@ -2539,6 +2555,64 @@ def sem(self, ddof: int = 1, numeric_only: bool = False):
25392555
def kurt(self, numeric_only: bool = False):
25402556
return super().kurt(numeric_only=numeric_only)
25412557

2558+
# Rolling.first: the docstring is assembled by ``@doc`` from the shared
# templates plus the example below; the method itself only delegates to the
# parent class implementation.
@doc(
2559+
template_header,
2560+
create_section_header("Parameters"),
2561+
kwargs_numeric_only,
2562+
create_section_header("Returns"),
2563+
template_returns,
2564+
create_section_header("Examples"),
2565+
dedent(
2566+
"""
2567+
The example below will show a rolling calculation with a window size of
2568+
three.
2569+
2570+
>>> s = pd.Series(range(5))
2571+
>>> s.rolling(3).first()
2572+
0 NaN
2573+
1 NaN
2574+
2 0.0
2575+
3 1.0
2576+
4 2.0
2577+
dtype: float64
2578+
"""
2579+
).replace("\n", "", 1),
2580+
window_method="rolling",
2581+
aggregation_description="First (left-most) element of the window",
2582+
agg_method="first",
2583+
)
2584+
# ``numeric_only`` is passed through unchanged to the parent's ``first``.
def first(self, numeric_only: bool = False):
2585+
return super().first(numeric_only=numeric_only)
2586+
2587+
# Rolling.last: the docstring is assembled by ``@doc`` from the shared
# templates plus the example below; the method itself only delegates to the
# parent class implementation.
@doc(
2588+
template_header,
2589+
create_section_header("Parameters"),
2590+
kwargs_numeric_only,
2591+
create_section_header("Returns"),
2592+
template_returns,
2593+
create_section_header("Examples"),
2594+
dedent(
2595+
"""
2596+
The example below will show a rolling calculation with a window size of
2597+
three.
2598+
2599+
>>> s = pd.Series(range(5))
2600+
>>> s.rolling(3).last()
2601+
0 NaN
2602+
1 NaN
2603+
2 2.0
2604+
3 3.0
2605+
4 4.0
2606+
dtype: float64
2607+
"""
2608+
).replace("\n", "", 1),
2609+
window_method="rolling",
2610+
aggregation_description="Last (right-most) element of the window",
2611+
agg_method="last",
2612+
)
2613+
# ``numeric_only`` is passed through unchanged to the parent's ``last``.
def last(self, numeric_only: bool = False):
2614+
return super().last(numeric_only=numeric_only)
2615+
25422616
@doc(
25432617
template_header,
25442618
create_section_header("Parameters"),

pandas/tests/window/test_cython_aggregations.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ def _get_rolling_aggregations():
3030
("roll_median_c", window_aggregations.roll_median_c),
3131
("roll_max", window_aggregations.roll_max),
3232
("roll_min", window_aggregations.roll_min),
33+
("roll_first", window_aggregations.roll_first),
34+
("roll_last", window_aggregations.roll_last),
3335
]
3436
+ [
3537
(

pandas/tests/window/test_expanding.py

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -451,6 +451,8 @@ def test_moment_functions_zero_length_pairwise(f):
451451
lambda x: x.expanding(min_periods=5).corr(x, pairwise=False),
452452
lambda x: x.expanding(min_periods=5).max(),
453453
lambda x: x.expanding(min_periods=5).min(),
454+
lambda x: x.expanding(min_periods=5).first(),
455+
lambda x: x.expanding(min_periods=5).last(),
454456
lambda x: x.expanding(min_periods=5).sum(),
455457
lambda x: x.expanding(min_periods=5).mean(),
456458
lambda x: x.expanding(min_periods=5).std(),
@@ -596,6 +598,104 @@ def test_expanding_corr_pairwise_diff_length():
596598
tm.assert_frame_equal(result4, expected)
597599

598600

601+
# Checks ``expanding(3).first()`` / ``.last()`` on a dense input and on an
# input where every other value is NaN. The expected results stay NaN until
# the third non-NaN observation has been seen, and NaN inputs are never
# picked as the first/last value.
@pytest.mark.parametrize(
602+
"values,method,expected",
603+
[
604+
(
605+
[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
606+
"first",
607+
[float("nan"), float("nan"), 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
608+
),
609+
(
610+
[1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan],
611+
"first",
612+
[
613+
float("nan"),
614+
float("nan"),
615+
float("nan"),
616+
float("nan"),
617+
1.0,
618+
1.0,
619+
1.0,
620+
1.0,
621+
1.0,
622+
1.0,
623+
],
624+
),
625+
(
626+
[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
627+
"last",
628+
[float("nan"), float("nan"), 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
629+
),
630+
(
631+
[1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan],
632+
"last",
633+
[
634+
float("nan"),
635+
float("nan"),
636+
float("nan"),
637+
float("nan"),
638+
5.0,
639+
5.0,
640+
7.0,
641+
7.0,
642+
9.0,
643+
9.0,
644+
],
645+
),
646+
],
647+
)
648+
def test_expanding_first_last(values, method, expected):
649+
# GH#33155
650+
x = Series(values)
651+
result = getattr(x.expanding(3), method)()
652+
expected = Series(expected)
653+
tm.assert_almost_equal(result, expected)
654+
655+
# Same check on a single-column DataFrame; ``expected`` is the Series built
# above, wrapped as column "A".
x = DataFrame({"A": values})
656+
result = getattr(x.expanding(3), method)()
657+
expected = DataFrame({"A": expected})
658+
tm.assert_almost_equal(result, expected)
659+
660+
661+
# Same inputs as the ``expanding(3)`` test, but with ``min_periods=0``: the
# expected results contain no NaN, and for the NaN-interleaved input "last"
# repeats the most recent non-NaN value.
@pytest.mark.parametrize(
662+
"values,method,expected",
663+
[
664+
(
665+
[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
666+
"first",
667+
[1.0] * 10,
668+
),
669+
(
670+
[1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan],
671+
"first",
672+
[1.0] * 10,
673+
),
674+
(
675+
[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
676+
"last",
677+
[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
678+
),
679+
(
680+
[1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan],
681+
"last",
682+
[1.0, 1.0, 3.0, 3.0, 5.0, 5.0, 7.0, 7.0, 9.0, 9.0],
683+
),
684+
],
685+
)
686+
def test_expanding_first_last_no_minp(values, method, expected):
687+
# GH#33155
688+
x = Series(values)
689+
result = getattr(x.expanding(min_periods=0), method)()
690+
expected = Series(expected)
691+
tm.assert_almost_equal(result, expected)
692+
693+
# Same check on a single-column DataFrame; ``expected`` is the Series built
# above, wrapped as column "A".
x = DataFrame({"A": values})
694+
result = getattr(x.expanding(min_periods=0), method)()
695+
expected = DataFrame({"A": expected})
696+
tm.assert_almost_equal(result, expected)
697+
698+
599699
def test_expanding_apply_args_kwargs(engine_and_raw):
600700
def mean_w_arg(x, const):
601701
return np.mean(x) + const

0 commit comments

Comments
 (0)