Skip to content

Commit c5e8152

Browse files
jangoreckimcol
andauthored
rolling median (#7315)
* rolling median * keep lint-c happy * try skip lint here * codecov * Apply wording suggestions from @mcol code review Co-authored-by: Marco Colombo <[email protected]> * solve unreached codecov lines by raising warning * Revert "solve unreached codecov lines by raising warning" This reverts commit 34af12e. * codecov edge cases not reached before --------- Co-authored-by: Marco Colombo <[email protected]>
1 parent 08ea530 commit c5e8152

File tree

10 files changed

+1041
-50
lines changed

10 files changed

+1041
-50
lines changed

NAMESPACE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ export(frollsum)
5757
export(frollmax)
5858
export(frollmin)
5959
export(frollprod)
60+
export(frollmedian)
6061
export(frollapply)
6162
export(frolladapt)
6263
export(nafill)

NEWS.md

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,51 @@
246246
#9: 2025-09-22 9 8 9.0
247247
```
248248

249-
19. New rolling functions, `frollmin` and `frollprod`, have been implemented, towards [#2778](https://github.com/Rdatatable/data.table/issues/2778). Thanks to @jangorecki for implementation.
249+
19. New rolling functions: `frollmin`, `frollprod` and `frollmedian`, have been implemented, towards [#2778](https://github.com/Rdatatable/data.table/issues/2778). Thanks to @jangorecki for implementation. Implementation of rolling median is based on a novel algorithm "sort-median" described by [@suomela](https://github.com/suomela) in his 2014 paper [Median Filtering is Equivalent to Sorting](https://arxiv.org/abs/1406.1717). "sort-median" scales very well, not only for size of input vector but also for size of rolling window.
250+
```r
251+
rollmedian = function(x, n) {
252+
ans = rep(NA_real_, nx<-length(x))
253+
if (n<=nx) for (i in n:nx) ans[i] = median(x[(i-n+1L):(i)])
254+
ans
255+
}
256+
library(data.table)
257+
setDTthreads(8)
258+
set.seed(108)
259+
x = rnorm(1e5)
260+
261+
n = 100
262+
system.time(rollmedian(x, n))
263+
# user system elapsed
264+
# 2.049 0.001 2.051
265+
system.time(frollapply(x, n, median, simplify=unlist))
266+
# user system elapsed
267+
# 3.071 0.223 0.436
268+
system.time(frollmedian(x, n))
269+
# user system elapsed
270+
# 0.013 0.000 0.004
271+
272+
n = 1000
273+
system.time(rollmedian(x, n))
274+
# user system elapsed
275+
# 3.496 0.009 3.507
276+
system.time(frollapply(x, n, median, simplify=unlist))
277+
# user system elapsed
278+
# 4.552 0.307 0.632
279+
system.time(frollmedian(x, n))
280+
# user system elapsed
281+
# 0.015 0.000 0.004
282+
283+
n = 10000
284+
system.time(rollmedian(x, n))
285+
# user system elapsed
286+
# 16.350 0.025 16.382
287+
system.time(frollapply(x, n, median, simplify=unlist))
288+
# user system elapsed
289+
# 14.865 0.722 2.267
290+
system.time(frollmedian(x, n))
291+
# user system elapsed
292+
# 0.028 0.000 0.005
293+
```
250294

251295
### BUG FIXES
252296

R/froll.R

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,3 +213,6 @@ frollmin = function(x, n, fill=NA, algo=c("fast","exact"), align=c("right","left
213213
frollprod = function(x, n, fill=NA, algo=c("fast","exact"), align=c("right","left","center"), na.rm=FALSE, has.nf=NA, adaptive=FALSE, partial=FALSE, give.names=FALSE, hasNA) {
214214
froll(fun="prod", x=x, n=n, fill=fill, algo=algo, align=align, na.rm=na.rm, has.nf=has.nf, adaptive=adaptive, partial=partial, hasNA=hasNA, give.names=give.names)
215215
}
216+
frollmedian = function(x, n, fill=NA, algo=c("fast","exact"), align=c("right","left","center"), na.rm=FALSE, has.nf=NA, adaptive=FALSE, partial=FALSE, give.names=FALSE, hasNA) {
217+
froll(fun="median", x=x, n=n, fill=fill, algo=algo, align=align, na.rm=na.rm, has.nf=has.nf, adaptive=adaptive, partial=partial, hasNA=hasNA, give.names=give.names)
218+
}

inst/tests/froll.Rraw

Lines changed: 183 additions & 29 deletions
Large diffs are not rendered by default.

man/froll.Rd

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,14 @@
99
\alias{frollmax}
1010
\alias{frollmin}
1111
\alias{frollprod}
12+
\alias{frollmedian}
1213
\alias{roll}
1314
\alias{rollmean}
1415
\alias{rollsum}
1516
\alias{rollmax}
1617
\alias{rollmin}
1718
\alias{rollprod}
19+
\alias{rollmedian}
1820
\title{Rolling functions}
1921
\description{
2022
Fast rolling functions to calculate aggregates on a sliding window. For a user-defined rolling function see \code{\link{frollapply}}. For "time-aware" (irregularly spaced time series) rolling function see \code{\link{frolladapt}}.
@@ -30,6 +32,8 @@
3032
na.rm=FALSE, has.nf=NA, adaptive=FALSE, partial=FALSE, give.names=FALSE, hasNA)
3133
frollprod(x, n, fill=NA, algo=c("fast","exact"), align=c("right","left","center"),
3234
na.rm=FALSE, has.nf=NA, adaptive=FALSE, partial=FALSE, give.names=FALSE, hasNA)
35+
frollmedian(x, n, fill=NA, algo=c("fast","exact"), align=c("right","left","center"),
36+
na.rm=FALSE, has.nf=NA, adaptive=FALSE, partial=FALSE, give.names=FALSE, hasNA)
3337
}
3438
\arguments{
3539
\item{x}{ Integer, numeric or logical vector, coerced to numeric, on which sliding window calculates an aggregate function. It supports vectorized input, then it needs to be a \code{data.table}, \code{data.frame} or a \code{list}, in which case a rolling function is applied to each column/vector. }
@@ -82,20 +86,21 @@
8286
\item \code{has.nf=FALSE} uses faster implementation that does not support non-finite values. Then depending on the rolling function it will either:
8387
\itemize{
8488
\item (\emph{mean, sum, prod}) detect non-finite, re-run non-finite aware.
85-
\item (\emph{max, min}) does not detect non-finites and may silently give incorrect answer.
89+
\item (\emph{max, min, median}) does not detect non-finites and may silently produce an incorrect answer.
8690
}
8791
In general \code{has.nf=FALSE && any(!is.finite(x))} should be considered as undefined behavior. Therefore \code{has.nf=FALSE} should be used with care.
8892
}
8993
}
9094
\section{Implementation}{
91-
Each rolling function has 4 different implementations. First factor that decides which implementation is being used is \code{adaptive} argument, see setion below for details. Then for each of those two algorithms (adaptive \code{TRUE} or \code{FALSE}) there are two \code{algo} argument values.
95+
Each rolling function has 4 different implementations. First factor that decides which implementation is used is the \code{adaptive} argument (either \code{TRUE} or \code{FALSE}), see section below for details. Then for each of those two algorithms there are usually two implementations depending on the \code{algo} argument.
9296
\itemize{
93-
\item \code{algo="fast"} uses \emph{"on-line"}, single pass, algorithm.
97+
\item \code{algo="fast"} uses \emph{"online"}, single pass, algorithm.
9498
\itemize{
95-
\item \emph{max} and \emph{min} rolling function will not do only a single pass but, on average \code{length(x)/n}, nested loops will be computed. The bigger the window the bigger advantage over algo \emph{exact} which computes \code{length(x)} nested loops. Note that \emph{exact} uses multiple CPUs so for a small window size and many CPUs it is possible it will be actually faster than \emph{fast} but in those cases elapsed timings will likely be far below a single second.
96-
\item Not all functions have \emph{fast} implementation available. As of now \emph{max} and \emph{min} in case of \code{adaptive=TRUE} do not have \emph{fast} implementation, therefore it will automatically fall back to \emph{exact} implementation. \code{datatable.verbose} option can be used to check that.
99+
\item \emph{max} and \emph{min} rolling function will not do only a single pass but, on average, they will compute \code{length(x)/n} nested loops. The larger the window, the greater the advantage over the \emph{exact} algorithm, which computes \code{length(x)} nested loops. Note that \emph{exact} uses multiple CPUs so for a small window sizes and many CPUs it may actually be faster than \emph{fast}. However, in such cases the elapsed timings will likely be far below a single second.
100+
\item \emph{median} will use a novel algorithm described by \emph{Jukka Suomela} in his paper \emph{Median Filtering is Equivalent to Sorting (2014)}. See references section for the link. Implementation here is extended to support arbitrary length of input and an even window size. Despite extensive validation of results this function should be considered experimental. When missing values are detected it will fall back to slower \code{algo="exact"} implementation.
101+
\item Not all functions have \emph{fast} implementation available. As of now adaptive \emph{max}, adaptive \emph{min} and adaptive \emph{median} do not have \emph{fast} implementation, therefore it will automatically fall back to \emph{exact} implementation. \code{datatable.verbose} option can be used to check that.
97102
}
98-
\item \code{algo="exact"} will make rolling functions to use a more computationally-intensive algorithm. For each observation from input vector it will compute a function on a window from scratch (complexity \eqn{O(n^2)}).
103+
\item \code{algo="exact"} will make the rolling functions use a more computationally-intensive algorithm. For each observation in the input vector it will compute a function on a rolling window from scratch (complexity \eqn{O(n^2)}).
99104
\itemize{
100105
\item Depeneding on the function, this algorithm may suffers less from floating point rounding error (the same consideration applies to base \code{\link[base]{mean}}).
101106
\item In case of \emph{mean} (and possibly other functions in future), it will additionally make extra pass to perform floating point error correction. Error corrections might not be truly exact on some platforms (like Windows) when using multiple threads.
@@ -152,6 +157,7 @@ frollsum(d, 3:4)
152157
frollmax(d, 3:4)
153158
frollmin(d, 3:4)
154159
frollprod(d, 3:4)
160+
frollmedian(d, 3:4)
155161
156162
# partial=TRUE
157163
x = 1:6/2
@@ -207,6 +213,6 @@ sapply(errs, format, scientific=FALSE) # roundoff
207213
\code{\link{frollapply}}, \code{\link{frolladapt}}, \code{\link{shift}}, \code{\link{data.table}}, \code{\link{setDTthreads}}
208214
}
209215
\references{
210-
\href{https://en.wikipedia.org/wiki/Round-off_error}{Round-off error}
216+
\href{https://en.wikipedia.org/wiki/Round-off_error}{Round-off error}, \href{https://arxiv.org/abs/1406.1717}{"Median Filtering is Equivalent to Sorting" by Jukka Suomela}
211217
}
212218
\keyword{ data }

src/data.table.h

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -221,15 +221,20 @@ void initDTthreads(void);
221221
int getDTthreads(const int64_t n, const bool throttle);
222222
void avoid_openmp_hang_within_fork(void);
223223

224+
// shellsort.c
225+
void shellsort(const double *x, int n, int *o);
226+
//void shellsortna(const double *x, int n, int *o, bool *isna); // not used till NA support added to frollmedian algo="fast"
227+
224228
typedef enum { // adding rolling functions here and in frollfunR in frollR.c
225229
MEAN = 0,
226230
SUM = 1,
227231
MAX = 2,
228232
MIN = 3,
229-
PROD = 4
233+
PROD = 4,
234+
MEDIAN = 5
230235
} rollfun_t;
231236
// froll.c
232-
void frollfun(rollfun_t rfun, unsigned int algo, const double *x, uint64_t nx, ans_t *ans, int k, int align, double fill, bool narm, int hasnf, bool verbose);
237+
void frollfun(rollfun_t rfun, unsigned int algo, const double *x, uint64_t nx, ans_t *ans, int k, int align, double fill, bool narm, int hasnf, bool verbose, bool par);
233238
void frollmeanFast(const double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasnf, bool verbose);
234239
void frollmeanExact(const double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasnf, bool verbose);
235240
void frollsumFast(const double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasnf, bool verbose);
@@ -240,6 +245,8 @@ void frollminFast(const double *x, uint64_t nx, ans_t *ans, int k, double fill,
240245
void frollminExact(const double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasnf, bool verbose);
241246
void frollprodFast(const double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasnf, bool verbose);
242247
void frollprodExact(const double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasnf, bool verbose);
248+
void frollmedianFast(const double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasnf, bool verbose, bool par);
249+
void frollmedianExact(const double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasnf, bool verbose);
243250

244251
// frolladaptive.c
245252
void frolladaptivefun(rollfun_t rfun, unsigned int algo, const double *x, uint64_t nx, ans_t *ans, const int *k, double fill, bool narm, int hasnf, bool verbose);
@@ -253,6 +260,8 @@ void frolladaptivemaxExact(const double *x, uint64_t nx, ans_t *ans, const int *
253260
void frolladaptiveminExact(const double *x, uint64_t nx, ans_t *ans, const int *k, double fill, bool narm, int hasnf, bool verbose);
254261
void frolladaptiveprodFast(const double *x, uint64_t nx, ans_t *ans, const int *k, double fill, bool narm, int hasnf, bool verbose);
255262
void frolladaptiveprodExact(const double *x, uint64_t nx, ans_t *ans, const int *k, double fill, bool narm, int hasnf, bool verbose);
263+
//void frolladaptivemedianFast(const double *x, uint64_t nx, ans_t *ans, const int *k, double fill, bool narm, int hasnf, bool verbose); // does not exists as of now
264+
void frolladaptivemedianExact(const double *x, uint64_t nx, ans_t *ans, const int *k, double fill, bool narm, int hasnf, bool verbose);
256265

257266
// frollR.c
258267
SEXP frollfunR(SEXP fun, SEXP xobj, SEXP kobj, SEXP fill, SEXP algo, SEXP align, SEXP narm, SEXP hasnf, SEXP adaptive);

0 commit comments

Comments
 (0)