Rdatatable
diff --git a/‎NAMESPACE‎
Lines changed: 1 addition & 0 deletions b/‎NAMESPACE‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎NEWS.md‎
Lines changed: 45 additions & 1 deletion b/‎NEWS.md‎
Lines changed: 45 additions & 1 deletion
diff --git a/‎R/froll.R‎
Lines changed: 3 additions & 0 deletions b/‎R/froll.R‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎inst/tests/froll.Rraw‎
Lines changed: 183 additions & 29 deletions b/‎inst/tests/froll.Rraw‎
Lines changed: 183 additions & 29 deletions
diff --git a/‎man/froll.Rd‎
Lines changed: 13 additions & 7 deletions b/‎man/froll.Rd‎
Lines changed: 13 additions & 7 deletions
diff --git a/‎src/data.table.h‎
Lines changed: 11 additions & 2 deletions b/‎src/data.table.h‎
Lines changed: 11 additions & 2 deletions
@@ -57,6 +57,7 @@ export(frollsum)
 export(frollmax)
 export(frollmin)
 export(frollprod)
+export(frollmedian)
 export(frollapply)
 export(frolladapt)
 export(nafill)
 
@@ -246,7 +246,51 @@
     #9: 2025-09-22     9         8           9.0
     ```
 
-19. New rolling functions, `frollmin` and `frollprod`, have been implemented, towards [#2778](https://github.com/Rdatatable/data.table/issues/2778). Thanks to @jangorecki for implementation.
+19. New rolling functions, `frollmin`, `frollprod` and `frollmedian`, have been implemented, towards [#2778](https://github.com/Rdatatable/data.table/issues/2778). Thanks to @jangorecki for the implementation. The implementation of rolling median is based on a novel algorithm, "sort-median", described by [@suomela](https://github.com/suomela) in his 2014 paper [Median Filtering is Equivalent to Sorting](https://arxiv.org/abs/1406.1717). "sort-median" scales very well, not only with the size of the input vector but also with the size of the rolling window, as the timings below show.
+    ```r
+    rollmedian = function(x, n) {
+      ans = rep(NA_real_, nx<-length(x))
+      if (n<=nx) for (i in n:nx) ans[i] = median(x[(i-n+1L):(i)])
+      ans
+    }
+    library(data.table)
+    setDTthreads(8)
+    set.seed(108)
+    x = rnorm(1e5)
+
+    n = 100
+    system.time(rollmedian(x, n))
+    #   user  system elapsed
+    #  2.049   0.001   2.051
+    system.time(frollapply(x, n, median, simplify=unlist))
+    #   user  system elapsed
+    #  3.071   0.223   0.436
+    system.time(frollmedian(x, n))
+    #   user  system elapsed
+    #  0.013   0.000   0.004
+
+    n = 1000
+    system.time(rollmedian(x, n))
+    #   user  system elapsed
+    #  3.496   0.009   3.507
+    system.time(frollapply(x, n, median, simplify=unlist))
+    #   user  system elapsed
+    #  4.552   0.307   0.632
+    system.time(frollmedian(x, n))
+    #   user  system elapsed
+    #  0.015   0.000   0.004
+
+    n = 10000
+    system.time(rollmedian(x, n))
+    #   user  system elapsed
+    # 16.350   0.025  16.382
+    system.time(frollapply(x, n, median, simplify=unlist))
+    #   user  system elapsed
+    # 14.865   0.722   2.267
+    system.time(frollmedian(x, n))
+    #   user  system elapsed
+    #  0.028   0.000   0.005
+    ```
 
 ### BUG FIXES
 
 
@@ -213,3 +213,6 @@ frollmin = function(x, n, fill=NA, algo=c("fast","exact"), align=c("right","left
 frollprod = function(x, n, fill=NA, algo=c("fast","exact"), align=c("right","left","center"), na.rm=FALSE, has.nf=NA, adaptive=FALSE, partial=FALSE, give.names=FALSE, hasNA) {
   froll(fun="prod", x=x, n=n, fill=fill, algo=algo, align=align, na.rm=na.rm, has.nf=has.nf, adaptive=adaptive, partial=partial, hasNA=hasNA, give.names=give.names)
 }
+frollmedian = function(x, n, fill=NA, algo=c("fast","exact"), align=c("right","left","center"), na.rm=FALSE, has.nf=NA, adaptive=FALSE, partial=FALSE, give.names=FALSE, hasNA) {
+  froll(fun="median", x=x, n=n, fill=fill, algo=algo, align=align, na.rm=na.rm, has.nf=has.nf, adaptive=adaptive, partial=partial, hasNA=hasNA, give.names=give.names)
+}
@@ -9,12 +9,14 @@
 \alias{frollmax}
 \alias{frollmin}
 \alias{frollprod}
+\alias{frollmedian}
 \alias{roll}
 \alias{rollmean}
 \alias{rollsum}
 \alias{rollmax}
 \alias{rollmin}
 \alias{rollprod}
+\alias{rollmedian}
 \title{Rolling functions}
 \description{
   Fast rolling functions to calculate aggregates on a sliding window. For a user-defined rolling function see \code{\link{frollapply}}. For "time-aware" (irregularly spaced time series) rolling function see \code{\link{frolladapt}}.
@@ -30,6 +32,8 @@
     na.rm=FALSE, has.nf=NA, adaptive=FALSE, partial=FALSE, give.names=FALSE, hasNA)
   frollprod(x, n, fill=NA, algo=c("fast","exact"), align=c("right","left","center"),
     na.rm=FALSE, has.nf=NA, adaptive=FALSE, partial=FALSE, give.names=FALSE, hasNA)
+  frollmedian(x, n, fill=NA, algo=c("fast","exact"), align=c("right","left","center"),
+    na.rm=FALSE, has.nf=NA, adaptive=FALSE, partial=FALSE, give.names=FALSE, hasNA)
 }
 \arguments{
   \item{x}{ Integer, numeric or logical vector, coerced to numeric, on which sliding window calculates an aggregate function. It supports vectorized input, then it needs to be a \code{data.table}, \code{data.frame} or a \code{list}, in which case a rolling function is applied to each column/vector. }
@@ -82,20 +86,21 @@
     \item \code{has.nf=FALSE} uses faster implementation that does not support non-finite values. Then depending on the rolling function it will either:
     \itemize{
       \item (\emph{mean, sum, prod}) detect non-finite, re-run non-finite aware.
-      \item (\emph{max, min}) does not detect non-finites and may silently give incorrect answer.
+      \item (\emph{max, min, median}) does not detect non-finites and may silently produce an incorrect answer.
     }
     In general \code{has.nf=FALSE && any(!is.finite(x))} should be considered as undefined behavior. Therefore \code{has.nf=FALSE} should be used with care.
   }
 }
 \section{Implementation}{
-  Each rolling function has 4 different implementations. First factor that decides which implementation is being used is \code{adaptive} argument, see setion below for details. Then for each of those two algorithms (adaptive \code{TRUE} or \code{FALSE}) there are two \code{algo} argument values.
+  Each rolling function has up to 4 different implementations. The first factor that decides which implementation is used is the \code{adaptive} argument (either \code{TRUE} or \code{FALSE}); see the section below for details. Then, for each of those two algorithms, there are usually two implementations depending on the \code{algo} argument.
   \itemize{
-    \item \code{algo="fast"} uses \emph{"on-line"}, single pass, algorithm.
+    \item \code{algo="fast"} uses an \emph{"online"}, single-pass algorithm.
     \itemize{
-      \item \emph{max} and \emph{min} rolling function will not do only a single pass but, on average \code{length(x)/n}, nested loops will be computed. The bigger the window the bigger advantage over algo \emph{exact} which computes \code{length(x)} nested loops. Note that \emph{exact} uses multiple CPUs so for a small window size and many CPUs it is possible it will be actually faster than \emph{fast} but in those cases elapsed timings will likely be far below a single second.
-      \item Not all functions have \emph{fast} implementation available. As of now \emph{max} and \emph{min} in case of \code{adaptive=TRUE} do not have \emph{fast} implementation, therefore it will automatically fall back to \emph{exact} implementation. \code{datatable.verbose} option can be used to check that.
+      \item \emph{max} and \emph{min} rolling functions do not strictly make a single pass; on average they compute \code{length(x)/n} nested loops. The larger the window, the greater the advantage over the \emph{exact} algorithm, which computes \code{length(x)} nested loops. Note that \emph{exact} uses multiple CPUs, so for small window sizes and many CPUs it may actually be faster than \emph{fast}. However, in such cases the elapsed timings will likely be far below a single second.
+      \item \emph{median} uses a novel algorithm described by \emph{Jukka Suomela} in his paper \emph{Median Filtering is Equivalent to Sorting (2014)}. See the references section for the link. The implementation here is extended to support arbitrary input length and even window sizes. Despite extensive validation of results, this function should be considered experimental. When missing values are detected, it falls back to the slower \code{algo="exact"} implementation.
+      \item Not all functions have a \emph{fast} implementation available. As of now, adaptive \emph{max}, adaptive \emph{min} and adaptive \emph{median} do not have a \emph{fast} implementation, so they automatically fall back to the \emph{exact} implementation. The \code{datatable.verbose} option can be used to check that.
     }
-    \item \code{algo="exact"} will make rolling functions to use a more computationally-intensive algorithm. For each observation from input vector it will compute a function on a window from scratch (complexity \eqn{O(n^2)}).
+    \item \code{algo="exact"} will make the rolling functions use a more computationally-intensive algorithm. For each observation in the input vector it will compute a function on a rolling window from scratch (complexity \eqn{O(n^2)}).
     \itemize{
       \item Depending on the function, this algorithm may suffer less from floating point rounding error (the same consideration applies to base \code{\link[base]{mean}}).
       \item In case of \emph{mean} (and possibly other functions in future), it will additionally make extra pass to perform floating point error correction. Error corrections might not be truly exact on some platforms (like Windows) when using multiple threads.
@@ -152,6 +157,7 @@ frollsum(d, 3:4)
 frollmax(d, 3:4)
 frollmin(d, 3:4)
 frollprod(d, 3:4)
+frollmedian(d, 3:4)
 
 # partial=TRUE
 x = 1:6/2
@@ -207,6 +213,6 @@ sapply(errs, format, scientific=FALSE) # roundoff
   \code{\link{frollapply}}, \code{\link{frolladapt}}, \code{\link{shift}}, \code{\link{data.table}}, \code{\link{setDTthreads}}
 }
 \references{
-  \href{https://en.wikipedia.org/wiki/Round-off_error}{Round-off error}
+  \href{https://en.wikipedia.org/wiki/Round-off_error}{Round-off error}, \href{https://arxiv.org/abs/1406.1717}{"Median Filtering is Equivalent to Sorting" by Jukka Suomela}
 }
 \keyword{ data }
@@ -221,15 +221,20 @@ void initDTthreads(void);
 int getDTthreads(const int64_t n, const bool throttle);
 void avoid_openmp_hang_within_fork(void);
 
+// shellsort.c
+void shellsort(const double *x, int n, int *o);
+//void shellsortna(const double *x, int n, int *o, bool *isna); // not used until NA support is added to frollmedian algo="fast"
+
 typedef enum { // adding rolling functions here and in frollfunR in frollR.c
   MEAN = 0,
   SUM = 1,
   MAX = 2,
   MIN = 3,
-  PROD = 4
+  PROD = 4,
+  MEDIAN = 5
 } rollfun_t;
 // froll.c
-void frollfun(rollfun_t rfun, unsigned int algo, const double *x, uint64_t nx, ans_t *ans, int k, int align, double fill, bool narm, int hasnf, bool verbose);
+void frollfun(rollfun_t rfun, unsigned int algo, const double *x, uint64_t nx, ans_t *ans, int k, int align, double fill, bool narm, int hasnf, bool verbose, bool par);
 void frollmeanFast(const double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasnf, bool verbose);
 void frollmeanExact(const double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasnf, bool verbose);
 void frollsumFast(const double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasnf, bool verbose);
@@ -240,6 +245,8 @@ void frollminFast(const double *x, uint64_t nx, ans_t *ans, int k, double fill,
 void frollminExact(const double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasnf, bool verbose);
 void frollprodFast(const double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasnf, bool verbose);
 void frollprodExact(const double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasnf, bool verbose);
+void frollmedianFast(const double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasnf, bool verbose, bool par);
+void frollmedianExact(const double *x, uint64_t nx, ans_t *ans, int k, double fill, bool narm, int hasnf, bool verbose);
 
 // frolladaptive.c
 void frolladaptivefun(rollfun_t rfun, unsigned int algo, const double *x, uint64_t nx, ans_t *ans, const int *k, double fill, bool narm, int hasnf, bool verbose);
@@ -253,6 +260,8 @@ void frolladaptivemaxExact(const double *x, uint64_t nx, ans_t *ans, const int *
 void frolladaptiveminExact(const double *x, uint64_t nx, ans_t *ans, const int *k, double fill, bool narm, int hasnf, bool verbose);
 void frolladaptiveprodFast(const double *x, uint64_t nx, ans_t *ans, const int *k, double fill, bool narm, int hasnf, bool verbose);
 void frolladaptiveprodExact(const double *x, uint64_t nx, ans_t *ans, const int *k, double fill, bool narm, int hasnf, bool verbose);
+//void frolladaptivemedianFast(const double *x, uint64_t nx, ans_t *ans, const int *k, double fill, bool narm, int hasnf, bool verbose); // does not exist as of now
+void frolladaptivemedianExact(const double *x, uint64_t nx, ans_t *ans, const int *k, double fill, bool narm, int hasnf, bool verbose);
 
 // frollR.c
 SEXP frollfunR(SEXP fun, SEXP xobj, SEXP kobj, SEXP fill, SEXP algo, SEXP align, SEXP narm, SEXP hasnf, SEXP adaptive);
Original file line number	Diff line number	Diff line change
`@@ -213,3 +213,6 @@ frollmin = function(x, n, fill=NA, algo=c("fast","exact"), align=c("right","left`
`213`	`213`	`frollprod = function(x, n, fill=NA, algo=c("fast","exact"), align=c("right","left","center"), na.rm=FALSE, has.nf=NA, adaptive=FALSE, partial=FALSE, give.names=FALSE, hasNA) {`
`214`	`214`	`froll(fun="prod", x=x, n=n, fill=fill, algo=algo, align=align, na.rm=na.rm, has.nf=has.nf, adaptive=adaptive, partial=partial, hasNA=hasNA, give.names=give.names)`
`215`	`215`	`}`
	`216`	`+frollmedian = function(x, n, fill=NA, algo=c("fast","exact"), align=c("right","left","center"), na.rm=FALSE, has.nf=NA, adaptive=FALSE, partial=FALSE, give.names=FALSE, hasNA) {`
	`217`	`+ froll(fun="median", x=x, n=n, fill=fill, algo=algo, align=align, na.rm=na.rm, has.nf=has.nf, adaptive=adaptive, partial=partial, hasNA=hasNA, give.names=give.names)`
	`218`	`+}`