Skip to content

Commit 6f225f3

Browse files
authored
frollapply simplify smarter, handles median well (#7314)
1 parent 88635ad commit 6f225f3

File tree

2 files changed

+99
-69
lines changed

2 files changed

+99
-69
lines changed

R/frollapply.R

Lines changed: 67 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -1,49 +1,58 @@
11
## ansmask is to handle leading values from fill to match type of the ans
22
simplifylist = function(x, fill, ansmask) {
3-
l = lengths(x)
4-
ul = unique(l)
5-
if (length(ul)!=1L) ## different lenghts
6-
return(x)
7-
t = vapply_1c(x, typeof, use.names=FALSE)
8-
ut = unique(t)
9-
if (length(ut)==2L) {
10-
all.ut = ut
11-
t = vapply_1c(x[ansmask], typeof, use.names=FALSE)
12-
ut = unique(t)
13-
if (length(ut)!=1L)
14-
return(x) ## different typeof even excluding fill, a FUN was not type stable
15-
if (!(ut=="integer"||ut=="logical"||ut=="double"||ut=="complex"||ut=="character"||ut=="raw"))
16-
return(x) ## ans is not atomic
17-
if (identical(fill, NA)) { ## different typeof, try to handle fill=NA logical type
18-
filli = which(!ansmask)
19-
ans1 = x[[which.first(ansmask)]]
20-
x[filli] = rep_len(list(ans1[NA]), length(filli)) ## this will recycle to length of ans1
21-
} else if (all(c("integer","double") %in% all.ut)) { ## typeof numeric and int, fill is coerced to the type FUN
22-
filli = which(!ansmask)
23-
cast = if (ut=="double") as.numeric else as.integer
24-
x[filli] = rep_len(list(cast(fill)), length(filli))
25-
} else { ## length == 2L but no easy way to match type
26-
return(x)
3+
all.t = vapply_1c(x, typeof, use.names=FALSE)
4+
all.ut = unique(all.t)
5+
if (length(all.ut) > 1L) {
6+
ans.t = vapply_1c(x[ansmask], typeof, use.names=FALSE)
7+
ans.ut = unique(ans.t)
8+
## ans postprocessing to match types
9+
if ((length(ans.ut) == 2L) && all(ans.ut %in% c("double","integer","logical"))) { ## align numeric and integer when function is not type stable: median #7313
10+
if ("double" %in% ans.ut) {
11+
if ("integer" %in% ans.ut)
12+
x[ansmask & all.t=="integer"] = lapply(x[ansmask & all.t=="integer"], as.numeric) ## coerce integer to double
13+
if ("logical" %in% ans.ut)
14+
x[ansmask & all.t=="logical"] = lapply(x[ansmask & all.t=="logical"], as.numeric) ## coerce logical to double
15+
ans.ut = "double"
16+
} else if ("integer" %in% ans.ut) {
17+
if ("logical" %in% ans.ut)
18+
x[ansmask & all.t=="logical"] = lapply(x[ansmask & all.t=="logical"], as.integer) ## coerce logical to integer
19+
else
20+
internal_error("simplifylist aligning return types, at that place there should have been some logical types in the answer") # nocov
21+
ans.ut = "integer"
22+
}
23+
}
24+
## file postprocessing to match types
25+
if (length(ans.ut) == 1L && equal.lengths(x[ansmask])) {
26+
if (identical(fill, NA)) { ## different typeof of ans and default fill=NA and lengths of ans equal
27+
filli = which(!ansmask)
28+
ans1 = x[[which.first(ansmask)]] ## first ans from full window
29+
x[filli] = rep_len(list(ans1[NA]), length(filli)) ## this will make NA of matching type to ans1 and recycle for all filli
30+
all.ut = ans.ut
31+
} else if (typeof(fill) != ans.ut && all(c(typeof(fill), ans.ut) %in% c("double","integer","logical"))) { ## fill=-2, ans=1L
32+
filli = which(!ansmask)
33+
cast = switch(ans.ut, double = as.numeric, integer = as.integer, logical = as.logical)
34+
x[filli] = rep_len(list(cast(fill)), length(filli))
35+
all.ut = ans.ut
36+
}
2737
}
28-
} else if (length(ut)>2L) { ## unique typeof length > 2L
29-
return(x)
3038
}
31-
if (ut=="integer"||ut=="logical"||ut=="double"||ut=="complex"||ut=="character"||ut=="raw") {
32-
if (ul==1L) ## length 1
39+
all.ut = unique(vapply_1c(x, typeof, use.names=FALSE))
40+
if ((length(all.ut) == 1L) && all(all.ut %in% c("integer","logical","double","complex","character","raw"))) {
41+
if (identical(unique(lengths(x)), 1L)) { ## length 1
3342
return(unlist(x, recursive=FALSE, use.names=FALSE))
34-
else ## length 2+
43+
} else if (equal.lengths(x)) { ## length 2+ and equal
3544
return(rbindlist(lapply(x, as.list)))
36-
} else if (ut=="list") {
37-
if (all(vapply_1b(x, is.data.frame, use.names=FALSE))) ## list(data.table(...), data.table(...))
38-
return(rbindlist(x))
39-
ll = lapply(x, lengths) ## length of each column of each x
40-
ull = unique(ll)
41-
if (length(ull)==1L) ## list(list(1:2, 1:2), list(2:3, 2:3))
45+
}
46+
} else if (identical(all.ut, "list")) {
47+
if (all_data.frame(x)) ## list(data.table(...), data.table(...))
4248
return(rbindlist(x))
43-
lu = function(x) length(unique(x))
44-
if (all(vapply_1i(ull, lu, use.names=FALSE)==1L)) ## within each x column lengths the same, each could be DF: list(list(1, 2), list(1:2, 2:3))
49+
if (equal.lengths(x)) ## same length lists: list(list(1:2, 1:2), list(2:3, 2:3))
4550
return(rbindlist(x))
46-
} ## else NULL, closure, special, builtin, environment, S4, ...
51+
}
52+
## not simplified, return as is
53+
# not length stable
54+
# not type stable
55+
# NULL, closure, special, builtin, environment, S4, ...
4756
x
4857
}
4958

@@ -86,6 +95,8 @@ frollapply = function(X, N, FUN, ..., by.column=TRUE, fill=NA, align=c("right","
8695
align = match.arg(align)
8796
FUN = match.fun(FUN)
8897
verbose = getOption("datatable.verbose")
98+
if (!length(X))
99+
return(vector(mode=typeof(X), length=0L))
89100
if (give.names)
90101
orig = list(N=N, adaptive=adaptive)
91102

@@ -284,26 +295,31 @@ frollapply = function(X, N, FUN, ..., by.column=TRUE, fill=NA, align=c("right","
284295
cat("frollapply running on single CPU thread\n")
285296
ans = vector("list", nx*nn)
286297
## vectorized x
298+
287299
for (i in seq_len(nx)) {
288300
thisx = X[[i]]
289301
thislen = len[i]
290-
if (!thislen)
291-
next
292-
if (!use.fork0) {
293-
use.fork = use.fork0
294-
} else {
295-
# throttle
296-
DTths = getDTthreadsC(thislen, TRUE)
297-
use.fork = DTths > 1L
298-
if (verbose) {
299-
if (DTths < DTths0)
300-
catf("frollapply run on %d CPU threads throttled to %d threads, input length %d\n", DTths0, DTths, thislen)
301-
else
302-
catf("frollapply running on %d CPU threads\n", DTths)
302+
if (thislen) {
303+
if (!use.fork0) {
304+
use.fork = use.fork0
305+
} else {
306+
# throttle
307+
DTths = getDTthreadsC(thislen, TRUE)
308+
use.fork = DTths > 1L
309+
if (verbose) {
310+
if (DTths < DTths0)
311+
catf("frollapply run on %d CPU threads throttled to %d threads, input length %d\n", DTths0, DTths, thislen)
312+
else
313+
catf("frollapply running on %d CPU threads\n", DTths)
314+
}
303315
}
304316
}
305317
## vectorized n
306318
for (j in seq_len(nn)) {
319+
if (!thislen) {
320+
ans[[(i-1L)*nn+j]] = vector(mode=typeof(thisx), length=0L)
321+
next
322+
}
307323
thisn = N[[j]]
308324
w = allocWindow(thisx, thisn) ## prepare window, handles adaptive
309325
ansmask = ansMask(thislen, thisn)

inst/tests/froll.Rraw

Lines changed: 32 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1565,21 +1565,24 @@ if (getDTthreads()>1L) { ## check for consistency
15651565
}
15661566

15671567
#### corner cases from examples - handled properly after frollapply rewrite to R
1568-
test(6010.101, frollapply(1:5, 3, function(x) head(x, 2)), list(NA, NA, 1:2, 2:3, 3:4))
1568+
test(6010.101, frollapply(1:5, 3, function(x) head(x, 2)), data.table(V1 = c(NA, NA, 1L, 2L, 3L), V2 = c(NA, NA, 2L, 3L, 4L)))
15691569
f = function(x) {
1570-
n = length(x)
1571-
# length 1 will be returned only for first iteration where we check length
1572-
if (n==x[n]) x[1L] else range(x) # range(x)[2L] is silently ignored
1570+
n = length(x) # length 1 will be returned only for first iteration
1571+
if (n==x[n]) x[1L] else range(x)
15731572
}
15741573
test(6010.102, frollapply(1:5, 3, f), list(NA,NA,1L,c(2L,4L),c(3L,5L)))
1574+
f = function(x) {
1575+
n = length(x) # length 1 will be returned only for last iteration
1576+
if (n==x[n]) range(x) else x[1L]
1577+
}
1578+
test(6010.1021, frollapply(1:5, 3, f), list(NA,NA,c(1L,3L),2L,3L))
15751579
test(6010.103, frollapply(c(1,2,1,1,1,2,3,2), 3, uniqueN), c(NA,NA,2L,2L,1L,2L,3L,2L))
15761580
test(6010.104, frollapply(c(1,2,1,1,NA,2,NA,2), 3, anyNA), c(NA,NA,FALSE,FALSE,TRUE,TRUE,TRUE,TRUE))
15771581
f = function(x) {
1578-
n = length(x)
1579-
# double type will be returned only for first iteration where we check type
1580-
if (n==x[n]) 1 else NA # NA logical turns into garbage without coercion to double
1582+
n = length(x) # double type will be returned only for first iteration
1583+
if (n==x[n]) 1 else NA # NA logical coerced properly
15811584
}
1582-
test(6010.105, head(frollapply(1:5, 3, f), 3L), list(NA, NA, 1))
1585+
test(6010.105, frollapply(1:5, 3, f), c(NA,NA,1,NA,NA))
15831586

15841587
## partial
15851588
x = 1:6/2
@@ -1615,7 +1618,7 @@ test(6010.2014, frollapply(1:5, rep(3, 5), toString, adaptive=TRUE), c(NA,NA,"1,
16151618
test(6010.2015, frollapply(1:2, 1:2, mean, adaptive=TRUE, align="right"), c(1, 1.5))
16161619
test(6010.2016, frollapply(1:2, 1:2, mean, adaptive=TRUE, align="center"), error="using adaptive TRUE and align 'center' is not implemented")
16171620
test(6010.2017, frollapply(list(1:2, 1:3), list(1:2), mean, adaptive=TRUE), error="adaptive rolling function can only process 'X' having equal length of elements; If you want to call rolling function on list having variable length of elements call it for each field separately")
1618-
test(6010.2018, frollapply(1:5, rep(3, 5), function(x) head(x, 2), adaptive=TRUE), list(NA, NA, 1:2, 2:3, 3:4))
1621+
test(6010.2018, frollapply(1:5, rep(3, 5), function(x) head(x, 2), adaptive=TRUE), data.table(V1 = c(NA, NA, 1L, 2L, 3L), V2 = c(NA, NA, 2L, 3L, 4L)))
16191622
test(6010.2019, frollapply(1:10, list(1:5), mean, adaptive=TRUE), error="length of integer vector(s) provided as list to 'N' argument must be equal to number of observations provided in 'X'")
16201623
test(6010.202, frollapply(1:10, 1:5, mean, adaptive=TRUE), error="length of integer vector(s) provided as list to 'N' argument must be equal to number of observations provided in 'X'")
16211624
options(datatable.verbose=TRUE)
@@ -1652,7 +1655,9 @@ test(6010.522, frollapply(c(1:3,NA,5:6), 4L, sum), rep(NA_integer_,6))
16521655
test(6010.523, frollapply(c(1:3,NA,5:6), 4L, sum, na.rm=TRUE), c(NA,NA,NA,6L,10L,14L))
16531656
test(6010.524, frollapply(c(1,2,3,NA,NA,NA,NA), 3L, mean), c(NA,NA,2,NA,NA,NA,NA))
16541657
test(6010.525, frollapply(c(1,2,3,NA,NA,NA,NA), 3L, mean, na.rm=TRUE), c(NA,NA,2,2.5,3,NaN,NaN))
1655-
test(6010.526, frollapply(numeric(), 3L, sum), list())
1658+
test(6010.526, frollapply(numeric(), 3L, sum), numeric())
1659+
test(6010.5261, frollapply(integer(), 3L, sum), integer())
1660+
test(6010.5262, frollapply(logical(), 3L, sum), logical())
16561661
test(6010.527, frollapply(1:5, 3L, toString), c(NA, NA, "1, 2, 3", "2, 3, 4", "3, 4, 5"))
16571662
ma = function(x, n, na.rm=FALSE) {
16581663
ans = rep(NA_real_, nx<-length(x))
@@ -1705,9 +1710,9 @@ test(6010.6062, frollapply(as.list(x), rep(3,5), function(x) c(length(x[[1L]]),
17051710
test(6010.607, frollapply(list(), 3, identity, by.column=FALSE), list())
17061711
test(6010.608, frollapply(list(numeric(), numeric()), 3, identity, by.column=FALSE), list())
17071712
test(6010.609, frollapply(list(numeric(), 1:3), 3, identity, by.column=FALSE), error="all vectors must have equal lengths")
1708-
test(6010.610, frollapply(numeric(), 3, identity), list())
1709-
test(6010.611, frollapply(list(numeric(), numeric()), 3, identity), list(NULL,NULL))
1710-
test(6010.612, frollapply(list(numeric(), 1:3), 3, identity), list(NULL, list(NA,NA,1:3)))
1713+
test(6010.610, frollapply(numeric(), 3, identity), numeric())
1714+
test(6010.611, frollapply(list(numeric(), numeric()), 3, identity), list(numeric(),numeric()))
1715+
test(6010.612, frollapply(list(numeric(), 1:3), 3, identity), list(numeric(), data.table(c(NA,NA,1L),c(NA,NA,2L),c(NA,NA,3L))))
17111716

17121717
## codecov memcpy calls #7304
17131718
x = data.table(v1=1:2, v2=c(1,2), v3=c("a","b"), v4=list(1,2), v5=as.complex(1:2), v6=as.raw(1:2))
@@ -1763,7 +1768,7 @@ rm(X, ans, n)
17631768
test(6010.701, frollapply(1:5, 2, sum), c(NA,3L,5L,7L,9L))
17641769
test(6010.702, frollapply(1:5, 2, sum, simplify=unlist), c(NA,3L,5L,7L,9L))
17651770
test(6010.703, frollapply(1:5, 2, sum, simplify=FALSE), list(NA,3L,5L,7L,9L))
1766-
test(6010.704, frollapply(1:5, 2, range), list(NA,1:2,2:3,3:4,4:5)) ## fill=NA could possibly be recycled to length of FUN results
1771+
test(6010.704, frollapply(1:5, 2, range), data.table(c(NA,1:4),c(NA,2:5))) ## fill=NA could possibly be recycled to length of FUN results, it is now #7313
17671772
test(6010.705, frollapply(1:5, 2, range, simplify=FALSE), list(NA,1:2,2:3,3:4,4:5))
17681773
test(6010.706, frollapply(1:5, 2, range, fill=c(NA_integer_,NA_integer_)), data.table(V1=c(NA,1:4), V2=c(NA,2:5)))
17691774
test(6010.707, frollapply(1:5, 2, range, fill=c(min=NA_integer_, max=NA_integer_)), data.table(min=c(NA,1:4), max=c(NA,2:5)))
@@ -1773,6 +1778,15 @@ test(6010.710, frollapply(1:5, 2, function(x) as.list(range(x)), fill=list(min=N
17731778
test(6010.711, frollapply(1:5, 2, function(x) as.list(range(x)), fill=list(NA_integer_, NA_integer_), simplify=FALSE), list(list(NA_integer_, NA_integer_), as.list(1:2), as.list(2:3), as.list(3:4), as.list(4:5)))
17741779
test(6010.712, as.null(frollapply(1:3, 1, function(x) if (x==1L) sum else if (x==2L) mean else `[`, simplify=TRUE)), NULL) ## as.null as we are only interested in codecov here
17751780
test(6010.713, as.null(frollapply(1:3, 1, function(x) `[`, simplify = TRUE)), NULL) ## as.null as we are only interested in codecov here
1781+
# frollapply simplifylist could be more smart about median results #7313
1782+
test(6010.751, frollapply(FUN=median, adaptive=TRUE, list(1:3,2:4), list(c(2,0,2), c(0,2,0))), list(c(NA,NA_real_,2.5), c(NA_real_,1.5,NA_real_), c(NA,NA_real_,3.5), c(NA_real_,2.5,NA_real_)))
1783+
test(6010.752, frollapply(FUN=median, adaptive=TRUE, 1:3, c(2,0,2), fill=99), c(99,NA_real_,2.5))
1784+
test(6010.753, frollapply(FUN=median, adaptive=TRUE, c(1L,2L,4L), c(2,0,2), fill=99L), c(99,NA_real_,3))
1785+
test(6010.754, frollapply(FUN=median, adaptive=TRUE, c(1L,2L,3L), c(2,0,2), fill=99), c(99,NA_real_,2.5))
1786+
test(6010.755, frollapply(1:2, 1, function(i) if (i==1L) 1L else FALSE), c(1L,0L))
1787+
test(6010.756, frollapply(1:3, 2, fill=9, function(i) if (i[1L]==1L) 1L else FALSE), c(9L,1L,0L)) ## matches fun answer
1788+
test(6010.757, frollapply(1:3, 2, fill=9L, function(i) if (i[1L]==1L) 1 else FALSE), c(9,1,0)) ## matches fun answer
1789+
test(6010.758, frollapply(1:3, 2, fill=0, function(i) TRUE), c(FALSE,TRUE,TRUE)) ## matches fun answer
17761790

17771791
#### mutlithreading throttle caveats from manual: copy, fixing .internal.selfref
17781792
use.fork = .Platform$OS.type!="windows" && getDTthreads()>1L
@@ -1785,8 +1799,8 @@ if (use.fork) {
17851799
test(6010.763, frollapply(c(1, 9), N=1L, FUN=identity), c(1,9)) ## good only because threads >= input
17861800
test(6010.764, frollapply(c(1, 5, 9), N=1L, FUN=identity), c(5,5,9)) ## unexpected again
17871801
is.ok = function(x) {stopifnot(is.data.table(x)); capture.output(print(attr(x, ".internal.selfref", TRUE)))!="<pointer: (nil)>"}
1788-
ans = frollapply(1:2, 2, data.table) ## default: fill=NA
1789-
test(6010.770, is.ok(ans[[2L]])) ## mismatch of 'fill' type so simplify=TRUE did not run rbindlist but frollapply detected DT and fixed
1802+
ans = frollapply(1:2, 2, data.table, simplify=FALSE) ## default: fill=NA
1803+
test(6010.770, is.ok(ans[[2L]])) ## frollapply detected DT and fixed
17901804
ans = frollapply(1:2, 2, data.table, fill=data.table(NA)) ## fill type match
17911805
test(6010.771, is.ok(ans)) ## simplify=TRUE did run rbindlist, but frollapply fixed anyway
17921806
ans = frollapply(1:2, 2, data.table, fill=data.table(NA), simplify=FALSE)
@@ -1892,8 +1906,8 @@ test(6010.9968, frollapply(FUN=sum, list(c(1,2,3)), list(n1=c(2,2,2)), adaptive=
18921906
test(6010.9969, frollapply(FUN=sum, list(x1=c(1,2,3)), list(n1=c(2,2,2)), adaptive=TRUE, partial=TRUE, give.names=TRUE), list(x1_n1=c(1,3,5)))
18931907

18941908
# frollapply doesn't handle zero-length output #7054
1895-
test(6010.9991, frollapply(list(integer()), 0, function(x) 1), list(NULL))
1896-
test(6010.9992, frollapply(list(integer()), list(integer()), str, adaptive=TRUE), list(NULL))
1909+
test(6010.9991, frollapply(list(integer()), 0, function(x) 1), list(integer()))
1910+
test(6010.9992, frollapply(list(integer()), list(integer()), str, adaptive=TRUE), list(integer()))
18971911

18981912
## frolladapt
18991913
test(6015.000, frolladapt(1:3, 2, align="center"), error="'align' other than 'right' has not yet been implemented")

0 commit comments

Comments
 (0)