Skip to content

Commit b0e6ba1

Browse files
committed
first/last return 'true vectors' so dogroups.c knows not to recycle length-1 and pad with NA
1 parent c9f5507 commit b0e6ba1

File tree

5 files changed

+68
-51
lines changed

5 files changed

+68
-51
lines changed

R/last.R

Lines changed: 44 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -20,60 +20,64 @@ last = function(x, n=1L, na.rm=FALSE, ...) {
2020
stopifnot(isTRUEorFALSE(na.rm) || identical(na.rm,"row"))
2121
stopifnot(is.numeric(n), length(n)==1L, n>=0L)
2222
n = as.integer(n)
23-
.headtail = if (first) utils::head else utils::tail
24-
if (isFALSE(na.rm) || n==0L)
25-
return(.headtail(x, n=n, ...))
2623
if (is.data.frame(x)) {
2724
if (!nrow(x)) return(x)
2825
if (identical(na.rm, "row")) { # any NA on the row removes that row
2926
nna = which_(.Call(Cdt_na, x, seq_along(x)), bool=FALSE)
30-
# from na.omit.data.table without calling na.omit which would subset all non-NA rows
31-
# TODO: n and first/last could be passed to Cdt_na and it could stop after finding n
32-
nna = .headtail(nna, n=n)
33-
if (length(nna) < min(n,nrow(x))) {
34-
# to match optimized na.rm=TRUE behavior; e.g. when .SD is one column
35-
# TODO: extra argument all.na=NA|NULL (or pad.na=) could control this
36-
pad = rep.int(NA, min(n,nrow(x))-length(nna))
37-
# returning min(n,nrow(x)) is what optimized one-column does because GForce needs to be deterministic by group
38-
# currently; i.e. number of items per group doesn't depend on how many NA there are
39-
nna = if (first) c(nna, pad) else c(pad, nna)
40-
}
41-
ans = x[nna,,drop=FALSE]
42-
# DT[NA] returns NULL for list columns. TODO: change [.data.table to return NA for list columns
43-
# In the meantime, fix up the NULLs here in first/last
44-
for (col in which(vapply_1b(ans, is.list))) {
45-
for (i in which(vapply_1b(ans[[col]], is.null))) {
46-
set(ans, i, col, NA)
27+
# very similar to na.omit.data.table
28+
# TODO: n and first/last could be passed to Cdt_na and it could stop after finding n (it already does that in gsumm.c when gforce optimized)
29+
nna = .firstlastVector(nna, n=n, first=first, na.rm=FALSE)
30+
ans = .Call(CsubsetDT, x, nna, seq_along(x)) # works on DF too
31+
} else {
32+
ans = lapply(x, .firstlastVector, n=n, first=first, na.rm=na.rm)
33+
if (na.rm) {
34+
l = vapply_1i(ans, length)
35+
m = max(l)
36+
for (i in which(l<m)) {
37+
ans[[i]] = c(ans[[i]], rep(NA, m-l[i]))
4738
}
39+
# any row.names won't align to the values now in the result so don't retain them
4840
}
49-
return(ans)
50-
}
51-
# else na.rm==TRUE; select the first/last non-NA within each column
52-
ans = lapply(x, .narmVector, n=n, first=first)
53-
l = vapply_1i(ans, length)
54-
m = max(l)
55-
for (i in which(l<m)) { # pad with NA
56-
ans[[i]] = c(ans[[i]], rep(NA, m-l[i]))
5741
}
5842
if (is.data.table(x)) setDT(ans) else setDF(ans)
5943
setattr(ans, "class", class(x))
44+
if (!isTRUE(na.rm) && length(rn<-attr(x,"row.names")))
45+
setattr(ans, "row.names", if (isFALSE(na.rm)) .firstlastVector(rn, n=n, first=first, na.rm=FALSE)
46+
else rn[nna])
6047
return(ans)
6148
}
6249
if (!length(x))
6350
return(x)
64-
if (is.vector(x) && !isFALSE(na.rm))
65-
return(.narmVector(x, n=n, first=first))
66-
if (!isFALSE(na.rm))
67-
stopf("na.rm=TRUE|'row' is not currently supported for '%s'", class(x)[1L])
68-
.headtail(x, n=n, ...)
69-
# TODO when n=1, return(x[length(x)]) would save method dispatch overhead
70-
# TODO and previous version had lx = length(x); if (!lx) x else x[[lx]]. So empty input returned empty
51+
if (!is.vector(x)) {
52+
if (!isFALSE(na.rm))
53+
stopf("na.rm=TRUE|'row' is not currently supported for '%s'", class(x)[1L])
54+
return((if (first) utils::head else utils::tail)(x, n=n, ...)) # e.g. matrix
55+
}
56+
return(.firstlastVector(x, n=n, first=first, na.rm=!isFALSE(na.rm))) # !isFALSE to convert 'row' to TRUE
7157
}
7258

73-
.narmVector = function(x, n, first) {
74-
nna = which_(is.na(x) | (is.list(x) & vapply_1b(x,is.null)), bool=FALSE) # TODO: again, n and first/last could be passed to C here
75-
if (!length(nna)) x[0L]
76-
else if (n==1L) x[nna[if (first) 1L else length(nna)]]
77-
else x[(if (first) utils::head else utils::tail)(nna, n)] # TODO: avoid dispatch here and do ourselves since just a vector
59+
.firstlastVector = function(x, n, first, na.rm) {
60+
if (!length(x)) return(x)
61+
if (n==0L) return(x[0L])
62+
ans = if (na.rm) {
63+
nna = which_(if (is.list(x)) vapply_1b(x,function(y){is.null(y)||(length(y)==1L&&is.na(y))})
64+
else is.na(x), bool=FALSE) # TODO: again, n and first/last could be passed to C here
65+
if (!length(nna)) x[0L]
66+
else {y=min(n,length(nna)); x[nna[if (first) seq.int(1L,y) else seq.int(length(nna)-y+1L,length(nna))]]}
67+
} else {
68+
y=min(n,length(x)); x[if (first) seq.int(1L,y) else seq.int(length(x)-y+1L,length(x))]
69+
}
70+
if (n>1L || na.rm) # n!=length(ans)
71+
.Call("Csettruelength", ans, length(ans))
72+
# for dogroups.c to know that shorter results (including when na.rm results in a length-1) should be padded with NA to match the length of longer items
73+
# head and tail with na.rm=TRUE are by their nature returning a vector and therefore shouldn't be recycled when length-1; test 2240.81
74+
# TODO: new function pad() could be provided so user can do things like DT[, .(pad(na.omit(B)), pad(na.omit(C))), by=grp]
75+
# to avoid the error 'Supplied 2 items for column 1 of group 1 which has 3 rows ...'
76+
# and/or pad= could be added to [.data.table to allow padding all results
77+
# Since gforce_dynamic optimizes head/tail it knows to pad and that's optimized. However, default last(x) and first(x) (i.e. n=1 na.rm=FALSE) are
78+
# single-valued like mean,median etc and are recycled in the same way. This is consistent with n=1 na.rm=FALSE already not being treated as
79+
# gforce_dynamic in gsumm.c either.
80+
# ***** TODO *****: n=1 na.rm=TRUE is not vector result and should be recycled to be consistent with n=1 na.rm=FALSE. It's just n>1 that is true vector then.
81+
ans
7882
}
7983

inst/tests/tests.Rraw

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18865,14 +18865,14 @@ for (opt in c(0, Inf)) {
1886518865
output=out)
1886618866
test(test_no+.22, DT[, lapply(.SD, first, na.rm=TRUE), by=grp, verbose=TRUE], ans, output=out)
1886718867
test(test_no+.23, DT[, first(.SD, na.rm='row'), by=grp, verbose=TRUE],
18868-
data.table(grp=1:4, A=c(NA,2L,NA,3L), B=c(NA,3,NA,pi), C=c(NA,"b",NA,"d"), D=c(NA,3i,NA,4i), E=list(NA,c("a","b"),NA,3:4)),
18868+
data.table(grp=c(2L,4L), A=c(2L,3L), B=c(3,pi), C=c("b","d"), D=c(3i,4i), E=list(c("a","b"),3:4)),
1886918869
notOutput="GForce optimized") # TODO: could try to implement gforce optimized na.rm='row' in future
1887018870
test(test_no+.24, DT[, last(.SD, na.rm=TRUE), by=grp, verbose=TRUE],
1887118871
ans<-data.table(grp=1:4, A=c(1L,2L,NA,3L), B=c(1,3,3,pi), C=c("a","b","c","d"), D=c(1i,3i,NA,4i), E=list(NA,c("a","b"),list(1:2),3:4)),
1887218872
output=out)
1887318873
test(test_no+.25, DT[, lapply(.SD, last, na.rm=TRUE), by=grp, verbose=TRUE], ans, output=out)
1887418874
test(test_no+.26, DT[, last(.SD, na.rm='row'), by=grp, verbose=TRUE],
18875-
data.table(grp=1:4, A=c(NA,2L,NA,3L), B=c(NA,3,NA,pi), C=c(NA,"b",NA,"d"), D=c(NA,3i,NA,4i), E=list(NA,c("a","b"),NA,3:4)),
18875+
data.table(grp=c(2L,4L), A=c(2L,3L), B=c(3,pi), C=c("b","d"), D=c(3i,4i), E=list(c("a","b"),3:4)),
1887618876
output="GForce FALSE")
1887718877
test(test_no+.27, DT[, .(last(A,na.rm=TRUE), first(B, na.rm=TRUE), last(C)), by=grp, verbose=TRUE],
1887818878
data.table(grp=1:4, V1=c(1L, 2L, NA, 3L), V2=c(1,pi,3,pi), V3=c(NA,NA,"c",NA)),
@@ -18886,13 +18886,13 @@ for (opt in c(0, Inf)) {
1888618886
data.table(grp=INT(1,2,2,3,4,4), A=INT(1,2,NA,NA,3,3), B=c(1,pi,3,3,pi,NA), C=c("a","b",NA,"c","d",NA), D=c(1i,2i,3i,NA,4i,NA), E=list(NA,c("a","b"),NA,list(1:2),3:4,NA)),
1888718887
output=out)
1888818888
test(test_no+.42, DT[, last(.SD, na.rm="row", n=2), by=grp, verbose=TRUE],
18889-
data.table(grp=INT(1,1,2,2,3,4,4), A=c(NA,NA,NA,2L,NA,NA,3L), B=c(NA,NA,NA,3,NA,NA,pi), C=c(NA,NA,NA,"b",NA,NA,"d"), D=c(NA,NA,NA,3i,NA,NA,4i), E=list(NA,NA,NA,c("a","b"),NA,NA,3:4)),
18889+
data.table(grp=INT(2,4), A=c(2L,3L), B=c(3,pi), C=c("b","d"), D=c(3i,4i), E=list(c("a","b"),3:4)),
1889018890
output="GForce FALSE")
1889118891
test(test_no+.43, DT[, first(.SD, na.rm=TRUE, n=2), by=grp, verbose=TRUE],
1889218892
data.table(grp=INT(1,2,2,3,4,4), A=INT(1,2,NA,NA,3,3), B=c(1,pi,3,3,pi,NA), C=c("a","b",NA,"c","d",NA), D=c(1i,2i,3i,NA,4i,NA), E=list(NA,c("a","b"),NA,list(1:2),3:4,NA)),
1889318893
output=out)
1889418894
test(test_no+.44, DT[, first(.SD, na.rm="row", n=2), by=grp, verbose=TRUE],
18895-
data.table(grp=INT(1,1,2,2,3,4,4), A=c(NA,NA,2L,NA,NA,3L,NA), B=c(NA,NA,3,NA,NA,pi,NA), C=c(NA,NA,"b",NA,NA,"d",NA), D=c(NA,NA,3i,NA,NA,4i,NA), E=list(NA,NA,c("a","b"),NA,NA,3:4,NA)),
18895+
data.table(grp=INT(2,4), A=c(2L,3L), B=c(3,pi), C=c("b","d"), D=c(3i,4i), E=list(c("a","b"),3:4)),
1889618896
output="GForce FALSE")
1889718897
test(test_no+.51, DT[, first(.SD, na.rm=TRUE, n=0), by=grp, verbose=TRUE], DT[0], output="GForce FALSE")
1889818898
test(test_no+.52, DT[, last(.SD, na.rm=TRUE, n=0), by=grp, verbose=TRUE], DT[0], output="GForce FALSE")
@@ -18913,9 +18913,8 @@ for (opt in c(0, Inf)) {
1891318913
test(test_no+.74, DT[,last(A, na.rm=NA),by=grp,verbose=TRUE], error="na.rm", output=out)
1891418914

1891518915
# aligning two gforce dynamic columns the same between optimized and unoptimized
18916-
# needs to be top aligned otherwise dogroups.c would need knowledge of R first/last (last was aligned at the bottom in the PR before this test added)
18917-
# but outstanding is that dogroups.c recycles length-1 whereas gforce_dynamic currently NA fills with vector output from
18918-
# first/last n>1 na.rm=TRUE in mind. So this test fails when not optimized (2240.81) currently.
18916+
# needs to be top aligned otherwise dogroups.c would need knowledge of whether first or last was called (last was aligned at the bottom in the PR before
18917+
# this test added). dogroups.c now just needs to know whether the vector is a true vector to know not to recycle length-1 and to pad with NA
1891918918
test(test_no+.81, DT[, .(first(B, n=2, na.rm=TRUE), last(C, na.rm=TRUE)), by=grp, verbose=TRUE],
1892018919
data.table(grp=INT(1,2,2,3,4), V1=c(1,pi,3,3,pi), V2=c("a","b",NA,"c","d")),
1892118920
output=out)

src/assign.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,12 @@ SEXP truelength(SEXP x) {
277277
return ScalarInteger(isNull(x) ? 0 : TRUELENGTH(x));
278278
}
279279

280+
SEXP settruelength(SEXP x, SEXP n) {
281+
// currently just for first/last and dogroups.c; see comments at the end of last.R
282+
SET_TRUELENGTH(x, INTEGER(n)[0]);
283+
return R_NilValue;
284+
}
285+
280286
SEXP selfrefokwrapper(SEXP x, SEXP verbose) {
281287
return ScalarInteger(_selfrefok(x,FALSE,LOGICAL(verbose)[0]));
282288
}

src/dogroups.c

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -414,15 +414,21 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX
414414
// thislen>0
415415
if (TYPEOF(source) != TYPEOF(target))
416416
error(_("Column %d of result for group %d is type '%s' but expecting type '%s'. Column types must be consistent for each group."), j+1, i+1, type2char(TYPEOF(source)), type2char(TYPEOF(target)));
417-
if (thislen>1 && thislen!=maxn && grpn>0) { // grpn>0 for grouping empty tables; test 1986
418-
error(_("Supplied %d items for column %d of group %d which has %d rows. The RHS length must either be 1 (single values are ok) or match the LHS length exactly. If you wish to 'recycle' the RHS please use rep() explicitly to make this intent clear to readers of your code."), thislen, j+1, i+1, maxn);
419-
}
420417
bool copied = false;
421418
if (isNewList(target) && anySpecialStatic(source)) { // see comments in anySpecialStatic()
422419
source = PROTECT(copyAsPlain(source));
423420
copied = true;
424421
}
425-
memrecycle(target, R_NilValue, thisansloc, maxn, source, 0, -1, 0, "");
422+
if (TRUELENGTH(source)==LENGTH(source)) {
423+
// first() and last() set truelength to mark that it is a true vector; see comments at the end of last.R and test 2240.81
424+
// a true vector is not recycled when length-1 and is padded with NA to match the length of the longest result
425+
memrecycle(target, R_NilValue, thisansloc, thislen, source, 0, -1, 0, ""); // just using memrecycle to copy contents
426+
writeNA(target, thisansloc+thislen, maxn-thislen, true); // pad with NA
427+
} else {
428+
if (thislen>1 && thislen!=maxn && grpn>0) // grpn>0 for grouping empty tables; test 1986
429+
error(_("Supplied %d items for column %d of group %d which has %d rows. The RHS length must either be 1 (single values are ok) or match the LHS length exactly. If you wish to 'recycle' the RHS please use rep() explicitly to make this intent clear to readers of your code."), thislen, j+1, i+1, maxn);
430+
memrecycle(target, R_NilValue, thisansloc, maxn, source, 0, -1, 0, "");
431+
}
426432
if (copied) UNPROTECT(1);
427433
}
428434
}

src/init.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ SEXP shallowwrapper();
5454
SEXP alloccolwrapper();
5555
SEXP selfrefokwrapper();
5656
SEXP truelength();
57+
SEXP settruelength();
5758
SEXP setcharvec();
5859
SEXP setcolorder();
5960
SEXP chmatch_R();
@@ -148,6 +149,7 @@ R_CallMethodDef callMethods[] = {
148149
{"Calloccolwrapper", (DL_FUNC) &alloccolwrapper, -1},
149150
{"Cselfrefokwrapper", (DL_FUNC) &selfrefokwrapper, -1},
150151
{"Ctruelength", (DL_FUNC) &truelength, -1},
152+
{"Csettruelength", (DL_FUNC) &settruelength, -1},
151153
{"Csetcharvec", (DL_FUNC) &setcharvec, -1},
152154
{"Csetcolorder", (DL_FUNC) &setcolorder, -1},
153155
{"Cchmatch", (DL_FUNC) &chmatch_R, -1},

0 commit comments

Comments
 (0)