Skip to content

Commit 94eb6ed

Browse files
committed
Added gforce_dynamic attrib returned by gfirstlast and gshift to gforce at C level which now reps grpcols instead of R level
1 parent c28786a commit 94eb6ed

File tree

8 files changed

+245
-127
lines changed

8 files changed

+245
-127
lines changed

NEWS.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -622,6 +622,8 @@
622622

623623
53. `as.data.frame(DT, row.names=)` no longer silently ignores `row.names`, [#5319](https://github.com/Rdatatable/data.table/issues/5319). Thanks to @dereckdemezquita for the fix and PR, and @ben-schwen for guidance.
624624

625+
54. An empty result in one column by group would be correctly filled with `NA` with warning: `Item <n> of j's result for group <n> is zero length. This will be filled with <n> NAs to match the longest column in this result. Later groups may have a similar problem but only the first is reported to save filling the warning buffer.` However, this was not true for type `list` where the fill value was `NULL`. Empty list results are now filled with `NA`.
626+
625627
## NOTES
626628

627629
1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.<type>()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example :

R/data.table.R

Lines changed: 5 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1863,43 +1863,8 @@ replace_dot_alias = function(e) {
18631863
assign(".N", len__, thisEnv) # For #334
18641864
#fix for #1683
18651865
if (use.I) assign(".I", seq_len(nrow(x)), thisEnv)
1866-
ans = gforce(thisEnv, jsub, o__, f__, len__, irows) # irows needed for #971.
1867-
gi = if (length(o__)) o__[f__] else f__
1868-
g = lapply(grpcols, function(i) groups[[i]][gi])
1869-
1870-
# returns all rows instead of one per group
1871-
nrow_funs = c("gshift")
1872-
.is_nrows = function(q) {
1873-
if (!is.call(q)) return(FALSE)
1874-
if (q[[1L]] == "list") {
1875-
any(vapply(q, .is_nrows, FALSE))
1876-
} else {
1877-
q[[1L]] %chin% nrow_funs
1878-
}
1879-
}
1880-
1881-
# ghead/gtail/gfirst/glast(n) support for n>1 #5060 #523 #5168
1882-
qn = 0L
1883-
if (!is.symbol(jsub)) {
1884-
headTail_arg = function(q) {
1885-
if (length(q)<3L || !(q1=q[[1L]]) %chin% c("ghead", "gtail", "gfirst", "glast")) return(0L)
1886-
q = match.call(get(q1), q)
1887-
qn = q[["n"]] # not q$n because partial argument matching matches to na.rm when n isn't present
1888-
if (length(qn)==1L && is.numeric(qn) && qn!=1L) qn else 0L
1889-
}
1890-
if (jsub[[1L]] == "list"){
1891-
qn = max(sapply(jsub, headTail_arg))
1892-
} else if (length(jsub)>=3L) {
1893-
qn = headTail_arg(jsub)
1894-
}
1895-
}
1896-
if (qn > 0L) {
1897-
grplens = pmin.int(qn, len__)
1898-
g = lapply(g, rep.int, times=grplens)
1899-
} else if (.is_nrows(jsub)) {
1900-
g = lapply(g, rep.int, times=len__)
1901-
}
1902-
ans = c(g, ans)
1866+
ans = gforce(thisEnv, jsub, o__, f__, len__, irows, # irows needed for #971
1867+
.Call(CsubsetVector, groups, grpcols)) # just a list() subset to make C level neater; doesn't copy column contents
19031868
} else {
19041869
ans = .Call(Cdogroups, x, xcols, groups, grpcols, jiscols, xjiscols, grporder, o__, f__, len__, jsub, SDenv, cols, newnames, !missing(on), verbose)
19051870
}
@@ -1922,9 +1887,9 @@ replace_dot_alias = function(e) {
19221887
# TO DO: setkey could mark the key whether it is unique or not.
19231888
if (!is.null(lhs)) {
19241889
if (GForce) { # GForce should work with := #1414
1925-
vlen = length(ans[[1L]])
1890+
vlen = length(ans[[1L]]) # TODO: this might be ngrp when na.rm=TRUE and one group has 2 and another 0, so needs enhancing here (by passing all-1 back from gans?)
19261891
# replicate vals if GForce returns 1 value per group
1927-
jvals = if (vlen==length(len__)) lapply(tail(ans, -length(g)), rep, times=len__) else tail(ans, -length(g)) # see comment in #4245 for why rep instead of rep.int
1892+
jvals = if (vlen==length(len__)) lapply(tail(ans, -length(grpcols)), rep, times=len__) else tail(ans, -length(grpcols)) # see comment in #4245 for why rep instead of rep.int
19281893
jrows = vecseq(f__,len__,NULL)
19291894
if (length(o__)) jrows = o__[jrows]
19301895
if (length(irows)) jrows = irows[jrows]
@@ -3039,7 +3004,7 @@ gshift = function(x, n=1L, fill=NA, type=c("lag", "lead", "shift", "cyclic")) {
30393004
stopifnot(is.numeric(n))
30403005
.Call(Cgshift, x, as.integer(n), fill, type)
30413006
}
3042-
gforce = function(env, jsub, o, f, l, rows) .Call(Cgforce, env, jsub, o, f, l, rows)
3007+
gforce = function(env, jsub, o, f, l, rows, grpcols) .Call(Cgforce, env, jsub, o, f, l, rows, grpcols)
30433008

30443009
.prepareFastSubset = function(isub, x, enclos, notjoin, verbose = FALSE){
30453010
## helper that decides, whether a fast binary search can be performed, if i is a call

R/last.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ last = function(x, n=1L, na.rm=FALSE, ...) {
5151
# else na.rm==TRUE; select the first/last non-NA within each column
5252
ans = lapply(x, .narmVector, n=n, first=first)
5353
l = vapply_1i(ans, length)
54-
m = min(n, nrow(x))
54+
m = max(l)
5555
for (i in which(l<m)) { # pad with NA
5656
ans[[i]] = if (first) c(ans[[i]], rep(NA, m-l[i]))
5757
else c(rep(NA, m-l[i]), ans[[i]])
@@ -73,7 +73,7 @@ last = function(x, n=1L, na.rm=FALSE, ...) {
7373

7474
.narmVector = function(x, n, first) {
7575
nna = which_(is.na(x) | (is.list(x) & vapply_1b(x,is.null)), bool=FALSE) # TODO: again, n and first/last could be passed to C here
76-
if (!length(nna)) if (is.list(x)) list(NA) else x[NA_integer_]
76+
if (!length(nna)) x[0L]
7777
else if (n==1L) x[nna[if (first) 1L else length(nna)]]
7878
else x[(if (first) utils::head else utils::tail)(nna, n)] # TODO: avoid dispatch here and do ourselves since just a vector
7979
}

inst/tests/tests.Rraw

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18801,11 +18801,11 @@ DT = data.table( grp = INT(1,1, 2,2,2, 3, 4,4),
1880118801
# all the results instead may reveal mistakes and be good but then might be inflexible to adding more cases.
1880218802
FNNA = function(x) { # First Not NA
1880318803
nna = which(if (is.list(x)) !is.na(x) & !sapply(x,is.null) else !is.na(x))
18804-
if (length(nna)) x[nna[1L]] else if (is.list(x)) list(NA) else x[NA_integer_]
18804+
if (length(nna)) x[nna[1L]] else x[0L]
1880518805
}
1880618806
LNNA = function(x) { # Last Not NA
1880718807
nna = which(if (is.list(x)) !is.na(x) & !sapply(x,is.null) else !is.na(x))
18808-
if (length(nna)) x[nna[length(nna)]] else if (is.list(x)) list(NA) else x[NA_integer_]
18808+
if (length(nna)) x[nna[length(nna)]] else x[0L]
1880918809
}
1881018810
test_no = 2238.0
1881118811
for (col in setdiff(names(DT), "grp")) {
@@ -18871,11 +18871,12 @@ test(2239.51, DT[, .(last(A,na.rm=TRUE), first(B, na.rm=TRUE), last(C)), by=grp,
1887118871
data.table(grp=1:4, V1=c(1L, 2L, NA, 3L), V2=c(1,pi,3,pi), V3=c(NA,NA,"c",NA)),
1887218872
output="GForce optimized.*glast.*gfirst.*glast")
1887318873
test(2239.61, DT[, last(D,na.rm=TRUE,n=2)], c(3i,4i))
18874-
test(2239.62, DT[, last(B,na.rm=TRUE,n=2), by=grp, verbose=TRUE], data.table(grp=INT(1,1,2,2,3,4,4), V1=c(NA,1,pi,3,3,NA,pi)), output="GForce.*glast")
18874+
test(2239.62, DT[, last(B,na.rm=TRUE,n=2), by=grp, verbose=TRUE], data.table(grp=INT(1,2,2,3,4), V1=c(1,pi,3,3,pi)), output="GForce.*glast")
1887518875
test(2239.63, DT[, first(D,na.rm=TRUE,n=2)], c(1i,2i))
18876-
test(2239.64, DT[, first(B,na.rm=TRUE,n=2), by=grp, verbose=TRUE], data.table(grp=INT(1,1,2,2,3,4,4), V1=c(1,NA,pi,3,3,pi,NA)), output="GForce.*gfirst")
18876+
test(2239.64, DT[, first(B,na.rm=TRUE,n=2), by=grp, verbose=TRUE], data.table(grp=INT(1,2,2,3,4), V1=c(1,pi,3,3,pi)), output="GForce.*gfirst")
18877+
test(2239.65, DT[, last(D,na.rm=TRUE,n=1), by=grp, verbose=TRUE], data.table(grp=INT(1,2,4), V1=c(1i,3i,4i)), output="GForce.*glast")
1887718878
test(2239.71, DT[, last(.SD, na.rm=TRUE, n=2), by=grp, verbose=TRUE],
18878-
ans<-data.table(grp=INT(1,1,2,2,3,4,4), A=INT(NA,1,NA,2,NA,3,3), B=c(NA,1,pi,3,3,NA,pi), C=c(NA,"a",NA,"b","c",NA,"d"), D=c(NA,1i,2i,3i,NA,NA,4i), E=list(NA,NA,NA,c("a","b"),list(1:2),NA,3:4)),
18879+
ans<-data.table(grp=INT(1,2,2,3,4,4), A=INT(1,NA,2,NA,3,3), B=c(1,pi,3,3,NA,pi), C=c("a",NA,"b","c",NA,"d"), D=c(1i,2i,3i,NA,NA,4i), E=list(NA,NA,c("a","b"),list(1:2),NA,3:4)),
1887918880
output="GForce.*glast")
1888018881
old = options(datatable.optimize=0)
1888118882
test(2239.72, DT[, last(.SD, na.rm=TRUE, n=2), by=grp, verbose=TRUE], ans, output="GForce FALSE")
@@ -18884,7 +18885,7 @@ test(2239.73, DT[, last(.SD, na.rm="row", n=2), by=grp, verbose=TRUE],
1888418885
data.table(grp=INT(1,1,2,2,3,4,4), A=c(NA,NA,NA,2L,NA,NA,3L), B=c(NA,NA,NA,3,NA,NA,pi), C=c(NA,NA,NA,"b",NA,NA,"d"), D=c(NA,NA,NA,3i,NA,NA,4i), E=list(NA,NA,NA,c("a","b"),NA,NA,3:4)),
1888518886
output="GForce FALSE")
1888618887
test(2239.74, DT[, first(.SD, na.rm=TRUE, n=2), by=grp, verbose=TRUE],
18887-
ans<-data.table(grp=INT(1,1,2,2,3,4,4), A=INT(1,NA,2,NA,NA,3,3), B=c(1,NA,pi,3,3,pi,NA), C=c("a",NA,"b",NA,"c","d",NA), D=c(1i,NA,2i,3i,NA,4i,NA), E=list(NA,NA,c("a","b"),NA,list(1:2),3:4,NA)),
18888+
ans<-data.table(grp=INT(1,2,2,3,4,4), A=INT(1,2,NA,NA,3,3), B=c(1,pi,3,3,pi,NA), C=c("a","b",NA,"c","d",NA), D=c(1i,2i,3i,NA,4i,NA), E=list(NA,c("a","b"),NA,list(1:2),3:4,NA)),
1888818889
output="GForce.*gfirst")
1888918890
old = options(datatable.optimize=0)
1889018891
test(2239.75, DT[, first(.SD, na.rm=TRUE, n=2), by=grp, verbose=TRUE], ans, output="GForce FALSE")

src/data.table.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ extern SEXP char_POSIXct;
8080
extern SEXP char_POSIXt;
8181
extern SEXP char_UTC;
8282
extern SEXP char_nanotime;
83+
extern SEXP char_lens;
8384
extern SEXP char_indices;
8485
extern SEXP char_allLen1;
8586
extern SEXP char_allGrp1;
@@ -94,8 +95,7 @@ extern SEXP sym_index;
9495
extern SEXP sym_BY;
9596
extern SEXP sym_starts, char_starts;
9697
extern SEXP sym_maxgrpn;
97-
extern SEXP sym_lens, char_lens;
98-
extern SEXP sym_first;
98+
extern SEXP sym_gforce_dynamic;
9999
extern SEXP sym_colClassesAs;
100100
extern SEXP sym_verbose;
101101
extern SEXP SelfRefSymbol;

src/dogroups.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -409,7 +409,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX
409409
warning(_("Item %d of j's result for group %d is zero length. This will be filled with %d NAs to match the longest column in this result. Later groups may have a similar problem but only the first is reported to save filling the warning buffer."), j+1, i+1, maxn);
410410
NullWarnDone = TRUE;
411411
}
412-
writeNA(target, thisansloc, maxn, false);
412+
writeNA(target, thisansloc, maxn, true);
413413
} else {
414414
// thislen>0
415415
if (TYPEOF(source) != TYPEOF(target))

0 commit comments

Comments
 (0)