From b0efcf59442a7d086c6df17fa6a45c81b082322e Mon Sep 17 00:00:00 2001 From: jangorecki Date: Sat, 18 Apr 2020 19:38:06 +0100 Subject: [PATCH 01/53] lazy forder --- R/setkey.R | 9 ++-- inst/tests/tests.Rraw | 110 ++++++++++++++++++++++++++++++++++++++++++ src/data.table.h | 3 +- src/forder.c | 108 ++++++++++++++++++++++++++++++++++++++++- src/init.c | 1 + 5 files changed, 225 insertions(+), 6 deletions(-) diff --git a/R/setkey.R b/R/setkey.R index 334ca1e801..97f67e9b2e 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -172,8 +172,7 @@ is.sorted = function(x, by=seq_along(x)) { } ORDERING_TYPES = c('logical', 'integer', 'double', 'complex', 'character') -forderv = function(x, by=seq_along(x), retGrp=FALSE, sort=TRUE, order=1L, na.last=FALSE) -{ +forderv = function(x, by=seq_along(x), retGrp=FALSE, sort=TRUE, order=1L, na.last=FALSE, lazy=TRUE) { if (is.atomic(x)) { # including forderv(NULL) which returns error consistent with base::order(NULL), if (!missing(by) && !is.null(by)) stop("x is a single vector, non-NULL 'by' doesn't make sense") by = NULL @@ -183,7 +182,11 @@ forderv = function(x, by=seq_along(x), retGrp=FALSE, sort=TRUE, order=1L, na.las if (length(order) == 1L) order = rep(order, length(by)) } order = as.integer(order) # length and contents of order being +1/-1 is checked at C level - .Call(Cforder, x, by, retGrp, sort, order, na.last) # returns integer() if already sorted, regardless of sort=TRUE|FALSE + # returns integer() if already sorted, regardless of sort=TRUE|FALSE + if (lazy) + .Call(Cforder, x, by, retGrp, sort, order, na.last) + else + .Call(CforderDo, x, by, retGrp, sort, order, na.last) } forder = function(..., na.last=TRUE, decreasing=FALSE) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 7cc6819e8f..9e9061870e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16846,3 +16846,113 @@ A = data.table(A=c(complex(real = 1:3, imaginary=c(0, -1, 1)), NaN)) test(2138.3, rbind(A,B), data.table(A=c(as.character(A$A), B$A))) A = data.table(A=as.complex(rep(NA, 5))) test(2138.4, rbind(A,B), data.table(A=c(as.character(A$A), B$A))) + +# lazy forder +dd = data.table(a=1:2, b=2:1) +d = copy(dd) +op = options(datatable.verbose=TRUE) +test(2139.01, forderv(d, "b"), 2:1, output="forder.*opt=-1.*took") +test(2139.02, forderv(d, "b", lazy=FALSE), 2:1, notOutput="forder.*opt.*took") +options(datatable.verbose=FALSE) +setkeyv(d, "b") +options(datatable.verbose=TRUE) +test(2139.03, forderv(d, "b"), integer(), output="forder.*opt=1.*took") +test(2139.04, forderv(d, "b", lazy=FALSE), integer(), notOutput="forder.*opt.*took") +options(datatable.verbose=FALSE) +d = copy(dd) +setindexv(d, "b") +options(datatable.verbose=TRUE) +test(2139.05, forderv(d, "b"), 2:1, output="forder.*opt=2.*took") +test(2139.06, forderv(d, "b", lazy=FALSE), 2:1, notOutput="forder.*opt.*took") +options(datatable.verbose=FALSE) +d = copy(dd) +options(datatable.verbose=TRUE) +test(2139.11, forderv(d, c("a","b")), integer(), output="forder.*opt=-1.*took") +test(2139.12, forderv(d, c("a","b"), lazy=FALSE), integer(), notOutput="forder.*opt.*took") +test(2139.13, forderv(d, c("b","a")), 2:1, output="forder.*opt=-1.*took") +test(2139.14, forderv(d, c("b","a"), lazy=FALSE), 2:1, notOutput="forder.*opt.*took") +options(datatable.verbose=FALSE) +setkeyv(d, c("a","b")) +options(datatable.verbose=TRUE) +test(2139.21, forderv(d, c("a","b")), integer(), output="forder.*opt=1.*took") +test(2139.22, forderv(d, c("a","b"), lazy=FALSE), integer(), notOutput="forder.*opt.*took") +test(2139.23, forderv(d, c("b","a")), 2:1, output="forder.*opt=-1.*took") +test(2139.24, forderv(d, c("b","a"), lazy=FALSE), 2:1, notOutput="forder.*opt.*took") +options(datatable.verbose=FALSE) +setkeyv(d, c("b","a")) +options(datatable.verbose=TRUE) +test(2139.25, forderv(d, c("a","b")), 2:1, output="forder.*opt=-1.*took") +test(2139.26, forderv(d, c("a","b"), lazy=FALSE), 2:1, notOutput="forder.*opt.*took") +test(2139.27, forderv(d, c("b","a")), integer(), output="forder.*opt=1.*took") +test(2139.28, forderv(d, c("b","a"), lazy=FALSE), integer(), notOutput="forder.*opt.*took") +options(datatable.verbose=FALSE) +d = copy(dd) +setindexv(d, c("a","b")) +options(datatable.verbose=TRUE) +test(2139.31, forderv(d, c("a","b")), integer(), output="forder.*opt=2.*took") +test(2139.32, forderv(d, c("a","b"), lazy=FALSE), integer(), notOutput="forder.*opt.*took") +test(2139.33, forderv(d, c("b","a")), 2:1, output="forder.*opt=-1.*took") +test(2139.34, forderv(d, c("b","a"), lazy=FALSE), 2:1, notOutput="forder.*opt.*took") +options(datatable.verbose=FALSE) +setindexv(d, NULL) +setindexv(d, c("b","a")) +options(datatable.verbose=TRUE) +test(2139.35, forderv(d, c("a","b")), integer(), output="forder.*opt=-1.*took") +test(2139.36, forderv(d, c("a","b"), lazy=FALSE), integer(), notOutput="forder.*opt.*took") +test(2139.37, forderv(d, c("b","a")), 2:1, output="forder.*opt=2.*took") +test(2139.38, forderv(d, c("b","a"), lazy=FALSE), 2:1, notOutput="forder.*opt.*took") +options(datatable.verbose=FALSE) +setindexv(d, NULL) +setindexv(d, list(c("a","b"), c("b","a"))) +options(datatable.verbose=TRUE) +test(2139.41, forderv(d, c("a","b")), integer(), output="forder.*opt=2.*took") +test(2139.42, forderv(d, c("a","b"), lazy=FALSE), integer(), notOutput="forder.*opt.*took") +test(2139.43, forderv(d, c("b","a")), 2:1, output="forder.*opt=2.*took") +test(2139.44, forderv(d, c("b","a"), lazy=FALSE), 2:1, notOutput="forder.*opt.*took") +options(datatable.verbose=FALSE) +d = copy(dd) +setkeyv(d, c("a","b")) +setindexv(d, list(c("a","b"), c("b","a"))) +options(datatable.verbose=TRUE) +test(2139.51, forderv(d, c("a","b")), integer(), output="forder.*opt=1.*took", notOutput="forder.*opt=2.*took") # idxOpt is not reached +test(2139.52, forderv(d, c("a","b"), lazy=FALSE), integer(), notOutput="forder.*opt.*took") +test(2139.53, forderv(d, c("b","a")), 2:1, output="forder.*opt=2.*took") +test(2139.54, forderv(d, c("b","a"), lazy=FALSE), 2:1, notOutput="forder.*opt.*took") +options(datatable.verbose=FALSE) +setkeyv(d, NULL) +setindexv(d, list(c("a","b"), c("b","a"))) +options(datatable.verbose=TRUE) +test(2139.55, forderv(d, c("a","b")), integer(), output="forder.*opt=2.*took", notOutput="forder.*opt=1.*took") +test(2139.56, forderv(d, c("a","b"), lazy=FALSE), integer(), notOutput="forder.*opt.*took") +options(datatable.verbose=FALSE) +d = copy(dd) +setkeyv(d, c("a","b")) +setindexv(d, list(c("a","a"), c("b","a"))) +options(datatable.verbose=TRUE) +ab = structure(integer(), starts=1:2, maxgrpn=1L) +ba = structure(2:1, starts=1:2, maxgrpn=1L) +test(2139.61, forderv(d, c("a","b"), retGrp=TRUE), ab, output="forder.*opt=0.*took") +test(2139.62, forderv(d, c("a","b"), retGrp=TRUE, lazy=FALSE), ab, notOutput="forder.*opt.*took") +test(2139.63, forderv(d, c("b","a"), retGrp=TRUE), ba, output="forder.*opt=0.*took") +test(2139.64, forderv(d, c("b","a"), retGrp=TRUE, lazy=FALSE), ba, notOutput="forder.*opt.*took") +test(2139.65, forderv(d, c("a","b"), na.last=TRUE), integer(), output="forder.*opt=0.*took") +test(2139.66, forderv(d, c("a","b"), na.last=TRUE, lazy=FALSE), integer(), notOutput="forder.*opt.*took") +test(2139.67, forderv(d, c("b","a"), na.last=TRUE), 2:1, output="forder.*opt=0.*took") +test(2139.68, forderv(d, c("b","a"), na.last=TRUE, lazy=FALSE), 2:1, notOutput="forder.*opt.*took") +test(2139.69, forderv(d, c("a","b"), sort=FALSE, retGrp=TRUE), ab, output="forder.*opt=0.*took") +test(2139.70, forderv(d, c("a","b"), sort=FALSE, retGrp=TRUE, lazy=FALSE), ab, notOutput="forder.*opt.*took") +test(2139.71, forderv(d, c("b","a"), sort=FALSE, retGrp=TRUE), ab, output="forder.*opt=0.*took") +test(2139.72, forderv(d, c("b","a"), sort=FALSE, retGrp=TRUE, lazy=FALSE), ab, notOutput="forder.*opt.*took") +test(2139.73, forderv(d, c("a","b"), order=-1L), 2:1, output="forder.*opt=0.*took") +test(2139.74, forderv(d, c("a","b"), order=-1L, lazy=FALSE), 2:1, notOutput="forder.*opt.*took") +test(2139.75, forderv(d, c("b","a"), order=-1L), integer(), output="forder.*opt=0.*took") +test(2139.76, forderv(d, c("b","a"), order=-1L, lazy=FALSE), integer(), notOutput="forder.*opt.*took") +test(2139.77, forderv(d, c("a","b"), order=c(1L,-1L)), integer(), output="forder.*opt=0.*took") +test(2139.78, forderv(d, c("a","b"), order=c(1L,-1L), lazy=FALSE), integer(), notOutput="forder.*opt.*took") +test(2139.79, forderv(d, c("b","a"), order=c(1L,-1L)), 2:1, output="forder.*opt=0.*took") +test(2139.80, forderv(d, c("b","a"), order=c(1L,-1L), lazy=FALSE), 2:1, notOutput="forder.*opt.*took") +test(2139.81, forderv(1:2), integer(), output="forder.*opt=0.*took") +test(2139.82, forderv(1:2, lazy=FALSE), integer(), notOutput="forder.*opt.*took") +test(2139.83, forderv(2:1), 2:1, output="forder.*opt=0.*took") +test(2139.84, forderv(2:1, lazy=FALSE), 2:1, notOutput="forder.*opt.*took") +options(op) diff --git a/src/data.table.h b/src/data.table.h index 90ff7fb6fc..439130da27 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -123,7 +123,8 @@ int checkOverAlloc(SEXP x); // forder.c int StrCmp(SEXP x, SEXP y); uint64_t dtwiddle(void *p, int i); -SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP naArg); +SEXP forderDo(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP naArg); +SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP naArg); // lazy wrapper to forderDo int getNumericRounding_C(); // reorder.c diff --git a/src/forder.c b/src/forder.c index ea0be76d04..aca0e2f051 100644 --- a/src/forder.c +++ b/src/forder.c @@ -414,10 +414,9 @@ uint64_t dtwiddle(void *p, int i) void radix_r(const int from, const int to, const int radix); -SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, SEXP naArg) +SEXP forderDo(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, SEXP naArg) { // sortGroups TRUE from setkey and regular forder, FALSE from by= for efficiency so strings don't have to be sorted and can be left in appearance order // when sortGroups is TRUE, ascArg contains +1/-1 for ascending/descending of each by column; when FALSE ascArg is ignored -{ #ifdef TIMING_ON memset(tblock, 0, MAX_NTH*NBLOCK*sizeof(double)); @@ -801,6 +800,111 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, S return ans; } +// all(x==1L) +static bool all1(SEXP x) { + if (!isInteger(x)) + error("internal error: all1 got non-integer"); // # nocov + int *xp = INTEGER(x); + for (int i=0; i Date: Sat, 18 Apr 2020 21:05:06 +0100 Subject: [PATCH 02/53] fix tests --- R/bmerge.R | 5 +--- R/setkey.R | 8 ++---- inst/tests/tests.Rraw | 2 +- src/bmerge.c | 22 ++++++++-------- src/data.table.h | 4 +-- src/forder.c | 59 +++++++++++++++++++++++++++++++++---------- src/init.c | 1 - 7 files changed, 61 insertions(+), 40 deletions(-) diff --git a/R/bmerge.R b/R/bmerge.R index 3d6ab028f3..680fb94f3c 100644 --- a/R/bmerge.R +++ b/R/bmerge.R @@ -116,9 +116,6 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos } } - ## after all modifications of i, check if i has a proper key on all icols - io = identical(icols, head(chmatch(key(i), names(i)), length(icols))) - ## after all modifications of x, check if x has a proper key on all xcols. ## If not, calculate the order. Also for non-equi joins, the order must be calculated. non_equi = which.first(ops != 1L) # 1 is "==" operator @@ -178,7 +175,7 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos } if (verbose) {last.started.at=proc.time();cat("Starting bmerge ...\n");flush.console()} - ans = .Call(Cbmerge, i, x, as.integer(icols), as.integer(xcols), io, xo, roll, rollends, nomatch, mult, ops, nqgrp, nqmaxgrp) + ans = .Call(Cbmerge, i, x, as.integer(icols), as.integer(xcols), xo, roll, rollends, nomatch, mult, ops, nqgrp, nqmaxgrp) if (verbose) {cat("bmerge done in",timetaken(last.started.at),"\n"); flush.console()} # TO DO: xo could be moved inside Cbmerge diff --git a/R/setkey.R b/R/setkey.R index 97f67e9b2e..799f265d4a 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -172,7 +172,7 @@ is.sorted = function(x, by=seq_along(x)) { } ORDERING_TYPES = c('logical', 'integer', 'double', 'complex', 'character') -forderv = function(x, by=seq_along(x), retGrp=FALSE, sort=TRUE, order=1L, na.last=FALSE, lazy=TRUE) { +forderv = function(x, by=seq_along(x), retGrp=FALSE, sort=TRUE, order=1L, na.last=FALSE, lazy=NA) { if (is.atomic(x)) { # including forderv(NULL) which returns error consistent with base::order(NULL), if (!missing(by) && !is.null(by)) stop("x is a single vector, non-NULL 'by' doesn't make sense") by = NULL @@ -182,11 +182,7 @@ forderv = function(x, by=seq_along(x), retGrp=FALSE, sort=TRUE, order=1L, na.las if (length(order) == 1L) order = rep(order, length(by)) } order = as.integer(order) # length and contents of order being +1/-1 is checked at C level - # returns integer() if already sorted, regardless of sort=TRUE|FALSE - if (lazy) - .Call(Cforder, x, by, retGrp, sort, order, na.last) - else - .Call(CforderDo, x, by, retGrp, sort, order, na.last) + .Call(Cforder, x, by, retGrp, sort, order, na.last, lazy) # returns integer() if already sorted, regardless of sort=TRUE|FALSE } forder = function(..., na.last=TRUE, decreasing=FALSE) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 9e9061870e..7159618704 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -13364,7 +13364,7 @@ test(1962.0461, forderv(DT, order = c(1L, -1L)), error="Either order= is n test(1962.0462, forderv(DT, order = 2L), error='Item 1 of order (ascending/descending) is 2. Must be +1 or -1') test(1962.0471, forderv(mean), error="'x' argument must be data.table compatible") test(1962.0472, forderv(DT, by=mean), error="argument specifying columns must be character or numeric") -test(1962.0473, forderv(NULL), error="DT is an empty list() of 0 columns") +test(1962.0473, forderv(NULL), error="DT is NULL") setDF(DT) test(1962.0481, forder(DT), 3:1) diff --git a/src/bmerge.c b/src/bmerge.c index 15d7d6f4f7..e550e4adf6 100644 --- a/src/bmerge.c +++ b/src/bmerge.c @@ -37,7 +37,7 @@ static Rboolean rollToNearest=FALSE; void bmerge_r(int xlowIn, int xuppIn, int ilowIn, int iuppIn, int col, int thisgrp, int lowmax, int uppmax); -SEXP bmerge(SEXP iArg, SEXP xArg, SEXP icolsArg, SEXP xcolsArg, SEXP isorted, SEXP xoArg, SEXP rollarg, SEXP rollendsArg, SEXP nomatchArg, SEXP multArg, SEXP opArg, SEXP nqgrpArg, SEXP nqmaxgrpArg) { +SEXP bmerge(SEXP iArg, SEXP xArg, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP rollarg, SEXP rollendsArg, SEXP nomatchArg, SEXP multArg, SEXP opArg, SEXP nqgrpArg, SEXP nqmaxgrpArg) { int xN, iN, protecti=0; ctr=0; // needed for non-equi join case SEXP retFirstArg, retLengthArg, retIndexArg, allLen1Arg, allGrp1Arg; @@ -138,17 +138,15 @@ SEXP bmerge(SEXP iArg, SEXP xArg, SEXP icolsArg, SEXP xcolsArg, SEXP isorted, SE allGrp1[0] = TRUE; protecti += 2; - // isorted arg - o = NULL; - if (!LOGICAL(isorted)[0]) { - SEXP order = PROTECT(allocVector(INTSXP, length(icolsArg))); - protecti++; - for (int j=0; j Date: Sat, 18 Apr 2020 21:38:05 +0100 Subject: [PATCH 03/53] fix tests --- inst/tests/tests.Rraw | 60 +++++++++++++++++++++---------------------- src/forder.c | 3 +++ 2 files changed, 33 insertions(+), 30 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 7159618704..9bbbe19849 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16852,78 +16852,78 @@ dd = data.table(a=1:2, b=2:1) d = copy(dd) op = options(datatable.verbose=TRUE) test(2139.01, forderv(d, "b"), 2:1, output="forder.*opt=-1.*took") -test(2139.02, forderv(d, "b", lazy=FALSE), 2:1, notOutput="forder.*opt.*took") +test(2139.02, forderv(d, "b", lazy=FALSE), 2:1, output="forder.*opt=0.*took") options(datatable.verbose=FALSE) setkeyv(d, "b") options(datatable.verbose=TRUE) test(2139.03, forderv(d, "b"), integer(), output="forder.*opt=1.*took") -test(2139.04, forderv(d, "b", lazy=FALSE), integer(), notOutput="forder.*opt.*took") +test(2139.04, forderv(d, "b", lazy=FALSE), integer(), output="forder.*opt=0.*took") options(datatable.verbose=FALSE) d = copy(dd) setindexv(d, "b") options(datatable.verbose=TRUE) test(2139.05, forderv(d, "b"), 2:1, output="forder.*opt=2.*took") -test(2139.06, forderv(d, "b", lazy=FALSE), 2:1, notOutput="forder.*opt.*took") +test(2139.06, forderv(d, "b", lazy=FALSE), 2:1, output="forder.*opt=0.*took") options(datatable.verbose=FALSE) d = copy(dd) options(datatable.verbose=TRUE) test(2139.11, forderv(d, c("a","b")), integer(), output="forder.*opt=-1.*took") -test(2139.12, forderv(d, c("a","b"), lazy=FALSE), integer(), notOutput="forder.*opt.*took") +test(2139.12, forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") test(2139.13, forderv(d, c("b","a")), 2:1, output="forder.*opt=-1.*took") -test(2139.14, forderv(d, c("b","a"), lazy=FALSE), 2:1, notOutput="forder.*opt.*took") +test(2139.14, forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") options(datatable.verbose=FALSE) setkeyv(d, c("a","b")) options(datatable.verbose=TRUE) test(2139.21, forderv(d, c("a","b")), integer(), output="forder.*opt=1.*took") -test(2139.22, forderv(d, c("a","b"), lazy=FALSE), integer(), notOutput="forder.*opt.*took") +test(2139.22, forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") test(2139.23, forderv(d, c("b","a")), 2:1, output="forder.*opt=-1.*took") -test(2139.24, forderv(d, c("b","a"), lazy=FALSE), 2:1, notOutput="forder.*opt.*took") +test(2139.24, forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") options(datatable.verbose=FALSE) setkeyv(d, c("b","a")) options(datatable.verbose=TRUE) test(2139.25, forderv(d, c("a","b")), 2:1, output="forder.*opt=-1.*took") -test(2139.26, forderv(d, c("a","b"), lazy=FALSE), 2:1, notOutput="forder.*opt.*took") +test(2139.26, forderv(d, c("a","b"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") test(2139.27, forderv(d, c("b","a")), integer(), output="forder.*opt=1.*took") -test(2139.28, forderv(d, c("b","a"), lazy=FALSE), integer(), notOutput="forder.*opt.*took") +test(2139.28, forderv(d, c("b","a"), lazy=FALSE), integer(), output="forder.*opt=0.*took") options(datatable.verbose=FALSE) d = copy(dd) setindexv(d, c("a","b")) options(datatable.verbose=TRUE) test(2139.31, forderv(d, c("a","b")), integer(), output="forder.*opt=2.*took") -test(2139.32, forderv(d, c("a","b"), lazy=FALSE), integer(), notOutput="forder.*opt.*took") +test(2139.32, forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") test(2139.33, forderv(d, c("b","a")), 2:1, output="forder.*opt=-1.*took") -test(2139.34, forderv(d, c("b","a"), lazy=FALSE), 2:1, notOutput="forder.*opt.*took") +test(2139.34, forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") options(datatable.verbose=FALSE) setindexv(d, NULL) setindexv(d, c("b","a")) options(datatable.verbose=TRUE) test(2139.35, forderv(d, c("a","b")), integer(), output="forder.*opt=-1.*took") -test(2139.36, forderv(d, c("a","b"), lazy=FALSE), integer(), notOutput="forder.*opt.*took") +test(2139.36, forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") test(2139.37, forderv(d, c("b","a")), 2:1, output="forder.*opt=2.*took") -test(2139.38, forderv(d, c("b","a"), lazy=FALSE), 2:1, notOutput="forder.*opt.*took") +test(2139.38, forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") options(datatable.verbose=FALSE) setindexv(d, NULL) setindexv(d, list(c("a","b"), c("b","a"))) options(datatable.verbose=TRUE) test(2139.41, forderv(d, c("a","b")), integer(), output="forder.*opt=2.*took") -test(2139.42, forderv(d, c("a","b"), lazy=FALSE), integer(), notOutput="forder.*opt.*took") +test(2139.42, forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") test(2139.43, forderv(d, c("b","a")), 2:1, output="forder.*opt=2.*took") -test(2139.44, forderv(d, c("b","a"), lazy=FALSE), 2:1, notOutput="forder.*opt.*took") +test(2139.44, forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") options(datatable.verbose=FALSE) d = copy(dd) setkeyv(d, c("a","b")) setindexv(d, list(c("a","b"), c("b","a"))) options(datatable.verbose=TRUE) test(2139.51, forderv(d, c("a","b")), integer(), output="forder.*opt=1.*took", notOutput="forder.*opt=2.*took") # idxOpt is not reached -test(2139.52, forderv(d, c("a","b"), lazy=FALSE), integer(), notOutput="forder.*opt.*took") +test(2139.52, forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") test(2139.53, forderv(d, c("b","a")), 2:1, output="forder.*opt=2.*took") -test(2139.54, forderv(d, c("b","a"), lazy=FALSE), 2:1, notOutput="forder.*opt.*took") +test(2139.54, forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") options(datatable.verbose=FALSE) setkeyv(d, NULL) setindexv(d, list(c("a","b"), c("b","a"))) options(datatable.verbose=TRUE) test(2139.55, forderv(d, c("a","b")), integer(), output="forder.*opt=2.*took", notOutput="forder.*opt=1.*took") -test(2139.56, forderv(d, c("a","b"), lazy=FALSE), integer(), notOutput="forder.*opt.*took") +test(2139.56, forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") options(datatable.verbose=FALSE) d = copy(dd) setkeyv(d, c("a","b")) @@ -16932,27 +16932,27 @@ options(datatable.verbose=TRUE) ab = structure(integer(), starts=1:2, maxgrpn=1L) ba = structure(2:1, starts=1:2, maxgrpn=1L) test(2139.61, forderv(d, c("a","b"), retGrp=TRUE), ab, output="forder.*opt=0.*took") -test(2139.62, forderv(d, c("a","b"), retGrp=TRUE, lazy=FALSE), ab, notOutput="forder.*opt.*took") +test(2139.62, forderv(d, c("a","b"), retGrp=TRUE, lazy=FALSE), ab, output="forder.*opt=0.*took") test(2139.63, forderv(d, c("b","a"), retGrp=TRUE), ba, output="forder.*opt=0.*took") -test(2139.64, forderv(d, c("b","a"), retGrp=TRUE, lazy=FALSE), ba, notOutput="forder.*opt.*took") +test(2139.64, forderv(d, c("b","a"), retGrp=TRUE, lazy=FALSE), ba, output="forder.*opt=0.*took") test(2139.65, forderv(d, c("a","b"), na.last=TRUE), integer(), output="forder.*opt=0.*took") -test(2139.66, forderv(d, c("a","b"), na.last=TRUE, lazy=FALSE), integer(), notOutput="forder.*opt.*took") +test(2139.66, forderv(d, c("a","b"), na.last=TRUE, lazy=FALSE), integer(), output="forder.*opt=0.*took") test(2139.67, forderv(d, c("b","a"), na.last=TRUE), 2:1, output="forder.*opt=0.*took") -test(2139.68, forderv(d, c("b","a"), na.last=TRUE, lazy=FALSE), 2:1, notOutput="forder.*opt.*took") +test(2139.68, forderv(d, c("b","a"), na.last=TRUE, lazy=FALSE), 2:1, output="forder.*opt=0.*took") test(2139.69, forderv(d, c("a","b"), sort=FALSE, retGrp=TRUE), ab, output="forder.*opt=0.*took") -test(2139.70, forderv(d, c("a","b"), sort=FALSE, retGrp=TRUE, lazy=FALSE), ab, notOutput="forder.*opt.*took") +test(2139.70, forderv(d, c("a","b"), sort=FALSE, retGrp=TRUE, lazy=FALSE), ab, output="forder.*opt=0.*took") test(2139.71, forderv(d, c("b","a"), sort=FALSE, retGrp=TRUE), ab, output="forder.*opt=0.*took") -test(2139.72, forderv(d, c("b","a"), sort=FALSE, retGrp=TRUE, lazy=FALSE), ab, notOutput="forder.*opt.*took") +test(2139.72, forderv(d, c("b","a"), sort=FALSE, retGrp=TRUE, lazy=FALSE), ab, output="forder.*opt=0.*took") test(2139.73, forderv(d, c("a","b"), order=-1L), 2:1, output="forder.*opt=0.*took") -test(2139.74, forderv(d, c("a","b"), order=-1L, lazy=FALSE), 2:1, notOutput="forder.*opt.*took") +test(2139.74, forderv(d, c("a","b"), order=-1L, lazy=FALSE), 2:1, output="forder.*opt=0.*took") test(2139.75, forderv(d, c("b","a"), order=-1L), integer(), output="forder.*opt=0.*took") -test(2139.76, forderv(d, c("b","a"), order=-1L, lazy=FALSE), integer(), notOutput="forder.*opt.*took") +test(2139.76, forderv(d, c("b","a"), order=-1L, lazy=FALSE), integer(), output="forder.*opt=0.*took") test(2139.77, forderv(d, c("a","b"), order=c(1L,-1L)), integer(), output="forder.*opt=0.*took") -test(2139.78, forderv(d, c("a","b"), order=c(1L,-1L), lazy=FALSE), integer(), notOutput="forder.*opt.*took") +test(2139.78, forderv(d, c("a","b"), order=c(1L,-1L), lazy=FALSE), integer(), output="forder.*opt=0.*took") test(2139.79, forderv(d, c("b","a"), order=c(1L,-1L)), 2:1, output="forder.*opt=0.*took") -test(2139.80, forderv(d, c("b","a"), order=c(1L,-1L), lazy=FALSE), 2:1, notOutput="forder.*opt.*took") +test(2139.80, forderv(d, c("b","a"), order=c(1L,-1L), lazy=FALSE), 2:1, output="forder.*opt=0.*took") test(2139.81, forderv(1:2), integer(), output="forder.*opt=0.*took") -test(2139.82, forderv(1:2, lazy=FALSE), integer(), notOutput="forder.*opt.*took") +test(2139.82, forderv(1:2, lazy=FALSE), integer(), output="forder.*opt=0.*took") test(2139.83, forderv(2:1), 2:1, output="forder.*opt=0.*took") -test(2139.84, forderv(2:1, lazy=FALSE), 2:1, notOutput="forder.*opt.*took") +test(2139.84, forderv(2:1, lazy=FALSE), 2:1, output="forder.*opt=0.*took") options(op) diff --git a/src/forder.c b/src/forder.c index 644e43c770..40a8b909d3 100644 --- a/src/forder.c +++ b/src/forder.c @@ -883,6 +883,9 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, S if (!isLogical(lazyArg) || LENGTH(lazyArg) != 1) error("lazy must be logical TRUE, FALSE or NA of length 1"); + if (!length(DT)) + return allocVector(INTSXP, 0); + int opt = -1; // -1=unknown, 0=none, 1=keyOpt, 2=idxOpt if (LOGICAL(lazyArg)[0]==NA_LOGICAL) { if (isNewList(DT) && From 9df2668c80f93db3887ce218c6f442ba737bd20d Mon Sep 17 00:00:00 2001 From: jangorecki Date: Sat, 18 Apr 2020 21:49:06 +0100 Subject: [PATCH 04/53] respect use.index option --- inst/tests/tests.Rraw | 8 ++++++++ src/forder.c | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 9bbbe19849..6ff54a99d1 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16955,4 +16955,12 @@ test(2139.81, forderv(1:2), integer(), output="forder.*opt=0.*took") test(2139.82, forderv(1:2, lazy=FALSE), integer(), output="forder.*opt=0.*took") test(2139.83, forderv(2:1), 2:1, output="forder.*opt=0.*took") test(2139.84, forderv(2:1, lazy=FALSE), 2:1, output="forder.*opt=0.*took") +options(datatable.verbose=FALSE) +d = copy(dd) +setindexv(d, "b") +options(datatable.verbose=TRUE) +op2 = options(datatable.use.index=FALSE) +test(2139.91, forderv(d, "b"), 2:1, output="forder.*opt=-1.*took") +test(2139.92, forderv(d, "b", lazy=FALSE), 2:1, output="forder.*opt=0.*took") +options(op2) options(op) diff --git a/src/forder.c b/src/forder.c index 40a8b909d3..5433b15ec1 100644 --- a/src/forder.c +++ b/src/forder.c @@ -921,7 +921,7 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, S } } - if (opt == -1) { + if (opt == -1 && LOGICAL(GetOption(install("datatable.use.index"), R_NilValue))[0]==TRUE) { SEXP idx = getIndex(DT, by); if (!isNull(idx)) { opt = 2; // idxOpt From 62982ed7b5f721a64a6c7a2f517a0380b6024aa7 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Sat, 18 Apr 2020 21:55:12 +0100 Subject: [PATCH 05/53] bmerge timings --- src/bmerge.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/bmerge.c b/src/bmerge.c index e550e4adf6..543a0207b0 100644 --- a/src/bmerge.c +++ b/src/bmerge.c @@ -38,6 +38,10 @@ static Rboolean rollToNearest=FALSE; void bmerge_r(int xlowIn, int xuppIn, int ilowIn, int iuppIn, int col, int thisgrp, int lowmax, int uppmax); SEXP bmerge(SEXP iArg, SEXP xArg, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP rollarg, SEXP rollendsArg, SEXP nomatchArg, SEXP multArg, SEXP opArg, SEXP nqgrpArg, SEXP nqmaxgrpArg) { + const bool verbose = GetVerbose(); + double tic=0.0, tic0=0.0; + if (verbose) + tic = omp_get_wtime(); int xN, iN, protecti=0; ctr=0; // needed for non-equi join case SEXP retFirstArg, retLengthArg, retIndexArg, allLen1Arg, allGrp1Arg; @@ -157,10 +161,14 @@ SEXP bmerge(SEXP iArg, SEXP xArg, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP // start bmerge if (iN) { + if (verbose) + tic0 = omp_get_wtime(); // embarassingly parallel if we've storage space for nqmaxgrp*iN for (int kk=0; kk 1 && mult == ALL) { @@ -191,6 +199,8 @@ SEXP bmerge(SEXP iArg, SEXP xArg, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP Free(retLength); Free(retIndex); } + if (verbose) + Rprintf("bmerge: took %.3fs\n", omp_get_wtime()-tic); UNPROTECT(protecti); return (ans); } From eec497158bdb5c44e76b7fbf1da2d82bde5a856e Mon Sep 17 00:00:00 2001 From: jangorecki Date: Sat, 18 Apr 2020 22:28:13 +0100 Subject: [PATCH 06/53] codecov --- inst/tests/tests.Rraw | 2 ++ src/forder.c | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 6ff54a99d1..ba5e344aa7 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16964,3 +16964,5 @@ test(2139.91, forderv(d, "b"), 2:1, output="forder.*opt=-1.*took") test(2139.92, forderv(d, "b", lazy=FALSE), 2:1, output="forder.*opt=0.*took") options(op2) options(op) +test(2139.99, forderv(data.table(a=1), lazy=c(TRUE, TRUE)), error="lazy must be") + diff --git a/src/forder.c b/src/forder.c index 5433b15ec1..63762f86b4 100644 --- a/src/forder.c +++ b/src/forder.c @@ -871,7 +871,7 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, S if (verbose) tic = omp_get_wtime(); if (isNull(DT)) - error("DT is NULL"); // # nocov + error("DT is NULL"); if (!IS_TRUE_OR_FALSE(retGrpArg)) error("retGrp must be TRUE or FALSE"); if (!IS_TRUE_OR_FALSE(sortGroupsArg)) @@ -879,7 +879,7 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, S if (!isLogical(naArg) || LENGTH(naArg) != 1) error("na.last must be logical TRUE, FALSE or NA of length 1"); if (!isInteger(ascArg)) - error("order must be integer"); + error("order must be integer"); // # nocov # coerced to int in R if (!isLogical(lazyArg) || LENGTH(lazyArg) != 1) error("lazy must be logical TRUE, FALSE or NA of length 1"); From 9affbfabf2958b07b73ad9708b53cf2191e7432b Mon Sep 17 00:00:00 2001 From: jangorecki Date: Sun, 19 Apr 2020 21:51:15 +0100 Subject: [PATCH 07/53] helper function --- src/forder.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/forder.c b/src/forder.c index 63762f86b4..9e86faacfb 100644 --- a/src/forder.c +++ b/src/forder.c @@ -863,6 +863,11 @@ SEXP getIndex(SEXP x, SEXP cols) { return idx; } +// isTRUE(getOption("datatable.use.index")) +bool GetUseIndex() { + return LOGICAL(GetOption(install("datatable.use.index"), R_NilValue))[0]==TRUE; +} + // lazy forder, re-use existing key or index if possible, otherwise call forderDo SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, SEXP naArg, SEXP lazyArg) { const bool verbose = GetVerbose(); @@ -921,7 +926,7 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, S } } - if (opt == -1 && LOGICAL(GetOption(install("datatable.use.index"), R_NilValue))[0]==TRUE) { + if (opt == -1 && GetUseIndex()) { SEXP idx = getIndex(DT, by); if (!isNull(idx)) { opt = 2; // idxOpt From b36136afc0245ca531f08870c1333363f78e710e Mon Sep 17 00:00:00 2001 From: jangorecki Date: Mon, 20 Apr 2020 10:37:24 +0100 Subject: [PATCH 08/53] reduce diff to master --- R/setkey.R | 2 +- src/bmerge.c | 2 +- src/data.table.h | 4 ++-- src/forder.c | 9 +++++---- src/init.c | 2 +- 5 files changed, 10 insertions(+), 9 deletions(-) diff --git a/R/setkey.R b/R/setkey.R index 799f265d4a..61120235a0 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -182,7 +182,7 @@ forderv = function(x, by=seq_along(x), retGrp=FALSE, sort=TRUE, order=1L, na.las if (length(order) == 1L) order = rep(order, length(by)) } order = as.integer(order) # length and contents of order being +1/-1 is checked at C level - .Call(Cforder, x, by, retGrp, sort, order, na.last, lazy) # returns integer() if already sorted, regardless of sort=TRUE|FALSE + .Call(CforderLazy, x, by, retGrp, sort, order, na.last, lazy) # returns integer() if already sorted, regardless of sort=TRUE|FALSE } forder = function(..., na.last=TRUE, decreasing=FALSE) diff --git a/src/bmerge.c b/src/bmerge.c index 543a0207b0..567ca496da 100644 --- a/src/bmerge.c +++ b/src/bmerge.c @@ -146,7 +146,7 @@ SEXP bmerge(SEXP iArg, SEXP xArg, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP protecti++; for (int j=0; j Date: Mon, 20 Apr 2020 10:59:37 +0100 Subject: [PATCH 09/53] rename fix --- src/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/init.c b/src/init.c index e477676ec2..40ee82bf82 100644 --- a/src/init.c +++ b/src/init.c @@ -151,7 +151,7 @@ R_CallMethodDef callMethods[] = { {"Cfcast", (DL_FUNC) &fcast, -1}, {"Cuniqlist", (DL_FUNC) &uniqlist, -1}, {"Cuniqlengths", (DL_FUNC) &uniqlengths, -1}, -{"CforderLazy", (DL_FUNC) &forder, -1}, +{"CforderLazy", (DL_FUNC) &forderLazy, -1}, {"Cfsorted", (DL_FUNC) &fsorted, -1}, {"Cgforce", (DL_FUNC) &gforce, -1}, {"Cgsum", (DL_FUNC) &gsum, -1}, From dfe883ed4daf2cca541fad1a2e614117e31cc1a6 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Mon, 20 Apr 2020 15:59:43 +0100 Subject: [PATCH 10/53] setindex writes groups (retGrp=TRUE) forder C set index directly smart opt for index retGrp=T/F no tests updated yet --- R/setkey.R | 44 +++++++---------------- src/forder.c | 98 ++++++++++++++++++++++++++++++++++++---------------- 2 files changed, 81 insertions(+), 61 deletions(-) diff --git a/R/setkey.R b/R/setkey.R index 61120235a0..d2507a542f 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -56,23 +56,9 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU miss = !(cols %chin% colnames(x)) if (any(miss)) stop("some columns are not in the data.table: ", paste(cols[miss], collapse=",")) - ## determine, whether key is already present: - if (identical(key(x),cols)) { - if (!physical) { - ## create index as integer() because already sorted by those columns - if (is.null(attr(x, "index", exact=TRUE))) setattr(x, "index", integer()) - setattr(attr(x, "index", exact=TRUE), paste0("__", cols, collapse=""), integer()) - } - return(invisible(x)) - } else if(identical(head(key(x), length(cols)), cols)){ - if (!physical) { - ## create index as integer() because already sorted by those columns - if (is.null(attr(x, "index", exact=TRUE))) setattr(x, "index", integer()) - setattr(attr(x, "index", exact=TRUE), paste0("__", cols, collapse=""), integer()) - } else { - ## key is present but x has a longer key. No sorting needed, only attribute is changed to shorter key. - setattr(x,"sorted",cols) - } + if (physical && identical(head(key(x), length(cols)), cols)){ ## for !physical we need to compute groups as well #4387 + ## key is present but x has a longer key. No sorting needed, only attribute is changed to shorter key. + setattr(x,"sorted",cols) return(invisible(x)) } @@ -84,25 +70,19 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU if (!is.character(cols) || length(cols)<1L) stop("Internal error. 'cols' should be character at this point in setkey; please report.") # nocov newkey = paste0(cols, collapse="__") - if (!any(indices(x) == newkey)) { - if (verbose) { - tt = suppressMessages(system.time(o <- forderv(x, cols, sort=TRUE, retGrp=FALSE))) # system.time does a gc, so we don't want this always on, until refcnt is on by default in R - # suppress needed for tests 644 and 645 in verbose mode - cat("forder took", tt["user.self"]+tt["sys.self"], "sec\n") - } else { - o = forderv(x, cols, sort=TRUE, retGrp=FALSE) - } + if (verbose) { + # we now also retGrp=TRUE #4387 for !physical + tt = suppressMessages(system.time(o <- forderv(x, cols, sort=TRUE, retGrp=!physical))) # system.time does a gc, so we don't want this always on, until refcnt is on by default in R + # suppress needed for tests 644 and 645 in verbose mode + cat("forder took", tt["user.self"]+tt["sys.self"], "sec\n") } else { - if (verbose) cat("setkey on columns ", brackify(cols), " using existing index '", newkey, "'\n", sep="") - o = getindex(x, newkey) + o = forderv(x, cols, sort=TRUE, retGrp=!physical) } - if (!physical) { - if (is.null(attr(x, "index", exact=TRUE))) setattr(x, "index", integer()) - setattr(attr(x, "index", exact=TRUE), paste0("__", cols, collapse=""), o) + if (!physical) { # index saved from C forderLazy already return(invisible(x)) } - setattr(x,"index",NULL) # TO DO: reorder existing indexes likely faster than rebuilding again. Allow optionally. Simpler for now to clear. if (length(o)) { + setattr(x,"index",NULL) # TO DO: reorder existing indexes likely faster than rebuilding again. Allow optionally. Simpler for now to clear. Only when order changes. if (verbose) { last.started.at = proc.time() } .Call(Creorder,x,o) if (verbose) { cat("reorder took", timetaken(last.started.at), "\n"); flush.console() } @@ -172,7 +152,7 @@ is.sorted = function(x, by=seq_along(x)) { } ORDERING_TYPES = c('logical', 'integer', 'double', 'complex', 'character') -forderv = function(x, by=seq_along(x), retGrp=FALSE, sort=TRUE, order=1L, na.last=FALSE, lazy=NA) { +forderv = function(x, by=seq_along(x), retGrp=FALSE, sort=TRUE, order=1L, na.last=FALSE, lazy=getOption("datatable.forder.lazy",NA)) { if (is.atomic(x)) { # including forderv(NULL) which returns error consistent with base::order(NULL), if (!missing(by) && !is.null(by)) stop("x is a single vector, non-NULL 'by' doesn't make sense") by = NULL diff --git a/src/forder.c b/src/forder.c index 44270b3dd0..fe3d697b76 100644 --- a/src/forder.c +++ b/src/forder.c @@ -835,40 +835,65 @@ bool colsKeyHead(SEXP x, SEXP cols) { return true; } -// paste0("__", cols, collapse="") -SEXP idxName(SEXP cols) { - if (!isString(cols)) - error("internal error: 'cols' must be a character"); // # nocov +// paste0("__", names(x)[cols], collapse="") +SEXP idxName(SEXP x, SEXP cols) { + if (!isInteger(cols)) + error("internal error: 'cols' must be an integer"); // # nocov + SEXP idx_names = PROTECT(subsetVector(getAttrib(x, R_NamesSymbol), cols)); SEXP char_underscore2 = PROTECT(ScalarString(mkChar("__"))); SEXP char_empty = PROTECT(ScalarString(mkChar(""))); SEXP sym_paste0 = install("paste0"); - SEXP call_paste0 = PROTECT(lang4(sym_paste0, char_underscore2, cols, char_empty)); + SEXP call_paste0 = PROTECT(lang4(sym_paste0, char_underscore2, idx_names, char_empty)); SET_TAG(CDDDR(call_paste0), install("collapse")); SEXP ans = PROTECT(eval(call_paste0, R_GlobalEnv)); - UNPROTECT(4); + UNPROTECT(5); return ans; } -// attr(attr(x, "index"), idxName(names(x)[cols])) +// attr(attr(x, "index"), idxName(x, cols)) SEXP getIndex(SEXP x, SEXP cols) { if (!isInteger(cols)) error("internal error: 'cols' must be an integer"); // # nocov SEXP index = getAttrib(x, sym_index); if (isNull(index)) return index; - SEXP idx_names = PROTECT(subsetVector(getAttrib(x, R_NamesSymbol), cols)); - SEXP name_idx = PROTECT(idxName(idx_names)); + SEXP name_idx = PROTECT(idxName(x, cols)); SEXP sym_idx = install(CHAR(STRING_ELT(name_idx, 0))); SEXP idx = getAttrib(index, sym_idx); - UNPROTECT(2); + UNPROTECT(1); return idx; } +// attr(attr(x, "index"), idxName(x, cols)) <- o +void putIndex(SEXP x, SEXP cols, SEXP o) { + if (!isInteger(cols)) + error("internal error: 'cols' must be an integer"); // # nocov + if (!isInteger(o)) + error("internal error: 'o' must be an integer"); // # nocov + SEXP index = getAttrib(x, sym_index); + if (isNull(index)) { + index = PROTECT(allocVector(INTSXP, 0)); + setAttrib(x, sym_index, index); + UNPROTECT(1); + } + SEXP name_idx = PROTECT(idxName(x, cols)); + SEXP sym_idx = install(CHAR(STRING_ELT(name_idx, 0))); + if (!isNull(getAttrib(index, sym_idx))) + error("internal error: trying to put index but it was already there, that should have been escaped before"); + setAttrib(index, sym_idx, o); + UNPROTECT(1); +} + // isTRUE(getOption("datatable.use.index")) bool GetUseIndex() { return LOGICAL(GetOption(install("datatable.use.index"), R_NilValue))[0]==TRUE; } +// isTRUE(getOption("datatable.auto.index")) +bool GetAutoIndex() { + return LOGICAL(GetOption(install("datatable.auto.index"), R_NilValue))[0]==TRUE; +} + // lazy forder, re-use existing key or index if possible, otherwise call forder SEXP forderLazy(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, SEXP naArg, SEXP lazyArg) { const bool verbose = GetVerbose(); @@ -880,63 +905,78 @@ SEXP forderLazy(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascAr error("DT is NULL"); if (!IS_TRUE_OR_FALSE(retGrpArg)) error("retGrp must be TRUE or FALSE"); + bool retGrp = (bool)LOGICAL(retGrpArg)[0]; if (!IS_TRUE_OR_FALSE(sortGroupsArg)) error("sort must be TRUE or FALSE"); + bool sortGroups = (bool)LOGICAL(sortGroupsArg)[0]; if (!isLogical(naArg) || LENGTH(naArg) != 1) error("na.last must be logical TRUE, FALSE or NA of length 1"); + bool na = (bool)LOGICAL(naArg)[0]; if (!isInteger(ascArg)) error("order must be integer"); // # nocov # coerced to int in R if (!isLogical(lazyArg) || LENGTH(lazyArg) != 1) error("lazy must be logical TRUE, FALSE or NA of length 1"); + int lazy = LOGICAL(lazyArg)[0]; if (!length(DT)) return allocVector(INTSXP, 0); int opt = -1; // -1=unknown, 0=none, 1=keyOpt, 2=idxOpt - if (LOGICAL(lazyArg)[0]==NA_LOGICAL) { - if (isNewList(DT) && - !LOGICAL(retGrpArg)[0] && - LOGICAL(sortGroupsArg)[0] && - !LOGICAL(naArg)[0] && - all1(ascArg)) { + if (lazy==NA_LOGICAL) { + if (isNewList(DT) && sortGroups && !na && + all1(ascArg)) { // could ascArg=-1 be handled by a rev()? opt = -1; } else { + if (verbose) + Rprintf("forder: opt not possible: isNewList(DT)=%d, sortGroups=%d, !na=%d, all1(ascArg)=%d\n", isNewList(DT), sortGroups, !na, all1(ascArg)); opt = 0; } - } else if (LOGICAL(lazyArg)[0]) { + } else if (lazy) { if (!isNewList(DT)) error("internal error: lazy set to TRUE but DT is not a list"); // # nocov - if (LOGICAL(retGrpArg)[0]) - error("internal error: lazy set to TRUE but retGrp is TRUE"); // # nocov # proposal to change that: #4346 - if (!LOGICAL(sortGroupsArg)[0]) + if (!sortGroups) error("internal error: lazy set to TRUE but sort is FALSE"); // # nocov - if (LOGICAL(naArg)[0]!=FALSE) + if (na) error("internal error: lazy set to TRUE but na.last is not FALSE"); // # nocov if (!all1(ascArg)) error("internal error: lazy set to TRUE but order is not all 1"); // # nocov opt = -1; - } else if (!LOGICAL(lazyArg)[0]) { + } else if (!lazy) { opt = 0; } SEXP ans = R_NilValue; - if (opt == -1) { - if (colsKeyHead(DT, by)) { - opt = 1; // keyOpt - ans = PROTECT(allocVector(INTSXP, 0)); protecti++; - } + if (opt == -1 && !retGrp && colsKeyHead(DT, by)) { + opt = 1; // keyOpt + ans = PROTECT(allocVector(INTSXP, 0)); protecti++; } if (opt == -1 && GetUseIndex()) { SEXP idx = getIndex(DT, by); if (!isNull(idx)) { - opt = 2; // idxOpt - ans = idx; + bool hasGrp = !isNull(getAttrib(idx, sym_starts)); + if ((!hasGrp && !retGrp) || + (hasGrp && retGrp)) { + opt = 2; // idxOpt retGrp==hasGrp, if condition unfolded for codecov + } else if (hasGrp && !retGrp) { + opt = 2; // idxOpt but need to drop groups + idx = copyAsPlain(idx); // we could shallow copy here + setAttrib(idx, sym_starts, R_NilValue); + setAttrib(idx, sym_maxgrpn, R_NilValue); + } // else: !hasGrp && retGrp: need to compute groups as they are not in index + if (opt == 2) + ans = idx; } } if (opt < 1) { ans = PROTECT(forder(DT, by, retGrpArg, sortGroupsArg, ascArg, naArg)); protecti++; + if (opt == -1 && // opt==0 means that arguments (na.last,..) were not of type index, or lazy=FALSE + GetUseIndex() && GetAutoIndex()) { + putIndex(DT, by, ans); + if (verbose) + Rprintf("forder: setting index (retGrp=%d) on DT: %s\n", retGrp, CHAR(STRING_ELT(idxName(DT, by), 0))); + } } if (verbose) From efb3319e51b6cb0ca6f21179931eb4a507742fcd Mon Sep 17 00:00:00 2001 From: jangorecki Date: Mon, 20 Apr 2020 21:33:01 +0100 Subject: [PATCH 11/53] calc order, not groups --- R/data.table.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 9292ee940b..69da54b790 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2994,10 +2994,9 @@ isReallyReal = function(x) { if (!getOption("datatable.auto.index")) return(NULL) if (verbose) {cat("Creating new index '", paste0(names(i), collapse = "__"),"'\n",sep="");flush.console()} if (verbose) {last.started.at=proc.time();cat("Creating index", paste0(names(i), collapse = "__"), "done in ... ");flush.console()} - setindexv(x, names(i)) + idx = forderv(x, names(i), sort=TRUE, retGrp=FALSE) if (verbose) {cat(timetaken(last.started.at),"\n");flush.console()} if (verbose) {cat("Optimized subsetting with index '", paste0(names(i), collapse = "__"),"'\n",sep="");flush.console()} - idx = attr(attr(x, "index", exact=TRUE), paste0("__", names(i), collapse = ""), exact=TRUE) idxCols = names(i) } if(!is.null(idxCols)){ From 68b378dee0baa775261a50db33e5fc5601ccf49d Mon Sep 17 00:00:00 2001 From: jangorecki Date: Mon, 20 Apr 2020 21:42:29 +0100 Subject: [PATCH 12/53] expect to reach optimization --- R/data.table.R | 2 +- R/setkey.R | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 69da54b790..5de53bd104 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2994,7 +2994,7 @@ isReallyReal = function(x) { if (!getOption("datatable.auto.index")) return(NULL) if (verbose) {cat("Creating new index '", paste0(names(i), collapse = "__"),"'\n",sep="");flush.console()} if (verbose) {last.started.at=proc.time();cat("Creating index", paste0(names(i), collapse = "__"), "done in ... ");flush.console()} - idx = forderv(x, names(i), sort=TRUE, retGrp=FALSE) + idx = forderv(x, names(i), sort=TRUE, retGrp=FALSE, lazy=TRUE) if (verbose) {cat(timetaken(last.started.at),"\n");flush.console()} if (verbose) {cat("Optimized subsetting with index '", paste0(names(i), collapse = "__"),"'\n",sep="");flush.console()} idxCols = names(i) diff --git a/R/setkey.R b/R/setkey.R index d2507a542f..da51596e77 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -72,11 +72,11 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU newkey = paste0(cols, collapse="__") if (verbose) { # we now also retGrp=TRUE #4387 for !physical - tt = suppressMessages(system.time(o <- forderv(x, cols, sort=TRUE, retGrp=!physical))) # system.time does a gc, so we don't want this always on, until refcnt is on by default in R + tt = suppressMessages(system.time(o <- forderv(x, cols, sort=TRUE, retGrp=!physical, lazy=TRUE))) # system.time does a gc, so we don't want this always on, until refcnt is on by default in R # suppress needed for tests 644 and 645 in verbose mode cat("forder took", tt["user.self"]+tt["sys.self"], "sec\n") } else { - o = forderv(x, cols, sort=TRUE, retGrp=!physical) + o = forderv(x, cols, sort=TRUE, retGrp=!physical, lazy=TRUE) } if (!physical) { # index saved from C forderLazy already return(invisible(x)) From 0bd0e1e72309d1c346b2d21489ee8851d1caeb13 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Mon, 20 Apr 2020 22:21:19 +0100 Subject: [PATCH 13/53] skip opt for list --- src/forder.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/forder.c b/src/forder.c index fe3d697b76..e5cf6a50b0 100644 --- a/src/forder.c +++ b/src/forder.c @@ -839,7 +839,10 @@ bool colsKeyHead(SEXP x, SEXP cols) { SEXP idxName(SEXP x, SEXP cols) { if (!isInteger(cols)) error("internal error: 'cols' must be an integer"); // # nocov - SEXP idx_names = PROTECT(subsetVector(getAttrib(x, R_NamesSymbol), cols)); + SEXP dt_names = getAttrib(x, R_NamesSymbol); + if (!isString(dt_names)) + error("internal error: 'DT' has no names"); // # nocov + SEXP idx_names = PROTECT(subsetVector(dt_names, cols)); SEXP char_underscore2 = PROTECT(ScalarString(mkChar("__"))); SEXP char_empty = PROTECT(ScalarString(mkChar(""))); SEXP sym_paste0 = install("paste0"); @@ -923,17 +926,18 @@ SEXP forderLazy(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascAr int opt = -1; // -1=unknown, 0=none, 1=keyOpt, 2=idxOpt if (lazy==NA_LOGICAL) { - if (isNewList(DT) && sortGroups && !na && + if (INHERITS(DT, char_datatable) && // unnamed list should not be optimized + sortGroups && !na && all1(ascArg)) { // could ascArg=-1 be handled by a rev()? opt = -1; } else { if (verbose) - Rprintf("forder: opt not possible: isNewList(DT)=%d, sortGroups=%d, !na=%d, all1(ascArg)=%d\n", isNewList(DT), sortGroups, !na, all1(ascArg)); + Rprintf("forder: opt not possible: is.data.table(DT)=%d, sortGroups=%d, !na=%d, all1(ascArg)=%d\n", INHERITS(DT,char_datatable), sortGroups, !na, all1(ascArg)); opt = 0; } } else if (lazy) { - if (!isNewList(DT)) - error("internal error: lazy set to TRUE but DT is not a list"); // # nocov + if (!INHERITS(DT,char_datatable)) + error("internal error: lazy set to TRUE but DT is not a data.table"); // # nocov if (!sortGroups) error("internal error: lazy set to TRUE but sort is FALSE"); // # nocov if (na) @@ -978,7 +982,6 @@ SEXP forderLazy(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascAr Rprintf("forder: setting index (retGrp=%d) on DT: %s\n", retGrp, CHAR(STRING_ELT(idxName(DT, by), 0))); } } - if (verbose) Rprintf("forder: opt=%d, took %.3fs\n", opt, omp_get_wtime()-tic); UNPROTECT(protecti); From 71f5aeb8120bad8f03293c555e2d54dd759b1fe6 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Mon, 20 Apr 2020 23:10:58 +0100 Subject: [PATCH 14/53] override retGrp=F by retGrp=T is legit use --- src/forder.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/forder.c b/src/forder.c index e5cf6a50b0..b5bb6af201 100644 --- a/src/forder.c +++ b/src/forder.c @@ -881,7 +881,8 @@ void putIndex(SEXP x, SEXP cols, SEXP o) { } SEXP name_idx = PROTECT(idxName(x, cols)); SEXP sym_idx = install(CHAR(STRING_ELT(name_idx, 0))); - if (!isNull(getAttrib(index, sym_idx))) + SEXP idx = getAttrib(index, sym_idx); + if (!isNull(idx) && !isNull(getAttrib(idx, sym_starts))) // we override retGrp=F index with retGrp=T index error("internal error: trying to put index but it was already there, that should have been escaped before"); setAttrib(index, sym_idx, o); UNPROTECT(1); @@ -982,6 +983,7 @@ SEXP forderLazy(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascAr Rprintf("forder: setting index (retGrp=%d) on DT: %s\n", retGrp, CHAR(STRING_ELT(idxName(DT, by), 0))); } } + if (verbose) Rprintf("forder: opt=%d, took %.3fs\n", opt, omp_get_wtime()-tic); UNPROTECT(protecti); From de916aadb696175d9def53d6d3f3afb1608b851e Mon Sep 17 00:00:00 2001 From: jangorecki Date: Mon, 20 Apr 2020 23:21:48 +0100 Subject: [PATCH 15/53] more backward compatiblility, no retGrp from getindex --- R/setkey.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/setkey.R b/R/setkey.R index da51596e77..57a52dbe4f 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -110,7 +110,7 @@ getindex = function(x, name) { if (!is.null(ans) && (!is.integer(ans) || (length(ans)!=nrow(x) && length(ans)!=0L))) { stop("Internal error: index '",name,"' exists but is invalid") # nocov } - ans + c(ans) ## drop starts and maxgrpn attributes } haskey = function(x) !is.null(key(x)) From 0a3da1dea9c25d700ec6abade356a68ff664eeb1 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Mon, 20 Apr 2020 23:22:11 +0100 Subject: [PATCH 16/53] more verbose messages during opts in forderLazy --- src/forder.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/forder.c b/src/forder.c index b5bb6af201..520fc42fd9 100644 --- a/src/forder.c +++ b/src/forder.c @@ -954,6 +954,8 @@ SEXP forderLazy(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascAr if (opt == -1 && !retGrp && colsKeyHead(DT, by)) { opt = 1; // keyOpt ans = PROTECT(allocVector(INTSXP, 0)); protecti++; + if (verbose) + Rprintf("forder: using key: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); } if (opt == -1 && GetUseIndex()) { @@ -969,8 +971,11 @@ SEXP forderLazy(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascAr setAttrib(idx, sym_starts, R_NilValue); setAttrib(idx, sym_maxgrpn, R_NilValue); } // else: !hasGrp && retGrp: need to compute groups as they are not in index - if (opt == 2) + if (opt == 2) { ans = idx; + if (verbose) + Rprintf("forder: using existing index: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); + } } } From 068c1a9854dffe69fa9a1418d2b385ce5743db8d Mon Sep 17 00:00:00 2001 From: jangorecki Date: Mon, 20 Apr 2020 23:48:05 +0100 Subject: [PATCH 17/53] recycle order 1/-1 argument in one place --- R/setkey.R | 1 - src/bmerge.c | 6 +----- src/forder.c | 12 +++++++++++- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/R/setkey.R b/R/setkey.R index 57a52dbe4f..fb30c60509 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -159,7 +159,6 @@ forderv = function(x, by=seq_along(x), retGrp=FALSE, sort=TRUE, order=1L, na.las } else { if (!length(x)) return(integer(0L)) # e.g. forderv(data.table(NULL)) and forderv(list()) return integer(0L)) by = colnamesInt(x, by, check_dups=FALSE) - if (length(order) == 1L) order = rep(order, length(by)) } order = as.integer(order) # length and contents of order being +1/-1 is checked at C level .Call(CforderLazy, x, by, retGrp, sort, order, na.last, lazy) # returns integer() if already sorted, regardless of sort=TRUE|FALSE diff --git a/src/bmerge.c b/src/bmerge.c index 567ca496da..090629f379 100644 --- a/src/bmerge.c +++ b/src/bmerge.c @@ -142,11 +142,7 @@ SEXP bmerge(SEXP iArg, SEXP xArg, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP allGrp1[0] = TRUE; protecti += 2; - SEXP order = PROTECT(allocVector(INTSXP, length(icolsArg))); - protecti++; - for (int j=0; j", "Index: ", " a b", "1: 1 4", "2: 2 5", "3: 3 6")) + c("Key: ", "Indices: , ", " a b", "1: 1 4", "2: 2 5", "3: 3 6")) ## index 'b' is still good, so we now keep it # dev regression #2285 cat("A B C\n1 2 3\n4 5 6", file=f<-tempfile()) @@ -12527,7 +12527,7 @@ test(1896.6, nrow(DT[, .N, by = .(y, z, x)]), 5L) DT = data.table(a = c(3, 2, 1, 2, 3), b = c(1, 2, 1, 1, 2)) setindexv(DT, list('a', c('a', 'b'))) test(1897.1, indices(DT), c("a", "a__b")) -test(1897.2, attributes(attr(DT, 'index')), +test(1897.2, lapply(attributes(attr(DT, 'index')), c), ## lapply(, c) to ensure no starts, maxgrpn attributes list(`__a` = c(3L, 2L, 4L, 1L, 5L), `__a__b` = c(3L, 4L, 2L, 1L, 5L))) @@ -12567,9 +12567,9 @@ test(1899.18, as.matrix(DT, rownames=TRUE, rownames.value=1:nrow(DT)), error="ro # index argument for fread, #2633 DT_str = c('a,b\n3,1\n2,2\n1,1\n2,1\n3,2') -test(1900.1, attributes(attr(fread(DT_str, index = 'a'), 'index')), +test(1900.1, lapply(attributes(attr(fread(DT_str, index = 'a'), 'index')), c), list(`__a` = c(3L, 2L, 4L, 1L, 5L))) -test(1900.2, attributes(attr(fread(DT_str, index = list('a,b', c('b', 'a'), 'a')), 'index')), +test(1900.2, lapply(attributes(attr(fread(DT_str, index = list('a,b', c('b', 'a'), 'a')), 'index')), c), list(`__a__b` = c(3L, 4L, 2L, 1L, 5L), `__b__a` = c(3L, 4L, 1L, 2L, 5L), `__a` = c(3L, 2L, 4L, 1L, 5L))) @@ -12580,7 +12580,7 @@ test(1900.4, fread(DT_str, index = list('a', 1L)), # col.names applied before index test(1900.5, fread(DT_str, col.names = c('c', 'd'), index = 'a'), error = 'some columns are not in the data.table') -test(1900.6, attributes(attr(fread(DT_str, index = c('a', 'b')), 'index')), +test(1900.6, lapply(attributes(attr(fread(DT_str, index = c('a', 'b')), 'index')), c), list(`__a__b` = c(3L, 4L, 2L, 1L, 5L))) # . within bquote shouldn't be swapped to list, #1912 diff --git a/src/forder.c b/src/forder.c index 64115d2a28..1151e514b9 100644 --- a/src/forder.c +++ b/src/forder.c @@ -898,12 +898,23 @@ void putIndex(SEXP x, SEXP cols, SEXP o) { // isTRUE(getOption("datatable.use.index")) bool GetUseIndex() { - return LOGICAL(GetOption(install("datatable.use.index"), R_NilValue))[0]==TRUE; + SEXP opt = GetOption(install("datatable.use.index"), R_NilValue); + if (!IS_TRUE_OR_FALSE(opt)) + error("'datatable.use.index' option must be TRUE or FALSE"); + return LOGICAL(opt)[0]; } // isTRUE(getOption("datatable.auto.index")) bool GetAutoIndex() { - return LOGICAL(GetOption(install("datatable.auto.index"), R_NilValue))[0]==TRUE; + // for now temporarily 'forder.auto.index' not 'auto.index' to disabled it by default + // because it writes attr on .SD which is re-used by all groups leading to incorrect results + // DT[, .(uN=uniqueN(.SD)), by=A] + SEXP opt = GetOption(install("datatable.forder.auto.index"), R_NilValue); + if (isNull(opt)) + return false; + if (!IS_TRUE_OR_FALSE(opt)) + error("'datatable.forder.auto.index' option must be TRUE or FALSE"); + return LOGICAL(opt)[0]; } // lazy forder, re-use existing key or index if possible, otherwise call forder @@ -990,7 +1001,8 @@ SEXP forderLazy(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascAr if (opt < 1) { ans = PROTECT(forder(DT, by, retGrpArg, sortGroupsArg, ascArg, naArg)); protecti++; if (opt == -1 && // opt==0 means that arguments (na.last,..) were not of type index, or lazy=FALSE - GetUseIndex() && GetAutoIndex()) { + GetUseIndex() && + GetAutoIndex()) { // disabled by default, use datatable.forder.auto.index=T to enable, do not export/document, use for debugging only putIndex(DT, by, ans); if (verbose) Rprintf("forderLazy: setting index (retGrp=%d) on DT: %s\n", retGrp, CHAR(STRING_ELT(idxName(DT, by), 0))); From 514913a77931cc1a4a36139333ab7bd591c2a1a5 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Tue, 21 Apr 2020 12:13:17 +0100 Subject: [PATCH 22/53] fix tests --- inst/tests/tests.Rraw | 11 ++++++----- src/forder.c | 8 +++++--- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 59a6053ba4..45f7571547 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -13360,7 +13360,7 @@ test(1962.042, forderv(DT, na.last = c(TRUE, FALSE)), error='na.last must be lo test(1962.043, forderv(DT$a, by = 'a'), error='x is a single vector, non-NULL') test(1962.044, forderv(DT$a, order = 2L), error='Item 1 of order (ascending/descending) is 2. Must be +1 or -1') test(1962.045, forderv(DT$a, order = c(1L, -1L)), error='Input is an atomic vector (not a list of columns) but order= is not a length 1 integer') -test(1962.0461, forderv(DT, order = c(1L, -1L)), error="Either order= is not integer or its length (2) is different to by='s length (1)") +test(1962.0461, forderv(DT, order = c(1L, -1L)), error="length (2) is different to by='s length (1)") test(1962.0462, forderv(DT, order = 2L), error='Item 1 of order (ascending/descending) is 2. Must be +1 or -1') test(1962.0471, forderv(mean), error="'x' argument must be data.table compatible") test(1962.0472, forderv(DT, by=mean), error="argument specifying columns must be character or numeric") @@ -16927,13 +16927,15 @@ test(2139.56, forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt options(datatable.verbose=FALSE) d = copy(dd) setkeyv(d, c("a","b")) -setindexv(d, list(c("a","a"), c("b","a"))) +setindexv(d, list(c("a","b"), c("b","a"))) options(datatable.verbose=TRUE) ab = structure(integer(), starts=1:2, maxgrpn=1L) ba = structure(2:1, starts=1:2, maxgrpn=1L) -test(2139.61, forderv(d, c("a","b"), retGrp=TRUE), ab, output="forder.*opt=0.*took") +forderv(d, c("a","b")) +test(2139.60, forderv(d, c("a","b")), c(ab), output="forder.*opt=1.*took") +test(2139.61, forderv(d, c("a","b"), retGrp=TRUE), ab, output="forder.*opt=2.*took") test(2139.62, forderv(d, c("a","b"), retGrp=TRUE, lazy=FALSE), ab, output="forder.*opt=0.*took") -test(2139.63, forderv(d, c("b","a"), retGrp=TRUE), ba, output="forder.*opt=0.*took") +test(2139.63, forderv(d, c("b","a"), retGrp=TRUE), ba, output="forder.*opt=2.*took") test(2139.64, forderv(d, c("b","a"), retGrp=TRUE, lazy=FALSE), ba, output="forder.*opt=0.*took") test(2139.65, forderv(d, c("a","b"), na.last=TRUE), integer(), output="forder.*opt=0.*took") test(2139.66, forderv(d, c("a","b"), na.last=TRUE, lazy=FALSE), integer(), output="forder.*opt=0.*took") @@ -16965,4 +16967,3 @@ test(2139.92, forderv(d, "b", lazy=FALSE), 2:1, output="forder.*opt=0.*took") options(op2) options(op) test(2139.99, forderv(data.table(a=1), lazy=c(TRUE, TRUE)), error="lazy must be") - diff --git a/src/forder.c b/src/forder.c index 1151e514b9..eb2d83521f 100644 --- a/src/forder.c +++ b/src/forder.c @@ -985,11 +985,13 @@ SEXP forderLazy(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascAr (hasGrp && retGrp)) { opt = 2; // idxOpt retGrp==hasGrp, if condition unfolded for codecov } else if (hasGrp && !retGrp) { - opt = 2; // idxOpt but need to drop groups - idx = copyAsPlain(idx); // we could shallow copy here + idx = copyAsPlain(idx); setAttrib(idx, sym_starts, R_NilValue); setAttrib(idx, sym_maxgrpn, R_NilValue); - } // else: !hasGrp && retGrp: need to compute groups as they are not in index + opt = 2; // idxOpt but need to drop groups + } else { // !hasGrp && retGrp + Rprintf("forderLazy: index found but no retGrp: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); + } if (opt == 2) { ans = idx; if (verbose) From ca2823ddd2db1eb9479a3278474f7014db316dd1 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Tue, 21 Apr 2020 14:55:10 +0100 Subject: [PATCH 23/53] code coverage --- R/data.table.R | 17 +- inst/tests/tests.Rraw | 13 +- src/forder.c | 409 +++++++++++++++++++++--------------------- 3 files changed, 221 insertions(+), 218 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 699d2ba369..f4e1453314 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2992,17 +2992,16 @@ isReallyReal = function(x) { if (is.null(idx)){ ## if nothing else helped, auto create a new index that can be used if (!getOption("datatable.auto.index")) return(NULL) - if (verbose) {cat("Creating new index '", paste0(names(i), collapse = "__"),"'\n",sep="");flush.console()} - if (verbose) {last.started.at=proc.time();cat("Creating index", paste0(names(i), collapse = "__"), "done in ... ");flush.console()} - if (!getOption("datatable.forder.auto.index", FALSE)) { ## forder can write index, but disabled for now - setindexv(x, names(i)) - idx = attr(attr(x, "index", exact=TRUE), paste0("__", names(i), collapse = ""), exact=TRUE) - } else { - idx = forderv(x, names(i), sort=TRUE, retGrp=FALSE, lazy=TRUE) + idxCols = names(i) + if (verbose) {cat("Creating new index '", paste0(idxCols, collapse = "__"),"'\n",sep="");flush.console()} + if (verbose) {last.started.at=proc.time();cat("Creating index", paste0(idxCols, collapse = "__"), "done in ... ");flush.console()} + idx = forderv(x, idxCols, sort=TRUE, retGrp=FALSE, lazy=TRUE) + if (!isTRUE(getOption("datatable.forder.auto.index"))) { ## forder can write index, but disabled for now + if (is.null(attr(x, "index", exact=TRUE))) setattr(x, "index", integer()) + setattr(attr(x, "index", exact=TRUE), paste0("__", idxCols, collapse=""), idx) } if (verbose) {cat(timetaken(last.started.at),"\n");flush.console()} - if (verbose) {cat("Optimized subsetting with index '", paste0(names(i), collapse = "__"),"'\n",sep="");flush.console()} - idxCols = names(i) + if (verbose) {cat("Optimized subsetting with index '", paste0(idxCols, collapse = "__"),"'\n",sep="");flush.console()} } if(!is.null(idxCols)){ setkeyv(i, idxCols) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 45f7571547..1a0f63f9e5 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16965,5 +16965,16 @@ op2 = options(datatable.use.index=FALSE) test(2139.91, forderv(d, "b"), 2:1, output="forder.*opt=-1.*took") test(2139.92, forderv(d, "b", lazy=FALSE), 2:1, output="forder.*opt=0.*took") options(op2) -options(op) +options(datatable.verbose=FALSE) +d = data.table(x = 2:1) +make_retGrp0_idx = d[x==1L] +test(2139.93, attr(attr(d, "index"), "__x"), 2:1) +options(datatable.verbose=TRUE) +test(2139.94, forderv(d, "x", retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1L), output="forder.*index found but no retGrp.*forder.*opt=-1.*took") +d = data.table(x = 2:1) +op2 = options("datatable.forder.auto.index"=TRUE) +test(2139.95, d[x==1L], data.table(x=1L), output="forder.*setting index.*retGrp=0") +test(2139.96, forderv(DT, "x", retGrp=TRUE), output="forder.*setting index.*retGrp=1") +options(op2) test(2139.99, forderv(data.table(a=1), lazy=c(TRUE, TRUE)), error="lazy must be") +options(op) diff --git a/src/forder.c b/src/forder.c index eb2d83521f..991028f43d 100644 --- a/src/forder.c +++ b/src/forder.c @@ -809,214 +809,6 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, S return ans; } -// all(x==1L) -static bool all1(SEXP x) { - if (!isInteger(x)) - error("internal error: all1 got non-integer"); // # nocov - int *xp = INTEGER(x); - for (int i=0; i Date: Tue, 21 Apr 2020 15:09:57 +0100 Subject: [PATCH 24/53] minor update for safer use of internal option --- R/setkey.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/setkey.R b/R/setkey.R index 322d12b1fe..9ffc1c0983 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -79,7 +79,7 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU o = forderv(x, cols, sort=TRUE, retGrp=!physical, lazy=TRUE) } if (!physical) { # index COULD BE saved from C forderLazy already, but disabled for now - if (!getOption("datatable.forder.auto.index", FALSE)) { + if (!isTRUE(getOption("datatable.forder.auto.index"))) { if (is.null(attr(x, "index", exact=TRUE))) setattr(x, "index", integer()) setattr(attr(x, "index", exact=TRUE), paste0("__", cols, collapse=""), o) } From 4d0dec317cdcc9f3383e3d8c0885b549effa7b8d Mon Sep 17 00:00:00 2001 From: jangorecki Date: Tue, 21 Apr 2020 15:12:06 +0100 Subject: [PATCH 25/53] fix bad name in unit test --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 1a0f63f9e5..95a9320120 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16974,7 +16974,7 @@ test(2139.94, forderv(d, "x", retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1 d = data.table(x = 2:1) op2 = options("datatable.forder.auto.index"=TRUE) test(2139.95, d[x==1L], data.table(x=1L), output="forder.*setting index.*retGrp=0") -test(2139.96, forderv(DT, "x", retGrp=TRUE), output="forder.*setting index.*retGrp=1") +test(2139.96, forderv(d, "x", retGrp=TRUE), output="forder.*setting index.*retGrp=1") options(op2) test(2139.99, forderv(data.table(a=1), lazy=c(TRUE, TRUE)), error="lazy must be") options(op) From 1606046476d20a806fff6082bdd15d909eac0285 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Fri, 8 May 2020 20:39:17 +0100 Subject: [PATCH 26/53] retGrp=F requires downgrade idx and it seems to be costly --- src/forder.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/forder.c b/src/forder.c index 991028f43d..7fff7f0569 100644 --- a/src/forder.c +++ b/src/forder.c @@ -1526,7 +1526,11 @@ SEXP forderLazy(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascAr (hasGrp && retGrp)) { opt = 2; // idxOpt retGrp==hasGrp, if condition unfolded for codecov } else if (hasGrp && !retGrp) { - idx = copyAsPlain(idx); + // shallow_duplicate is faster than copyAsPlain, but shallow_duplicate is AFAIK good for VECSXP, not for INTSXP + // it is still the bottleneck in this opt, it is now better to call retGrp=TRUE and just not use those extra arguments + // can we do better here? real shallow for INTSXP? If we could just re-point data pointer... like we do for DT columns + // SEXP new; INTEGER(new) = INTEGER(idx); setAttrib(new, ..., R_NilValue) + idx = shallow_duplicate(idx); setAttrib(idx, sym_starts, R_NilValue); setAttrib(idx, sym_maxgrpn, R_NilValue); opt = 2; // idxOpt but need to drop groups From a9b01fffb59148f81c0709cfe1efedc541796bf4 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Wed, 20 May 2020 02:25:14 +0100 Subject: [PATCH 27/53] NA stats from forder --- R/setkey.R | 6 +-- inst/tests/tests.Rraw | 7 ++- src/bmerge.c | 2 +- src/data.table.h | 6 ++- src/forder.c | 109 ++++++++++++++++++++++++++++++++---------- src/init.c | 4 ++ 6 files changed, 99 insertions(+), 35 deletions(-) diff --git a/R/setkey.R b/R/setkey.R index 9ffc1c0983..de6069a470 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -156,7 +156,7 @@ is.sorted = function(x, by=seq_along(x)) { } ORDERING_TYPES = c('logical', 'integer', 'double', 'complex', 'character') -forderv = function(x, by=seq_along(x), retGrp=FALSE, sort=TRUE, order=1L, na.last=FALSE, lazy=getOption("datatable.forder.lazy",NA)) { +forderv = function(x, by=seq_along(x), retGrp=FALSE, retStats=retGrp, sort=TRUE, order=1L, na.last=FALSE, lazy=getOption("datatable.forder.lazy",NA)) { if (is.atomic(x)) { # including forderv(NULL) which returns error consistent with base::order(NULL), if (!missing(by) && !is.null(by)) stop("x is a single vector, non-NULL 'by' doesn't make sense") by = NULL @@ -165,7 +165,7 @@ forderv = function(x, by=seq_along(x), retGrp=FALSE, sort=TRUE, order=1L, na.las by = colnamesInt(x, by, check_dups=FALSE) } order = as.integer(order) # length and contents of order being +1/-1 is checked at C level - .Call(CforderLazy, x, by, retGrp, sort, order, na.last, lazy) # returns integer() if already sorted, regardless of sort=TRUE|FALSE + .Call(CforderLazy, x, by, retGrp, retStats, sort, order, na.last, lazy) # returns integer() if already sorted, regardless of sort=TRUE|FALSE } forder = function(..., na.last=TRUE, decreasing=FALSE) @@ -202,7 +202,7 @@ forder = function(..., na.last=TRUE, decreasing=FALSE) data = eval(sub, parent.frame(), parent.frame()) } stopifnot(isTRUEorFALSE(decreasing)) - o = forderv(data, seq_along(data), sort=TRUE, retGrp=FALSE, order= if (decreasing) -asc else asc, na.last) + o = forderv(data, seq_along(data), retGrp=FALSE, retStats=FALSE, sort=TRUE, order=if (decreasing) -asc else asc, na.last=na.last) if (!length(o) && length(data)>=1L) o = seq_along(data[[1L]]) else o o } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 95a9320120..921616b5b9 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16929,9 +16929,8 @@ d = copy(dd) setkeyv(d, c("a","b")) setindexv(d, list(c("a","b"), c("b","a"))) options(datatable.verbose=TRUE) -ab = structure(integer(), starts=1:2, maxgrpn=1L) -ba = structure(2:1, starts=1:2, maxgrpn=1L) -forderv(d, c("a","b")) +ab = structure(integer(), starts=1:2, maxgrpn=1L, hasna=0L, hasinfnan=0L) +ba = structure(2:1, starts=1:2, maxgrpn=1L, hasna=0L, hasinfnan=0L) test(2139.60, forderv(d, c("a","b")), c(ab), output="forder.*opt=1.*took") test(2139.61, forderv(d, c("a","b"), retGrp=TRUE), ab, output="forder.*opt=2.*took") test(2139.62, forderv(d, c("a","b"), retGrp=TRUE, lazy=FALSE), ab, output="forder.*opt=0.*took") @@ -16970,7 +16969,7 @@ d = data.table(x = 2:1) make_retGrp0_idx = d[x==1L] test(2139.93, attr(attr(d, "index"), "__x"), 2:1) options(datatable.verbose=TRUE) -test(2139.94, forderv(d, "x", retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1L), output="forder.*index found but no retGrp.*forder.*opt=-1.*took") +test(2139.94, forderv(d, "x", retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1L, hasna=0L, hasinfnan=0L), output="forder.*index found but not for retGrp and retStats.*forder.*opt=-1.*took") d = data.table(x = 2:1) op2 = options("datatable.forder.auto.index"=TRUE) test(2139.95, d[x==1L], data.table(x=1L), output="forder.*setting index.*retGrp=0") diff --git a/src/bmerge.c b/src/bmerge.c index 090629f379..56f989f624 100644 --- a/src/bmerge.c +++ b/src/bmerge.c @@ -142,7 +142,7 @@ SEXP bmerge(SEXP iArg, SEXP xArg, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP allGrp1[0] = TRUE; protecti += 2; - SEXP oSxp = PROTECT(forderLazy(i, icolsArg, ScalarLogical(FALSE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE), ScalarLogical(TRUE))); protecti++; + SEXP oSxp = PROTECT(forderLazy(i, icolsArg, ScalarLogical(FALSE), ScalarLogical(FALSE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE), ScalarLogical(TRUE))); protecti++; if (!LENGTH(oSxp)) o = NULL; else diff --git a/src/data.table.h b/src/data.table.h index 78bb77f477..09fd69c765 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -91,6 +91,8 @@ extern SEXP sym_index; extern SEXP sym_BY; extern SEXP sym_starts, char_starts; extern SEXP sym_maxgrpn; +extern SEXP sym_hasna; +extern SEXP sym_hasinfnan; extern SEXP sym_colClassesAs; extern SEXP sym_verbose; extern SEXP SelfRefSymbol; @@ -123,8 +125,8 @@ int checkOverAlloc(SEXP x); // forder.c int StrCmp(SEXP x, SEXP y); uint64_t dtwiddle(void *p, int i); -SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP naArg); -SEXP forderLazy(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP naArg, SEXP lazyArg); // lazy wrapper to forder +SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsArg, SEXP ascArg, SEXP naArg); +SEXP forderLazy(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsArg, SEXP ascArg, SEXP naArg, SEXP lazyArg); // lazy wrapper to forder int getNumericRounding_C(); // reorder.c diff --git a/src/forder.c b/src/forder.c index 7fff7f0569..941117c6f9 100644 --- a/src/forder.c +++ b/src/forder.c @@ -30,7 +30,7 @@ // #define TIMING_ON -static bool retgrp = true; // return group sizes as well as the ordering vector? If so then use gs, gsalloc and gsn : +static bool retgrp = true, retstats = true; // return group sizes as well as the ordering vector? If so then use gs, gsalloc and gsn : static int nrow = 0; // used as group size stack allocation limit (when all groups are 1 row) static int *gs = NULL; // gs = final groupsizes e.g. 23,12,87,2,1,34,... static int gs_alloc = 0; // allocated size of gs @@ -414,7 +414,7 @@ uint64_t dtwiddle(void *p, int i) void radix_r(const int from, const int to, const int radix); -SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, SEXP naArg) +SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsArg, SEXP ascArg, SEXP naArg) // sortGroups TRUE from setkey and regular forder, FALSE from by= for efficiency so strings don't have to be sorted and can be left in appearance order // when sortGroups is TRUE, ascArg contains +1/-1 for ascending/descending of each by column; when FALSE ascArg is ignored { @@ -474,6 +474,11 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, S if (!IS_TRUE_OR_FALSE(retGrpArg)) STOP(_("retGrp must be TRUE or FALSE")); // # nocov # covered in lazy forder retgrp = LOGICAL(retGrpArg)[0]==TRUE; + if (!IS_TRUE_OR_FALSE(retStatsArg)) + STOP(_("retStats must be TRUE or FALSE")); // # nocov # covered in lazy forder + retstats = LOGICAL(retStatsArg)[0]==TRUE; + if (!retstats && retgrp) + error("retStats must be TRUE whenever retGrp is TRUE"); // # nocov # covered in lazy forder if (!IS_TRUE_OR_FALSE(sortGroupsArg)) STOP(_("sort must be TRUE or FALSE")); // # nocov # covered in lazy forder sortType = LOGICAL(sortGroupsArg)[0]==TRUE; // if sortType is 1, it is later flipped between +1/-1 according to ascArg. Otherwise ascArg is ignored when sortType==0 @@ -490,6 +495,10 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, S setAttrib(ans, sym_starts, allocVector(INTSXP, 0)); setAttrib(ans, sym_maxgrpn, ScalarInteger(0)); } + if (retstats) { + setAttrib(ans, sym_hasna, ScalarInteger(0)); + setAttrib(ans, sym_hasinfnan, ScalarInteger(0)); + } UNPROTECT(n_protect); return ans; } @@ -515,6 +524,7 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, S bool complexRerun = false; // see comments below in CPLXSXP case SEXP CplxPart = R_NilValue; if (n_cplx) { CplxPart=PROTECT(allocVector(REALSXP, nrow)); n_protect++; } // one alloc is reused for each part + int any_na=0, any_infnan=0; // collect more statistics about the data #2879, allow optimize of order(na.last=TRUE) as well #3023 TEND(2); for (int col=0; col0) + any_na = 1; // may be written multiple times, for each column that has NA, but thats fine + if (infnan_count>0) + any_infnan = 1; if (na_count==nrow || (min>0 && min==max && na_count==0 && infnan_count==0)) { // all same value; skip column as nothing to do; [min,max] is just of finite values (excludes +Inf,-Inf,NaN and NA) if (na_count==nrow && nalast==-1) { for (int i=0; i0 || attr(idx, "hasinfnan")>0 +bool idxHasNA(SEXP idx) { + return INTEGER(getAttrib(idx, sym_hasna))[0]>0 || INTEGER(getAttrib(idx, sym_hasinfnan))[0]>0; +} + // lazy forder, re-use existing key or index if possible, otherwise call forder -SEXP forderLazy(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascArg, SEXP naArg, SEXP lazyArg) { +SEXP forderLazy(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsArg, SEXP ascArg, SEXP naArg, SEXP lazyArg) { const bool verbose = GetVerbose(); int protecti = 0; double tic=0.0; @@ -1474,6 +1497,11 @@ SEXP forderLazy(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascAr if (!IS_TRUE_OR_FALSE(retGrpArg)) error("retGrp must be TRUE or FALSE"); bool retGrp = (bool)LOGICAL(retGrpArg)[0]; + if (!IS_TRUE_OR_FALSE(retStatsArg)) + error("retStats must be TRUE or FALSE"); + bool retStats = (bool)LOGICAL(retStatsArg)[0]; + if (!retStats && retGrp) + error("retStats must be TRUE whenever retGrp is TRUE"); // retStats doesnt cost anything and it will be much easier to optimize use of index if (!IS_TRUE_OR_FALSE(sortGroupsArg)) error("sort must be TRUE or FALSE"); bool sortGroups = (bool)LOGICAL(sortGroupsArg)[0]; @@ -1490,12 +1518,12 @@ SEXP forderLazy(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascAr int opt = -1; // -1=unknown, 0=none, 1=keyOpt, 2=idxOpt if (lazy==NA_LOGICAL) { if (INHERITS(DT, char_datatable) && // unnamed list should not be optimized - sortGroups && !na && + sortGroups && all1(ascArg)) { // could ascArg=-1 be handled by a rev()? opt = -1; } else { if (verbose) - Rprintf("forderLazy: opt not possible: is.data.table(DT)=%d, sortGroups=%d, !na=%d, all1(ascArg)=%d\n", INHERITS(DT,char_datatable), sortGroups, !na, all1(ascArg)); + Rprintf("forderLazy: opt not possible: is.data.table(DT)=%d, sortGroups=%d, all1(ascArg)=%d\n", INHERITS(DT,char_datatable), sortGroups, all1(ascArg)); opt = 0; } } else if (lazy) { @@ -1503,8 +1531,6 @@ SEXP forderLazy(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascAr error("internal error: lazy set to TRUE but DT is not a data.table"); // # nocov if (!sortGroups) error("internal error: lazy set to TRUE but sort is FALSE"); // # nocov - if (na) - error("internal error: lazy set to TRUE but na.last is not FALSE"); // # nocov if (!all1(ascArg)) error("internal error: lazy set to TRUE but order is not all 1"); // # nocov opt = -1; @@ -1521,21 +1547,54 @@ SEXP forderLazy(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascAr if (opt == -1 && GetUseIndex()) { SEXP idx = getIndex(DT, by); if (!isNull(idx)) { - bool hasGrp = !isNull(getAttrib(idx, sym_starts)); - if ((!hasGrp && !retGrp) || - (hasGrp && retGrp)) { - opt = 2; // idxOpt retGrp==hasGrp, if condition unfolded for codecov - } else if (hasGrp && !retGrp) { - // shallow_duplicate is faster than copyAsPlain, but shallow_duplicate is AFAIK good for VECSXP, not for INTSXP - // it is still the bottleneck in this opt, it is now better to call retGrp=TRUE and just not use those extra arguments - // can we do better here? real shallow for INTSXP? If we could just re-point data pointer... like we do for DT columns - // SEXP new; INTEGER(new) = INTEGER(idx); setAttrib(new, ..., R_NilValue) - idx = shallow_duplicate(idx); - setAttrib(idx, sym_starts, R_NilValue); - setAttrib(idx, sym_maxgrpn, R_NilValue); - opt = 2; // idxOpt but need to drop groups - } else { // !hasGrp && retGrp - Rprintf("forderLazy: index found but no retGrp: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); + bool hasStats = !isNull(getAttrib(idx, sym_hasna)); + if (!na || // na.last=FALSE + (hasStats && !idxHasNA(idx))) { // na.last=TRUE && !anyNA + bool hasGrp = !isNull(getAttrib(idx, sym_starts)); + if (hasGrp && !hasStats) + error("internal error: index has 'starts' attribute but not 'hasna', please report to issue tracker"); // # nocov + if (hasGrp==retGrp && hasStats==retStats) { + opt = 2; // idxOpt + } else if ( + (hasGrp && !retGrp && !(!hasStats && retStats)) || // !hasStats should never happen when hasGrp + (hasStats && !retStats && !(!hasGrp && retGrp)) + ) { + // shallow_duplicate is faster than copyAsPlain, but shallow_duplicate is AFAIK good for VECSXP, not for INTSXP + // it is still the bottleneck in this opt, it is better to call retGrp=TRUE and just not use those extra attributes + // can we do better here? real shallow for INTSXP? If we could just re-point data pointer... like we do for DT columns + // SEXP new; INTEGER(new) = INTEGER(idx); setAttrib(new, ..., R_NilValue) + idx = shallow_duplicate(idx); + if (hasGrp && !retGrp) { + setAttrib(idx, sym_starts, R_NilValue); + setAttrib(idx, sym_maxgrpn, R_NilValue); + } + if (hasStats && !retStats) { + setAttrib(idx, sym_hasna, R_NilValue); + setAttrib(idx, sym_hasinfnan, R_NilValue); + } + opt = 2; // idxOpt but need to drop groups or stats + } else if (!hasGrp && retGrp && !hasStats && retStats) { + if (verbose) + Rprintf("forderLazy: index found but not for retGrp and retStats: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); + } else if (!hasGrp && retGrp && hasStats) { + if (verbose) + Rprintf("forderLazy: index found but not for retGrp: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); + } else if (hasGrp && !hasStats && retStats) { + if (verbose) + Rprintf("forderLazy: index found but not for retStats: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); + } else { + error("internal error: lazy forder index optimization unhandled branch of retGrp-retStats, please report to issue tracker"); // # nocov + } + } else { + if (!hasStats) { + if (verbose) + Rprintf("forderLazy: index found but na.last=TRUE and no stats available: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); + } else if (idxHasNA(idx)) { + if (verbose) + Rprintf("forderLazy: index found but na.last=TRUE and NAs present: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); + } else { + error("internal error: lazy forder index optimization unhandled branch of last.na=T, please report to issue tracker"); // # nocov + } } if (opt == 2) { ans = idx; @@ -1545,13 +1604,13 @@ SEXP forderLazy(SEXP DT, SEXP by, SEXP retGrpArg, SEXP sortGroupsArg, SEXP ascAr } } if (opt < 1) { - ans = PROTECT(forder(DT, by, retGrpArg, sortGroupsArg, ascArg, naArg)); protecti++; - if (opt == -1 && // opt==0 means that arguments (na.last,..) were not of type index, or lazy=FALSE + ans = PROTECT(forder(DT, by, retGrpArg, retStatsArg, sortGroupsArg, ascArg, naArg)); protecti++; + if (opt == -1 && // opt==0 means that arguments (sort, asc) were not of type index, or lazy=FALSE GetUseIndex() && GetAutoIndex()) { // disabled by default, use datatable.forder.auto.index=T to enable, do not export/document, use for debugging only putIndex(DT, by, ans); if (verbose) - Rprintf("forderLazy: setting index (retGrp=%d) on DT: %s\n", retGrp, CHAR(STRING_ELT(idxName(DT, by), 0))); + Rprintf("forderLazy: setting index (retGrp=%d, retStats=%d) on DT: %s\n", retGrp, retStats, CHAR(STRING_ELT(idxName(DT, by), 0))); } } if (verbose) diff --git a/src/init.c b/src/init.c index 40ee82bf82..70c545c5a0 100644 --- a/src/init.c +++ b/src/init.c @@ -25,6 +25,8 @@ SEXP sym_index; SEXP sym_BY; SEXP sym_starts, char_starts; SEXP sym_maxgrpn; +SEXP sym_hasna; +SEXP sym_hasinfnan; SEXP sym_colClassesAs; SEXP sym_verbose; SEXP SelfRefSymbol; @@ -339,6 +341,8 @@ void attribute_visible R_init_datatable(DllInfo *info) sym_index = install("index"); sym_BY = install(".BY"); sym_maxgrpn = install("maxgrpn"); + sym_hasna = install("hasna"); + sym_hasinfnan = install("hasinfnan"); sym_colClassesAs = install("colClassesAs"); sym_verbose = install("datatable.verbose"); SelfRefSymbol = install(".internal.selfref"); From ded1e1ab9e23cbd4c74ebf5d8172b40bbee33dbb Mon Sep 17 00:00:00 2001 From: jangorecki Date: Wed, 20 May 2020 02:48:09 +0100 Subject: [PATCH 28/53] keyOpt fix, and existing tests --- inst/tests/tests.Rraw | 19 ++++++++++--------- src/forder.c | 2 +- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 921616b5b9..13e5e0dba7 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -13147,7 +13147,7 @@ test(1953.4, melt.data.table(DT, id.vars = 'id', measure.vars = 'a'), # appearance order of two low-cardinality columns that were squashed in pr#3124 DT = data.table(A=INT(1,3,2,3,2), B=1:5) # respect groups in 1st column (3's and 2's) -test(1954, forderv(DT, sort=FALSE, retGrp=TRUE), structure(INT(1,2,4,3,5), starts=1:5, maxgrpn=1L)) +test(1954, forderv(DT, sort=FALSE, retGrp=TRUE), structure(INT(1,2,4,3,5), starts=1:5, maxgrpn=1L, hasna=0L, hasinfnan=0L)) # skip values that are not present in old, #3030 DT <- data.table(a=1, b=2, d=3) @@ -14103,12 +14103,12 @@ test(1993.1, foverlaps(xp, yp, nomatch = 0L, which=TRUE), data.table(xid=1L, yid test(1993.2, foverlaps(xp, yp, by.x=c("day", "year")), error="Some interval cols are of type POSIXct while others are not") # forderv NaN,Inf and Inf when at most 1 finite value is present, #3381. These broke in v1.12.0. They pass in v1.11.8. -test(1994.1, forderv(c(NaN, Inf, -Inf), retGrp=TRUE), structure(INT(1,3,2), starts=1:3, maxgrpn=1L)) -test(1994.2, forderv(c(-Inf, 0, Inf), retGrp=TRUE), structure(integer(), starts=1:3, maxgrpn=1L)) -test(1994.3, forderv(c(-Inf, Inf), retGrp=TRUE), structure(integer(), starts=1:2, maxgrpn=1L)) -test(1994.4, forderv(c(Inf, -Inf), retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1L)) -test(1994.5, forderv(c(0, NaN), retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1L)) -test(1994.6, forderv(c(NaN, 0), retGrp=TRUE), structure(integer(), starts=1:2, maxgrpn=1L)) +test(1994.1, forderv(c(NaN, Inf, -Inf), retGrp=TRUE), structure(INT(1,3,2), starts=1:3, maxgrpn=1L, hasna=0L, hasinfnan=1L)) +test(1994.2, forderv(c(-Inf, 0, Inf), retGrp=TRUE), structure(integer(), starts=1:3, maxgrpn=1L, hasna=0L, hasinfnan=1L)) +test(1994.3, forderv(c(-Inf, Inf), retGrp=TRUE), structure(integer(), starts=1:2, maxgrpn=1L, hasna=0L, hasinfnan=1L)) +test(1994.4, forderv(c(Inf, -Inf), retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1L, hasna=0L, hasinfnan=1L)) +test(1994.5, forderv(c(0, NaN), retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1L, hasna=0L, hasinfnan=1L)) +test(1994.6, forderv(c(NaN, 0), retGrp=TRUE), structure(integer(), starts=1:2, maxgrpn=1L, hasna=0L, hasinfnan=1L)) test(1994.7, data.table(A=c(-Inf,21,Inf),V=1:3)[,sum(V),by=A]$V1, 1:3) # 0 length items should not result in no-recycle error, #3386 @@ -16936,9 +16936,9 @@ test(2139.61, forderv(d, c("a","b"), retGrp=TRUE), ab, output="forder.*opt=2.*to test(2139.62, forderv(d, c("a","b"), retGrp=TRUE, lazy=FALSE), ab, output="forder.*opt=0.*took") test(2139.63, forderv(d, c("b","a"), retGrp=TRUE), ba, output="forder.*opt=2.*took") test(2139.64, forderv(d, c("b","a"), retGrp=TRUE, lazy=FALSE), ba, output="forder.*opt=0.*took") -test(2139.65, forderv(d, c("a","b"), na.last=TRUE), integer(), output="forder.*opt=0.*took") +test(2139.65, forderv(d, c("a","b"), na.last=TRUE), integer(), output="forder.*opt=2.*took") # via hasnan index attribute test(2139.66, forderv(d, c("a","b"), na.last=TRUE, lazy=FALSE), integer(), output="forder.*opt=0.*took") -test(2139.67, forderv(d, c("b","a"), na.last=TRUE), 2:1, output="forder.*opt=0.*took") +test(2139.67, forderv(d, c("b","a"), na.last=TRUE), 2:1, output="forder.*opt=2.*took") # via hasnan index attribute test(2139.68, forderv(d, c("b","a"), na.last=TRUE, lazy=FALSE), 2:1, output="forder.*opt=0.*took") test(2139.69, forderv(d, c("a","b"), sort=FALSE, retGrp=TRUE), ab, output="forder.*opt=0.*took") test(2139.70, forderv(d, c("a","b"), sort=FALSE, retGrp=TRUE, lazy=FALSE), ab, output="forder.*opt=0.*took") @@ -16957,6 +16957,7 @@ test(2139.82, forderv(1:2, lazy=FALSE), integer(), output="forder.*opt=0.*took") test(2139.83, forderv(2:1), 2:1, output="forder.*opt=0.*took") test(2139.84, forderv(2:1, lazy=FALSE), 2:1, output="forder.*opt=0.*took") options(datatable.verbose=FALSE) +#test(2139.851) # tests for NAs and na.last arg d = copy(dd) setindexv(d, "b") options(datatable.verbose=TRUE) diff --git a/src/forder.c b/src/forder.c index 941117c6f9..04e6c6ee30 100644 --- a/src/forder.c +++ b/src/forder.c @@ -1538,7 +1538,7 @@ SEXP forderLazy(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGro opt = 0; } SEXP ans = R_NilValue; - if (opt == -1 && !retGrp && colsKeyHead(DT, by)) { + if (opt == -1 && !na && !retGrp && colsKeyHead(DT, by)) { opt = 1; // keyOpt ans = PROTECT(allocVector(INTSXP, 0)); protecti++; if (verbose) From 53cce9856072a069a653200e2e3c1a0d58c072da Mon Sep 17 00:00:00 2001 From: jangorecki Date: Wed, 20 May 2020 02:51:35 +0100 Subject: [PATCH 29/53] fixes for na.last in key and setting idx --- src/forder.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/forder.c b/src/forder.c index 04e6c6ee30..351861f233 100644 --- a/src/forder.c +++ b/src/forder.c @@ -1606,6 +1606,7 @@ SEXP forderLazy(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGro if (opt < 1) { ans = PROTECT(forder(DT, by, retGrpArg, retStatsArg, sortGroupsArg, ascArg, naArg)); protecti++; if (opt == -1 && // opt==0 means that arguments (sort, asc) were not of type index, or lazy=FALSE + (!na || (retStats && !idxHasNA(ans))) && // lets create index even if na.last=T used but no NAs detected! GetUseIndex() && GetAutoIndex()) { // disabled by default, use datatable.forder.auto.index=T to enable, do not export/document, use for debugging only putIndex(DT, by, ans); From 1e923660432e946f78a87027e8c7246f7261caa4 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Wed, 20 May 2020 04:06:27 +0100 Subject: [PATCH 30/53] filling tests for na.last=T and possible fixes --- inst/tests/tests.Rraw | 50 ++++++++++++++++++++++++++++++++++++++++--- src/forder.c | 4 ++-- 2 files changed, 49 insertions(+), 5 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 13e5e0dba7..9417969151 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16957,7 +16957,43 @@ test(2139.82, forderv(1:2, lazy=FALSE), integer(), output="forder.*opt=0.*took") test(2139.83, forderv(2:1), 2:1, output="forder.*opt=0.*took") test(2139.84, forderv(2:1, lazy=FALSE), 2:1, output="forder.*opt=0.*took") options(datatable.verbose=FALSE) -#test(2139.851) # tests for NAs and na.last arg +test(2139.851, forderv(2:1, retStats=NA), error="retStats must be TRUE or FALSE") +test(2139.852, forderv(2:1, retGrp=TRUE, retStats=FALSE), error="retStats must be TRUE whenever retGrp is TRUE") +ddd = data.table(v1=1:3, v2=c(1L,NA,3L), v3=c(3:2,NaN), v4=c(1:2,Inf), v5=c(-Inf,NA,3)) ## tests for NAs and na.last arg +d = copy(ddd) +invisible(d[v1==1L, verbose=FALSE]) +options(datatable.verbose=TRUE) +test(2139.853, forderv(d, "v1", retGrp=TRUE, retStats=TRUE), output="index found but not for retGrp and retStats") +test(2139.854, o<-forderv(d, "v1", retStats=TRUE), output="index found but not for retStats") +setattr(d, "index", setattr(integer(), "__v1", o)) +test(2139.855, forderv(d, "v1", retGrp=TRUE), output="index found but not for retGrp") +test(2139.856, forderv(d, "v1", na.last=TRUE), integer(), output="forder.*opt=2.*took") +test(2139.857, forderv(d, "v1", retStats=TRUE, na.last=TRUE), structure(integer(), hasna=0L, hasinfnan=0L), output="forder.*opt=2.*took") +d = copy(ddd) +invisible(d[v1==1L, verbose=FALSE]) +test(2139.858, forderv(d, "v1", retGrp=TRUE, retStats=TRUE, na.last=TRUE), output="index found but na.last=TRUE and no stats available") +setindexv(d, "v2", verbose=FALSE) +test(2139.859, forderv(d, "v2", retGrp=TRUE, retStats=TRUE, na.last=TRUE), output="index found but na.last=TRUE and NAs present") +options(datatable.verbose=FALSE) +d = copy(ddd) +setkeyv(d, "v1") +setindexv(d, list("v2","v3","v4","v5",c("v1","v2"),c("v1","v3"),c("v2","v3"),c("v1","v4"),c("v1","v5"),c("v1","v4","v5"))) +options(datatable.verbose=TRUE) +test(2139.861, forderv(d, "v1"), integer(), output="forder.*opt=1.*took") +test(2139.862, forderv(d, "v1", na.last=TRUE), integer(), output="forder.*opt=-1.*took") ## cannot use key for na.last +setindexv(d, "v1", verbose=FALSE) +test(2139.863, forderv(d, "v1", na.last=TRUE), integer(), output="forder.*opt=2.*took") +test(2139.864, forderv(d, c("v1","v2"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") +test(2139.865, forderv(d, c("v1","v2"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") # same but testing another msg +test(2139.866, forderv(d, c("v1","v3"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") +test(2139.867, forderv(d, c("v1","v3"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") +test(2139.868, forderv(d, c("v1","v4"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") +test(2139.869, forderv(d, c("v1","v4"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") +test(2139.870, forderv(d, c("v1","v5"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") +test(2139.871, forderv(d, c("v1","v5"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") # same but testing another msg +test(2139.872, forderv(d, c("v1","v4","v5"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") +test(2139.873, forderv(d, c("v1","v4","v5"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") # same but testing another msg +options(datatable.verbose=FALSE) d = copy(dd) setindexv(d, "b") options(datatable.verbose=TRUE) @@ -16973,8 +17009,16 @@ options(datatable.verbose=TRUE) test(2139.94, forderv(d, "x", retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1L, hasna=0L, hasinfnan=0L), output="forder.*index found but not for retGrp and retStats.*forder.*opt=-1.*took") d = data.table(x = 2:1) op2 = options("datatable.forder.auto.index"=TRUE) -test(2139.95, d[x==1L], data.table(x=1L), output="forder.*setting index.*retGrp=0") -test(2139.96, forderv(d, "x", retGrp=TRUE), output="forder.*setting index.*retGrp=1") +test(2139.95, d[x==1L], data.table(x=1L), output="forder.*setting index.*retGrp=0, retStats=0") +test(2139.96, forderv(d, "x", retGrp=TRUE), output="forder.*setting index.*retGrp=1, retStats=1") +setindexv(d, NULL, verbose=FALSE) +test(2139.971, forderv(d, "x", retStats=TRUE), output="forder.*setting index.*retGrp=0, retStats=1") +setindexv(d, NULL, verbose=FALSE) +test(2139.972, forderv(d, "x", retStats=TRUE, na.last=TRUE), output="forder.*setting index.*retGrp=0, retStats=1") +setindexv(d, NULL, verbose=FALSE) +test(2139.973, forderv(data.table(x=c(2:1,NA)), "x", retStats=TRUE, na.last=TRUE), notOutput="forder.*setting index.*retGrp=0, retStats=1") +test(2139.974, forderv(data.table(x=c(2:1,NaN)), "x", retStats=TRUE, na.last=TRUE), notOutput="forder.*setting index.*retGrp=0, retStats=1") +test(2139.975, forderv(d, "x", na.last=TRUE), notOutput="forder.*setting index.*retGrp=0, retStats=1") options(op2) test(2139.99, forderv(data.table(a=1), lazy=c(TRUE, TRUE)), error="lazy must be") options(op) diff --git a/src/forder.c b/src/forder.c index 351861f233..6dd12e4576 100644 --- a/src/forder.c +++ b/src/forder.c @@ -1576,10 +1576,10 @@ SEXP forderLazy(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGro } else if (!hasGrp && retGrp && !hasStats && retStats) { if (verbose) Rprintf("forderLazy: index found but not for retGrp and retStats: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); - } else if (!hasGrp && retGrp && hasStats) { + } else if (!hasGrp && retGrp) { if (verbose) Rprintf("forderLazy: index found but not for retGrp: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); - } else if (hasGrp && !hasStats && retStats) { + } else if (!hasStats && retStats) { if (verbose) Rprintf("forderLazy: index found but not for retStats: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); } else { From 0f95e6f9cd8b4456117e28b8376c39de5a13862d Mon Sep 17 00:00:00 2001 From: jangorecki Date: Thu, 21 May 2020 17:16:55 +0100 Subject: [PATCH 31/53] more stats, any non ascii utf8 --- inst/tests/tests.Rraw | 27 ++++++++++++++++----------- src/data.table.h | 2 ++ src/forder.c | 33 ++++++++++++++++++++++++++------- src/init.c | 4 ++++ 4 files changed, 48 insertions(+), 18 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 9417969151..a8afd27928 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -13147,7 +13147,7 @@ test(1953.4, melt.data.table(DT, id.vars = 'id', measure.vars = 'a'), # appearance order of two low-cardinality columns that were squashed in pr#3124 DT = data.table(A=INT(1,3,2,3,2), B=1:5) # respect groups in 1st column (3's and 2's) -test(1954, forderv(DT, sort=FALSE, retGrp=TRUE), structure(INT(1,2,4,3,5), starts=1:5, maxgrpn=1L, hasna=0L, hasinfnan=0L)) +test(1954, forderv(DT, sort=FALSE, retGrp=TRUE), structure(INT(1,2,4,3,5), starts=1:5, maxgrpn=1L, hasna=0L, hasinfnan=0L, anynotascii=0L, anynotutf8=0L)) # skip values that are not present in old, #3030 DT <- data.table(a=1, b=2, d=3) @@ -14103,12 +14103,12 @@ test(1993.1, foverlaps(xp, yp, nomatch = 0L, which=TRUE), data.table(xid=1L, yid test(1993.2, foverlaps(xp, yp, by.x=c("day", "year")), error="Some interval cols are of type POSIXct while others are not") # forderv NaN,Inf and Inf when at most 1 finite value is present, #3381. These broke in v1.12.0. They pass in v1.11.8. -test(1994.1, forderv(c(NaN, Inf, -Inf), retGrp=TRUE), structure(INT(1,3,2), starts=1:3, maxgrpn=1L, hasna=0L, hasinfnan=1L)) -test(1994.2, forderv(c(-Inf, 0, Inf), retGrp=TRUE), structure(integer(), starts=1:3, maxgrpn=1L, hasna=0L, hasinfnan=1L)) -test(1994.3, forderv(c(-Inf, Inf), retGrp=TRUE), structure(integer(), starts=1:2, maxgrpn=1L, hasna=0L, hasinfnan=1L)) -test(1994.4, forderv(c(Inf, -Inf), retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1L, hasna=0L, hasinfnan=1L)) -test(1994.5, forderv(c(0, NaN), retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1L, hasna=0L, hasinfnan=1L)) -test(1994.6, forderv(c(NaN, 0), retGrp=TRUE), structure(integer(), starts=1:2, maxgrpn=1L, hasna=0L, hasinfnan=1L)) +test(1994.1, forderv(c(NaN, Inf, -Inf), retGrp=TRUE), structure(INT(1,3,2), starts=1:3, maxgrpn=1L, hasna=0L, hasinfnan=1L, anynotascii=0L, anynotutf8=0L)) +test(1994.2, forderv(c(-Inf, 0, Inf), retGrp=TRUE), structure(integer(), starts=1:3, maxgrpn=1L, hasna=0L, hasinfnan=1L, anynotascii=0L, anynotutf8=0L)) +test(1994.3, forderv(c(-Inf, Inf), retGrp=TRUE), structure(integer(), starts=1:2, maxgrpn=1L, hasna=0L, hasinfnan=1L, anynotascii=0L, anynotutf8=0L)) +test(1994.4, forderv(c(Inf, -Inf), retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1L, hasna=0L, hasinfnan=1L, anynotascii=0L, anynotutf8=0L)) +test(1994.5, forderv(c(0, NaN), retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1L, hasna=0L, hasinfnan=1L, anynotascii=0L, anynotutf8=0L)) +test(1994.6, forderv(c(NaN, 0), retGrp=TRUE), structure(integer(), starts=1:2, maxgrpn=1L, hasna=0L, hasinfnan=1L, anynotascii=0L, anynotutf8=0L)) test(1994.7, data.table(A=c(-Inf,21,Inf),V=1:3)[,sum(V),by=A]$V1, 1:3) # 0 length items should not result in no-recycle error, #3386 @@ -16929,8 +16929,8 @@ d = copy(dd) setkeyv(d, c("a","b")) setindexv(d, list(c("a","b"), c("b","a"))) options(datatable.verbose=TRUE) -ab = structure(integer(), starts=1:2, maxgrpn=1L, hasna=0L, hasinfnan=0L) -ba = structure(2:1, starts=1:2, maxgrpn=1L, hasna=0L, hasinfnan=0L) +ab = structure(integer(), starts=1:2, maxgrpn=1L, hasna=0L, hasinfnan=0L, anynotascii=0L, anynotutf8=0L) +ba = structure(2:1, starts=1:2, maxgrpn=1L, hasna=0L, hasinfnan=0L, anynotascii=0L, anynotutf8=0L) test(2139.60, forderv(d, c("a","b")), c(ab), output="forder.*opt=1.*took") test(2139.61, forderv(d, c("a","b"), retGrp=TRUE), ab, output="forder.*opt=2.*took") test(2139.62, forderv(d, c("a","b"), retGrp=TRUE, lazy=FALSE), ab, output="forder.*opt=0.*took") @@ -16968,7 +16968,7 @@ test(2139.854, o<-forderv(d, "v1", retStats=TRUE), output="index found but not f setattr(d, "index", setattr(integer(), "__v1", o)) test(2139.855, forderv(d, "v1", retGrp=TRUE), output="index found but not for retGrp") test(2139.856, forderv(d, "v1", na.last=TRUE), integer(), output="forder.*opt=2.*took") -test(2139.857, forderv(d, "v1", retStats=TRUE, na.last=TRUE), structure(integer(), hasna=0L, hasinfnan=0L), output="forder.*opt=2.*took") +test(2139.857, forderv(d, "v1", retStats=TRUE, na.last=TRUE), structure(integer(), hasna=0L, hasinfnan=0L, anynotascii=0L, anynotutf8=0L), output="forder.*opt=2.*took") d = copy(ddd) invisible(d[v1==1L, verbose=FALSE]) test(2139.858, forderv(d, "v1", retGrp=TRUE, retStats=TRUE, na.last=TRUE), output="index found but na.last=TRUE and no stats available") @@ -16994,6 +16994,11 @@ test(2139.871, forderv(d, c("v1","v5"), na.last=TRUE), integer(), output="forder test(2139.872, forderv(d, c("v1","v4","v5"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") test(2139.873, forderv(d, c("v1","v4","v5"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") # same but testing another msg options(datatable.verbose=FALSE) +d = fread(testDir("1680-fread-header-encoding.csv"), encoding="Latin-1") ## re-use some existing non utf8 data +anyEnc = function(x) unlist(attributes(forderv(x, retStats=TRUE))[c("anynotascii","anynotutf8")]) +test(2139.881, anyEnc(d), c(anynotascii=1L,anynotutf8=1L)) +test(2139.882, anyEnc(d[,-2L]), c(anynotascii=0L,anynotutf8=0L)) +test(2139.883, anyEnc(c("a","b","\u221A")), c(anynotascii=1L,anynotutf8=0L)) d = copy(dd) setindexv(d, "b") options(datatable.verbose=TRUE) @@ -17006,7 +17011,7 @@ d = data.table(x = 2:1) make_retGrp0_idx = d[x==1L] test(2139.93, attr(attr(d, "index"), "__x"), 2:1) options(datatable.verbose=TRUE) -test(2139.94, forderv(d, "x", retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1L, hasna=0L, hasinfnan=0L), output="forder.*index found but not for retGrp and retStats.*forder.*opt=-1.*took") +test(2139.94, forderv(d, "x", retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1L, hasna=0L, hasinfnan=0L, anynotascii=0L, anynotutf8=0L), output="forder.*index found but not for retGrp and retStats.*forder.*opt=-1.*took") d = data.table(x = 2:1) op2 = options("datatable.forder.auto.index"=TRUE) test(2139.95, d[x==1L], data.table(x=1L), output="forder.*setting index.*retGrp=0, retStats=0") diff --git a/src/data.table.h b/src/data.table.h index 09fd69c765..1de513f718 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -93,6 +93,8 @@ extern SEXP sym_starts, char_starts; extern SEXP sym_maxgrpn; extern SEXP sym_hasna; extern SEXP sym_hasinfnan; +extern SEXP sym_anynotascii; +extern SEXP sym_anynotutf8; extern SEXP sym_colClassesAs; extern SEXP sym_verbose; extern SEXP SelfRefSymbol; diff --git a/src/forder.c b/src/forder.c index 6dd12e4576..1e85403d91 100644 --- a/src/forder.c +++ b/src/forder.c @@ -30,7 +30,8 @@ // #define TIMING_ON -static bool retgrp = true, retstats = true; // return group sizes as well as the ordering vector? If so then use gs, gsalloc and gsn : +static bool retgrp = true; // return group sizes as well as the ordering vector? If so then use gs, gsalloc and gsn : +static bool retstats = true; // return extra flags for any NA, NaN, -Inf, +Inf, non-ASCII, non-UTF8 static int nrow = 0; // used as group size stack allocation limit (when all groups are 1 row) static int *gs = NULL; // gs = final groupsizes e.g. 23,12,87,2,1,34,... static int gs_alloc = 0; // allocated size of gs @@ -283,11 +284,11 @@ static void cradix(SEXP *x, int n) free(cradix_xtmp); cradix_xtmp=NULL; } -static void range_str(SEXP *x, int n, uint64_t *out_min, uint64_t *out_max, int *out_na_count) +static void range_str(SEXP *x, int n, uint64_t *out_min, uint64_t *out_max, int *out_na_count, bool *out_anynotascii, bool *out_anynotutf8) // group numbers are left in truelength to be fetched by WRITE_KEY { int na_count=0; - bool anyneedutf8=false; + bool any_notascii=false, any_notutf8=false; if (ustr_n!=0) STOP(_("Internal error: ustr isn't empty when starting range_str: ustr_n=%d, ustr_alloc=%d"), ustr_n, ustr_alloc); // # nocov if (ustr_maxlen!=0) STOP(_("Internal error: ustr_maxlen isn't 0 when starting range_str")); // # nocov // savetl_init() has already been called at the start of forder @@ -314,16 +315,23 @@ static void range_str(SEXP *x, int n, uint64_t *out_min, uint64_t *out_max, int ustr[ustr_n++] = s; SET_TRUELENGTH(s, -ustr_n); // unique in any order is fine. first-appearance order is achieved later in count_group if (LENGTH(s)>ustr_maxlen) ustr_maxlen=LENGTH(s); - if (!anyneedutf8 && NEED2UTF8(s)) anyneedutf8=true; + if (!IS_ASCII(s)) { + if (!any_notascii) + any_notascii=true; + if (!any_notutf8 && !IS_UTF8(s)) + any_notutf8=true; + } } } *out_na_count = na_count; + *out_anynotascii = any_notascii; + *out_anynotutf8 = any_notutf8; if (ustr_n==0) { // all na *out_min = 0; *out_max = 0; return; } - if (anyneedutf8) { + if (any_notutf8) { SEXP ustr2 = PROTECT(allocVector(STRSXP, ustr_n)); for (int i=0; i0) any_infnan = 1; + if (any_notascii) + anynotascii = 1; + if (any_notutf8) + anynotutf8 = 1; if (na_count==nrow || (min>0 && min==max && na_count==0 && infnan_count==0)) { // all same value; skip column as nothing to do; [min,max] is just of finite values (excludes +Inf,-Inf,NaN and NA) if (na_count==nrow && nalast==-1) { for (int i=0; i Date: Thu, 21 May 2020 17:28:37 +0100 Subject: [PATCH 32/53] better naming of new stats attributes --- inst/tests/tests.Rraw | 26 +++++++++--------- src/data.table.h | 4 +-- src/forder.c | 62 +++++++++++++++++++++---------------------- src/init.c | 8 +++--- 4 files changed, 50 insertions(+), 50 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index a8afd27928..afe253ec0d 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -13147,7 +13147,7 @@ test(1953.4, melt.data.table(DT, id.vars = 'id', measure.vars = 'a'), # appearance order of two low-cardinality columns that were squashed in pr#3124 DT = data.table(A=INT(1,3,2,3,2), B=1:5) # respect groups in 1st column (3's and 2's) -test(1954, forderv(DT, sort=FALSE, retGrp=TRUE), structure(INT(1,2,4,3,5), starts=1:5, maxgrpn=1L, hasna=0L, hasinfnan=0L, anynotascii=0L, anynotutf8=0L)) +test(1954, forderv(DT, sort=FALSE, retGrp=TRUE), structure(INT(1,2,4,3,5), starts=1:5, maxgrpn=1L, anyna=0L, anyinfnan=0L, anynotascii=0L, anynotutf8=0L)) # skip values that are not present in old, #3030 DT <- data.table(a=1, b=2, d=3) @@ -14103,12 +14103,12 @@ test(1993.1, foverlaps(xp, yp, nomatch = 0L, which=TRUE), data.table(xid=1L, yid test(1993.2, foverlaps(xp, yp, by.x=c("day", "year")), error="Some interval cols are of type POSIXct while others are not") # forderv NaN,Inf and Inf when at most 1 finite value is present, #3381. These broke in v1.12.0. They pass in v1.11.8. -test(1994.1, forderv(c(NaN, Inf, -Inf), retGrp=TRUE), structure(INT(1,3,2), starts=1:3, maxgrpn=1L, hasna=0L, hasinfnan=1L, anynotascii=0L, anynotutf8=0L)) -test(1994.2, forderv(c(-Inf, 0, Inf), retGrp=TRUE), structure(integer(), starts=1:3, maxgrpn=1L, hasna=0L, hasinfnan=1L, anynotascii=0L, anynotutf8=0L)) -test(1994.3, forderv(c(-Inf, Inf), retGrp=TRUE), structure(integer(), starts=1:2, maxgrpn=1L, hasna=0L, hasinfnan=1L, anynotascii=0L, anynotutf8=0L)) -test(1994.4, forderv(c(Inf, -Inf), retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1L, hasna=0L, hasinfnan=1L, anynotascii=0L, anynotutf8=0L)) -test(1994.5, forderv(c(0, NaN), retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1L, hasna=0L, hasinfnan=1L, anynotascii=0L, anynotutf8=0L)) -test(1994.6, forderv(c(NaN, 0), retGrp=TRUE), structure(integer(), starts=1:2, maxgrpn=1L, hasna=0L, hasinfnan=1L, anynotascii=0L, anynotutf8=0L)) +test(1994.1, forderv(c(NaN, Inf, -Inf), retGrp=TRUE), structure(INT(1,3,2), starts=1:3, maxgrpn=1L, anyna=0L, anyinfnan=1L, anynotascii=0L, anynotutf8=0L)) +test(1994.2, forderv(c(-Inf, 0, Inf), retGrp=TRUE), structure(integer(), starts=1:3, maxgrpn=1L, anyna=0L, anyinfnan=1L, anynotascii=0L, anynotutf8=0L)) +test(1994.3, forderv(c(-Inf, Inf), retGrp=TRUE), structure(integer(), starts=1:2, maxgrpn=1L, anyna=0L, anyinfnan=1L, anynotascii=0L, anynotutf8=0L)) +test(1994.4, forderv(c(Inf, -Inf), retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1L, anyna=0L, anyinfnan=1L, anynotascii=0L, anynotutf8=0L)) +test(1994.5, forderv(c(0, NaN), retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1L, anyna=0L, anyinfnan=1L, anynotascii=0L, anynotutf8=0L)) +test(1994.6, forderv(c(NaN, 0), retGrp=TRUE), structure(integer(), starts=1:2, maxgrpn=1L, anyna=0L, anyinfnan=1L, anynotascii=0L, anynotutf8=0L)) test(1994.7, data.table(A=c(-Inf,21,Inf),V=1:3)[,sum(V),by=A]$V1, 1:3) # 0 length items should not result in no-recycle error, #3386 @@ -16929,16 +16929,16 @@ d = copy(dd) setkeyv(d, c("a","b")) setindexv(d, list(c("a","b"), c("b","a"))) options(datatable.verbose=TRUE) -ab = structure(integer(), starts=1:2, maxgrpn=1L, hasna=0L, hasinfnan=0L, anynotascii=0L, anynotutf8=0L) -ba = structure(2:1, starts=1:2, maxgrpn=1L, hasna=0L, hasinfnan=0L, anynotascii=0L, anynotutf8=0L) +ab = structure(integer(), starts=1:2, maxgrpn=1L, anyna=0L, anyinfnan=0L, anynotascii=0L, anynotutf8=0L) +ba = structure(2:1, starts=1:2, maxgrpn=1L, anyna=0L, anyinfnan=0L, anynotascii=0L, anynotutf8=0L) test(2139.60, forderv(d, c("a","b")), c(ab), output="forder.*opt=1.*took") test(2139.61, forderv(d, c("a","b"), retGrp=TRUE), ab, output="forder.*opt=2.*took") test(2139.62, forderv(d, c("a","b"), retGrp=TRUE, lazy=FALSE), ab, output="forder.*opt=0.*took") test(2139.63, forderv(d, c("b","a"), retGrp=TRUE), ba, output="forder.*opt=2.*took") test(2139.64, forderv(d, c("b","a"), retGrp=TRUE, lazy=FALSE), ba, output="forder.*opt=0.*took") -test(2139.65, forderv(d, c("a","b"), na.last=TRUE), integer(), output="forder.*opt=2.*took") # via hasnan index attribute +test(2139.65, forderv(d, c("a","b"), na.last=TRUE), integer(), output="forder.*opt=2.*took") # via anyna index attribute test(2139.66, forderv(d, c("a","b"), na.last=TRUE, lazy=FALSE), integer(), output="forder.*opt=0.*took") -test(2139.67, forderv(d, c("b","a"), na.last=TRUE), 2:1, output="forder.*opt=2.*took") # via hasnan index attribute +test(2139.67, forderv(d, c("b","a"), na.last=TRUE), 2:1, output="forder.*opt=2.*took") # via anyna index attribute test(2139.68, forderv(d, c("b","a"), na.last=TRUE, lazy=FALSE), 2:1, output="forder.*opt=0.*took") test(2139.69, forderv(d, c("a","b"), sort=FALSE, retGrp=TRUE), ab, output="forder.*opt=0.*took") test(2139.70, forderv(d, c("a","b"), sort=FALSE, retGrp=TRUE, lazy=FALSE), ab, output="forder.*opt=0.*took") @@ -16968,7 +16968,7 @@ test(2139.854, o<-forderv(d, "v1", retStats=TRUE), output="index found but not f setattr(d, "index", setattr(integer(), "__v1", o)) test(2139.855, forderv(d, "v1", retGrp=TRUE), output="index found but not for retGrp") test(2139.856, forderv(d, "v1", na.last=TRUE), integer(), output="forder.*opt=2.*took") -test(2139.857, forderv(d, "v1", retStats=TRUE, na.last=TRUE), structure(integer(), hasna=0L, hasinfnan=0L, anynotascii=0L, anynotutf8=0L), output="forder.*opt=2.*took") +test(2139.857, forderv(d, "v1", retStats=TRUE, na.last=TRUE), structure(integer(), anyna=0L, anyinfnan=0L, anynotascii=0L, anynotutf8=0L), output="forder.*opt=2.*took") d = copy(ddd) invisible(d[v1==1L, verbose=FALSE]) test(2139.858, forderv(d, "v1", retGrp=TRUE, retStats=TRUE, na.last=TRUE), output="index found but na.last=TRUE and no stats available") @@ -17011,7 +17011,7 @@ d = data.table(x = 2:1) make_retGrp0_idx = d[x==1L] test(2139.93, attr(attr(d, "index"), "__x"), 2:1) options(datatable.verbose=TRUE) -test(2139.94, forderv(d, "x", retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1L, hasna=0L, hasinfnan=0L, anynotascii=0L, anynotutf8=0L), output="forder.*index found but not for retGrp and retStats.*forder.*opt=-1.*took") +test(2139.94, forderv(d, "x", retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1L, anyna=0L, anyinfnan=0L, anynotascii=0L, anynotutf8=0L), output="forder.*index found but not for retGrp and retStats.*forder.*opt=-1.*took") d = data.table(x = 2:1) op2 = options("datatable.forder.auto.index"=TRUE) test(2139.95, d[x==1L], data.table(x=1L), output="forder.*setting index.*retGrp=0, retStats=0") diff --git a/src/data.table.h b/src/data.table.h index 1de513f718..03ea4cd65c 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -91,8 +91,8 @@ extern SEXP sym_index; extern SEXP sym_BY; extern SEXP sym_starts, char_starts; extern SEXP sym_maxgrpn; -extern SEXP sym_hasna; -extern SEXP sym_hasinfnan; +extern SEXP sym_anyna; +extern SEXP sym_anyinfnan; extern SEXP sym_anynotascii; extern SEXP sym_anynotutf8; extern SEXP sym_colClassesAs; diff --git a/src/forder.c b/src/forder.c index 1e85403d91..116a19891f 100644 --- a/src/forder.c +++ b/src/forder.c @@ -288,7 +288,7 @@ static void range_str(SEXP *x, int n, uint64_t *out_min, uint64_t *out_max, int // group numbers are left in truelength to be fetched by WRITE_KEY { int na_count=0; - bool any_notascii=false, any_notutf8=false; + bool anynotascii=false, anynotutf8=false; if (ustr_n!=0) STOP(_("Internal error: ustr isn't empty when starting range_str: ustr_n=%d, ustr_alloc=%d"), ustr_n, ustr_alloc); // # nocov if (ustr_maxlen!=0) STOP(_("Internal error: ustr_maxlen isn't 0 when starting range_str")); // # nocov // savetl_init() has already been called at the start of forder @@ -316,22 +316,22 @@ static void range_str(SEXP *x, int n, uint64_t *out_min, uint64_t *out_max, int SET_TRUELENGTH(s, -ustr_n); // unique in any order is fine. first-appearance order is achieved later in count_group if (LENGTH(s)>ustr_maxlen) ustr_maxlen=LENGTH(s); if (!IS_ASCII(s)) { - if (!any_notascii) - any_notascii=true; - if (!any_notutf8 && !IS_UTF8(s)) - any_notutf8=true; + if (!anynotascii) + anynotascii=true; + if (!anynotutf8 && !IS_UTF8(s)) + anynotutf8=true; } } } *out_na_count = na_count; - *out_anynotascii = any_notascii; - *out_anynotutf8 = any_notutf8; + *out_anynotascii = anynotascii; + *out_anynotutf8 = anynotutf8; if (ustr_n==0) { // all na *out_min = 0; *out_max = 0; return; } - if (any_notutf8) { + if (anynotutf8) { SEXP ustr2 = PROTECT(allocVector(STRSXP, ustr_n)); for (int i=0; i0) any_infnan = 1; - if (any_notascii) - anynotascii = 1; - if (any_notutf8) - anynotutf8 = 1; + if (anynotascii) + any_notascii = 1; + if (anynotutf8) + any_notutf8 = 1; if (na_count==nrow || (min>0 && min==max && na_count==0 && infnan_count==0)) { // all same value; skip column as nothing to do; [min,max] is just of finite values (excludes +Inf,-Inf,NaN and NA) if (na_count==nrow && nalast==-1) { for (int i=0; i0 || attr(idx, "hasinfnan")>0 -bool idxHasNA(SEXP idx) { - return INTEGER(getAttrib(idx, sym_hasna))[0]>0 || INTEGER(getAttrib(idx, sym_hasinfnan))[0]>0; +// attr(idx, "anyna")>0 || attr(idx, "anyinfnan")>0 +bool idxAnyNF(SEXP idx) { + return INTEGER(getAttrib(idx, sym_anyna))[0]>0 || INTEGER(getAttrib(idx, sym_anyinfnan))[0]>0; } // lazy forder, re-use existing key or index if possible, otherwise call forder @@ -1564,12 +1564,12 @@ SEXP forderLazy(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGro if (opt == -1 && GetUseIndex()) { SEXP idx = getIndex(DT, by); if (!isNull(idx)) { - bool hasStats = !isNull(getAttrib(idx, sym_hasna)); + bool hasStats = !isNull(getAttrib(idx, sym_anyna)); if (!na || // na.last=FALSE - (hasStats && !idxHasNA(idx))) { // na.last=TRUE && !anyNA + (hasStats && !idxAnyNF(idx))) { // na.last=TRUE && !anyNA bool hasGrp = !isNull(getAttrib(idx, sym_starts)); if (hasGrp && !hasStats) - error("internal error: index has 'starts' attribute but not 'hasna', please report to issue tracker"); // # nocov + error("internal error: index has 'starts' attribute but not 'anyna', please report to issue tracker"); // # nocov if (hasGrp==retGrp && hasStats==retStats) { opt = 2; // idxOpt } else if ( @@ -1586,8 +1586,8 @@ SEXP forderLazy(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGro setAttrib(idx, sym_maxgrpn, R_NilValue); } if (hasStats && !retStats) { - setAttrib(idx, sym_hasna, R_NilValue); - setAttrib(idx, sym_hasinfnan, R_NilValue); + setAttrib(idx, sym_anyna, R_NilValue); + setAttrib(idx, sym_anyinfnan, R_NilValue); setAttrib(idx, sym_anynotascii, R_NilValue); setAttrib(idx, sym_anynotutf8, R_NilValue); } @@ -1608,7 +1608,7 @@ SEXP forderLazy(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGro if (!hasStats) { if (verbose) Rprintf("forderLazy: index found but na.last=TRUE and no stats available: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); - } else if (idxHasNA(idx)) { + } else if (idxAnyNF(idx)) { if (verbose) Rprintf("forderLazy: index found but na.last=TRUE and NAs present: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); } else { @@ -1625,7 +1625,7 @@ SEXP forderLazy(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGro if (opt < 1) { ans = PROTECT(forder(DT, by, retGrpArg, retStatsArg, sortGroupsArg, ascArg, naArg)); protecti++; if (opt == -1 && // opt==0 means that arguments (sort, asc) were not of type index, or lazy=FALSE - (!na || (retStats && !idxHasNA(ans))) && // lets create index even if na.last=T used but no NAs detected! + (!na || (retStats && !idxAnyNF(ans))) && // lets create index even if na.last=T used but no NAs detected! GetUseIndex() && GetAutoIndex()) { // disabled by default, use datatable.forder.auto.index=T to enable, do not export/document, use for debugging only putIndex(DT, by, ans); diff --git a/src/init.c b/src/init.c index c4a52cbf42..35090343e4 100644 --- a/src/init.c +++ b/src/init.c @@ -25,8 +25,8 @@ SEXP sym_index; SEXP sym_BY; SEXP sym_starts, char_starts; SEXP sym_maxgrpn; -SEXP sym_hasna; -SEXP sym_hasinfnan; +SEXP sym_anyna; +SEXP sym_anyinfnan; SEXP sym_anynotascii; SEXP sym_anynotutf8; SEXP sym_colClassesAs; @@ -343,8 +343,8 @@ void attribute_visible R_init_datatable(DllInfo *info) sym_index = install("index"); sym_BY = install(".BY"); sym_maxgrpn = install("maxgrpn"); - sym_hasna = install("hasna"); - sym_hasinfnan = install("hasinfnan"); + sym_anyna = install("anyna"); + sym_anyinfnan = install("anyinfnan"); sym_anynotascii = install("anynotascii"); sym_anynotutf8 = install("anynotutf8"); sym_colClassesAs = install("colClassesAs"); From 91437de547233e0af192863da581fc719792561a Mon Sep 17 00:00:00 2001 From: jangorecki Date: Tue, 30 Jun 2020 00:39:20 +0100 Subject: [PATCH 33/53] add extra escape to escape IS_ASCII checks --- src/forder.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/forder.c b/src/forder.c index 116a19891f..28453405dc 100644 --- a/src/forder.c +++ b/src/forder.c @@ -315,10 +315,11 @@ static void range_str(SEXP *x, int n, uint64_t *out_min, uint64_t *out_max, int ustr[ustr_n++] = s; SET_TRUELENGTH(s, -ustr_n); // unique in any order is fine. first-appearance order is achieved later in count_group if (LENGTH(s)>ustr_maxlen) ustr_maxlen=LENGTH(s); - if (!IS_ASCII(s)) { + if (!anynotutf8 && // even if anynotascii we still want to know if anynotutf8, and anynotutf8 implies anynotascii already + !IS_ASCII(s)) { // anynotutf8 implies anynotascii and IS_ASCII will be cheaper than IS_UTF8, so start with this one if (!anynotascii) anynotascii=true; - if (!anynotutf8 && !IS_UTF8(s)) + if (!IS_UTF8(s)) anynotutf8=true; } } From 0f1577547b13a435225535caff690449ec3c7b22 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 14 Mar 2024 18:12:48 -0700 Subject: [PATCH 34/53] update test number after merge --- inst/tests/tests.Rraw | 196 +++++++++++++++++++++--------------------- 1 file changed, 98 insertions(+), 98 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index c904913d4f..c44f81951f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18335,79 +18335,79 @@ if (test_bit64) { dd = data.table(a=1:2, b=2:1) d = copy(dd) op = options(datatable.verbose=TRUE) -test(2240.01, forderv(d, "b"), 2:1, output="forder.*opt=-1.*took") -test(2240.02, forderv(d, "b", lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2249.01, forderv(d, "b"), 2:1, output="forder.*opt=-1.*took") +test(2249.02, forderv(d, "b", lazy=FALSE), 2:1, output="forder.*opt=0.*took") options(datatable.verbose=FALSE) setkeyv(d, "b") options(datatable.verbose=TRUE) -test(2240.03, forderv(d, "b"), integer(), output="forder.*opt=1.*took") -test(2240.04, forderv(d, "b", lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2249.03, forderv(d, "b"), integer(), output="forder.*opt=1.*took") +test(2249.04, forderv(d, "b", lazy=FALSE), integer(), output="forder.*opt=0.*took") options(datatable.verbose=FALSE) d = copy(dd) setindexv(d, "b") options(datatable.verbose=TRUE) -test(2240.05, forderv(d, "b"), 2:1, output="forder.*opt=2.*took") -test(2240.06, forderv(d, "b", lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2249.05, forderv(d, "b"), 2:1, output="forder.*opt=2.*took") +test(2249.06, forderv(d, "b", lazy=FALSE), 2:1, output="forder.*opt=0.*took") options(datatable.verbose=FALSE) d = copy(dd) options(datatable.verbose=TRUE) -test(2240.11, forderv(d, c("a","b")), integer(), output="forder.*opt=-1.*took") -test(2240.12, forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") -test(2240.13, forderv(d, c("b","a")), 2:1, output="forder.*opt=-1.*took") -test(2240.14, forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2249.11, forderv(d, c("a","b")), integer(), output="forder.*opt=-1.*took") +test(2249.12, forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2249.13, forderv(d, c("b","a")), 2:1, output="forder.*opt=-1.*took") +test(2249.14, forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") options(datatable.verbose=FALSE) setkeyv(d, c("a","b")) options(datatable.verbose=TRUE) -test(2240.21, forderv(d, c("a","b")), integer(), output="forder.*opt=1.*took") -test(2240.22, forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") -test(2240.23, forderv(d, c("b","a")), 2:1, output="forder.*opt=-1.*took") -test(2240.24, forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2249.21, forderv(d, c("a","b")), integer(), output="forder.*opt=1.*took") +test(2249.22, forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2249.23, forderv(d, c("b","a")), 2:1, output="forder.*opt=-1.*took") +test(2249.24, forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") options(datatable.verbose=FALSE) setkeyv(d, c("b","a")) options(datatable.verbose=TRUE) -test(2240.25, forderv(d, c("a","b")), 2:1, output="forder.*opt=-1.*took") -test(2240.26, forderv(d, c("a","b"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") -test(2240.27, forderv(d, c("b","a")), integer(), output="forder.*opt=1.*took") -test(2240.28, forderv(d, c("b","a"), lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2249.25, forderv(d, c("a","b")), 2:1, output="forder.*opt=-1.*took") +test(2249.26, forderv(d, c("a","b"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2249.27, forderv(d, c("b","a")), integer(), output="forder.*opt=1.*took") +test(2249.28, forderv(d, c("b","a"), lazy=FALSE), integer(), output="forder.*opt=0.*took") options(datatable.verbose=FALSE) d = copy(dd) setindexv(d, c("a","b")) options(datatable.verbose=TRUE) -test(2240.31, forderv(d, c("a","b")), integer(), output="forder.*opt=2.*took") -test(2240.32, forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") -test(2240.33, forderv(d, c("b","a")), 2:1, output="forder.*opt=-1.*took") -test(2240.34, forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2249.31, forderv(d, c("a","b")), integer(), output="forder.*opt=2.*took") +test(2249.32, forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2249.33, forderv(d, c("b","a")), 2:1, output="forder.*opt=-1.*took") +test(2249.34, forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") options(datatable.verbose=FALSE) setindexv(d, NULL) setindexv(d, c("b","a")) options(datatable.verbose=TRUE) -test(2240.35, forderv(d, c("a","b")), integer(), output="forder.*opt=-1.*took") -test(2240.36, forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") -test(2240.37, forderv(d, c("b","a")), 2:1, output="forder.*opt=2.*took") -test(2240.38, forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2249.35, forderv(d, c("a","b")), integer(), output="forder.*opt=-1.*took") +test(2249.36, forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2249.37, forderv(d, c("b","a")), 2:1, output="forder.*opt=2.*took") +test(2249.38, forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") options(datatable.verbose=FALSE) setindexv(d, NULL) setindexv(d, list(c("a","b"), c("b","a"))) options(datatable.verbose=TRUE) -test(2240.41, forderv(d, c("a","b")), integer(), output="forder.*opt=2.*took") -test(2240.42, forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") -test(2240.43, forderv(d, c("b","a")), 2:1, output="forder.*opt=2.*took") -test(2240.44, forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2249.41, forderv(d, c("a","b")), integer(), output="forder.*opt=2.*took") +test(2249.42, forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2249.43, forderv(d, c("b","a")), 2:1, output="forder.*opt=2.*took") +test(2249.44, forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") options(datatable.verbose=FALSE) d = copy(dd) setkeyv(d, c("a","b")) setindexv(d, list(c("a","b"), c("b","a"))) options(datatable.verbose=TRUE) -test(2240.51, forderv(d, c("a","b")), integer(), output="forder.*opt=1.*took", notOutput="forder.*opt=2.*took") # idxOpt is not reached -test(2240.52, forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") -test(2240.53, forderv(d, c("b","a")), 2:1, output="forder.*opt=2.*took") -test(2240.54, forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2249.51, forderv(d, c("a","b")), integer(), output="forder.*opt=1.*took", notOutput="forder.*opt=2.*took") # idxOpt is not reached +test(2249.52, forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2249.53, forderv(d, c("b","a")), 2:1, output="forder.*opt=2.*took") +test(2249.54, forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") options(datatable.verbose=FALSE) setkeyv(d, NULL) setindexv(d, list(c("a","b"), c("b","a"))) options(datatable.verbose=TRUE) -test(2240.55, forderv(d, c("a","b")), integer(), output="forder.*opt=2.*took", notOutput="forder.*opt=1.*took") -test(2240.56, forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2249.55, forderv(d, c("a","b")), integer(), output="forder.*opt=2.*took", notOutput="forder.*opt=1.*took") +test(2249.56, forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") options(datatable.verbose=FALSE) d = copy(dd) setkeyv(d, c("a","b")) @@ -18415,99 +18415,99 @@ setindexv(d, list(c("a","b"), c("b","a"))) options(datatable.verbose=TRUE) ab = structure(integer(), starts=1:2, maxgrpn=1L, anyna=0L, anyinfnan=0L, anynotascii=0L, anynotutf8=0L) ba = structure(2:1, starts=1:2, maxgrpn=1L, anyna=0L, anyinfnan=0L, anynotascii=0L, anynotutf8=0L) -test(2240.60, forderv(d, c("a","b")), c(ab), output="forder.*opt=1.*took") -test(2240.61, forderv(d, c("a","b"), retGrp=TRUE), ab, output="forder.*opt=2.*took") -test(2240.62, forderv(d, c("a","b"), retGrp=TRUE, lazy=FALSE), ab, output="forder.*opt=0.*took") -test(2240.63, forderv(d, c("b","a"), retGrp=TRUE), ba, output="forder.*opt=2.*took") -test(2240.64, forderv(d, c("b","a"), retGrp=TRUE, lazy=FALSE), ba, output="forder.*opt=0.*took") -test(2240.65, forderv(d, c("a","b"), na.last=TRUE), integer(), output="forder.*opt=2.*took") # via anyna index attribute -test(2240.66, forderv(d, c("a","b"), na.last=TRUE, lazy=FALSE), integer(), output="forder.*opt=0.*took") -test(2240.67, forderv(d, c("b","a"), na.last=TRUE), 2:1, output="forder.*opt=2.*took") # via anyna index attribute -test(2240.68, forderv(d, c("b","a"), na.last=TRUE, lazy=FALSE), 2:1, output="forder.*opt=0.*took") -test(2240.69, forderv(d, c("a","b"), sort=FALSE, retGrp=TRUE), ab, output="forder.*opt=0.*took") -test(2240.70, forderv(d, c("a","b"), sort=FALSE, retGrp=TRUE, lazy=FALSE), ab, output="forder.*opt=0.*took") -test(2240.71, forderv(d, c("b","a"), sort=FALSE, retGrp=TRUE), ab, output="forder.*opt=0.*took") -test(2240.72, forderv(d, c("b","a"), sort=FALSE, retGrp=TRUE, lazy=FALSE), ab, output="forder.*opt=0.*took") -test(2240.73, forderv(d, c("a","b"), order=-1L), 2:1, output="forder.*opt=0.*took") -test(2240.74, forderv(d, c("a","b"), order=-1L, lazy=FALSE), 2:1, output="forder.*opt=0.*took") -test(2240.75, forderv(d, c("b","a"), order=-1L), integer(), output="forder.*opt=0.*took") -test(2240.76, forderv(d, c("b","a"), order=-1L, lazy=FALSE), integer(), output="forder.*opt=0.*took") -test(2240.77, forderv(d, c("a","b"), order=c(1L,-1L)), integer(), output="forder.*opt=0.*took") -test(2240.78, forderv(d, c("a","b"), order=c(1L,-1L), lazy=FALSE), integer(), output="forder.*opt=0.*took") -test(2240.79, forderv(d, c("b","a"), order=c(1L,-1L)), 2:1, output="forder.*opt=0.*took") -test(2240.80, forderv(d, c("b","a"), order=c(1L,-1L), lazy=FALSE), 2:1, output="forder.*opt=0.*took") -test(2240.81, forderv(1:2), integer(), output="forder.*opt=0.*took") -test(2240.82, forderv(1:2, lazy=FALSE), integer(), output="forder.*opt=0.*took") -test(2240.83, forderv(2:1), 2:1, output="forder.*opt=0.*took") -test(2240.84, forderv(2:1, lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2249.60, forderv(d, c("a","b")), c(ab), output="forder.*opt=1.*took") +test(2249.61, forderv(d, c("a","b"), retGrp=TRUE), ab, output="forder.*opt=2.*took") +test(2249.62, forderv(d, c("a","b"), retGrp=TRUE, lazy=FALSE), ab, output="forder.*opt=0.*took") +test(2249.63, forderv(d, c("b","a"), retGrp=TRUE), ba, output="forder.*opt=2.*took") +test(2249.64, forderv(d, c("b","a"), retGrp=TRUE, lazy=FALSE), ba, output="forder.*opt=0.*took") +test(2249.65, forderv(d, c("a","b"), na.last=TRUE), integer(), output="forder.*opt=2.*took") # via anyna index attribute +test(2249.66, forderv(d, c("a","b"), na.last=TRUE, lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2249.67, forderv(d, c("b","a"), na.last=TRUE), 2:1, output="forder.*opt=2.*took") # via anyna index attribute +test(2249.68, forderv(d, c("b","a"), na.last=TRUE, lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2249.69, forderv(d, c("a","b"), sort=FALSE, retGrp=TRUE), ab, output="forder.*opt=0.*took") +test(2249.70, forderv(d, c("a","b"), sort=FALSE, retGrp=TRUE, lazy=FALSE), ab, output="forder.*opt=0.*took") +test(2249.71, forderv(d, c("b","a"), sort=FALSE, retGrp=TRUE), ab, output="forder.*opt=0.*took") +test(2249.72, forderv(d, c("b","a"), sort=FALSE, retGrp=TRUE, lazy=FALSE), ab, output="forder.*opt=0.*took") +test(2249.73, forderv(d, c("a","b"), order=-1L), 2:1, output="forder.*opt=0.*took") +test(2249.74, forderv(d, c("a","b"), order=-1L, lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2249.75, forderv(d, c("b","a"), order=-1L), integer(), output="forder.*opt=0.*took") +test(2249.76, forderv(d, c("b","a"), order=-1L, lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2249.77, forderv(d, c("a","b"), order=c(1L,-1L)), integer(), output="forder.*opt=0.*took") +test(2249.78, forderv(d, c("a","b"), order=c(1L,-1L), lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2249.79, forderv(d, c("b","a"), order=c(1L,-1L)), 2:1, output="forder.*opt=0.*took") +test(2249.80, forderv(d, c("b","a"), order=c(1L,-1L), lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2249.81, forderv(1:2), integer(), output="forder.*opt=0.*took") +test(2249.82, forderv(1:2, lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2249.83, forderv(2:1), 2:1, output="forder.*opt=0.*took") +test(2249.84, forderv(2:1, lazy=FALSE), 2:1, output="forder.*opt=0.*took") options(datatable.verbose=FALSE) -test(2240.851, forderv(2:1, retStats=NA), error="retStats must be TRUE or FALSE") -test(2240.852, forderv(2:1, retGrp=TRUE, retStats=FALSE), error="retStats must be TRUE whenever retGrp is TRUE") +test(2249.851, forderv(2:1, retStats=NA), error="retStats must be TRUE or FALSE") +test(2249.852, forderv(2:1, retGrp=TRUE, retStats=FALSE), error="retStats must be TRUE whenever retGrp is TRUE") ddd = data.table(v1=1:3, v2=c(1L,NA,3L), v3=c(3:2,NaN), v4=c(1:2,Inf), v5=c(-Inf,NA,3)) ## tests for NAs and na.last arg d = copy(ddd) invisible(d[v1==1L, verbose=FALSE]) options(datatable.verbose=TRUE) -test(2240.853, forderv(d, "v1", retGrp=TRUE, retStats=TRUE), output="index found but not for retGrp and retStats") -test(2240.854, o<-forderv(d, "v1", retStats=TRUE), output="index found but not for retStats") +test(2249.853, forderv(d, "v1", retGrp=TRUE, retStats=TRUE), output="index found but not for retGrp and retStats") +test(2249.854, o<-forderv(d, "v1", retStats=TRUE), output="index found but not for retStats") setattr(d, "index", setattr(integer(), "__v1", o)) -test(2240.855, forderv(d, "v1", retGrp=TRUE), output="index found but not for retGrp") -test(2240.856, forderv(d, "v1", na.last=TRUE), integer(), output="forder.*opt=2.*took") -test(2240.857, forderv(d, "v1", retStats=TRUE, na.last=TRUE), structure(integer(), anyna=0L, anyinfnan=0L, anynotascii=0L, anynotutf8=0L), output="forder.*opt=2.*took") +test(2249.855, forderv(d, "v1", retGrp=TRUE), output="index found but not for retGrp") +test(2249.856, forderv(d, "v1", na.last=TRUE), integer(), output="forder.*opt=2.*took") +test(2249.857, forderv(d, "v1", retStats=TRUE, na.last=TRUE), structure(integer(), anyna=0L, anyinfnan=0L, anynotascii=0L, anynotutf8=0L), output="forder.*opt=2.*took") d = copy(ddd) invisible(d[v1==1L, verbose=FALSE]) -test(2240.858, forderv(d, "v1", retGrp=TRUE, retStats=TRUE, na.last=TRUE), output="index found but na.last=TRUE and no stats available") +test(2249.858, forderv(d, "v1", retGrp=TRUE, retStats=TRUE, na.last=TRUE), output="index found but na.last=TRUE and no stats available") setindexv(d, "v2", verbose=FALSE) -test(2240.859, forderv(d, "v2", retGrp=TRUE, retStats=TRUE, na.last=TRUE), output="index found but na.last=TRUE and NAs present") +test(2249.859, forderv(d, "v2", retGrp=TRUE, retStats=TRUE, na.last=TRUE), output="index found but na.last=TRUE and NAs present") options(datatable.verbose=FALSE) d = copy(ddd) setkeyv(d, "v1") setindexv(d, list("v2","v3","v4","v5",c("v1","v2"),c("v1","v3"),c("v2","v3"),c("v1","v4"),c("v1","v5"),c("v1","v4","v5"))) options(datatable.verbose=TRUE) -test(2240.861, forderv(d, "v1"), integer(), output="forder.*opt=1.*took") -test(2240.862, forderv(d, "v1", na.last=TRUE), integer(), output="forder.*opt=-1.*took") ## cannot use key for na.last +test(2249.861, forderv(d, "v1"), integer(), output="forder.*opt=1.*took") +test(2249.862, forderv(d, "v1", na.last=TRUE), integer(), output="forder.*opt=-1.*took") ## cannot use key for na.last setindexv(d, "v1", verbose=FALSE) -test(2240.863, forderv(d, "v1", na.last=TRUE), integer(), output="forder.*opt=2.*took") -test(2240.864, forderv(d, c("v1","v2"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") -test(2240.865, forderv(d, c("v1","v2"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") # same but testing another msg -test(2240.866, forderv(d, c("v1","v3"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") -test(2240.867, forderv(d, c("v1","v3"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") -test(2240.868, forderv(d, c("v1","v4"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") -test(2240.869, forderv(d, c("v1","v4"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") -test(2240.870, forderv(d, c("v1","v5"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") -test(2240.871, forderv(d, c("v1","v5"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") # same but testing another msg -test(2240.872, forderv(d, c("v1","v4","v5"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") -test(2240.873, forderv(d, c("v1","v4","v5"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") # same but testing another msg +test(2249.863, forderv(d, "v1", na.last=TRUE), integer(), output="forder.*opt=2.*took") +test(2249.864, forderv(d, c("v1","v2"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") +test(2249.865, forderv(d, c("v1","v2"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") # same but testing another msg +test(2249.866, forderv(d, c("v1","v3"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") +test(2249.867, forderv(d, c("v1","v3"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") +test(2249.868, forderv(d, c("v1","v4"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") +test(2249.869, forderv(d, c("v1","v4"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") +test(2249.870, forderv(d, c("v1","v5"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") +test(2249.871, forderv(d, c("v1","v5"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") # same but testing another msg +test(2249.872, forderv(d, c("v1","v4","v5"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") +test(2249.873, forderv(d, c("v1","v4","v5"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") # same but testing another msg options(datatable.verbose=FALSE) d = fread(testDir("1680-fread-header-encoding.csv"), encoding="Latin-1") ## re-use some existing non utf8 data anyEnc = function(x) unlist(attributes(forderv(x, retStats=TRUE))[c("anynotascii","anynotutf8")]) -test(2240.881, anyEnc(d), c(anynotascii=1L,anynotutf8=1L)) -test(2240.882, anyEnc(d[,-2L]), c(anynotascii=0L,anynotutf8=0L)) -test(2240.883, anyEnc(c("a","b","\u221A")), c(anynotascii=1L,anynotutf8=0L)) +test(2249.881, anyEnc(d), c(anynotascii=1L,anynotutf8=1L)) +test(2249.882, anyEnc(d[,-2L]), c(anynotascii=0L,anynotutf8=0L)) +test(2249.883, anyEnc(c("a","b","\u221A")), c(anynotascii=1L,anynotutf8=0L)) d = copy(dd) setindexv(d, "b") options(datatable.verbose=TRUE) op2 = options(datatable.use.index=FALSE) -test(2240.91, forderv(d, "b"), 2:1, output="forder.*opt=-1.*took") -test(2240.92, forderv(d, "b", lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2249.91, forderv(d, "b"), 2:1, output="forder.*opt=-1.*took") +test(2249.92, forderv(d, "b", lazy=FALSE), 2:1, output="forder.*opt=0.*took") options(op2) options(datatable.verbose=FALSE) d = data.table(x = 2:1) make_retGrp0_idx = d[x==1L] -test(2240.93, attr(attr(d, "index"), "__x"), 2:1) +test(2249.93, attr(attr(d, "index"), "__x"), 2:1) options(datatable.verbose=TRUE) -test(2240.94, forderv(d, "x", retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1L, anyna=0L, anyinfnan=0L, anynotascii=0L, anynotutf8=0L), output="forder.*index found but not for retGrp and retStats.*forder.*opt=-1.*took") +test(2249.94, forderv(d, "x", retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1L, anyna=0L, anyinfnan=0L, anynotascii=0L, anynotutf8=0L), output="forder.*index found but not for retGrp and retStats.*forder.*opt=-1.*took") d = data.table(x = 2:1) op2 = options("datatable.forder.auto.index"=TRUE) -test(2240.95, d[x==1L], data.table(x=1L), output="forder.*setting index.*retGrp=0, retStats=0") -test(2240.96, forderv(d, "x", retGrp=TRUE), output="forder.*setting index.*retGrp=1, retStats=1") +test(2249.95, d[x==1L], data.table(x=1L), output="forder.*setting index.*retGrp=0, retStats=0") +test(2249.96, forderv(d, "x", retGrp=TRUE), output="forder.*setting index.*retGrp=1, retStats=1") setindexv(d, NULL, verbose=FALSE) -test(2240.971, forderv(d, "x", retStats=TRUE), output="forder.*setting index.*retGrp=0, retStats=1") +test(2249.971, forderv(d, "x", retStats=TRUE), output="forder.*setting index.*retGrp=0, retStats=1") setindexv(d, NULL, verbose=FALSE) -test(2240.972, forderv(d, "x", retStats=TRUE, na.last=TRUE), output="forder.*setting index.*retGrp=0, retStats=1") +test(2249.972, forderv(d, "x", retStats=TRUE, na.last=TRUE), output="forder.*setting index.*retGrp=0, retStats=1") setindexv(d, NULL, verbose=FALSE) -test(2240.973, forderv(data.table(x=c(2:1,NA)), "x", retStats=TRUE, na.last=TRUE), notOutput="forder.*setting index.*retGrp=0, retStats=1") -test(2240.974, forderv(data.table(x=c(2:1,NaN)), "x", retStats=TRUE, na.last=TRUE), notOutput="forder.*setting index.*retGrp=0, retStats=1") -test(2240.975, forderv(d, "x", na.last=TRUE), notOutput="forder.*setting index.*retGrp=0, retStats=1") +test(2249.973, forderv(data.table(x=c(2:1,NA)), "x", retStats=TRUE, na.last=TRUE), notOutput="forder.*setting index.*retGrp=0, retStats=1") +test(2249.974, forderv(data.table(x=c(2:1,NaN)), "x", retStats=TRUE, na.last=TRUE), notOutput="forder.*setting index.*retGrp=0, retStats=1") +test(2249.975, forderv(d, "x", na.last=TRUE), notOutput="forder.*setting index.*retGrp=0, retStats=1") options(op2) -test(2240.99, forderv(data.table(a=1), lazy=c(TRUE, TRUE)), error="lazy must be") +test(2249.99, forderv(data.table(a=1), lazy=c(TRUE, TRUE)), error="lazy must be") options(op) From c8f5b7efa0963394a2cb10c4986ba7570fbdd40f Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 11 Jul 2024 23:12:36 +0000 Subject: [PATCH 35/53] apply minor review feedback --- R/data.table.R | 6 +++--- src/bmerge.c | 2 +- src/forder.c | 7 ++----- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index ea8d9c66b2..75d7f55fb8 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -3235,15 +3235,15 @@ is_constantish = function(q, check_singleton=FALSE) { ## if nothing else helped, auto create a new index that can be used if (!getOption("datatable.auto.index")) return(NULL) idxCols = names(i) - if (verbose) {catf("Creating new index '%s'\n", paste0(idxCols, collapse = "__"));flush.console()} - if (verbose) {last.started.at=proc.time();catf("Creating index %s done in ...", paste0(idxCols, collapse = "__"));flush.console()} + if (verbose) {catf("Creating new index '%s'\n", paste(idxCols, collapse = "__"));flush.console()} + if (verbose) {last.started.at=proc.time();catf("Creating index %s done in ...", paste(idxCols, collapse = "__"));flush.console()} idx = forderv(x, idxCols, sort=TRUE, retGrp=FALSE, lazy=TRUE) if (!isTRUE(getOption("datatable.forder.auto.index"))) { ## forder can write index, but disabled for now, see #4386 if (is.null(attr(x, "index", exact=TRUE))) setattr(x, "index", integer()) setattr(attr(x, "index", exact=TRUE), paste0("__", idxCols, collapse=""), idx) } if (verbose) {cat(timetaken(last.started.at),"\n");flush.console()} - if (verbose) {catf("Optimized subsetting with index '%s'\n", paste0(idxCols, collapse = "__"));flush.console()} + if (verbose) {catf("Optimized subsetting with index '%s'\n", paste(idxCols, collapse = "__"));flush.console()} } if(!is.null(idxCols)){ setkeyv(i, idxCols) diff --git a/src/bmerge.c b/src/bmerge.c index ee1e20a2c8..4765ed42aa 100644 --- a/src/bmerge.c +++ b/src/bmerge.c @@ -164,7 +164,7 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP r allGrp1[0] = TRUE; protecti += 2; - SEXP oSxp = PROTECT(forderLazy(idt, icolsArg, ScalarLogical(FALSE), ScalarLogical(FALSE), ScalarLogical(TRUE), ScalarInteger(1), ScalarLogical(FALSE), ScalarLogical(TRUE))); protecti++; + SEXP oSxp = PROTECT(forderLazy(idt, icolsArg, /* retGrpArg= */ScalarLogical(FALSE), /* retStatsArg= */ScalarLogical(FALSE), /* sortGroupsArg= */ScalarLogical(TRUE), /* ascArg= */ScalarInteger(1), /* naArg= */ScalarLogical(FALSE), /* lazyArg= */ScalarLogical(TRUE))); protecti++; if (!LENGTH(oSxp)) o = NULL; else diff --git a/src/forder.c b/src/forder.c index cfd1b0b6fb..acc0b60c70 100644 --- a/src/forder.c +++ b/src/forder.c @@ -528,7 +528,7 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsA bool complexRerun = false; // see comments below in CPLXSXP case SEXP CplxPart = R_NilValue; if (n_cplx) { CplxPart=PROTECT(allocVector(REALSXP, nrow)); n_protect++; } // one alloc is reused for each part - int any_na=0, any_infnan=0, any_notascii=0, any_notutf8=0;; // collect more statistics about the data #2879, allow optimize of order(na.last=TRUE) as well #3023 + int any_na=0, any_infnan=0, any_notascii=0, any_notutf8=0; // collect more statistics about the data #2879, allow optimize of order(na.last=TRUE) as well #3023 TEND(2); for (int col=0; col Date: Thu, 11 Jul 2024 23:17:35 +0000 Subject: [PATCH 36/53] More minor review feedback --- inst/tests/tests.Rraw | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index c4c57f77f6..4e765bc36e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11752,7 +11752,7 @@ test(1775.3, capture.output(print(DT2, print.keys = TRUE)), setkey(DT2, a) setindexv(DT2, c("b","a")) test(1775.4, capture.output(print(DT2, print.keys = TRUE)), - c("Key: ", "Indices: , ", " a b", "1: 1 4", "2: 2 5", "3: 3 6")) ## index 'b' is still good, so we now keep it + c("Key: ", "Indices: , ", " a b", "1: 1 4", "2: 2 5", "3: 3 6")) ## index 'b' is still good, so we keep it # dev regression #2285 cat("A B C\n1 2 3\n4 5 6", file=f<-tempfile()) @@ -12791,7 +12791,7 @@ test(1899.18, as.matrix(DT, rownames=TRUE, rownames.value=1:nrow(DT)), error="ro # index argument for fread, #2633 DT_str = c('a,b\n3,1\n2,2\n1,1\n2,1\n3,2') -test(1900.1, lapply(attributes(attr(fread(DT_str, index = 'a'), 'index')), c), +test(1900.1, lapply(attributes(attr(fread(DT_str, index = 'a'), 'index')), c), # lapply(, c) to ensure no starts, maxgrpn attributes list(`__a` = c(3L, 2L, 4L, 1L, 5L))) test(1900.2, lapply(attributes(attr(fread(DT_str, index = list('a,b', c('b', 'a'), 'a')), 'index')), c), list(`__a__b` = c(3L, 4L, 2L, 1L, 5L), @@ -18753,7 +18753,7 @@ test(2267.32, forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt test(2267.33, forderv(d, c("b","a")), 2:1, output="forder.*opt=-1.*took") test(2267.34, forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") options(datatable.verbose=FALSE) -setindexv(d, NULL) +d = copy(dd) setindexv(d, c("b","a")) options(datatable.verbose=TRUE) test(2267.35, forderv(d, c("a","b")), integer(), output="forder.*opt=-1.*took") @@ -18761,7 +18761,7 @@ test(2267.36, forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt test(2267.37, forderv(d, c("b","a")), 2:1, output="forder.*opt=2.*took") test(2267.38, forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") options(datatable.verbose=FALSE) -setindexv(d, NULL) +d = copy(dd) setindexv(d, list(c("a","b"), c("b","a"))) options(datatable.verbose=TRUE) test(2267.41, forderv(d, c("a","b")), integer(), output="forder.*opt=2.*took") @@ -18778,7 +18778,7 @@ test(2267.52, forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt test(2267.53, forderv(d, c("b","a")), 2:1, output="forder.*opt=2.*took") test(2267.54, forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") options(datatable.verbose=FALSE) -setkeyv(d, NULL) +d = copy(dd) setindexv(d, list(c("a","b"), c("b","a"))) options(datatable.verbose=TRUE) test(2267.55, forderv(d, c("a","b")), integer(), output="forder.*opt=2.*took", notOutput="forder.*opt=1.*took") From c2ae34a0968b998c1e8b8ad361fd9a1708fcf7a3 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 11 Jul 2024 23:26:42 +0000 Subject: [PATCH 37/53] use options= to set options --- inst/tests/tests.Rraw | 241 +++++++++++++++++++----------------------- 1 file changed, 107 insertions(+), 134 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 4e765bc36e..eb8b5398f1 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18709,150 +18709,122 @@ test(2266, eval(parse(text="DT[ , .N, a\U00F1o]$N[1L]")), 2L) # lazy forder, #4386 dd = data.table(a=1:2, b=2:1) d = copy(dd) -op = options(datatable.verbose=TRUE) -test(2267.01, forderv(d, "b"), 2:1, output="forder.*opt=-1.*took") -test(2267.02, forderv(d, "b", lazy=FALSE), 2:1, output="forder.*opt=0.*took") -options(datatable.verbose=FALSE) +test(2267.01, options = c(datatable.verbose=TRUE), forderv(d, "b"), 2:1, output="forder.*opt=-1.*took") +test(2267.02, options = c(datatable.verbose=TRUE), forderv(d, "b", lazy=FALSE), 2:1, output="forder.*opt=0.*took") setkeyv(d, "b") -options(datatable.verbose=TRUE) -test(2267.03, forderv(d, "b"), integer(), output="forder.*opt=1.*took") -test(2267.04, forderv(d, "b", lazy=FALSE), integer(), output="forder.*opt=0.*took") -options(datatable.verbose=FALSE) +test(2267.03, options = c(datatable.verbose=TRUE), forderv(d, "b"), integer(), output="forder.*opt=1.*took") +test(2267.04, options = c(datatable.verbose=TRUE), forderv(d, "b", lazy=FALSE), integer(), output="forder.*opt=0.*took") d = copy(dd) setindexv(d, "b") -options(datatable.verbose=TRUE) -test(2267.05, forderv(d, "b"), 2:1, output="forder.*opt=2.*took") -test(2267.06, forderv(d, "b", lazy=FALSE), 2:1, output="forder.*opt=0.*took") -options(datatable.verbose=FALSE) +test(2267.05, options = c(datatable.verbose=TRUE), forderv(d, "b"), 2:1, output="forder.*opt=2.*took") +test(2267.06, options = c(datatable.verbose=TRUE), forderv(d, "b", lazy=FALSE), 2:1, output="forder.*opt=0.*took") d = copy(dd) -options(datatable.verbose=TRUE) -test(2267.11, forderv(d, c("a","b")), integer(), output="forder.*opt=-1.*took") -test(2267.12, forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") -test(2267.13, forderv(d, c("b","a")), 2:1, output="forder.*opt=-1.*took") -test(2267.14, forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") -options(datatable.verbose=FALSE) +test(2267.11, options = c(datatable.verbose=TRUE), forderv(d, c("a","b")), integer(), output="forder.*opt=-1.*took") +test(2267.12, options = c(datatable.verbose=TRUE), forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2267.13, options = c(datatable.verbose=TRUE), forderv(d, c("b","a")), 2:1, output="forder.*opt=-1.*took") +test(2267.14, options = c(datatable.verbose=TRUE), forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") setkeyv(d, c("a","b")) -options(datatable.verbose=TRUE) -test(2267.21, forderv(d, c("a","b")), integer(), output="forder.*opt=1.*took") -test(2267.22, forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") -test(2267.23, forderv(d, c("b","a")), 2:1, output="forder.*opt=-1.*took") -test(2267.24, forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") -options(datatable.verbose=FALSE) +test(2267.21, options = c(datatable.verbose=TRUE), forderv(d, c("a","b")), integer(), output="forder.*opt=1.*took") +test(2267.22, options = c(datatable.verbose=TRUE), forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2267.23, options = c(datatable.verbose=TRUE), forderv(d, c("b","a")), 2:1, output="forder.*opt=-1.*took") +test(2267.24, options = c(datatable.verbose=TRUE), forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") setkeyv(d, c("b","a")) -options(datatable.verbose=TRUE) -test(2267.25, forderv(d, c("a","b")), 2:1, output="forder.*opt=-1.*took") -test(2267.26, forderv(d, c("a","b"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") -test(2267.27, forderv(d, c("b","a")), integer(), output="forder.*opt=1.*took") -test(2267.28, forderv(d, c("b","a"), lazy=FALSE), integer(), output="forder.*opt=0.*took") -options(datatable.verbose=FALSE) +test(2267.25, options = c(datatable.verbose=TRUE), forderv(d, c("a","b")), 2:1, output="forder.*opt=-1.*took") +test(2267.26, options = c(datatable.verbose=TRUE), forderv(d, c("a","b"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2267.27, options = c(datatable.verbose=TRUE), forderv(d, c("b","a")), integer(), output="forder.*opt=1.*took") +test(2267.28, options = c(datatable.verbose=TRUE), forderv(d, c("b","a"), lazy=FALSE), integer(), output="forder.*opt=0.*took") d = copy(dd) setindexv(d, c("a","b")) -options(datatable.verbose=TRUE) -test(2267.31, forderv(d, c("a","b")), integer(), output="forder.*opt=2.*took") -test(2267.32, forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") -test(2267.33, forderv(d, c("b","a")), 2:1, output="forder.*opt=-1.*took") -test(2267.34, forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") -options(datatable.verbose=FALSE) +test(2267.31, options = c(datatable.verbose=TRUE), forderv(d, c("a","b")), integer(), output="forder.*opt=2.*took") +test(2267.32, options = c(datatable.verbose=TRUE), forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2267.33, options = c(datatable.verbose=TRUE), forderv(d, c("b","a")), 2:1, output="forder.*opt=-1.*took") +test(2267.34, options = c(datatable.verbose=TRUE), forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") d = copy(dd) setindexv(d, c("b","a")) -options(datatable.verbose=TRUE) -test(2267.35, forderv(d, c("a","b")), integer(), output="forder.*opt=-1.*took") -test(2267.36, forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") -test(2267.37, forderv(d, c("b","a")), 2:1, output="forder.*opt=2.*took") -test(2267.38, forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") -options(datatable.verbose=FALSE) +test(2267.35, options = c(datatable.verbose=TRUE), forderv(d, c("a","b")), integer(), output="forder.*opt=-1.*took") +test(2267.36, options = c(datatable.verbose=TRUE), forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2267.37, options = c(datatable.verbose=TRUE), forderv(d, c("b","a")), 2:1, output="forder.*opt=2.*took") +test(2267.38, options = c(datatable.verbose=TRUE), forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") d = copy(dd) setindexv(d, list(c("a","b"), c("b","a"))) -options(datatable.verbose=TRUE) -test(2267.41, forderv(d, c("a","b")), integer(), output="forder.*opt=2.*took") -test(2267.42, forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") -test(2267.43, forderv(d, c("b","a")), 2:1, output="forder.*opt=2.*took") -test(2267.44, forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") -options(datatable.verbose=FALSE) +test(2267.41, options = c(datatable.verbose=TRUE), forderv(d, c("a","b")), integer(), output="forder.*opt=2.*took") +test(2267.42, options = c(datatable.verbose=TRUE), forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2267.43, options = c(datatable.verbose=TRUE), forderv(d, c("b","a")), 2:1, output="forder.*opt=2.*took") +test(2267.44, options = c(datatable.verbose=TRUE), forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") d = copy(dd) setkeyv(d, c("a","b")) setindexv(d, list(c("a","b"), c("b","a"))) -options(datatable.verbose=TRUE) -test(2267.51, forderv(d, c("a","b")), integer(), output="forder.*opt=1.*took", notOutput="forder.*opt=2.*took") # idxOpt is not reached -test(2267.52, forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") -test(2267.53, forderv(d, c("b","a")), 2:1, output="forder.*opt=2.*took") -test(2267.54, forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") -options(datatable.verbose=FALSE) +test(2267.51, options = c(datatable.verbose=TRUE), forderv(d, c("a","b")), integer(), output="forder.*opt=1.*took", notOutput="forder.*opt=2.*took") # idxOpt is not reached +test(2267.52, options = c(datatable.verbose=TRUE), forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2267.53, options = c(datatable.verbose=TRUE), forderv(d, c("b","a")), 2:1, output="forder.*opt=2.*took") +test(2267.54, options = c(datatable.verbose=TRUE), forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") d = copy(dd) setindexv(d, list(c("a","b"), c("b","a"))) -options(datatable.verbose=TRUE) -test(2267.55, forderv(d, c("a","b")), integer(), output="forder.*opt=2.*took", notOutput="forder.*opt=1.*took") -test(2267.56, forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") -options(datatable.verbose=FALSE) +test(2267.55, options = c(datatable.verbose=TRUE), forderv(d, c("a","b")), integer(), output="forder.*opt=2.*took", notOutput="forder.*opt=1.*took") +test(2267.56, options = c(datatable.verbose=TRUE), forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") d = copy(dd) setkeyv(d, c("a","b")) setindexv(d, list(c("a","b"), c("b","a"))) -options(datatable.verbose=TRUE) ab = structure(integer(), starts=1:2, maxgrpn=1L, anyna=0L, anyinfnan=0L, anynotascii=0L, anynotutf8=0L) ba = structure(2:1, starts=1:2, maxgrpn=1L, anyna=0L, anyinfnan=0L, anynotascii=0L, anynotutf8=0L) -test(2267.60, forderv(d, c("a","b")), c(ab), output="forder.*opt=1.*took") -test(2267.61, forderv(d, c("a","b"), retGrp=TRUE), ab, output="forder.*opt=2.*took") -test(2267.62, forderv(d, c("a","b"), retGrp=TRUE, lazy=FALSE), ab, output="forder.*opt=0.*took") -test(2267.63, forderv(d, c("b","a"), retGrp=TRUE), ba, output="forder.*opt=2.*took") -test(2267.64, forderv(d, c("b","a"), retGrp=TRUE, lazy=FALSE), ba, output="forder.*opt=0.*took") -test(2267.65, forderv(d, c("a","b"), na.last=TRUE), integer(), output="forder.*opt=2.*took") # via anyna index attribute -test(2267.66, forderv(d, c("a","b"), na.last=TRUE, lazy=FALSE), integer(), output="forder.*opt=0.*took") -test(2267.67, forderv(d, c("b","a"), na.last=TRUE), 2:1, output="forder.*opt=2.*took") # via anyna index attribute -test(2267.68, forderv(d, c("b","a"), na.last=TRUE, lazy=FALSE), 2:1, output="forder.*opt=0.*took") -test(2267.69, forderv(d, c("a","b"), sort=FALSE, retGrp=TRUE), ab, output="forder.*opt=0.*took") -test(2267.70, forderv(d, c("a","b"), sort=FALSE, retGrp=TRUE, lazy=FALSE), ab, output="forder.*opt=0.*took") -test(2267.71, forderv(d, c("b","a"), sort=FALSE, retGrp=TRUE), ab, output="forder.*opt=0.*took") -test(2267.72, forderv(d, c("b","a"), sort=FALSE, retGrp=TRUE, lazy=FALSE), ab, output="forder.*opt=0.*took") -test(2267.73, forderv(d, c("a","b"), order=-1L), 2:1, output="forder.*opt=0.*took") -test(2267.74, forderv(d, c("a","b"), order=-1L, lazy=FALSE), 2:1, output="forder.*opt=0.*took") -test(2267.75, forderv(d, c("b","a"), order=-1L), integer(), output="forder.*opt=0.*took") -test(2267.76, forderv(d, c("b","a"), order=-1L, lazy=FALSE), integer(), output="forder.*opt=0.*took") -test(2267.77, forderv(d, c("a","b"), order=c(1L,-1L)), integer(), output="forder.*opt=0.*took") -test(2267.78, forderv(d, c("a","b"), order=c(1L,-1L), lazy=FALSE), integer(), output="forder.*opt=0.*took") -test(2267.79, forderv(d, c("b","a"), order=c(1L,-1L)), 2:1, output="forder.*opt=0.*took") -test(2267.80, forderv(d, c("b","a"), order=c(1L,-1L), lazy=FALSE), 2:1, output="forder.*opt=0.*took") -test(2267.81, forderv(1:2), integer(), output="forder.*opt=0.*took") -test(2267.82, forderv(1:2, lazy=FALSE), integer(), output="forder.*opt=0.*took") -test(2267.83, forderv(2:1), 2:1, output="forder.*opt=0.*took") -test(2267.84, forderv(2:1, lazy=FALSE), 2:1, output="forder.*opt=0.*took") -options(datatable.verbose=FALSE) +test(2267.60, options = c(datatable.verbose=TRUE), forderv(d, c("a","b")), c(ab), output="forder.*opt=1.*took") +test(2267.61, options = c(datatable.verbose=TRUE), forderv(d, c("a","b"), retGrp=TRUE), ab, output="forder.*opt=2.*took") +test(2267.62, options = c(datatable.verbose=TRUE), forderv(d, c("a","b"), retGrp=TRUE, lazy=FALSE), ab, output="forder.*opt=0.*took") +test(2267.63, options = c(datatable.verbose=TRUE), forderv(d, c("b","a"), retGrp=TRUE), ba, output="forder.*opt=2.*took") +test(2267.64, options = c(datatable.verbose=TRUE), forderv(d, c("b","a"), retGrp=TRUE, lazy=FALSE), ba, output="forder.*opt=0.*took") +test(2267.65, options = c(datatable.verbose=TRUE), forderv(d, c("a","b"), na.last=TRUE), integer(), output="forder.*opt=2.*took") # via anyna index attribute +test(2267.66, options = c(datatable.verbose=TRUE), forderv(d, c("a","b"), na.last=TRUE, lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2267.67, options = c(datatable.verbose=TRUE), forderv(d, c("b","a"), na.last=TRUE), 2:1, output="forder.*opt=2.*took") # via anyna index attribute +test(2267.68, options = c(datatable.verbose=TRUE), forderv(d, c("b","a"), na.last=TRUE, lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2267.69, options = c(datatable.verbose=TRUE), forderv(d, c("a","b"), sort=FALSE, retGrp=TRUE), ab, output="forder.*opt=0.*took") +test(2267.70, options = c(datatable.verbose=TRUE), forderv(d, c("a","b"), sort=FALSE, retGrp=TRUE, lazy=FALSE), ab, output="forder.*opt=0.*took") +test(2267.71, options = c(datatable.verbose=TRUE), forderv(d, c("b","a"), sort=FALSE, retGrp=TRUE), ab, output="forder.*opt=0.*took") +test(2267.72, options = c(datatable.verbose=TRUE), forderv(d, c("b","a"), sort=FALSE, retGrp=TRUE, lazy=FALSE), ab, output="forder.*opt=0.*took") +test(2267.73, options = c(datatable.verbose=TRUE), forderv(d, c("a","b"), order=-1L), 2:1, output="forder.*opt=0.*took") +test(2267.74, options = c(datatable.verbose=TRUE), forderv(d, c("a","b"), order=-1L, lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2267.75, options = c(datatable.verbose=TRUE), forderv(d, c("b","a"), order=-1L), integer(), output="forder.*opt=0.*took") +test(2267.76, options = c(datatable.verbose=TRUE), forderv(d, c("b","a"), order=-1L, lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2267.77, options = c(datatable.verbose=TRUE), forderv(d, c("a","b"), order=c(1L,-1L)), integer(), output="forder.*opt=0.*took") +test(2267.78, options = c(datatable.verbose=TRUE), forderv(d, c("a","b"), order=c(1L,-1L), lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2267.79, options = c(datatable.verbose=TRUE), forderv(d, c("b","a"), order=c(1L,-1L)), 2:1, output="forder.*opt=0.*took") +test(2267.80, options = c(datatable.verbose=TRUE), forderv(d, c("b","a"), order=c(1L,-1L), lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2267.81, options = c(datatable.verbose=TRUE), forderv(1:2), integer(), output="forder.*opt=0.*took") +test(2267.82, options = c(datatable.verbose=TRUE), forderv(1:2, lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2267.83, options = c(datatable.verbose=TRUE), forderv(2:1), 2:1, output="forder.*opt=0.*took") +test(2267.84, options = c(datatable.verbose=TRUE), forderv(2:1, lazy=FALSE), 2:1, output="forder.*opt=0.*took") test(2267.851, forderv(2:1, retStats=NA), error="retStats must be TRUE or FALSE") test(2267.852, forderv(2:1, retGrp=TRUE, retStats=FALSE), error="retStats must be TRUE whenever retGrp is TRUE") ddd = data.table(v1=1:3, v2=c(1L,NA,3L), v3=c(3:2,NaN), v4=c(1:2,Inf), v5=c(-Inf,NA,3)) ## tests for NAs and na.last arg d = copy(ddd) -invisible(d[v1==1L, verbose=FALSE]) -options(datatable.verbose=TRUE) -test(2267.853, forderv(d, "v1", retGrp=TRUE, retStats=TRUE), output="index found but not for retGrp and retStats") -test(2267.854, o<-forderv(d, "v1", retStats=TRUE), output="index found but not for retStats") +invisible(d[v1==1L]) +test(2267.853, options = c(datatable.verbose=TRUE), forderv(d, "v1", retGrp=TRUE, retStats=TRUE), output="index found but not for retGrp and retStats") +test(2267.854, options = c(datatable.verbose=TRUE), o<-forderv(d, "v1", retStats=TRUE), output="index found but not for retStats") setattr(d, "index", setattr(integer(), "__v1", o)) -test(2267.855, forderv(d, "v1", retGrp=TRUE), output="index found but not for retGrp") -test(2267.856, forderv(d, "v1", na.last=TRUE), integer(), output="forder.*opt=2.*took") -test(2267.857, forderv(d, "v1", retStats=TRUE, na.last=TRUE), structure(integer(), anyna=0L, anyinfnan=0L, anynotascii=0L, anynotutf8=0L), output="forder.*opt=2.*took") +test(2267.855, options = c(datatable.verbose=TRUE), forderv(d, "v1", retGrp=TRUE), output="index found but not for retGrp") +test(2267.856, options = c(datatable.verbose=TRUE), forderv(d, "v1", na.last=TRUE), integer(), output="forder.*opt=2.*took") +test(2267.857, options = c(datatable.verbose=TRUE), forderv(d, "v1", retStats=TRUE, na.last=TRUE), structure(integer(), anyna=0L, anyinfnan=0L, anynotascii=0L, anynotutf8=0L), output="forder.*opt=2.*took") d = copy(ddd) -invisible(d[v1==1L, verbose=FALSE]) -test(2267.858, forderv(d, "v1", retGrp=TRUE, retStats=TRUE, na.last=TRUE), output="index found but na.last=TRUE and no stats available") -setindexv(d, "v2", verbose=FALSE) -test(2267.859, forderv(d, "v2", retGrp=TRUE, retStats=TRUE, na.last=TRUE), output="index found but na.last=TRUE and NAs present") -options(datatable.verbose=FALSE) +invisible(d[v1==1L]) +test(2267.858, options = c(datatable.verbose=TRUE), forderv(d, "v1", retGrp=TRUE, retStats=TRUE, na.last=TRUE), output="index found but na.last=TRUE and no stats available") +setindexv(d, "v2") +test(2267.859, options = c(datatable.verbose=TRUE), forderv(d, "v2", retGrp=TRUE, retStats=TRUE, na.last=TRUE), output="index found but na.last=TRUE and NAs present") d = copy(ddd) setkeyv(d, "v1") setindexv(d, list("v2","v3","v4","v5",c("v1","v2"),c("v1","v3"),c("v2","v3"),c("v1","v4"),c("v1","v5"),c("v1","v4","v5"))) -options(datatable.verbose=TRUE) -test(2267.861, forderv(d, "v1"), integer(), output="forder.*opt=1.*took") -test(2267.862, forderv(d, "v1", na.last=TRUE), integer(), output="forder.*opt=-1.*took") ## cannot use key for na.last -setindexv(d, "v1", verbose=FALSE) -test(2267.863, forderv(d, "v1", na.last=TRUE), integer(), output="forder.*opt=2.*took") -test(2267.864, forderv(d, c("v1","v2"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") -test(2267.865, forderv(d, c("v1","v2"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") # same but testing another msg -test(2267.866, forderv(d, c("v1","v3"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") -test(2267.867, forderv(d, c("v1","v3"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") -test(2267.868, forderv(d, c("v1","v4"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") -test(2267.869, forderv(d, c("v1","v4"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") -test(2267.870, forderv(d, c("v1","v5"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") -test(2267.871, forderv(d, c("v1","v5"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") # same but testing another msg -test(2267.872, forderv(d, c("v1","v4","v5"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") -test(2267.873, forderv(d, c("v1","v4","v5"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") # same but testing another msg -options(datatable.verbose=FALSE) +test(2267.861, options = c(datatable.verbose=TRUE), forderv(d, "v1"), integer(), output="forder.*opt=1.*took") +test(2267.862, options = c(datatable.verbose=TRUE), forderv(d, "v1", na.last=TRUE), integer(), output="forder.*opt=-1.*took") ## cannot use key for na.last +setindexv(d, "v1") +test(2267.863, options = c(datatable.verbose=TRUE), forderv(d, "v1", na.last=TRUE), integer(), output="forder.*opt=2.*took") +test(2267.864, options = c(datatable.verbose=TRUE), forderv(d, c("v1","v2"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") +test(2267.865, options = c(datatable.verbose=TRUE), forderv(d, c("v1","v2"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") # same but testing another msg +test(2267.866, options = c(datatable.verbose=TRUE), forderv(d, c("v1","v3"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") +test(2267.867, options = c(datatable.verbose=TRUE), forderv(d, c("v1","v3"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") +test(2267.868, options = c(datatable.verbose=TRUE), forderv(d, c("v1","v4"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") +test(2267.869, options = c(datatable.verbose=TRUE), forderv(d, c("v1","v4"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") +test(2267.870, options = c(datatable.verbose=TRUE), forderv(d, c("v1","v5"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") +test(2267.871, options = c(datatable.verbose=TRUE), forderv(d, c("v1","v5"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") # same but testing another msg +test(2267.872, options = c(datatable.verbose=TRUE), forderv(d, c("v1","v4","v5"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") +test(2267.873, options = c(datatable.verbose=TRUE), forderv(d, c("v1","v4","v5"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") # same but testing another msg d = fread(testDir("1680-fread-header-encoding.csv"), encoding="Latin-1") ## re-use some existing non utf8 data anyEnc = function(x) unlist(attributes(forderv(x, retStats=TRUE))[c("anynotascii","anynotutf8")]) test(2267.881, anyEnc(d), c(anynotascii=1L,anynotutf8=1L)) @@ -18860,29 +18832,30 @@ test(2267.882, anyEnc(d[,-2L]), c(anynotascii=0L,anynotutf8=0L)) test(2267.883, anyEnc(c("a","b","\u221A")), c(anynotascii=1L,anynotutf8=0L)) d = copy(dd) setindexv(d, "b") -options(datatable.verbose=TRUE) -op2 = options(datatable.use.index=FALSE) -test(2267.91, forderv(d, "b"), 2:1, output="forder.*opt=-1.*took") -test(2267.92, forderv(d, "b", lazy=FALSE), 2:1, output="forder.*opt=0.*took") -options(op2) -options(datatable.verbose=FALSE) +test(2267.91, options = c(datatable.verbose=TRUE, datatable.use.index=FALSE), + forderv(d, "b"), 2:1, output="forder.*opt=-1.*took") +test(2267.92, options = c(datatable.verbose=TRUE, datatable.use.index=FALSE), + forderv(d, "b", lazy=FALSE), 2:1, output="forder.*opt=0.*took") d = data.table(x = 2:1) make_retGrp0_idx = d[x==1L] test(2267.93, attr(attr(d, "index"), "__x"), 2:1) -options(datatable.verbose=TRUE) -test(2267.94, forderv(d, "x", retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1L, anyna=0L, anyinfnan=0L, anynotascii=0L, anynotutf8=0L), output="forder.*index found but not for retGrp and retStats.*forder.*opt=-1.*took") +test(2267.94, options = c(datatable.verbose=TRUE), forderv(d, "x", retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1L, anyna=0L, anyinfnan=0L, anynotascii=0L, anynotutf8=0L), output="forder.*index found but not for retGrp and retStats.*forder.*opt=-1.*took") d = data.table(x = 2:1) -op2 = options("datatable.forder.auto.index"=TRUE) -test(2267.95, d[x==1L], data.table(x=1L), output="forder.*setting index.*retGrp=0, retStats=0") -test(2267.96, forderv(d, "x", retGrp=TRUE), output="forder.*setting index.*retGrp=1, retStats=1") -setindexv(d, NULL, verbose=FALSE) -test(2267.971, forderv(d, "x", retStats=TRUE), output="forder.*setting index.*retGrp=0, retStats=1") -setindexv(d, NULL, verbose=FALSE) -test(2267.972, forderv(d, "x", retStats=TRUE, na.last=TRUE), output="forder.*setting index.*retGrp=0, retStats=1") -setindexv(d, NULL, verbose=FALSE) -test(2267.973, forderv(data.table(x=c(2:1,NA)), "x", retStats=TRUE, na.last=TRUE), notOutput="forder.*setting index.*retGrp=0, retStats=1") -test(2267.974, forderv(data.table(x=c(2:1,NaN)), "x", retStats=TRUE, na.last=TRUE), notOutput="forder.*setting index.*retGrp=0, retStats=1") -test(2267.975, forderv(d, "x", na.last=TRUE), notOutput="forder.*setting index.*retGrp=0, retStats=1") -options(op2) +test(2267.95, options = c(datatable.verbose=TRUE, datatable.forder.auto.index=TRUE), + d[x==1L], data.table(x=1L), output="forder.*setting index.*retGrp=0, retStats=0") +test(2267.96, options = c(datatable.verbose=TRUE, datatable.forder.auto.index=TRUE), + forderv(d, "x", retGrp=TRUE), output="forder.*setting index.*retGrp=1, retStats=1") +setindexv(d, NULL) +test(2267.971, options = c(datatable.verbose=TRUE, datatable.forder.auto.index=TRUE), + forderv(d, "x", retStats=TRUE), output="forder.*setting index.*retGrp=0, retStats=1") +setindexv(d, NULL) +test(2267.972, options = c(datatable.verbose=TRUE, datatable.forder.auto.index=TRUE), + forderv(d, "x", retStats=TRUE, na.last=TRUE), output="forder.*setting index.*retGrp=0, retStats=1") +setindexv(d, NULL) +test(2267.973, options = c(datatable.verbose=TRUE, datatable.forder.auto.index=TRUE), + forderv(data.table(x=c(2:1,NA)), "x", retStats=TRUE, na.last=TRUE), notOutput="forder.*setting index.*retGrp=0, retStats=1") +test(2267.974, options = c(datatable.verbose=TRUE, datatable.forder.auto.index=TRUE), + forderv(data.table(x=c(2:1,NaN)), "x", retStats=TRUE, na.last=TRUE), notOutput="forder.*setting index.*retGrp=0, retStats=1") +test(2267.975, options = c(datatable.verbose=TRUE, datatable.forder.auto.index=TRUE), + forderv(d, "x", na.last=TRUE), notOutput="forder.*setting index.*retGrp=0, retStats=1") test(2267.99, forderv(data.table(a=1), lazy=c(TRUE, TRUE)), error="lazy must be") -options(op) From 4184fe91b3b7307c93f5e1161693b7086635ff10 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 11 Jul 2024 23:28:36 +0000 Subject: [PATCH 38/53] more feedback --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index eb8b5398f1..53064c78cd 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18767,7 +18767,7 @@ setkeyv(d, c("a","b")) setindexv(d, list(c("a","b"), c("b","a"))) ab = structure(integer(), starts=1:2, maxgrpn=1L, anyna=0L, anyinfnan=0L, anynotascii=0L, anynotutf8=0L) ba = structure(2:1, starts=1:2, maxgrpn=1L, anyna=0L, anyinfnan=0L, anynotascii=0L, anynotutf8=0L) -test(2267.60, options = c(datatable.verbose=TRUE), forderv(d, c("a","b")), c(ab), output="forder.*opt=1.*took") +test(2267.60, options = c(datatable.verbose=TRUE), forderv(d, c("a","b")), c(ab), output="forder.*opt=1.*took") # c(): strip attributes test(2267.61, options = c(datatable.verbose=TRUE), forderv(d, c("a","b"), retGrp=TRUE), ab, output="forder.*opt=2.*took") test(2267.62, options = c(datatable.verbose=TRUE), forderv(d, c("a","b"), retGrp=TRUE, lazy=FALSE), ab, output="forder.*opt=0.*took") test(2267.63, options = c(datatable.verbose=TRUE), forderv(d, c("b","a"), retGrp=TRUE), ba, output="forder.*opt=2.*took") From 0588d4d1f3cc2c2bb5b6df452c854aaa379c9f17 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 12 Jul 2024 16:21:39 -0700 Subject: [PATCH 39/53] Rename forderLazy->forderMaybePresorted --- R/setkey.R | 4 ++-- src/bmerge.c | 2 +- src/data.table.h | 2 +- src/forder.c | 22 +++++++++++----------- src/init.c | 2 +- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/R/setkey.R b/R/setkey.R index 4638846e57..d4ec9e3b71 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -71,7 +71,7 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU } else { o = forderv(x, cols, sort=TRUE, retGrp=!physical, lazy=TRUE) } - if (!physical) { # index COULD BE saved from C forderLazy already, but disabled for now + if (!physical) { # index COULD BE saved from C forderMaybePresorted already, but disabled for now if (!isTRUE(getOption("datatable.forder.auto.index"))) { if (is.null(attr(x, "index", exact=TRUE))) setattr(x, "index", integer()) setattr(attr(x, "index", exact=TRUE), paste0("__", cols, collapse=""), o) @@ -153,7 +153,7 @@ forderv = function(x, by=seq_along(x), retGrp=FALSE, retStats=retGrp, sort=TRUE, by = colnamesInt(x, by, check_dups=FALSE) } order = as.integer(order) # length and contents of order being +1/-1 is checked at C level - .Call(CforderLazy, x, by, retGrp, retStats, sort, order, na.last, lazy) # returns integer() if already sorted, regardless of sort=TRUE|FALSE + .Call(CforderMaybePresorted, x, by, retGrp, retStats, sort, order, na.last, lazy) # returns integer() if already sorted, regardless of sort=TRUE|FALSE } forder = function(..., na.last=TRUE, decreasing=FALSE) diff --git a/src/bmerge.c b/src/bmerge.c index 4765ed42aa..c58b095c68 100644 --- a/src/bmerge.c +++ b/src/bmerge.c @@ -164,7 +164,7 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP r allGrp1[0] = TRUE; protecti += 2; - SEXP oSxp = PROTECT(forderLazy(idt, icolsArg, /* retGrpArg= */ScalarLogical(FALSE), /* retStatsArg= */ScalarLogical(FALSE), /* sortGroupsArg= */ScalarLogical(TRUE), /* ascArg= */ScalarInteger(1), /* naArg= */ScalarLogical(FALSE), /* lazyArg= */ScalarLogical(TRUE))); protecti++; + SEXP oSxp = PROTECT(forderMaybePresorted(idt, icolsArg, /* retGrpArg= */ScalarLogical(FALSE), /* retStatsArg= */ScalarLogical(FALSE), /* sortGroupsArg= */ScalarLogical(TRUE), /* ascArg= */ScalarInteger(1), /* naArg= */ScalarLogical(FALSE), /* lazyArg= */ScalarLogical(TRUE))); protecti++; if (!LENGTH(oSxp)) o = NULL; else diff --git a/src/data.table.h b/src/data.table.h index b5b5af812f..478d9d9676 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -142,7 +142,7 @@ int checkOverAlloc(SEXP x); int StrCmp(SEXP x, SEXP y); uint64_t dtwiddle(double x); SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsArg, SEXP ascArg, SEXP naArg); -SEXP forderLazy(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsArg, SEXP ascArg, SEXP naArg, SEXP lazyArg); // lazy wrapper to forder +SEXP forderMaybePresorted(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsArg, SEXP ascArg, SEXP naArg, SEXP lazyArg); // lazy wrapper to forder int getNumericRounding_C(void); // reorder.c diff --git a/src/forder.c b/src/forder.c index acc0b60c70..67944830ce 100644 --- a/src/forder.c +++ b/src/forder.c @@ -1575,7 +1575,7 @@ bool idxAnyNF(SEXP idx) { } // lazy forder, re-use existing key or index if possible, otherwise call forder -SEXP forderLazy(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsArg, SEXP ascArg, SEXP naArg, SEXP lazyArg) { +SEXP forderMaybePresorted(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsArg, SEXP ascArg, SEXP naArg, SEXP lazyArg) { const bool verbose = GetVerbose(); int protecti = 0; double tic=0.0; @@ -1612,7 +1612,7 @@ SEXP forderLazy(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGro opt = -1; } else { if (verbose) - Rprintf("forderLazy: opt not possible: is.data.table(DT)=%d, sortGroups=%d, all1(ascArg)=%d\n", INHERITS(DT,char_datatable), sortGroups, all1(ascArg)); + Rprintf("forderMaybePresorted: opt not possible: is.data.table(DT)=%d, sortGroups=%d, all1(ascArg)=%d\n", INHERITS(DT,char_datatable), sortGroups, all1(ascArg)); opt = 0; } } else if (lazy) { @@ -1631,7 +1631,7 @@ SEXP forderLazy(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGro opt = 1; // keyOpt ans = PROTECT(allocVector(INTSXP, 0)); protecti++; if (verbose) - Rprintf("forderLazy: using key: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); + Rprintf("forderMaybePresorted: using key: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); } if (opt == -1 && GetUseIndex()) { SEXP idx = getIndex(DT, by); @@ -1666,23 +1666,23 @@ SEXP forderLazy(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGro opt = 2; // idxOpt but need to drop groups or stats } else if (!hasGrp && retGrp && !hasStats && retStats) { if (verbose) - Rprintf("forderLazy: index found but not for retGrp and retStats: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); + Rprintf("forderMaybePresorted: index found but not for retGrp and retStats: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); } else if (!hasGrp && retGrp) { if (verbose) - Rprintf("forderLazy: index found but not for retGrp: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); + Rprintf("forderMaybePresorted: index found but not for retGrp: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); } else if (!hasStats && retStats) { if (verbose) - Rprintf("forderLazy: index found but not for retStats: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); + Rprintf("forderMaybePresorted: index found but not for retStats: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); } else { error("internal error: lazy forder index optimization unhandled branch of retGrp-retStats, please report to issue tracker"); // # nocov } } else { if (!hasStats) { if (verbose) - Rprintf("forderLazy: index found but na.last=TRUE and no stats available: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); + Rprintf("forderMaybePresorted: index found but na.last=TRUE and no stats available: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); } else if (idxAnyNF(idx)) { if (verbose) - Rprintf("forderLazy: index found but na.last=TRUE and NAs present: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); + Rprintf("forderMaybePresorted: index found but na.last=TRUE and NAs present: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); } else { error("internal error: lazy forder index optimization unhandled branch of last.na=T, please report to issue tracker"); // # nocov } @@ -1690,7 +1690,7 @@ SEXP forderLazy(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGro if (opt == 2) { ans = idx; if (verbose) - Rprintf("forderLazy: using existing index: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); + Rprintf("forderMaybePresorted: using existing index: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); } } } @@ -1702,11 +1702,11 @@ SEXP forderLazy(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGro GetAutoIndex()) { // disabled by default, use datatable.forder.auto.index=T to enable, do not export/document, use for debugging only putIndex(DT, by, ans); if (verbose) - Rprintf("forderLazy: setting index (retGrp=%d, retStats=%d) on DT: %s\n", retGrp, retStats, CHAR(STRING_ELT(idxName(DT, by), 0))); + Rprintf("forderMaybePresorted: setting index (retGrp=%d, retStats=%d) on DT: %s\n", retGrp, retStats, CHAR(STRING_ELT(idxName(DT, by), 0))); } } if (verbose) - Rprintf("forderLazy: opt=%d, took %.3fs\n", opt, omp_get_wtime()-tic); + Rprintf("forderMaybePresorted: opt=%d, took %.3fs\n", opt, omp_get_wtime()-tic); UNPROTECT(protecti); return ans; } diff --git a/src/init.c b/src/init.c index 42f2ae2d4e..32870a7b89 100644 --- a/src/init.c +++ b/src/init.c @@ -76,7 +76,7 @@ R_CallMethodDef callMethods[] = { {"Cfcast", (DL_FUNC) &fcast, -1}, {"Cuniqlist", (DL_FUNC) &uniqlist, -1}, {"Cuniqlengths", (DL_FUNC) &uniqlengths, -1}, -{"CforderLazy", (DL_FUNC) &forderLazy, -1}, +{"CforderMaybePresorted", (DL_FUNC) &forderMaybePresorted, -1}, {"Cforder", (DL_FUNC) &forder, -1}, {"Cissorted", (DL_FUNC) &issorted, -1}, {"Cgforce", (DL_FUNC) &gforce, -1}, From 8b3a80a193e03ad6b1da53d0f1872e75ec03edfb Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 12 Jul 2024 16:25:27 -0700 Subject: [PATCH 40/53] UNPROTECT() more aggressively --- src/forder.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/forder.c b/src/forder.c index 67944830ce..887bdfdc77 100644 --- a/src/forder.c +++ b/src/forder.c @@ -458,10 +458,11 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsA if (LENGTH(ascArg) != LENGTH(by)) { if (LENGTH(ascArg)!=1) STOP(_("'order' length (%d) is different to by='s length (%d)"), LENGTH(ascArg), LENGTH(by)); - SEXP recycleAscArg = PROTECT(allocVector(INTSXP, LENGTH(by))); n_protect++; + SEXP recycleAscArg = PROTECT(allocVector(INTSXP, LENGTH(by))); for (int j=0; j Date: Fri, 12 Jul 2024 16:31:48 -0700 Subject: [PATCH 41/53] maybe_reset_index() helper --- R/data.table.R | 5 +---- R/setkey.R | 12 ++++++++---- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 75d7f55fb8..a0d7819786 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -3238,10 +3238,7 @@ is_constantish = function(q, check_singleton=FALSE) { if (verbose) {catf("Creating new index '%s'\n", paste(idxCols, collapse = "__"));flush.console()} if (verbose) {last.started.at=proc.time();catf("Creating index %s done in ...", paste(idxCols, collapse = "__"));flush.console()} idx = forderv(x, idxCols, sort=TRUE, retGrp=FALSE, lazy=TRUE) - if (!isTRUE(getOption("datatable.forder.auto.index"))) { ## forder can write index, but disabled for now, see #4386 - if (is.null(attr(x, "index", exact=TRUE))) setattr(x, "index", integer()) - setattr(attr(x, "index", exact=TRUE), paste0("__", idxCols, collapse=""), idx) - } + maybe_reset_index(x, idxCols, idx) ## forder can write index, but disabled for now, see #4386 if (verbose) {cat(timetaken(last.started.at),"\n");flush.console()} if (verbose) {catf("Optimized subsetting with index '%s'\n", paste(idxCols, collapse = "__"));flush.console()} } diff --git a/R/setkey.R b/R/setkey.R index d4ec9e3b71..c7ee8b0bf4 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -72,10 +72,7 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU o = forderv(x, cols, sort=TRUE, retGrp=!physical, lazy=TRUE) } if (!physical) { # index COULD BE saved from C forderMaybePresorted already, but disabled for now - if (!isTRUE(getOption("datatable.forder.auto.index"))) { - if (is.null(attr(x, "index", exact=TRUE))) setattr(x, "index", integer()) - setattr(attr(x, "index", exact=TRUE), paste0("__", cols, collapse=""), o) - } + maybe_reset_index(x, cols, o) return(invisible(x)) } if (length(o)) { @@ -143,6 +140,13 @@ is.sorted = function(x, by=NULL) { # Return value of TRUE/FALSE is relied on in [.data.table quite a bit on vectors. Simple. Stick with that (rather than -1/0/+1) } +maybe_reset_index = function(x, idx, cols) { + if (isTRUE(getOption("datatable.forder.auto.index"))) return(invisible()) + if (is.null(attr(x, "index", exact=TRUE))) setattr(x, "index", integer()) + setattr(attr(x, "index", exact=TRUE), paste0("__", cols, collapse=""), idx) + invisible(x) +} + ORDERING_TYPES = c('logical', 'integer', 'double', 'complex', 'character') forderv = function(x, by=seq_along(x), retGrp=FALSE, retStats=retGrp, sort=TRUE, order=1L, na.last=FALSE, lazy=getOption("datatable.forder.lazy",NA)) { if (is.atomic(x) || is.null(x)) { # including forderv(NULL) which returns error consistent with base::order(NULL), From 3e898e98bc6ec5d77bbf3dead2f51a0d16081202 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 15 Jul 2024 01:40:27 +0000 Subject: [PATCH 42/53] Strict prototyping (-Wstrict-prototypes) --- src/forder.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/forder.c b/src/forder.c index acc0b60c70..fb31943371 100644 --- a/src/forder.c +++ b/src/forder.c @@ -1549,7 +1549,7 @@ void putIndex(SEXP x, SEXP cols, SEXP o) { } // isTRUE(getOption("datatable.use.index")) -bool GetUseIndex() { +bool GetUseIndex(void) { SEXP opt = GetOption(install("datatable.use.index"), R_NilValue); if (!IS_TRUE_OR_FALSE(opt)) error("'datatable.use.index' option must be TRUE or FALSE"); // # nocov @@ -1557,7 +1557,7 @@ bool GetUseIndex() { } // isTRUE(getOption("datatable.auto.index")) -bool GetAutoIndex() { +bool GetAutoIndex(void) { // for now temporarily 'forder.auto.index' not 'auto.index' to disabled it by default // because it writes attr on .SD which is re-used by all groups leading to incorrect results // DT[, .(uN=uniqueN(.SD)), by=A] From d8adf3c8533b356ddb763ae2982f28005ff3c39f Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 15 Jul 2024 04:45:28 +0000 Subject: [PATCH 43/53] fix sloppy refactor for maybe_reset_index() --- R/data.table.R | 2 +- R/setkey.R | 2 +- inst/tests/tests.Rraw | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index fbea8de165..354777d06c 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -3243,7 +3243,7 @@ is_constantish = function(q, check_singleton=FALSE) { if (verbose) {catf("Creating new index '%s'\n", paste(idxCols, collapse = "__"));flush.console()} if (verbose) {last.started.at=proc.time();catf("Creating index %s done in ...", paste(idxCols, collapse = "__"));flush.console()} idx = forderv(x, idxCols, sort=TRUE, retGrp=FALSE, lazy=TRUE) - maybe_reset_index(x, idxCols, idx) ## forder can write index, but disabled for now, see #4386 + maybe_reset_index(x, idx, idxCols) ## forder can write index, but disabled for now, see #4386 if (verbose) {cat(timetaken(last.started.at),"\n");flush.console()} if (verbose) {catf("Optimized subsetting with index '%s'\n", paste(idxCols, collapse = "__"));flush.console()} } diff --git a/R/setkey.R b/R/setkey.R index c7ee8b0bf4..10efeba1f3 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -72,7 +72,7 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU o = forderv(x, cols, sort=TRUE, retGrp=!physical, lazy=TRUE) } if (!physical) { # index COULD BE saved from C forderMaybePresorted already, but disabled for now - maybe_reset_index(x, cols, o) + maybe_reset_index(x, o, cols) return(invisible(x)) } if (length(o)) { diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 4aeebb696a..67522b70a5 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -5319,7 +5319,8 @@ test(1313.02, DT[, max(y), by=x], DT[, base::max(y), by=x]) test(1313.03, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) test(1313.04, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) # testing all NA - GForce automatically converts to numeric.. optimize=1L errors due to change from integer/numeric (like median) -DT[x==6, y := INT(NA)] +DT[x==6, + y := INT(NA)] test(1313.05, DT[, min(y), by=x], DT[, base::min(y), by=x]) test(1313.06, DT[, max(y), by=x], DT[, base::max(y), by=x]) test(1313.07, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=INT(-1,4,4,4,-2147483647,NA))) From 1d398aa0cb5549ba1aaf99725db039456f528ad8 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 15 Jul 2024 14:38:45 +0000 Subject: [PATCH 44/53] Fix implicit reliance on datatable.optimize --- inst/tests/tests.Rraw | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 67522b70a5..332e9a093e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18824,7 +18824,11 @@ test(2269.851, forderv(2:1, retStats=NA), error="retStats must be TRUE or FALSE" test(2269.852, forderv(2:1, retGrp=TRUE, retStats=FALSE), error="retStats must be TRUE whenever retGrp is TRUE") ddd = data.table(v1=1:3, v2=c(1L,NA,3L), v3=c(3:2,NaN), v4=c(1:2,Inf), v5=c(-Inf,NA,3)) ## tests for NAs and na.last arg d = copy(ddd) -invisible(d[v1==1L]) +invisible(local({ + o = options(datatable.optimize=Inf) + on.exit(options(o)) + d[v1 == 1] # _not_ setindex(d, v1), which will compute retGrp/retStats +})) test(2269.853, options = c(datatable.verbose=TRUE), forderv(d, "v1", retGrp=TRUE, retStats=TRUE), output="index found but not for retGrp and retStats") test(2269.854, options = c(datatable.verbose=TRUE), o<-forderv(d, "v1", retStats=TRUE), output="index found but not for retStats") setattr(d, "index", setattr(integer(), "__v1", o)) @@ -18832,7 +18836,11 @@ test(2269.855, options = c(datatable.verbose=TRUE), forderv(d, "v1", retGrp=TRUE test(2269.856, options = c(datatable.verbose=TRUE), forderv(d, "v1", na.last=TRUE), integer(), output="forder.*opt=2.*took") test(2269.857, options = c(datatable.verbose=TRUE), forderv(d, "v1", retStats=TRUE, na.last=TRUE), structure(integer(), anyna=0L, anyinfnan=0L, anynotascii=0L, anynotutf8=0L), output="forder.*opt=2.*took") d = copy(ddd) -invisible(d[v1==1L]) +invisible(local({ + o = options(datatable.optimize=Inf) + on.exit(options(o)) + d[v1 == 1] # _not_ setindex(d, v1), which will compute retGrp/retStats +})) test(2269.858, options = c(datatable.verbose=TRUE), forderv(d, "v1", retGrp=TRUE, retStats=TRUE, na.last=TRUE), output="index found but na.last=TRUE and no stats available") setindexv(d, "v2") test(2269.859, options = c(datatable.verbose=TRUE), forderv(d, "v2", retGrp=TRUE, retStats=TRUE, na.last=TRUE), output="index found but na.last=TRUE and NAs present") From ac19b839b19746ae8e5a5aaa26cb4201e0d7659a Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 15 Jul 2024 14:48:54 +0000 Subject: [PATCH 45/53] Fix elsewhere, and encapsulate the logic inside a test() --- inst/tests/tests.Rraw | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 332e9a093e..bcb1f6b5f2 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18824,24 +18824,16 @@ test(2269.851, forderv(2:1, retStats=NA), error="retStats must be TRUE or FALSE" test(2269.852, forderv(2:1, retGrp=TRUE, retStats=FALSE), error="retStats must be TRUE whenever retGrp is TRUE") ddd = data.table(v1=1:3, v2=c(1L,NA,3L), v3=c(3:2,NaN), v4=c(1:2,Inf), v5=c(-Inf,NA,3)) ## tests for NAs and na.last arg d = copy(ddd) -invisible(local({ - o = options(datatable.optimize=Inf) - on.exit(options(o)) - d[v1 == 1] # _not_ setindex(d, v1), which will compute retGrp/retStats -})) -test(2269.853, options = c(datatable.verbose=TRUE), forderv(d, "v1", retGrp=TRUE, retStats=TRUE), output="index found but not for retGrp and retStats") +test(2269.8530, options=c(datatable.optimize=Inf), {d[v1 == 1L]; indices(d)}, "v1") +test(2269.8531, options=c(datatable.verbose=TRUE), forderv(d, "v1", retGrp=TRUE, retStats=TRUE), output="index found but not for retGrp and retStats") test(2269.854, options = c(datatable.verbose=TRUE), o<-forderv(d, "v1", retStats=TRUE), output="index found but not for retStats") setattr(d, "index", setattr(integer(), "__v1", o)) test(2269.855, options = c(datatable.verbose=TRUE), forderv(d, "v1", retGrp=TRUE), output="index found but not for retGrp") test(2269.856, options = c(datatable.verbose=TRUE), forderv(d, "v1", na.last=TRUE), integer(), output="forder.*opt=2.*took") test(2269.857, options = c(datatable.verbose=TRUE), forderv(d, "v1", retStats=TRUE, na.last=TRUE), structure(integer(), anyna=0L, anyinfnan=0L, anynotascii=0L, anynotutf8=0L), output="forder.*opt=2.*took") d = copy(ddd) -invisible(local({ - o = options(datatable.optimize=Inf) - on.exit(options(o)) - d[v1 == 1] # _not_ setindex(d, v1), which will compute retGrp/retStats -})) -test(2269.858, options = c(datatable.verbose=TRUE), forderv(d, "v1", retGrp=TRUE, retStats=TRUE, na.last=TRUE), output="index found but na.last=TRUE and no stats available") +test(2269.8580, options=c(datatable.optimize=Inf), {d[v1 == 1L]; indices(d)}, "v1") # _not_ setindex(d, v1), which will compute retGrp/retStats +test(2269.8581, options=c(datatable.verbose=TRUE), forderv(d, "v1", retGrp=TRUE, retStats=TRUE, na.last=TRUE), output="index found but na.last=TRUE and no stats available") setindexv(d, "v2") test(2269.859, options = c(datatable.verbose=TRUE), forderv(d, "v2", retGrp=TRUE, retStats=TRUE, na.last=TRUE), output="index found but na.last=TRUE and NAs present") d = copy(ddd) @@ -18873,11 +18865,10 @@ test(2269.91, options = c(datatable.verbose=TRUE, datatable.use.index=FALSE), test(2269.92, options = c(datatable.verbose=TRUE, datatable.use.index=FALSE), forderv(d, "b", lazy=FALSE), 2:1, output="forder.*opt=0.*took") d = data.table(x = 2:1) -make_retGrp0_idx = d[x==1L] -test(2269.93, attr(attr(d, "index"), "__x"), 2:1) +test(2269.93, options = c(datatable.optimize=Inf), {d[x == 1L]; attr(attr(d, "index"), "__x")}, 2:1) test(2269.94, options = c(datatable.verbose=TRUE), forderv(d, "x", retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1L, anyna=0L, anyinfnan=0L, anynotascii=0L, anynotutf8=0L), output="forder.*index found but not for retGrp and retStats.*forder.*opt=-1.*took") d = data.table(x = 2:1) -test(2269.95, options = c(datatable.verbose=TRUE, datatable.forder.auto.index=TRUE), +test(2269.95, options = list(datatable.verbose=TRUE, datatable.forder.auto.index=TRUE, datatable.optimize=Inf), d[x==1L], data.table(x=1L), output="forder.*setting index.*retGrp=0, retStats=0") test(2269.96, options = c(datatable.verbose=TRUE, datatable.forder.auto.index=TRUE), forderv(d, "x", retGrp=TRUE), output="forder.*setting index.*retGrp=1, retStats=1") From 9675f06d5772e1d7c181f2ba925aaeda2f2d4c9a Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 15 Jul 2024 14:49:52 +0000 Subject: [PATCH 46/53] style --- inst/tests/tests.Rraw | 180 +++++++++++++++++++++--------------------- 1 file changed, 90 insertions(+), 90 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index bcb1f6b5f2..a0fe4e977c 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18737,122 +18737,122 @@ test(2268, rbindlist(y, fill=TRUE), rbindlist(x, fill=TRUE)[rep(1:5, N)]) # lazy forder, #4386 dd = data.table(a=1:2, b=2:1) d = copy(dd) -test(2269.01, options = c(datatable.verbose=TRUE), forderv(d, "b"), 2:1, output="forder.*opt=-1.*took") -test(2269.02, options = c(datatable.verbose=TRUE), forderv(d, "b", lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2269.01, options=c(datatable.verbose=TRUE), forderv(d, "b"), 2:1, output="forder.*opt=-1.*took") +test(2269.02, options=c(datatable.verbose=TRUE), forderv(d, "b", lazy=FALSE), 2:1, output="forder.*opt=0.*took") setkeyv(d, "b") -test(2269.03, options = c(datatable.verbose=TRUE), forderv(d, "b"), integer(), output="forder.*opt=1.*took") -test(2269.04, options = c(datatable.verbose=TRUE), forderv(d, "b", lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2269.03, options=c(datatable.verbose=TRUE), forderv(d, "b"), integer(), output="forder.*opt=1.*took") +test(2269.04, options=c(datatable.verbose=TRUE), forderv(d, "b", lazy=FALSE), integer(), output="forder.*opt=0.*took") d = copy(dd) setindexv(d, "b") -test(2269.05, options = c(datatable.verbose=TRUE), forderv(d, "b"), 2:1, output="forder.*opt=2.*took") -test(2269.06, options = c(datatable.verbose=TRUE), forderv(d, "b", lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2269.05, options=c(datatable.verbose=TRUE), forderv(d, "b"), 2:1, output="forder.*opt=2.*took") +test(2269.06, options=c(datatable.verbose=TRUE), forderv(d, "b", lazy=FALSE), 2:1, output="forder.*opt=0.*took") d = copy(dd) -test(2269.11, options = c(datatable.verbose=TRUE), forderv(d, c("a","b")), integer(), output="forder.*opt=-1.*took") -test(2269.12, options = c(datatable.verbose=TRUE), forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") -test(2269.13, options = c(datatable.verbose=TRUE), forderv(d, c("b","a")), 2:1, output="forder.*opt=-1.*took") -test(2269.14, options = c(datatable.verbose=TRUE), forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2269.11, options=c(datatable.verbose=TRUE), forderv(d, c("a","b")), integer(), output="forder.*opt=-1.*took") +test(2269.12, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2269.13, options=c(datatable.verbose=TRUE), forderv(d, c("b","a")), 2:1, output="forder.*opt=-1.*took") +test(2269.14, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") setkeyv(d, c("a","b")) -test(2269.21, options = c(datatable.verbose=TRUE), forderv(d, c("a","b")), integer(), output="forder.*opt=1.*took") -test(2269.22, options = c(datatable.verbose=TRUE), forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") -test(2269.23, options = c(datatable.verbose=TRUE), forderv(d, c("b","a")), 2:1, output="forder.*opt=-1.*took") -test(2269.24, options = c(datatable.verbose=TRUE), forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2269.21, options=c(datatable.verbose=TRUE), forderv(d, c("a","b")), integer(), output="forder.*opt=1.*took") +test(2269.22, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2269.23, options=c(datatable.verbose=TRUE), forderv(d, c("b","a")), 2:1, output="forder.*opt=-1.*took") +test(2269.24, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") setkeyv(d, c("b","a")) -test(2269.25, options = c(datatable.verbose=TRUE), forderv(d, c("a","b")), 2:1, output="forder.*opt=-1.*took") -test(2269.26, options = c(datatable.verbose=TRUE), forderv(d, c("a","b"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") -test(2269.27, options = c(datatable.verbose=TRUE), forderv(d, c("b","a")), integer(), output="forder.*opt=1.*took") -test(2269.28, options = c(datatable.verbose=TRUE), forderv(d, c("b","a"), lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2269.25, options=c(datatable.verbose=TRUE), forderv(d, c("a","b")), 2:1, output="forder.*opt=-1.*took") +test(2269.26, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2269.27, options=c(datatable.verbose=TRUE), forderv(d, c("b","a")), integer(), output="forder.*opt=1.*took") +test(2269.28, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), lazy=FALSE), integer(), output="forder.*opt=0.*took") d = copy(dd) setindexv(d, c("a","b")) -test(2269.31, options = c(datatable.verbose=TRUE), forderv(d, c("a","b")), integer(), output="forder.*opt=2.*took") -test(2269.32, options = c(datatable.verbose=TRUE), forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") -test(2269.33, options = c(datatable.verbose=TRUE), forderv(d, c("b","a")), 2:1, output="forder.*opt=-1.*took") -test(2269.34, options = c(datatable.verbose=TRUE), forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2269.31, options=c(datatable.verbose=TRUE), forderv(d, c("a","b")), integer(), output="forder.*opt=2.*took") +test(2269.32, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2269.33, options=c(datatable.verbose=TRUE), forderv(d, c("b","a")), 2:1, output="forder.*opt=-1.*took") +test(2269.34, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") d = copy(dd) setindexv(d, c("b","a")) -test(2269.35, options = c(datatable.verbose=TRUE), forderv(d, c("a","b")), integer(), output="forder.*opt=-1.*took") -test(2269.36, options = c(datatable.verbose=TRUE), forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") -test(2269.37, options = c(datatable.verbose=TRUE), forderv(d, c("b","a")), 2:1, output="forder.*opt=2.*took") -test(2269.38, options = c(datatable.verbose=TRUE), forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2269.35, options=c(datatable.verbose=TRUE), forderv(d, c("a","b")), integer(), output="forder.*opt=-1.*took") +test(2269.36, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2269.37, options=c(datatable.verbose=TRUE), forderv(d, c("b","a")), 2:1, output="forder.*opt=2.*took") +test(2269.38, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") d = copy(dd) setindexv(d, list(c("a","b"), c("b","a"))) -test(2269.41, options = c(datatable.verbose=TRUE), forderv(d, c("a","b")), integer(), output="forder.*opt=2.*took") -test(2269.42, options = c(datatable.verbose=TRUE), forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") -test(2269.43, options = c(datatable.verbose=TRUE), forderv(d, c("b","a")), 2:1, output="forder.*opt=2.*took") -test(2269.44, options = c(datatable.verbose=TRUE), forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2269.41, options=c(datatable.verbose=TRUE), forderv(d, c("a","b")), integer(), output="forder.*opt=2.*took") +test(2269.42, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2269.43, options=c(datatable.verbose=TRUE), forderv(d, c("b","a")), 2:1, output="forder.*opt=2.*took") +test(2269.44, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") d = copy(dd) setkeyv(d, c("a","b")) setindexv(d, list(c("a","b"), c("b","a"))) -test(2269.51, options = c(datatable.verbose=TRUE), forderv(d, c("a","b")), integer(), output="forder.*opt=1.*took", notOutput="forder.*opt=2.*took") # idxOpt is not reached -test(2269.52, options = c(datatable.verbose=TRUE), forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") -test(2269.53, options = c(datatable.verbose=TRUE), forderv(d, c("b","a")), 2:1, output="forder.*opt=2.*took") -test(2269.54, options = c(datatable.verbose=TRUE), forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2269.51, options=c(datatable.verbose=TRUE), forderv(d, c("a","b")), integer(), output="forder.*opt=1.*took", notOutput="forder.*opt=2.*took") # idxOpt is not reached +test(2269.52, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2269.53, options=c(datatable.verbose=TRUE), forderv(d, c("b","a")), 2:1, output="forder.*opt=2.*took") +test(2269.54, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") d = copy(dd) setindexv(d, list(c("a","b"), c("b","a"))) -test(2269.55, options = c(datatable.verbose=TRUE), forderv(d, c("a","b")), integer(), output="forder.*opt=2.*took", notOutput="forder.*opt=1.*took") -test(2269.56, options = c(datatable.verbose=TRUE), forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2269.55, options=c(datatable.verbose=TRUE), forderv(d, c("a","b")), integer(), output="forder.*opt=2.*took", notOutput="forder.*opt=1.*took") +test(2269.56, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") d = copy(dd) setkeyv(d, c("a","b")) setindexv(d, list(c("a","b"), c("b","a"))) ab = structure(integer(), starts=1:2, maxgrpn=1L, anyna=0L, anyinfnan=0L, anynotascii=0L, anynotutf8=0L) ba = structure(2:1, starts=1:2, maxgrpn=1L, anyna=0L, anyinfnan=0L, anynotascii=0L, anynotutf8=0L) -test(2269.60, options = c(datatable.verbose=TRUE), forderv(d, c("a","b")), c(ab), output="forder.*opt=1.*took") # c(): strip attributes -test(2269.61, options = c(datatable.verbose=TRUE), forderv(d, c("a","b"), retGrp=TRUE), ab, output="forder.*opt=2.*took") -test(2269.62, options = c(datatable.verbose=TRUE), forderv(d, c("a","b"), retGrp=TRUE, lazy=FALSE), ab, output="forder.*opt=0.*took") -test(2269.63, options = c(datatable.verbose=TRUE), forderv(d, c("b","a"), retGrp=TRUE), ba, output="forder.*opt=2.*took") -test(2269.64, options = c(datatable.verbose=TRUE), forderv(d, c("b","a"), retGrp=TRUE, lazy=FALSE), ba, output="forder.*opt=0.*took") -test(2269.65, options = c(datatable.verbose=TRUE), forderv(d, c("a","b"), na.last=TRUE), integer(), output="forder.*opt=2.*took") # via anyna index attribute -test(2269.66, options = c(datatable.verbose=TRUE), forderv(d, c("a","b"), na.last=TRUE, lazy=FALSE), integer(), output="forder.*opt=0.*took") -test(2269.67, options = c(datatable.verbose=TRUE), forderv(d, c("b","a"), na.last=TRUE), 2:1, output="forder.*opt=2.*took") # via anyna index attribute -test(2269.68, options = c(datatable.verbose=TRUE), forderv(d, c("b","a"), na.last=TRUE, lazy=FALSE), 2:1, output="forder.*opt=0.*took") -test(2269.69, options = c(datatable.verbose=TRUE), forderv(d, c("a","b"), sort=FALSE, retGrp=TRUE), ab, output="forder.*opt=0.*took") -test(2269.70, options = c(datatable.verbose=TRUE), forderv(d, c("a","b"), sort=FALSE, retGrp=TRUE, lazy=FALSE), ab, output="forder.*opt=0.*took") -test(2269.71, options = c(datatable.verbose=TRUE), forderv(d, c("b","a"), sort=FALSE, retGrp=TRUE), ab, output="forder.*opt=0.*took") -test(2269.72, options = c(datatable.verbose=TRUE), forderv(d, c("b","a"), sort=FALSE, retGrp=TRUE, lazy=FALSE), ab, output="forder.*opt=0.*took") -test(2269.73, options = c(datatable.verbose=TRUE), forderv(d, c("a","b"), order=-1L), 2:1, output="forder.*opt=0.*took") -test(2269.74, options = c(datatable.verbose=TRUE), forderv(d, c("a","b"), order=-1L, lazy=FALSE), 2:1, output="forder.*opt=0.*took") -test(2269.75, options = c(datatable.verbose=TRUE), forderv(d, c("b","a"), order=-1L), integer(), output="forder.*opt=0.*took") -test(2269.76, options = c(datatable.verbose=TRUE), forderv(d, c("b","a"), order=-1L, lazy=FALSE), integer(), output="forder.*opt=0.*took") -test(2269.77, options = c(datatable.verbose=TRUE), forderv(d, c("a","b"), order=c(1L,-1L)), integer(), output="forder.*opt=0.*took") -test(2269.78, options = c(datatable.verbose=TRUE), forderv(d, c("a","b"), order=c(1L,-1L), lazy=FALSE), integer(), output="forder.*opt=0.*took") -test(2269.79, options = c(datatable.verbose=TRUE), forderv(d, c("b","a"), order=c(1L,-1L)), 2:1, output="forder.*opt=0.*took") -test(2269.80, options = c(datatable.verbose=TRUE), forderv(d, c("b","a"), order=c(1L,-1L), lazy=FALSE), 2:1, output="forder.*opt=0.*took") -test(2269.81, options = c(datatable.verbose=TRUE), forderv(1:2), integer(), output="forder.*opt=0.*took") -test(2269.82, options = c(datatable.verbose=TRUE), forderv(1:2, lazy=FALSE), integer(), output="forder.*opt=0.*took") -test(2269.83, options = c(datatable.verbose=TRUE), forderv(2:1), 2:1, output="forder.*opt=0.*took") -test(2269.84, options = c(datatable.verbose=TRUE), forderv(2:1, lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2269.60, options=c(datatable.verbose=TRUE), forderv(d, c("a","b")), c(ab), output="forder.*opt=1.*took") # c(): strip attributes +test(2269.61, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), retGrp=TRUE), ab, output="forder.*opt=2.*took") +test(2269.62, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), retGrp=TRUE, lazy=FALSE), ab, output="forder.*opt=0.*took") +test(2269.63, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), retGrp=TRUE), ba, output="forder.*opt=2.*took") +test(2269.64, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), retGrp=TRUE, lazy=FALSE), ba, output="forder.*opt=0.*took") +test(2269.65, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), na.last=TRUE), integer(), output="forder.*opt=2.*took") # via anyna index attribute +test(2269.66, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), na.last=TRUE, lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2269.67, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), na.last=TRUE), 2:1, output="forder.*opt=2.*took") # via anyna index attribute +test(2269.68, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), na.last=TRUE, lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2269.69, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), sort=FALSE, retGrp=TRUE), ab, output="forder.*opt=0.*took") +test(2269.70, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), sort=FALSE, retGrp=TRUE, lazy=FALSE), ab, output="forder.*opt=0.*took") +test(2269.71, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), sort=FALSE, retGrp=TRUE), ab, output="forder.*opt=0.*took") +test(2269.72, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), sort=FALSE, retGrp=TRUE, lazy=FALSE), ab, output="forder.*opt=0.*took") +test(2269.73, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), order=-1L), 2:1, output="forder.*opt=0.*took") +test(2269.74, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), order=-1L, lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2269.75, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), order=-1L), integer(), output="forder.*opt=0.*took") +test(2269.76, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), order=-1L, lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2269.77, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), order=c(1L,-1L)), integer(), output="forder.*opt=0.*took") +test(2269.78, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), order=c(1L,-1L), lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2269.79, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), order=c(1L,-1L)), 2:1, output="forder.*opt=0.*took") +test(2269.80, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), order=c(1L,-1L), lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2269.81, options=c(datatable.verbose=TRUE), forderv(1:2), integer(), output="forder.*opt=0.*took") +test(2269.82, options=c(datatable.verbose=TRUE), forderv(1:2, lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2269.83, options=c(datatable.verbose=TRUE), forderv(2:1), 2:1, output="forder.*opt=0.*took") +test(2269.84, options=c(datatable.verbose=TRUE), forderv(2:1, lazy=FALSE), 2:1, output="forder.*opt=0.*took") test(2269.851, forderv(2:1, retStats=NA), error="retStats must be TRUE or FALSE") test(2269.852, forderv(2:1, retGrp=TRUE, retStats=FALSE), error="retStats must be TRUE whenever retGrp is TRUE") ddd = data.table(v1=1:3, v2=c(1L,NA,3L), v3=c(3:2,NaN), v4=c(1:2,Inf), v5=c(-Inf,NA,3)) ## tests for NAs and na.last arg d = copy(ddd) test(2269.8530, options=c(datatable.optimize=Inf), {d[v1 == 1L]; indices(d)}, "v1") test(2269.8531, options=c(datatable.verbose=TRUE), forderv(d, "v1", retGrp=TRUE, retStats=TRUE), output="index found but not for retGrp and retStats") -test(2269.854, options = c(datatable.verbose=TRUE), o<-forderv(d, "v1", retStats=TRUE), output="index found but not for retStats") +test(2269.854, options=c(datatable.verbose=TRUE), o<-forderv(d, "v1", retStats=TRUE), output="index found but not for retStats") setattr(d, "index", setattr(integer(), "__v1", o)) -test(2269.855, options = c(datatable.verbose=TRUE), forderv(d, "v1", retGrp=TRUE), output="index found but not for retGrp") -test(2269.856, options = c(datatable.verbose=TRUE), forderv(d, "v1", na.last=TRUE), integer(), output="forder.*opt=2.*took") -test(2269.857, options = c(datatable.verbose=TRUE), forderv(d, "v1", retStats=TRUE, na.last=TRUE), structure(integer(), anyna=0L, anyinfnan=0L, anynotascii=0L, anynotutf8=0L), output="forder.*opt=2.*took") +test(2269.855, options=c(datatable.verbose=TRUE), forderv(d, "v1", retGrp=TRUE), output="index found but not for retGrp") +test(2269.856, options=c(datatable.verbose=TRUE), forderv(d, "v1", na.last=TRUE), integer(), output="forder.*opt=2.*took") +test(2269.857, options=c(datatable.verbose=TRUE), forderv(d, "v1", retStats=TRUE, na.last=TRUE), structure(integer(), anyna=0L, anyinfnan=0L, anynotascii=0L, anynotutf8=0L), output="forder.*opt=2.*took") d = copy(ddd) test(2269.8580, options=c(datatable.optimize=Inf), {d[v1 == 1L]; indices(d)}, "v1") # _not_ setindex(d, v1), which will compute retGrp/retStats test(2269.8581, options=c(datatable.verbose=TRUE), forderv(d, "v1", retGrp=TRUE, retStats=TRUE, na.last=TRUE), output="index found but na.last=TRUE and no stats available") setindexv(d, "v2") -test(2269.859, options = c(datatable.verbose=TRUE), forderv(d, "v2", retGrp=TRUE, retStats=TRUE, na.last=TRUE), output="index found but na.last=TRUE and NAs present") +test(2269.859, options=c(datatable.verbose=TRUE), forderv(d, "v2", retGrp=TRUE, retStats=TRUE, na.last=TRUE), output="index found but na.last=TRUE and NAs present") d = copy(ddd) setkeyv(d, "v1") setindexv(d, list("v2","v3","v4","v5",c("v1","v2"),c("v1","v3"),c("v2","v3"),c("v1","v4"),c("v1","v5"),c("v1","v4","v5"))) -test(2269.861, options = c(datatable.verbose=TRUE), forderv(d, "v1"), integer(), output="forder.*opt=1.*took") -test(2269.862, options = c(datatable.verbose=TRUE), forderv(d, "v1", na.last=TRUE), integer(), output="forder.*opt=-1.*took") ## cannot use key for na.last +test(2269.861, options=c(datatable.verbose=TRUE), forderv(d, "v1"), integer(), output="forder.*opt=1.*took") +test(2269.862, options=c(datatable.verbose=TRUE), forderv(d, "v1", na.last=TRUE), integer(), output="forder.*opt=-1.*took") ## cannot use key for na.last setindexv(d, "v1") -test(2269.863, options = c(datatable.verbose=TRUE), forderv(d, "v1", na.last=TRUE), integer(), output="forder.*opt=2.*took") -test(2269.864, options = c(datatable.verbose=TRUE), forderv(d, c("v1","v2"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") -test(2269.865, options = c(datatable.verbose=TRUE), forderv(d, c("v1","v2"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") # same but testing another msg -test(2269.866, options = c(datatable.verbose=TRUE), forderv(d, c("v1","v3"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") -test(2269.867, options = c(datatable.verbose=TRUE), forderv(d, c("v1","v3"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") -test(2269.868, options = c(datatable.verbose=TRUE), forderv(d, c("v1","v4"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") -test(2269.869, options = c(datatable.verbose=TRUE), forderv(d, c("v1","v4"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") -test(2269.870, options = c(datatable.verbose=TRUE), forderv(d, c("v1","v5"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") -test(2269.871, options = c(datatable.verbose=TRUE), forderv(d, c("v1","v5"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") # same but testing another msg -test(2269.872, options = c(datatable.verbose=TRUE), forderv(d, c("v1","v4","v5"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") -test(2269.873, options = c(datatable.verbose=TRUE), forderv(d, c("v1","v4","v5"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") # same but testing another msg +test(2269.863, options=c(datatable.verbose=TRUE), forderv(d, "v1", na.last=TRUE), integer(), output="forder.*opt=2.*took") +test(2269.864, options=c(datatable.verbose=TRUE), forderv(d, c("v1","v2"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") +test(2269.865, options=c(datatable.verbose=TRUE), forderv(d, c("v1","v2"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") # same but testing another msg +test(2269.866, options=c(datatable.verbose=TRUE), forderv(d, c("v1","v3"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") +test(2269.867, options=c(datatable.verbose=TRUE), forderv(d, c("v1","v3"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") +test(2269.868, options=c(datatable.verbose=TRUE), forderv(d, c("v1","v4"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") +test(2269.869, options=c(datatable.verbose=TRUE), forderv(d, c("v1","v4"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") +test(2269.870, options=c(datatable.verbose=TRUE), forderv(d, c("v1","v5"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") +test(2269.871, options=c(datatable.verbose=TRUE), forderv(d, c("v1","v5"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") # same but testing another msg +test(2269.872, options=c(datatable.verbose=TRUE), forderv(d, c("v1","v4","v5"), na.last=TRUE), integer(), output="index found but na.last=TRUE and NAs present") +test(2269.873, options=c(datatable.verbose=TRUE), forderv(d, c("v1","v4","v5"), na.last=TRUE), integer(), output="forder.*opt=-1.*took") # same but testing another msg d = fread(testDir("1680-fread-header-encoding.csv"), encoding="Latin-1") ## re-use some existing non utf8 data anyEnc = function(x) unlist(attributes(forderv(x, retStats=TRUE))[c("anynotascii","anynotutf8")]) test(2269.881, anyEnc(d), c(anynotascii=1L,anynotutf8=1L)) @@ -18860,29 +18860,29 @@ test(2269.882, anyEnc(d[,-2L]), c(anynotascii=0L,anynotutf8=0L)) test(2269.883, anyEnc(c("a","b","\u221A")), c(anynotascii=1L,anynotutf8=0L)) d = copy(dd) setindexv(d, "b") -test(2269.91, options = c(datatable.verbose=TRUE, datatable.use.index=FALSE), +test(2269.91, options=c(datatable.verbose=TRUE, datatable.use.index=FALSE), forderv(d, "b"), 2:1, output="forder.*opt=-1.*took") -test(2269.92, options = c(datatable.verbose=TRUE, datatable.use.index=FALSE), +test(2269.92, options=c(datatable.verbose=TRUE, datatable.use.index=FALSE), forderv(d, "b", lazy=FALSE), 2:1, output="forder.*opt=0.*took") d = data.table(x = 2:1) -test(2269.93, options = c(datatable.optimize=Inf), {d[x == 1L]; attr(attr(d, "index"), "__x")}, 2:1) -test(2269.94, options = c(datatable.verbose=TRUE), forderv(d, "x", retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1L, anyna=0L, anyinfnan=0L, anynotascii=0L, anynotutf8=0L), output="forder.*index found but not for retGrp and retStats.*forder.*opt=-1.*took") +test(2269.93, options=c(datatable.optimize=Inf), {d[x == 1L]; attr(attr(d, "index"), "__x")}, 2:1) +test(2269.94, options=c(datatable.verbose=TRUE), forderv(d, "x", retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1L, anyna=0L, anyinfnan=0L, anynotascii=0L, anynotutf8=0L), output="forder.*index found but not for retGrp and retStats.*forder.*opt=-1.*took") d = data.table(x = 2:1) -test(2269.95, options = list(datatable.verbose=TRUE, datatable.forder.auto.index=TRUE, datatable.optimize=Inf), +test(2269.95, options=list(datatable.verbose=TRUE, datatable.forder.auto.index=TRUE, datatable.optimize=Inf), d[x==1L], data.table(x=1L), output="forder.*setting index.*retGrp=0, retStats=0") -test(2269.96, options = c(datatable.verbose=TRUE, datatable.forder.auto.index=TRUE), +test(2269.96, options=c(datatable.verbose=TRUE, datatable.forder.auto.index=TRUE), forderv(d, "x", retGrp=TRUE), output="forder.*setting index.*retGrp=1, retStats=1") setindexv(d, NULL) -test(2269.971, options = c(datatable.verbose=TRUE, datatable.forder.auto.index=TRUE), +test(2269.971, options=c(datatable.verbose=TRUE, datatable.forder.auto.index=TRUE), forderv(d, "x", retStats=TRUE), output="forder.*setting index.*retGrp=0, retStats=1") setindexv(d, NULL) -test(2269.972, options = c(datatable.verbose=TRUE, datatable.forder.auto.index=TRUE), +test(2269.972, options=c(datatable.verbose=TRUE, datatable.forder.auto.index=TRUE), forderv(d, "x", retStats=TRUE, na.last=TRUE), output="forder.*setting index.*retGrp=0, retStats=1") setindexv(d, NULL) -test(2269.973, options = c(datatable.verbose=TRUE, datatable.forder.auto.index=TRUE), +test(2269.973, options=c(datatable.verbose=TRUE, datatable.forder.auto.index=TRUE), forderv(data.table(x=c(2:1,NA)), "x", retStats=TRUE, na.last=TRUE), notOutput="forder.*setting index.*retGrp=0, retStats=1") -test(2269.974, options = c(datatable.verbose=TRUE, datatable.forder.auto.index=TRUE), +test(2269.974, options=c(datatable.verbose=TRUE, datatable.forder.auto.index=TRUE), forderv(data.table(x=c(2:1,NaN)), "x", retStats=TRUE, na.last=TRUE), notOutput="forder.*setting index.*retGrp=0, retStats=1") -test(2269.975, options = c(datatable.verbose=TRUE, datatable.forder.auto.index=TRUE), +test(2269.975, options=c(datatable.verbose=TRUE, datatable.forder.auto.index=TRUE), forderv(d, "x", na.last=TRUE), notOutput="forder.*setting index.*retGrp=0, retStats=1") test(2269.99, forderv(data.table(a=1), lazy=c(TRUE, TRUE)), error="lazy must be") From e8b9dcd32eb4fabf1dd21a4de302b12a18f22a37 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 15 Jul 2024 17:58:19 +0000 Subject: [PATCH 47/53] spurious whitespace change --- inst/tests/tests.Rraw | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index a0fe4e977c..99e17561d5 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -5319,8 +5319,7 @@ test(1313.02, DT[, max(y), by=x], DT[, base::max(y), by=x]) test(1313.03, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) test(1313.04, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) # testing all NA - GForce automatically converts to numeric.. optimize=1L errors due to change from integer/numeric (like median) -DT[x==6, - y := INT(NA)] +DT[x==6, y := INT(NA)] test(1313.05, DT[, min(y), by=x], DT[, base::min(y), by=x]) test(1313.06, DT[, max(y), by=x], DT[, base::max(y), by=x]) test(1313.07, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=INT(-1,4,4,4,-2147483647,NA))) From a48ff5ed3b43087db181cde5f412223c2000c2da Mon Sep 17 00:00:00 2001 From: jangorecki Date: Sun, 21 Jul 2024 16:12:25 +0200 Subject: [PATCH 48/53] NEWS entry for lazy forder --- NEWS.md | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/NEWS.md b/NEWS.md index 262aace175..f44b4f57c5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -116,6 +116,43 @@ 21. Refactored some non-API calls to R macros for S4 objects (#6180)[https://github.com/Rdatatable/data.table/issues/6180]. There should be no user-visible change. Thanks to various R users & R core for pushing to have a clearer definition of "API" for R, and thanks @MichaelChirico for implementing here. +22. Internal routine for finding order will now re-using existing index. Similar optimization was already present in R code, but now have been pushed to C. It now covers wider range of use cases and collects more statistics about its input opening possibility for more optimizations in other functions. + +Functions `setindex` (and `setindexv`) will now compute groups positions as well. Moreover extra statistics are being collected now. Finding order in other routines (for example subset `d2[id==1L]`) does not include those extra statistics to not impose a slow down. +```r +d2 = data.table(id=2:1, v2=1:2) +setindexv(d2, "id") +str(attr(attr(d2, "index"), "__id")) +# int [1:2] 2 1 +# - attr(*, "starts")= int [1:2] 1 2 +# - attr(*, "maxgrpn")= int 1 +# - attr(*, "anyna")= int 0 +# - attr(*, "anyinfnan")= int 0 +# - attr(*, "anynotascii")= int 0 +# - attr(*, "anynotutf8")= int 0 + +d2 = data.table(id=2:1, v2=1:2) +invisible(d2[id==1L]) +str(attr(attr(d2, "index"), "__id")) +# int [1:2] 2 1 +``` +Closes [#4387](https://github.com/Rdatatable/data.table/issues/4387) and [#2947](https://github.com/Rdatatable/data.table/issues/2947). + +Allows to re-use index during join, where one of the finding order calls is made from C code. +```r +d1 = data.table(id=1:2, v1=1:2) +d2 = data.table(id=2:1, v2=1:2) +setindexv(d2, "id") +d1[d2, on="id", verbose=TRUE] +#... +#Starting bmerge ... +#forderMaybePresorted: using existing index: __id +#forderMaybePresorted: opt=2, took 0.000s +#... +``` +Closes [#4380](https://github.com/Rdatatable/data.table/issues/4380). +Thanks to @jangorecki for implementing. + ## TRANSLATIONS 1. Fix a typo in a Mandarin translation of an error message that was hiding the actual error message, [#6172](https://github.com/Rdatatable/data.table/issues/6172). Thanks @trafficfan for the report and @MichaelChirico for the fix. From 59f5f212e8029fec586adc089e86a12f48dfa334 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 21 Jul 2024 10:03:43 -0700 Subject: [PATCH 49/53] tidy up NEWS --- NEWS.md | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/NEWS.md b/NEWS.md index f44b4f57c5..977a05d4dd 100644 --- a/NEWS.md +++ b/NEWS.md @@ -116,9 +116,12 @@ 21. Refactored some non-API calls to R macros for S4 objects (#6180)[https://github.com/Rdatatable/data.table/issues/6180]. There should be no user-visible change. Thanks to various R users & R core for pushing to have a clearer definition of "API" for R, and thanks @MichaelChirico for implementing here. -22. Internal routine for finding order will now re-using existing index. Similar optimization was already present in R code, but now have been pushed to C. It now covers wider range of use cases and collects more statistics about its input opening possibility for more optimizations in other functions. +22. Internal routine for finding sort order will now re-use any existing index. A similar optimization was already present in R code, but this has now been pushed to C and covers a wider range of use cases and collects more statistics about its input (e.g. whether any infinite entries were found), opening the possibility for more optimizations in other functions. + +Functions `setindex` (and `setindexv`) will now compute groups' positions as well. `setindex()` also collects the extra statistics alluded to above. + +Finding sort order in other routines (for example subset `d2[id==1L]`) does not include those extra statistics so as not to impose a slowdown. -Functions `setindex` (and `setindexv`) will now compute groups positions as well. Moreover extra statistics are being collected now. Finding order in other routines (for example subset `d2[id==1L]`) does not include those extra statistics to not impose a slow down. ```r d2 = data.table(id=2:1, v2=1:2) setindexv(d2, "id") @@ -136,9 +139,9 @@ invisible(d2[id==1L]) str(attr(attr(d2, "index"), "__id")) # int [1:2] 2 1 ``` -Closes [#4387](https://github.com/Rdatatable/data.table/issues/4387) and [#2947](https://github.com/Rdatatable/data.table/issues/2947). -Allows to re-use index during join, where one of the finding order calls is made from C code. +This feature also enables re-use of sort index during joins, in cases where one of the calls to find sort order is made from C code. + ```r d1 = data.table(id=1:2, v1=1:2) d2 = data.table(id=2:1, v2=1:2) @@ -150,8 +153,8 @@ d1[d2, on="id", verbose=TRUE] #forderMaybePresorted: opt=2, took 0.000s #... ``` -Closes [#4380](https://github.com/Rdatatable/data.table/issues/4380). -Thanks to @jangorecki for implementing. + +This feature resolves [#4387](https://github.com/Rdatatable/data.table/issues/4387), [#2947](https://github.com/Rdatatable/data.table/issues/2947), [#4380](https://github.com/Rdatatable/data.table/issues/4380), and [#1321](https://github.com/Rdatatable/data.table/issues/1321). Thanks to @jangorecki, @jan-glx, and @MichaelChirico for the reports and @jangorecki for implementing. ## TRANSLATIONS From 32c630556ed3f80513ca365d117b3fd7d4ab7adc Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 21 Jul 2024 10:08:31 -0700 Subject: [PATCH 50/53] PROTECT() on key attribute --- src/forder.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/forder.c b/src/forder.c index f0760fb399..211b403268 100644 --- a/src/forder.c +++ b/src/forder.c @@ -1480,11 +1480,13 @@ bool colsKeyHead(SEXP x, SEXP cols) { error("internal error: 'x' must be a list"); // # nocov if (!isInteger(cols)) error("internal error: 'cols' must be an integer"); // # nocov - SEXP key = getAttrib(x, sym_sorted); - if (isNull(key) || (length(key) < length(cols))) + SEXP key = PROTECT(getAttrib(x, sym_sorted)); + if (isNull(key) || (length(key) < length(cols))) { + UNPROTECT(1); // key return false; - SEXP names = getAttrib(x, R_NamesSymbol); - SEXP keynames = PROTECT(chmatch(key, names, 0)); + } + SEXP keynames = PROTECT(chmatch(key, getAttrib(x, R_NamesSymbol), 0)); + UNPROTECT(1); // key int *keynamesp = INTEGER(keynames), *colsp = INTEGER(cols); for (int i=0; i Date: Sat, 27 Jul 2024 14:05:51 -0700 Subject: [PATCH 51/53] rename arg/option 'lazy' -> 'reuseSorting' --- R/setkey.R | 4 ++-- src/forder.c | 40 ++++++++++++++++++++-------------------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/R/setkey.R b/R/setkey.R index 10efeba1f3..60d97c5257 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -148,7 +148,7 @@ maybe_reset_index = function(x, idx, cols) { } ORDERING_TYPES = c('logical', 'integer', 'double', 'complex', 'character') -forderv = function(x, by=seq_along(x), retGrp=FALSE, retStats=retGrp, sort=TRUE, order=1L, na.last=FALSE, lazy=getOption("datatable.forder.lazy",NA)) { +forderv = function(x, by=seq_along(x), retGrp=FALSE, retStats=retGrp, sort=TRUE, order=1L, na.last=FALSE, reuseSorting=getOption("datatable.reuse.sorting", NA)) { if (is.atomic(x) || is.null(x)) { # including forderv(NULL) which returns error consistent with base::order(NULL), if (!missing(by) && !is.null(by)) stopf("x is a single vector, non-NULL 'by' doesn't make sense") by = NULL @@ -157,7 +157,7 @@ forderv = function(x, by=seq_along(x), retGrp=FALSE, retStats=retGrp, sort=TRUE, by = colnamesInt(x, by, check_dups=FALSE) } order = as.integer(order) # length and contents of order being +1/-1 is checked at C level - .Call(CforderMaybePresorted, x, by, retGrp, retStats, sort, order, na.last, lazy) # returns integer() if already sorted, regardless of sort=TRUE|FALSE + .Call(CforderMaybePresorted, x, by, retGrp, retStats, sort, order, na.last, reuseSorting) # returns integer() if already sorted, regardless of sort=TRUE|FALSE } forder = function(..., na.last=TRUE, decreasing=FALSE) diff --git a/src/forder.c b/src/forder.c index e9cce3e14f..3aeb140c48 100644 --- a/src/forder.c +++ b/src/forder.c @@ -454,7 +454,7 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsA Rprintf(_("forder.c received %d rows and %d columns\n"), length(VECTOR_ELT(DT,0)), length(DT)); } if (!length(DT)) - STOP(_("Internal error: DT is an empty list() of 0 columns")); // # nocov # caught in lazy forder + STOP(_("Internal error: DT is an empty list() of 0 columns")); // # nocov # caught in reuseSorting forder if (!isInteger(by) || !LENGTH(by)) STOP(_("Internal error: DT has %d columns but 'by' is either not integer or is length 0"), length(DT)); // # nocov colnamesInt catches, 2099.2 if (!isInteger(ascArg)) @@ -479,20 +479,20 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsA if (TYPEOF(VECTOR_ELT(DT, by_i-1)) == CPLXSXP) n_cplx++; } if (!IS_TRUE_OR_FALSE(retGrpArg)) - STOP(_("retGrp must be TRUE or FALSE")); // # nocov # covered in lazy forder + STOP(_("retGrp must be TRUE or FALSE")); // # nocov # covered in reuseSorting forder retgrp = LOGICAL(retGrpArg)[0]==TRUE; if (!IS_TRUE_OR_FALSE(retStatsArg)) - STOP(_("retStats must be TRUE or FALSE")); // # nocov # covered in lazy forder + STOP(_("retStats must be TRUE or FALSE")); // # nocov # covered in reuseSorting forder retstats = LOGICAL(retStatsArg)[0]==TRUE; if (!retstats && retgrp) - error("retStats must be TRUE whenever retGrp is TRUE"); // # nocov # covered in lazy forder + error("retStats must be TRUE whenever retGrp is TRUE"); // # nocov # covered in reuseSorting forder if (!IS_TRUE_OR_FALSE(sortGroupsArg)) - STOP(_("sort must be TRUE or FALSE")); // # nocov # covered in lazy forder + STOP(_("sort must be TRUE or FALSE")); // # nocov # covered in reuseSorting forder sortType = LOGICAL(sortGroupsArg)[0]==TRUE; // if sortType is 1, it is later flipped between +1/-1 according to ascArg. Otherwise ascArg is ignored when sortType==0 if (!retgrp && !sortType) STOP(_("At least one of retGrp= or sort= must be TRUE")); if (!isLogical(naArg) || LENGTH(naArg) != 1) - STOP(_("na.last must be logical TRUE, FALSE or NA of length 1")); // # nocov # covered in lazy forder + STOP(_("na.last must be logical TRUE, FALSE or NA of length 1")); // # nocov # covered in reuseSorting forder nalast = (LOGICAL(naArg)[0] == NA_LOGICAL) ? -1 : LOGICAL(naArg)[0]; // 1=na last, 0=na first (default), -1=remove na if (nrow==0) { @@ -1615,8 +1615,8 @@ bool idxAnyNF(SEXP idx) { return INTEGER(getAttrib(idx, sym_anyna))[0]>0 || INTEGER(getAttrib(idx, sym_anyinfnan))[0]>0; } -// lazy forder, re-use existing key or index if possible, otherwise call forder -SEXP forderMaybePresorted(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsArg, SEXP ascArg, SEXP naArg, SEXP lazyArg) { +// forder, re-use existing key or index if possible, otherwise call forder +SEXP forderMaybePresorted(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsArg, SEXP ascArg, SEXP naArg, SEXP reuseSortingArg) { const bool verbose = GetVerbose(); int protecti = 0; double tic=0.0; @@ -1640,13 +1640,13 @@ SEXP forderMaybePresorted(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SE bool na = (bool)LOGICAL(naArg)[0]; if (!isInteger(ascArg)) error("order must be integer"); // # nocov # coerced to int in R - if (!isLogical(lazyArg) || LENGTH(lazyArg) != 1) - error("lazy must be logical TRUE, FALSE or NA of length 1"); - int lazy = LOGICAL(lazyArg)[0]; + if (!isLogical(reuseSortingArg) || LENGTH(reuseSortingArg) != 1) + error("reuseSorting must be logical TRUE, FALSE or NA of length 1"); + int reuseSorting = LOGICAL(reuseSortingArg)[0]; if (!length(DT)) return allocVector(INTSXP, 0); int opt = -1; // -1=unknown, 0=none, 1=keyOpt, 2=idxOpt - if (lazy==NA_LOGICAL) { + if (reuseSorting==NA_LOGICAL) { if (INHERITS(DT, char_datatable) && // unnamed list should not be optimized sortGroups && all1(ascArg)) { // could ascArg=-1 be handled by a rev()? @@ -1656,15 +1656,15 @@ SEXP forderMaybePresorted(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SE Rprintf("forderMaybePresorted: opt not possible: is.data.table(DT)=%d, sortGroups=%d, all1(ascArg)=%d\n", INHERITS(DT,char_datatable), sortGroups, all1(ascArg)); opt = 0; } - } else if (lazy) { + } else if (reuseSorting) { if (!INHERITS(DT,char_datatable)) - error("internal error: lazy set to TRUE but DT is not a data.table"); // # nocov + error("internal error: reuseSorting set to TRUE but DT is not a data.table"); // # nocov if (!sortGroups) - error("internal error: lazy set to TRUE but sort is FALSE"); // # nocov + error("internal error: reuseSorting set to TRUE but sort is FALSE"); // # nocov if (!all1(ascArg)) - error("internal error: lazy set to TRUE but order is not all 1"); // # nocov + error("internal error: reuseSorting set to TRUE but order is not all 1"); // # nocov opt = -1; - } else if (!lazy) { + } else if (!reuseSorting) { opt = 0; } SEXP ans = R_NilValue; @@ -1715,7 +1715,7 @@ SEXP forderMaybePresorted(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SE if (verbose) Rprintf("forderMaybePresorted: index found but not for retStats: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); } else { - error("internal error: lazy forder index optimization unhandled branch of retGrp-retStats, please report to issue tracker"); // # nocov + error("internal error: reuseSorting forder index optimization unhandled branch of retGrp-retStats, please report to issue tracker"); // # nocov } } else { if (!hasStats) { @@ -1725,7 +1725,7 @@ SEXP forderMaybePresorted(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SE if (verbose) Rprintf("forderMaybePresorted: index found but na.last=TRUE and NAs present: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); } else { - error("internal error: lazy forder index optimization unhandled branch of last.na=T, please report to issue tracker"); // # nocov + error("internal error: reuseSorting forder index optimization unhandled branch of last.na=T, please report to issue tracker"); // # nocov } } if (opt == 2) { @@ -1737,7 +1737,7 @@ SEXP forderMaybePresorted(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SE } if (opt < 1) { ans = PROTECT(forder(DT, by, retGrpArg, retStatsArg, sortGroupsArg, ascArg, naArg)); protecti++; - if (opt == -1 && // opt==0 means that arguments (sort, asc) were not of type index, or lazy=FALSE + if (opt == -1 && // opt==0 means that arguments (sort, asc) were not of type index, or reuseSorting=FALSE (!na || (retStats && !idxAnyNF(ans))) && // lets create index even if na.last=T used but no NAs detected! GetUseIndex() && GetAutoIndex()) { // disabled by default, use datatable.forder.auto.index=T to enable, do not export/document, use for debugging only From 320f4966543b48a40702bd5deaf1e4822f7054b3 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sat, 27 Jul 2024 14:07:29 -0700 Subject: [PATCH 52/53] MaybeSorted->ReuseSorting --- NEWS.md | 4 ++-- R/setkey.R | 4 ++-- src/bmerge.c | 2 +- src/data.table.h | 2 +- src/forder.c | 22 +++++++++++----------- src/init.c | 2 +- 6 files changed, 18 insertions(+), 18 deletions(-) diff --git a/NEWS.md b/NEWS.md index 72b1db5203..10ec47757c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -159,8 +159,8 @@ setindexv(d2, "id") d1[d2, on="id", verbose=TRUE] #... #Starting bmerge ... -#forderMaybePresorted: using existing index: __id -#forderMaybePresorted: opt=2, took 0.000s +#forderReuseSorting: using existing index: __id +#forderReuseSorting: opt=2, took 0.000s #... ``` diff --git a/R/setkey.R b/R/setkey.R index 60d97c5257..74df277b59 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -71,7 +71,7 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU } else { o = forderv(x, cols, sort=TRUE, retGrp=!physical, lazy=TRUE) } - if (!physical) { # index COULD BE saved from C forderMaybePresorted already, but disabled for now + if (!physical) { # index COULD BE saved from C forderReuseSorting already, but disabled for now maybe_reset_index(x, o, cols) return(invisible(x)) } @@ -157,7 +157,7 @@ forderv = function(x, by=seq_along(x), retGrp=FALSE, retStats=retGrp, sort=TRUE, by = colnamesInt(x, by, check_dups=FALSE) } order = as.integer(order) # length and contents of order being +1/-1 is checked at C level - .Call(CforderMaybePresorted, x, by, retGrp, retStats, sort, order, na.last, reuseSorting) # returns integer() if already sorted, regardless of sort=TRUE|FALSE + .Call(CforderReuseSorting, x, by, retGrp, retStats, sort, order, na.last, reuseSorting) # returns integer() if already sorted, regardless of sort=TRUE|FALSE } forder = function(..., na.last=TRUE, decreasing=FALSE) diff --git a/src/bmerge.c b/src/bmerge.c index 5f9f343d40..108d828610 100644 --- a/src/bmerge.c +++ b/src/bmerge.c @@ -162,7 +162,7 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP r allGrp1[0] = TRUE; protecti += 2; - SEXP oSxp = PROTECT(forderMaybePresorted(idt, icolsArg, /* retGrpArg= */ScalarLogical(FALSE), /* retStatsArg= */ScalarLogical(FALSE), /* sortGroupsArg= */ScalarLogical(TRUE), /* ascArg= */ScalarInteger(1), /* naArg= */ScalarLogical(FALSE), /* lazyArg= */ScalarLogical(TRUE))); protecti++; + SEXP oSxp = PROTECT(forderReuseSorting(idt, icolsArg, /* retGrpArg= */ScalarLogical(FALSE), /* retStatsArg= */ScalarLogical(FALSE), /* sortGroupsArg= */ScalarLogical(TRUE), /* ascArg= */ScalarInteger(1), /* naArg= */ScalarLogical(FALSE), /* lazyArg= */ScalarLogical(TRUE))); protecti++; if (!LENGTH(oSxp)) o = NULL; else diff --git a/src/data.table.h b/src/data.table.h index 68cc48435e..a848ef0344 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -143,7 +143,7 @@ int checkOverAlloc(SEXP x); int StrCmp(SEXP x, SEXP y); uint64_t dtwiddle(double x); SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsArg, SEXP ascArg, SEXP naArg); -SEXP forderMaybePresorted(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsArg, SEXP ascArg, SEXP naArg, SEXP lazyArg); // lazy wrapper to forder +SEXP forderReuseSorting(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsArg, SEXP ascArg, SEXP naArg, SEXP reuseSortingArg); // reuseSorting wrapper to forder int getNumericRounding_C(void); // reorder.c diff --git a/src/forder.c b/src/forder.c index 3aeb140c48..564f55ec30 100644 --- a/src/forder.c +++ b/src/forder.c @@ -1616,7 +1616,7 @@ bool idxAnyNF(SEXP idx) { } // forder, re-use existing key or index if possible, otherwise call forder -SEXP forderMaybePresorted(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsArg, SEXP ascArg, SEXP naArg, SEXP reuseSortingArg) { +SEXP forderReuseSorting(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsArg, SEXP ascArg, SEXP naArg, SEXP reuseSortingArg) { const bool verbose = GetVerbose(); int protecti = 0; double tic=0.0; @@ -1653,7 +1653,7 @@ SEXP forderMaybePresorted(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SE opt = -1; } else { if (verbose) - Rprintf("forderMaybePresorted: opt not possible: is.data.table(DT)=%d, sortGroups=%d, all1(ascArg)=%d\n", INHERITS(DT,char_datatable), sortGroups, all1(ascArg)); + Rprintf("forderReuseSorting: opt not possible: is.data.table(DT)=%d, sortGroups=%d, all1(ascArg)=%d\n", INHERITS(DT,char_datatable), sortGroups, all1(ascArg)); opt = 0; } } else if (reuseSorting) { @@ -1672,7 +1672,7 @@ SEXP forderMaybePresorted(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SE opt = 1; // keyOpt ans = PROTECT(allocVector(INTSXP, 0)); protecti++; if (verbose) - Rprintf("forderMaybePresorted: using key: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); + Rprintf("forderReuseSorting: using key: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); } if (opt == -1 && GetUseIndex()) { SEXP idx = getIndex(DT, by); @@ -1707,23 +1707,23 @@ SEXP forderMaybePresorted(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SE opt = 2; // idxOpt but need to drop groups or stats } else if (!hasGrp && retGrp && !hasStats && retStats) { if (verbose) - Rprintf("forderMaybePresorted: index found but not for retGrp and retStats: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); + Rprintf("forderReuseSorting: index found but not for retGrp and retStats: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); } else if (!hasGrp && retGrp) { if (verbose) - Rprintf("forderMaybePresorted: index found but not for retGrp: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); + Rprintf("forderReuseSorting: index found but not for retGrp: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); } else if (!hasStats && retStats) { if (verbose) - Rprintf("forderMaybePresorted: index found but not for retStats: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); + Rprintf("forderReuseSorting: index found but not for retStats: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); } else { error("internal error: reuseSorting forder index optimization unhandled branch of retGrp-retStats, please report to issue tracker"); // # nocov } } else { if (!hasStats) { if (verbose) - Rprintf("forderMaybePresorted: index found but na.last=TRUE and no stats available: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); + Rprintf("forderReuseSorting: index found but na.last=TRUE and no stats available: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); } else if (idxAnyNF(idx)) { if (verbose) - Rprintf("forderMaybePresorted: index found but na.last=TRUE and NAs present: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); + Rprintf("forderReuseSorting: index found but na.last=TRUE and NAs present: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); } else { error("internal error: reuseSorting forder index optimization unhandled branch of last.na=T, please report to issue tracker"); // # nocov } @@ -1731,7 +1731,7 @@ SEXP forderMaybePresorted(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SE if (opt == 2) { ans = idx; if (verbose) - Rprintf("forderMaybePresorted: using existing index: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); + Rprintf("forderReuseSorting: using existing index: %s\n", CHAR(STRING_ELT(idxName(DT, by), 0))); } } } @@ -1743,11 +1743,11 @@ SEXP forderMaybePresorted(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SE GetAutoIndex()) { // disabled by default, use datatable.forder.auto.index=T to enable, do not export/document, use for debugging only putIndex(DT, by, ans); if (verbose) - Rprintf("forderMaybePresorted: setting index (retGrp=%d, retStats=%d) on DT: %s\n", retGrp, retStats, CHAR(STRING_ELT(idxName(DT, by), 0))); + Rprintf("forderReuseSorting: setting index (retGrp=%d, retStats=%d) on DT: %s\n", retGrp, retStats, CHAR(STRING_ELT(idxName(DT, by), 0))); } } if (verbose) - Rprintf("forderMaybePresorted: opt=%d, took %.3fs\n", opt, omp_get_wtime()-tic); + Rprintf("forderReuseSorting: opt=%d, took %.3fs\n", opt, omp_get_wtime()-tic); UNPROTECT(protecti); return ans; } diff --git a/src/init.c b/src/init.c index ead5cb5984..5ab53d0913 100644 --- a/src/init.c +++ b/src/init.c @@ -77,7 +77,7 @@ R_CallMethodDef callMethods[] = { {"Cfcast", (DL_FUNC) &fcast, -1}, {"Cuniqlist", (DL_FUNC) &uniqlist, -1}, {"Cuniqlengths", (DL_FUNC) &uniqlengths, -1}, -{"CforderMaybePresorted", (DL_FUNC) &forderMaybePresorted, -1}, +{"CforderReuseSorting", (DL_FUNC) &forderReuseSorting, -1}, {"Cforder", (DL_FUNC) &forder, -1}, {"Cissorted", (DL_FUNC) &issorted, -1}, {"Cgforce", (DL_FUNC) &gforce, -1}, From 799b4a936ca5b37ed21391daf813c667faf306ff Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sat, 27 Jul 2024 14:13:32 -0700 Subject: [PATCH 53/53] other lazy= usage --- R/data.table.R | 2 +- R/setkey.R | 4 +-- inst/tests/tests.Rraw | 64 +++++++++++++++++++++---------------------- 3 files changed, 35 insertions(+), 35 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 092e1461e8..cb32836b03 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -3244,7 +3244,7 @@ is_constantish = function(q, check_singleton=FALSE) { idxCols = names(i) if (verbose) {catf("Creating new index '%s'\n", paste(idxCols, collapse = "__"));flush.console()} if (verbose) {last.started.at=proc.time();catf("Creating index %s done in ...", paste(idxCols, collapse = "__"));flush.console()} - idx = forderv(x, idxCols, sort=TRUE, retGrp=FALSE, lazy=TRUE) + idx = forderv(x, idxCols, sort=TRUE, retGrp=FALSE, reuseSorting=TRUE) maybe_reset_index(x, idx, idxCols) ## forder can write index, but disabled for now, see #4386 if (verbose) {cat(timetaken(last.started.at),"\n");flush.console()} if (verbose) {catf("Optimized subsetting with index '%s'\n", paste(idxCols, collapse = "__"));flush.console()} diff --git a/R/setkey.R b/R/setkey.R index 74df277b59..f43ed39aa2 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -65,11 +65,11 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU if (verbose) { # we now also retGrp=TRUE #4387 for !physical - tt = suppressMessages(system.time(o <- forderv(x, cols, sort=TRUE, retGrp=!physical, lazy=TRUE))) # system.time does a gc, so we don't want this always on, until refcnt is on by default in R + tt = suppressMessages(system.time(o <- forderv(x, cols, sort=TRUE, retGrp=!physical, reuseSorting=TRUE))) # system.time does a gc, so we don't want this always on, until refcnt is on by default in R # suppress needed for tests 644 and 645 in verbose mode catf("forder took %.03f sec\n", tt["user.self"]+tt["sys.self"]) } else { - o = forderv(x, cols, sort=TRUE, retGrp=!physical, lazy=TRUE) + o = forderv(x, cols, sort=TRUE, retGrp=!physical, reuseSorting=TRUE) } if (!physical) { # index COULD BE saved from C forderReuseSorting already, but disabled for now maybe_reset_index(x, o, cols) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 7efb30bf6d..b90743e28e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18819,58 +18819,58 @@ test(2274.33, rbind(x,y, ignore.attr=TRUE), data.table(a=structure(c(1L, 2L, 2L, dd = data.table(a=1:2, b=2:1) d = copy(dd) test(2275.01, options=c(datatable.verbose=TRUE), forderv(d, "b"), 2:1, output="forder.*opt=-1.*took") -test(2275.02, options=c(datatable.verbose=TRUE), forderv(d, "b", lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2275.02, options=c(datatable.verbose=TRUE), forderv(d, "b", reuseSorting=FALSE), 2:1, output="forder.*opt=0.*took") setkeyv(d, "b") test(2275.03, options=c(datatable.verbose=TRUE), forderv(d, "b"), integer(), output="forder.*opt=1.*took") -test(2275.04, options=c(datatable.verbose=TRUE), forderv(d, "b", lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2275.04, options=c(datatable.verbose=TRUE), forderv(d, "b", reuseSorting=FALSE), integer(), output="forder.*opt=0.*took") d = copy(dd) setindexv(d, "b") test(2275.05, options=c(datatable.verbose=TRUE), forderv(d, "b"), 2:1, output="forder.*opt=2.*took") -test(2275.06, options=c(datatable.verbose=TRUE), forderv(d, "b", lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2275.06, options=c(datatable.verbose=TRUE), forderv(d, "b", reuseSorting=FALSE), 2:1, output="forder.*opt=0.*took") d = copy(dd) test(2275.11, options=c(datatable.verbose=TRUE), forderv(d, c("a","b")), integer(), output="forder.*opt=-1.*took") -test(2275.12, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2275.12, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), reuseSorting=FALSE), integer(), output="forder.*opt=0.*took") test(2275.13, options=c(datatable.verbose=TRUE), forderv(d, c("b","a")), 2:1, output="forder.*opt=-1.*took") -test(2275.14, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2275.14, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), reuseSorting=FALSE), 2:1, output="forder.*opt=0.*took") setkeyv(d, c("a","b")) test(2275.21, options=c(datatable.verbose=TRUE), forderv(d, c("a","b")), integer(), output="forder.*opt=1.*took") -test(2275.22, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2275.22, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), reuseSorting=FALSE), integer(), output="forder.*opt=0.*took") test(2275.23, options=c(datatable.verbose=TRUE), forderv(d, c("b","a")), 2:1, output="forder.*opt=-1.*took") -test(2275.24, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2275.24, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), reuseSorting=FALSE), 2:1, output="forder.*opt=0.*took") setkeyv(d, c("b","a")) test(2275.25, options=c(datatable.verbose=TRUE), forderv(d, c("a","b")), 2:1, output="forder.*opt=-1.*took") -test(2275.26, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2275.26, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), reuseSorting=FALSE), 2:1, output="forder.*opt=0.*took") test(2275.27, options=c(datatable.verbose=TRUE), forderv(d, c("b","a")), integer(), output="forder.*opt=1.*took") -test(2275.28, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2275.28, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), reuseSorting=FALSE), integer(), output="forder.*opt=0.*took") d = copy(dd) setindexv(d, c("a","b")) test(2275.31, options=c(datatable.verbose=TRUE), forderv(d, c("a","b")), integer(), output="forder.*opt=2.*took") -test(2275.32, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2275.32, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), reuseSorting=FALSE), integer(), output="forder.*opt=0.*took") test(2275.33, options=c(datatable.verbose=TRUE), forderv(d, c("b","a")), 2:1, output="forder.*opt=-1.*took") -test(2275.34, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2275.34, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), reuseSorting=FALSE), 2:1, output="forder.*opt=0.*took") d = copy(dd) setindexv(d, c("b","a")) test(2275.35, options=c(datatable.verbose=TRUE), forderv(d, c("a","b")), integer(), output="forder.*opt=-1.*took") -test(2275.36, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2275.36, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), reuseSorting=FALSE), integer(), output="forder.*opt=0.*took") test(2275.37, options=c(datatable.verbose=TRUE), forderv(d, c("b","a")), 2:1, output="forder.*opt=2.*took") -test(2275.38, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2275.38, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), reuseSorting=FALSE), 2:1, output="forder.*opt=0.*took") d = copy(dd) setindexv(d, list(c("a","b"), c("b","a"))) test(2275.41, options=c(datatable.verbose=TRUE), forderv(d, c("a","b")), integer(), output="forder.*opt=2.*took") -test(2275.42, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2275.42, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), reuseSorting=FALSE), integer(), output="forder.*opt=0.*took") test(2275.43, options=c(datatable.verbose=TRUE), forderv(d, c("b","a")), 2:1, output="forder.*opt=2.*took") -test(2275.44, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2275.44, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), reuseSorting=FALSE), 2:1, output="forder.*opt=0.*took") d = copy(dd) setkeyv(d, c("a","b")) setindexv(d, list(c("a","b"), c("b","a"))) test(2275.51, options=c(datatable.verbose=TRUE), forderv(d, c("a","b")), integer(), output="forder.*opt=1.*took", notOutput="forder.*opt=2.*took") # idxOpt is not reached -test(2275.52, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2275.52, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), reuseSorting=FALSE), integer(), output="forder.*opt=0.*took") test(2275.53, options=c(datatable.verbose=TRUE), forderv(d, c("b","a")), 2:1, output="forder.*opt=2.*took") -test(2275.54, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2275.54, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), reuseSorting=FALSE), 2:1, output="forder.*opt=0.*took") d = copy(dd) setindexv(d, list(c("a","b"), c("b","a"))) test(2275.55, options=c(datatable.verbose=TRUE), forderv(d, c("a","b")), integer(), output="forder.*opt=2.*took", notOutput="forder.*opt=1.*took") -test(2275.56, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2275.56, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), reuseSorting=FALSE), integer(), output="forder.*opt=0.*took") d = copy(dd) setkeyv(d, c("a","b")) setindexv(d, list(c("a","b"), c("b","a"))) @@ -18878,29 +18878,29 @@ ab = structure(integer(), starts=1:2, maxgrpn=1L, anyna=0L, anyinfnan=0L, anynot ba = structure(2:1, starts=1:2, maxgrpn=1L, anyna=0L, anyinfnan=0L, anynotascii=0L, anynotutf8=0L) test(2275.60, options=c(datatable.verbose=TRUE), forderv(d, c("a","b")), c(ab), output="forder.*opt=1.*took") # c(): strip attributes test(2275.61, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), retGrp=TRUE), ab, output="forder.*opt=2.*took") -test(2275.62, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), retGrp=TRUE, lazy=FALSE), ab, output="forder.*opt=0.*took") +test(2275.62, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), retGrp=TRUE, reuseSorting=FALSE), ab, output="forder.*opt=0.*took") test(2275.63, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), retGrp=TRUE), ba, output="forder.*opt=2.*took") -test(2275.64, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), retGrp=TRUE, lazy=FALSE), ba, output="forder.*opt=0.*took") +test(2275.64, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), retGrp=TRUE, reuseSorting=FALSE), ba, output="forder.*opt=0.*took") test(2275.65, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), na.last=TRUE), integer(), output="forder.*opt=2.*took") # via anyna index attribute -test(2275.66, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), na.last=TRUE, lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2275.66, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), na.last=TRUE, reuseSorting=FALSE), integer(), output="forder.*opt=0.*took") test(2275.67, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), na.last=TRUE), 2:1, output="forder.*opt=2.*took") # via anyna index attribute -test(2275.68, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), na.last=TRUE, lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2275.68, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), na.last=TRUE, reuseSorting=FALSE), 2:1, output="forder.*opt=0.*took") test(2275.69, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), sort=FALSE, retGrp=TRUE), ab, output="forder.*opt=0.*took") -test(2275.70, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), sort=FALSE, retGrp=TRUE, lazy=FALSE), ab, output="forder.*opt=0.*took") +test(2275.70, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), sort=FALSE, retGrp=TRUE, reuseSorting=FALSE), ab, output="forder.*opt=0.*took") test(2275.71, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), sort=FALSE, retGrp=TRUE), ab, output="forder.*opt=0.*took") -test(2275.72, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), sort=FALSE, retGrp=TRUE, lazy=FALSE), ab, output="forder.*opt=0.*took") +test(2275.72, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), sort=FALSE, retGrp=TRUE, reuseSorting=FALSE), ab, output="forder.*opt=0.*took") test(2275.73, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), order=-1L), 2:1, output="forder.*opt=0.*took") -test(2275.74, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), order=-1L, lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2275.74, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), order=-1L, reuseSorting=FALSE), 2:1, output="forder.*opt=0.*took") test(2275.75, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), order=-1L), integer(), output="forder.*opt=0.*took") -test(2275.76, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), order=-1L, lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2275.76, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), order=-1L, reuseSorting=FALSE), integer(), output="forder.*opt=0.*took") test(2275.77, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), order=c(1L,-1L)), integer(), output="forder.*opt=0.*took") -test(2275.78, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), order=c(1L,-1L), lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2275.78, options=c(datatable.verbose=TRUE), forderv(d, c("a","b"), order=c(1L,-1L), reuseSorting=FALSE), integer(), output="forder.*opt=0.*took") test(2275.79, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), order=c(1L,-1L)), 2:1, output="forder.*opt=0.*took") -test(2275.80, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), order=c(1L,-1L), lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2275.80, options=c(datatable.verbose=TRUE), forderv(d, c("b","a"), order=c(1L,-1L), reuseSorting=FALSE), 2:1, output="forder.*opt=0.*took") test(2275.81, options=c(datatable.verbose=TRUE), forderv(1:2), integer(), output="forder.*opt=0.*took") -test(2275.82, options=c(datatable.verbose=TRUE), forderv(1:2, lazy=FALSE), integer(), output="forder.*opt=0.*took") +test(2275.82, options=c(datatable.verbose=TRUE), forderv(1:2, reuseSorting=FALSE), integer(), output="forder.*opt=0.*took") test(2275.83, options=c(datatable.verbose=TRUE), forderv(2:1), 2:1, output="forder.*opt=0.*took") -test(2275.84, options=c(datatable.verbose=TRUE), forderv(2:1, lazy=FALSE), 2:1, output="forder.*opt=0.*took") +test(2275.84, options=c(datatable.verbose=TRUE), forderv(2:1, reuseSorting=FALSE), 2:1, output="forder.*opt=0.*took") test(2275.851, forderv(2:1, retStats=NA), error="retStats must be TRUE or FALSE") test(2275.852, forderv(2:1, retGrp=TRUE, retStats=FALSE), error="retStats must be TRUE whenever retGrp is TRUE") ddd = data.table(v1=1:3, v2=c(1L,NA,3L), v3=c(3:2,NaN), v4=c(1:2,Inf), v5=c(-Inf,NA,3)) ## tests for NAs and na.last arg @@ -18944,7 +18944,7 @@ setindexv(d, "b") test(2275.91, options=c(datatable.verbose=TRUE, datatable.use.index=FALSE), forderv(d, "b"), 2:1, output="forder.*opt=-1.*took") test(2275.92, options=c(datatable.verbose=TRUE, datatable.use.index=FALSE), - forderv(d, "b", lazy=FALSE), 2:1, output="forder.*opt=0.*took") + forderv(d, "b", reuseSorting=FALSE), 2:1, output="forder.*opt=0.*took") d = data.table(x = 2:1) test(2275.93, options=c(datatable.optimize=Inf), {d[x == 1L]; attr(attr(d, "index"), "__x")}, 2:1) test(2275.94, options=c(datatable.verbose=TRUE), forderv(d, "x", retGrp=TRUE), structure(2:1, starts=1:2, maxgrpn=1L, anyna=0L, anyinfnan=0L, anynotascii=0L, anynotutf8=0L), output="forder.*index found but not for retGrp and retStats.*forder.*opt=-1.*took") @@ -18966,4 +18966,4 @@ test(2275.974, options=c(datatable.verbose=TRUE, datatable.forder.auto.index=TRU forderv(data.table(x=c(2:1,NaN)), "x", retStats=TRUE, na.last=TRUE), notOutput="forder.*setting index.*retGrp=0, retStats=1") test(2275.975, options=c(datatable.verbose=TRUE, datatable.forder.auto.index=TRUE), forderv(d, "x", na.last=TRUE), notOutput="forder.*setting index.*retGrp=0, retStats=1") -test(2275.99, forderv(data.table(a=1), lazy=c(TRUE, TRUE)), error="lazy must be") +test(2275.99, forderv(data.table(a=1), reuseSorting=c(TRUE, TRUE)), error="reuseSorting must be")