diff --git a/NAMESPACE b/NAMESPACE index 0e0c733ce2..fd32389380 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -59,6 +59,8 @@ export(nafill) export(setnafill) export(.Last.updated) export(fcoalesce) +export(cbindlist) +export(mergelist) export(substitute2) #export(DT) # mtcars |> DT(i,j,by) #4872 #5472 diff --git a/NEWS.md b/NEWS.md index 4bf066f417..3d23e739e8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -65,6 +65,8 @@ rowwiseDT( 4. `patterns()` in `melt()` combines correctly with user-defined `cols=`, which can be useful to specify a subset of columns to reshape without having to use a regex, for example `patterns("2", cols=c("y1", "y2"))` will only give `y2` even if there are other columns in the input matching `2`, [#6498](https://github.com/Rdatatable/data.table/issues/6498). Thanks to @hongyuanjia for the report, and to @tdhock for the PR. +5. (add example here?) New functions `cbindlist` and `mergelist` have been implemented and exported. Works like `cbind`/`merge` but takes `list` of data.tables on input. `merge` happens in `Reduce` fashion. Supports `how` (_left_, _inner_, _full_, _right_, _semi_, _anti_, _cross_) joins and `mult` argument, closes [#599](https://github.com/Rdatatable/data.table/issues/599) and [#2576](https://github.com/Rdatatable/data.table/issues/2576). + ## BUG FIXES 1. Using `print.data.table()` with character truncation using `datatable.prettyprint.char` no longer errors with `NA` entries, [#6441](https://github.com/Rdatatable/data.table/issues/6441). Thanks to @r2evans for the bug report, and @joshhwuu for the fix. diff --git a/R/data.table.R b/R/data.table.R index 7b48704a1a..0ad21cf9fa 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -199,7 +199,7 @@ replace_dot_alias = function(e) { } return(x) } - if (!mult %chin% c("first","last","all")) stopf("mult argument can only be 'first', 'last' or 'all'") + if (!mult %chin% c("first", "last", "all")) stopf("mult argument can only be 'first', 'last' or 'all'") missingroll = missing(roll) if (length(roll)!=1L || is.na(roll)) stopf("roll must be a single TRUE, FALSE, positive/negative integer/double including +Inf and -Inf or 'nearest'") if (is.character(roll)) { @@ -519,13 +519,22 @@ replace_dot_alias = function(e) { if (!byjoin || nqbyjoin) { # Really, `anyDuplicated` in base is AWESOME! # allow.cartesian shouldn't error if a) not-join, b) 'i' has no duplicates - if (verbose) {last.started.at=proc.time();catf("Constructing irows for '!byjoin || nqbyjoin' ... ");flush.console()} - irows = if (allLen1) f__ else vecseq(f__,len__, - if (allow.cartesian || - notjoin || # #698. When notjoin=TRUE, ignore allow.cartesian. Rows in answer will never be > nrow(x). - !anyDuplicated(f__, incomparables = c(0L, NA_integer_))) { - NULL # #742. If 'i' has no duplicates, ignore - } else as.double(nrow(x)+nrow(i))) # rows in i might not match to x so old max(nrow(x),nrow(i)) wasn't enough. But this limit now only applies when there are duplicates present so the reason now for nrow(x)+nrow(i) is just to nail it down and be bigger than max(nrow(x),nrow(i)). + if (verbose) {last.started.at=proc.time();cat("Constructing irows for '!byjoin || nqbyjoin' ... ");flush.console()} + irows = if (allLen1) f__ else { + join.many = getOption("datatable.join.many") # #914, default TRUE for backward compatibility + anyDups = if (!join.many && length(f__)==1L && len__==nrow(x)) { + NULL # special case of scalar i match to const duplicated x, not handled by anyDuplicate: data.table(x=c(1L,1L))[data.table(x=1L), on="x"] + } else if (!notjoin && ( # #698. When notjoin=TRUE, ignore allow.cartesian. Rows in answer will never be > nrow(x). + !allow.cartesian || + !join.many)) + as.logical(anyDuplicated(f__, incomparables = c(0L, NA_integer_))) + limit = if (!is.null(anyDups) && anyDups) { # #742. If 'i' has no duplicates, ignore + if (!join.many) stopf("Joining resulted in many-to-many join. Perform quality check on your data, use mult!='all', or set 'datatable.join.many' option to TRUE to allow rows explosion.") + else if (!allow.cartesian && !notjoin) as.double(nrow(x)+nrow(i)) + else internal_error("checking allow.cartesian and join.many, unexpected else branch reached") # nocov + } + vecseq(f__, len__, limit) + } # rows in i might not match to x so old max(nrow(x),nrow(i)) wasn't enough. But this limit now only applies when there are duplicates present so the reason now for nrow(x)+nrow(i) is just to nail it down and be bigger than max(nrow(x),nrow(i)). if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()} # Fix for #1092 and #1074 # TODO: implement better version of "any"/"all"/"which" to avoid diff --git a/R/mergelist.R b/R/mergelist.R new file mode 100644 index 0000000000..b9bfae0fd4 --- /dev/null +++ b/R/mergelist.R @@ -0,0 +1,338 @@ +cbindlist = function(l, copy=TRUE) { + ans = .Call(Ccbindlist, l, copy) + if (anyDuplicated(names(ans))) { ## invalidate key and index + setattr(ans, "sorted", NULL) + setattr(ans, "index", integer()) + } + setDT(ans) + ans +} + +# when 'on' is missing then use keys, used only for inner and full join +onkeys = function(x, y) { + if (is.null(x) && !is.null(y)) y + else if (!is.null(x) && is.null(y)) x + else if (!is.null(x) && !is.null(y)) { + if (length(x)>=length(y)) intersect(y, x) ## align order to shorter|rhs key + else intersect(x, y) + } else NULL # nocov ## internal error is being called later in mergepair +} +someCols = function(x, cols, drop=character(), keep=character(), retain.order=FALSE) { + keep = colnamesInt(x, keep) + drop = colnamesInt(x, drop) + cols = colnamesInt(x, cols) + ans = union(keep, setdiff(cols, drop)) + if (!retain.order) return(ans) + intersect(colnamesInt(x, NULL), ans) +} +hasindex = function(x, by, retGrp=FALSE) { + index = attr(x, "index", TRUE) + if (is.null(index)) return(FALSE) + idx_name = paste0("__",by,collapse="") + idx = attr(index, idx_name, TRUE) + if (is.null(idx)) return(FALSE) + if (!retGrp) return(TRUE) + return(!is.null(attr(idx, "starts", TRUE))) +} + +# fdistinct applies mult='first|last' +# for mult='first' it is unique(x, by=on)[, c(on, cols), with=FALSE] +# it may not copy when copy=FALSE and x is unique by 'on' +fdistinct = function(x, on=key(x), mult=c("first","last"), cols=seq_along(x), copy=TRUE) { + if (!perhaps.data.table(x)) + stopf("'x' must be data.table") + if (!is.character(on) || !length(on) || anyNA(on) || !all(on %chin% names(x))) + stopf("'on' must be character column names of 'x' argument") + mult = match.arg(mult) + if (is.null(cols)) + cols = seq_along(x) + else if (!(is.character(cols) || is.integer(cols)) || !length(cols) || anyNA(cols)) + stopf("'cols' must be non-zero length, non-NA, integer or character columns of 'x' argument") + if (!isTRUEorFALSE(copy)) + stopf("'%s' must be TRUE or FALSE", "copy") + ## do not compute sort=F for mult="first" if index (sort=T) already available, sort=T is needed only for mult="last" + ## this short circuit will work after #4386 because it requires retGrp=T + #### sort = mult!="first" || hasindex(x, by=on, retGrp=TRUE) + sort = TRUE ## above line does not work for the moment, test 302.02 + o = forderv(x, by=on, sort=sort, retGrp=TRUE) + if (attr(o, "maxgrpn", TRUE) <= 1L) { + ans = .shallow(x, someCols(x, cols, keep=on), retain.key=TRUE) + if (copy) ans = copy(ans) + return(ans) + } + f = attr(o, "starts", exact=TRUE) + if (mult=="last") { + if (!sort) internal_error("sort must be TRUE when computing mult='last'") # nocov + f = c(f[-1L]-1L, nrow(x)) ## last of each group + } + if (length(o)) f = o[f] + if (sort && length(o <- forderv(f))) f = f[o] ## this rolls back to original order + .Call(CsubsetDT, x, f, someCols(x, cols, keep=on)) +} + +# extra layer over bmerge to provide ready to use row indices (or NULL for 1:nrow) +# NULL to avoid extra copies in downstream code, it turned out that avoiding copies precisely is costly and enormously complicates code, need #4409 and/or handle 1:nrow in subsetDT +dtmerge = function(x, i, on, how, mult, join.many, void=FALSE, verbose) { + nomatch = switch(how, "inner"=, "semi"=, "anti"=, "cross"= 0L, "left"=, "right"=, "full"= NA_integer_) + nomatch0 = identical(nomatch, 0L) + if (is.null(mult)) + mult = switch(how, "semi"=, "anti"= "last", "cross"= "all", "inner"=, "left"=, "right"=, "full"= "error") + if (void && mult!="error") + internal_error("void must be used with mult='error'") # nocov + if (how=="cross") { ## short-circuit bmerge results only for cross join + if (length(on) || mult!="all" || !join.many) + stopf("cross join must be used with zero-length on, mult='all', join.many=TRUE") + if (void) + internal_error("cross join must be used with void=FALSE") # nocov + ans = list(allLen1=FALSE, starts=rep.int(1L, nrow(i)), lens=rep.int(nrow(x), nrow(i)), xo=integer()) + } else { + if (!length(on)) + stopf("'on' must be non-zero length character vector") + if (mult=="all" && (how=="semi" || how=="anti")) + stopf("semi and anti joins must be used with mult!='all'") + icols = colnamesInt(i, on, check_dups=TRUE) + xcols = colnamesInt(x, on, check_dups=TRUE) + ans = bmerge(i, x, icols, xcols, roll=0, rollends=c(FALSE, TRUE), nomatch=nomatch, mult=mult, ops=rep.int(1L, length(on)), verbose=verbose) + if (void) { ## void=T is only for the case when we want raise error for mult='error', and that would happen in above line + return(invisible(NULL)) + } else if (how=="semi" || how=="anti") { ## semi and anti short-circuit + irows = which(if (how=="semi") ans$lens!=0L else ans$lens==0L) ## we will subset i rather than x, thus assign to irows, not to xrows + if (length(irows)==length(ans$lens)) irows = NULL + return(list(ans=ans, irows=irows)) + } else if (mult=="all" && !ans$allLen1 && !join.many && ## join.many, like allow.cartesian, check + !(length(ans$starts)==1L && ans$lens==nrow(x)) && ## special case of scalar i match to const duplicated x, not handled by anyDuplicate: data.table(x=c(1L,1L))[data.table(x=1L), on="x"] + anyDuplicated(ans$starts, incomparables=c(0L,NA_integer_)) + ) + stopf("Joining resulted in many-to-many join. Perform quality check on your data, use mult!='all', or set 'datatable.join.many' option to TRUE to allow rows explosion.") + } + + ## xrows, join-to + xrows = if (ans$allLen1) ans$starts else vecseq(ans$starts, ans$lens, NULL) + if (nomatch0 && ans$allLen1) xrows = xrows[as.logical(ans$lens)] + len.x = length(xrows) ## as of now cannot optimize to NULL, search for #4409 here + + ## irows, join-from + irows = if (!(ans$allLen1 && (!nomatch0 || len.x==length(ans$starts)))) seqexp(ans$lens) + len.i = if (is.null(irows)) nrow(i) else length(irows) + + if (length(ans$xo) && length(xrows)) + xrows = ans$xo[xrows] + len.x = length(xrows) + + if (len.i!=len.x) + internal_error("dtmerge out len.i != len.x") # nocov + + return(list(ans=ans, irows=irows, xrows=xrows)) +} + +# atomic join between two tables +mergepair = function(lhs, rhs, on, how, mult, lhs.cols=names(lhs), rhs.cols=names(rhs), copy=TRUE, join.many=TRUE, verbose=FALSE) { + semianti = how=="semi" || how=="anti" + innerfull = how=="inner" || how=="full" + { + if (how!="cross") { + if (is.null(on)) { + if (how=="left" || semianti) on = key(rhs) + else if (how=="right") on = key(lhs) + else if (innerfull) on = onkeys(key(lhs), key(rhs)) + if (is.null(on)) + stopf("'on' is missing and necessary key is not present") + } + if (any(bad.on <- !on %chin% names(lhs))) + stopf("'on' argument specify columns to join [%s] that are not present in LHS table [%s]", brackify(on[bad.on]), brackify(names(lhs))) + if (any(bad.on <- !on %chin% names(rhs))) + stopf("'on' argument specify columns to join [%s] that are not present in RHS table [%s]", brackify(on[bad.on]), brackify(names(rhs))) + } else if (is.null(on)) { + on = character() ## cross join only + } + } ## on + { + if (how!="right") { + jnfm = lhs; fm.cols = lhs.cols; jnto = rhs; to.cols = rhs.cols + } else { + jnfm = rhs; fm.cols = rhs.cols; jnto = lhs; to.cols = lhs.cols + } + } ## join-to and join-from tables and columns (right outer join swap) + + ## ensure symmetric join for inner|full join, apply mult on both tables, bmerge do only 'x' table + cp.i = FALSE ## copy marker of out.i + if ((innerfull) && !is.null(mult) && (mult=="first" || mult=="last")) { + jnfm = fdistinct(jnfm, on=on, mult=mult, cols=fm.cols, copy=FALSE) ## might not copy when already unique by 'on' + cp.i = nrow(jnfm)!=nrow(lhs) ## nrow(lhs) bc how='inner|full' so jnfm=lhs + } else if (how=="inner" && (is.null(mult) || mult=="error")) { ## we do this branch only to raise error from bmerge, we cannot use forder to just find duplicates because those duplicates might not have matching rows in another table, full join checks mult='error' during two non-void bmerges + dtmerge(x=jnfm, i=jnto, on=on, how=how, mult=mult, verbose=verbose, join.many=join.many, void=TRUE) + } + + ## binary merge + ans = dtmerge(x=jnto, i=jnfm, on=on, how=how, mult=mult, verbose=verbose, join.many=join.many) + + ## make i side + out.i = if (is.null(ans$irows)) + .shallow(jnfm, cols=someCols(jnfm, fm.cols, keep=on, retain.order=semianti), retain.key=TRUE) + else + .Call(CsubsetDT, jnfm, ans$irows, someCols(jnfm, fm.cols, keep=on, retain.order=semianti)) + cp.i = cp.i || !is.null(ans$irows) + + ## make x side + if (semianti) { + out.x = list(); cp.x = TRUE + } else { + out.x = if (is.null(ans$xrows)) ## as of now xrows cannot be NULL #4409 thus nocov below + internal_error("dtmerge()$xrows returned NULL, #4409 been resolved but related code has not been updated?") #.shallow(jnto, cols=someCols(jnto, to.cols, drop=on), retain.key=TRUE) # nocov ## as of now nocov does not make difference r-lib/covr#279 + else + .Call(CsubsetDT, jnto, ans$xrows, someCols(jnto, to.cols, drop=on)) + cp.x = !is.null(ans$xrows) + ## ensure no duplicated column names in merge results + if (any(dup.i<-names(out.i) %chin% names(out.x))) + stopf("merge result has duplicated column names, use 'cols' argument or rename columns in 'l' tables, duplicated column(s): %s", brackify(names(out.i)[dup.i])) + } + + ## stack i and x + if (how!="full") { + if (!cp.i && copy) out.i = copy(out.i) + #if (!cp.x && copy) out.x = copy(out.x) ## as of now cp.x always TRUE, search for #4409 here + out = .Call(Ccbindlist, list(out.i, out.x), FALSE) + if (how=="right") setcolorder(out, neworder=c(on, names(out.x))) ## arrange columns: i.on, x.cols, i.cols + } else { # how=="full" + ## we made left join side above, proceed to right join side, so swap tbls + jnfm = rhs; fm.cols = rhs.cols; jnto = lhs; to.cols = lhs.cols + + cp.r = FALSE + if (!is.null(mult) && (mult=="first" || mult=="last")) { + jnfm = fdistinct(jnfm, on=on, mult=mult, cols=fm.cols, copy=FALSE) + cp.r = nrow(jnfm)!=nrow(rhs) ## nrow(rhs) bc jnfm=rhs + } ## mult=="error" check was made on one side already, below we do on the second side, test 101.43 + + ## binary merge anti join + bns = dtmerge(x=jnto, i=jnfm, on=on, how="anti", mult=if (!is.null(mult) && mult!="all") mult, verbose=verbose, join.many=join.many) + + ## make anti join side + out.r = if (is.null(bns$irows)) + .shallow(jnfm, cols=someCols(jnfm, fm.cols, keep=on), retain.key=TRUE) ## retain.key is used only in the edge case when !nrow(out.i) + else + .Call(CsubsetDT, jnfm, bns$irows, someCols(jnfm, fm.cols, keep=on)) + cp.r = cp.r || !is.null(bns$irows) + + ## short circuit to avoid rbindlist to empty sets and retains keys + if (!nrow(out.r)) { ## possibly also !nrow(out.i) + if (!cp.i && copy) out.i = copy(out.i) + #if (!cp.x && copy) out.x = copy(out.x) ## as of now cp.x always TRUE, search for #4409 here + out = .Call(Ccbindlist, list(out.i, out.x), FALSE) + } else if (!nrow(out.i)) { ## but not !nrow(out.r) + if (!cp.r && copy) out.r = copy(out.r) + if (length(add<-setdiff(names(out.i), names(out.r)))) { ## add missing columns of proper types NA + neworder = copy(names(out.i)) #set(out.r, NULL, add, lapply(unclass(out.i)[add], `[`, 1L)) ## 291.04 overalloc exceed fail during set() + out.i = lapply(unclass(out.i)[add], `[`, seq_len(nrow(out.r))) ## could eventually remove this when cbindlist recycle 0 rows up, note that we need out.r not to be copied + out.r = .Call(Ccbindlist, list(out.r, out.i), FALSE) + setcolorder(out.r, neworder=neworder) + } + out = out.r + } else { ## all might have not been copied yet, rbindlist will copy + out.l = .Call(Ccbindlist, list(out.i, out.x), FALSE) + out = rbindlist(list(out.l, out.r), use.names=TRUE, fill=TRUE) + } + } + setDT(out) +} + +mergelist = function(l, on, cols, how=c("left","inner","full","right","semi","anti","cross"), mult, copy=TRUE, join.many=getOption("datatable.join.many")) { + verbose = getOption("datatable.verbose") + if (verbose) + p = proc.time()[[3L]] + { + if (!is.list(l) || is.data.frame(l)) + stopf("'l' must be a list") + if (!all(vapply_1b(l, is.data.table))) + stopf("Every element of 'l' list must be data.table objects") + if (!all(lengths(l))) + stopf("Tables in 'l' argument must be non-zero columns tables") + if (any(vapply_1i(l, function(x) anyDuplicated(names(x))))) + stopf("Some of the tables in 'l' have duplicated column names") + } ## l + if (!isTRUEorFALSE(copy)) + stopf("'%s' must be TRUE or FALSE", "copy") + n = length(l) + if (n<2L) { + out = if (!n) as.data.table(l) else l[[1L]] + if (copy) out = copy(out) + if (verbose) + catf("mergelist: merging %d table(s), took %.3fs\n", n, proc.time()[[3L]]-p) + return(out) + } + { + if (!is.list(join.many)) + join.many = rep(list(join.many), n-1L) + if (length(join.many)!=n-1L || !all(vapply_1b(join.many, isTRUEorFALSE))) + stopf("'join.many' must be TRUE or FALSE, or a list of such which length must be length(l)-1L") + } ## join.many + { + if (missing(mult)) + mult = NULL + if (!is.list(mult)) + mult = rep(list(mult), n-1L) + if (length(mult)!=n-1L || !all(vapply_1b(mult, function(x) is.null(x) || (is.character(x) && length(x)==1L && !anyNA(x) && x %chin% c("error","all","first","last"))))) + stopf("'mult' must be one of [error, all, first, last] or NULL, or a list of such which length must be length(l)-1L") + } ## mult + { + if (missing(how) || is.null(how)) + how = match.arg(how) + if (!is.list(how)) + how = rep(list(how), n-1L) + if (length(how)!=n-1L || !all(vapply_1b(how, function(x) is.character(x) && length(x)==1L && !anyNA(x) && x %chin% c("left","inner","full","right","semi","anti","cross")))) + stopf("'how' must be one of [left, inner, full, right, semi, anti, cross], or a list of such which length must be length(l)-1L") + } ## how + { + if (missing(cols) || is.null(cols)) { + cols = vector("list", n) + } else { + if (!is.list(cols)) + stopf("'%s' must be a list", "cols") + if (length(cols) != n) + stopf("'cols' must be same length as 'l'") + skip = vapply_1b(cols, is.null) + if (!all(vapply_1b(cols[!skip], function(x) is.character(x) && !anyNA(x) && !anyDuplicated(x)))) + stopf("'cols' must be a list of non-zero length, non-NA, non-duplicated, character vectors, or eventually NULLs (all columns)") + if (any(mapply(function(x, icols) !all(icols %chin% names(x)), l[!skip], cols[!skip]))) + stopf("'cols' specify columns not present in corresponding table") + } + } ## cols + { + if (missing(on) || is.null(on)) { + on = vector("list", n-1L) + } else { + if (!is.list(on)) + on = rep(list(on), n-1L) + if (length(on)!=n-1L || !all(vapply_1b(on, function(x) is.character(x) && !anyNA(x) && !anyDuplicated(x)))) ## length checked in dtmerge + stopf("'on' must be non-NA, non-duplicated, character vector, or a list of such which length must be length(l)-1L") + } + } ## on + + l.mem = lapply(l, vapply, address, "") + out = l[[1L]] + out.cols = cols[[1L]] + for (join.i in seq_len(n-1L)) { + rhs.i = join.i + 1L + out = mergepair( + lhs = out, rhs = l[[rhs.i]], + on = on[[join.i]], + how = how[[join.i]], mult = mult[[join.i]], + lhs.cols = out.cols, rhs.cols = cols[[rhs.i]], + copy = FALSE, ## avoid any copies inside, will copy once below + join.many = join.many[[join.i]], + verbose = verbose + ) + out.cols = copy(names(out)) + } + out.mem = vapply_1c(out, address) + if (copy) + .Call(CcopyCols, out, colnamesInt(out, names(out.mem)[out.mem %chin% unique(unlist(l.mem, recursive=FALSE))])) + if (verbose) + catf("mergelist: merging %d tables, took %.3fs\n", n, proc.time()[[3L]]-p) + out +} + +# Previously, we had a custom C implementation here, which is ~2x faster, +# but this is fast enough we don't bother maintaining a new routine. +# Hopefully in the future rep() can recognize the ALTREP and use that, too. +seqexp = function(x) rep(seq_along(x), x) +perhaps.data.table = function(x) .Call(CperhapsDataTableR, x) diff --git a/R/onLoad.R b/R/onLoad.R index ef96849e85..ff8b18c02d 100644 --- a/R/onLoad.R +++ b/R/onLoad.R @@ -82,6 +82,7 @@ "datatable.print.trunc.cols"="FALSE", # for print.data.table "datatable.show.indices"="FALSE", # for print.data.table "datatable.allow.cartesian"="FALSE", # datatable. + "datatable.join.many"="TRUE", # mergelist, [.data.table #4383 #914 "datatable.dfdispatchwarn"="TRUE", # not a function argument "datatable.warnredundantby"="TRUE", # not a function argument "datatable.alloccol"="1024L", # argument 'n' of alloc.col. Over-allocate 1024 spare column slots diff --git a/inst/tests/mergelist.Rraw b/inst/tests/mergelist.Rraw new file mode 100644 index 0000000000..422d8d7097 --- /dev/null +++ b/inst/tests/mergelist.Rraw @@ -0,0 +1,338 @@ +require(methods) + +if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { + if ((tt<-compiler::enableJIT(-1))>0) + cat("This is dev mode and JIT is enabled (level ", tt, ") so there will be a brief pause around the first test.\n", sep="") +} else { + require(data.table) + test = data.table:::test + mergepair = data.table:::mergepair + perhaps.data.table = data.table:::perhaps.data.table + hasindex = data.table:::hasindex + fdistinct = data.table:::fdistinct + forderv = data.table:::forderv +} + +addresses = function(x) vapply(x, address, "") +copied = function(ans, l) { + all(!addresses(ans) %chin% unlist(recursive=FALSE, lapply(l, addresses))) +} +notcopied = function(ans, l, how="left", unless=character()) { + if (how %chin% unless) return(copied(ans, l)) ## used during looping tests for easier escape + if (how=="full") return( ## either side, left|right, notcopied is fine + all(addresses(l[[1L]]) %chin% addresses(ans)) || all(addresses(l[[length(l)]]) %chin% addresses(ans)) + ) + all(addresses(if (how=="right") l[[length(l)]] else l[[1L]]) %chin% addresses(ans)) +} + +# internal helpers + +test(1.01, perhaps.data.table(list())) +test(1.02, perhaps.data.table(list(a=1:2))) +test(1.03, perhaps.data.table(list(a=1:2, b=1:2))) +test(1.04, perhaps.data.table(list(1:2, 1:2)), FALSE) + +test(2.01, fdistinct(list(x=c(1L,1:2), b=1:2), on="x", mult="last"), error="must be data.table") +test(2.02, fdistinct(data.table(x=c(1L,1:2)), on="z", mult="last"), error="must be character column names of") +test(2.03, fdistinct(data.table(x=c(1L,1:2)), on="x", mult="last", cols=character()), error="must be non-zero length, non-NA, integer or character columns of") +test(2.04, fdistinct(data.table(x=c(1L,1:2, y=1:3)), on="x", mult="last", copy=NA), error="must be TRUE or FALSE") +d = data.table(x=1:2, y=1:2) +test(2.05, ans<-fdistinct(d, on="x", mult="last"), d) +test(2.06, intersect(addresses(ans), addresses(d)), character()) +test(2.07, ans<-fdistinct(d, on="x", mult="last", copy=FALSE), d) +test(2.08, addresses(ans), addresses(d)) +d = data.table(x=c(2:1,2L), y=1:3) +test(2.09, fdistinct(d, on="x", mult="first"), data.table(x=2:1, y=1:2)) +test(2.10, fdistinct(d, on="x", mult="last"), data.table(x=1:2, y=2:3)) +setattr(attr(setattr(d, "index", integer()), "index", TRUE), "__x", forderv(d, "x", retGrp=TRUE)) ## retGrp=T index #4386 +test(2.11, fdistinct(d, on="x", mult="first"), data.table(x=2:1, y=1:2)) + +test(3.01, hasindex(d, "x")) +test(3.02, hasindex(d, "x", retGrp=TRUE)) +setattr(attr(setattr(d, "index", integer()), "index", TRUE), "__x", forderv(d, "x")) ## retGrp=F index #4386 +test(3.03, hasindex(d, "x")) +test(3.04, !hasindex(d, "x", retGrp=TRUE)) +setattr(d, "index", NULL) +test(3.05, !hasindex(d, "x")) +test(3.06, !hasindex(d, "x", retGrp=TRUE)) +setattr(d, "index", integer()) +test(3.07, !hasindex(d, "x")) +test(3.08, !hasindex(d, "x", retGrp=TRUE)) +rm(d) + +# cbindlist + +l = list( + d1 = data.table(x=1:3, v1=1L), + d2 = data.table(y=3:1, v2=2L), + d3 = data.table(z=2:4, v3=3L) +) +ans = cbindlist(l) +expected = data.table(l$d1, l$d2, l$d3) +test(11.01, ans, expected) +test(11.02, intersect(addresses(ans), addresses(expected)), character()) +ans = cbindlist(l, copy=FALSE) +expected = setDT(c(l$d1, l$d2, l$d3)) +test(11.03, ans, expected) +test(11.04, length(intersect(addresses(ans), addresses(expected))), ncol(expected)) +test(11.05, cbindlist(list(data.table(a=1L), data.table(), data.table(d=2L), data.table(f=3L))), data.table(a=1L,d=2L,f=3L)) +rm(expected) +## codecov +test(12.01, cbindlist(data.frame(a=1L), data.frame(b=1L)), error="must be a list") +test(12.02, cbindlist(TRUE, FALSE), error="must be a list") +test(12.03, cbindlist(list(), NA), error="must be TRUE or FALSE") +test(12.04, cbindlist(list(data.table(a=1L), 1L)), error="is not of data.table type") +test(12.05, options = c(datatable.verbose=TRUE), cbindlist(list(data.table(a=1:2), data.table(b=1:2))), data.table(a=1:2, b=1:2), output="cbindlist.*took") +test(12.06, cbindlist(list(data.table(), data.table(a=1:2), data.table(b=1:2))), data.table(a=1:2, b=1:2)) +test(12.07, cbindlist(list(data.table(), data.table(a=1:2), list(b=1:2))), data.table(a=1:2, b=1:2)) +test(12.08, cbindlist(list(data.table(a=integer()), list(b=integer()))), data.table(a=integer(), b=integer())) +## duplicated names +test(12.09, cbindlist(list(data.table(a=1L, b=2L), data.table(b=3L, d=4L))), data.table(a=1L, b=2L, b=3L, d=4L)) +ans = cbindlist(list(setindexv(data.table(a=2:1, b=1:2),"a"), data.table(a=1:2, b=2:1, key="a"), data.table(a=2:1, b=1:2))) +test(12.10, ans, data.table(a=2:1, b=1:2, a=1:2, b=2:1, a=2:1, b=1:2)) +test(12.11, indices(ans), NULL) +## recycling, first ensure cbind recycling that we want to match to +test(12.12, cbind(data.table(x=integer()), data.table(a=1:2)), data.table(x=c(NA_integer_,NA), a=1:2)) +test(12.13, cbind(data.table(x=1L), data.table(a=1:2)), data.table(x=c(1L,1L), a=1:2)) +test(12.14, cbindlist(list(data.table(a=integer()), data.table(b=1:2))), error="recycling.*not yet implemented") +test(12.15, cbindlist(list(data.table(a=1L), data.table(b=1:2))), error="recycling.*not yet implemented") +test(12.16, cbindlist(list(data.table(a=integer()), data.table(b=1:2)), copy=FALSE), error="has to have equal nrow") +test(12.17, cbindlist(list(data.table(a=1L), data.table(b=1:2)), copy=FALSE), error="has to have equal nrow") + +## retain indices +d = data.table(x=1:2, y=2:1, z=2:1, v1=1:2) ## ensure setDT will retain key and indices when it is called on the list, bc Ccbindlist returns list +setkeyv(d, "x"); setindexv(d, list("y", "z")) +a = attributes(d) +attributes(d) = a[!names(a) %in% c("class",".internal.selfref","row.names")] +test(13.01, class(d), "list") +setDT(d) +test(13.02, key(d), "x") +test(13.03, hasindex(d, "y") && hasindex(d, "z")) +l = list( + data.table(id1=1:5, id2=5:1, id3=1:5, v1=1:5), + data.table(id4=5:1, id5=1:5, v2=1:5), + data.table(id6=5:1, id7=1:5, v3=1:5), + data.table(id8=5:1, id9=5:1, v4=1:5) +) +setkeyv(l[[1L]], "id1"); setindexv(l[[1L]], list("id1", "id2", "id3", c("id1","id2","id3"))); setindexv(l[[3L]], list("id6", "id7")); setindexv(l[[4L]], "id9") +ii = lapply(l, indices) +ans = cbindlist(l) +test(13.04, key(ans), "id1") +test(13.05, indices(ans), c("id1","id2","id3","id1__id2__id3","id6","id7","id9")) +test(13.06, ii, lapply(l, indices)) ## this tests that original indices have not been touched, shallow_duplicate in mergeIndexAttrib + +# mergepair + +## test copy-ness argument in mergepair + +### LHS equal to RHS: no copy in all cases +num = 21.000 +l = list( + lhs = data.table(id1=1:2, v1=1:2), + rhs = data.table(id1=1:2, v2=1:2) +) +expected = data.table(id1=1:2, v1=1:2, v2=1:2) +for (how in c("inner","left","right","full")) { + num = trunc(num*10)/10 + 0.1 + for (mult in c("all","first","last","error")) { + num = trunc(num*100)/100 + 0.01 + test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected) ## copy=TRUE: no shared columns + test(num<-num+0.001, copied(ans, l)) + test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected) ## copy=FALSE: LHS shared but no RHS + test(num<-num+0.001, notcopied(ans, l, how=how)) + } +} +### RHS includes LHS: no copy in inner, left, right +num = 22.000 +unless = "full" +l = list( + lhs = data.table(id1=1:2, v1=1:2), + rhs = data.table(id1=1:3, v2=1:3) +) +expected = list( + inner = data.table(id1=1:2, v1=1:2, v2=1:2), + left = data.table(id1=1:2, v1=1:2, v2=1:2), + right = data.table(id1=1:3, v1=c(1:2,NA), v2=1:3), + full = data.table(id1=1:3, v1=c(1:2,NA), v2=1:3) +) +for (how in c("inner","left","right","full")) { + num = trunc(num*10)/10 + 0.1 + for (mult in c("all","first","last","error")) { + num = trunc(num*100)/100 + 0.01 + test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected[[how]]) + test(num<-num+0.001, copied(ans, l)) + test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected[[how]]) + test(num<-num+0.001, notcopied(ans, l, how=how, unless=unless)) + } +} +### LHS includes RHS: no copy in left, right, full +num = 23.000 +unless = "inner" +l = list( + lhs = data.table(id1=1:3, v1=1:3), + rhs = data.table(id1=1:2, v2=1:2) +) +expected = list( + inner = data.table(id1=1:2, v1=1:2, v2=1:2), + left = data.table(id1=1:3, v1=1:3, v2=c(1:2,NA)), + right = data.table(id1=1:2, v1=1:2, v2=1:2), + full = data.table(id1=1:3, v1=1:3, v2=c(1:2,NA)) +) +for (how in c("inner","left","right","full")) { + num = trunc(num*10)/10 + 0.1 + for (mult in c("all","first","last","error")) { + num = trunc(num*100)/100 + 0.01 + test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected[[how]]) + test(num<-num+0.001, copied(ans, l)) + test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected[[how]]) + test(num<-num+0.001, notcopied(ans, l, how=how, unless=unless)) + } +} +### LHS single nonmatch RHS on both sides: no copy in left, right +num = 24.000 +unless = c("inner","full") +l = list( + lhs = data.table(id1=3:1, v1=1:3), + rhs = data.table(id1=c(4L,2:1), v2=1:3) +) +expected = list( + inner = data.table(id1=2:1, v1=2:3, v2=2:3), + left = data.table(id1=3:1, v1=1:3, v2=c(NA,2:3)), + right = data.table(id1=c(4L,2:1), v1=c(NA,2:3), v2=1:3), + full = data.table(id1=c(3:1,4L), v1=c(1:3,NA), v2=c(NA,2:3,1L)) +) +for (how in c("inner","left","right","full")) { + num = trunc(num*10)/10 + 0.1 + for (mult in c("all","first","last","error")) { + num = trunc(num*100)/100 + 0.01 + test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected[[how]]) + test(num<-num+0.001, copied(ans, l)) + test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected[[how]]) + test(num<-num+0.001, notcopied(ans, l, how=how, unless=unless)) + } +} +### LHS zero match RHS: no copy in left, right +num = 25.000 +unless = c("inner","full") +l = list( + lhs = data.table(id1=2:1, v1=1:2), + rhs = data.table(id1=3:4, v2=1:2) +) +expected = list( + inner = data.table(id1=integer(), v1=integer(), v2=integer()), + left = data.table(id1=2:1, v1=1:2, v2=c(NA_integer_,NA)), + right = data.table(id1=3:4, v1=c(NA_integer_,NA), v2=1:2), + full = data.table(id1=c(2:1,3:4), v1=c(1:2,NA,NA), v2=c(NA,NA,1:2)) +) +for (how in c("inner","left","right","full")) { + num = trunc(num*10)/10 + 0.1 + for (mult in c("all","first","last","error")) { + num = trunc(num*100)/100 + 0.01 + test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected[[how]]) + test(num<-num+0.001, copied(ans, l)) + test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected[[how]]) + test(num<-num+0.001, notcopied(ans, l, how=how, unless=unless)) + } +} +### LHS and RHS zero nrow: no copies +num = 26.000 +unless = character() +l = list( + lhs = data.table(id1=integer(), v1=integer()), + rhs = data.table(id1=integer(), v2=integer()) +) +expected = list( + inner = data.table(id1=integer(), v1=integer(), v2=integer()), + left = data.table(id1=integer(), v1=integer(), v2=integer()), + right = data.table(id1=integer(), v1=integer(), v2=integer()), + full = data.table(id1=integer(), v1=integer(), v2=integer()) +) +for (how in c("inner","left","right","full")) { + num = trunc(num*10)/10 + 0.1 + for (mult in c("all","first","last","error")) { + num = trunc(num*100)/100 + 0.01 + test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected[[how]]) + test(num<-num+0.001, copied(ans, l)) + test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected[[how]]) + test(num<-num+0.001, notcopied(ans, l, how=how, unless=unless)) + } +} +### LHS has zero nrow: no copies +num = 27.000 +unless = character() +l = list( + lhs = data.table(id1=integer(), v1=integer()), + rhs = data.table(id1=2:1, v2=1:2) +) +expected = list( + inner = data.table(id1=integer(), v1=integer(), v2=integer()), + left = data.table(id1=integer(), v1=integer(), v2=integer()), + right = data.table(id1=2:1, v1=c(NA_integer_,NA), v2=1:2), + full = data.table(id1=2:1, v1=c(NA_integer_,NA), v2=1:2) +) +for (how in c("inner","left","right","full")) { + num = trunc(num*10)/10 + 0.1 + for (mult in c("all","first","last","error")) { + num = trunc(num*100)/100 + 0.01 + test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected[[how]]) + test(num<-num+0.001, copied(ans, l)) + test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected[[how]]) + test(num<-num+0.001, notcopied(ans, l, how=how, unless=unless)) + } +} +### RHS has zero nrow +num = 28.000 +unless = "inner" +l = list( + lhs = data.table(id1=2:1, v1=1:2), + rhs = data.table(id1=integer(), v2=integer()) +) +expected = list( + inner = data.table(id1=integer(), v1=integer(), v2=integer()), + left = data.table(id1=2:1, v1=1:2, v2=c(NA_integer_,NA)), + right = data.table(id1=integer(), v1=integer(), v2=integer()), + full = data.table(id1=2:1, v1=1:2, v2=c(NA_integer_,NA)) +) +for (how in c("inner","left","right","full")) { + num = trunc(num*10)/10 + 0.1 + for (mult in c("all","first","last","error")) { + num = trunc(num*100)/100 + 0.01 + test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected[[how]]) + test(num<-num+0.001, copied(ans, l)) + test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected[[how]]) + test(num<-num+0.001, notcopied(ans, l, how=how, unless=unless)) + } +} + +## fdistinct, another round + +dt = data.table(x = +c(74L, 103L, 158L, 250L, 56L, 248L, 260L, 182L, 174L, 17L, 57L, + 49L, 189L, 106L, 212L, 137L, 198L, 273L, 105L, 214L, 258L, 59L, + 180L, 35L, 74L, 107L, 4L, 106L, 240L, 94L, 133L, 165L, 136L, + 52L, 228L, 184L, 219L, 30L, 200L, 114L, 226L, 178L, 216L, 153L, + 146L, 218L, 7L, 132L, 202L, 191L, 132L, 237L, 121L, 68L, 20L, + 28L, 87L, 143L, 183L, 112L, 252L, 81L, 127L, 92L, 179L, 71L, + 132L, 211L, 24L, 241L, 94L, 231L, 96L, 92L, 131L, 246L, 238L, + 108L, 214L, 265L, 120L, 196L, 110L, 90L, 209L, 56L, 196L, 34L, + 68L, 40L, 66L, 17L, 177L, 241L, 215L, 220L, 126L, 113L, 223L, + 167L, 181L, 98L, 75L, 273L, 175L, 59L, 36L, 132L, 255L, 165L, + 269L, 202L, 99L, 119L, 41L, 4L, 197L, 29L, 123L, 177L, 273L, + 137L, 134L, 48L, 208L, 125L, 141L, 58L, 63L, 164L, 159L, 22L, + 10L, 177L, 256L, 165L, 155L, 145L, 271L, 140L, 188L, 166L, 66L, + 71L, 201L, 125L, 49L, 206L, 29L, 238L, 170L, 154L, 91L, 125L, + 138L, 50L, 146L, 21L, 77L, 59L, 79L, 247L, 123L, 215L, 243L, + 114L, 18L, 93L, 200L, 93L, 174L, 232L, 236L, 108L, 105L, 247L, + 178L, 204L, 167L, 249L, 81L, 53L, 244L, 139L, 242L, 53L, 209L, + 200L, 260L, 151L, 196L, 107L, 28L, 256L, 78L, 163L, 31L, 232L, + 88L, 216L, 74L, 61L, 143L, 74L, 50L, 143L, 155L, 36L, 71L, 198L, + 265L, 28L, 210L, 261L, 226L, 85L, 179L, 263L, 263L, 94L, 73L, + 46L, 89L, 141L, 255L, 141L, 71L, 13L, 115L, 235L, 96L, 37L, 103L, + 174L, 108L, 190L, 190L, 153L, 119L, 125L, 85L, 160L, 251L, 40L, + 115L, 59L, 118L, 37L, 127L, 260L, 210L, 257L, 130L, 166L, 134L, + 30L, 69L, 138L, 103L, 258L, 145L, 88L, 77L, 217L, 194L, 46L, + 18L, 208L, 171L, 47L, 18L, 30L, 105L, 47L, 83L) +) +ans = unique(dt, by="x") +test(301.01, data.table(x=unique(dt$x)), ans) ## OK +test(301.02, fdistinct(dt, on="x"), ans) ## force sort=TRUE for the moment diff --git a/inst/tests/other.Rraw b/inst/tests/other.Rraw index 044d82cfa0..b47cf23a42 100644 --- a/inst/tests/other.Rraw +++ b/inst/tests/other.Rraw @@ -1,4 +1,4 @@ -pkgs = c("ggplot2", "hexbin", "plyr", "dplyr", "caret", "zoo", "xts", "gdata", "nlme", "bit64", "knitr", "parallel", "sf", "nanotime", "R.utils", "yaml") +pkgs = c("ggplot2", "hexbin", "plyr", "dplyr", "caret", "zoo", "xts", "gdata", "nlme", "bit64", "knitr", "parallel", "sf", "nanotime", "R.utils", "yaml", "DBI", "RSQLite") # First expression of this file must be as above: .gitlab-ci.yml uses parse(,n=1L) to read one expression from this file and installs pkgs. # So that these dependencies of other.Rraw are maintained in a single place. # TEST_DATA_TABLE_WITH_OTHER_PACKAGES is off by default so this other.Rraw doesn't run on CRAN. It is run by GLCI, locally in dev, and by @@ -766,3 +766,297 @@ if (loaded[["nanotime"]]) { # respect dec=',' for nanotime, related to #6446, corresponding to tests 2281.* test(31, fwrite(data.table(as.nanotime(.POSIXct(0))), dec=',', sep=';'), output="1970-01-01T00:00:00,000000000Z") } + +# NB: currently, RSQLite requires DBI, so partially redundant, but future-proof. +if (loaded[["DBI"]] && loaded[["RSQLite"]]) { + # mergelist join tester vs SQLite, based on v1.9.8 non-equi join tester + + # funs ---- + + # produce SQL statement + # ln, rn: lhs names, rhs names, symmult: symmetric mult + mult_all = function(tbl, cols, ...) sprintf( + "(\n SELECT %s FROM %s\n) %s", + paste(setdiff(cols,"row_id"), collapse=", "), tbl, tbl + ) + mult_one = function(tbl, cols, on, mult) sprintf( + "(SELECT %s FROM (\n SELECT *, ROW_NUMBER() OVER (PARTITION BY %s ORDER BY row_id %s) AS rownum FROM %s\n) %s WHERE rownum=1) %s", + paste(setdiff(cols,c("row_id","rownum")), collapse=", "), + paste(on, collapse=", "), + if (mult=="first") "ASC" else "DESC", + tbl, tbl, tbl + ) + sql = function(how, on, mult, ln, rn, symmult=FALSE, notjoin=FALSE) { + stopifnot(length(on)==1L) + # building sql query + if (how=="full") { + return(sprintf( + "%s\nUNION ALL\n%s", + sql("left", on, mult, ln, rn, symmult=mult%in%c("first","last")), + sql("right", on, mult, ln, rn, symmult=mult%in%c("first","last"), notjoin=TRUE) + )) + } + nm = list() + nm[["lhs"]] = ln; nm[["rhs"]] = rn + using = sprintf("USING (%s)", paste(on, collapse=", ")) + lhs = "lhs"; rhs = "rhs" + join = if (how=="inner") { + if (mult=="all") sprintf("%s\nINNER JOIN\n%s\n%s", mult_all(lhs, nm[[lhs]]), mult_all(rhs, nm[[rhs]]), using) + else sprintf("%s\nINNER JOIN\n%s\n%s", mult_one(lhs, nm[[lhs]], on, mult), mult_one(rhs, nm[[rhs]], on, mult), using) + } else if (how=="left") { + if (mult=="all") sprintf("%s\nLEFT JOIN\n%s\n%s", mult_all(lhs, nm[[lhs]]), mult_all(rhs, nm[[rhs]]), using) + else sprintf("%s\nLEFT JOIN\n%s\n%s", (if (symmult) mult_one else mult_all)(lhs, nm[[lhs]], on, mult), mult_one(rhs, nm[[rhs]], on, mult), using) + } else if (how=="right") { ## lhs-rhs swap happens here, mult_one is applied on new rhs + if (mult=="all") sprintf("%s\nLEFT JOIN\n%s\n%s", mult_all(rhs, nm[[rhs]]), mult_all(lhs, nm[[lhs]]), using) + else sprintf("%s\nLEFT JOIN\n%s\n%s", (if (symmult) mult_one else mult_all)(rhs, nm[[rhs]], on, mult), mult_one(lhs, nm[[lhs]], on, mult), using) + } + if (how=="right") {lhs = "rhs"; rhs = "lhs"} ## this name swap is for notjoin and select below + where = if (!notjoin) "" else sprintf("\nWHERE %s IS NULL", paste(rhs, on, sep=".")) + select = sprintf("%s, %s, %s", paste(lhs, on, sep="."), + paste("lhs", setdiff(nm[["lhs"]], c("row_id",on)),sep=".",collapse=", "), + paste("rhs", setdiff(nm[["rhs"]], c("row_id",on)),sep=".",collapse=", ")) + sprintf("SELECT %s FROM\n%s%s", select, join, where) + } + + # .conn SQLite connection, if provided it will use it instead of creating temporary one + # .drop logical TRUE (default) will drop db tables before and after and populate new, when FALSE it expects tables to be populated + join.sql.equal = function(l, on, how="inner", mult="all", allow.cartesian=TRUE, .conn, .drop=TRUE, .debug=interactive(), ans, err=FALSE) { + nm = names(l) + stopifnot(is.null(nm) || identical(nm, c("x","i")) || identical(nm, c("lhs","rhs"))) + names(l) = c("lhs","rhs") + lhs = l[["lhs"]]; rhs = l[["rhs"]] + stopifnot(is.data.table(lhs), is.data.table(rhs), + is.character(how), is.character(mult), length(mult)==1L, + is.character(on), + is.logical(allow.cartesian), is.logical(.drop)) + if (err && mult=="error") { + dt = try(silent=TRUE, mergelist(list(lhs, rhs), on=on, how=how, mult=mult)) + if (!inherits(dt, "try-error")) { + if (.debug) browser() + stop("no error returned from mergelist(mult='error') but err flag set to TRUE in join.sql.equal") + } + err_msg = "mult='error' and multiple matches during merge" + if (!identical(attr(dt, "condition", TRUE)[["message"]], err_msg)) { + if (.debug) browser() + stop("different error returned than expected: ", attr(dt, "condition", TRUE)[["message"]]) + } + return(TRUE) + } + # row_id column required as SQL is not ordered, creating on R side + if (!"row_id" %in% names(lhs)) lhs = copy(lhs)[, "row_id" := seq_len(.N)] + if (!"row_id" %in% names(rhs)) rhs = copy(rhs)[, "row_id" := seq_len(.N)] + # preparing sql environment + conn = if (new.conn <- missing(.conn)) DBI::dbConnect(RSQLite::SQLite()) else .conn + if (.drop) { + try(suppressWarnings(DBI::dbSendQuery(conn, "DROP TABLE lhs;")), silent=TRUE) + try(suppressWarnings(DBI::dbSendQuery(conn, "DROP TABLE rhs;")), silent=TRUE) + DBI::dbWriteTable(conn, name="lhs", value=lhs) + DBI::dbWriteTable(conn, name="rhs", value=rhs) + } + # building sql query + s = sql(how, on, mult, names(lhs), names(rhs)) + s = paste0(s,";\n") + # run data.table and SQLite + dt = mergelist(list(lhs[,!"row_id"], rhs[,!"row_id"]), on=on, how=how, mult=mult) + sq = try(silent=TRUE, as.data.table(DBI::dbGetQuery(conn, s))) + if (inherits(sq, "try-error")) { + if (.debug) {message("error during sql statement"); browser()} + stop("error during sql statement") + } + if (!is.data.table(dt) || !is.data.table(sq)) { + if (.debug) {message("dt and sq must be data.table already"); browser()} + stop("dt and sq must be data.table already") + } + if (how %in% c("inner","full")) { + dt2 = mergelist(list(rhs[,!"row_id"], lhs[,!"row_id"]), on=on, how=how, mult=mult) + setcolorder(dt2, neworder=names(dt)) + setattr(dt, "index", integer()) + setattr(dt2, "index", integer()) + r = all.equal(dt, dt2, ignore.row.order=TRUE) + ## check it is symetric + if (!isTRUE(r)) { + if (.debug) {message("mergelist is not symmetric for ", how); browser()} + stop("mergelist is not symmetric for ", how) + } + } + setattr(sq, "index", integer()) + setattr(dt, "index", integer()) + # compare results + a = all.equal(dt, sq, ignore.row.order=TRUE) + b = all.equal(dt, sq, ignore.row.order=TRUE, ignore.col.order=TRUE) + if (!missing(ans)) { + r = all.equal(ans, sq, ignore.row.order=TRUE) + if (!isTRUE(r)) { + if (.debug) browser() + stop("sql does not match to reference answer") + } + } + if (.drop) { + suppressWarnings(DBI::dbSendQuery(conn, "DROP TABLE lhs;")) + suppressWarnings(DBI::dbSendQuery(conn, "DROP TABLE rhs;")) + } + if (new.conn) suppressWarnings(DBI::dbDisconnect(conn)) + if (isTRUE(b) && !isTRUE(a)) { + if (.debug) browser() + stop("only column order mismatch") + } + if (!isTRUE(a)) { + if (.debug) browser() + cat(sep="\n",c( + sprintf("# dtq:\nmergelist(l, on='%s', how='%s', mult='%s')", paste(on, collapse=", "), how, mult), + sprintf("# sql:\n%s", s), + a, "\n")) + } + isTRUE(a) + } + + batch.join.sql.equal = function(cases, on, hows=c("inner","left","right","full"), mults=c("all","first","last"), .debug=FALSE) { + if ("error" %in% mults) stop("mult=error is not supported") + p = proc.time()[[3L]] + conn = DBI::dbConnect(RSQLite::SQLite()) + ans = list() + dup_n = 0L + for (case in cases) { + l = data(case) + stopifnot(c("lhs","rhs") %in% names(l)) + case = as.character(case) + lhs = l$lhs; rhs = l$rhs + ans[[case]] = list() + # reuse tables, to test if affects sqlite efficiency + try(suppressWarnings(DBI::dbSendQuery(conn, "DROP TABLE lhs;")), silent = TRUE) + try(suppressWarnings(DBI::dbSendQuery(conn, "DROP TABLE rhs;")), silent = TRUE) + # row_id column required as SQL is not ordered, creating on R side + if (!"row_id" %in% names(lhs)) lhs = copy(lhs)[, "row_id" := seq_len(.N)] + if (!"row_id" %in% names(rhs)) rhs = copy(rhs)[, "row_id" := seq_len(.N)] + DBI::dbWriteTable(conn, name="lhs", value=lhs) + DBI::dbWriteTable(conn, name="rhs", value=rhs) + len = prod(length(cases), length(hows), length(mults)) + if (len > (len.warn <- getOption("tests.length.warning", 1e3))) + warning(sprintf("You are about to run %s number of tests. To suppress this warning use 'tests.length.warning' option, set to numeric threshold or Inf.", len.warn)) + for (how in hows) { + ans[[case]][[how]] = list() + for (mult in mults) { + if (!is.null(ans[[case]][[how]][[mult]])) { + dup_n = dup_n+1L + next #warning("Some tests are duplicated, so far ", dup_n) + } + ans[[case]][[how]][[mult]] = join.sql.equal(list(lhs=lhs, rhs=rhs), on=on, how=how, mult=mult, .conn=conn, .drop=FALSE, .debug=.debug) + } + } + suppressWarnings(DBI::dbSendQuery(conn, "DROP TABLE lhs;")) + suppressWarnings(DBI::dbSendQuery(conn, "DROP TABLE rhs;")) + } + suppressWarnings(DBI::dbDisconnect(conn)) + cat(sprintf("batch.join.sql.equal: %s%s tests completed in %.1fs\n", + len, if (dup_n) sprintf(" (%s duplicated)", dup_n) else "", proc.time()[[3L]] - p)) + ans + } + data = function(case) { + set.seed(108) + if (case == 1L) { # 2 match + lhs = data.table(id = c(1L,5L,3L,7L), v1=1:4) + rhs = data.table(id = c(2L,4L,3L,5L), v2=1:4) + } else if (case == 2L) { # 4 match + lhs = data.table(id = c(1L,5L,3L,7L), v1=1:4) + rhs = data.table(id = c(7L,5L,3L,1L), v2=1:4) + } else if (case == 3L) { # 1 match + lhs = data.table(id =c(1L,5L,3L,7L), v1=1:4) + rhs = data.table(id =c(1L,2L,4L,6L), v2=1:4) + } else if (case == 4L) { # 0 match + lhs = data.table(id =c(1L,5L,3L,7L), v1=1:4) + rhs = data.table(id =c(0L,2L,4L,6L), v2=1:4) + } else if (case == 5L) { # 0 match dup + lhs = data.table(id =c(1L,5L,3L,7L), v1=1:4) + rhs = data.table(id =c(0L,2L,2L,6L), v2=1:4) + } else if (case == 6L) { # 1 match dup + lhs = data.table(id =c(1L,5L,3L,7L), v1=1:4) + rhs = data.table(id =c(1L,2L,2L,6L), v2=1:4) + } else if (case == 7L) { # 1 match dup match + lhs = data.table(id =c(1L,5L,3L,7L), v1=1:4) + rhs = data.table(id =c(3L,3L,4L,6L), v2=1:4) + } else if (case == 8L) { # 2 match 2 dup match + lhs = data.table(id =c(1L,5L,3L,7L), v1=1:4) + rhs = data.table(id = c(3L,3L,7L,7L), v2=1:4) + } else if (case == 9L) { # 2 dup 2 dup + lhs = data.table(id = c(1L,5L,1L,5L), v1=1:4) + rhs = data.table(id = c(5L,5L,1L,1L), v2=1:4) + } else if (case == 10L) { # 4 dup 4 dup match + lhs = data.table(id = c(1L,1L,1L,1L), v1=1:4) + rhs = data.table(id = c(1L,1L,1L,1L), v2=1:4) + } else if (case == 11L) { # 4 dup 4 dup nomatch + lhs = data.table(id = c(1L,1L,1L,1L), v1=1:4) + rhs = data.table(id = c(2L,2L,2L,2L), v2=1:4) + } else if (case == 12L) { # no match, no overlap + lhs = data.table(id = c(1:4), v1=1:4) + rhs = data.table(id = c(6:9), v2=1:4) + } else if (case == 13L) { # all i matches + lhs = data.table(id = c(1L,5L,3L,7L,9L), v1=1:5) + rhs = data.table(id = c(7L,5L,3L,1L), v2=1:4) + } else if (case == 14L) { # dup match and 1 non-match + ## inner join short circuit test + ## what if some row is excluded but another is duplicated? nrow(i) match + lhs = data.table(id = c(1L,5L,3L,7L,3L), v1=1:5) + rhs = data.table(id = c(7L,5L,3L,2L), v2=1:4) + } else if (case == 15L) { + # does not raise error on mult="error" because dups '13' does not have matching rows! + lhs = data.table(id = as.integer(c(17,14,11,10,5,1,19,7,16,15)), v1=1:10) + rhs = data.table(id = as.integer(c(6,20,13,1,8,13,3,10,17,9)), v2=1:10) + } else if (case == 16L) { + lhs = data.table(id = sample(10L, 10L, TRUE), v1=1:10) + rhs = data.table(id = sample(10L, 10L, TRUE), v2=1:10) + } else if (case == 17L) { + lhs = data.table(id = sample(1e2L, 1e2L, TRUE), v1=1:1e2) + rhs = data.table(id = sample(1e2L, 1e2L, TRUE), v2=1:1e2) + } else if (case == 18L) { + lhs = data.table(id = sample(1e2L, 1e2L, TRUE), v1=1:1e2) + rhs = data.table(id = sample(10L, 20L, TRUE), v2=1:1e2) + } else if (case==19L) { + lhs = as.data.table(list(id=sample(1e3), v1=1:1e3)) + rhs = as.data.table(list(id=sample(1e3), v2=1:1e3)) + } else if (case==20L) { + lhs = as.data.table(list(id=sample(1e3*2L, 1e3), v1=1:1e3)) + rhs = as.data.table(list(id=sample(1e3*2L, 1e3), v2=1:1e3)) + } else if (case==21L) { + lhs = as.data.table(list(id=sample(1e3, 1e3*2L, TRUE), v1=1:1e3)) + rhs = as.data.table(list(id=sample(1e3, 1e3*2L, TRUE), v2=1:1e3)) + } else if (case==22L) { ## LHS equals RHS + lhs = data.table(id=1:2, v1=1:2) + rhs = data.table(id=1:2, v2=1:2) + } else if (case==23L) { ## cross join + lhs = data.table(id=c(1L,1L), v1=1:2) + rhs = data.table(id=c(1L,1L), v2=1:2) + } else if (case==24L) { ## cartesian match, dups on both sides of match + lhs = data.table(id=c(1L,1:2), v1=1:3) + rhs = data.table(id=c(1L,1L,3L), v2=1:3) + } else if (case==25L) { ## duplicates in RHS + lhs = data.table(id=1:2, v1=1:2) + rhs = data.table(id=c(2L,2:3), v2=1:3) + } else if (case==26L) { ## duplicates in RHS and LHS, some RHS dups does not have matches in LHS (merge.data.table+mult fails) + lhs = data.table(id=c(1:3,3L), v1=1:4) + rhs = data.table(id=c(1L,1L,3:4,4L), v2=1:5) + } else if (case==27L) { ## duplicates in RHS and LHS, some LHS dups does not have matches in RHS + lhs = data.table(id=c(1L,1L,3:4,4L), v1=1:5) + rhs = data.table(id=c(1:3,3L), v2=1:4) + } else if (case==28L) { ## duplicates in RHS and LHS + lhs = data.table(id=c(1:3,3L), v1=1:4) + rhs = data.table(id=c(1L,1L,3:4), v2=1:4) + } else if (case==29L) { ## duplicates in RHS + lhs = data.table(id=1:2, v1=1:2) + rhs = data.table(id=c(2L,2:3), v2=1:3) + } else if (case==30L) { ## duplicates in LHS + lhs = data.table(id=c(1:2,2L), v1=1:3) + rhs = data.table(id=2:3, v2=1:2) + } else if (case==31L) { + lhs = data.table(id=integer(), v1=integer()) + rhs = data.table(id=integer(), v2=integer()) + } else stop("case not found") + list(lhs=lhs, rhs=rhs) + } + + # tests ---- + + y = batch.join.sql.equal(cases=1:31, on="id", hows=c("inner","left","right","full"), mults=c("all","first","last"), .debug=interactive()) + y = rapply(y, isTRUE) + if (!all(y)) + stop(sprintf("join tests failed for %s cases:\n%s", sum(!y), paste(" ", names(y)[!y], collapse="\n"))) +} diff --git a/man/cbindlist.Rd b/man/cbindlist.Rd new file mode 100644 index 0000000000..5a780e99ad --- /dev/null +++ b/man/cbindlist.Rd @@ -0,0 +1,36 @@ +\name{cbindlist} +\alias{cbindlist} +\alias{cbind} +\alias{cbind.data.table} +\title{Column bind multiple data.tables} +\description{ + Column bind multiple \code{data.table}s. +} +\usage{ + cbindlist(l, copy=TRUE) +} +\arguments{ + \item{l}{ \code{list} of \code{data.table}s to merge. } + \item{copy}{ \code{logical}, decides if columns has to be copied into resulting object (default) or just referred. } +} +\details{ + Column bind only stacks input elements. Works like \code{\link{data.table}}, but takes \code{list} type on input. Zero-column tables in \code{l} are omitted. Tables in \code{l} should have matching row count; recycling of length-1 rows is not yet implemented. Indices of the input tables are transferred to the resulting table, as well as the \emph{key} of the first keyed table. +} +\value{ + A new \code{data.table} based on the stacked objects. Eventually when \code{copy} is \code{FALSE}, then resulting object will share columns with \code{l} tables. +} +\note{ + If output object has any duplicate names, then key and indices are removed. +} +\seealso{ + \code{\link{data.table}}, \code{\link{rbindlist}} +} +\examples{ +l = list( + d1 = data.table(x=1:3, v1=1L), + d2 = data.table(y=3:1, v2=2L), + d3 = data.table(z=2:4, v3=3L) +) +cbindlist(l) +} +\keyword{ data } diff --git a/man/mergelist.Rd b/man/mergelist.Rd new file mode 100644 index 0000000000..bfee1aae11 --- /dev/null +++ b/man/mergelist.Rd @@ -0,0 +1,189 @@ +\name{mergelist} +\alias{mergelist} +\title{Merge multiple data.tables} +\description{ + Faster merge of multiple \code{data.table}s. +} +\usage{ + mergelist(l, on, cols, + how = c("left","inner","full","right","semi","anti","cross"), + mult, copy = TRUE, + join.many = getOption("datatable.join.many")) +} +\arguments{ + \item{l}{ \code{list} of \code{data.table}s to merge. } + \item{on}{ \code{character} vector of column names to merge on; when missing, the \emph{key} of \emph{join-to} table is used. } + \item{cols}{ \code{list} of \code{character} column names corresponding to tables in \code{l}, used to subset columns during merges. } + \item{how}{ \code{character} scalar, controls how to merge tables. Allowed values are \code{"left"} (default), \code{"inner"}, \code{"full"}, \code{"right"}, \code{"semi"}, \code{"anti"}, \code{"cross"}. See Details. } + \item{mult}{ \code{character} scalar, controls how to proceed when multiple rows in \emph{join-to} table match to the row in \emph{join-from} table. Allowed values are \code{"error"}, \code{"all"}, \code{"first"}, \code{"last"}. Default depends on \code{how}, described in \emph{details} below. See examples on how to detect duplicated matches. Using \code{"all"} is recommended together with \code{join.many=FALSE}, unless rows explosion or cartesian product are intended. } + \item{copy}{ \code{logical}, defaults to \code{TRUE}, when \code{FALSE}, then resulting object may share columns with tables in \code{l}, depending on matches. } + \item{join.many}{ \code{logical}, defaults to \code{getOption("datatable.join.many")}, which is \code{TRUE} by default; when \code{FALSE} and \code{mult="all"}, then extra check is made to ensure no \emph{many-to-many} matches exist between tables, and if they exist, then exception is raised. Works similarly to \code{allow.cartesian} option in \code{[.data.table} but is more strict. An option \code{"datatable.join.many"} controls that globally for \code{mergelist} and \code{[.data.table}. } +} +\details{ + Function should be considered experimental. Users are encouraged to provide feedback in our issue tracker. + + Merging is performed sequentially, for \code{l} of 3 tables, it will do something like \code{merge(merge(l[[1L]], l[[2L]]), l[[3L]])}. Merging does not support \emph{non-equi joins}, column names to merge on must be common in both tables on each merge. + + Arguments \code{on}, \code{how}, \code{mult}, \code{join.many} could be lists as well, each of length \code{length(l)-1L}, to provide argument to be used for each single tables pair to merge, see examples. + + Terms \emph{join-to} and \emph{join-from} depends on \code{how} argument: + \enumerate{ + \item{ \code{how="left|semi|anti"}: \emph{join-to} is \emph{RHS}, \emph{join-from} is \emph{LHS}. } + \item{ \code{how="inner|full|cross"}: treats \emph{LHS} and \emph{RHS} tables equally, terms applies to both tables. } + \item{ \code{how="right"}: \emph{join-to} is \emph{LHS}, \emph{join-from} is \emph{RHS}. } + } + + Using \code{mult="error"} will raise exception when multiple rows in \emph{join-to} table match to the row in \emph{join-from} table. It should not be used to just detect duplicates, as duplicates might not have matching row, and in such case exception will not be raised. + + Default value for argument \code{mult} depends on \code{how} argument: + \enumerate{ + \item{ \code{how="left|inner|full|right"}: sets \code{mult="error"}. } + \item{ \code{how="semi|anti"}: sets \code{mult="last"}, although works same as \code{mult="first"}. } + \item{ \code{how="cross"}: sets \code{mult="all"}. } + } + + When \code{on} argument is missing, then columns to join on will be decided based on \emph{key} depending on \code{how} argument: + \enumerate{ + \item{ \code{how="left|right|semi|anti"}: key columns of \emph{join-to} table. } + \item{ \code{how="inner|full"}: if only one table has key, then this key is used, if both tables have key, then \code{intersect(key(lhs), key(rhs))}, having its order aligned to shorter key. } + } + + When joining tables that are not directly linked to single table, e.g. snowflake schema, \emph{right} outer join can be used to optimize the sequence of merges, see examples. +} +\value{ + A new \code{data.table} based on the merged objects. +} +\note{ + Using \code{how="inner|full"} together with \code{mult!="all"} is sub-efficient. Unlike during join in \code{[.data.table}, it will apply \code{mult} on both tables. It is to ensure that the join is symmetric so \emph{LHS} and \emph{RHS} tables can be swapped, regardless of \code{mult} argument. It is always possible to apply \code{mult}-like filter manually and join using \code{mult="all"}. + + Using \code{join.many=FALSE} is sub-efficient. Note that it only takes effect when \code{mult="all"}. If input data are verified to not have duplicated matches, then this can safely use the default \code{TRUE}. Otherwise for \code{mult="all"} merges it is recommended to use \code{join.many=FALSE}, unless of course \emph{many-to-many} join, that duplicates rows, is intended. +} +\seealso{ + \code{\link{[.data.table}}, \code{\link{merge.data.table}} +} +\examples{ +l = list( + data.table(id1 = c(1:4,2:5), v1 = 1:8), + data.table(id1 = 2:3, v2 = 1:2), + data.table(id1 = 3:5, v3 = 1:3) +) +mergelist(l, on="id1") + +## using keys +l = list( + data.table(id1 = c(1:4,2:5), v1 = 1:8), + data.table(id1 = 3:5, id2 = 1:3, v2 = 1:3, key="id1"), + data.table(id2 = 1:4, v3 = 4:1, key="id2") +) +mergelist(l) + +## select columns +l = list( + data.table(id1 = c(1:4,2:5), v1 = 1:8, v2 = 8:1), + data.table(id1 = 3:5, v3 = 1:3, v4 = 3:1, v5 = 1L, key="id1") +) +mergelist(l, cols = list(NULL, c("v3","v5"))) + +## different arguments for each merge pair +l = list( + data.table(id1=1:4, id2=4:1), + data.table(id1=c(1:3,1:2), v2=c(1L,1L,1:2,2L)), + data.table(id2=4:5) +) +mergelist(l, + on = list("id1", "id2"), ## first merge on id1, second on id2 + how = list("inner", "anti"), ## first inner join, second anti join + mult = list("last", NULL)) ## use default 'mult' in second join + +## detecting duplicates matches +l = list( + data.table(id1=c(1:4,2:5), v1=1:8), ## dups in LHS are fine + data.table(id1=c(2:3,2L), v2=1:3), ## dups in RHS + data.table(id1=3:5, v3=1:3) +) +#mergelist(l, on="id1") # ERROR: mult='error' and multiple matches during merge +lapply(l[-1L], `[`, j = if (.N>1L) .SD, by = "id1") ## duplicated rows + +## 'star schema' and 'snowflake schema' examples + +### populate fact: US population by state and date + +gt = state.x77[,"Population"] +gt = data.table(state_id=seq_along(state.name), p=gt[state.name]/sum(gt), k=1L) +tt = as.IDate(paste0(as.integer(time(uspop)),"-01-01")) +tt = as.data.table(stats::approx(tt, c(uspop), tt[1L]:tt[length(tt)])) +tt = tt[, .(date=as.IDate(x), date_id=seq_along(x), pop=y, k=1L)] +fact = tt[gt, on="k", allow.cartesian=TRUE, + .(state_id=i.state_id, date_id=x.date_id, population = x.pop * i.p)] +setkeyv(fact, c("state_id","date_id")) + +### populate dimensions: time and geography + +time = data.table(key = "date_id", + date_id = seq_along(tt$date), date = tt$date, + month_id = month(tt$date), month = month.name[month(tt$date)], + year_id = year(tt$date)-1789L, year = as.character(year(tt$date)), + week_id = week(tt$date), week = as.character(week(tt$date)), + weekday_id = wday(tt$date)-1L, weekday = weekdays(tt$date) +)[weekday_id==0L, weekday_id:=7L][] +geog = data.table(key = "state_id", + state_id = seq_along(state.name), state_abb=state.abb, state_name=state.name, + division_id = as.integer(state.division), + division_name = as.character(state.division), + region_id = as.integer(state.region), + region_name = as.character(state.region) +) +rm(gt, tt) + +### denormalize 'star schema' + +l = list(fact, time, geog) +ans = mergelist(l) + +rm(l, ans) + +### turn 'star schema' into 'snowflake schema' + +make.lvl = function(x, cols) { + stopifnot(is.data.table(x)) + lvl = x[, unique(.SD), .SDcols=cols] + setkeyv(lvl, cols[1L]) + setindexv(lvl, as.list(cols)) +} +time = list( + date = make.lvl(time, c("date_id","date","year_id","month_id","week_id", + "weekday_id")), + weekday = make.lvl(time, c("weekday_id","weekday")), + week = make.lvl(time, c("week_id","week")), + month = make.lvl(time, c("month_id","month")), + year = make.lvl(time, c("year_id","year")) +) +geog = list( + state = make.lvl(geog, c("state_id","state_abb","state_name","division_id")), + division = make.lvl(geog, c("division_id","division_name","region_id")), + region = make.lvl(geog, c("region_id","region_name")) +) + +### denormalize 'snowflake schema' + +#### left join all +l = c(list(fact=fact), time, geog) +ans = mergelist(l) + +rm(ans) +#### merge hierarchies alone, reduce sizes in merges of geog dimension +ans = mergelist(list( + fact, + mergelist(time), + mergelist(rev(geog), how="right") +)) + +rm(ans) +#### same but no unnecessary copies +ans = mergelist(list( + fact, + mergelist(time, copy=FALSE), + mergelist(rev(geog), how="right", copy=FALSE) +)) +} +\keyword{ data } diff --git a/src/bmerge.c b/src/bmerge.c index f6f640e711..f0e55dc02d 100644 --- a/src/bmerge.c +++ b/src/bmerge.c @@ -29,7 +29,7 @@ static SEXP nqgrp; static int ncol, *o, *xo, *retFirst, *retLength, *retIndex, *allLen1, *allGrp1, *rollends, ilen, anslen; static int *op, nqmaxgrp; static int ctr, nomatch; // populating matches for non-equi joins -enum {ALL, FIRST, LAST} mult = ALL; +enum {ALL, FIRST, LAST, ERR} mult = ALL; static double roll, rollabs; static Rboolean rollToNearest=FALSE; #define XIND(i) (xo ? xo[(i)]-1 : i) @@ -49,8 +49,10 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP r // iArg, xArg, icolsArg and xcolsArg idtVec = SEXPPTR_RO(idt); // set globals so bmerge_r can see them. xdtVec = SEXPPTR_RO(xdt); - if (!isInteger(icolsArg)) internal_error(__func__, "icols is not integer vector"); // # nocov - if (!isInteger(xcolsArg)) internal_error(__func__, "xcols is not integer vector"); // # nocov + if (!isInteger(icolsArg)) + internal_error(__func__, "icols is not integer vector"); // # nocov + if (!isInteger(xcolsArg)) + internal_error(__func__, "xcols is not integer vector"); // # nocov if ((LENGTH(icolsArg)==0 || LENGTH(xcolsArg)==0) && LENGTH(idt)>0) // We let through LENGTH(i) == 0 for tests 2126.* internal_error(__func__, "icols and xcols must be non-empty integer vectors"); if (LENGTH(icolsArg) > LENGTH(xcolsArg)) internal_error(__func__, "length(icols) [%d] > length(xcols) [%d]", LENGTH(icolsArg), LENGTH(xcolsArg)); // # nocov @@ -60,10 +62,14 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP r iN = ilen = anslen = LENGTH(idt) ? LENGTH(VECTOR_ELT(idt,0)) : 0; ncol = LENGTH(icolsArg); // there may be more sorted columns in x than involved in the join for(int col=0; colLENGTH(idt) || icols[col]<1) error(_("icols[%d]=%d outside range [1,length(i)=%d]"), col, icols[col], LENGTH(idt)); - if (xcols[col]>LENGTH(xdt) || xcols[col]<1) error(_("xcols[%d]=%d outside range [1,length(x)=%d]"), col, xcols[col], LENGTH(xdt)); + if (icols[col]==NA_INTEGER) + internal_error(__func__, "icols[%d] is NA", col); // # nocov + if (xcols[col]==NA_INTEGER) + internal_error(__func__, "xcols[%d] is NA", col); // # nocov + if (icols[col]>LENGTH(idt) || icols[col]<1) + internal_error(__func__, "icols[%d]=%d outside range [1,length(i)=%d]", col, icols[col], LENGTH(idt)); // # nocov. Should have been caught already. + if (xcols[col]>LENGTH(xdt) || xcols[col]<1) + internal_error(__func__, "xcols[%d]=%d outside range [1,length(x)=%d]", col, xcols[col], LENGTH(xdt)); // # nocov int it = TYPEOF(VECTOR_ELT(idt, icols[col]-1)); int xt = TYPEOF(VECTOR_ELT(xdt, xcols[col]-1)); if (iN && it!=xt) @@ -75,11 +81,14 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP r // rollArg, rollendsArg roll = 0.0; rollToNearest = FALSE; if (isString(rollarg)) { - if (strcmp(CHAR(STRING_ELT(rollarg,0)),"nearest") != 0) error(_("roll is character but not 'nearest'")); - if (ncol>0 && TYPEOF(VECTOR_ELT(idt, icols[ncol-1]-1))==STRSXP) error(_("roll='nearest' can't be applied to a character column, yet.")); + if (strcmp(CHAR(STRING_ELT(rollarg, 0)), "nearest") != 0) + internal_error(__func__, "roll is character but not 'nearest'"); // # nocov. Only [.data.table exposes roll= directly, and this is already checked there. + if (ncol>0 && TYPEOF(VECTOR_ELT(idt, icols[ncol-1]-1))==STRSXP) + error(_("roll='nearest' can't be applied to a character column, yet.")); roll=1.0; rollToNearest=TRUE; // the 1.0 here is just any non-0.0, so roll!=0.0 can be used later } else { - if (!isReal(rollarg)) internal_error(__func__, "roll is not character or double"); // # nocov + if (!isReal(rollarg)) + internal_error(__func__, "roll is not character or double"); // # nocov roll = REAL(rollarg)[0]; // more common case (rolling forwards or backwards) or no roll when 0.0 } rollabs = fabs(roll); @@ -98,10 +107,16 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP r } // mult arg - if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "all")) mult = ALL; - else if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "first")) mult = FIRST; - else if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "last")) mult = LAST; - else internal_error(__func__, "invalid value for 'mult'"); // # nocov + if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "all")) + mult = ALL; + else if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "first")) + mult = FIRST; + else if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "last")) + mult = LAST; + else if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "error")) + mult = ERR; + else + internal_error(__func__, "invalid value for 'mult'"); // # nocov // opArg if (!isInteger(opArg) || length(opArg)!=ncol) @@ -132,7 +147,8 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP r retLength = R_Calloc(anslen, int); retIndex = R_Calloc(anslen, int); // initialise retIndex here directly, as next loop is meant for both equi and non-equi joins - for (int j=0; j1) allLen1[0] = FALSE; + if (len>1) { + if (mult==ALL) + allLen1[0] = FALSE; // bmerge()$allLen1 + } if (nqmaxgrp == 1) { - const int rf = (mult!=LAST) ? xlow+2-rollLow : xupp+rollUpp; // extra +1 for 1-based indexing at R level - const int rl = (mult==ALL) ? len : 1; + const int rf = (mult!=LAST) ? xlow+2-rollLow : xupp+rollUpp; // bmerge()$starts thus extra +1 for 1-based indexing at R level + const int rl = (mult==ALL) ? len : 1; // bmerge()$lens for (int j=ilow+1; j1 && mult==ERR already checked, no dup matches, continue as mult=ALL // for this irow, we've matches on more than one group allGrp1[0] = FALSE; retFirst[ctr+ilen] = xlow+2; @@ -428,7 +450,7 @@ void bmerge_r(int xlowIn, int xuppIn, int ilowIn, int iuppIn, int col, int thisg } } else { // none of the groups so far have filled in for this index. So use it! - if (mult == ALL) { + if (mult == ALL || mult == ERR) { retFirst[k] = xlow+2; retLength[k] = len; retIndex[k] = k+1; diff --git a/src/data.table.h b/src/data.table.h index e597fb0d45..70f5a9bb12 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -258,6 +258,12 @@ SEXP islockedR(SEXP x); bool need2utf8(SEXP x); SEXP coerceUtf8IfNeeded(SEXP x); SEXP coerceAs(SEXP x, SEXP as, SEXP copyArg); +int n_rows(SEXP x); +int n_columns(SEXP x); +bool isDataTable(SEXP x); +bool isRectangularList(SEXP x); +bool perhapsDataTable(SEXP x); +SEXP perhapsDataTableR(SEXP x); void internal_error(const char *call_name, const char *format, ...); // types.c @@ -278,6 +284,10 @@ SEXP substitute_call_arg_namesR(SEXP expr, SEXP env); //negate.c SEXP notchin(SEXP x, SEXP table); +// mergelist.c +SEXP cbindlist(SEXP x, SEXP copyArg); +SEXP copyCols(SEXP x, SEXP cols); + // functions called from R level .Call/.External and registered in init.c // these now live here to pass -Wstrict-prototypes, #5477 // all arguments must be SEXP since they are called from R level diff --git a/src/init.c b/src/init.c index 0f1a76c3db..02959bdcd6 100644 --- a/src/init.c +++ b/src/init.c @@ -149,6 +149,9 @@ R_CallMethodDef callMethods[] = { {"CstartsWithAny", (DL_FUNC)&startsWithAny, -1}, {"CconvertDate", (DL_FUNC)&convertDate, -1}, {"Cnotchin", (DL_FUNC)¬chin, -1}, +{"Ccbindlist", (DL_FUNC) &cbindlist, -1}, +{"CperhapsDataTableR", (DL_FUNC) &perhapsDataTableR, -1}, +{"CcopyCols", (DL_FUNC) ©Cols, -1}, {"Cwarn_matrix_column_r", (DL_FUNC)&warn_matrix_column_r, -1}, {NULL, NULL, 0} }; diff --git a/src/mergelist.c b/src/mergelist.c new file mode 100644 index 0000000000..60c508977e --- /dev/null +++ b/src/mergelist.c @@ -0,0 +1,98 @@ +#include "data.table.h" + +// set(x, NULL, cols, copy(unclass(x)[cols])) ## but keeps the index +SEXP copyCols(SEXP x, SEXP cols) { + // used in R/mergelist.R + if (!isDataTable(x)) + error("'x' must be a data.table"); // # nocov + if (!isInteger(cols)) + error("'cols' must be integer"); // # nocov + int nx = length(x), ncols = LENGTH(cols), *colsp = INTEGER(cols); + if (!nx || !ncols) + return R_NilValue; + for (int i=0; ilimit) error(_("Join results in %d rows; more than %d = nrow(x)+nrow(i). Check for duplicate key values in i each of which join to the same group in x over and over again. If that's ok, try by=.EACHI to run j for each group to avoid the large allocation. If you are sure you wish to proceed, rerun with allow.cartesian=TRUE. Otherwise, please search for this error message in the FAQ, Wiki, Stack Overflow and data.table issue tracker for advice."), reslen, (int)limit); + if (limit<0) + error(_("clamp must be positive")); // # nocov + if (reslen>limit) + error(_("Join results in %d rows; more than %d = nrow(x)+nrow(i). Check for duplicate key values in i each of which join to the same group in x over and over again. If that's ok, try by=.EACHI to run j for each group to avoid the large allocation. If you are sure you wish to proceed, rerun with allow.cartesian=TRUE. Otherwise, please search for this error message in the FAQ, Wiki, Stack Overflow and data.table issue tracker for advice."), reslen, (int)limit); } SEXP ans = PROTECT(allocVector(INTSXP, reslen)); int *ians = INTEGER(ans); diff --git a/tests/mergelist.R b/tests/mergelist.R new file mode 100644 index 0000000000..4884087c3d --- /dev/null +++ b/tests/mergelist.R @@ -0,0 +1,2 @@ +require(data.table) +test.data.table(script="mergelist.Rraw")