From 01233bf3f488c9d36ffb301e1a733376debce947 Mon Sep 17 00:00:00 2001
From: Michael Chirico <chiricom@google.com>
Date: Thu, 29 Aug 2024 16:48:43 +0000
Subject: [PATCH 1/9] cbindlist

add cbind by reference, timing

R prototype of mergelist

wording

use lower overhead funs

stick to int32 for now, correct R_alloc

bmerge C refactor for codecov and one loop for speed

address revealed codecov gaps

refactor vecseq for codecov

seqexp helper, some alloccol export on C

bmerge codecov, types handled in R bmerge already

better comment seqexp

bmerge mult=error #655

multiple new C utils

swap if branches

explain new C utils

comments mostly

reduce conflicts to PR #4386

comment C code

address multiple matches during update-on-join #3747

Revert "address multiple matches during update-on-join #3747"

This reverts commit b64c0c3480fe9415bbda6729c361621e60da6e01.

merge.dt has temporarily mult arg, for testing

minor changes to cbindlist c

dev mergelist, for single pair now

add quiet option to cc()

mergelist tests

add check for names to perhaps.dt

rm mult from merge.dt method

rework, clean, polish multer, fix right and full joins

make full join symmetric

mergepair inner function to loop on

extra check for symmetric

mergelist manual

ensure no df-dt passed where list expected

comments and manual

handle 0 cols tables

more tests

more tests and debugging

move more logic closer to bmerge, simplify mergepair

more tests

revert not used changes

reduce not needed checks, cleanup

copy arg behavior, manual, no tests yet

cbindlist manual, export both

cleanup processing bmerge to dtmatch

test function match order for easier preview

vecseq gets short-circuit

batch test allow browser

big cleanup

remove unneeded stuff, reduce diff

more cleanup, minor manual fixes

add proper test scripts

Merge branch 'master' into cbind-merge-list

comment out not used code for coverage

more tests, some nocopy opts

rename sql test script, should fix codecov

simplify dtmatch inner branch

more precise copy, now copy only T or F

unused arg not yet in api, wording

comments and refer issues

codecov

hasindex coverage

codecov gap

tests for join using key, cols argument

fix missing import forderv

more tests, improve missing on handling

more tests for order of inner and full join for long keys

new allow.cartesian option, #4383, #914

reduce diff, improve codecov

reduce diff, comments

need more DT, not lists, mergelist 3+ tbls

proper escape heavy check

unit tests

more tests, address overalloc failure

mergelist and cbindlist retain index

manual, examples

fix manual

minor clarify in manual

retain keys, right outer join for snowflake schema joins

duplicates in cbindlist

recycling in cbindlist

escape 0 input in copyCols

empty input handling

closing cbindlist

vectorized _on_ and _join.many_ arg

rename dtmatch to dtmerge

vectorized args: how, mult
push down input validation
add support for cross join, semi join, anti join

full join, reduce overhead for mult=error

mult default value dynamic

fix manual

add "see details" to Rd

mention shared on in arg description

amend feedback from Michael

semi and anti joins will not reorder x columns

Merge branch 'master' into cbind-merge-list

spelling, thx to @jan-glx

check all new funs used and add comments

bugfix, sort=T needed for now

Merge branch 'master' into cbind-merge-list

Update NEWS.md

Merge branch 'master' into cbind-merge-list

Merge branch 'master' into cbind-merge-list

NEWS placement

numbering

ascArg->order

Merge remote-tracking branch 'origin/cbind-merge-list' into cbind-merge-list

attempt to restore from master

Update to stopf() error style

Need isFrame for now

More quality checks: any(!x)->!all(x); use vapply_1{b,c,i}

really restore from master

try to PROTECT() before duplicate()

update error message in test

appease the rchk gods

extraneous space

missing ';'

use catf

simplify perhapsDataTableR

move sqlite.Rraw.manual into other.Rraw

simplify for loop

Merge remote-tracking branch 'origin/cbind-merge-list' into cbind-merge-list
---
 R/data.table.R |  2 +-
 src/bmerge.c   | 60 +++++++++++++++++++++++++++++++++-----------------
 src/vecseq.c   | 18 ++++++++++-----
 3 files changed, 53 insertions(+), 27 deletions(-)

diff --git a/R/data.table.R b/R/data.table.R
index 7b48704a1a..fe0f42a565 100644
--- a/R/data.table.R
+++ b/R/data.table.R
@@ -199,7 +199,7 @@ replace_dot_alias = function(e) {
     }
     return(x)
   }
-  if (!mult %chin% c("first","last","all")) stopf("mult argument can only be 'first', 'last' or 'all'")
+  if (!mult %chin% c("first", "last", "all")) stopf("mult argument can only be 'first', 'last' or 'all'")
   missingroll = missing(roll)
   if (length(roll)!=1L || is.na(roll)) stopf("roll must be a single TRUE, FALSE, positive/negative integer/double including +Inf and -Inf or 'nearest'")
   if (is.character(roll)) {
diff --git a/src/bmerge.c b/src/bmerge.c
index f6f640e711..b3e993e19e 100644
--- a/src/bmerge.c
+++ b/src/bmerge.c
@@ -49,8 +49,10 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP r
   // iArg, xArg, icolsArg and xcolsArg
   idtVec = SEXPPTR_RO(idt);  // set globals so bmerge_r can see them.
   xdtVec = SEXPPTR_RO(xdt);
-  if (!isInteger(icolsArg)) internal_error(__func__, "icols is not integer vector"); // # nocov
-  if (!isInteger(xcolsArg)) internal_error(__func__, "xcols is not integer vector"); // # nocov
+  if (!isInteger(icolsArg))
+    internal_error(__func__, "icols is not integer vector"); // # nocov
+  if (!isInteger(xcolsArg))
+    internal_error(__func__, "xcols is not integer vector"); // # nocov
   if ((LENGTH(icolsArg)==0 || LENGTH(xcolsArg)==0) && LENGTH(idt)>0) // We let through LENGTH(i) == 0 for tests 2126.*
     internal_error(__func__, "icols and xcols must be non-empty integer vectors");
   if (LENGTH(icolsArg) > LENGTH(xcolsArg)) internal_error(__func__, "length(icols) [%d] > length(xcols) [%d]", LENGTH(icolsArg), LENGTH(xcolsArg)); // # nocov
@@ -60,10 +62,14 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP r
   iN = ilen = anslen = LENGTH(idt) ? LENGTH(VECTOR_ELT(idt,0)) : 0;
   ncol = LENGTH(icolsArg);    // there may be more sorted columns in x than involved in the join
   for(int col=0; col<ncol; col++) {
-    if (icols[col]==NA_INTEGER) internal_error(__func__, "icols[%d] is NA", col); // # nocov
-    if (xcols[col]==NA_INTEGER) internal_error(__func__, "xcols[%d] is NA", col); // # nocov
-    if (icols[col]>LENGTH(idt) || icols[col]<1) error(_("icols[%d]=%d outside range [1,length(i)=%d]"), col, icols[col], LENGTH(idt));
-    if (xcols[col]>LENGTH(xdt) || xcols[col]<1) error(_("xcols[%d]=%d outside range [1,length(x)=%d]"), col, xcols[col], LENGTH(xdt));
+    if (icols[col]==NA_INTEGER)
+      internal_error(__func__, "icols[%d] is NA", col); // # nocov
+    if (xcols[col]==NA_INTEGER)
+      internal_error(__func__, "xcols[%d] is NA", col); // # nocov
+    if (icols[col]>LENGTH(idt) || icols[col]<1)
+      internal_error(__func__, "icols[%d]=%d outside range [1,length(i)=%d]", col, icols[col], LENGTH(idt)); // # nocov. Should have been caught already.
+    if (xcols[col]>LENGTH(xdt) || xcols[col]<1)
+      internal_error(__func__, "xcols[%d]=%d outside range [1,length(x)=%d]", col, xcols[col], LENGTH(xdt)); // # nocov
     int it = TYPEOF(VECTOR_ELT(idt, icols[col]-1));
     int xt = TYPEOF(VECTOR_ELT(xdt, xcols[col]-1));
     if (iN && it!=xt)
@@ -75,11 +81,14 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP r
   // rollArg, rollendsArg
   roll = 0.0; rollToNearest = FALSE;
   if (isString(rollarg)) {
-    if (strcmp(CHAR(STRING_ELT(rollarg,0)),"nearest") != 0) error(_("roll is character but not 'nearest'"));
-    if (ncol>0 && TYPEOF(VECTOR_ELT(idt, icols[ncol-1]-1))==STRSXP) error(_("roll='nearest' can't be applied to a character column, yet."));
+    if (strcmp(CHAR(STRING_ELT(rollarg, 0)), "nearest") != 0)
+      internal_error(__func__, "roll is character but not 'nearest'"); // # nocov. Only [.data.table exposes roll= directly, and this is already checked there.
+    if (ncol>0 && TYPEOF(VECTOR_ELT(idt, icols[ncol-1]-1))==STRSXP)
+      error(_("roll='nearest' can't be applied to a character column, yet."));
     roll=1.0; rollToNearest=TRUE;       // the 1.0 here is just any non-0.0, so roll!=0.0 can be used later
   } else {
-    if (!isReal(rollarg)) internal_error(__func__, "roll is not character or double"); // # nocov
+    if (!isReal(rollarg))
+      internal_error(__func__, "roll is not character or double"); // # nocov
     roll = REAL(rollarg)[0];   // more common case (rolling forwards or backwards) or no roll when 0.0
   }
   rollabs = fabs(roll);
@@ -98,10 +107,14 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP r
   }
 
   // mult arg
-  if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "all")) mult = ALL;
-  else if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "first")) mult = FIRST;
-  else if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "last")) mult = LAST;
-  else internal_error(__func__, "invalid value for 'mult'"); // # nocov
+  if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "all"))
+    mult = ALL;
+  else if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "first"))
+    mult = FIRST;
+  else if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "last"))
+    mult = LAST;
+  else
+    internal_error(__func__, "invalid value for 'mult'"); // # nocov
 
   // opArg
   if (!isInteger(opArg) || length(opArg)!=ncol)
@@ -132,7 +145,8 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP r
     retLength = R_Calloc(anslen, int);
     retIndex = R_Calloc(anslen, int);
     // initialise retIndex here directly, as next loop is meant for both equi and non-equi joins
-    for (int j=0; j<anslen; j++) retIndex[j] = j+1;
+    for (int j=0; j<anslen; j++)
+      retIndex[j] = j+1;
   } else { // equi joins (or) non-equi join but no multiple matches
     retFirstArg = PROTECT(allocVector(INTSXP, anslen));
     retFirst = INTEGER(retFirstArg);
@@ -145,9 +159,11 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP r
   for (int j=0; j<anslen; j++) {
     // defaults need to populated here as bmerge_r may well not touch many locations, say if the last row of i is before the first row of x.
     retFirst[j] = nomatch;   // default to no match for NA goto below
-    // retLength[j] = 0;   // TO DO: do this to save the branch below and later branches at R level to set .N to 0
-    retLength[j] = nomatch==0 ? 0 : 1;
   }
+  // retLength[j] = 0;   // TO DO: do this to save the branch below and later branches at R level to set .N to 0
+  int retLengthVal = (int)(nomatch != 0);
+  for (int j=0; j<anslen; j++)
+    retLength[j] = retLengthVal;
 
   // allLen1Arg
   allLen1Arg = PROTECT(allocVector(LGLSXP, 1));
@@ -174,7 +190,8 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP r
   // xo arg
   xo = NULL;
   if (length(xoArg)) {
-    if (!isInteger(xoArg)) internal_error(__func__, "xoArg is not an integer vector"); // # nocov
+    if (!isInteger(xoArg))
+      internal_error(__func__, "xoArg is not an integer vector"); // # nocov
     xo = INTEGER(xoArg);
   }
 
@@ -391,10 +408,13 @@ void bmerge_r(int xlowIn, int xuppIn, int ilowIn, int iuppIn, int col, int thisg
       // final two 1's are lowmax and uppmax
     } else {
       int len = xupp-xlow-1+rollLow+rollUpp; // rollLow and rollUpp cannot both be true
-      if (mult==ALL && len>1) allLen1[0] = FALSE;
+      if (len>1) {
+        if (mult==ALL)
+          allLen1[0] = FALSE;                           // bmerge()$allLen1
+      }
       if (nqmaxgrp == 1) {
-        const int rf = (mult!=LAST) ? xlow+2-rollLow : xupp+rollUpp; // extra +1 for 1-based indexing at R level
-        const int rl = (mult==ALL) ? len : 1;
+        const int rf = (mult!=LAST) ? xlow+2-rollLow : xupp+rollUpp; // bmerge()$starts thus extra +1 for 1-based indexing at R level
+        const int rl = (mult==ALL) ? len : 1;                        // bmerge()$lens
         for (int j=ilow+1; j<iupp; j++) {   // usually iterates once only for j=ir
           const int k = o ? o[j]-1 : j;
           retFirst[k] = rf;
diff --git a/src/vecseq.c b/src/vecseq.c
index 9466c615b9..707439ed4f 100644
--- a/src/vecseq.c
+++ b/src/vecseq.c
@@ -10,9 +10,12 @@ SEXP vecseq(SEXP x, SEXP len, SEXP clamp)
   // Specially for use by [.data.table after binary search. Now so specialized that for general use
   // bit::vecseq is recommended (Jens has coded it in C now).
 
-  if (!isInteger(x)) error(_("x must be an integer vector"));
-  if (!isInteger(len)) error(_("len must be an integer vector"));
-  if (LENGTH(x) != LENGTH(len)) error(_("x and len must be the same length"));
+  if (!isInteger(x))
+    error(_("x must be an integer vector")); // # nocov
+  if (!isInteger(len))
+    error(_("len must be an integer vector")); // # nocov
+  if (LENGTH(x) != LENGTH(len))
+    error(_("x and len must be the same length")); // # nocov
   const int *ix = INTEGER(x);
   const int *ilen = INTEGER(len), nlen=LENGTH(len);
   int reslen = 0;
@@ -22,10 +25,13 @@ SEXP vecseq(SEXP x, SEXP len, SEXP clamp)
     reslen += ilen[i];
   }
   if (!isNull(clamp)) {
-    if (!isNumeric(clamp) || LENGTH(clamp)!=1) error(_("clamp must be a double vector length 1"));
+    if (!isNumeric(clamp) || LENGTH(clamp)!=1)
+      error(_("clamp must be a double vector length 1")); // # nocov
     double limit = REAL(clamp)[0];
-    if (limit<0) error(_("clamp must be positive"));
-    if (reslen>limit) error(_("Join results in %d rows; more than %d = nrow(x)+nrow(i). Check for duplicate key values in i each of which join to the same group in x over and over again. If that's ok, try by=.EACHI to run j for each group to avoid the large allocation. If you are sure you wish to proceed, rerun with allow.cartesian=TRUE. Otherwise, please search for this error message in the FAQ, Wiki, Stack Overflow and data.table issue tracker for advice."), reslen, (int)limit);
+    if (limit<0)
+      error(_("clamp must be positive")); // # nocov
+    if (reslen>limit)
+      error(_("Join results in %d rows; more than %d = nrow(x)+nrow(i). Check for duplicate key values in i each of which join to the same group in x over and over again. If that's ok, try by=.EACHI to run j for each group to avoid the large allocation. If you are sure you wish to proceed, rerun with allow.cartesian=TRUE. Otherwise, please search for this error message in the FAQ, Wiki, Stack Overflow and data.table issue tracker for advice."), reslen, (int)limit);
   }
   SEXP ans = PROTECT(allocVector(INTSXP, reslen));
   int *ians = INTEGER(ans);

From 0ab0f5ef3ddb3933960311387756ba93895c9873 Mon Sep 17 00:00:00 2001
From: Michael Chirico <chiricom@google.com>
Date: Thu, 29 Aug 2024 16:53:29 +0000
Subject: [PATCH 2/9] cbindlist

add cbind by reference, timing

R prototype of mergelist

wording

use lower overhead funs

stick to int32 for now, correct R_alloc

bmerge C refactor for codecov and one loop for speed

address revealed codecov gaps

refactor vecseq for codecov

seqexp helper, some alloccol export on C

bmerge codecov, types handled in R bmerge already

better comment seqexp

bmerge mult=error #655

multiple new C utils

swap if branches

explain new C utils

comments mostly

reduce conflicts to PR #4386

comment C code

address multiple matches during update-on-join #3747

Revert "address multiple matches during update-on-join #3747"

This reverts commit b64c0c3480fe9415bbda6729c361621e60da6e01.

merge.dt has temporarily mult arg, for testing

minor changes to cbindlist c

dev mergelist, for single pair now

add quiet option to cc()

mergelist tests

add check for names to perhaps.dt

rm mult from merge.dt method

rework, clean, polish multer, fix right and full joins

make full join symmetric

mergepair inner function to loop on

extra check for symmetric

mergelist manual

ensure no df-dt passed where list expected

comments and manual

handle 0 cols tables

more tests

more tests and debugging

move more logic closer to bmerge, simplify mergepair

more tests

revert not used changes

reduce not needed checks, cleanup

copy arg behavior, manual, no tests yet

cbindlist manual, export both

cleanup processing bmerge to dtmatch

test function match order for easier preview

vecseq gets short-circuit

batch test allow browser

big cleanup

remove unneeded stuff, reduce diff

more cleanup, minor manual fixes

add proper test scripts

Merge branch 'master' into cbind-merge-list

comment out not used code for coverage

more tests, some nocopy opts

rename sql test script, should fix codecov

simplify dtmatch inner branch

more precise copy, now copy only T or F

unused arg not yet in api, wording

comments and refer issues

codecov

hasindex coverage

codecov gap

tests for join using key, cols argument

fix missing import forderv

more tests, improve missing on handling

more tests for order of inner and full join for long keys

new allow.cartesian option, #4383, #914

reduce diff, improve codecov

reduce diff, comments

need more DT, not lists, mergelist 3+ tbls

proper escape heavy check

unit tests

more tests, address overalloc failure

mergelist and cbindlist retain index

manual, examples

fix manual

minor clarify in manual

retain keys, right outer join for snowflake schema joins

duplicates in cbindlist

recycling in cbindlist

escape 0 input in copyCols

empty input handling

closing cbindlist

vectorized _on_ and _join.many_ arg

rename dtmatch to dtmerge

vectorized args: how, mult
push down input validation
add support for cross join, semi join, anti join

full join, reduce overhead for mult=error

mult default value dynamic

fix manual

add "see details" to Rd

mention shared on in arg description

amend feedback from Michael

semi and anti joins will not reorder x columns

Merge branch 'master' into cbind-merge-list

spelling, thx to @jan-glx

check all new funs used and add comments

bugfix, sort=T needed for now

Merge branch 'master' into cbind-merge-list

Update NEWS.md

Merge branch 'master' into cbind-merge-list

Merge branch 'master' into cbind-merge-list

NEWS placement

numbering

ascArg->order

Merge remote-tracking branch 'origin/cbind-merge-list' into cbind-merge-list

attempt to restore from master

Update to stopf() error style

Need isFrame for now

More quality checks: any(!x)->!all(x); use vapply_1{b,c,i}

really restore from master

try to PROTECT() before duplicate()

update error message in test

appease the rchk gods

extraneous space

missing ';'

use catf

simplify perhapsDataTableR

move sqlite.Rraw.manual into other.Rraw

simplify for loop

Merge remote-tracking branch 'origin/cbind-merge-list' into cbind-merge-list
---
 R/data.table.R | 2 +-
 src/bmerge.c   | 8 +++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/R/data.table.R b/R/data.table.R
index fe0f42a565..7b48704a1a 100644
--- a/R/data.table.R
+++ b/R/data.table.R
@@ -199,7 +199,7 @@ replace_dot_alias = function(e) {
     }
     return(x)
   }
-  if (!mult %chin% c("first", "last", "all")) stopf("mult argument can only be 'first', 'last' or 'all'")
+  if (!mult %chin% c("first","last","all")) stopf("mult argument can only be 'first', 'last' or 'all'")
   missingroll = missing(roll)
   if (length(roll)!=1L || is.na(roll)) stopf("roll must be a single TRUE, FALSE, positive/negative integer/double including +Inf and -Inf or 'nearest'")
   if (is.character(roll)) {
diff --git a/src/bmerge.c b/src/bmerge.c
index b3e993e19e..f0e55dc02d 100644
--- a/src/bmerge.c
+++ b/src/bmerge.c
@@ -29,7 +29,7 @@ static SEXP nqgrp;
 static int ncol, *o, *xo, *retFirst, *retLength, *retIndex, *allLen1, *allGrp1, *rollends, ilen, anslen;
 static int *op, nqmaxgrp;
 static int ctr, nomatch; // populating matches for non-equi joins
-enum {ALL, FIRST, LAST} mult = ALL;
+enum {ALL, FIRST, LAST, ERR} mult = ALL;
 static double roll, rollabs;
 static Rboolean rollToNearest=FALSE;
 #define XIND(i) (xo ? xo[(i)]-1 : i)
@@ -113,6 +113,8 @@ SEXP bmerge(SEXP idt, SEXP xdt, SEXP icolsArg, SEXP xcolsArg, SEXP xoArg, SEXP r
     mult = FIRST;
   else if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "last"))
     mult = LAST;
+  else if (!strcmp(CHAR(STRING_ELT(multArg, 0)), "error"))
+    mult = ERR;
   else
     internal_error(__func__, "invalid value for 'mult'"); // # nocov
 
@@ -426,7 +428,7 @@ void bmerge_r(int xlowIn, int xuppIn, int ilowIn, int iuppIn, int col, int thisg
         for (int j=ilow+1; j<iupp; j++) {
           const int k = o ? o[j]-1 : j;
           if (retFirst[k] != nomatch) {
-            if (mult == ALL) {
+            if (mult == ALL || mult == ERR) { // len>1 && mult==ERR already checked, no dup matches, continue as mult=ALL
               // for this irow, we've matches on more than one group
               allGrp1[0] = FALSE;
               retFirst[ctr+ilen] = xlow+2;
@@ -448,7 +450,7 @@ void bmerge_r(int xlowIn, int xuppIn, int ilowIn, int iuppIn, int col, int thisg
             }
           } else {
             // none of the groups so far have filled in for this index. So use it!
-            if (mult == ALL) {
+            if (mult == ALL || mult == ERR) {
               retFirst[k] = xlow+2;
               retLength[k] = len;
               retIndex[k] = k+1;

From 7342791b08ae948c4b6d8f42be76a426ce4b214a Mon Sep 17 00:00:00 2001
From: Michael Chirico <chiricom@google.com>
Date: Tue, 1 Oct 2024 09:39:50 -0700
Subject: [PATCH 3/9] restore ws change

---
 R/data.table.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/data.table.R b/R/data.table.R
index 7b48704a1a..fe0f42a565 100644
--- a/R/data.table.R
+++ b/R/data.table.R
@@ -199,7 +199,7 @@ replace_dot_alias = function(e) {
     }
     return(x)
   }
-  if (!mult %chin% c("first","last","all")) stopf("mult argument can only be 'first', 'last' or 'all'")
+  if (!mult %chin% c("first", "last", "all")) stopf("mult argument can only be 'first', 'last' or 'all'")
   missingroll = missing(roll)
   if (length(roll)!=1L || is.na(roll)) stopf("roll must be a single TRUE, FALSE, positive/negative integer/double including +Inf and -Inf or 'nearest'")
   if (is.character(roll)) {

From 3ef33d7a9c1b6358e2b4d0d0fd686308a494840f Mon Sep 17 00:00:00 2001
From: Michael Chirico <chiricom@google.com>
Date: Thu, 29 Aug 2024 16:55:42 +0000
Subject: [PATCH 4/9] cbindlist

add cbind by reference, timing

R prototype of mergelist

wording

use lower overhead funs

stick to int32 for now, correct R_alloc

bmerge C refactor for codecov and one loop for speed

address revealed codecov gaps

refactor vecseq for codecov

seqexp helper, some alloccol export on C

bmerge codecov, types handled in R bmerge already

better comment seqexp

bmerge mult=error #655

multiple new C utils

swap if branches

explain new C utils

comments mostly

reduce conflicts to PR #4386

comment C code

address multiple matches during update-on-join #3747

Revert "address multiple matches during update-on-join #3747"

This reverts commit b64c0c3480fe9415bbda6729c361621e60da6e01.

merge.dt has temporarily mult arg, for testing

minor changes to cbindlist c

dev mergelist, for single pair now

add quiet option to cc()

mergelist tests

add check for names to perhaps.dt

rm mult from merge.dt method

rework, clean, polish multer, fix right and full joins

make full join symmetric

mergepair inner function to loop on

extra check for symmetric

mergelist manual

ensure no df-dt passed where list expected

comments and manual

handle 0 cols tables

more tests

more tests and debugging

move more logic closer to bmerge, simplify mergepair

more tests

revert not used changes

reduce not needed checks, cleanup

copy arg behavior, manual, no tests yet

cbindlist manual, export both

cleanup processing bmerge to dtmatch

test function match order for easier preview

vecseq gets short-circuit

batch test allow browser

big cleanup

remove unneeded stuff, reduce diff

more cleanup, minor manual fixes

add proper test scripts

Merge branch 'master' into cbind-merge-list

comment out not used code for coverage

more tests, some nocopy opts

rename sql test script, should fix codecov

simplify dtmatch inner branch

more precise copy, now copy only T or F

unused arg not yet in api, wording

comments and refer issues

codecov

hasindex coverage

codecov gap

tests for join using key, cols argument

fix missing import forderv

more tests, improve missing on handling

more tests for order of inner and full join for long keys

new allow.cartesian option, #4383, #914

reduce diff, improve codecov

reduce diff, comments

need more DT, not lists, mergelist 3+ tbls

proper escape heavy check

unit tests

more tests, address overalloc failure

mergelist and cbindlist retain index

manual, examples

fix manual

minor clarify in manual

retain keys, right outer join for snowflake schema joins

duplicates in cbindlist

recycling in cbindlist

escape 0 input in copyCols

empty input handling

closing cbindlist

vectorized _on_ and _join.many_ arg

rename dtmatch to dtmerge

vectorized args: how, mult
push down input validation
add support for cross join, semi join, anti join

full join, reduce overhead for mult=error

mult default value dynamic

fix manual

add "see details" to Rd

mention shared on in arg description

amend feedback from Michael

semi and anti joins will not reorder x columns

Merge branch 'master' into cbind-merge-list

spelling, thx to @jan-glx

check all new funs used and add comments

bugfix, sort=T needed for now

Merge branch 'master' into cbind-merge-list

Update NEWS.md

Merge branch 'master' into cbind-merge-list

Merge branch 'master' into cbind-merge-list

NEWS placement

numbering

ascArg->order

Merge remote-tracking branch 'origin/cbind-merge-list' into cbind-merge-list

attempt to restore from master

Update to stopf() error style

Need isFrame for now

More quality checks: any(!x)->!all(x); use vapply_1{b,c,i}

really restore from master

try to PROTECT() before duplicate()

update error message in test

appease the rchk gods

extraneous space

missing ';'

use catf

simplify perhapsDataTableR

move sqlite.Rraw.manual into other.Rraw

simplify for loop

Merge remote-tracking branch 'origin/cbind-merge-list' into cbind-merge-list
---
 src/data.table.h |  6 ++++
 src/utils.c      | 89 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 95 insertions(+)

diff --git a/src/data.table.h b/src/data.table.h
index e597fb0d45..8fbd66d7cc 100644
--- a/src/data.table.h
+++ b/src/data.table.h
@@ -258,6 +258,12 @@ SEXP islockedR(SEXP x);
 bool need2utf8(SEXP x);
 SEXP coerceUtf8IfNeeded(SEXP x);
 SEXP coerceAs(SEXP x, SEXP as, SEXP copyArg);
+int n_rows(SEXP x); // length of x's first element, or 0 when x has no elements
+int n_columns(SEXP x); // LENGTH(x)
+bool isDataTable(SEXP x); // inherits(x, "data.table")
+bool isRectangularList(SEXP x); // list whose elements all share one length; must be named unless empty
+bool perhapsDataTable(SEXP x); // isDataTable(x) || isFrame(x) || isRectangularList(x)
+SEXP perhapsDataTableR(SEXP x); // perhapsDataTable(x) returned as a length-1 R logical
 void internal_error(const char *call_name, const char *format, ...);
 
 // types.c
diff --git a/src/utils.c b/src/utils.c
index b88a07985f..9244028b8a 100644
--- a/src/utils.c
+++ b/src/utils.c
@@ -439,6 +439,95 @@ SEXP startsWithAny(const SEXP x, const SEXP y, SEXP start) {
   return ScalarLogical(false);
 }
 
+// Row count of a list-like x; C equivalent of R's: if (length(x)) length(x[[1L]]) else 0L
+// used in src/mergelist.c and in the commented-out setDT() below (as the argument to set_row_names)
+int n_rows(SEXP x) {
+  if (!LENGTH(x))
+    return 0; // # nocov. Not yet reached from anywhere, cbindlist uses it but escapes for !n_columns(x)
+  return length(VECTOR_ELT(x, 0));
+}
+
+// Column count of a list-like x, i.e. R's length(x)
+// used in src/mergelist.c
+// kept as a function so that C callers get an abstraction layer over LENGTH()
+int n_columns(SEXP x) {
+  return LENGTH(x);
+}
+
+/*
+ The commented-out functions below will be re-enabled when addressing #4439
+ // c("data.table","data.frame")
+ static SEXP char2_dtdf() {
+ SEXP char2_dtdf = PROTECT(allocVector(STRSXP, 2));
+ SET_STRING_ELT(char2_dtdf, 0, char_datatable);
+ SET_STRING_ELT(char2_dtdf, 1, char_dataframe);
+ UNPROTECT(1);
+ return char2_dtdf;
+ }
+ 
+ // .set_row_names(x)
+ static SEXP set_row_names(int n) {
+ SEXP ans = R_NilValue;
+ if (n) {
+ ans = PROTECT(allocVector(INTSXP, 2));
+ INTEGER(ans)[0] = NA_INTEGER;
+ INTEGER(ans)[1] = -n;
+ } else {
+ ans = PROTECT(allocVector(INTSXP, 0));
+ }
+ UNPROTECT(1);
+ return ans;
+ }
+ 
+ // setDT(x) ## not in-place!
+ SEXP setDT(SEXP x) {
+ if (!isNewList(x))
+ error("internal error: C setDT should be called only on a list"); // # nocov
+ setAttrib(x, R_ClassSymbol, char2_dtdf());
+ setAttrib(x, sym_rownames, set_row_names(n_rows(x)));
+ return alloccolwrapper(x, GetOption(sym_alloccol, R_NilValue), GetOption(sym_verbose, R_NilValue));
+ }*/
+
+// true when x carries the data.table class, i.e. R's inherits(x, "data.table")
+bool isDataTable(SEXP x) {
+  return INHERITS(x, char_datatable);
+}
+
+// true when every element of list x has the same xlength(); NB does not allow length-1 recycling
+// R equivalent: length(x) <= 1L || length(unique(lengths(x))) == 1L
+static inline bool isRectangular(SEXP x) {
+  int n = LENGTH(x);
+  if (n < 2) // zero or one element is trivially rectangular
+    return true;
+  R_xlen_t nr = xlength(VECTOR_ELT(x, 0)); // reference length taken from the first element
+  for (int i=1; i<n; ++i) {
+    if (xlength(VECTOR_ELT(x, i)) != nr) // first mismatch is enough to reject
+      return false;
+  }
+  return true;
+}
+
+// setDT()-friendly rectangular list, i.e. a list() with all entries of equal length()
+//   that is also named; an empty list() is accepted without looking at its names
+bool isRectangularList(SEXP x) {
+  if (!isNewList(x))
+    return false;
+  if (!LENGTH(x)) // empty list: accepted before the names check below
+    return true;
+  if (isNull(getAttrib(x, R_NamesSymbol))) // non-empty list without names is rejected
+    return false;
+  return isRectangular(x);
+}
+
+// TODO: use isDataFrame (when included in any R release).
+// true when any of these holds: isDataTable(x), isFrame(x), isRectangularList(x)
+bool perhapsDataTable(SEXP x) {
+  return isDataTable(x) || isFrame(x) || isRectangularList(x);
+}
+SEXP perhapsDataTableR(SEXP x) { // SEXP-returning wrapper around perhapsDataTable()
+  return ScalarLogical(perhapsDataTable(x)); // bool result boxed as a length-1 R logical
+}
+
 void internal_error(const char *call_name, const char *format, ...) {
   char buff[1024];
   va_list args;

From 4664e847bde3f1fa33283b16fb26566192534d34 Mon Sep 17 00:00:00 2001
From: Michael Chirico <chiricom@google.com>
Date: Thu, 29 Aug 2024 17:17:11 +0000
Subject: [PATCH 5/9] cbindlist

add cbind by reference, timing

R prototype of mergelist

wording

use lower overhead funs

stick to int32 for now, correct R_alloc

bmerge C refactor for codecov and one loop for speed

address revealed codecov gaps

refactor vecseq for codecov

seqexp helper, some alloccol export on C

bmerge codecov, types handled in R bmerge already

better comment seqexp

bmerge mult=error #655

multiple new C utils

swap if branches

explain new C utils

comments mostly

reduce conflicts to PR #4386

comment C code

address multiple matches during update-on-join #3747

Revert "address multiple matches during update-on-join #3747"

This reverts commit b64c0c3480fe9415bbda6729c361621e60da6e01.

merge.dt has temporarily mult arg, for testing

minor changes to cbindlist c

dev mergelist, for single pair now

add quiet option to cc()

mergelist tests

add check for names to perhaps.dt

rm mult from merge.dt method

rework, clean, polish multer, fix right and full joins

make full join symmetric

mergepair inner function to loop on

extra check for symmetric

mergelist manual

ensure no df-dt passed where list expected

comments and manual

handle 0 cols tables

more tests

more tests and debugging

move more logic closer to bmerge, simplify mergepair

more tests

revert not used changes

reduce not needed checks, cleanup

copy arg behavior, manual, no tests yet

cbindlist manual, export both

cleanup processing bmerge to dtmatch

test function match order for easier preview

vecseq gets short-circuit

batch test allow browser

big cleanup

remove unneeded stuff, reduce diff

more cleanup, minor manual fixes

add proper test scripts

Merge branch 'master' into cbind-merge-list

comment out not used code for coverage

more tests, some nocopy opts

rename sql test script, should fix codecov

simplify dtmatch inner branch

more precise copy, now copy only T or F

unused arg not yet in api, wording

comments and refer issues

codecov

hasindex coverage

codecov gap

tests for join using key, cols argument

fix missing import forderv

more tests, improve missing on handling

more tests for order of inner and full join for long keys

new allow.cartesian option, #4383, #914

reduce diff, improve codecov

reduce diff, comments

need more DT, not lists, mergelist 3+ tbls

proper escape heavy check

unit tests

more tests, address overalloc failure

mergelist and cbindlist retain index

manual, examples

fix manual

minor clarify in manual

retain keys, right outer join for snowflake schema joins

duplicates in cbindlist

recycling in cbindlist

escape 0 input in copyCols

empty input handling

closing cbindlist

vectorized _on_ and _join.many_ arg

rename dtmatch to dtmerge

vectorized args: how, mult
push down input validation
add support for cross join, semi join, anti join

full join, reduce overhead for mult=error

mult default value dynamic

fix manual

add "see details" to Rd

mention shared on in arg description

amend feedback from Michael

semi and anti joins will not reorder x columns

Merge branch 'master' into cbind-merge-list

spelling, thx to @jan-glx

check all new funs used and add comments

bugfix, sort=T needed for now

Merge branch 'master' into cbind-merge-list

Update NEWS.md

Merge branch 'master' into cbind-merge-list

Merge branch 'master' into cbind-merge-list

NEWS placement

numbering

ascArg->order

Merge remote-tracking branch 'origin/cbind-merge-list' into cbind-merge-list

attempt to restore from master

Update to stopf() error style

Need isFrame for now

More quality checks: any(!x)->!all(x); use vapply_1{b,c,i}

really restore from master

try to PROTECT() before duplicate()

update error message in test

appease the rchk gods

extraneous space

missing ';'

use catf

simplify perhapsDataTableR

move sqlite.Rraw.manual into other.Rraw

simplify for loop

Merge remote-tracking branch 'origin/cbind-merge-list' into cbind-merge-list
---
 NAMESPACE                 |  1 +
 R/mergelist.R             |  9 +++++
 inst/tests/mergelist.Rraw | 72 ++++++++++++++++++++++++++++++++++
 man/cbindlist.Rd          | 36 +++++++++++++++++
 src/data.table.h          |  3 ++
 src/init.c                |  1 +
 src/mergelist.c           | 81 +++++++++++++++++++++++++++++++++++++++
 tests/mergelist.R         |  2 +
 8 files changed, 205 insertions(+)
 create mode 100644 R/mergelist.R
 create mode 100644 inst/tests/mergelist.Rraw
 create mode 100644 man/cbindlist.Rd
 create mode 100644 src/mergelist.c
 create mode 100644 tests/mergelist.R

diff --git a/NAMESPACE b/NAMESPACE
index 0e0c733ce2..dba9582625 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -59,6 +59,7 @@ export(nafill)
 export(setnafill)
 export(.Last.updated)
 export(fcoalesce)
+export(cbindlist)
 export(substitute2)
 #export(DT)  # mtcars |> DT(i,j,by)  #4872 #5472
 
diff --git a/R/mergelist.R b/R/mergelist.R
new file mode 100644
index 0000000000..9606ce0abb
--- /dev/null
+++ b/R/mergelist.R
@@ -0,0 +1,9 @@
+cbindlist = function(l, copy=TRUE) { ## column bind the tables in list 'l'; 'copy' is passed to C as is
+  ans = .Call(Ccbindlist, l, copy) ## a named list for now, it becomes a data.table in setDT below
+  if (anyDuplicated(names(ans))) { ## invalidate key and index when output has duplicate column names
+    setattr(ans, "sorted", NULL)
+    setattr(ans, "index", integer())
+  }
+  setDT(ans)
+  ans
+}
diff --git a/inst/tests/mergelist.Rraw b/inst/tests/mergelist.Rraw
new file mode 100644
index 0000000000..9e6835cb71
--- /dev/null
+++ b/inst/tests/mergelist.Rraw
@@ -0,0 +1,72 @@
+require(methods)
+
+if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
+  if ((tt<-compiler::enableJIT(-1))>0)
+    cat("This is dev mode and JIT is enabled (level ", tt, ") so there will be a brief pause around the first test.\n", sep="")
+} else {
+  require(data.table)
+  test = data.table:::test
+}
+
+addresses = function(x) vapply(x, address, "")
+
+# cbindlist
+
+l = list(
+  d1 = data.table(x=1:3, v1=1L),
+  d2 = data.table(y=3:1, v2=2L),
+  d3 = data.table(z=2:4, v3=3L)
+)
+ans = cbindlist(l)
+expected = data.table(l$d1, l$d2, l$d3)
+test(11.01, ans, expected)
+test(11.02, intersect(addresses(ans), addresses(expected)), character())
+ans = cbindlist(l, copy=FALSE)
+expected = setDT(c(l$d1, l$d2, l$d3))
+test(11.03, ans, expected)
+test(11.04, length(intersect(addresses(ans), addresses(expected))), ncol(expected))
+test(11.05, cbindlist(list(data.table(a=1L), data.table(), data.table(d=2L), data.table(f=3L))), data.table(a=1L,d=2L,f=3L))
+rm(expected)
+## codecov
+test(12.01, cbindlist(data.frame(a=1L), data.frame(b=1L)), error="must be a list")
+test(12.02, cbindlist(TRUE, FALSE), error="must be a list")
+test(12.03, cbindlist(list(), NA), error="must be TRUE or FALSE")
+test(12.04, cbindlist(list(data.table(a=1L), 1L)), error="is not a data.table")
+test(12.05, options = c(datatable.verbose=TRUE), cbindlist(list(data.table(a=1:2), data.table(b=1:2))), data.table(a=1:2, b=1:2), output="cbindlist.*took")
+test(12.06, cbindlist(list(data.table(), data.table(a=1:2), data.table(b=1:2))), data.table(a=1:2, b=1:2))
+test(12.07, cbindlist(list(data.table(), data.table(a=1:2), list(b=1:2))), data.table(a=1:2, b=1:2))
+test(12.08, cbindlist(list(data.table(a=integer()), list(b=integer()))), data.table(a=integer(), b=integer()))
+## duplicated names
+test(12.09, cbindlist(list(data.table(a=1L, b=2L), data.table(b=3L, d=4L))), data.table(a=1L, b=2L, b=3L, d=4L))
+ans = cbindlist(list(setindexv(data.table(a=2:1, b=1:2),"a"), data.table(a=1:2, b=2:1, key="a"), data.table(a=2:1, b=1:2)))
+test(12.10, ans, data.table(a=2:1, b=1:2, a=1:2, b=2:1, a=2:1, b=1:2))
+test(12.11, indices(ans), NULL)
+## recycling, first ensure cbind recycling that we want to match to
+test(12.12, cbind(data.table(x=integer()), data.table(a=1:2)), data.table(x=c(NA_integer_,NA), a=1:2))
+test(12.13, cbind(data.table(x=1L), data.table(a=1:2)), data.table(x=c(1L,1L), a=1:2))
+test(12.14, cbindlist(list(data.table(a=integer()), data.table(b=1:2))), error="Recycling rows is not yet implemented")
+test(12.15, cbindlist(list(data.table(a=1L), data.table(b=1:2))), error="Recycling rows is not yet implemented")
+test(12.16, cbindlist(list(data.table(a=integer()), data.table(b=1:2)), copy=FALSE), error="have to have the same number of rows")
+test(12.17, cbindlist(list(data.table(a=1L), data.table(b=1:2)), copy=FALSE), error="have to have the same number of rows")
+
+## retain indices
+d = data.table(x=1:2, y=2:1, z=2:1, v1=1:2) ## ensure setDT will retain key and indices when it is called on the list, bc Ccbindlist returns list
+setkeyv(d, "x"); setindexv(d, list("y", "z"))
+a = attributes(d)
+attributes(d) = a[!names(a) %in% c("class",".internal.selfref","row.names")]
+test(13.01, class(d), "list")
+setDT(d)
+test(13.02, key(d), "x")
+# test(13.03, hasindex(d, "y") && hasindex(d, "z"))
+l = list(
+  data.table(id1=1:5, id2=5:1, id3=1:5, v1=1:5),
+  data.table(id4=5:1, id5=1:5, v2=1:5),
+  data.table(id6=5:1, id7=1:5, v3=1:5),
+  data.table(id8=5:1, id9=5:1, v4=1:5)
+)
+setkeyv(l[[1L]], "id1"); setindexv(l[[1L]], list("id1", "id2", "id3", c("id1","id2","id3"))); setindexv(l[[3L]], list("id6", "id7")); setindexv(l[[4L]], "id9")
+ii = lapply(l, indices)
+ans = cbindlist(l)
+test(13.04, key(ans), "id1")
+test(13.05, indices(ans), c("id1","id2","id3","id1__id2__id3","id6","id7","id9"))
+test(13.06, ii, lapply(l, indices)) ## this tests that original indices have not been touched, shallow_duplicate in mergeIndexAttrib
diff --git a/man/cbindlist.Rd b/man/cbindlist.Rd
new file mode 100644
index 0000000000..5a780e99ad
--- /dev/null
+++ b/man/cbindlist.Rd
@@ -0,0 +1,36 @@
+\name{cbindlist}
+\alias{cbindlist}
+\alias{cbind}
+\alias{cbind.data.table}
+\title{Column bind multiple data.tables}
+\description{
+  Column bind multiple \code{data.table}s.
+}
+\usage{
+  cbindlist(l, copy=TRUE)
+}
+\arguments{
+  \item{l}{ \code{list} of \code{data.table}s to column bind. }
+  \item{copy}{ \code{logical}, decides if columns have to be copied into the resulting object (default) or just referenced. }
+}
+\details{
+  Column bind only stacks input elements. Works like \code{\link{data.table}}, but takes \code{list} type on input. Zero-column tables in \code{l} are omitted. Tables in \code{l} should have matching row count; recycling of length-1 rows is not yet implemented. Indices of the input tables are transferred to the resulting table, as well as the \emph{key} of the first keyed table.
+}
+\value{
+  A new \code{data.table} based on the stacked objects. When \code{copy} is \code{FALSE}, the resulting object shares its columns with the tables in \code{l}.
+}
+\note{
+  If output object has any duplicate names, then key and indices are removed.
+}
+\seealso{
+  \code{\link{data.table}}, \code{\link{rbindlist}}
+}
+\examples{
+l = list(
+  d1 = data.table(x=1:3, v1=1L),
+  d2 = data.table(y=3:1, v2=2L),
+  d3 = data.table(z=2:4, v3=3L)
+)
+cbindlist(l)
+}
+\keyword{ data }
diff --git a/src/data.table.h b/src/data.table.h
index 8fbd66d7cc..f4d22b95ab 100644
--- a/src/data.table.h
+++ b/src/data.table.h
@@ -284,6 +284,9 @@ SEXP substitute_call_arg_namesR(SEXP expr, SEXP env);
 //negate.c
 SEXP notchin(SEXP x, SEXP table);
 
+// mergelist.c
+SEXP cbindlist(SEXP x, SEXP copyArg);
+
 // functions called from R level .Call/.External and registered in init.c
 // these now live here to pass -Wstrict-prototypes, #5477
 // all arguments must be SEXP since they are called from R level
diff --git a/src/init.c b/src/init.c
index 0f1a76c3db..7189bb9da6 100644
--- a/src/init.c
+++ b/src/init.c
@@ -149,6 +149,7 @@ R_CallMethodDef callMethods[] = {
 {"CstartsWithAny", (DL_FUNC)&startsWithAny, -1},
 {"CconvertDate", (DL_FUNC)&convertDate, -1},
 {"Cnotchin", (DL_FUNC)&notchin, -1},
+{"Ccbindlist", (DL_FUNC) &cbindlist, -1},
 {"Cwarn_matrix_column_r", (DL_FUNC)&warn_matrix_column_r, -1},
 {NULL, NULL, 0}
 };
diff --git a/src/mergelist.c b/src/mergelist.c
new file mode 100644
index 0000000000..77c4287736
--- /dev/null
+++ b/src/mergelist.c
@@ -0,0 +1,81 @@
+#include "data.table.h"
+
+// Append the attributes of 'from' to those of 'to', which must be a zero-length integer;
+// a shallow_duplicate() of the attribute pairlist is spliced in, not the original cells.
+void mergeIndexAttrib(SEXP to, SEXP from) {
+  if (!isInteger(to) || LENGTH(to)!=0)
+    internal_error(__func__, "'to' must be integer() already"); // # nocov
+  if (isNull(from) || isNull(ATTRIB(from)))
+    return; // nothing to carry over
+  SEXP tail = ATTRIB(to);
+  if (isNull(tail)) {
+    SET_ATTRIB(to, shallow_duplicate(ATTRIB(from)));
+    return;
+  }
+  while (!isNull(CDR(tail)))
+    tail = CDR(tail); // walk to the last attribute cell of 'to'
+  SETCDR(tail, shallow_duplicate(ATTRIB(from)));
+}
+
+// Column bind a list of data.table-like objects into one named list.
+//   x:       list whose elements each satisfy perhapsDataTable(); zero-column elements are skipped
+//   copyArg: TRUE to duplicate() every column, FALSE to share the columns of the input tables
+// Returns a named VECSXP carrying the merged "index" attributes and the first non-NULL "sorted"
+//   attribute; errors when row counts differ, as recycling is not implemented.
+SEXP cbindlist(SEXP x, SEXP copyArg) {
+  if (!isNewList(x) || isFrame(x))
+    error(_("'%s' must be a list"), "l"); // 'l' is the name used by the other messages below
+  if (!IS_TRUE_OR_FALSE(copyArg))
+    error(_("'%s' must be TRUE or FALSE"), "copy");
+  bool copy = (bool)LOGICAL(copyArg)[0];
+  const bool verbose = GetVerbose();
+  const double tic = verbose ? omp_get_wtime() : 0;
+  int nx = length(x), nans = 0, nr = -1, *nnx = (int*)R_alloc(nx, sizeof(int));
+  bool recycle = false;
+  for (int i=0; i<nx; ++i) { // first pass: validate inputs, count columns, compare row counts
+    SEXP thisx = VECTOR_ELT(x, i);
+    if (!perhapsDataTable(thisx))
+      error(_("Element %d of 'l' list is not a data.table."), i+1);
+    nnx[i] = n_columns(thisx);
+    if (!nnx[i])
+      continue;
+    int thisnr = n_rows(thisx);
+    if (nr < 0) // first (non-zero length table) iteration
+      nr = thisnr;
+    else if (nr != thisnr) {
+      if (!copy)
+        error(_("For copy=FALSE all non-empty tables in 'l' have to have the same number of rows, but l[[%d]] has %d rows which differs from the previous non-zero number of rows (%d)."), i+1, thisnr, nr);
+      recycle = true;
+    }
+    nans += nnx[i];
+  }
+  if (recycle)
+    error(_("Recycling rows is not yet implemented.")); // TODO: check whether an existing recycling routine can be reused
+  SEXP ans = PROTECT(allocVector(VECSXP, nans));
+  SEXP index = PROTECT(allocVector(INTSXP, 0));
+  SEXP key = R_NilValue;
+  setAttrib(ans, sym_index, index);
+  SEXP names = PROTECT(allocVector(STRSXP, nans));
+  for (int i=0, ians=0; i<nx; ++i) { // second pass: fill columns and names, collect indices and key
+    SEXP thisx = VECTOR_ELT(x, i);
+    SEXP thisnames = PROTECT(getAttrib(thisx, R_NamesSymbol));
+    for (int j=0; j<nnx[i]; ++j, ++ians) {
+      // protect only until stored in ans, so the protect stack does not grow with the number of columns
+      SEXP thisxcol = PROTECT(copy ? duplicate(VECTOR_ELT(thisx, j)) : VECTOR_ELT(thisx, j));
+      SET_VECTOR_ELT(ans, ians, thisxcol);
+      UNPROTECT(1); // thisxcol
+      SET_STRING_ELT(names, ians, STRING_ELT(thisnames, j));
+    }
+    mergeIndexAttrib(index, getAttrib(thisx, sym_index));
+    if (isNull(key)) // first key is retained
+      key = getAttrib(thisx, sym_sorted);
+    UNPROTECT(1); // thisnames
+  }
+  setAttrib(ans, R_NamesSymbol, names);
+  setAttrib(ans, sym_sorted, key);
+  if (verbose)
+    Rprintf(_("cbindlist: took %.3fs\n"), omp_get_wtime()-tic);
+  UNPROTECT(3);
+  return ans;
+}
diff --git a/tests/mergelist.R b/tests/mergelist.R
new file mode 100644
index 0000000000..4884087c3d
--- /dev/null
+++ b/tests/mergelist.R
@@ -0,0 +1,2 @@
+require(data.table)
+test.data.table(script="mergelist.Rraw")

From 63769782566510f255559e7e209049c36431d1a2 Mon Sep 17 00:00:00 2001
From: Michael Chirico <chiricom@google.com>
Date: Thu, 29 Aug 2024 17:22:18 +0000
Subject: [PATCH 6/9] cbindlist

add cbind by reference, timing

R prototype of mergelist

wording

use lower overhead funs

stick to int32 for now, correct R_alloc

bmerge C refactor for codecov and one loop for speed

address revealed codecov gaps

refactor vecseq for codecov

seqexp helper, some alloccol export on C

bmerge codecov, types handled in R bmerge already

better comment seqexp

bmerge mult=error #655

multiple new C utils

swap if branches

explain new C utils

comments mostly

reduce conflicts to PR #4386

comment C code

address multiple matches during update-on-join #3747

Revert "address multiple matches during update-on-join #3747"

This reverts commit b64c0c3480fe9415bbda6729c361621e60da6e01.

merge.dt has temporarily mult arg, for testing

minor changes to cbindlist c

dev mergelist, for single pair now

add quiet option to cc()

mergelist tests

add check for names to perhaps.dt

rm mult from merge.dt method

rework, clean, polish multer, fix right and full joins

make full join symmetric

mergepair inner function to loop on

extra check for symmetric

mergelist manual

ensure no df-dt passed where list expected

comments and manual

handle 0 cols tables

more tests

more tests and debugging

move more logic closer to bmerge, simplify mergepair

more tests

revert not used changes

reduce not needed checks, cleanup

copy arg behavior, manual, no tests yet

cbindlist manual, export both

cleanup processing bmerge to dtmatch

test function match order for easier preview

vecseq gets short-circuit

batch test allow browser

big cleanup

remove unneeded stuff, reduce diff

more cleanup, minor manual fixes

add proper test scripts

Merge branch 'master' into cbind-merge-list

comment out not used code for coverage

more tests, some nocopy opts

rename sql test script, should fix codecov

simplify dtmatch inner branch

more precise copy, now copy only T or F

unused arg not yet in api, wording

comments and refer issues

codecov

hasindex coverage

codecov gap

tests for join using key, cols argument

fix missing import forderv

more tests, improve missing on handling

more tests for order of inner and full join for long keys

new allow.cartesian option, #4383, #914

reduce diff, improve codecov

reduce diff, comments

need more DT, not lists, mergelist 3+ tbls

proper escape heavy check

unit tests

more tests, address overalloc failure

mergelist and cbindlist retain index

manual, examples

fix manual

minor clarify in manual

retain keys, right outer join for snowflake schema joins

duplicates in cbindlist

recycling in cbindlist

escape 0 input in copyCols

empty input handling

closing cbindlist

vectorized _on_ and _join.many_ arg

rename dtmatch to dtmerge

vectorized args: how, mult
push down input validation
add support for cross join, semi join, anti join

full join, reduce overhead for mult=error

mult default value dynamic

fix manual

add "see details" to Rd

mention shared on in arg description

amend feedback from Michael

semi and anti joins will not reorder x columns

Merge branch 'master' into cbind-merge-list

spelling, thx to @jan-glx

check all new funs used and add comments

bugfix, sort=T needed for now

Merge branch 'master' into cbind-merge-list

Update NEWS.md

Merge branch 'master' into cbind-merge-list

Merge branch 'master' into cbind-merge-list

NEWS placement

numbering

ascArg->order

Merge remote-tracking branch 'origin/cbind-merge-list' into cbind-merge-list

attempt to restore from master

Update to stopf() error style

Need isFrame for now

More quality checks: any(!x)->!all(x); use vapply_1{b,c,i}

really restore from master

try to PROTECT() before duplicate()

update error message in test

appease the rchk gods

extraneous space

missing ';'

use catf

simplify perhapsDataTableR

move sqlite.Rraw.manual into other.Rraw

simplify for loop

Merge remote-tracking branch 'origin/cbind-merge-list' into cbind-merge-list
---
 R/data.table.R            |  23 ++++---
 R/mergelist.R             | 123 ++++++++++++++++++++++++++++++++++++++
 inst/tests/mergelist.Rraw |  74 ++++++++++++++++++++++-
 src/init.c                |   1 +
 4 files changed, 213 insertions(+), 8 deletions(-)

diff --git a/R/data.table.R b/R/data.table.R
index fe0f42a565..0ad21cf9fa 100644
--- a/R/data.table.R
+++ b/R/data.table.R
@@ -519,13 +519,22 @@ replace_dot_alias = function(e) {
         if (!byjoin || nqbyjoin) {
           # Really, `anyDuplicated` in base is AWESOME!
           # allow.cartesian shouldn't error if a) not-join, b) 'i' has no duplicates
-          if (verbose) {last.started.at=proc.time();catf("Constructing irows for '!byjoin || nqbyjoin' ... ");flush.console()}
-          irows = if (allLen1) f__ else vecseq(f__,len__,
-            if (allow.cartesian ||
-                notjoin || # #698. When notjoin=TRUE, ignore allow.cartesian. Rows in answer will never be > nrow(x).
-                !anyDuplicated(f__, incomparables = c(0L, NA_integer_))) {
-              NULL # #742. If 'i' has no duplicates, ignore
-            } else as.double(nrow(x)+nrow(i))) # rows in i might not match to x so old max(nrow(x),nrow(i)) wasn't enough. But this limit now only applies when there are duplicates present so the reason now for nrow(x)+nrow(i) is just to nail it down and be bigger than max(nrow(x),nrow(i)).
+          if (verbose) {last.started.at=proc.time();cat("Constructing irows for '!byjoin || nqbyjoin' ... ");flush.console()}
+          irows = if (allLen1) f__ else {
+            join.many = getOption("datatable.join.many") # #914, default TRUE for backward compatibility
+            anyDups = if (!join.many && length(f__)==1L && len__==nrow(x)) {
+              NULL # special case of scalar i match to const duplicated x, not handled by anyDuplicate: data.table(x=c(1L,1L))[data.table(x=1L), on="x"]
+            } else if (!notjoin && ( # #698. When notjoin=TRUE, ignore allow.cartesian. Rows in answer will never be > nrow(x).
+              !allow.cartesian ||
+              !join.many))
+              as.logical(anyDuplicated(f__, incomparables = c(0L, NA_integer_)))
+            limit = if (!is.null(anyDups) && anyDups) { # #742. If 'i' has no duplicates, ignore
+              if (!join.many) stopf("Joining resulted in many-to-many join. Perform quality check on your data, use mult!='all', or set 'datatable.join.many' option to TRUE to allow rows explosion.")
+              else if (!allow.cartesian && !notjoin) as.double(nrow(x)+nrow(i))
+              else internal_error("checking allow.cartesian and join.many, unexpected else branch reached") # nocov
+            }
+            vecseq(f__, len__, limit)
+          } # rows in i might not match to x so old max(nrow(x),nrow(i)) wasn't enough. But this limit now only applies when there are duplicates present so the reason now for nrow(x)+nrow(i) is just to nail it down and be bigger than max(nrow(x),nrow(i)).
           if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()}
           # Fix for #1092 and #1074
           # TODO: implement better version of "any"/"all"/"which" to avoid
diff --git a/R/mergelist.R b/R/mergelist.R
index 9606ce0abb..435ee0a601 100644
--- a/R/mergelist.R
+++ b/R/mergelist.R
@@ -7,3 +7,126 @@ cbindlist = function(l, copy=TRUE) {
   setDT(ans)
   ans
 }
+
+# default 'on' derived from keys 'x' and 'y' when 'on' is missing, used only for inner and full join
+onkeys = function(x, y) {
+  has.x = !is.null(x); has.y = !is.null(y)
+  if (has.x && has.y) {
+    ## columns common to both keys, order aligned to the shorter key, ties go to rhs key 'y'
+    if (length(y) <= length(x)) intersect(y, x) else intersect(x, y)
+  } else if (has.y) y else if (has.x) x
+  else NULL # nocov ## internal error is being called later in mergepair
+}
+someCols = function(x, cols, drop=character(), keep=character(), retain.order=FALSE) {
+  ## each selector is resolved via colnamesInt(); 'keep' entries stay even when listed in 'drop'
+  keep.i = colnamesInt(x, keep); drop.i = colnamesInt(x, drop); cols.i = colnamesInt(x, cols)
+  picked = union(keep.i, setdiff(cols.i, drop.i))
+  if (retain.order) ## report in the order given by colnamesInt(x, NULL) rather than 'keep' first
+    picked = intersect(colnamesInt(x, NULL), picked)
+  picked
+}
+## is there an index on columns 'by' stored in 'x'? for retGrp=TRUE it must also have "starts" attribute
+hasindex = function(x, by, retGrp=FALSE) {
+  all.idx = attr(x, "index", exact=TRUE)
+  if (is.null(all.idx)) return(FALSE)
+  this.idx = attr(all.idx, paste0("__", by, collapse=""), exact=TRUE)
+  if (is.null(this.idx)) FALSE
+  else if (retGrp) !is.null(attr(this.idx, "starts", exact=TRUE))
+  else TRUE
+}
+
+# fdistinct applies mult='first|last': keeps one row, the first or the last, for each group of 'on' columns
+# for mult='first' it is unique(x, by=on)[, c(on, cols), with=FALSE]
+# it may not copy when copy=FALSE and x is unique by 'on', then a shallow copy of the selected columns is returned
+fdistinct = function(x, on=key(x), mult=c("first","last"), cols=seq_along(x), copy=TRUE) {
+  if (!perhaps.data.table(x))
+    stopf("'x' must be data.table")
+  if (!is.character(on) || !length(on) || anyNA(on) || !all(on %chin% names(x)))
+    stopf("'on' must be character column names of 'x' argument")
+  mult = match.arg(mult)
+  if (is.null(cols)) ## NULL is accepted as "all columns"
+    cols = seq_along(x)
+  else if (!(is.character(cols) || is.integer(cols)) || !length(cols) || anyNA(cols))
+    stopf("'cols' must be non-zero length, non-NA, integer or character columns of 'x' argument")
+  if (!isTRUEorFALSE(copy))
+    stopf("'%s' must be TRUE or FALSE", "copy")
+  ## do not compute sort=F for mult="first" if index (sort=T) already available, sort=T is needed only for mult="last"
+  ## this short circuit will work after #4386 because it requires retGrp=T
+  #### sort = mult!="first" || hasindex(x, by=on, retGrp=TRUE)
+  sort = TRUE ## above line does not work for the moment, test 302.02
+  o = forderv(x, by=on, sort=sort, retGrp=TRUE)
+  if (attr(o, "maxgrpn", TRUE) <= 1L) { ## short-circuit, no rows to remove; 'on' columns are always part of the answer
+    ans = .shallow(x, someCols(x, cols, keep=on), retain.key=TRUE)
+    if (copy) ans = copy(ans)
+    return(ans)
+  }
+  f = attr(o, "starts", exact=TRUE) ## presumably first position of each group, in the order of 'o' -- confirm against forderv
+  if (mult=="last") {
+    if (!sort) internal_error("sort must be TRUE when computing mult='last'") # nocov
+    f = c(f[-1L]-1L, nrow(x)) ## last of each group
+  }
+  if (length(o)) f = o[f] ## translate via 'o' only when forderv returned a non-empty order
+  if (sort && length(o <- forderv(f))) f = f[o] ## this rolls back to original order
+  .Call(CsubsetDT, x, f, someCols(x, cols, keep=on))
+}
+
+# extra layer over bmerge to provide ready to use row indices: list(ans=<bmerge result>, irows=<rows of i, or NULL for 1:nrow>, xrows=<rows of x>)
+# NULL to avoid extra copies in downstream code, it turned out that avoiding copies precisely is costly and enormously complicates code, need #4409 and/or handle 1:nrow in subsetDT
+dtmerge = function(x, i, on, how, mult, join.many, void=FALSE, verbose) {
+  nomatch = switch(how, "inner"=, "semi"=, "anti"=, "cross"= 0L, "left"=, "right"=, "full"= NA_integer_)
+  nomatch0 = identical(nomatch, 0L)
+  if (is.null(mult)) ## default 'mult' depends on the join type
+    mult = switch(how, "semi"=, "anti"= "last", "cross"= "all", "inner"=, "left"=, "right"=, "full"= "error")
+  if (void && mult!="error")
+    internal_error("void must be used with mult='error'") # nocov
+  if (how=="cross") { ## short-circuit bmerge results only for cross join
+    if (length(on) || mult!="all" || !join.many)
+      stopf("cross join must be used with zero-length on, mult='all', join.many=TRUE")
+    if (void)
+      internal_error("cross join must be used with void=FALSE") # nocov
+    ans = list(allLen1=FALSE, starts=rep.int(1L, nrow(i)), lens=rep.int(nrow(x), nrow(i)), xo=integer()) ## hand-built stand-in for the bmerge result
+  } else {
+    if (!length(on))
+      stopf("'on' must be non-zero length character vector")
+    if (mult=="all" && (how=="semi" || how=="anti"))
+      stopf("semi and anti joins must be used with mult!='all'")
+    icols = colnamesInt(i, on, check_dups=TRUE)
+    xcols = colnamesInt(x, on, check_dups=TRUE)
+    ans = bmerge(i, x, icols, xcols, roll=0, rollends=c(FALSE, TRUE), nomatch=nomatch, mult=mult, ops=rep.int(1L, length(on)), verbose=verbose)
+    if (void) { ## void=T is only for the case when we want raise error for mult='error', and that would happen in above line
+      return(invisible(NULL))
+    } else if (how=="semi" || how=="anti") { ## semi and anti short-circuit, note that no xrows element is returned here
+      irows = which(if (how=="semi") ans$lens!=0L else ans$lens==0L) ## we will subset i rather than x, thus assign to irows, not to xrows
+      if (length(irows)==length(ans$lens)) irows = NULL
+      return(list(ans=ans, irows=irows))
+    } else if (mult=="all" && !ans$allLen1 && !join.many && ## join.many, like allow.cartesian, check
+      !(length(ans$starts)==1L && ans$lens==nrow(x)) && ## special case of scalar i match to const duplicated x, not handled by anyDuplicate: data.table(x=c(1L,1L))[data.table(x=1L), on="x"]
+      anyDuplicated(ans$starts, incomparables=c(0L,NA_integer_))
+    )
+      stopf("Joining resulted in many-to-many join. Perform quality check on your data, use mult!='all', or set 'datatable.join.many' option to TRUE to allow rows explosion.")
+  }
+
+  ## xrows, join-to
+  xrows = if (ans$allLen1) ans$starts else vecseq(ans$starts, ans$lens, NULL)
+  if (nomatch0 && ans$allLen1) xrows = xrows[as.logical(ans$lens)] ## nomatch=0L: drop entries having zero matches
+  len.x = length(xrows) ## as of now cannot optimize to NULL, search for #4409 here
+
+  ## irows, join-from
+  irows = if (!(ans$allLen1 && (!nomatch0 || len.x==length(ans$starts)))) seqexp(ans$lens) ## otherwise NULL, meaning all rows of i as they are
+  len.i = if (is.null(irows)) nrow(i) else length(irows)
+
+  if (length(ans$xo) && length(xrows)) ## translate via ans$xo only when it is non-empty
+    xrows = ans$xo[xrows]
+  len.x = length(xrows)
+
+  if (len.i!=len.x) ## irows and xrows are expected to be aligned element by element
+    internal_error("dtmerge out len.i != len.x") # nocov
+
+  return(list(ans=ans, irows=irows, xrows=xrows))
+}
+
+# Repeats each position of 'x' as many times as its value, e.g. seqexp(c(2L,0L,1L)) is c(1L,1L,3L).
+#   A custom C implementation used before was ~2x faster, but this is fast enough not to maintain
+#   a new routine. Hopefully in the future rep() can recognize the ALTREP and use that, too.
+seqexp = function(x) rep(seq_along(x), times=x)
+perhaps.data.table = function(x) .Call(CperhapsDataTableR, x) ## R-level access to the C check; used by fdistinct to validate its 'x'
diff --git a/inst/tests/mergelist.Rraw b/inst/tests/mergelist.Rraw
index 9e6835cb71..a35c4f4103 100644
--- a/inst/tests/mergelist.Rraw
+++ b/inst/tests/mergelist.Rraw
@@ -6,10 +6,49 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
 } else {
   require(data.table)
   test = data.table:::test
+  perhaps.data.table = data.table:::perhaps.data.table
+  hasindex = data.table:::hasindex
+  fdistinct = data.table:::fdistinct
+  forderv = data.table:::forderv
 }
 
 addresses = function(x) vapply(x, address, "")
 
+# internal helpers
+
+test(1.01, perhaps.data.table(list()))
+test(1.02, perhaps.data.table(list(a=1:2)))
+test(1.03, perhaps.data.table(list(a=1:2, b=1:2)))
+test(1.04, perhaps.data.table(list(1:2, 1:2)), FALSE)
+
+test(2.01, fdistinct(list(x=c(1L,1:2), b=1:2), on="x", mult="last"), error="must be data.table")
+test(2.02, fdistinct(data.table(x=c(1L,1:2)), on="z", mult="last"), error="must be character column names of")
+test(2.03, fdistinct(data.table(x=c(1L,1:2)), on="x", mult="last", cols=character()), error="must be non-zero length, non-NA, integer or character columns of")
+test(2.04, fdistinct(data.table(x=c(1L,1:2), y=1:3), on="x", mult="last", copy=NA), error="must be TRUE or FALSE")
+d = data.table(x=1:2, y=1:2)
+test(2.05, ans<-fdistinct(d, on="x", mult="last"), d)
+test(2.06, intersect(addresses(ans), addresses(d)), character())
+test(2.07, ans<-fdistinct(d, on="x", mult="last", copy=FALSE), d)
+test(2.08, addresses(ans), addresses(d))
+d = data.table(x=c(2:1,2L), y=1:3)
+test(2.09, fdistinct(d, on="x", mult="first"), data.table(x=2:1, y=1:2))
+test(2.10, fdistinct(d, on="x", mult="last"), data.table(x=1:2, y=2:3))
+setattr(attr(setattr(d, "index", integer()), "index", TRUE), "__x", forderv(d, "x", retGrp=TRUE)) ## retGrp=T index #4386
+test(2.11, fdistinct(d, on="x", mult="first"), data.table(x=2:1, y=1:2))
+
+test(3.01, hasindex(d, "x"))
+test(3.02, hasindex(d, "x", retGrp=TRUE))
+setattr(attr(setattr(d, "index", integer()), "index", TRUE), "__x", forderv(d, "x"))              ## retGrp=F index #4386
+test(3.03, hasindex(d, "x"))
+test(3.04, !hasindex(d, "x", retGrp=TRUE))
+setattr(d, "index", NULL)
+test(3.05, !hasindex(d, "x"))
+test(3.06, !hasindex(d, "x", retGrp=TRUE))
+setattr(d, "index", integer())
+test(3.07, !hasindex(d, "x"))
+test(3.08, !hasindex(d, "x", retGrp=TRUE))
+rm(d)
+
 # cbindlist
 
 l = list(
@@ -57,7 +96,7 @@ attributes(d) = a[!names(a) %in% c("class",".internal.selfref","row.names")]
 test(13.01, class(d), "list")
 setDT(d)
 test(13.02, key(d), "x")
-# test(13.03, hasindex(d, "y") && hasindex(d, "z"))
+test(13.03, hasindex(d, "y") && hasindex(d, "z"))
 l = list(
   data.table(id1=1:5, id2=5:1, id3=1:5, v1=1:5),
   data.table(id4=5:1, id5=1:5, v2=1:5),
@@ -70,3 +109,36 @@ ans = cbindlist(l)
 test(13.04, key(ans), "id1")
 test(13.05, indices(ans), c("id1","id2","id3","id1__id2__id3","id6","id7","id9"))
 test(13.06, ii, lapply(l, indices)) ## this tests that original indices have not been touched, shallow_duplicate in mergeIndexAttrib
+
+## fdistinct, another round
+
+dt = data.table(x =
+c(74L, 103L, 158L, 250L, 56L, 248L, 260L, 182L, 174L, 17L, 57L,
+  49L, 189L, 106L, 212L, 137L, 198L, 273L, 105L, 214L, 258L, 59L,
+  180L, 35L, 74L, 107L, 4L, 106L, 240L, 94L, 133L, 165L, 136L,
+  52L, 228L, 184L, 219L, 30L, 200L, 114L, 226L, 178L, 216L, 153L,
+  146L, 218L, 7L, 132L, 202L, 191L, 132L, 237L, 121L, 68L, 20L,
+  28L, 87L, 143L, 183L, 112L, 252L, 81L, 127L, 92L, 179L, 71L,
+  132L, 211L, 24L, 241L, 94L, 231L, 96L, 92L, 131L, 246L, 238L,
+  108L, 214L, 265L, 120L, 196L, 110L, 90L, 209L, 56L, 196L, 34L,
+  68L, 40L, 66L, 17L, 177L, 241L, 215L, 220L, 126L, 113L, 223L,
+  167L, 181L, 98L, 75L, 273L, 175L, 59L, 36L, 132L, 255L, 165L,
+  269L, 202L, 99L, 119L, 41L, 4L, 197L, 29L, 123L, 177L, 273L,
+  137L, 134L, 48L, 208L, 125L, 141L, 58L, 63L, 164L, 159L, 22L,
+  10L, 177L, 256L, 165L, 155L, 145L, 271L, 140L, 188L, 166L, 66L,
+  71L, 201L, 125L, 49L, 206L, 29L, 238L, 170L, 154L, 91L, 125L,
+  138L, 50L, 146L, 21L, 77L, 59L, 79L, 247L, 123L, 215L, 243L,
+  114L, 18L, 93L, 200L, 93L, 174L, 232L, 236L, 108L, 105L, 247L,
+  178L, 204L, 167L, 249L, 81L, 53L, 244L, 139L, 242L, 53L, 209L,
+  200L, 260L, 151L, 196L, 107L, 28L, 256L, 78L, 163L, 31L, 232L,
+  88L, 216L, 74L, 61L, 143L, 74L, 50L, 143L, 155L, 36L, 71L, 198L,
+  265L, 28L, 210L, 261L, 226L, 85L, 179L, 263L, 263L, 94L, 73L,
+  46L, 89L, 141L, 255L, 141L, 71L, 13L, 115L, 235L, 96L, 37L, 103L,
+  174L, 108L, 190L, 190L, 153L, 119L, 125L, 85L, 160L, 251L, 40L,
+  115L, 59L, 118L, 37L, 127L, 260L, 210L, 257L, 130L, 166L, 134L,
+  30L, 69L, 138L, 103L, 258L, 145L, 88L, 77L, 217L, 194L, 46L,
+  18L, 208L, 171L, 47L, 18L, 30L, 105L, 47L, 83L)
+)
+ans = unique(dt, by="x")
+test(301.01, data.table(x=unique(dt$x)), ans) ## OK
+test(301.02, fdistinct(dt, on="x"), ans)      ## force sort=TRUE for the moment
diff --git a/src/init.c b/src/init.c
index 7189bb9da6..1ee4243145 100644
--- a/src/init.c
+++ b/src/init.c
@@ -150,6 +150,7 @@ R_CallMethodDef callMethods[] = {
 {"CconvertDate", (DL_FUNC)&convertDate, -1},
 {"Cnotchin", (DL_FUNC)&notchin, -1},
 {"Ccbindlist", (DL_FUNC) &cbindlist, -1},
+{"CperhapsDataTableR", (DL_FUNC) &perhapsDataTableR, -1},
 {"Cwarn_matrix_column_r", (DL_FUNC)&warn_matrix_column_r, -1},
 {NULL, NULL, 0}
 };

From 24e3a4043a587e9ff0add4d398952325e0d2bd8d Mon Sep 17 00:00:00 2001
From: Michael Chirico <chiricom@google.com>
Date: Thu, 29 Aug 2024 17:24:08 +0000
Subject: [PATCH 7/9] cbindlist

add cbind by reference, timing

R prototype of mergelist

wording

use lower overhead funs

stick to int32 for now, correct R_alloc

bmerge C refactor for codecov and one loop for speed

address revealed codecov gaps

refactor vecseq for codecov

seqexp helper, some alloccol export on C

bmerge codecov, types handled in R bmerge already

better comment seqexp

bmerge mult=error #655

multiple new C utils

swap if branches

explain new C utils

comments mostly

reduce conflicts to PR #4386

comment C code

address multiple matches during update-on-join #3747

Revert "address multiple matches during update-on-join #3747"

This reverts commit b64c0c3480fe9415bbda6729c361621e60da6e01.

merge.dt has temporarily mult arg, for testing

minor changes to cbindlist c

dev mergelist, for single pair now

add quiet option to cc()

mergelist tests

add check for names to perhaps.dt

rm mult from merge.dt method

rework, clean, polish multer, fix right and full joins

make full join symmetric

mergepair inner function to loop on

extra check for symmetric

mergelist manual

ensure no df-dt passed where list expected

comments and manual

handle 0 cols tables

more tests

more tests and debugging

move more logic closer to bmerge, simplify mergepair

more tests

revert not used changes

reduce not needed checks, cleanup

copy arg behavior, manual, no tests yet

cbindlist manual, export both

cleanup processing bmerge to dtmatch

test function match order for easier preview

vecseq gets short-circuit

batch test allow browser

big cleanup

remove unneeded stuff, reduce diff

more cleanup, minor manual fixes

add proper test scripts

Merge branch 'master' into cbind-merge-list

comment out not used code for coverage

more tests, some nocopy opts

rename sql test script, should fix codecov

simplify dtmatch inner branch

more precise copy, now copy only T or F

unused arg not yet in api, wording

comments and refer issues

codecov

hasindex coverage

codecov gap

tests for join using key, cols argument

fix missing import forderv

more tests, improve missing on handling

more tests for order of inner and full join for long keys

new allow.cartesian option, #4383, #914

reduce diff, improve codecov

reduce diff, comments

need more DT, not lists, mergelist 3+ tbls

proper escape heavy check

unit tests

more tests, address overalloc failure

mergelist and cbindlist retain index

manual, examples

fix manual

minor clarify in manual

retain keys, right outer join for snowflake schema joins

duplicates in cbindlist

recycling in cbindlist

escape 0 input in copyCols

empty input handling

closing cbindlist

vectorized _on_ and _join.many_ arg

rename dtmatch to dtmerge

vectorized args: how, mult
push down input validation
add support for cross join, semi join, anti join

full join, reduce overhead for mult=error

mult default value dynamic

fix manual

add "see details" to Rd

mention shared on in arg description

amend feedback from Michael

semi and anti joins will not reorder x columns

Merge branch 'master' into cbind-merge-list

spelling, thx to @jan-glx

check all new funs used and add comments

bugfix, sort=T needed for now

Merge branch 'master' into cbind-merge-list

Update NEWS.md

Merge branch 'master' into cbind-merge-list

Merge branch 'master' into cbind-merge-list

NEWS placement

numbering

ascArg->order

Merge remote-tracking branch 'origin/cbind-merge-list' into cbind-merge-list

attempt to restore from master

Update to stopf() error style

Need isFrame for now

More quality checks: any(!x)->!all(x); use vapply_1{b,c,i}

really restore from master

try to PROTECT() before duplicate()

update error message in test

appease the rchk gods

extraneous space

missing ';'

use catf

simplify perhapsDataTableR

move sqlite.Rraw.manual into other.Rraw

simplify for loop

Merge remote-tracking branch 'origin/cbind-merge-list' into cbind-merge-list
---
 R/mergelist.R             | 110 +++++++++++++++++++++
 R/onLoad.R                |   1 +
 inst/tests/mergelist.Rraw | 194 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 305 insertions(+)

diff --git a/R/mergelist.R b/R/mergelist.R
index 435ee0a601..52ce68493a 100644
--- a/R/mergelist.R
+++ b/R/mergelist.R
@@ -125,6 +125,116 @@ dtmerge = function(x, i, on, how, mult, join.many, void=FALSE, verbose) {
   return(list(ans=ans, irows=irows, xrows=xrows))
 }
 
+# atomic join between two tables: joins 'lhs' and 'rhs' on 'on' columns according to 'how' (inner, left, right, full, semi, anti, cross) and 'mult', result is passed through setDT(); when copy=TRUE columns taken as-is from the join-from table are copied
+mergepair = function(lhs, rhs, on, how, mult, lhs.cols=names(lhs), rhs.cols=names(rhs), copy=TRUE, join.many=TRUE, verbose=FALSE) {
+  semianti = how=="semi" || how=="anti"
+  innerfull = how=="inner" || how=="full"
+  {
+    if (how!="cross") {
+      if (is.null(on)) {
+        if (how=="left" || semianti) on = key(rhs)
+        else if (how=="right") on = key(lhs)
+        else if (innerfull) on = onkeys(key(lhs), key(rhs))
+        if (is.null(on))
+          stopf("'on' is missing and necessary key is not present")
+      }
+      if (any(bad.on <- !on %chin% names(lhs)))
+        stopf("'on' argument specify columns to join [%s] that are not present in LHS table [%s]", brackify(on[bad.on]), brackify(names(lhs)))
+      if (any(bad.on <- !on %chin% names(rhs)))
+        stopf("'on' argument specify columns to join [%s] that are not present in RHS table [%s]", brackify(on[bad.on]), brackify(names(rhs)))
+    } else if (is.null(on)) {
+      on = character() ## cross join only
+    }
+  } ## resolve 'on' (falls back to keys when missing) and validate it against both tables
+  {
+    if (how!="right") {
+      jnfm = lhs; fm.cols = lhs.cols; jnto = rhs; to.cols = rhs.cols
+    } else {
+      jnfm = rhs; fm.cols = rhs.cols; jnto = lhs; to.cols = lhs.cols
+    }
+  } ## join-to and join-from tables and columns (right outer join swap)
+
+  ## ensure symmetric join for inner|full join: apply mult on both tables, because bmerge applies it to the 'x' table only
+  cp.i = FALSE ## marks whether out.i has already been copied
+  if ((innerfull) && !is.null(mult) && (mult=="first" || mult=="last")) {
+    jnfm = fdistinct(jnfm, on=on, mult=mult, cols=fm.cols, copy=FALSE) ## might not copy when already unique by 'on'
+    cp.i = nrow(jnfm)!=nrow(lhs) ## nrow(lhs) bc how='inner|full' so jnfm=lhs
+  } else if (how=="inner" && (is.null(mult) || mult=="error")) { ## we do this branch only to raise error from bmerge, we cannot use forder to just find duplicates because those duplicates might not have matching rows in another table, full join checks mult='error' during two non-void bmerges
+    dtmerge(x=jnfm, i=jnto, on=on, how=how, mult=mult, verbose=verbose, join.many=join.many, void=TRUE)
+  }
+
+  ## binary merge
+  ans = dtmerge(x=jnto, i=jnfm, on=on, how=how, mult=mult, verbose=verbose, join.many=join.many)
+
+  ## make i side (join-from table): shallow copy when no row subset is needed, otherwise subset rows
+  out.i = if (is.null(ans$irows))
+    .shallow(jnfm, cols=someCols(jnfm, fm.cols, keep=on, retain.order=semianti), retain.key=TRUE)
+  else
+    .Call(CsubsetDT, jnfm, ans$irows, someCols(jnfm, fm.cols, keep=on, retain.order=semianti))
+  cp.i = cp.i || !is.null(ans$irows)
+
+  ## make x side (join-to table), empty for semi|anti join
+  if (semianti) {
+    out.x = list(); cp.x = TRUE
+  } else {
+    out.x = if (is.null(ans$xrows)) ## as of now xrows cannot be NULL #4409 thus nocov below
+      internal_error("dtmerge()$xrows returned NULL, #4409 been resolved but related code has not been updated?") #.shallow(jnto, cols=someCols(jnto, to.cols, drop=on), retain.key=TRUE) # nocov ## as of now nocov does not make difference r-lib/covr#279
+    else
+      .Call(CsubsetDT, jnto, ans$xrows, someCols(jnto, to.cols, drop=on))
+    cp.x = !is.null(ans$xrows)
+    ## ensure no duplicated column names in merge results
+    if (any(dup.i<-names(out.i) %chin% names(out.x)))
+      stopf("merge result has duplicated column names, use 'cols' argument or rename columns in 'l' tables, duplicated column(s): %s", brackify(names(out.i)[dup.i]))
+  }
+
+  ## stack i and x column-wise
+  if (how!="full") {
+    if (!cp.i && copy) out.i = copy(out.i)
+    #if (!cp.x && copy) out.x = copy(out.x) ## as of now cp.x always TRUE, search for #4409 here
+    out = .Call(Ccbindlist, list(out.i, out.x), FALSE)
+    if (how=="right") setcolorder(out, neworder=c(on, names(out.x))) ## arrange columns: i.on, x.cols, i.cols
+  } else { # how=="full"
+    ## we made left join side above, proceed to right join side, so swap tbls
+    jnfm = rhs; fm.cols = rhs.cols; jnto = lhs; to.cols = lhs.cols
+
+    cp.r = FALSE
+    if (!is.null(mult) && (mult=="first" || mult=="last")) {
+      jnfm = fdistinct(jnfm, on=on, mult=mult, cols=fm.cols, copy=FALSE)
+      cp.r = nrow(jnfm)!=nrow(rhs) ## nrow(rhs) bc jnfm=rhs
+    } ## mult=="error" check was made on one side already, below we do on the second side, test 101.43
+
+    ## binary merge anti join
+    bns = dtmerge(x=jnto, i=jnfm, on=on, how="anti", mult=if (!is.null(mult) && mult!="all") mult, verbose=verbose, join.many=join.many)
+
+    ## make anti join side
+    out.r = if (is.null(bns$irows))
+      .shallow(jnfm, cols=someCols(jnfm, fm.cols, keep=on), retain.key=TRUE) ## retain.key is used only in the edge case when !nrow(out.i)
+    else
+      .Call(CsubsetDT, jnfm, bns$irows, someCols(jnfm, fm.cols, keep=on))
+    cp.r = cp.r || !is.null(bns$irows)
+
+    ## short-circuit to avoid rbindlist on empty sets, this also retains keys
+    if (!nrow(out.r)) { ## possibly also !nrow(out.i)
+      if (!cp.i && copy) out.i = copy(out.i)
+      #if (!cp.x && copy) out.x = copy(out.x) ## as of now cp.x always TRUE, search for #4409 here
+      out = .Call(Ccbindlist, list(out.i, out.x), FALSE)
+    } else if (!nrow(out.i)) { ## but not !nrow(out.r)
+      if (!cp.r && copy) out.r = copy(out.r)
+      if (length(add<-setdiff(names(out.i), names(out.r)))) { ## add missing columns of proper types NA
+        neworder = copy(names(out.i)) #set(out.r, NULL, add, lapply(unclass(out.i)[add], `[`, 1L)) ## 291.04 overalloc exceed fail during set()
+        out.i = lapply(unclass(out.i)[add], `[`, seq_len(nrow(out.r))) ## could eventually remove this when cbindlist recycles 0 rows up, note that we need out.r not to be copied
+        out.r = .Call(Ccbindlist, list(out.r, out.i), FALSE)
+        setcolorder(out.r, neworder=neworder)
+      }
+      out = out.r
+    } else { ## inputs might not have been copied yet, rbindlist will copy
+      out.l = .Call(Ccbindlist, list(out.i, out.x), FALSE)
+      out = rbindlist(list(out.l, out.r), use.names=TRUE, fill=TRUE)
+    }
+  }
+  setDT(out)
+}
+
 # Previously, we had a custom C implementation here, which is ~2x faster,
 #   but this is fast enough we don't bother maintaining a new routine.
 #   Hopefully in the future rep() can recognize the ALTREP and use that, too.
diff --git a/R/onLoad.R b/R/onLoad.R
index ef96849e85..ff8b18c02d 100644
--- a/R/onLoad.R
+++ b/R/onLoad.R
@@ -82,6 +82,7 @@
        "datatable.print.trunc.cols"="FALSE",   # for print.data.table
        "datatable.show.indices"="FALSE",       # for print.data.table
        "datatable.allow.cartesian"="FALSE",    # datatable.<argument name>
+       "datatable.join.many"="TRUE",           # mergelist, [.data.table #4383 #914
        "datatable.dfdispatchwarn"="TRUE",                   # not a function argument
        "datatable.warnredundantby"="TRUE",                  # not a function argument
        "datatable.alloccol"="1024L",           # argument 'n' of alloc.col. Over-allocate 1024 spare column slots
diff --git a/inst/tests/mergelist.Rraw b/inst/tests/mergelist.Rraw
index a35c4f4103..422d8d7097 100644
--- a/inst/tests/mergelist.Rraw
+++ b/inst/tests/mergelist.Rraw
@@ -6,6 +6,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
 } else {
   require(data.table)
   test = data.table:::test
+  mergepair = data.table:::mergepair
   perhaps.data.table = data.table:::perhaps.data.table
   hasindex = data.table:::hasindex
   fdistinct = data.table:::fdistinct
@@ -13,6 +14,16 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
 }
 
 addresses = function(x) vapply(x, address, "")
+copied = function(ans, l) { ## TRUE when no column of 'ans' shares an address with any column of the tables in 'l'
+  !any(addresses(ans) %chin% unlist(lapply(l, addresses), recursive=FALSE))
+}
+notcopied = function(ans, l, how="left", unless=character()) { ## TRUE when 'ans' reuses all columns of the input table expected for 'how'
+  if (how %chin% unless) return(copied(ans, l)) ## 'unless' cases are expected to be copied, eases escaping in looping tests
+  from.ans = addresses(ans); lhs.tbl = l[[1L]]; rhs.tbl = l[[length(l)]]
+  shared = function(tbl) all(addresses(tbl) %chin% from.ans)
+  if (how=="full") return(shared(lhs.tbl) || shared(rhs.tbl)) ## either side, left|right, not copied is fine
+  if (how=="right") shared(rhs.tbl) else shared(lhs.tbl)
+}
 
 # internal helpers
 
@@ -110,6 +121,189 @@ test(13.04, key(ans), "id1")
 test(13.05, indices(ans), c("id1","id2","id3","id1__id2__id3","id6","id7","id9"))
 test(13.06, ii, lapply(l, indices)) ## this tests that original indices have not been touched, shallow_duplicate in mergeIndexAttrib
 
+# mergepair
+
+## test copy-ness argument in mergepair
+
+### LHS equal to RHS: no copy in all cases
+num = 21.000
+l = list(
+  lhs = data.table(id1=1:2, v1=1:2),
+  rhs = data.table(id1=1:2, v2=1:2)
+)
+expected = data.table(id1=1:2, v1=1:2, v2=1:2)
+for (how in c("inner","left","right","full")) {
+  num = trunc(num*10)/10 + 0.1
+  for (mult in c("all","first","last","error")) {
+    num = trunc(num*100)/100 + 0.01
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected) ## copy=TRUE: no shared columns
+    test(num<-num+0.001, copied(ans, l))
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected) ## copy=FALSE: LHS shared but no RHS
+    test(num<-num+0.001, notcopied(ans, l, how=how))
+  }
+}
+### RHS includes LHS: no copy in inner, left, right
+num = 22.000
+unless = "full"
+l = list(
+  lhs = data.table(id1=1:2, v1=1:2),
+  rhs = data.table(id1=1:3, v2=1:3)
+)
+expected = list(
+  inner = data.table(id1=1:2, v1=1:2, v2=1:2),
+  left = data.table(id1=1:2, v1=1:2, v2=1:2),
+  right = data.table(id1=1:3, v1=c(1:2,NA), v2=1:3),
+  full = data.table(id1=1:3, v1=c(1:2,NA), v2=1:3)
+)
+for (how in c("inner","left","right","full")) {
+  num = trunc(num*10)/10 + 0.1
+  for (mult in c("all","first","last","error")) {
+    num = trunc(num*100)/100 + 0.01
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected[[how]])
+    test(num<-num+0.001, copied(ans, l))
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected[[how]])
+    test(num<-num+0.001, notcopied(ans, l, how=how, unless=unless))
+  }
+}
+### LHS includes RHS: no copy in left, right, full
+num = 23.000
+unless = "inner"
+l = list(
+  lhs = data.table(id1=1:3, v1=1:3),
+  rhs = data.table(id1=1:2, v2=1:2)
+)
+expected = list(
+  inner = data.table(id1=1:2, v1=1:2, v2=1:2),
+  left = data.table(id1=1:3, v1=1:3, v2=c(1:2,NA)),
+  right = data.table(id1=1:2, v1=1:2, v2=1:2),
+  full = data.table(id1=1:3, v1=1:3, v2=c(1:2,NA))
+)
+for (how in c("inner","left","right","full")) {
+  num = trunc(num*10)/10 + 0.1
+  for (mult in c("all","first","last","error")) {
+    num = trunc(num*100)/100 + 0.01
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected[[how]])
+    test(num<-num+0.001, copied(ans, l))
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected[[how]])
+    test(num<-num+0.001, notcopied(ans, l, how=how, unless=unless))
+  }
+}
+### LHS single nonmatch RHS on both sides: no copy in left, right
+num = 24.000
+unless = c("inner","full")
+l = list(
+  lhs = data.table(id1=3:1, v1=1:3),
+  rhs = data.table(id1=c(4L,2:1), v2=1:3)
+)
+expected = list(
+  inner = data.table(id1=2:1, v1=2:3, v2=2:3),
+  left = data.table(id1=3:1, v1=1:3, v2=c(NA,2:3)),
+  right = data.table(id1=c(4L,2:1), v1=c(NA,2:3), v2=1:3),
+  full = data.table(id1=c(3:1,4L), v1=c(1:3,NA), v2=c(NA,2:3,1L))
+)
+for (how in c("inner","left","right","full")) {
+  num = trunc(num*10)/10 + 0.1
+  for (mult in c("all","first","last","error")) {
+    num = trunc(num*100)/100 + 0.01
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected[[how]])
+    test(num<-num+0.001, copied(ans, l))
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected[[how]])
+    test(num<-num+0.001, notcopied(ans, l, how=how, unless=unless))
+  }
+}
+### LHS zero match RHS: no copy in left, right
+num = 25.000
+unless = c("inner","full")
+l = list(
+  lhs = data.table(id1=2:1, v1=1:2),
+  rhs = data.table(id1=3:4, v2=1:2)
+)
+expected = list(
+  inner = data.table(id1=integer(), v1=integer(), v2=integer()),
+  left = data.table(id1=2:1, v1=1:2, v2=c(NA_integer_,NA)),
+  right = data.table(id1=3:4, v1=c(NA_integer_,NA), v2=1:2),
+  full = data.table(id1=c(2:1,3:4), v1=c(1:2,NA,NA), v2=c(NA,NA,1:2))
+)
+for (how in c("inner","left","right","full")) {
+  num = trunc(num*10)/10 + 0.1
+  for (mult in c("all","first","last","error")) {
+    num = trunc(num*100)/100 + 0.01
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected[[how]])
+    test(num<-num+0.001, copied(ans, l))
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected[[how]])
+    test(num<-num+0.001, notcopied(ans, l, how=how, unless=unless))
+  }
+}
+### LHS and RHS zero nrow: no copies
+num = 26.000
+unless = character()
+l = list(
+  lhs = data.table(id1=integer(), v1=integer()),
+  rhs = data.table(id1=integer(), v2=integer())
+)
+expected = list(
+  inner = data.table(id1=integer(), v1=integer(), v2=integer()),
+  left = data.table(id1=integer(), v1=integer(), v2=integer()),
+  right = data.table(id1=integer(), v1=integer(), v2=integer()),
+  full = data.table(id1=integer(), v1=integer(), v2=integer())
+)
+for (how in c("inner","left","right","full")) {
+  num = trunc(num*10)/10 + 0.1
+  for (mult in c("all","first","last","error")) {
+    num = trunc(num*100)/100 + 0.01
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected[[how]])
+    test(num<-num+0.001, copied(ans, l))
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected[[how]])
+    test(num<-num+0.001, notcopied(ans, l, how=how, unless=unless))
+  }
+}
+### LHS has zero nrow: no copies
+num = 27.000
+unless = character()
+l = list(
+  lhs = data.table(id1=integer(), v1=integer()),
+  rhs = data.table(id1=2:1, v2=1:2)
+)
+expected = list(
+  inner = data.table(id1=integer(), v1=integer(), v2=integer()),
+  left = data.table(id1=integer(), v1=integer(), v2=integer()),
+  right = data.table(id1=2:1, v1=c(NA_integer_,NA), v2=1:2),
+  full = data.table(id1=2:1, v1=c(NA_integer_,NA), v2=1:2)
+)
+for (how in c("inner","left","right","full")) {
+  num = trunc(num*10)/10 + 0.1
+  for (mult in c("all","first","last","error")) {
+    num = trunc(num*100)/100 + 0.01
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected[[how]])
+    test(num<-num+0.001, copied(ans, l))
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected[[how]])
+    test(num<-num+0.001, notcopied(ans, l, how=how, unless=unless))
+  }
+}
+### RHS has zero nrow: no copy in left, right, full
+num = 28.000
+unless = "inner"
+l = list(
+  lhs = data.table(id1=2:1, v1=1:2),
+  rhs = data.table(id1=integer(), v2=integer())
+)
+expected = list(
+  inner = data.table(id1=integer(), v1=integer(), v2=integer()),
+  left = data.table(id1=2:1, v1=1:2, v2=c(NA_integer_,NA)),
+  right = data.table(id1=integer(), v1=integer(), v2=integer()),
+  full = data.table(id1=2:1, v1=1:2, v2=c(NA_integer_,NA))
+)
+for (how in c("inner","left","right","full")) {
+  num = trunc(num*10)/10 + 0.1
+  for (mult in c("all","first","last","error")) {
+    num = trunc(num*100)/100 + 0.01
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=TRUE), expected[[how]])
+    test(num<-num+0.001, copied(ans, l))
+    test(num<-num+0.001, ans <- mergepair(l$lhs, l$rhs, on="id1", how=how, mult=mult, copy=FALSE), expected[[how]])
+    test(num<-num+0.001, notcopied(ans, l, how=how, unless=unless))
+  }
+}
+
 ## fdistinct, another round
 
 dt = data.table(x =

From 3c56146b7dd470ec473c339c630f863958c21b84 Mon Sep 17 00:00:00 2001
From: Michael Chirico <chiricom@google.com>
Date: Thu, 29 Aug 2024 17:24:35 +0000
Subject: [PATCH 8/9] cbindlist

add cbind by reference, timing

R prototype of mergelist

wording

use lower overhead funs

stick to int32 for now, correct R_alloc

bmerge C refactor for codecov and one loop for speed

address revealed codecov gaps

refactor vecseq for codecov

seqexp helper, some alloccol export on C

bmerge codecov, types handled in R bmerge already

better comment seqexp

bmerge mult=error #655

multiple new C utils

swap if branches

explain new C utils

comments mostly

reduce conflicts to PR #4386

comment C code

address multiple matches during update-on-join #3747

Revert "address multiple matches during update-on-join #3747"

This reverts commit b64c0c3480fe9415bbda6729c361621e60da6e01.

merge.dt has temporarily mult arg, for testing

minor changes to cbindlist c

dev mergelist, for single pair now

add quiet option to cc()

mergelist tests

add check for names to perhaps.dt

rm mult from merge.dt method

rework, clean, polish multer, fix right and full joins

make full join symmetric

mergepair inner function to loop on

extra check for symmetric

mergelist manual

ensure no df-dt passed where list expected

comments and manual

handle 0 cols tables

more tests

more tests and debugging

move more logic closer to bmerge, simplify mergepair

more tests

revert not used changes

reduce not needed checks, cleanup

copy arg behavior, manual, no tests yet

cbindlist manual, export both

cleanup processing bmerge to dtmatch

test function match order for easier preview

vecseq gets short-circuit

batch test allow browser

big cleanup

remove unneeded stuff, reduce diff

more cleanup, minor manual fixes

add proper test scripts

Merge branch 'master' into cbind-merge-list

comment out not used code for coverage

more tests, some nocopy opts

rename sql test script, should fix codecov

simplify dtmatch inner branch

more precise copy, now copy only T or F

unused arg not yet in api, wording

comments and refer issues

codecov

hasindex coverage

codecov gap

tests for join using key, cols argument

fix missing import forderv

more tests, improve missing on handling

more tests for order of inner and full join for long keys

new allow.cartesian option, #4383, #914

reduce diff, improve codecov

reduce diff, comments

need more DT, not lists, mergelist 3+ tbls

proper escape heavy check

unit tests

more tests, address overalloc failure

mergelist and cbindlist retain index

manual, examples

fix manual

minor clarify in manual

retain keys, right outer join for snowflake schema joins

duplicates in cbindlist

recycling in cbindlist

escape 0 input in copyCols

empty input handling

closing cbindlist

vectorized _on_ and _join.many_ arg

rename dtmatch to dtmerge

vectorized args: how, mult
push down input validation
add support for cross join, semi join, anti join

full join, reduce overhead for mult=error

mult default value dynamic

fix manual

add "see details" to Rd

mention shared on in arg description

amend feedback from Michael

semi and anti joins will not reorder x columns

Merge branch 'master' into cbind-merge-list

spelling, thx to @jan-glx

check all new funs used and add comments

bugfix, sort=T needed for now

Merge branch 'master' into cbind-merge-list

Update NEWS.md

Merge branch 'master' into cbind-merge-list

Merge branch 'master' into cbind-merge-list

NEWS placement

numbering

ascArg->order

Merge remote-tracking branch 'origin/cbind-merge-list' into cbind-merge-list

attempt to restore from master

Update to stopf() error style

Need isFrame for now

More quality checks: any(!x)->!all(x); use vapply_1{b,c,i}

really restore from master

try to PROTECT() before duplicate()

update error message in test

appease the rchk gods

extraneous space

missing ';'

use catf

simplify perhapsDataTableR

move sqlite.Rraw.manual into other.Rraw

simplify for loop

Merge remote-tracking branch 'origin/cbind-merge-list' into cbind-merge-list

Co-authored-by: Jan Gorecki <jangorecki@users.noreply.github.com>
---
 src/data.table.h |  1 +
 src/init.c       |  1 +
 src/mergelist.c  | 17 +++++++++++++++++
 3 files changed, 19 insertions(+)

diff --git a/src/data.table.h b/src/data.table.h
index f4d22b95ab..70f5a9bb12 100644
--- a/src/data.table.h
+++ b/src/data.table.h
@@ -286,6 +286,7 @@ SEXP notchin(SEXP x, SEXP table);
 
 // mergelist.c
 SEXP cbindlist(SEXP x, SEXP copyArg);
+SEXP copyCols(SEXP x, SEXP cols);
 
 // functions called from R level .Call/.External and registered in init.c
 // these now live here to pass -Wstrict-prototypes, #5477
diff --git a/src/init.c b/src/init.c
index 1ee4243145..02959bdcd6 100644
--- a/src/init.c
+++ b/src/init.c
@@ -151,6 +151,7 @@ R_CallMethodDef callMethods[] = {
 {"Cnotchin", (DL_FUNC)&notchin, -1},
 {"Ccbindlist", (DL_FUNC) &cbindlist, -1},
 {"CperhapsDataTableR", (DL_FUNC) &perhapsDataTableR, -1},
+{"CcopyCols", (DL_FUNC) &copyCols, -1},
 {"Cwarn_matrix_column_r", (DL_FUNC)&warn_matrix_column_r, -1},
 {NULL, NULL, 0}
 };
diff --git a/src/mergelist.c b/src/mergelist.c
index 77c4287736..60c508977e 100644
--- a/src/mergelist.c
+++ b/src/mergelist.c
@@ -1,5 +1,22 @@
 #include "data.table.h"
 
+// set(x, NULL, cols, copy(unclass(x)[cols])) ## but keeps the index; replaces columns 'cols' (1-based integer) of 'x' in place by their duplicates, returns NULL
+SEXP copyCols(SEXP x, SEXP cols) {
+  // used in R/mergelist.R
+  if (!isDataTable(x))
+    error("'x' must be a data.table"); // # nocov
+  if (!isInteger(cols))
+    error("'cols' must be integer"); // # nocov
+  int nx = length(x), ncols = LENGTH(cols), *colsp = INTEGER(cols);
+  if (!nx || !ncols)
+    return R_NilValue;
+  for (int i=0; i<ncols; ++i) {
+    if (colsp[i]<1 || colsp[i]>nx) internal_error(__func__, "'cols' must be within 1:length(x)"); // # nocov
+    SET_VECTOR_ELT(x, colsp[i]-1, duplicate(VECTOR_ELT(x, colsp[i]-1))); // index is in range here, NA_INTEGER is below 1 so it was rejected above
+  }
+  return R_NilValue;
+}
+
 void mergeIndexAttrib(SEXP to, SEXP from) {
   if (!isInteger(to) || LENGTH(to)!=0)
     internal_error(__func__, "'to' must be integer() already"); // # nocov

From 82070159ec19c0eb34b06941deb847dc72b90c58 Mon Sep 17 00:00:00 2001
From: Michael Chirico <chiricom@google.com>
Date: Tue, 1 Jul 2025 22:27:42 +0000
Subject: [PATCH 9/9] nocov errors-->internal_error

---
 src/mergelist.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mergelist.c b/src/mergelist.c
index 1954f0973a..eb676d8092 100644
--- a/src/mergelist.c
+++ b/src/mergelist.c
@@ -4,9 +4,9 @@
 SEXP copyCols(SEXP x, SEXP cols) {
   // used in R/mergelist.R
   if (!isDataTable(x))
-    error("'x' must be a data.table"); // # nocov
+    internal_error(__func__, "'x' must be a data.table"); // # nocov
   if (!isInteger(cols))
-    error("'cols' must be integer"); // # nocov
+    internal_error(__func__, "'cols' must be integer"); // # nocov
   int nx = length(x), ncols = LENGTH(cols), *colsp = INTEGER(cols);
   if (!nx || !ncols)
     return R_NilValue;