Merge branch 'master' into issue6964

venom1204 · web-flow · commit 10435827a071 · 2025-07-02T00:55:37.000+05:30
diff --git a/R/data.table.R b/R/data.table.R
@@ -542,12 +542,25 @@ replace_dot_alias = function(e) {
           # Really, `anyDuplicated` in base is AWESOME!
           # allow.cartesian shouldn't error if a) not-join, b) 'i' has no duplicates
           if (verbose) {last.started.at=proc.time();catf("Constructing irows for '!byjoin || nqbyjoin' ... ");flush.console()}
-          irows = if (allLen1) f__ else vecseq(f__,len__,
-            if (allow.cartesian ||
-                notjoin || # #698. When notjoin=TRUE, ignore allow.cartesian. Rows in answer will never be > nrow(x).
-                !anyDuplicated(f__, incomparables = c(0L, NA_integer_))) {
-              NULL # #742. If 'i' has no duplicates, ignore
-            } else as.double(nrow(x)+nrow(i))) # rows in i might not match to x so old max(nrow(x),nrow(i)) wasn't enough. But this limit now only applies when there are duplicates present so the reason now for nrow(x)+nrow(i) is just to nail it down and be bigger than max(nrow(x),nrow(i)).
+          if (allLen1) {
+            irows = f__
+          } else {
+            join.many = isTRUE(getOption("datatable.join.many", TRUE)) # #914, default TRUE for backward compatibility
+            anyDups = !notjoin &&
+              (
+                # #698. When notjoin=TRUE, ignore allow.cartesian. Rows in answer will never be > nrow(x).
+                (join.many && !allow.cartesian) ||
+                # special case of scalar i match to const duplicated x, not handled by anyDuplicate: data.table(x=c(1L,1L))[data.table(x=1L), on="x"]
+                (!join.many && (length(f__) != 1L || len__ != nrow(x)))
+              ) &&
+              anyDuplicated(f__, incomparables = c(0L, NA_integer_)) > 0L
+            limit = if (anyDups) { # #742. If 'i' has no duplicates, ignore
+              if (!join.many) stopf("Joining resulted in many-to-many join. Perform quality check on your data, use mult!='all', or set 'datatable.join.many' option to TRUE to allow rows explosion.")
+              if (allow.cartesian) internal_error("checking allow.cartesian and join.many, unexpected else branch reached") # nocov
+              as.double(nrow(x)+nrow(i)) # rows in i might not match to x so old max(nrow(x),nrow(i)) wasn't enough. But this limit now only applies when there are duplicates present so the reason now for nrow(x)+nrow(i) is just to nail it down and be bigger than max(nrow(x),nrow(i)).
+            }
+            irows = vecseq(f__, len__, limit)
+          }
           if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()}
           # Fix for #1092 and #1074
           # TODO: implement better version of "any"/"all"/"which" to avoid
diff --git a/R/mergelist.R b/R/mergelist.R
@@ -10,3 +10,141 @@ cbindlist_impl_ = function(l, copy) {
 
 cbindlist = function(l) cbindlist_impl_(l, copy=TRUE)
 setcbindlist = function(l) cbindlist_impl_(l, copy=FALSE)
+
+# when 'on' is missing then use keys, used only for inner and full join; x and y are presumably the key vectors of the two tables (either may be NULL)
+onkeys = function(x, y) {
+  if (is.null(x) && !is.null(y)) return(y) # only y is set: return it unchanged
+  if (!is.null(x) && is.null(y)) return(x) # only x is set: return it unchanged
+  if (!is.null(x) && !is.null(y)) { # both set: keep only the names present in both
+    if (length(x) >= length(y))
+      return(intersect(y, x)) ## align order to shorter|rhs key
+    else
+      return(intersect(x, y)) ## x is strictly shorter here, so the result follows the order of x
+  }
+  NULL # nocov. Internal error is being called later in mergepair
+}
+
+# column index selection helper: returns 'keep' followed by those 'cols' that are not listed in 'drop' (duplicates removed by union)
+someCols = function(x, cols, drop=character(), keep=character(), retain.order=FALSE) {
+  keep = colnamesInt(x, keep) # presumably resolves column names/numbers to integer positions in x -- confirm against colnamesInt
+  drop = colnamesInt(x, drop)
+  cols = colnamesInt(x, cols)
+  ans = union(keep, setdiff(cols, drop)) # 'drop' only filters 'cols'; entries of 'keep' always survive and come first
+  if (!retain.order) return(ans)
+  sort(ans) # retain.order=TRUE: ascending order of the resolved values instead of keep-first order
+}
+
+hasindex = function(x, by, retGrp=FALSE) { # TRUE when x carries an index entry for columns 'by'; with retGrp=TRUE that entry must also have a 'starts' attribute
+  index = attr(x, "index", TRUE) # third argument is exact=TRUE: no partial matching of the attribute name
+  if (is.null(index)) return(FALSE) # no index attribute at all
+  idx_name = paste0("__", by, collapse="") # e.g. by=c('a','b') gives '__a__b'
+  idx = attr(index, idx_name, TRUE)
+  if (is.null(idx)) return(FALSE) # index attribute exists but not for these columns
+  if (!retGrp) return(TRUE) # presence of the entry is enough
+  !is.null(attr(idx, "starts", TRUE)) # grouping information requested: require the 'starts' attribute on the stored entry
+}
+
+# fdistinct applies mult='first|last': keeps one row (the first or the last) for each distinct combination of the 'on' columns
+# for mult='first' it is unique(x, by=on)[, c(on, cols), with=FALSE]
+# it may not copy when copy=FALSE and x is unique by 'on'
+fdistinct = function(x, on=key(x), mult=c("first", "last"), cols=seq_along(x), copy=TRUE) { # result columns: 'on' first, then the remaining 'cols' (see someCols with keep=on)
+  if (!perhaps.data.table(x))
+    stopf("'x' must be data.table")
+  if (!is.character(on) || !length(on) || anyNA(on) || !all(on %chin% names(x)))
+    stopf("'on' must be character column names of 'x' argument")
+  mult = match.arg(mult) # default resolves to "first"
+  if (is.null(cols)) # NULL means all columns of x
+    cols = seq_along(x)
+  else if (!(is.character(cols) || is.integer(cols)) || !length(cols) || anyNA(cols))
+    stopf("'cols' must be non-zero length, non-NA, integer or character columns of 'x' argument")
+  if (!isTRUEorFALSE(copy))
+    stopf("'%s' must be TRUE or FALSE", "copy")
+  ## do not compute sort=F for mult="first" if index (sort=T) already available, sort=T is needed only for mult="last"
+  ## this short circuit will work after #4386 because it requires retGrp=T
+  #### sort = mult!="first" || hasindex(x, by=on, retGrp=TRUE)
+  sort = TRUE ## above line does not work for the moment, test 302.02 -- NOTE(review): the test in mergelist.Rraw that forces sort=TRUE is numbered 301.02; confirm which number is intended
+  o = forderv(x, by=on, sort=sort, retGrp=TRUE) # ordering of x by 'on'; group info is read below from its 'maxgrpn' and 'starts' attributes
+  if (attr(o, "maxgrpn", TRUE) <= 1L) { # presumably the largest group size: <= 1 means x is already unique by 'on'
+    ans = .shallow(x, someCols(x, cols, keep=on), retain.key=TRUE) # no row subsetting needed, only column selection
+    if (copy) ans = copy(ans) # copy=FALSE returns the shallow result as is
+    return(ans)
+  }
+  f = attr(o, "starts", exact=TRUE) # position of the first row of each group, in sorted order
+  if (mult == "last") {
+    if (!sort) internal_error("sort must be TRUE when computing mult='last'") # nocov
+    f = c(f[-1L] - 1L, nrow(x)) ## last of each group
+  }
+  if (length(o)) f = o[f] # non-empty o: translate sorted positions into row numbers of x (empty o presumably means x was already ordered -- confirm)
+  if (sort && length(o <- forderv(f))) f = f[o] ## this rolls back to original order
+  .Call(CsubsetDT, x, f, someCols(x, cols, keep=on)) # presumably subsets x to rows f and the selected columns -- see CsubsetDT
+}
+
+# extra layer over bmerge to provide ready to use row indices (or NULL for 1:nrow)
+# NULL to avoid extra copies in downstream code, it turned out that avoiding copies precisely is costly and enormously complicates code, need #4409 and/or handle 1:nrow in subsetDT
+dtmerge = function(x, i, on, how, mult, join.many, void=FALSE, verbose) { # returns list(ans, irows, xrows); list(ans, irows) for semi/anti; invisible NULL when void=TRUE
+  nomatch = switch(how,
+                   inner=, semi=, anti=, cross= 0L, # nomatch=0L for these join types
+                   left=, right=, full=NA_integer_) # nomatch=NA for these join types
+  nomatch0 = identical(nomatch, 0L)
+  if (is.null(mult)) # mult not supplied: pick the default for this join type
+    mult = switch(how,
+                  semi=, anti="last",
+                  cross="all",
+                  inner=, left=, right=, full="error")
+  if (void && mult != "error")
+    internal_error("'void' must be used with mult='error'") # nocov
+  if (how == "cross") { ## short-circuit bmerge results only for cross join
+    if (length(on) || mult != "all" || !join.many)
+      stopf("cross join must be used with zero-length on, mult='all', join.many=TRUE")
+    if (void)
+      internal_error("cross join must be used with void=FALSE") # nocov
+    ans = list(allLen1=FALSE, starts=rep.int(1L, nrow(i)), lens=rep.int(nrow(x), nrow(i)), xo=integer()) # hand-built result: every row of i matches nrow(x) rows of x starting at row 1
+  } else {
+    if (!length(on))
+      stopf("'on' must be non-zero length character vector")
+    if (mult == "all" && (how == "semi" || how == "anti"))
+      stopf("semi and anti joins must be used with mult!='all'")
+    icols = colnamesInt(i, on, check_dups=TRUE)
+    xcols = colnamesInt(x, on, check_dups=TRUE)
+    ans = bmerge(i, x, icols, xcols, roll=0, rollends=c(FALSE, TRUE), nomatch=nomatch, mult=mult, ops=rep.int(1L, length(on)), verbose=verbose)
+    if (void) { ## void=T is only for the case when we want raise error for mult='error', and that would happen in above line
+      return(invisible(NULL))
+    } else if (how == "semi" || how == "anti") { ## semi and anti short-circuit
+      ## we will subset i rather than x, thus assign to irows, not to xrows
+      if (how == "semi")
+        irows = which(ans$lens != 0L) # rows of i with a non-zero match length
+      else
+        irows = which(ans$lens == 0L) # rows of i with a zero match length
+      if (length(irows) == length(ans$lens)) irows = NULL # every row of i selected: NULL stands for all rows
+      return(list(ans=ans, irows=irows))
+    } else if (mult == "all" && !ans$allLen1 && !join.many && ## join.many, like allow.cartesian, check
+      !(length(ans$starts) == 1L && ans$lens == nrow(x)) && ## special case of scalar i match to const duplicated x, not handled by anyDuplicate: data.table(x=c(1L,1L))[data.table(x=1L), on="x"]
+      anyDuplicated(ans$starts, incomparables=c(0L, NA_integer_))
+    )
+      stopf("Joining resulted in many-to-many join. Perform quality check on your data, use mult!='all', or set 'datatable.join.many' option to TRUE to allow rows explosion.")
+  }
+
+  ## xrows, join-to
+  xrows = if (ans$allLen1) ans$starts else vecseq(ans$starts, ans$lens, NULL)
+  if (nomatch0 && ans$allLen1) xrows = xrows[as.logical(ans$lens)] # nomatch=0L: drop entries whose match length is zero
+  len.x = length(xrows) ## as of now cannot optimize to NULL, search for #4409 here
+
+  ## irows, join-from
+  irows = if (!(ans$allLen1 && (!nomatch0 || len.x == length(ans$starts)))) seqexp(ans$lens) # 'if' without 'else': irows stays NULL when allLen1 holds and no row of i was dropped
+  len.i = if (is.null(irows)) nrow(i) else length(irows)
+
+  if (length(ans$xo) && length(xrows))
+    xrows = ans$xo[xrows] # presumably xo is the ordering of x used by bmerge, mapping back to original row numbers -- confirm
+  len.x = length(xrows) # recomputed for the consistency check below
+
+  if (len.i != len.x)
+    internal_error("dtmerge out len.i != len.x") # nocov
+
+  list(ans=ans, irows=irows, xrows=xrows)
+}
+
+# Previously, we had a custom C implementation here, which is ~2x faster,
+#   but this is fast enough we don't bother maintaining a new routine.
+#   Hopefully in the future rep() can recognize the ALTREP and use that, too.
+seqexp = function(x) rep(seq_along(x), x) # index k repeated x[k] times, e.g. seqexp(c(2L,0L,1L)) is c(1L,1L,3L)
+perhaps.data.table = function(x) .Call(CperhapsDataTableR, x) # thin wrapper over the C routine; fdistinct uses it as its "'x' must be data.table" check
diff --git a/inst/tests/mergelist.Rraw b/inst/tests/mergelist.Rraw
@@ -6,8 +6,52 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
 } else {
   require(data.table)
   test = data.table:::test
+  perhaps.data.table = data.table:::perhaps.data.table
+  hasindex = data.table:::hasindex
+  fdistinct = data.table:::fdistinct
+  forderv = data.table:::forderv
 }
 
+# internal helpers
+
+test(1.01, perhaps.data.table(list())) # no expected value given: test() presumably checks for TRUE -- confirm against the test helper
+test(1.02, perhaps.data.table(list(a=1:2)))
+test(1.03, perhaps.data.table(list(a=1:2, b=1:2)))
+test(1.04, perhaps.data.table(list(1:2, 1:2)), FALSE) # list without names is expected to be rejected
+
+test(2.01, fdistinct(list(x=c(1L, 1:2), b=1:2), on="x", mult="last"), error="must be data.table") # plain list whose elements have lengths 3 and 2
+test(2.02, fdistinct(data.table(x=c(1L, 1:2)), on="z", mult="last"), error="must be character column names of") # 'z' is not a column of x
+test(2.03, fdistinct(data.table(x=c(1L, 1:2)), on="x", mult="last", cols=character()), error="must be non-zero length, non-NA, integer or character columns of")
+test(2.04, fdistinct(data.table(x=c(1L, 1:2, y=1:3)), on="x", mult="last", copy=NA), error="must be TRUE or FALSE") # NOTE(review): y=1:3 sits inside c(), so x has 6 elements and there is no y column; looks like a misplaced parenthesis, though the copy=NA error is raised either way
+local({
+  addresses = function(x) vapply(x, address, "") # one address string per column
+
+  d = data.table(x=1:2, y=1:2) # already unique by x, so fdistinct takes its early-return branch
+  test(2.05, ans <- fdistinct(d, on="x", mult="last"), d) # default copy=TRUE
+  test(2.06, intersect(addresses(ans), addresses(d)), character()) # copied: no column address in common with d
+  test(2.07, ans <- fdistinct(d, on="x", mult="last", copy=FALSE), d)
+  test(2.08, addresses(ans), addresses(d)) # not copied: column addresses are identical to those of d
+})
+local({
+  d = data.table(x=c(2:1, 2L), y=1:3) # x=2 occurs in rows 1 and 3
+  test(2.09, fdistinct(d, on="x", mult="first"), data.table(x=2:1, y=1:2)) # first occurrences are rows 1 and 2
+  test(2.10, fdistinct(d, on="x", mult="last"), data.table(x=1:2, y=2:3)) # last occurrences are rows 2 and 3
+  setattr(attr(setattr(d, "index", integer()), "index", TRUE), "__x", forderv(d, "x", retGrp=TRUE)) ## retGrp=T index #4386
+  test(2.11, fdistinct(d, on="x", mult="first"), data.table(x=2:1, y=1:2)) # same expectation as 2.09, now with an index attached
+
+  test(3.01, hasindex(d, "x"))
+  test(3.02, hasindex(d, "x", retGrp=TRUE))
+  setattr(attr(setattr(d, "index", integer()), "index", TRUE), "__x", forderv(d, "x"))              ## retGrp=F index #4386
+  test(3.03, hasindex(d, "x"))
+  test(3.04, !hasindex(d, "x", retGrp=TRUE))
+  setattr(d, "index", NULL) # case: no index attribute at all
+  test(3.05, !hasindex(d, "x"))
+  test(3.06, !hasindex(d, "x", retGrp=TRUE))
+  setattr(d, "index", integer()) # case: index attribute present but without a '__x' entry
+  test(3.07, !hasindex(d, "x"))
+  test(3.08, !hasindex(d, "x", retGrp=TRUE))
+})
+
 # cbindlist, setcbindlist
 
 local({
@@ -69,3 +113,38 @@ local({
 test(13.4, cbindlist(list(data.table(a=1:2), data.table(b=3:4, key="b"))), data.table(a=1:2, b=3:4, key="b"))
 # TODO(#7116): this could be supported
 # test(13.5, cbindlist(list(data.table(a=1:2, key="a"), data.table(b=3:4, key="b"))), data.table(a=1:2, b=3:4, key=c("a", "b")))
+
+## fdistinct, another round
+
+local({
+  dt = data.table(x = c(
+    74L, 103L, 158L, 250L, 56L, 248L, 260L, 182L, 174L, 17L, 57L,
+    49L, 189L, 106L, 212L, 137L, 198L, 273L, 105L, 214L, 258L, 59L,
+    180L, 35L, 74L, 107L, 4L, 106L, 240L, 94L, 133L, 165L, 136L,
+    52L, 228L, 184L, 219L, 30L, 200L, 114L, 226L, 178L, 216L, 153L,
+    146L, 218L, 7L, 132L, 202L, 191L, 132L, 237L, 121L, 68L, 20L,
+    28L, 87L, 143L, 183L, 112L, 252L, 81L, 127L, 92L, 179L, 71L,
+    132L, 211L, 24L, 241L, 94L, 231L, 96L, 92L, 131L, 246L, 238L,
+    108L, 214L, 265L, 120L, 196L, 110L, 90L, 209L, 56L, 196L, 34L,
+    68L, 40L, 66L, 17L, 177L, 241L, 215L, 220L, 126L, 113L, 223L,
+    167L, 181L, 98L, 75L, 273L, 175L, 59L, 36L, 132L, 255L, 165L,
+    269L, 202L, 99L, 119L, 41L, 4L, 197L, 29L, 123L, 177L, 273L,
+    137L, 134L, 48L, 208L, 125L, 141L, 58L, 63L, 164L, 159L, 22L,
+    10L, 177L, 256L, 165L, 155L, 145L, 271L, 140L, 188L, 166L, 66L,
+    71L, 201L, 125L, 49L, 206L, 29L, 238L, 170L, 154L, 91L, 125L,
+    138L, 50L, 146L, 21L, 77L, 59L, 79L, 247L, 123L, 215L, 243L,
+    114L, 18L, 93L, 200L, 93L, 174L, 232L, 236L, 108L, 105L, 247L,
+    178L, 204L, 167L, 249L, 81L, 53L, 244L, 139L, 242L, 53L, 209L,
+    200L, 260L, 151L, 196L, 107L, 28L, 256L, 78L, 163L, 31L, 232L,
+    88L, 216L, 74L, 61L, 143L, 74L, 50L, 143L, 155L, 36L, 71L, 198L,
+    265L, 28L, 210L, 261L, 226L, 85L, 179L, 263L, 263L, 94L, 73L,
+    46L, 89L, 141L, 255L, 141L, 71L, 13L, 115L, 235L, 96L, 37L, 103L,
+    174L, 108L, 190L, 190L, 153L, 119L, 125L, 85L, 160L, 251L, 40L,
+    115L, 59L, 118L, 37L, 127L, 260L, 210L, 257L, 130L, 166L, 134L,
+    30L, 69L, 138L, 103L, 258L, 145L, 88L, 77L, 217L, 194L, 46L,
+    18L, 208L, 171L, 47L, 18L, 30L, 105L, 47L, 83L
+  ))
+  ans = unique(dt, by="x") # reference result that fdistinct with its default mult must reproduce
+  test(301.01, data.table(x=unique(dt$x)), ans) ## OK
+  test(301.02, fdistinct(dt, on="x"), ans)      ## force sort=TRUE for the moment
+})
diff --git a/man/data.table-options.Rd b/man/data.table-options.Rd
@@ -72,6 +72,7 @@
   \describe{
     \item{\code{datatable.allow.cartesian}}{A logical, default \code{FALSE}. Controls the default value of the 
     \code{allow.cartesian} parameter; see \code{\link{data.table}}. If the value of this parameter is FALSE, an error is raised as a safeguard against an explosive Cartesian join.}
+    \item{\code{datatable.join.many}}{A logical. Stub description to be embellished later in PR #4370. }
   }
 }
 
diff --git a/src/init.c b/src/init.c
@@ -150,6 +150,7 @@ R_CallMethodDef callMethods[] = {
 {"CconvertDate", (DL_FUNC)&convertDate, -1},
 {"Cnotchin", (DL_FUNC)&notchin, -1},
 {"Ccbindlist", (DL_FUNC) &cbindlist, -1},
+{"CperhapsDataTableR", (DL_FUNC) &perhapsDataTableR, -1},
 {"Cwarn_matrix_column_r", (DL_FUNC)&warn_matrix_column_r, -1},
 {NULL, NULL, 0}
 };

Original file line number	Diff line number	Diff line change
`@@ -72,6 +72,7 @@`
`72`	`72`	`\describe{`
`73`	`73`	`\item{\code{datatable.allow.cartesian}}{A logical, default \code{FALSE}. Controls the default value of the`
`74`	`74`	`\code{allow.cartesian} parameter; see \code{\link{data.table}}. If the value of this parameter is FALSE, an error is raised as a safeguard against an explosive Cartesian join.}`
	`75`	`+ \item{\code{datatable.join.many}}{A logical. Stub description to be embellished later in PR #4370. }`
`75`	`76`	`}`
`76`	`77`	`}`
`77`	`78`