Skip to content

Commit 1043582

Browse files
authored
Merge branch 'master' into issue6964
2 parents 807b060 + cf7fb11 commit 1043582

File tree

5 files changed

+238
-6
lines changed

5 files changed

+238
-6
lines changed

R/data.table.R

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -542,12 +542,25 @@ replace_dot_alias = function(e) {
542542
# Really, `anyDuplicated` in base is AWESOME!
543543
# allow.cartesian shouldn't error if a) not-join, b) 'i' has no duplicates
544544
if (verbose) {last.started.at=proc.time();catf("Constructing irows for '!byjoin || nqbyjoin' ... ");flush.console()}
545-
irows = if (allLen1) f__ else vecseq(f__,len__,
546-
if (allow.cartesian ||
547-
notjoin || # #698. When notjoin=TRUE, ignore allow.cartesian. Rows in answer will never be > nrow(x).
548-
!anyDuplicated(f__, incomparables = c(0L, NA_integer_))) {
549-
NULL # #742. If 'i' has no duplicates, ignore
550-
} else as.double(nrow(x)+nrow(i))) # rows in i might not match to x so old max(nrow(x),nrow(i)) wasn't enough. But this limit now only applies when there are duplicates present so the reason now for nrow(x)+nrow(i) is just to nail it down and be bigger than max(nrow(x),nrow(i)).
545+
if (allLen1) {
546+
irows = f__
547+
} else {
548+
join.many = isTRUE(getOption("datatable.join.many", TRUE)) # #914, default TRUE for backward compatibility
549+
anyDups = !notjoin &&
550+
(
551+
# #698. When notjoin=TRUE, ignore allow.cartesian. Rows in answer will never be > nrow(x).
552+
(join.many && !allow.cartesian) ||
553+
# special case of scalar i match to const duplicated x, not handled by anyDuplicated: data.table(x=c(1L,1L))[data.table(x=1L), on="x"]
554+
(!join.many && (length(f__) != 1L || len__ != nrow(x)))
555+
) &&
556+
anyDuplicated(f__, incomparables = c(0L, NA_integer_)) > 0L
557+
limit = if (anyDups) { # #742. If 'i' has no duplicates, ignore
558+
if (!join.many) stopf("Joining resulted in many-to-many join. Perform quality check on your data, use mult!='all', or set 'datatable.join.many' option to TRUE to allow rows explosion.")
559+
if (allow.cartesian) internal_error("checking allow.cartesian and join.many, unexpected else branch reached") # nocov
560+
as.double(nrow(x)+nrow(i)) # rows in i might not match to x so old max(nrow(x),nrow(i)) wasn't enough. But this limit now only applies when there are duplicates present so the reason now for nrow(x)+nrow(i) is just to nail it down and be bigger than max(nrow(x),nrow(i)).
561+
}
562+
irows = vecseq(f__, len__, limit)
563+
}
551564
if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()}
552565
# Fix for #1092 and #1074
553566
# TODO: implement better version of "any"/"all"/"which" to avoid

R/mergelist.R

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,141 @@ cbindlist_impl_ = function(l, copy) {
1010

1111
# Column-bind the list 'l' by delegating to cbindlist_impl_ with copy=TRUE.
cbindlist = function(l) {
  cbindlist_impl_(l, copy=TRUE)
}
1212
# Column-bind the list 'l' by delegating to cbindlist_impl_ with copy=FALSE.
setcbindlist = function(l) {
  cbindlist_impl_(l, copy=FALSE)
}
13+
14+
# when 'on' is missing then use keys, used only for inner and full join
15+
# Choose join columns from two keys when 'on' has not been supplied.
#   x, y: key column names of each table, or NULL when that table has no key.
# Returns:
#   - the only non-NULL key when exactly one of them is set;
#   - when both are set, the names common to both, ordered as in the shorter
#     key (on equal length the order of 'y' is used);
#   - NULL when neither is set.
onkeys = function(x, y) {
  has.x = !is.null(x)
  has.y = !is.null(y)
  if (!has.x && !has.y) return(NULL) # nocov. Internal error is being called later in mergepair
  if (!has.x) return(y)
  if (!has.y) return(x)
  ## align order to shorter|rhs key
  if (length(x) >= length(y)) intersect(y, x) else intersect(x, y)
}
26+
27+
# column index selection helper
28+
# Column selection helper: every 'keep' column first, followed by the 'cols'
# columns that are not listed in 'drop'.
#   x            : table whose columns are being selected
#   cols         : candidate columns
#   drop         : columns removed from 'cols' (does not affect 'keep')
#   keep         : columns that are always included, placed first
#   retain.order : when TRUE the result is sorted ascending instead of keeping
#                  the keep-then-cols order
# Each of keep/drop/cols is resolved through colnamesInt(x, .), in that order.
# NOTE(review): colnamesInt presumably returns integer column positions, in
# which case sorting restores the column order of 'x' -- confirm.
someCols = function(x, cols, drop=character(), keep=character(), retain.order=FALSE) {
  keep.idx = colnamesInt(x, keep)
  drop.idx = colnamesInt(x, drop)
  cols.idx = colnamesInt(x, cols)
  selected = union(keep.idx, setdiff(cols.idx, drop.idx))
  if (retain.order) sort(selected) else selected
}
36+
37+
# Does 'x' carry an index on the columns 'by'?
#   x      : object whose "index" attribute is inspected
#   by     : character column names; looked up as the attribute named
#            "__" + by[1] + "__" + by[2] + ... on the "index" attribute
#   retGrp : when TRUE, additionally require that the stored index itself has
#            a "starts" attribute
# Returns a single TRUE/FALSE. All attribute lookups use exact name matching.
hasindex = function(x, by, retGrp=FALSE) {
  all.idx = attr(x, "index", TRUE)
  if (is.null(all.idx)) return(FALSE)
  this.idx = attr(all.idx, paste0("__", by, collapse=""), TRUE)
  if (is.null(this.idx)) {
    FALSE
  } else if (!retGrp) {
    TRUE
  } else {
    !is.null(attr(this.idx, "starts", TRUE))
  }
}
46+
47+
# fdistinct applies mult='first|last'
48+
# for mult='first' it is unique(x, by=on)[, c(on, cols), with=FALSE]
49+
# it may not copy when copy=FALSE and x is unique by 'on'
50+
# Keep one row per distinct combination of the 'on' columns.
#   x    : a data.table (checked with perhaps.data.table)
#   on   : non-empty, non-NA character vector of column names of 'x'
#   mult : "first" or "last" -- which row of each group to keep
#   cols : columns to return (integer or character, non-empty, non-NA);
#          NULL means all columns. The 'on' columns are always kept.
#   copy : TRUE/FALSE; only matters when 'x' has no duplicates in 'on', in
#          which case a shallow copy is returned as-is (FALSE) or deep-copied
#          (TRUE).
# Returns the subset of 'x'. Selected rows are put back into their original
# relative order before subsetting.
fdistinct = function(x, on=key(x), mult=c("first", "last"), cols=seq_along(x), copy=TRUE) {
  if (!perhaps.data.table(x))
    stopf("'x' must be data.table")
  on.ok = is.character(on) && length(on) > 0L && !anyNA(on) && all(on %chin% names(x))
  if (!on.ok)
    stopf("'on' must be character column names of 'x' argument")
  mult = match.arg(mult)
  if (is.null(cols)) {
    cols = seq_along(x)
  } else {
    cols.ok = (is.character(cols) || is.integer(cols)) && length(cols) > 0L && !anyNA(cols)
    if (!cols.ok)
      stopf("'cols' must be non-zero length, non-NA, integer or character columns of 'x' argument")
  }
  if (!isTRUEorFALSE(copy))
    stopf("'%s' must be TRUE or FALSE", "copy")
  ## do not compute sort=F for mult="first" if index (sort=T) already available, sort=T is needed only for mult="last"
  ## this short circuit will work after #4386 because it requires retGrp=T
  #### do.sort = mult!="first" || hasindex(x, by=on, retGrp=TRUE)
  do.sort = TRUE ## above line does not work for the moment, test 302.02
  ord = forderv(x, by=on, sort=do.sort, retGrp=TRUE)
  if (attr(ord, "maxgrpn", TRUE) <= 1L) {
    ## largest group has at most one row: nothing to drop, only select columns
    ans = .shallow(x, someCols(x, cols, keep=on), retain.key=TRUE)
    if (copy) ans = copy(ans)
    return(ans)
  }
  rows = attr(ord, "starts", exact=TRUE) ## first position of each group
  if (mult == "last") {
    if (!do.sort) internal_error("sort must be TRUE when computing mult='last'") # nocov
    rows = c(rows[-1L] - 1L, nrow(x)) ## last of each group
  }
  ## NOTE(review): a zero-length ordering is presumably forderv's way of saying
  ## "already in order", so positions are then row numbers already -- confirm.
  if (length(ord)) rows = ord[rows]
  if (do.sort) {
    back = forderv(rows)
    if (length(back)) rows = rows[back] ## this rolls back to original order
  }
  .Call(CsubsetDT, x, rows, someCols(x, cols, keep=on))
}
81+
82+
# extra layer over bmerge to provide ready to use row indices (or NULL for 1:nrow)
83+
# NULL to avoid extra copies in downstream code, it turned out that avoiding copies precisely is costly and enormously complicates code, need #4409 and/or handle 1:nrow in subsetDT
84+
# Layer over bmerge that turns a join specification into ready-to-use row
# indices.
#   x, i      : join-to and join-from tables
#   on        : character column names to join on (must be empty for cross join)
#   how       : "inner", "left", "right", "full", "semi", "anti" or "cross"
#   mult      : NULL to pick a default from 'how' ("last" for semi/anti, "all"
#               for cross, "error" otherwise), or an explicit value
#   join.many : when FALSE, a mult="all" join with duplicated matches raises
#               an error
#   void      : when TRUE (requires mult="error"), bmerge is run only for the
#               error it may raise and invisible NULL is returned
#   verbose   : forwarded to bmerge
# Returns:
#   - semi/anti: list(ans, irows), irows being the rows of 'i' to keep, or
#     NULL when every row qualifies;
#   - otherwise: list(ans, irows, xrows); irows is NULL when rows of 'i' are
#     used one-to-one in order.
# 'ans' is the bmerge result (or an equivalent hand-built list for cross join).
dtmerge = function(x, i, on, how, mult, join.many, void=FALSE, verbose) {
  nomatch = switch(how,
    inner=, semi=, anti=, cross= 0L,
    left=, right=, full=NA_integer_)
  nomatch0 = identical(nomatch, 0L)
  if (is.null(mult)) {
    mult = switch(how,
      semi=, anti="last",
      cross="all",
      inner=, left=, right=, full="error")
  }
  if (void && mult != "error")
    internal_error("'void' must be used with mult='error'") # nocov

  if (how != "cross") {
    if (!length(on))
      stopf("'on' must be non-zero length character vector")
    filtering = how == "semi" || how == "anti"
    if (mult == "all" && filtering)
      stopf("semi and anti joins must be used with mult!='all'")
    icols = colnamesInt(i, on, check_dups=TRUE)
    xcols = colnamesInt(x, on, check_dups=TRUE)
    bm = bmerge(i, x, icols, xcols, roll=0, rollends=c(FALSE, TRUE), nomatch=nomatch, mult=mult, ops=rep.int(1L, length(on)), verbose=verbose)
    ## void=T is only for the case when we want raise error for mult='error', and that would happen in above line
    if (void)
      return(invisible(NULL))
    if (filtering) { ## semi and anti short-circuit
      ## we will subset i rather than x, thus only irows is produced, no xrows
      hit = bm$lens != 0L
      irows = which(if (how == "semi") hit else !hit)
      if (length(irows) == length(bm$lens)) irows = NULL
      return(list(ans=bm, irows=irows))
    }
    ## join.many, like allow.cartesian, check
    if (mult == "all" && !bm$allLen1 && !join.many &&
        ## special case of scalar i match to const duplicated x, not handled by anyDuplicated: data.table(x=c(1L,1L))[data.table(x=1L), on="x"]
        !(length(bm$starts) == 1L && bm$lens == nrow(x)) &&
        anyDuplicated(bm$starts, incomparables=c(0L, NA_integer_)))
      stopf("Joining resulted in many-to-many join. Perform quality check on your data, use mult!='all', or set 'datatable.join.many' option to TRUE to allow rows explosion.")
  } else { ## short-circuit bmerge results only for cross join
    if (length(on) || mult != "all" || !join.many)
      stopf("cross join must be used with zero-length on, mult='all', join.many=TRUE")
    if (void)
      internal_error("cross join must be used with void=FALSE") # nocov
    n.i = nrow(i)
    ## every row of i matches all rows of x
    bm = list(allLen1=FALSE, starts=rep.int(1L, n.i), lens=rep.int(nrow(x), n.i), xo=integer())
  }

  ## xrows, join-to
  if (bm$allLen1) {
    xrows = bm$starts
    if (nomatch0) xrows = xrows[as.logical(bm$lens)] ## drop non-matching rows
  } else {
    xrows = vecseq(bm$starts, bm$lens, NULL)
  }
  len.x = length(xrows) ## as of now cannot optimize to NULL, search for #4409 here

  ## irows, join-from; left NULL when i rows map one-to-one in order
  need.irows = !bm$allLen1 || (nomatch0 && len.x != length(bm$starts))
  irows = if (need.irows) seqexp(bm$lens) else NULL
  len.i = if (is.null(irows)) nrow(i) else length(irows)

  ## translate through the 'xo' component of the bmerge result, when it is non-empty
  if (length(bm$xo) && length(xrows))
    xrows = bm$xo[xrows]
  len.x = length(xrows)

  if (len.i != len.x)
    internal_error("dtmerge out len.i != len.x") # nocov

  list(ans=bm, irows=irows, xrows=xrows)
}
145+
146+
# Previously, we had a custom C implementation here, which is ~2x faster,
147+
# but this is fast enough we don't bother maintaining a new routine.
148+
# Hopefully in the future rep() can recognize the ALTREP and use that, too.
149+
# Expand counts into positions: position k is repeated x[k] times,
# e.g. c(2, 0, 3) gives 1 1 3 3 3.
seqexp = function(x) {
  rep(seq_along(x), times=x)
}
150+
# R-level entry point for the C routine CperhapsDataTableR, called on 'x'.
# NOTE(review): used as a logical "looks like a data.table" predicate by
# callers in this file -- the exact criteria live in the C code.
perhaps.data.table = function(x) {
  .Call(CperhapsDataTableR, x)
}

inst/tests/mergelist.Rraw

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,52 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
66
} else {
77
require(data.table)
88
test = data.table:::test
9+
perhaps.data.table = data.table:::perhaps.data.table
10+
hasindex = data.table:::hasindex
11+
fdistinct = data.table:::fdistinct
12+
forderv = data.table:::forderv
913
}
1014

15+
# internal helpers
16+
17+
test(1.01, perhaps.data.table(list()))
18+
test(1.02, perhaps.data.table(list(a=1:2)))
19+
test(1.03, perhaps.data.table(list(a=1:2, b=1:2)))
20+
test(1.04, perhaps.data.table(list(1:2, 1:2)), FALSE)
21+
22+
test(2.01, fdistinct(list(x=c(1L, 1:2), b=1:2), on="x", mult="last"), error="must be data.table")
23+
test(2.02, fdistinct(data.table(x=c(1L, 1:2)), on="z", mult="last"), error="must be character column names of")
24+
test(2.03, fdistinct(data.table(x=c(1L, 1:2)), on="x", mult="last", cols=character()), error="must be non-zero length, non-NA, integer or character columns of")
25+
# copy must be a single TRUE/FALSE; NA is rejected. Fixture is a two-column table (x, y).
test(2.04, fdistinct(data.table(x=c(1L, 1:2), y=1:3), on="x", mult="last", copy=NA), error="must be TRUE or FALSE")
26+
local({
27+
addresses = function(x) vapply(x, address, "")
28+
29+
d = data.table(x=1:2, y=1:2)
30+
test(2.05, ans <- fdistinct(d, on="x", mult="last"), d)
31+
test(2.06, intersect(addresses(ans), addresses(d)), character())
32+
test(2.07, ans <- fdistinct(d, on="x", mult="last", copy=FALSE), d)
33+
test(2.08, addresses(ans), addresses(d))
34+
})
35+
local({
36+
d = data.table(x=c(2:1, 2L), y=1:3)
37+
test(2.09, fdistinct(d, on="x", mult="first"), data.table(x=2:1, y=1:2))
38+
test(2.10, fdistinct(d, on="x", mult="last"), data.table(x=1:2, y=2:3))
39+
setattr(attr(setattr(d, "index", integer()), "index", TRUE), "__x", forderv(d, "x", retGrp=TRUE)) ## retGrp=T index #4386
40+
test(2.11, fdistinct(d, on="x", mult="first"), data.table(x=2:1, y=1:2))
41+
42+
test(3.01, hasindex(d, "x"))
43+
test(3.02, hasindex(d, "x", retGrp=TRUE))
44+
setattr(attr(setattr(d, "index", integer()), "index", TRUE), "__x", forderv(d, "x")) ## retGrp=F index #4386
45+
test(3.03, hasindex(d, "x"))
46+
test(3.04, !hasindex(d, "x", retGrp=TRUE))
47+
setattr(d, "index", NULL)
48+
test(3.05, !hasindex(d, "x"))
49+
test(3.06, !hasindex(d, "x", retGrp=TRUE))
50+
setattr(d, "index", integer())
51+
test(3.07, !hasindex(d, "x"))
52+
test(3.08, !hasindex(d, "x", retGrp=TRUE))
53+
})
54+
1155
# cbindlist, setcbindlist
1256

1357
local({
@@ -69,3 +113,38 @@ local({
69113
test(13.4, cbindlist(list(data.table(a=1:2), data.table(b=3:4, key="b"))), data.table(a=1:2, b=3:4, key="b"))
70114
# TODO(#7116): this could be supported
71115
# test(13.5, cbindlist(list(data.table(a=1:2, key="a"), data.table(b=3:4, key="b"))), data.table(a=1:2, b=3:4, key=c("a", "b")))
116+
117+
## fdistinct, another round
118+
119+
local({
120+
dt = data.table(x = c(
121+
74L, 103L, 158L, 250L, 56L, 248L, 260L, 182L, 174L, 17L, 57L,
122+
49L, 189L, 106L, 212L, 137L, 198L, 273L, 105L, 214L, 258L, 59L,
123+
180L, 35L, 74L, 107L, 4L, 106L, 240L, 94L, 133L, 165L, 136L,
124+
52L, 228L, 184L, 219L, 30L, 200L, 114L, 226L, 178L, 216L, 153L,
125+
146L, 218L, 7L, 132L, 202L, 191L, 132L, 237L, 121L, 68L, 20L,
126+
28L, 87L, 143L, 183L, 112L, 252L, 81L, 127L, 92L, 179L, 71L,
127+
132L, 211L, 24L, 241L, 94L, 231L, 96L, 92L, 131L, 246L, 238L,
128+
108L, 214L, 265L, 120L, 196L, 110L, 90L, 209L, 56L, 196L, 34L,
129+
68L, 40L, 66L, 17L, 177L, 241L, 215L, 220L, 126L, 113L, 223L,
130+
167L, 181L, 98L, 75L, 273L, 175L, 59L, 36L, 132L, 255L, 165L,
131+
269L, 202L, 99L, 119L, 41L, 4L, 197L, 29L, 123L, 177L, 273L,
132+
137L, 134L, 48L, 208L, 125L, 141L, 58L, 63L, 164L, 159L, 22L,
133+
10L, 177L, 256L, 165L, 155L, 145L, 271L, 140L, 188L, 166L, 66L,
134+
71L, 201L, 125L, 49L, 206L, 29L, 238L, 170L, 154L, 91L, 125L,
135+
138L, 50L, 146L, 21L, 77L, 59L, 79L, 247L, 123L, 215L, 243L,
136+
114L, 18L, 93L, 200L, 93L, 174L, 232L, 236L, 108L, 105L, 247L,
137+
178L, 204L, 167L, 249L, 81L, 53L, 244L, 139L, 242L, 53L, 209L,
138+
200L, 260L, 151L, 196L, 107L, 28L, 256L, 78L, 163L, 31L, 232L,
139+
88L, 216L, 74L, 61L, 143L, 74L, 50L, 143L, 155L, 36L, 71L, 198L,
140+
265L, 28L, 210L, 261L, 226L, 85L, 179L, 263L, 263L, 94L, 73L,
141+
46L, 89L, 141L, 255L, 141L, 71L, 13L, 115L, 235L, 96L, 37L, 103L,
142+
174L, 108L, 190L, 190L, 153L, 119L, 125L, 85L, 160L, 251L, 40L,
143+
115L, 59L, 118L, 37L, 127L, 260L, 210L, 257L, 130L, 166L, 134L,
144+
30L, 69L, 138L, 103L, 258L, 145L, 88L, 77L, 217L, 194L, 46L,
145+
18L, 208L, 171L, 47L, 18L, 30L, 105L, 47L, 83L
146+
))
147+
ans = unique(dt, by="x")
148+
test(301.01, data.table(x=unique(dt$x)), ans) ## OK
149+
test(301.02, fdistinct(dt, on="x"), ans) ## force sort=TRUE for the moment
150+
})

man/data.table-options.Rd

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@
7272
\describe{
7373
\item{\code{datatable.allow.cartesian}}{A logical, default \code{FALSE}. Controls the default value of the
7474
\code{allow.cartesian} parameter; see \code{\link{data.table}}. If the value of this parameter is FALSE, an error is raised as a safeguard against an explosive Cartesian join.}
75+
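\item{\code{datatable.join.many}}{A logical, default \code{TRUE}. When \code{FALSE}, a join that results in a many-to-many match raises an error instead of expanding rows. Stub description to be embellished later in PR #4370. }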
7576
}
7677
}
7778

src/init.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,7 @@ R_CallMethodDef callMethods[] = {
150150
{"CconvertDate", (DL_FUNC)&convertDate, -1},
151151
{"Cnotchin", (DL_FUNC)&notchin, -1},
152152
{"Ccbindlist", (DL_FUNC) &cbindlist, -1},
153+
{"CperhapsDataTableR", (DL_FUNC) &perhapsDataTableR, -1},
153154
{"Cwarn_matrix_column_r", (DL_FUNC)&warn_matrix_column_r, -1},
154155
{NULL, NULL, 0}
155156
};

0 commit comments

Comments
 (0)