Skip to content

Commit 56d696d

Browse files
jhdf
1 parent 5d0b166 commit 56d696d

File tree

2 files changed

+78
-43
lines changed

2 files changed

+78
-43
lines changed

R/bmerge.R

Lines changed: 51 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,17 @@
1+
2+
13
mergeType = function(x) {
24
ans = typeof(x)
35
if (ans=="integer") { if (is.factor(x)) ans = "factor" }
46
else if (ans=="double") { if (inherits(x, "integer64")) ans = "integer64" }
7+
# do not call fitsInInt*(x) yet because i) if both types are double we don't need to coerce even if one or both sides
8+
# are int-as-double, and ii) to save calling it until we really need it
59
ans
610
}
711

812
cast_with_attrs = function(x, cast_fun) {
913
ans = cast_fun(x)
14+
# do not copy attributes when coercing factor (to character)
1015
if (!is.factor(x) && !is.null(attributes(x))) attributes(ans) = attributes(x)
1116
ans
1217
}
@@ -24,6 +29,18 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
2429
{
2530
callersi = i
2631
i = shallow(i)
32+
# Just before the call to bmerge() in [.data.table there is a shallow() copy of i to prevent coercions here
33+
# by bmerge changing the type of the user's input object by reference. We now shallow copy i again. If we then
34+
# coerce a column in i only, we are just changing the temporary coercion used for the merge operation. If we
35+
# set callersi too then we are keeping that coerced i column in the merge result returned to user.
36+
# The type of the i column is always returned (i.e. just i set not callersi too), other than:
37+
# i) to convert int-as-double to int, useful for ad hoc joins when the L postfix is often forgotten.
38+
# ii) to coerce i.factor to character when joining to x.character
39+
# So those are the only two uses of callersi below.
40+
# Careful to only use plonk syntax (full column) on i and x from now on, otherwise user's i and x would
41+
# change. This is why shallow() is very importantly internal only, currently.
42+
43+
# Using .SD in j to join could fail due to being locked and set() being used here, #1926
2744
.Call(C_unlock, i)
2845
x = shallow(x)
2946
.Call(C_unlock, x)
@@ -35,6 +52,10 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
3552
supported = c(ORDERING_TYPES, "factor", "integer64")
3653

3754
if (nrow(i)) for (a in seq_along(icols)) {
55+
# - check that join columns have compatible types
56+
# - do type coercions if necessary on just the shallow local copies for the purpose of join
57+
# - handle factor columns appropriately
58+
# Note that if i is keyed, if this coerces i's key gets dropped by set()
3859
icol = icols[a]
3960
xcol = xcols[a]
4061
x_merge_type = mergeType(x[[xcol]])
@@ -43,48 +64,44 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
4364
iname = paste0("i.", names(i)[icol])
4465
if (!x_merge_type %chin% supported) stopf("%s is type %s which is not supported by data.table join", xname, x_merge_type)
4566
if (!i_merge_type %chin% supported) stopf("%s is type %s which is not supported by data.table join", iname, i_merge_type)
46-
4767
if (x_merge_type=="factor" || i_merge_type=="factor") {
4868
if (roll!=0.0 && a==length(icols))
4969
stopf("Attempting roll join on factor column when joining %s to %s. Only integer, double or character columns may be roll joined.", xname, iname)
5070
if (x_merge_type=="factor" && i_merge_type=="factor") {
5171
if (verbose) catf("Matching %s factor levels to %s factor levels.\n", iname, xname)
52-
set(i, j=icol, value=chmatch(levels(i[[icol]]), levels(x[[xcol]]), nomatch=0L)[i[[icol]]])
72+
set(i, j=icol, value=chmatch(levels(i[[icol]]), levels(x[[xcol]]), nomatch=0L)[i[[icol]]]) # nomatch=0L otherwise a level that is missing would match to NA values
5373
next
5474
} else {
5575
if (x_merge_type=="character") {
5676
coerce_col(i, icol, "factor", "character", iname, xname, verbose=verbose)
57-
set(callersi, j=icol, value=i[[icol]])
77+
set(callersi, j=icol, value=i[[icol]]) # factor in i joining to character in x will return character and not keep x's factor; e.g. for antaresRead #3581
5878
next
5979
} else if (i_merge_type=="character") {
6080
if (verbose) catf("Matching character column %s to factor levels in %s.\n", iname, xname)
6181
newvalue = chmatch(i[[icol]], levels(x[[xcol]]), nomatch=0L)
62-
if (anyNA(i[[icol]])) newvalue[is.na(i[[icol]])] = NA_integer_
82+
if (anyNA(i[[icol]])) newvalue[is.na(i[[icol]])] = NA_integer_ # NA_character_ should match to NA in factor, #3809
6383
set(i, j=icol, value=newvalue)
6484
next
6585
}
6686
}
67-
# Incompatible factor join: Factor vs (Not Factor and Not Character)
68-
# The 'message' attribute must match the *old* error for direct calls to bmerge (e.g., DT[otherDT])
6987
condition_message <- sprintf(
7088
"Incompatible join types: %s (%s) and %s (%s). Factor columns must join to factor or character columns.", # Exact match for tests like 2044.24
7189
xname, x_merge_type,
7290
iname, i_merge_type
7391
)
74-
7592
condition <- structure(
7693
list(
7794
message = condition_message,
78-
c_bmerge_x_arg_bare_col_name = names(x)[xcol],
79-
c_bmerge_x_arg_type = x_merge_type,
80-
c_bmerge_i_arg_bare_col_name = names(i)[icol],
81-
c_bmerge_i_arg_type = i_merge_type
95+
bmerge_x_arg_col_name = names(x)[xcol],
96+
bmerge_x_arg_type = x_merge_type,
97+
bmerge_i_arg_col_name = names(i)[icol],
98+
bmerge_i_arg_type = i_merge_type
8299
),
83100
class = c("bmerge_incompatible_type_error", "data.table_error", "error", "condition")
84101
)
85102
stop(condition)
86103
}
87-
104+
# we check factors first to cater for the case when trying to do rolling joins on factors
88105
if (x_merge_type == i_merge_type) {
89106
if (verbose) catf("%s has same type (%s) as %s. No coercion needed.\n", iname, x_merge_type, xname)
90107
next
@@ -99,23 +116,24 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
99116
coerce_col(x, xcol, x_merge_type, i_merge_type, xname, iname, from_detail=gettext(" (all-NA)"), verbose=verbose)
100117
next
101118
}
102-
# This 'stopf' might have been the one originally hit by the failing tests.
103-
# Our custom error above now preempts this for incompatible factor joins.
104119
stopf("Incompatible join types: %s (%s) and %s (%s)", xname, x_merge_type, iname, i_merge_type)
105120
}
106121
if (x_merge_type=="integer64" || i_merge_type=="integer64") {
107122
nm = c(iname, xname)
108-
if (x_merge_type=="integer64") { w=i; wc=icol; wclass=i_merge_type; } else { w=x; wc=xcol; wclass=x_merge_type; nm=rev(nm) }
123+
if (x_merge_type=="integer64") { w=i; wc=icol; wclass=i_merge_type; } else { w=x; wc=xcol; wclass=x_merge_type; nm=rev(nm) } # w is which to coerce
109124
if (wclass=="integer" || (wclass=="double" && fitsInInt64(w[[wc]]))) {
110125
from_detail = if (wclass == "double") gettext(" (which has integer64 representation, e.g. no fractions)") else ""
111126
coerce_col(w, wc, wclass, "integer64", nm[1L], nm[2L], from_detail, verbose=verbose)
112127
} else stopf("Incompatible join types: %s is type integer64 but %s is type double and cannot be coerced to integer64 (e.g. has fractions)", nm[2L], nm[1L])
113128
} else {
114-
ic_idx = which(icol == icols)
129+
# just integer and double left
130+
ic_idx = which(icol == icols) # check if on is joined on multiple conditions, #6602
115131
if (i_merge_type=="double") {
116132
coerce_x = FALSE
117133
if (fitsInInt32(i[[icol]])) {
118134
coerce_x = TRUE
135+
# common case of ad hoc user-typed integers missing L postfix joining to correct integer keys
136+
# we've always coerced to int and returned int, for convenience.
119137
if (length(ic_idx)>1L) {
120138
xc_idx = xcols[ic_idx]
121139
for (xb in xc_idx[which(vapply_1c(.shallow(x, xc_idx), mergeType) == "double")]) {
@@ -128,7 +146,7 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
128146
if (coerce_x) {
129147
from_detail = gettext(" (which contains no fractions)")
130148
coerce_col(i, icol, "double", "integer", iname, xname, from_detail, verbose=verbose)
131-
set(callersi, j=icol, value=i[[icol]])
149+
set(callersi, j=icol, value=i[[icol]]) # change the shallow copy of i up in [.data.table to reflect in the result, too.
132150
if (length(ic_idx)>1L) {
133151
xc_idx = xcols[ic_idx]
134152
for (xb in xc_idx[which(vapply_1c(.shallow(x, xc_idx), mergeType) == "double")]) {
@@ -152,8 +170,11 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
152170
}
153171
}
154172

155-
non_equi = which.first(ops != 1L)
173+
## after all modifications of x, check if x has a proper key on all xcols.
174+
## If not, calculate the order. Also for non-equi joins, the order must be calculated.
175+
non_equi = which.first(ops != 1L) # 1 is "==" operator
156176
if (is.na(non_equi)) {
177+
# equi join. use existing key (#1825) or existing secondary index (#1439)
157178
if (identical(xcols, head(chmatch(key(x), names(x)), length(xcols)))) {
158179
xo = integer(0L)
159180
if (verbose) catf("on= matches existing key, using key\n")
@@ -167,21 +188,27 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
167188
if (verbose) {last.started.at=proc.time(); flush.console()}
168189
xo = forderv(x, by = xcols)
169190
if (verbose) {catf("Calculated ad hoc index in %s\n", timetaken(last.started.at)); flush.console()}
191+
# TODO: use setindex() instead, so it's cached for future reuse
170192
}
171193
}
194+
## these variables are only needed for non-equi joins. Set them to default.
172195
nqgrp = integer(0L)
173196
nqmaxgrp = 1L
174197
} else {
198+
# non-equi operators present.. investigate groups..
175199
nqgrp = integer(0L)
176200
nqmaxgrp = 1L
177201
if (verbose) catf("Non-equi join operators detected ... \n")
178202
if (roll != FALSE) stopf("roll is not implemented for non-equi joins yet.")
179203
if (verbose) {last.started.at=proc.time();catf(" forder took ... ");flush.console()}
204+
# TODO: could check/reuse secondary indices, but we need 'starts' attribute as well!
180205
xo = forderv(x, xcols, retGrp=TRUE)
181-
if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()}
206+
if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()} # notranslate
182207
xg = attr(xo, 'starts', exact=TRUE)
183208
resetcols = head(xcols, non_equi-1L)
184209
if (length(resetcols)) {
210+
# TODO: can we get around having to reorder twice here?
211+
# or at least reuse previous order?
185212
if (verbose) {last.started.at=proc.time();catf(" Generating group lengths ... ");flush.console()}
186213
resetlen = attr(forderv(x, resetcols, retGrp=TRUE), 'starts', exact=TRUE)
187214
resetlen = .Call(Cuniqlengths, resetlen, nrow(x))
@@ -190,8 +217,8 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
190217
if (verbose) {last.started.at=proc.time();catf(" Generating non-equi group ids ... ");flush.console()}
191218
nqgrp = .Call(Cnestedid, x, xcols[non_equi:length(xcols)], xo, xg, resetlen, mult)
192219
if (verbose) {catf("done in %s\n",timetaken(last.started.at)); flush.console()}
193-
if (length(nqgrp)) nqmaxgrp = max(nqgrp)
194-
if (nqmaxgrp > 1L) {
220+
if (length(nqgrp)) nqmaxgrp = max(nqgrp) # fix for #1986, when 'x' is 0-row table max(.) returns -Inf.
221+
if (nqmaxgrp > 1L) { # got some non-equi join work to do
195222
if ("_nqgrp_" %in% names(x)) stopf("Column name '_nqgrp_' is reserved for non-equi joins.")
196223
if (verbose) {last.started.at=proc.time();catf(" Recomputing forder with non-equi ids ... ");flush.console()}
197224
set(nqx<-shallow(x), j="_nqgrp_", value=nqgrp)
@@ -205,7 +232,8 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
205232
if (verbose) {last.started.at=proc.time();catf("Starting bmerge ...\n");flush.console()}
206233
ans = .Call(Cbmerge, i, x, as.integer(icols), as.integer(xcols), xo, roll, rollends, nomatch, mult, ops, nqgrp, nqmaxgrp)
207234
if (verbose) {catf("bmerge done in %s\n",timetaken(last.started.at)); flush.console()}
235+
# TO DO: xo could be moved inside Cbmerge
208236

209-
ans$xo = xo
237+
ans$xo = xo # for further use by [.data.table
210238
ans
211-
}
239+
}

R/merge.R

Lines changed: 27 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,5 @@
11
merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FALSE, all.x = all,
22
all.y = all, sort = TRUE, suffixes = c(".x", ".y"), no.dups = TRUE, allow.cartesian=getOption("datatable.allow.cartesian"), incomparables=NULL, ...) {
3-
4-
# NO user_x_name / user_y_name at the top to maintain original variable environment for most of the function
5-
# They will be fetched *only* within the error handler if needed.
6-
73
if (!sort %in% c(TRUE, FALSE))
84
stopf("Argument 'sort' should be logical TRUE/FALSE")
95
if (!no.dups %in% c(TRUE, FALSE))
@@ -12,7 +8,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
128
if (!is.data.table(y)) {
139
y = as.data.table(y)
1410
if (missing(by) && missing(by.x)) {
15-
by = key(x) # Original logic
11+
by = key(x)
1612
}
1713
}
1814
x0 = length(x) == 0L
@@ -21,17 +17,17 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
2117
if (x0 && y0)
2218
warningf("Neither of the input data.tables to join have columns.")
2319
else if (x0)
24-
warningf("Input data.table '%s' has no columns.", "x") # Original: literal "x"
20+
warningf("Input data.table '%s' has no columns.", "x")
2521
else
26-
warningf("Input data.table '%s' has no columns.", "y") # Original: literal "y"
22+
warningf("Input data.table '%s' has no columns.", "y")
2723
}
2824
check_duplicate_names(x)
2925
check_duplicate_names(y)
3026

31-
nm_x = names(x) # Original logic
32-
nm_y = names(y) # Original logic
27+
nm_x = names(x)
28+
nm_y = names(y)
3329

34-
## set up 'by'/'by.x'/'by.y' - RETAIN ORIGINAL LOGIC EXACTLY
30+
## set up 'by'/'by.x'/'by.y'
3531
if ((!is.null(by.x) || !is.null(by.y)) && length(by.x) != length(by.y))
3632
stopf("`by.x` and `by.y` must be of same length.")
3733
if (!missing(by) && !missing(by.x))
@@ -50,7 +46,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
5046
} else {
5147
if (is.null(by))
5248
by = intersect(key(x), key(y))
53-
if (!length(by))
49+
if (!length(by)) # was is.null() before PR#5183 changed to !length()
5450
by = key(x)
5551
if (!length(by))
5652
by = intersect(nm_x, nm_y)
@@ -66,26 +62,33 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
6662
by.x = by.y = by
6763
}
6864

65+
# warn about unused arguments #2587
6966
if (length(list(...))) {
7067
ell = as.list(substitute(list(...)))[-1L]
7168
for (n in setdiff(names(ell), "")) warningf("Unknown argument '%s' has been passed.", n)
7269
unnamed_n = length(ell) - sum(nzchar(names(ell)))
7370
if (unnamed_n)
7471
warningf("Passed %d unknown and unnamed arguments.", unnamed_n)
7572
}
76-
73+
# with i. prefix in v1.9.3, this goes away. Left here for now ...
74+
## sidestep the auto-increment column number feature-leading-to-bug by
75+
## ensuring no names end in ".1", see unit test
76+
## "merge and auto-increment columns in y[x]" in test-data.frame.like.R
7777
start = setdiff(nm_x, by.x)
7878
end = setdiff(nm_y, by.y)
7979
dupnames = intersect(start, end)
8080
if (length(dupnames)) {
8181
start[chmatch(dupnames, start, 0L)] = paste0(dupnames, suffixes[1L])
8282
end[chmatch(dupnames, end, 0L)] = paste0(dupnames, suffixes[2L])
8383
}
84+
# If no.dups = TRUE we also need to added the suffix to columns in y
85+
# that share a name with by.x
8486
dupkeyx = intersect(by.x, end)
8587
if (no.dups && length(dupkeyx)) {
8688
end[chmatch(dupkeyx, end, 0L)] = paste0(dupkeyx, suffixes[2L])
8789
}
8890

91+
# implement incomparables argument #2587
8992
if (!is.null(incomparables)) {
9093
"%fin%" = function(x_val, table_val) if (is.character(x_val) && is.character(table_val)) x_val %chin% table_val else x_val %in% table_val
9194
xind = rowSums(x[, lapply(.SD, function(x_col_val) !(x_col_val %fin% incomparables)), .SDcols=by.x]) == length(by.x)
@@ -100,16 +103,16 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
100103
bmerge_incompatible_type_error = function(e) {
101104
# For merge(x=DT1, y=DT2), DT1 (user's 'x') is bmerge's 'i'
102105
# DT2 (user's 'y') is bmerge's 'x'
103-
x_part_col_name <- e$c_bmerge_i_arg_bare_col_name
104-
x_part_type <- e$c_bmerge_i_arg_type
105-
y_part_col_name <- e$c_bmerge_x_arg_bare_col_name
106-
y_part_type <- e$c_bmerge_x_arg_type
106+
x_part_col_name <- e$bmerge_i_arg_col_name
107+
x_part_type <- e$bmerge_i_arg_type
108+
y_part_col_name <- e$bmerge_x_arg_col_name
109+
y_part_type <- e$bmerge_x_arg_type
107110

108111
# Use literal "x." and "y." prefixes referring to the arguments of merge()
109112
msg <- sprintf(
110-
"Incompatible join types: x.%s (%s) and y.%s (%s). Factor columns must join to factor or character columns.",
111-
x_part_col_name, x_part_type, # Corresponds to merge() argument 'x'
112-
y_part_col_name, y_part_type # Corresponds to merge() argument 'y'
113+
"Incompatible join types: x.%s (%s) and i.%s (%s). Factor columns must join to factor or character columns.",
114+
x_part_col_name, x_part_type,
115+
y_part_col_name, y_part_type
113116
)
114117

115118
# Remove call = NULL to get "Error in merge.data.table(...): " prefix.
@@ -121,19 +124,23 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
121124
missingyidx = y[!x, which=TRUE, on=by, allow.cartesian=allow.cartesian]
122125
if (length(missingyidx)) dt = rbind(dt, y[missingyidx], use.names=FALSE, fill=TRUE, ignore.attr=TRUE)
123126
}
124-
127+
# X[Y] syntax puts JIS i columns at the end, merge likes them alongside i.
125128
newend = setdiff(nm_y, by.y)
129+
# fix for #1290, make sure by.y order is set properly before naming
126130
setcolorder(dt, c(by.y, setdiff(names(dt), c(by.y, newend)), newend))
127131
setnames(dt, c(by.x, start, end))
128132
if (nrow(dt) > 0L) {
129133
setkeyv(dt, if (sort) by.x else NULL)
130134
}
131135

136+
# Throw warning if there are duplicate column names in 'dt' (i.e. if
137+
# `suffixes=c("","")`, to match behaviour in base:::merge.data.frame)
132138
resultdupnames = names(dt)[duplicated(names(dt))]
133139
if (length(resultdupnames)) {
134140
warningf("column names %s are duplicated in the result", brackify(resultdupnames))
135141
}
136142

143+
# retain custom classes of first argument that resulted in dispatch to this method, #1378
137144
setattr(dt, "class", class_x)
138145
dt
139146
}

0 commit comments

Comments
 (0)