Skip to content

Commit 2b7edce

Browse files
kjh
1 parent 8f97c83 commit 2b7edce

File tree

1 file changed

+32
-63
lines changed

1 file changed

+32
-63
lines changed

R/bmerge.R

Lines changed: 32 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,11 @@ mergeType = function(x) {
22
ans = typeof(x)
33
if (ans=="integer") { if (is.factor(x)) ans = "factor" }
44
else if (ans=="double") { if (inherits(x, "integer64")) ans = "integer64" }
5-
# do not call fitsInInt*(x) yet because i) if both types are double we don't need to coerce even if one or both sides
6-
# are int-as-double, and ii) to save calling it until we really need it
75
ans
86
}
97

108
cast_with_attrs = function(x, cast_fun) {
119
ans = cast_fun(x)
12-
# do not copy attributes when coercing factor (to character)
1310
if (!is.factor(x) && !is.null(attributes(x))) attributes(ans) = attributes(x)
1411
ans
1512
}
@@ -27,18 +24,6 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
2724
{
2825
callersi = i
2926
i = shallow(i)
30-
# Just before the call to bmerge() in [.data.table there is a shallow() copy of i to prevent coercions here
31-
# by bmerge changing the type of the user's input object by reference. We now shallow copy i again. If we then
32-
# coerce a column in i only, we are just changing the temporary coercion used for the merge operation. If we
33-
# set callersi too then we are keeping that coerced i column in the merge result returned to user.
34-
# The type of the i column is always returned (i.e. just i set not callersi too), other than:
35-
# i) to convert int-as-double to int, useful for ad hoc joins when the L postfix is often forgotten.
36-
# ii) to coerce i.factor to character when joining to x.character
37-
# So those are the only two uses of callersi below.
38-
# Careful to only use plonk syntax (full column) on i and x from now on, otherwise user's i and x would
39-
# change. This is why shallow() is very importantly internal only, currently.
40-
41-
# Using .SD in j to join could fail due to being locked and set() being used here, #1926
4227
.Call(C_unlock, i)
4328
x = shallow(x)
4429
.Call(C_unlock, x)
@@ -50,16 +35,12 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
5035
supported = c(ORDERING_TYPES, "factor", "integer64")
5136

5237
if (nrow(i)) for (a in seq_along(icols)) {
53-
# - check that join columns have compatible types
54-
# - do type coercions if necessary on just the shallow local copies for the purpose of join
55-
# - handle factor columns appropriately
56-
# Note that if i is keyed, if this coerces i's key gets dropped by set()
5738
icol = icols[a]
5839
xcol = xcols[a]
5940
x_merge_type = mergeType(x[[xcol]])
6041
i_merge_type = mergeType(i[[icol]])
61-
xname = paste0("x.", names(x)[xcol]) # Prefixed name for x's column (bmerge's perspective)
62-
iname = paste0("i.", names(i)[icol]) # Prefixed name for i's column (bmerge's perspective)
42+
xname = paste0("x.", names(x)[xcol])
43+
iname = paste0("i.", names(i)[icol])
6344
if (!x_merge_type %chin% supported) stopf("%s is type %s which is not supported by data.table join", xname, x_merge_type)
6445
if (!i_merge_type %chin% supported) stopf("%s is type %s which is not supported by data.table join", iname, i_merge_type)
6546

@@ -68,49 +49,48 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
6849
stopf("Attempting roll join on factor column when joining %s to %s. Only integer, double or character columns may be roll joined.", xname, iname)
6950
if (x_merge_type=="factor" && i_merge_type=="factor") {
7051
if (verbose) catf("Matching %s factor levels to %s factor levels.\n", iname, xname)
71-
set(i, j=icol, value=chmatch(levels(i[[icol]]), levels(x[[xcol]]), nomatch=0L)[i[[icol]]]) # nomatch=0L otherwise a level that is missing would match to NA values
52+
set(i, j=icol, value=chmatch(levels(i[[icol]]), levels(x[[xcol]]), nomatch=0L)[i[[icol]]])
7253
next
7354
} else {
74-
if (x_merge_type=="character") {
55+
if (x_merge_type=="character") {
7556
coerce_col(i, icol, "factor", "character", iname, xname, verbose=verbose)
76-
set(callersi, j=icol, value=i[[icol]]) # factor in i joining to character in x will return character and not keep x's factor; e.g. for antaresRead #3581
57+
set(callersi, j=icol, value=i[[icol]])
7758
next
78-
} else if (i_merge_type=="character") { # i is character, x is factor
59+
} else if (i_merge_type=="character") {
7960
if (verbose) catf("Matching character column %s to factor levels in %s.\n", iname, xname)
8061
newvalue = chmatch(i[[icol]], levels(x[[xcol]]), nomatch=0L)
81-
if (anyNA(i[[icol]])) newvalue[is.na(i[[icol]])] = NA_integer_ # NA_character_ should match to NA in factor, #3809
62+
if (anyNA(i[[icol]])) newvalue[is.na(i[[icol]])] = NA_integer_
8263
set(i, j=icol, value=newvalue)
8364
next
8465
}
8566
}
67+
# Incompatible factor join: Factor vs (Not Factor and Not Character)
68+
# The 'message' attribute must match the *old* error for direct calls to bmerge (e.g., DT[otherDT])
8669
condition_message <- sprintf(
87-
"Incompatible join types (from bmerge perspective): %s (%s) and %s (%s). Factor columns must join to factor or character columns.",
88-
xname, x_merge_type, # xname is like "x.colA", x_merge_type is its type
89-
iname, i_merge_type # iname is like "i.colB", i_merge_type is its type
70+
"Incompatible join types: %s (%s) and %s (%s)", # Exact match for tests like 2044.24
71+
xname, x_merge_type,
72+
iname, i_merge_type
9073
)
74+
9175
condition <- structure(
9276
list(
93-
message = condition_message, # Raw message from bmerge if not caught and rephrased
94-
call = NULL, # Will be populated by stop() or handler
95-
96-
# Details for 'x' argument bmerge.R received (e.g., table y from merge(x,y) or table x from x[i])
97-
c_bmerge_x_arg_bare_col_name = names(x)[xcol], # Bare column name from x, e.g., "a"
98-
c_bmerge_x_arg_type = x_merge_type, # Type string for x's column, e.g., "integer"
99-
100-
# Details for 'i' argument bmerge.R received (e.g., table x from merge(x,y) or table i from x[i])
101-
c_bmerge_i_arg_bare_col_name = names(i)[icol], # Bare column name from i, e.g., "a"
102-
c_bmerge_i_arg_type = i_merge_type # Type string for i's column, e.g., "factor"
77+
message = condition_message,
78+
call = NULL,
79+
c_bmerge_x_arg_bare_col_name = names(x)[xcol],
80+
c_bmerge_x_arg_type = x_merge_type,
81+
c_bmerge_i_arg_bare_col_name = names(i)[icol],
82+
c_bmerge_i_arg_type = i_merge_type
10383
),
10484
class = c("bmerge_incompatible_type_error", "data.table_error", "error", "condition")
10585
)
10686
stop(condition)
10787
}
108-
# we check factors first to cater for the case when trying to do rolling joins on factors
88+
10989
if (x_merge_type == i_merge_type) {
11090
if (verbose) catf("%s has same type (%s) as %s. No coercion needed.\n", iname, x_merge_type, xname)
11191
next
11292
}
113-
cfl = c("character", "logical", "factor") # Note: 'factor' in cfl is effectively dead code here due to earlier factor handling
93+
cfl = c("character", "logical", "factor")
11494
if (x_merge_type %chin% cfl || i_merge_type %chin% cfl) {
11595
if (anyNA(i[[icol]]) && allNA(i[[icol]])) {
11696
coerce_col(i, icol, i_merge_type, x_merge_type, iname, xname, from_detail=gettext(" (all-NA)"), verbose=verbose)
@@ -120,25 +100,23 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
120100
coerce_col(x, xcol, x_merge_type, i_merge_type, xname, iname, from_detail=gettext(" (all-NA)"), verbose=verbose)
121101
next
122102
}
123-
# This stopf is for other incompatible types not covered by the factor-specific error above
103+
# This 'stopf' might have been the one originally hit by the failing tests.
104+
# Our custom error above now preempts this for incompatible factor joins.
124105
stopf("Incompatible join types: %s (%s) and %s (%s)", xname, x_merge_type, iname, i_merge_type)
125106
}
126107
if (x_merge_type=="integer64" || i_merge_type=="integer64") {
127108
nm = c(iname, xname)
128-
if (x_merge_type=="integer64") { w=i; wc=icol; wclass=i_merge_type; } else { w=x; wc=xcol; wclass=x_merge_type; nm=rev(nm) } # w is which to coerce
109+
if (x_merge_type=="integer64") { w=i; wc=icol; wclass=i_merge_type; } else { w=x; wc=xcol; wclass=x_merge_type; nm=rev(nm) }
129110
if (wclass=="integer" || (wclass=="double" && fitsInInt64(w[[wc]]))) {
130111
from_detail = if (wclass == "double") gettext(" (which has integer64 representation, e.g. no fractions)") else ""
131112
coerce_col(w, wc, wclass, "integer64", nm[1L], nm[2L], from_detail, verbose=verbose)
132113
} else stopf("Incompatible join types: %s is type integer64 but %s is type double and cannot be coerced to integer64 (e.g. has fractions)", nm[2L], nm[1L])
133114
} else {
134-
# just integer and double left
135-
ic_idx = which(icol == icols) # check if on is joined on multiple conditions, #6602
115+
ic_idx = which(icol == icols)
136116
if (i_merge_type=="double") {
137117
coerce_x = FALSE
138118
if (fitsInInt32(i[[icol]])) {
139119
coerce_x = TRUE
140-
# common case of ad hoc user-typed integers missing L postfix joining to correct integer keys
141-
# we've always coerced to int and returned int, for convenience.
142120
if (length(ic_idx)>1L) {
143121
xc_idx = xcols[ic_idx]
144122
for (xb in xc_idx[which(vapply_1c(.shallow(x, xc_idx), mergeType) == "double")]) {
@@ -151,7 +129,7 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
151129
if (coerce_x) {
152130
from_detail = gettext(" (which contains no fractions)")
153131
coerce_col(i, icol, "double", "integer", iname, xname, from_detail, verbose=verbose)
154-
set(callersi, j=icol, value=i[[icol]]) # change the shallow copy of i up in [.data.table to reflect in the result, too.
132+
set(callersi, j=icol, value=i[[icol]])
155133
if (length(ic_idx)>1L) {
156134
xc_idx = xcols[ic_idx]
157135
for (xb in xc_idx[which(vapply_1c(.shallow(x, xc_idx), mergeType) == "double")]) {
@@ -174,11 +152,9 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
174152
}
175153
}
176154
}
177-
## after all modifications of x, check if x has a proper key on all xcols.
178-
## If not, calculate the order. Also for non-equi joins, the order must be calculated.
179-
non_equi = which.first(ops != 1L) # 1 is "==" operator
155+
156+
non_equi = which.first(ops != 1L)
180157
if (is.na(non_equi)) {
181-
# equi join. use existing key (#1825) or existing secondary index (#1439)
182158
if (identical(xcols, head(chmatch(key(x), names(x)), length(xcols)))) {
183159
xo = integer(0L)
184160
if (verbose) catf("on= matches existing key, using key\n")
@@ -192,27 +168,21 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
192168
if (verbose) {last.started.at=proc.time(); flush.console()}
193169
xo = forderv(x, by = xcols)
194170
if (verbose) {catf("Calculated ad hoc index in %s\n", timetaken(last.started.at)); flush.console()}
195-
# TODO: use setindex() instead, so it's cached for future reuse
196171
}
197172
}
198-
## these variables are only needed for non-equi joins. Set them to default.
199173
nqgrp = integer(0L)
200174
nqmaxgrp = 1L
201175
} else {
202-
# non-equi operators present.. investigate groups..
203176
nqgrp = integer(0L)
204177
nqmaxgrp = 1L
205178
if (verbose) catf("Non-equi join operators detected ... \n")
206179
if (roll != FALSE) stopf("roll is not implemented for non-equi joins yet.")
207180
if (verbose) {last.started.at=proc.time();catf(" forder took ... ");flush.console()}
208-
# TODO: could check/reuse secondary indices, but we need 'starts' attribute as well!
209181
xo = forderv(x, xcols, retGrp=TRUE)
210-
if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()} # notranslate
182+
if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()}
211183
xg = attr(xo, 'starts', exact=TRUE)
212184
resetcols = head(xcols, non_equi-1L)
213185
if (length(resetcols)) {
214-
# TODO: can we get around having to reorder twice here?
215-
# or at least reuse previous order?
216186
if (verbose) {last.started.at=proc.time();catf(" Generating group lengths ... ");flush.console()}
217187
resetlen = attr(forderv(x, resetcols, retGrp=TRUE), 'starts', exact=TRUE)
218188
resetlen = .Call(Cuniqlengths, resetlen, nrow(x))
@@ -221,8 +191,8 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
221191
if (verbose) {last.started.at=proc.time();catf(" Generating non-equi group ids ... ");flush.console()}
222192
nqgrp = .Call(Cnestedid, x, xcols[non_equi:length(xcols)], xo, xg, resetlen, mult)
223193
if (verbose) {catf("done in %s\n",timetaken(last.started.at)); flush.console()}
224-
if (length(nqgrp)) nqmaxgrp = max(nqgrp) # fix for #1986, when 'x' is 0-row table max(.) returns -Inf.
225-
if (nqmaxgrp > 1L) { # got some non-equi join work to do
194+
if (length(nqgrp)) nqmaxgrp = max(nqgrp)
195+
if (nqmaxgrp > 1L) {
226196
if ("_nqgrp_" %in% names(x)) stopf("Column name '_nqgrp_' is reserved for non-equi joins.")
227197
if (verbose) {last.started.at=proc.time();catf(" Recomputing forder with non-equi ids ... ");flush.console()}
228198
set(nqx<-shallow(x), j="_nqgrp_", value=nqgrp)
@@ -236,8 +206,7 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
236206
if (verbose) {last.started.at=proc.time();catf("Starting bmerge ...\n");flush.console()}
237207
ans = .Call(Cbmerge, i, x, as.integer(icols), as.integer(xcols), xo, roll, rollends, nomatch, mult, ops, nqgrp, nqmaxgrp)
238208
if (verbose) {catf("bmerge done in %s\n",timetaken(last.started.at)); flush.console()}
239-
# TO DO: xo could be moved inside Cbmerge
240209

241-
ans$xo = xo # for further use by [.data.table
210+
ans$xo = xo
242211
ans
243212
}

0 commit comments

Comments
 (0)