1+
2+
13mergeType = function (x ) {
24 ans = typeof(x )
35 if (ans == " integer" ) { if (is.factor(x )) ans = " factor" }
46 else if (ans == " double" ) { if (inherits(x , " integer64" )) ans = " integer64" }
7+ # do not call fitsInInt*(x) yet because i) if both types are double we don't need to coerce even if one or both sides
8+ # are int-as-double, and ii) to save calling it until we really need it
59 ans
610}
711
812cast_with_attrs = function (x , cast_fun ) {
913 ans = cast_fun(x )
14+ # do not copy attributes when coercing factor (to character)
1015 if (! is.factor(x ) && ! is.null(attributes(x ))) attributes(ans ) = attributes(x )
1116 ans
1217}
@@ -24,6 +29,18 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
2429{
2530 callersi = i
2631 i = shallow(i )
32+ # Just before the call to bmerge() in [.data.table there is a shallow() copy of i to prevent coercions here
33+ # by bmerge changing the type of the user's input object by reference. We now shallow copy i again. If we then
34+ # coerce a column in i only, we are just changing the temporary coercion used for the merge operation. If we
35+ # set callersi too then we are keeping that coerced i column in the merge result returned to user.
36+ # The type of the i column is always returned (i.e. just i set not callersi too), other than:
37+ # i) to convert int-as-double to int, useful for ad hoc joins when the L postfix is often forgotten.
38+ # ii) to coerce i.factor to character when joining to x.character
39+ # So those are the only two uses of callersi below.
40+ # Careful to only use plonk syntax (full column) on i and x from now on, otherwise user's i and x would
41+ # change. This is why shallow() is very importantly internal only, currently.
42+
43+ # Using .SD in j to join could fail due to being locked and set() being used here, #1926
2744 .Call(C_unlock , i )
2845 x = shallow(x )
2946 .Call(C_unlock , x )
@@ -35,6 +52,10 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
3552 supported = c(ORDERING_TYPES , " factor" , " integer64" )
3653
3754 if (nrow(i )) for (a in seq_along(icols )) {
55+ # - check that join columns have compatible types
56+ # - do type coercions if necessary on just the shallow local copies for the purpose of join
57+ # - handle factor columns appropriately
58+ # Note that if i is keyed, if this coerces i's key gets dropped by set()
3859 icol = icols [a ]
3960 xcol = xcols [a ]
4061 x_merge_type = mergeType(x [[xcol ]])
@@ -43,48 +64,44 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
4364 iname = paste0(" i." , names(i )[icol ])
4465 if (! x_merge_type %chin % supported ) stopf(" %s is type %s which is not supported by data.table join" , xname , x_merge_type )
4566 if (! i_merge_type %chin % supported ) stopf(" %s is type %s which is not supported by data.table join" , iname , i_merge_type )
46-
4767 if (x_merge_type == " factor" || i_merge_type == " factor" ) {
4868 if (roll != 0.0 && a == length(icols ))
4969 stopf(" Attempting roll join on factor column when joining %s to %s. Only integer, double or character columns may be roll joined." , xname , iname )
5070 if (x_merge_type == " factor" && i_merge_type == " factor" ) {
5171 if (verbose ) catf(" Matching %s factor levels to %s factor levels.\n " , iname , xname )
52- set(i , j = icol , value = chmatch(levels(i [[icol ]]), levels(x [[xcol ]]), nomatch = 0L )[i [[icol ]]])
72+ set(i , j = icol , value = chmatch(levels(i [[icol ]]), levels(x [[xcol ]]), nomatch = 0L )[i [[icol ]]]) # nomatch=0L otherwise a level that is missing would match to NA values
5373 next
5474 } else {
5575 if (x_merge_type == " character" ) {
5676 coerce_col(i , icol , " factor" , " character" , iname , xname , verbose = verbose )
57- set(callersi , j = icol , value = i [[icol ]])
77+ set(callersi , j = icol , value = i [[icol ]]) # factor in i joining to character in x will return character and not keep x's factor; e.g. for antaresRead #3581
5878 next
5979 } else if (i_merge_type == " character" ) {
6080 if (verbose ) catf(" Matching character column %s to factor levels in %s.\n " , iname , xname )
6181 newvalue = chmatch(i [[icol ]], levels(x [[xcol ]]), nomatch = 0L )
62- if (anyNA(i [[icol ]])) newvalue [is.na(i [[icol ]])] = NA_integer_
82+ if (anyNA(i [[icol ]])) newvalue [is.na(i [[icol ]])] = NA_integer_ # NA_character_ should match to NA in factor, #3809
6383 set(i , j = icol , value = newvalue )
6484 next
6585 }
6686 }
67- # Incompatible factor join: Factor vs (Not Factor and Not Character)
68- # The 'message' attribute must match the *old* error for direct calls to bmerge (e.g., DT[otherDT])
6987 condition_message <- sprintf(
7088 " Incompatible join types: %s (%s) and %s (%s). Factor columns must join to factor or character columns." , # Exact match for tests like 2044.24
7189 xname , x_merge_type ,
7290 iname , i_merge_type
7391 )
74-
7592 condition <- structure(
7693 list (
7794 message = condition_message ,
78- c_bmerge_x_arg_bare_col_name = names(x )[xcol ],
79- c_bmerge_x_arg_type = x_merge_type ,
80- c_bmerge_i_arg_bare_col_name = names(i )[icol ],
81- c_bmerge_i_arg_type = i_merge_type
95+ bmerge_x_arg_col_name = names(x )[xcol ],
96+ bmerge_x_arg_type = x_merge_type ,
97+ bmerge_i_arg_col_name = names(i )[icol ],
98+ bmerge_i_arg_type = i_merge_type
8299 ),
83100 class = c(" bmerge_incompatible_type_error" , " data.table_error" , " error" , " condition" )
84101 )
85102 stop(condition )
86103 }
87-
104+ # we check factors first to cater for the case when trying to do rolling joins on factors
88105 if (x_merge_type == i_merge_type ) {
89106 if (verbose ) catf(" %s has same type (%s) as %s. No coercion needed.\n " , iname , x_merge_type , xname )
90107 next
@@ -99,23 +116,24 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
99116 coerce_col(x , xcol , x_merge_type , i_merge_type , xname , iname , from_detail = gettext(" (all-NA)" ), verbose = verbose )
100117 next
101118 }
102- # This 'stopf' might have been the one originally hit by the failing tests.
103- # Our custom error above now preempts this for incompatible factor joins.
104119 stopf(" Incompatible join types: %s (%s) and %s (%s)" , xname , x_merge_type , iname , i_merge_type )
105120 }
106121 if (x_merge_type == " integer64" || i_merge_type == " integer64" ) {
107122 nm = c(iname , xname )
108- if (x_merge_type == " integer64" ) { w = i ; wc = icol ; wclass = i_merge_type ; } else { w = x ; wc = xcol ; wclass = x_merge_type ; nm = rev(nm ) }
123+ if (x_merge_type == " integer64" ) { w = i ; wc = icol ; wclass = i_merge_type ; } else { w = x ; wc = xcol ; wclass = x_merge_type ; nm = rev(nm ) } # w is which to coerce
109124 if (wclass == " integer" || (wclass == " double" && fitsInInt64(w [[wc ]]))) {
110125 from_detail = if (wclass == " double" ) gettext(" (which has integer64 representation, e.g. no fractions)" ) else " "
111126 coerce_col(w , wc , wclass , " integer64" , nm [1L ], nm [2L ], from_detail , verbose = verbose )
112127 } else stopf(" Incompatible join types: %s is type integer64 but %s is type double and cannot be coerced to integer64 (e.g. has fractions)" , nm [2L ], nm [1L ])
113128 } else {
114- ic_idx = which(icol == icols )
129+ # just integer and double left
130+ ic_idx = which(icol == icols ) # check if on is joined on multiple conditions, #6602
115131 if (i_merge_type == " double" ) {
116132 coerce_x = FALSE
117133 if (fitsInInt32(i [[icol ]])) {
118134 coerce_x = TRUE
135+ # common case of ad hoc user-typed integers missing L postfix joining to correct integer keys
136+ # we've always coerced to int and returned int, for convenience.
119137 if (length(ic_idx )> 1L ) {
120138 xc_idx = xcols [ic_idx ]
121139 for (xb in xc_idx [which(vapply_1c(.shallow(x , xc_idx ), mergeType ) == " double" )]) {
@@ -128,7 +146,7 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
128146 if (coerce_x ) {
129147 from_detail = gettext(" (which contains no fractions)" )
130148 coerce_col(i , icol , " double" , " integer" , iname , xname , from_detail , verbose = verbose )
131- set(callersi , j = icol , value = i [[icol ]])
149+ set(callersi , j = icol , value = i [[icol ]]) # change the shallow copy of i up in [.data.table to reflect in the result, too.
132150 if (length(ic_idx )> 1L ) {
133151 xc_idx = xcols [ic_idx ]
134152 for (xb in xc_idx [which(vapply_1c(.shallow(x , xc_idx ), mergeType ) == " double" )]) {
@@ -152,8 +170,11 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
152170 }
153171 }
154172
155- non_equi = which.first(ops != 1L )
173+ # # after all modifications of x, check if x has a proper key on all xcols.
174+ # # If not, calculate the order. Also for non-equi joins, the order must be calculated.
175+ non_equi = which.first(ops != 1L ) # 1 is "==" operator
156176 if (is.na(non_equi )) {
177+ # equi join. use existing key (#1825) or existing secondary index (#1439)
157178 if (identical(xcols , head(chmatch(key(x ), names(x )), length(xcols )))) {
158179 xo = integer(0L )
159180 if (verbose ) catf(" on= matches existing key, using key\n " )
@@ -167,21 +188,27 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
167188 if (verbose ) {last.started.at = proc.time(); flush.console()}
168189 xo = forderv(x , by = xcols )
169190 if (verbose ) {catf(" Calculated ad hoc index in %s\n " , timetaken(last.started.at )); flush.console()}
191+ # TODO: use setindex() instead, so it's cached for future reuse
170192 }
171193 }
194+ # # these variables are only needed for non-equi joins. Set them to default.
172195 nqgrp = integer(0L )
173196 nqmaxgrp = 1L
174197 } else {
198+ # non-equi operators present.. investigate groups..
175199 nqgrp = integer(0L )
176200 nqmaxgrp = 1L
177201 if (verbose ) catf(" Non-equi join operators detected ... \n " )
178202 if (roll != FALSE ) stopf(" roll is not implemented for non-equi joins yet." )
179203 if (verbose ) {last.started.at = proc.time();catf(" forder took ... " );flush.console()}
204+ # TODO: could check/reuse secondary indices, but we need 'starts' attribute as well!
180205 xo = forderv(x , xcols , retGrp = TRUE )
181- if (verbose ) {cat(timetaken(last.started.at )," \n " ); flush.console()}
206+ if (verbose ) {cat(timetaken(last.started.at )," \n " ); flush.console()} # notranslate
182207 xg = attr(xo , ' starts' , exact = TRUE )
183208 resetcols = head(xcols , non_equi - 1L )
184209 if (length(resetcols )) {
210+ # TODO: can we get around having to reorder twice here?
211+ # or at least reuse previous order?
185212 if (verbose ) {last.started.at = proc.time();catf(" Generating group lengths ... " );flush.console()}
186213 resetlen = attr(forderv(x , resetcols , retGrp = TRUE ), ' starts' , exact = TRUE )
187214 resetlen = .Call(Cuniqlengths , resetlen , nrow(x ))
@@ -190,8 +217,8 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
190217 if (verbose ) {last.started.at = proc.time();catf(" Generating non-equi group ids ... " );flush.console()}
191218 nqgrp = .Call(Cnestedid , x , xcols [non_equi : length(xcols )], xo , xg , resetlen , mult )
192219 if (verbose ) {catf(" done in %s\n " ,timetaken(last.started.at )); flush.console()}
193- if (length(nqgrp )) nqmaxgrp = max(nqgrp )
194- if (nqmaxgrp > 1L ) {
220+ if (length(nqgrp )) nqmaxgrp = max(nqgrp ) # fix for #1986, when 'x' is 0-row table max(.) returns -Inf.
221+ if (nqmaxgrp > 1L ) { # got some non-equi join work to do
195222 if (" _nqgrp_" %in% names(x )) stopf(" Column name '_nqgrp_' is reserved for non-equi joins." )
196223 if (verbose ) {last.started.at = proc.time();catf(" Recomputing forder with non-equi ids ... " );flush.console()}
197224 set(nqx <- shallow(x ), j = " _nqgrp_" , value = nqgrp )
@@ -205,7 +232,8 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
205232 if (verbose ) {last.started.at = proc.time();catf(" Starting bmerge ...\n " );flush.console()}
206233 ans = .Call(Cbmerge , i , x , as.integer(icols ), as.integer(xcols ), xo , roll , rollends , nomatch , mult , ops , nqgrp , nqmaxgrp )
207234 if (verbose ) {catf(" bmerge done in %s\n " ,timetaken(last.started.at )); flush.console()}
235+ # TO DO: xo could be moved inside Cbmerge
208236
209- ans $ xo = xo
237+ ans $ xo = xo # for further use by [.data.table
210238 ans
211- }
239+ }
0 commit comments