@@ -2,14 +2,11 @@ mergeType = function(x) {
22 ans = typeof(x )
33 if (ans == " integer" ) { if (is.factor(x )) ans = " factor" }
44 else if (ans == " double" ) { if (inherits(x , " integer64" )) ans = " integer64" }
5- # do not call fitsInInt*(x) yet because i) if both types are double we don't need to coerce even if one or both sides
6- # are int-as-double, and ii) to save calling it until we really need it
75 ans
86}
97
108cast_with_attrs = function (x , cast_fun ) {
119 ans = cast_fun(x )
12- # do not copy attributes when coercing factor (to character)
1310 if (! is.factor(x ) && ! is.null(attributes(x ))) attributes(ans ) = attributes(x )
1411 ans
1512}
@@ -27,18 +24,6 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
2724{
2825 callersi = i
2926 i = shallow(i )
30- # Just before the call to bmerge() in [.data.table there is a shallow() copy of i to prevent coercions here
31- # by bmerge changing the type of the user's input object by reference. We now shallow copy i again. If we then
32- # coerce a column in i only, we are just changing the temporary coercion used for the merge operation. If we
33- # set callersi too then we are keeping that coerced i column in the merge result returned to user.
34- # The type of the i column is always returned (i.e. just i set not callersi too), other than:
35- # i) to convert int-as-double to int, useful for ad hoc joins when the L postfix is often forgotten.
36- # ii) to coerce i.factor to character when joining to x.character
37- # So those are the only two uses of callersi below.
38- # Careful to only use plonk syntax (full column) on i and x from now on, otherwise user's i and x would
39- # change. This is why shallow() is very importantly internal only, currently.
40-
41- # Using .SD in j to join could fail due to being locked and set() being used here, #1926
4227 .Call(C_unlock , i )
4328 x = shallow(x )
4429 .Call(C_unlock , x )
@@ -50,16 +35,12 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
5035 supported = c(ORDERING_TYPES , " factor" , " integer64" )
5136
5237 if (nrow(i )) for (a in seq_along(icols )) {
53- # - check that join columns have compatible types
54- # - do type coercions if necessary on just the shallow local copies for the purpose of join
55- # - handle factor columns appropriately
56- # Note that if i is keyed, if this coerces i's key gets dropped by set()
5738 icol = icols [a ]
5839 xcol = xcols [a ]
5940 x_merge_type = mergeType(x [[xcol ]])
6041 i_merge_type = mergeType(i [[icol ]])
61- xname = paste0(" x." , names(x )[xcol ]) # Prefixed name for x's column (bmerge's perspective)
62- iname = paste0(" i." , names(i )[icol ]) # Prefixed name for i's column (bmerge's perspective)
42+ xname = paste0(" x." , names(x )[xcol ])
43+ iname = paste0(" i." , names(i )[icol ])
6344 if (! x_merge_type %chin % supported ) stopf(" %s is type %s which is not supported by data.table join" , xname , x_merge_type )
6445 if (! i_merge_type %chin % supported ) stopf(" %s is type %s which is not supported by data.table join" , iname , i_merge_type )
6546
@@ -68,49 +49,48 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
6849 stopf(" Attempting roll join on factor column when joining %s to %s. Only integer, double or character columns may be roll joined." , xname , iname )
6950 if (x_merge_type == " factor" && i_merge_type == " factor" ) {
7051 if (verbose ) catf(" Matching %s factor levels to %s factor levels.\n " , iname , xname )
71- set(i , j = icol , value = chmatch(levels(i [[icol ]]), levels(x [[xcol ]]), nomatch = 0L )[i [[icol ]]]) # nomatch=0L otherwise a level that is missing would match to NA values
52+ set(i , j = icol , value = chmatch(levels(i [[icol ]]), levels(x [[xcol ]]), nomatch = 0L )[i [[icol ]]])
7253 next
7354 } else {
74- if (x_merge_type == " character" ) {
55+ if (x_merge_type == " character" ) {
7556 coerce_col(i , icol , " factor" , " character" , iname , xname , verbose = verbose )
76- set(callersi , j = icol , value = i [[icol ]]) # factor in i joining to character in x will return character and not keep x's factor; e.g. for antaresRead #3581
57+ set(callersi , j = icol , value = i [[icol ]])
7758 next
78- } else if (i_merge_type == " character" ) { # i is character, x is factor
59+ } else if (i_merge_type == " character" ) {
7960 if (verbose ) catf(" Matching character column %s to factor levels in %s.\n " , iname , xname )
8061 newvalue = chmatch(i [[icol ]], levels(x [[xcol ]]), nomatch = 0L )
81- if (anyNA(i [[icol ]])) newvalue [is.na(i [[icol ]])] = NA_integer_ # NA_character_ should match to NA in factor, #3809
62+ if (anyNA(i [[icol ]])) newvalue [is.na(i [[icol ]])] = NA_integer_
8263 set(i , j = icol , value = newvalue )
8364 next
8465 }
8566 }
67+ # Incompatible factor join: Factor vs (Not Factor and Not Character)
68+ # The 'message' attribute must match the *old* error for direct calls to bmerge (e.g., DT[otherDT])
8669 condition_message <- sprintf(
87- " Incompatible join types (from bmerge perspective) : %s (%s) and %s (%s). Factor columns must join to factor or character columns. " ,
88- xname , x_merge_type , # xname is like "x.colA", x_merge_type is its type
89- iname , i_merge_type # iname is like "i.colB", i_merge_type is its type
70+ " Incompatible join types: %s (%s) and %s (%s)" , # Exact match for tests like 2044.24
71+ xname , x_merge_type ,
72+ iname , i_merge_type
9073 )
74+
9175 condition <- structure(
9276 list (
93- message = condition_message , # Raw message from bmerge if not caught and rephrased
94- call = NULL , # Will be populated by stop() or handler
95-
96- # Details for 'x' argument bmerge.R received (e.g., table y from merge(x,y) or table x from x[i])
97- c_bmerge_x_arg_bare_col_name = names(x )[xcol ], # Bare column name from x, e.g., "a"
98- c_bmerge_x_arg_type = x_merge_type , # Type string for x's column, e.g., "integer"
99-
100- # Details for 'i' argument bmerge.R received (e.g., table x from merge(x,y) or table i from x[i])
101- c_bmerge_i_arg_bare_col_name = names(i )[icol ], # Bare column name from i, e.g., "a"
102- c_bmerge_i_arg_type = i_merge_type # Type string for i's column, e.g., "factor"
77+ message = condition_message ,
78+ call = NULL ,
79+ c_bmerge_x_arg_bare_col_name = names(x )[xcol ],
80+ c_bmerge_x_arg_type = x_merge_type ,
81+ c_bmerge_i_arg_bare_col_name = names(i )[icol ],
82+ c_bmerge_i_arg_type = i_merge_type
10383 ),
10484 class = c(" bmerge_incompatible_type_error" , " data.table_error" , " error" , " condition" )
10585 )
10686 stop(condition )
10787 }
108- # we check factors first to cater for the case when trying to do rolling joins on factors
88+
10989 if (x_merge_type == i_merge_type ) {
11090 if (verbose ) catf(" %s has same type (%s) as %s. No coercion needed.\n " , iname , x_merge_type , xname )
11191 next
11292 }
113- cfl = c(" character" , " logical" , " factor" ) # Note: 'factor' in cfl is effectively dead code here due to earlier factor handling
93+ cfl = c(" character" , " logical" , " factor" )
11494 if (x_merge_type %chin % cfl || i_merge_type %chin % cfl ) {
11595 if (anyNA(i [[icol ]]) && allNA(i [[icol ]])) {
11696 coerce_col(i , icol , i_merge_type , x_merge_type , iname , xname , from_detail = gettext(" (all-NA)" ), verbose = verbose )
@@ -120,25 +100,23 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
120100 coerce_col(x , xcol , x_merge_type , i_merge_type , xname , iname , from_detail = gettext(" (all-NA)" ), verbose = verbose )
121101 next
122102 }
123- # This stopf is for other incompatible types not covered by the factor-specific error above
103+ # This 'stopf' might have been the one originally hit by the failing tests.
104+ # Our custom error above now preempts this for incompatible factor joins.
124105 stopf(" Incompatible join types: %s (%s) and %s (%s)" , xname , x_merge_type , iname , i_merge_type )
125106 }
126107 if (x_merge_type == " integer64" || i_merge_type == " integer64" ) {
127108 nm = c(iname , xname )
128- if (x_merge_type == " integer64" ) { w = i ; wc = icol ; wclass = i_merge_type ; } else { w = x ; wc = xcol ; wclass = x_merge_type ; nm = rev(nm ) } # w is which to coerce
109+ if (x_merge_type == " integer64" ) { w = i ; wc = icol ; wclass = i_merge_type ; } else { w = x ; wc = xcol ; wclass = x_merge_type ; nm = rev(nm ) }
129110 if (wclass == " integer" || (wclass == " double" && fitsInInt64(w [[wc ]]))) {
130111 from_detail = if (wclass == " double" ) gettext(" (which has integer64 representation, e.g. no fractions)" ) else " "
131112 coerce_col(w , wc , wclass , " integer64" , nm [1L ], nm [2L ], from_detail , verbose = verbose )
132113 } else stopf(" Incompatible join types: %s is type integer64 but %s is type double and cannot be coerced to integer64 (e.g. has fractions)" , nm [2L ], nm [1L ])
133114 } else {
134- # just integer and double left
135- ic_idx = which(icol == icols ) # check if on is joined on multiple conditions, #6602
115+ ic_idx = which(icol == icols )
136116 if (i_merge_type == " double" ) {
137117 coerce_x = FALSE
138118 if (fitsInInt32(i [[icol ]])) {
139119 coerce_x = TRUE
140- # common case of ad hoc user-typed integers missing L postfix joining to correct integer keys
141- # we've always coerced to int and returned int, for convenience.
142120 if (length(ic_idx )> 1L ) {
143121 xc_idx = xcols [ic_idx ]
144122 for (xb in xc_idx [which(vapply_1c(.shallow(x , xc_idx ), mergeType ) == " double" )]) {
@@ -151,7 +129,7 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
151129 if (coerce_x ) {
152130 from_detail = gettext(" (which contains no fractions)" )
153131 coerce_col(i , icol , " double" , " integer" , iname , xname , from_detail , verbose = verbose )
154- set(callersi , j = icol , value = i [[icol ]]) # change the shallow copy of i up in [.data.table to reflect in the result, too.
132+ set(callersi , j = icol , value = i [[icol ]])
155133 if (length(ic_idx )> 1L ) {
156134 xc_idx = xcols [ic_idx ]
157135 for (xb in xc_idx [which(vapply_1c(.shallow(x , xc_idx ), mergeType ) == " double" )]) {
@@ -174,11 +152,9 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
174152 }
175153 }
176154 }
177- # # after all modifications of x, check if x has a proper key on all xcols.
178- # # If not, calculate the order. Also for non-equi joins, the order must be calculated.
179- non_equi = which.first(ops != 1L ) # 1 is "==" operator
155+
156+ non_equi = which.first(ops != 1L )
180157 if (is.na(non_equi )) {
181- # equi join. use existing key (#1825) or existing secondary index (#1439)
182158 if (identical(xcols , head(chmatch(key(x ), names(x )), length(xcols )))) {
183159 xo = integer(0L )
184160 if (verbose ) catf(" on= matches existing key, using key\n " )
@@ -192,27 +168,21 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
192168 if (verbose ) {last.started.at = proc.time(); flush.console()}
193169 xo = forderv(x , by = xcols )
194170 if (verbose ) {catf(" Calculated ad hoc index in %s\n " , timetaken(last.started.at )); flush.console()}
195- # TODO: use setindex() instead, so it's cached for future reuse
196171 }
197172 }
198- # # these variables are only needed for non-equi joins. Set them to default.
199173 nqgrp = integer(0L )
200174 nqmaxgrp = 1L
201175 } else {
202- # non-equi operators present.. investigate groups..
203176 nqgrp = integer(0L )
204177 nqmaxgrp = 1L
205178 if (verbose ) catf(" Non-equi join operators detected ... \n " )
206179 if (roll != FALSE ) stopf(" roll is not implemented for non-equi joins yet." )
207180 if (verbose ) {last.started.at = proc.time();catf(" forder took ... " );flush.console()}
208- # TODO: could check/reuse secondary indices, but we need 'starts' attribute as well!
209181 xo = forderv(x , xcols , retGrp = TRUE )
210- if (verbose ) {cat(timetaken(last.started.at )," \n " ); flush.console()} # notranslate
182+ if (verbose ) {cat(timetaken(last.started.at )," \n " ); flush.console()}
211183 xg = attr(xo , ' starts' , exact = TRUE )
212184 resetcols = head(xcols , non_equi - 1L )
213185 if (length(resetcols )) {
214- # TODO: can we get around having to reorder twice here?
215- # or at least reuse previous order?
216186 if (verbose ) {last.started.at = proc.time();catf(" Generating group lengths ... " );flush.console()}
217187 resetlen = attr(forderv(x , resetcols , retGrp = TRUE ), ' starts' , exact = TRUE )
218188 resetlen = .Call(Cuniqlengths , resetlen , nrow(x ))
@@ -221,8 +191,8 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
221191 if (verbose ) {last.started.at = proc.time();catf(" Generating non-equi group ids ... " );flush.console()}
222192 nqgrp = .Call(Cnestedid , x , xcols [non_equi : length(xcols )], xo , xg , resetlen , mult )
223193 if (verbose ) {catf(" done in %s\n " ,timetaken(last.started.at )); flush.console()}
224- if (length(nqgrp )) nqmaxgrp = max(nqgrp ) # fix for #1986, when 'x' is 0-row table max(.) returns -Inf.
225- if (nqmaxgrp > 1L ) { # got some non-equi join work to do
194+ if (length(nqgrp )) nqmaxgrp = max(nqgrp )
195+ if (nqmaxgrp > 1L ) {
226196 if (" _nqgrp_" %in% names(x )) stopf(" Column name '_nqgrp_' is reserved for non-equi joins." )
227197 if (verbose ) {last.started.at = proc.time();catf(" Recomputing forder with non-equi ids ... " );flush.console()}
228198 set(nqx <- shallow(x ), j = " _nqgrp_" , value = nqgrp )
@@ -236,8 +206,7 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
236206 if (verbose ) {last.started.at = proc.time();catf(" Starting bmerge ...\n " );flush.console()}
237207 ans = .Call(Cbmerge , i , x , as.integer(icols ), as.integer(xcols ), xo , roll , rollends , nomatch , mult , ops , nqgrp , nqmaxgrp )
238208 if (verbose ) {catf(" bmerge done in %s\n " ,timetaken(last.started.at )); flush.console()}
239- # TO DO: xo could be moved inside Cbmerge
240209
241- ans $ xo = xo # for further use by [.data.table
210+ ans $ xo = xo
242211 ans
243212}
0 commit comments