From b14534d80a766d638b3ff66f10b3e7b859b03dc7 Mon Sep 17 00:00:00 2001 From: Mukul Kumar Date: Tue, 3 Jun 2025 09:17:14 +0530 Subject: [PATCH 01/23] hsdhP --- R/bmerge.R | 17 ++++++++++++++++- R/merge.R | 49 ++++++++++++++++++++++++++++++------------------- 2 files changed, 46 insertions(+), 20 deletions(-) diff --git a/R/bmerge.R b/R/bmerge.R index dffca5e44f..9ff5e9d9e3 100644 --- a/R/bmerge.R +++ b/R/bmerge.R @@ -84,7 +84,22 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos next } } - stopf("Incompatible join types: %s (%s) and %s (%s). Factor columns must join to factor or character columns.", xname, x_merge_type, iname, i_merge_type) + raw_msg <- sprintf("Incompatible join types for bmerge: '%s' (%s) and '%s' (%s). Factor columns must join to factor or character columns.", + xname, x_merge_type, # Details for C-bmerge 'x' argument + iname, i_merge_type) # Details for C-bmerge 'i' argument + + condition <- structure( + list( + message = raw_msg, # Generic message from bmerge's perspective + call = NULL, # Will be populated by stop() or handler + c_bmerge_x_arg_col_specifier = xname, # e.g., "x.a" or "a" + c_bmerge_x_arg_type = x_merge_type, # e.g., "integer" + c_bmerge_i_arg_col_specifier = iname, # e.g., "i.a" or "a" + c_bmerge_i_arg_type = i_merge_type # e.g., "factor" + ), + class = c("bmerge_incompatible_type_error", "data.table_error", "error", "condition") + ) + stop(condition) } # we check factors first to cater for the case when trying to do rolling joins on factors if (x_merge_type == i_merge_type) { diff --git a/R/merge.R b/R/merge.R index 4d7245983e..d406e40521 100644 --- a/R/merge.R +++ b/R/merge.R @@ -46,7 +46,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL } else { if (is.null(by)) by = intersect(key(x), key(y)) - if (!length(by)) # was is.null() before PR#5183 changed to !length() + if (!length(by)) by = key(x) if (!length(by)) by = intersect(nm_x, nm_y) @@ -62,7 +62,6 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL by.x = by.y = by } - # warn about unused arguments #2587 if (length(list(...))) { ell = as.list(substitute(list(...)))[-1L] for (n in setdiff(names(ell), "")) warningf("Unknown argument '%s' has been passed.", n) @@ -70,10 +69,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL if (unnamed_n) warningf("Passed %d unknown and unnamed arguments.", unnamed_n) } - # with i. prefix in v1.9.3, this goes away. Left here for now ... - ## sidestep the auto-increment column number feature-leading-to-bug by - ## ensuring no names end in ".1", see unit test - ## "merge and auto-increment columns in y[x]" in test-data.frame.like.R + start = setdiff(nm_x, by.x) end = setdiff(nm_y, by.y) dupnames = intersect(start, end) @@ -81,47 +77,62 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL start[chmatch(dupnames, start, 0L)] = paste0(dupnames, suffixes[1L]) end[chmatch(dupnames, end, 0L)] = paste0(dupnames, suffixes[2L]) } - # If no.dups = TRUE we also need to added the suffix to columns in y - # that share a name with by.x + dupkeyx = intersect(by.x, end) if (no.dups && length(dupkeyx)) { end[chmatch(dupkeyx, end, 0L)] = paste0(dupkeyx, suffixes[2L]) } - # implement incomparables argument #2587 if (!is.null(incomparables)) { - # %fin% to be replaced when #5232 is implemented/closed "%fin%" = function(x, table) if (is.character(x) && is.character(table)) x %chin% table else x %in% table xind = rowSums(x[, lapply(.SD, function(x) !(x %fin% incomparables)), .SDcols=by.x]) == length(by) yind = rowSums(y[, lapply(.SD, function(x) !(x %fin% incomparables)), .SDcols=by.y]) == length(by) - # subset both so later steps still work x = x[xind] y = y[yind] } - dt = y[x, nomatch=if (all.x) NA else NULL, on=by, allow.cartesian=allow.cartesian] # includes JIS columns (with a i. prefix if conflict with x names) - if (all.y && nrow(y)) { # If y does not have any rows, no need to proceed - # Perhaps not very commonly used, so not a huge deal that the join is redone here. + # Enhanced error handling during join + user_x_name = deparse1(substitute(x)) + user_y_name = deparse1(substitute(y)) + dt = tryCatch({ + y[x, nomatch=if (all.x) NA else NULL, on=by, allow.cartesian=allow.cartesian] + }, bmerge_incompatible_type_error = function(e) { + col_from_user_x <- e$bmerge_i_arg_details$col_name + type_from_user_x <- e$bmerge_i_arg_details$type_str + + col_from_user_y <- e$bmerge_x_arg_details$col_name + type_from_user_y <- e$bmerge_x_arg_details$type_str + + by_col_name <- col_from_user_x + + new_msg <- sprintf( + "When merging data.table '%s' (argument 'x') and data.table '%s' (argument 'y') on column '%s':\n Incompatible join types. Factor columns must join to factor or character columns.\n Type in '%s' (column '%s'%s): %s\n Type in '%s' (column '%s'%s): %s", + user_x_name, + user_y_name, + by_col_name, + user_x_name, by_col_name, if (suffixes[1] != "") paste0(" becoming '", by_col_name, suffixes[1], "'") else "", type_from_user_x, + user_y_name, by_col_name, if (suffixes[2] != "") paste0(" becoming '", by_col_name, suffixes[2], "'") else "", type_from_user_y + ) + + stop(errorCondition(message = new_msg, class = c("datatable_merge_error", "error", "condition"), call = NULL)) + }) + + if (all.y && nrow(y)) { missingyidx = y[!x, which=TRUE, on=by, allow.cartesian=allow.cartesian] if (length(missingyidx)) dt = rbind(dt, y[missingyidx], use.names=FALSE, fill=TRUE, ignore.attr=TRUE) } - # X[Y] syntax puts JIS i columns at the end, merge likes them alongside i. newend = setdiff(nm_y, by.y) - # fix for #1290, make sure by.y order is set properly before naming setcolorder(dt, c(by.y, setdiff(names(dt), c(by.y, newend)), newend)) setnames(dt, c(by.x, start, end)) if (nrow(dt) > 0L) { setkeyv(dt, if (sort) by.x else NULL) } - # Throw warning if there are duplicate column names in 'dt' (i.e. if - # `suffixes=c("","")`, to match behaviour in base:::merge.data.frame) resultdupnames = names(dt)[duplicated(names(dt))] if (length(resultdupnames)) { warningf("column names %s are duplicated in the result", brackify(resultdupnames)) } - # retain custom classes of first argument that resulted in dispatch to this method, #1378 setattr(dt, "class", class_x) dt } From 54a7286473cc35e562c579056cac2c41af568888 Mon Sep 17 00:00:00 2001 From: Mukul Kumar Date: Tue, 3 Jun 2025 09:23:37 +0530 Subject: [PATCH 02/23] jb --- R/merge.R | 59 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 25 deletions(-) diff --git a/R/merge.R b/R/merge.R index d406e40521..edcb9fc3ff 100644 --- a/R/merge.R +++ b/R/merge.R @@ -27,7 +27,6 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL nm_x = names(x) nm_y = names(y) - ## set up 'by'/'by.x'/'by.y' if ((!is.null(by.x) || !is.null(by.y)) && length(by.x) != length(by.y)) stopf("`by.x` and `by.y` must be of same length.") if (!missing(by) && !missing(by.x)) @@ -77,7 +76,6 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL start[chmatch(dupnames, start, 0L)] = paste0(dupnames, suffixes[1L]) end[chmatch(dupnames, end, 0L)] = paste0(dupnames, suffixes[2L]) } - dupkeyx = intersect(by.x, end) if (no.dups && length(dupkeyx)) { end[chmatch(dupkeyx, end, 0L)] = paste0(dupkeyx, suffixes[2L]) @@ -91,36 +89,47 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL y = y[yind] } - # Enhanced error handling during join - user_x_name = deparse1(substitute(x)) - user_y_name = deparse1(substitute(y)) - dt = tryCatch({ - y[x, nomatch=if (all.x) NA else NULL, on=by, allow.cartesian=allow.cartesian] - }, bmerge_incompatible_type_error = function(e) { - col_from_user_x <- e$bmerge_i_arg_details$col_name - type_from_user_x <- e$bmerge_i_arg_details$type_str - - col_from_user_y <- e$bmerge_x_arg_details$col_name - type_from_user_y <- e$bmerge_x_arg_details$type_str - - by_col_name <- col_from_user_x + # ----- BEGIN MODIFICATION ----- + user_x_name <- deparse1(substitute(x)) + user_y_name <- deparse1(substitute(y)) + by_for_join <- by + dt <- tryCatch({ + y[x, nomatch = if (all.x) NA else NULL, on = by_for_join, allow.cartesian = allow.cartesian] + }, + bmerge_incompatible_type_error = function(e) { + safe_attr_get <- function(obj, attr_name, placeholder) { + val <- obj[[attr_name]] + if (is.null(val) || length(val) != 1L || is.na(val) || !nzchar(as.character(val)[1L])) { + warning(paste0("Problem retrieving attribute '", attr_name, "' from bmerge_incompatible_type_error object. Was: ", paste(capture.output(dput(val)), collapse = " "))) + return(placeholder) + } + return(as.character(val)[1L]) + } + col_from_user_x_arg <- safe_attr_get(e, "c_bmerge_i_arg_bare_col_name", "[unknown column from x]") + type_from_user_x_arg <- safe_attr_get(e, "c_bmerge_i_arg_type", "[unknown type from x]") + col_from_user_y_arg <- safe_attr_get(e, "c_bmerge_x_arg_bare_col_name", "[unknown column from y]") + type_from_user_y_arg <- safe_attr_get(e, "c_bmerge_x_arg_type", "[unknown type from y]") + final_user_x_name <- if (is.null(user_x_name) || length(user_x_name) != 1L || !nzchar(user_x_name[1L])) "x" else user_x_name[1L] + final_user_y_name <- if (is.null(user_y_name) || length(user_y_name) != 1L || !nzchar(user_y_name[1L])) "y" else user_y_name[1L] - new_msg <- sprintf( - "When merging data.table '%s' (argument 'x') and data.table '%s' (argument 'y') on column '%s':\n Incompatible join types. Factor columns must join to factor or character columns.\n Type in '%s' (column '%s'%s): %s\n Type in '%s' (column '%s'%s): %s", - user_x_name, - user_y_name, - by_col_name, - user_x_name, by_col_name, if (suffixes[1] != "") paste0(" becoming '", by_col_name, suffixes[1], "'") else "", type_from_user_x, - user_y_name, by_col_name, if (suffixes[2] != "") paste0(" becoming '", by_col_name, suffixes[2], "'") else "", type_from_user_y - ) + msg <- sprintf( + "When merging data.table '%s' (argument 'x') and data.table '%s' (argument 'y'):\n Incompatible join types. Factor columns must join to factor or character columns.\n Column '%s' from '%s' (arg 'x') has type: %s.\n Column '%s' from '%s' (arg 'y') has type: %s.", + final_user_x_name, + final_user_y_name, + col_from_user_x_arg, final_user_x_name, type_from_user_x_arg, + col_from_user_y_arg, final_user_y_name, type_from_user_y_arg + ) - stop(errorCondition(message = new_msg, class = c("datatable_merge_error", "error", "condition"), call = NULL)) - }) + stop(errorCondition(message = msg, class = c("datatable_merge_type_error", "data.table_error", "error", "condition"), call = NULL)) + } + ) + # ----- END MODIFICATION ----- if (all.y && nrow(y)) { missingyidx = y[!x, which=TRUE, on=by, allow.cartesian=allow.cartesian] if (length(missingyidx)) dt = rbind(dt, y[missingyidx], use.names=FALSE, fill=TRUE, ignore.attr=TRUE) } + newend = setdiff(nm_y, by.y) setcolorder(dt, c(by.y, setdiff(names(dt), c(by.y, newend)), newend)) setnames(dt, c(by.x, start, end)) From d4bce9f3283faba9cbe35f5e34e3cc6e898427cd Mon Sep 17 00:00:00 2001 From: Mukul Kumar Date: Tue, 3 Jun 2025 09:26:04 +0530 Subject: [PATCH 03/23] jh --- R/bmerge.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/bmerge.R b/R/bmerge.R index 9ff5e9d9e3..31b9421a19 100644 --- a/R/bmerge.R +++ b/R/bmerge.R @@ -92,10 +92,10 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos list( message = raw_msg, # Generic message from bmerge's perspective call = NULL, # Will be populated by stop() or handler - c_bmerge_x_arg_col_specifier = xname, # e.g., "x.a" or "a" - c_bmerge_x_arg_type = x_merge_type, # e.g., "integer" - c_bmerge_i_arg_col_specifier = iname, # e.g., "i.a" or "a" - c_bmerge_i_arg_type = i_merge_type # e.g., "factor" + c_bmerge_x_arg_bare_col_name = x_bare_colname_variable, # MUST be a valid string + c_bmerge_x_arg_type = x_coltype_str_variable, # MUST be a valid string + c_bmerge_i_arg_bare_col_name = i_bare_colname_variable, # MUST be a valid string + c_bmerge_i_arg_type = i_coltype_str_variable # MUST be a valid string ), class = c("bmerge_incompatible_type_error", "data.table_error", "error", "condition") ) From 2ede355d40a6d569cc5582949d96fa3db73df941 Mon Sep 17 00:00:00 2001 From: Mukul Kumar Date: Tue, 3 Jun 2025 10:34:07 +0530 Subject: [PATCH 04/23] jkf --- R/bmerge.R | 57 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 20 deletions(-) diff --git a/R/bmerge.R b/R/bmerge.R index 31b9421a19..9293128fe3 100644 --- a/R/bmerge.R +++ b/R/bmerge.R @@ -1,5 +1,3 @@ - - mergeType = function(x) { ans = typeof(x) if (ans=="integer") { if (is.factor(x)) ans = "factor" } @@ -60,10 +58,11 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos xcol = xcols[a] x_merge_type = mergeType(x[[xcol]]) i_merge_type = mergeType(i[[icol]]) - xname = paste0("x.", names(x)[xcol]) - iname = paste0("i.", names(i)[icol]) + xname = paste0("x.", names(x)[xcol]) # Prefixed name for x's column (bmerge's perspective) + iname = paste0("i.", names(i)[icol]) # Prefixed name for i's column (bmerge's perspective) if (!x_merge_type %chin% supported) stopf("%s is type %s which is not supported by data.table join", xname, x_merge_type) if (!i_merge_type %chin% supported) stopf("%s is type %s which is not supported by data.table join", iname, i_merge_type) + if (x_merge_type=="factor" || i_merge_type=="factor") { if (roll!=0.0 && a==length(icols)) stopf("Attempting roll join on factor column when joining %s to %s. Only integer, double or character columns may be roll joined.", xname, iname) @@ -71,12 +70,12 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos if (verbose) catf("Matching %s factor levels to %s factor levels.\n", iname, xname) set(i, j=icol, value=chmatch(levels(i[[icol]]), levels(x[[xcol]]), nomatch=0L)[i[[icol]]]) # nomatch=0L otherwise a level that is missing would match to NA values next - } else { - if (x_merge_type=="character") { + } else { # One is factor, the other is not factor + if (x_merge_type=="character") { # i is factor, x is character coerce_col(i, icol, "factor", "character", iname, xname, verbose=verbose) set(callersi, j=icol, value=i[[icol]]) # factor in i joining to character in x will return character and not keep x's factor; e.g. for antaresRead #3581 next - } else if (i_merge_type=="character") { + } else if (i_merge_type=="character") { # i is character, x is factor if (verbose) catf("Matching character column %s to factor levels in %s.\n", iname, xname) newvalue = chmatch(i[[icol]], levels(x[[xcol]]), nomatch=0L) if (anyNA(i[[icol]])) newvalue[is.na(i[[icol]])] = NA_integer_ # NA_character_ should match to NA in factor, #3809 @@ -84,29 +83,46 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos next } } - raw_msg <- sprintf("Incompatible join types for bmerge: '%s' (%s) and '%s' (%s). Factor columns must join to factor or character columns.", - xname, x_merge_type, # Details for C-bmerge 'x' argument - iname, i_merge_type) # Details for C-bmerge 'i' argument + # ----- BEGIN MODIFICATION ----- + # If execution reaches here, it means: + # 1. One of (x_merge_type, i_merge_type) is "factor". + # 2. They are not BOTH "factor". + # 3. The non-factor column is NOT "character". + # This is the specific incompatibility: Factor joining with non-factor/non-character. + + # Message for the condition object, from bmerge's perspective using its 'x' and 'i' arguments + # and their prefixed column names. + condition_message <- sprintf( + "Incompatible join types: %s (%s) and %s (%s). Factor columns must join to factor or character columns.", + xname, x_merge_type, # xname is like "x.colA", x_merge_type is its type + iname, i_merge_type # iname is like "i.colB", i_merge_type is its type + ) condition <- structure( list( - message = raw_msg, # Generic message from bmerge's perspective - call = NULL, # Will be populated by stop() or handler - c_bmerge_x_arg_bare_col_name = x_bare_colname_variable, # MUST be a valid string - c_bmerge_x_arg_type = x_coltype_str_variable, # MUST be a valid string - c_bmerge_i_arg_bare_col_name = i_bare_colname_variable, # MUST be a valid string - c_bmerge_i_arg_type = i_coltype_str_variable # MUST be a valid string + message = condition_message, # Raw message if not caught and rephrased + call = NULL, # Will be populated by stop() or handler + + # Details for 'x' argument received by bmerge.R (e.g., DT2 in merge(DT1,DT2) or DT1 in DT1[DT2]) + c_bmerge_x_arg_bare_col_name = names(x)[xcol], # Bare column name, e.g., "a" + c_bmerge_x_arg_type = x_merge_type, # Type string, e.g., "integer" + + # Details for 'i' argument received by bmerge.R (e.g., DT1 in merge(DT1,DT2) or DT2 in DT1[DT2]) + c_bmerge_i_arg_bare_col_name = names(i)[icol], # Bare column name, e.g., "a" + c_bmerge_i_arg_type = i_merge_type # Type string, e.g., "factor" ), class = c("bmerge_incompatible_type_error", "data.table_error", "error", "condition") ) stop(condition) - } + # ----- END MODIFICATION ----- + } # End of initial "if (x_merge_type=='factor' || i_merge_type=='factor')" block + # we check factors first to cater for the case when trying to do rolling joins on factors if (x_merge_type == i_merge_type) { if (verbose) catf("%s has same type (%s) as %s. No coercion needed.\n", iname, x_merge_type, xname) next } - cfl = c("character", "logical", "factor") + cfl = c("character", "logical", "factor") # Note: 'factor' in cfl is effectively dead code here due to earlier factor handling if (x_merge_type %chin% cfl || i_merge_type %chin% cfl) { if (anyNA(i[[icol]]) && allNA(i[[icol]])) { coerce_col(i, icol, i_merge_type, x_merge_type, iname, xname, from_detail=gettext(" (all-NA)"), verbose=verbose) @@ -116,6 +132,7 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos coerce_col(x, xcol, x_merge_type, i_merge_type, xname, iname, from_detail=gettext(" (all-NA)"), verbose=verbose) next } + # This stopf is for other incompatible types not covered by the factor-specific error above stopf("Incompatible join types: %s (%s) and %s (%s)", xname, x_merge_type, iname, i_merge_type) } if (x_merge_type=="integer64" || i_merge_type=="integer64") { @@ -168,7 +185,7 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos } } } - } + } # End of for loop iterating through join columns ## after all modifications of x, check if x has a proper key on all xcols. ## If not, calculate the order. Also for non-equi joins, the order must be calculated. @@ -236,4 +253,4 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos ans$xo = xo # for further use by [.data.table ans -} +} \ No newline at end of file From dabaddce12ca2a926e9da9934dd09847dd778ee7 Mon Sep 17 00:00:00 2001 From: Mukul Kumar Date: Tue, 3 Jun 2025 11:01:22 +0530 Subject: [PATCH 05/23] kjh --- R/bmerge.R | 36 ++++++------------ R/merge.R | 105 +++++++++++++++++++++++++++++++---------------------- 2 files changed, 72 insertions(+), 69 deletions(-) diff --git a/R/bmerge.R b/R/bmerge.R index 9293128fe3..72e560aa4c 100644 --- a/R/bmerge.R +++ b/R/bmerge.R @@ -70,8 +70,8 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos if (verbose) catf("Matching %s factor levels to %s factor levels.\n", iname, xname) set(i, j=icol, value=chmatch(levels(i[[icol]]), levels(x[[xcol]]), nomatch=0L)[i[[icol]]]) # nomatch=0L otherwise a level that is missing would match to NA values next - } else { # One is factor, the other is not factor - if (x_merge_type=="character") { # i is factor, x is character + } else { + if (x_merge_type=="character") { coerce_col(i, icol, "factor", "character", iname, xname, verbose=verbose) set(callersi, j=icol, value=i[[icol]]) # factor in i joining to character in x will return character and not keep x's factor; e.g. for antaresRead #3581 next @@ -83,40 +83,27 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos next } } - # ----- BEGIN MODIFICATION ----- - # If execution reaches here, it means: - # 1. One of (x_merge_type, i_merge_type) is "factor". - # 2. They are not BOTH "factor". - # 3. The non-factor column is NOT "character". - # This is the specific incompatibility: Factor joining with non-factor/non-character. - - # Message for the condition object, from bmerge's perspective using its 'x' and 'i' arguments - # and their prefixed column names. condition_message <- sprintf( - "Incompatible join types: %s (%s) and %s (%s). Factor columns must join to factor or character columns.", + "Incompatible join types (from bmerge perspective): %s (%s) and %s (%s). Factor columns must join to factor or character columns.", xname, x_merge_type, # xname is like "x.colA", x_merge_type is its type iname, i_merge_type # iname is like "i.colB", i_merge_type is its type ) - condition <- structure( list( - message = condition_message, # Raw message if not caught and rephrased + message = condition_message, # Raw message from bmerge if not caught and rephrased call = NULL, # Will be populated by stop() or handler - # Details for 'x' argument received by bmerge.R (e.g., DT2 in merge(DT1,DT2) or DT1 in DT1[DT2]) - c_bmerge_x_arg_bare_col_name = names(x)[xcol], # Bare column name, e.g., "a" - c_bmerge_x_arg_type = x_merge_type, # Type string, e.g., "integer" + # Details for 'x' argument bmerge.R received (e.g., table y from merge(x,y) or table x from x[i]) + c_bmerge_x_arg_bare_col_name = names(x)[xcol], # Bare column name from x, e.g., "a" + c_bmerge_x_arg_type = x_merge_type, # Type string for x's column, e.g., "integer" - # Details for 'i' argument received by bmerge.R (e.g., DT1 in merge(DT1,DT2) or DT2 in DT1[DT2]) - c_bmerge_i_arg_bare_col_name = names(i)[icol], # Bare column name, e.g., "a" - c_bmerge_i_arg_type = i_merge_type # Type string, e.g., "factor" + # Details for 'i' argument bmerge.R received (e.g., table x from merge(x,y) or table i from x[i]) + c_bmerge_i_arg_bare_col_name = names(i)[icol], # Bare column name from i, e.g., "a" + c_bmerge_i_arg_type = i_merge_type # Type string for i's column, e.g., "factor" ), class = c("bmerge_incompatible_type_error", "data.table_error", "error", "condition") ) stop(condition) - # ----- END MODIFICATION ----- - } # End of initial "if (x_merge_type=='factor' || i_merge_type=='factor')" block - # we check factors first to cater for the case when trying to do rolling joins on factors if (x_merge_type == i_merge_type) { if (verbose) catf("%s has same type (%s) as %s. No coercion needed.\n", iname, x_merge_type, xname) @@ -185,8 +172,7 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos } } } - } # End of for loop iterating through join columns - + } ## after all modifications of x, check if x has a proper key on all xcols. ## If not, calculate the order. Also for non-equi joins, the order must be calculated. non_equi = which.first(ops != 1L) # 1 is "==" operator diff --git a/R/merge.R b/R/merge.R index edcb9fc3ff..996ac50209 100644 --- a/R/merge.R +++ b/R/merge.R @@ -1,5 +1,10 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FALSE, all.x = all, all.y = all, sort = TRUE, suffixes = c(".x", ".y"), no.dups = TRUE, allow.cartesian=getOption("datatable.allow.cartesian"), incomparables=NULL, ...) { + + + user_x_name <- deparse1(substitute(x)) + user_y_name <- deparse1(substitute(y)) + if (!sort %in% c(TRUE, FALSE)) stopf("Argument 'sort' should be logical TRUE/FALSE") if (!no.dups %in% c(TRUE, FALSE)) @@ -17,9 +22,9 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL if (x0 && y0) warningf("Neither of the input data.tables to join have columns.") else if (x0) - warningf("Input data.table '%s' has no columns.", "x") + warningf("Input data.table '%s' (argument 'x') has no columns.", user_x_name) else - warningf("Input data.table '%s' has no columns.", "y") + warningf("Input data.table '%s' (argument 'y') has no columns.", user_y_name) } check_duplicate_names(x) check_duplicate_names(y) @@ -27,40 +32,44 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL nm_x = names(x) nm_y = names(y) + ## set up 'by'/'by.x'/'by.y' + # The 'by' variable created here is crucial as it becomes the 'on' argument for y[x, on=by, ...] if ((!is.null(by.x) || !is.null(by.y)) && length(by.x) != length(by.y)) stopf("`by.x` and `by.y` must be of same length.") if (!missing(by) && !missing(by.x)) warningf("Supplied both `by` and `by.x`/`by.y`. `by` argument will be ignored.") - if (!is.null(by.x)) { + if (!is.null(by.x)) { # by.x and by.y are provided if (length(by.x) == 0L || !is.character(by.x) || !is.character(by.y)) stopf("A non-empty vector of column names is required for `by.x` and `by.y`.") + # ----- MODIFIED: Use specific table names in stopf ----- if (!all(idx <- by.x %chin% nm_x)) { - stopf("The following columns listed in `%s` are missing from %s: %s", "by.x", "x", brackify(by.x[!idx])) + stopf("The following columns listed in `%s` are missing from data.table '%s' (argument 'x'): %s", "by.x", user_x_name, brackify(by.x[!idx])) } if (!all(idx <- by.y %chin% nm_y)) { - stopf("The following columns listed in `%s` are missing from %s: %s", "by.y", "y", brackify(by.y[!idx])) + stopf("The following columns listed in `%s` are missing from data.table '%s' (argument 'y'): %s", "by.y", user_y_name, brackify(by.y[!idx])) } - by = by.x - names(by) = by.y - } else { + # ----- END MODIFICATION ----- + by = by.x # 'by' takes values from by.x (cols of table 'x' arg) + names(by) = by.y # names of 'by' are cols from by.y (cols of table 'y' arg) + # So `by` for `on=by` will be c(col_from_y = "col_from_x", ...) + } else { # by is provided or inferred if (is.null(by)) by = intersect(key(x), key(y)) - if (!length(by)) + if (!length(by)) # was is.null() before PR#5183 changed to !length() by = key(x) if (!length(by)) by = intersect(nm_x, nm_y) if (length(by) == 0L || !is.character(by)) stopf("A non-empty vector of column names for `by` is required.") if (!all(idx <- by %in% nm_x)) { - stopf("The following columns listed in `%s` are missing from %s: %s", "by", "x", brackify(by[!idx])) + stopf("The following columns listed in `%s` are missing from data.table '%s' (argument 'x'): %s", "by", user_x_name, brackify(by[!idx])) } if (!all(idx <- by %in% nm_y)) { - stopf("The following columns listed in `%s` are missing from %s: %s", "by", "y", brackify(by[!idx])) + stopf("The following columns listed in `%s` are missing from data.table '%s' (argument 'y'): %s", "by", user_y_name, brackify(by[!idx])) } - by = unname(by) - by.x = by.y = by + by = unname(by) # 'by' becomes a character vector of common column names for `on=by` + by.x = by.y = by # by.x and by.y are set for later use in renaming/ordering } - if (length(list(...))) { ell = as.list(substitute(list(...)))[-1L] for (n in setdiff(names(ell), "")) warningf("Unknown argument '%s' has been passed.", n) @@ -68,9 +77,8 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL if (unnamed_n) warningf("Passed %d unknown and unnamed arguments.", unnamed_n) } - - start = setdiff(nm_x, by.x) - end = setdiff(nm_y, by.y) + start = setdiff(nm_x, by.x) # by.x here refers to key cols from original x's perspective or common names + end = setdiff(nm_y, by.y) # by.y here refers to key cols from original y's perspective or common names dupnames = intersect(start, end) if (length(dupnames)) { start[chmatch(dupnames, start, 0L)] = paste0(dupnames, suffixes[1L]) @@ -80,68 +88,77 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL if (no.dups && length(dupkeyx)) { end[chmatch(dupkeyx, end, 0L)] = paste0(dupkeyx, suffixes[2L]) } - if (!is.null(incomparables)) { - "%fin%" = function(x, table) if (is.character(x) && is.character(table)) x %chin% table else x %in% table - xind = rowSums(x[, lapply(.SD, function(x) !(x %fin% incomparables)), .SDcols=by.x]) == length(by) - yind = rowSums(y[, lapply(.SD, function(x) !(x %fin% incomparables)), .SDcols=by.y]) == length(by) + "%fin%" = function(x_val, table_val) if (is.character(x_val) && is.character(table_val)) x_val %chin% table_val else x_val %in% table_val + xind = rowSums(x[, lapply(.SD, function(col_val_x) !(col_val_x %fin% incomparables)), .SDcols=by.x]) == length(by.x) # Use by.x for columns from table x + yind = rowSums(y[, lapply(.SD, function(col_val_y) !(col_val_y %fin% incomparables)), .SDcols=by.y]) == length(by.y) # Use by.y for columns from table y x = x[xind] y = y[yind] } - - # ----- BEGIN MODIFICATION ----- - user_x_name <- deparse1(substitute(x)) - user_y_name <- deparse1(substitute(y)) - by_for_join <- by dt <- tryCatch({ - y[x, nomatch = if (all.x) NA else NULL, on = by_for_join, allow.cartesian = allow.cartesian] + y[x, nomatch = if (all.x) NA else NULL, on = by, allow.cartesian = allow.cartesian] }, bmerge_incompatible_type_error = function(e) { - safe_attr_get <- function(obj, attr_name, placeholder) { - val <- obj[[attr_name]] + safe_attr_get <- function(obj, attr_name, placeholder) { + val <- obj[[attr_name]] # Use [[ to get NULL if attribute doesn't exist if (is.null(val) || length(val) != 1L || is.na(val) || !nzchar(as.character(val)[1L])) { - warning(paste0("Problem retrieving attribute '", attr_name, "' from bmerge_incompatible_type_error object. Was: ", paste(capture.output(dput(val)), collapse = " "))) + # Provide a warning to console to aid debugging if attributes are not as expected + warning(paste0( + "Developer warning: Problem retrieving attribute '", attr_name, + "' from bmerge_incompatible_type_error object. Was: ", + paste(capture.output(dput(val)), collapse=" ") + )) return(placeholder) } return(as.character(val)[1L]) } - col_from_user_x_arg <- safe_attr_get(e, "c_bmerge_i_arg_bare_col_name", "[unknown column from x]") - type_from_user_x_arg <- safe_attr_get(e, "c_bmerge_i_arg_type", "[unknown type from x]") - col_from_user_y_arg <- safe_attr_get(e, "c_bmerge_x_arg_bare_col_name", "[unknown column from y]") - type_from_user_y_arg <- safe_attr_get(e, "c_bmerge_x_arg_type", "[unknown type from y]") + + # In merge(USER_X, USER_Y), the internal join is y[x, ...]: + # - USER_X (1st arg to merge) corresponds to 'i' argument in bmerge.R (source of error `e$c_bmerge_i_arg_*`). + # - USER_Y (2nd arg to merge) corresponds to 'x' argument in bmerge.R (source of error `e$c_bmerge_x_arg_*`). + + col_from_user_x_table <- safe_attr_get(e, "c_bmerge_i_arg_bare_col_name", "[unknown column from 'x' arg]") + type_from_user_x_table <- safe_attr_get(e, "c_bmerge_i_arg_type", "[unknown type for 'x' arg]") + + col_from_user_y_table <- safe_attr_get(e, "c_bmerge_x_arg_bare_col_name", "[unknown column from 'y' arg]") + type_from_user_y_table <- safe_attr_get(e, "c_bmerge_x_arg_type", "[unknown type for 'y' arg]") + + # Ensure user_x_name and user_y_name are valid strings for sprintf final_user_x_name <- if (is.null(user_x_name) || length(user_x_name) != 1L || !nzchar(user_x_name[1L])) "x" else user_x_name[1L] final_user_y_name <- if (is.null(user_y_name) || length(user_y_name) != 1L || !nzchar(user_y_name[1L])) "y" else user_y_name[1L] - msg <- sprintf( "When merging data.table '%s' (argument 'x') and data.table '%s' (argument 'y'):\n Incompatible join types. Factor columns must join to factor or character columns.\n Column '%s' from '%s' (arg 'x') has type: %s.\n Column '%s' from '%s' (arg 'y') has type: %s.", final_user_x_name, final_user_y_name, - col_from_user_x_arg, final_user_x_name, type_from_user_x_arg, - col_from_user_y_arg, final_user_y_name, type_from_user_y_arg + col_from_user_x_table, final_user_x_name, type_from_user_x_table, + col_from_user_y_table, final_user_y_name, type_from_user_y_table ) - stop(errorCondition(message = msg, class = c("datatable_merge_type_error", "data.table_error", "error", "condition"), call = NULL)) } ) - # ----- END MODIFICATION ----- - if (all.y && nrow(y)) { + + if (all.y && nrow(y)) { + # are most likely to be caught in the primary join attempt. missingyidx = y[!x, which=TRUE, on=by, allow.cartesian=allow.cartesian] if (length(missingyidx)) dt = rbind(dt, y[missingyidx], use.names=FALSE, fill=TRUE, ignore.attr=TRUE) } - - newend = setdiff(nm_y, by.y) + newend = setdiff(nm_y, by.y) setcolorder(dt, c(by.y, setdiff(names(dt), c(by.y, newend)), newend)) + setnames(dt, c(by.x, start, end)) if (nrow(dt) > 0L) { - setkeyv(dt, if (sort) by.x else NULL) + setkeyv(dt, if (sort) by.x else NULL) # Uses final key column names } + # Throw warning if there are duplicate column names in 'dt' (i.e. if + # `suffixes=c("","")`, to match behaviour in base:::merge.data.frame) resultdupnames = names(dt)[duplicated(names(dt))] if (length(resultdupnames)) { warningf("column names %s are duplicated in the result", brackify(resultdupnames)) } + # retain custom classes of first argument that resulted in dispatch to this method, #1378 setattr(dt, "class", class_x) dt -} +} \ No newline at end of file From bfaad2c0f5ba2c99e456c93259985d88f3c082bd Mon Sep 17 00:00:00 2001 From: Mukul Kumar Date: Tue, 3 Jun 2025 11:17:42 +0530 Subject: [PATCH 06/23] iyg --- R/bmerge.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/bmerge.R b/R/bmerge.R index 72e560aa4c..5528130c13 100644 --- a/R/bmerge.R +++ b/R/bmerge.R @@ -104,6 +104,7 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos class = c("bmerge_incompatible_type_error", "data.table_error", "error", "condition") ) stop(condition) + } # we check factors first to cater for the case when trying to do rolling joins on factors if (x_merge_type == i_merge_type) { if (verbose) catf("%s has same type (%s) as %s. No coercion needed.\n", iname, x_merge_type, xname) From 8f97c836f11a42b90983c3aaeacef7c5d58de170 Mon Sep 17 00:00:00 2001 From: Mukul Kumar Date: Tue, 3 Jun 2025 12:01:18 +0530 Subject: [PATCH 07/23] jsgdf --- R/merge.R | 130 +++++++++++++++++++++++++----------------------------- 1 file changed, 60 insertions(+), 70 deletions(-) diff --git a/R/merge.R b/R/merge.R index 996ac50209..35872a6389 100644 --- a/R/merge.R +++ b/R/merge.R @@ -1,9 +1,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FALSE, all.x = all, all.y = all, sort = TRUE, suffixes = c(".x", ".y"), no.dups = TRUE, allow.cartesian=getOption("datatable.allow.cartesian"), incomparables=NULL, ...) { - - user_x_name <- deparse1(substitute(x)) - user_y_name <- deparse1(substitute(y)) + # NO user_x_name / user_y_name here to maintain original variable environment for tests if (!sort %in% c(TRUE, FALSE)) stopf("Argument 'sort' should be logical TRUE/FALSE") @@ -13,7 +11,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL if (!is.data.table(y)) { y = as.data.table(y) if (missing(by) && missing(by.x)) { - by = key(x) + by = key(x) # Original logic } } x0 = length(x) == 0L @@ -22,37 +20,33 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL if (x0 && y0) warningf("Neither of the input data.tables to join have columns.") else if (x0) - warningf("Input data.table '%s' (argument 'x') has no columns.", user_x_name) + warningf("Input data.table '%s' has no columns.", "x") # Original: literal "x" else - warningf("Input data.table '%s' (argument 'y') has no columns.", user_y_name) + warningf("Input data.table '%s' has no columns.", "y") # Original: literal "y" } check_duplicate_names(x) check_duplicate_names(y) - nm_x = names(x) - nm_y = names(y) + nm_x = names(x) # Original logic + nm_y = names(y) # Original logic - ## set up 'by'/'by.x'/'by.y' - # The 'by' variable created here is crucial as it becomes the 'on' argument for y[x, on=by, ...] + ## set up 'by'/'by.x'/'by.y' - RETAIN ORIGINAL LOGIC EXACTLY if ((!is.null(by.x) || !is.null(by.y)) && length(by.x) != length(by.y)) stopf("`by.x` and `by.y` must be of same length.") if (!missing(by) && !missing(by.x)) warningf("Supplied both `by` and `by.x`/`by.y`. `by` argument will be ignored.") - if (!is.null(by.x)) { # by.x and by.y are provided + if (!is.null(by.x)) { if (length(by.x) == 0L || !is.character(by.x) || !is.character(by.y)) stopf("A non-empty vector of column names is required for `by.x` and `by.y`.") - # ----- MODIFIED: Use specific table names in stopf ----- if (!all(idx <- by.x %chin% nm_x)) { - stopf("The following columns listed in `%s` are missing from data.table '%s' (argument 'x'): %s", "by.x", user_x_name, brackify(by.x[!idx])) + stopf("The following columns listed in `%s` are missing from %s: %s", "by.x", "x", brackify(by.x[!idx])) # Original: literal "x" } if (!all(idx <- by.y %chin% nm_y)) { - stopf("The following columns listed in `%s` are missing from data.table '%s' (argument 'y'): %s", "by.y", user_y_name, brackify(by.y[!idx])) + stopf("The following columns listed in `%s` are missing from %s: %s", "by.y", "y", brackify(by.y[!idx])) # Original: literal "y" } - # ----- END MODIFICATION ----- - by = by.x # 'by' takes values from by.x (cols of table 'x' arg) - names(by) = by.y # names of 'by' are cols from by.y (cols of table 'y' arg) - # So `by` for `on=by` will be c(col_from_y = "col_from_x", ...) - } else { # by is provided or inferred + by = by.x + names(by) = by.y + } else { if (is.null(by)) by = intersect(key(x), key(y)) if (!length(by)) # was is.null() before PR#5183 changed to !length() @@ -62,14 +56,16 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL if (length(by) == 0L || !is.character(by)) stopf("A non-empty vector of column names for `by` is required.") if (!all(idx <- by %in% nm_x)) { - stopf("The following columns listed in `%s` are missing from data.table '%s' (argument 'x'): %s", "by", user_x_name, brackify(by[!idx])) + stopf("The following columns listed in `%s` are missing from %s: %s", "by", "x", brackify(by[!idx])) # Original: literal "x" } if (!all(idx <- by %in% nm_y)) { - stopf("The following columns listed in `%s` are missing from data.table '%s' (argument 'y'): %s", "by", user_y_name, brackify(by[!idx])) + stopf("The following columns listed in `%s` are missing from %s: %s", "by", "y", brackify(by[!idx])) # Original: literal "y" } - by = unname(by) # 'by' becomes a character vector of common column names for `on=by` - by.x = by.y = by # by.x and by.y are set for later use in renaming/ordering + by = unname(by) + by.x = by.y = by } + + # warn about unused arguments #2587 if (length(list(...))) { ell = as.list(substitute(list(...)))[-1L] for (n in setdiff(names(ell), "")) warningf("Unknown argument '%s' has been passed.", n) @@ -77,8 +73,9 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL if (unnamed_n) warningf("Passed %d unknown and unnamed arguments.", unnamed_n) } - start = setdiff(nm_x, by.x) # by.x here refers to key cols from original x's perspective or common names - end = setdiff(nm_y, by.y) # by.y here refers to key cols from original y's perspective or common names + + start = setdiff(nm_x, by.x) # Original logic + end = setdiff(nm_y, by.y) # Original logic dupnames = intersect(start, end) if (length(dupnames)) { start[chmatch(dupnames, start, 0L)] = paste0(dupnames, suffixes[1L]) @@ -88,77 +85,70 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL if (no.dups && length(dupkeyx)) { end[chmatch(dupkeyx, end, 0L)] = paste0(dupkeyx, suffixes[2L]) } + + # implement incomparables argument #2587 if (!is.null(incomparables)) { + # %fin% to be replaced when #5232 is implemented/closed "%fin%" = function(x_val, table_val) if (is.character(x_val) && is.character(table_val)) x_val %chin% table_val else x_val %in% table_val - xind = rowSums(x[, lapply(.SD, function(col_val_x) !(col_val_x %fin% incomparables)), .SDcols=by.x]) == length(by.x) # Use by.x for columns from table x - yind = rowSums(y[, lapply(.SD, function(col_val_y) !(col_val_y %fin% incomparables)), .SDcols=by.y]) == length(by.y) # Use by.y for columns from table y - x = x[xind] - y = y[yind] + xind = rowSums(x[, lapply(.SD, function(x_col_val) !(x_col_val %fin% incomparables)), .SDcols=by.x]) == length(by.x) # Original by.x usage + yind = rowSums(y[, lapply(.SD, function(y_col_val) !(y_col_val %fin% incomparables)), .SDcols=by.y]) == length(by.y) # Original by.y usage + # subset both so later steps still work + x = x[xind] # Original modification of x + y = y[yind] # Original modification of y } + + # ----- MINIMAL MODIFICATION: tryCatch around the main join call ----- dt <- tryCatch({ - y[x, nomatch = if (all.x) NA else NULL, on = by, allow.cartesian = allow.cartesian] + # Original join call, using 'by' as constructed by original logic above + y[x, nomatch=if (all.x) NA else NULL, on=by, allow.cartesian=allow.cartesian] }, bmerge_incompatible_type_error = function(e) { - safe_attr_get <- function(obj, attr_name, placeholder) { - val <- obj[[attr_name]] # Use [[ to get NULL if attribute doesn't exist - if (is.null(val) || length(val) != 1L || is.na(val) || !nzchar(as.character(val)[1L])) { - # Provide a warning to console to aid debugging if attributes are not as expected - warning(paste0( - "Developer warning: Problem retrieving attribute '", attr_name, - "' from bmerge_incompatible_type_error object. Was: ", - paste(capture.output(dput(val)), collapse=" ") - )) - return(placeholder) - } - return(as.character(val)[1L]) - } - - # In merge(USER_X, USER_Y), the internal join is y[x, ...]: - # - USER_X (1st arg to merge) corresponds to 'i' argument in bmerge.R (source of error `e$c_bmerge_i_arg_*`). - # - USER_Y (2nd arg to merge) corresponds to 'x' argument in bmerge.R (source of error `e$c_bmerge_x_arg_*`). + # Get table names *locally* only when this specific error occurs. + # These x_arg_name and y_arg_name refer to the 'x' and 'y' arguments of merge.data.table + x_arg_name <- deparse1(substitute(x, env = parent.frame(2))) # Go up to merge.data.table's frame + y_arg_name <- deparse1(substitute(y, env = parent.frame(2))) # Go up to merge.data.table's frame - col_from_user_x_table <- safe_attr_get(e, "c_bmerge_i_arg_bare_col_name", "[unknown column from 'x' arg]") - type_from_user_x_table <- safe_attr_get(e, "c_bmerge_i_arg_type", "[unknown type for 'x' arg]") + # Ensure arg names are single, non-empty strings + final_x_arg_name <- if (is.null(x_arg_name) || length(x_arg_name) != 1L || !nzchar(x_arg_name[1L])) "arg 'x'" else x_arg_name[1L] + final_y_arg_name <- if (is.null(y_arg_name) || length(y_arg_name) != 1L || !nzchar(y_arg_name[1L])) "arg 'y'" else y_arg_name[1L] - col_from_user_y_table <- safe_attr_get(e, "c_bmerge_x_arg_bare_col_name", "[unknown column from 'y' arg]") - type_from_user_y_table <- safe_attr_get(e, "c_bmerge_x_arg_type", "[unknown type for 'y' arg]") + # Extract column/type details from error object 'e' + # Assumes attributes on 'e' are valid single character strings from R/bmerge.R + # Mapping: merge's 'x' is bmerge's 'i'; merge's 'y' is bmerge's 'x' + col_from_x_arg <- e$c_bmerge_i_arg_bare_col_name + type_from_x_arg <- e$c_bmerge_i_arg_type + col_from_y_arg <- e$c_bmerge_x_arg_bare_col_name + type_from_y_arg <- e$c_bmerge_x_arg_type - # Ensure user_x_name and user_y_name are valid strings for sprintf - final_user_x_name <- if (is.null(user_x_name) || length(user_x_name) != 1L || !nzchar(user_x_name[1L])) "x" else user_x_name[1L] - final_user_y_name <- if (is.null(user_y_name) || length(user_y_name) != 1L || !nzchar(user_y_name[1L])) "y" else user_y_name[1L] + # Construct the concise error message msg <- sprintf( - "When merging data.table '%s' (argument 'x') and data.table '%s' (argument 'y'):\n Incompatible join types. Factor columns must join to factor or character columns.\n Column '%s' from '%s' (arg 'x') has type: %s.\n Column '%s' from '%s' (arg 'y') has type: %s.", - final_user_x_name, - final_user_y_name, - col_from_user_x_table, final_user_x_name, type_from_user_x_table, - col_from_user_y_table, final_user_y_name, type_from_user_y_table + "Incompatible join types for merge: table '%s' column '%s' (%s) and table '%s' column '%s' (%s). Factor columns must join to factor or character columns.", + final_x_arg_name, col_from_x_arg, type_from_x_arg, + final_y_arg_name, col_from_y_arg, type_from_y_arg ) + stop(errorCondition(message = msg, class = c("datatable_merge_type_error", "data.table_error", "error", "condition"), call = NULL)) } ) + # ----- END MINIMAL MODIFICATION ----- - - if (all.y && nrow(y)) { - # are most likely to be caught in the primary join attempt. + if (all.y && nrow(y)) { # Original logic missingyidx = y[!x, which=TRUE, on=by, allow.cartesian=allow.cartesian] if (length(missingyidx)) dt = rbind(dt, y[missingyidx], use.names=FALSE, fill=TRUE, ignore.attr=TRUE) } - newend = setdiff(nm_y, by.y) - setcolorder(dt, c(by.y, setdiff(names(dt), c(by.y, newend)), newend)) - - setnames(dt, c(by.x, start, end)) + + newend = setdiff(nm_y, by.y) # Original logic + setcolorder(dt, c(by.y, setdiff(names(dt), c(by.y, newend)), newend)) # Original logic + setnames(dt, c(by.x, start, end)) # Original logic if (nrow(dt) > 0L) { - setkeyv(dt, if (sort) by.x else NULL) # Uses final key column names + setkeyv(dt, if (sort) by.x else NULL) # Original logic } - # Throw warning if there are duplicate column names in 'dt' (i.e. if - # `suffixes=c("","")`, to match behaviour in base:::merge.data.frame) resultdupnames = names(dt)[duplicated(names(dt))] if (length(resultdupnames)) { warningf("column names %s are duplicated in the result", brackify(resultdupnames)) } - # retain custom classes of first argument that resulted in dispatch to this method, #1378 setattr(dt, "class", class_x) dt } \ No newline at end of file From 2b7edce4b23e4b9dafe13eede520d3e118489012 Mon Sep 17 00:00:00 2001 From: Mukul Kumar Date: Tue, 3 Jun 2025 12:09:11 +0530 Subject: [PATCH 08/23] kjh --- R/bmerge.R | 95 ++++++++++++++++++------------------------------------ 1 file changed, 32 insertions(+), 63 deletions(-) diff --git a/R/bmerge.R b/R/bmerge.R index 5528130c13..1d6ff4f71a 100644 --- a/R/bmerge.R +++ b/R/bmerge.R @@ -2,14 +2,11 @@ mergeType = function(x) { ans = typeof(x) if (ans=="integer") { if (is.factor(x)) ans = "factor" } else if (ans=="double") { if (inherits(x, "integer64")) ans = "integer64" } - # do not call fitsInInt*(x) yet because i) if both types are double we don't need to coerce even if one or both sides - # are int-as-double, and ii) to save calling it until we really need it ans } cast_with_attrs = function(x, cast_fun) { ans = cast_fun(x) - # do not copy attributes when coercing factor (to character) if (!is.factor(x) && !is.null(attributes(x))) attributes(ans) = attributes(x) ans } @@ -27,18 +24,6 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos { callersi = i i = shallow(i) - # Just before the call to bmerge() in [.data.table there is a shallow() copy of i to prevent coercions here - # by bmerge changing the type of the user's input object by reference. We now shallow copy i again. If we then - # coerce a column in i only, we are just changing the temporary coercion used for the merge operation. If we - # set callersi too then we are keeping that coerced i column in the merge result returned to user. - # The type of the i column is always returned (i.e. just i set not callersi too), other than: - # i) to convert int-as-double to int, useful for ad hoc joins when the L postfix is often forgotten. - # ii) to coerce i.factor to character when joining to x.character - # So those are the only two uses of callersi below. - # Careful to only use plonk syntax (full column) on i and x from now on, otherwise user's i and x would - # change. This is why shallow() is very importantly internal only, currently. - - # Using .SD in j to join could fail due to being locked and set() being used here, #1926 .Call(C_unlock, i) x = shallow(x) .Call(C_unlock, x) @@ -50,16 +35,12 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos supported = c(ORDERING_TYPES, "factor", "integer64") if (nrow(i)) for (a in seq_along(icols)) { - # - check that join columns have compatible types - # - do type coercions if necessary on just the shallow local copies for the purpose of join - # - handle factor columns appropriately - # Note that if i is keyed, if this coerces i's key gets dropped by set() icol = icols[a] xcol = xcols[a] x_merge_type = mergeType(x[[xcol]]) i_merge_type = mergeType(i[[icol]]) - xname = paste0("x.", names(x)[xcol]) # Prefixed name for x's column (bmerge's perspective) - iname = paste0("i.", names(i)[icol]) # Prefixed name for i's column (bmerge's perspective) + xname = paste0("x.", names(x)[xcol]) + iname = paste0("i.", names(i)[icol]) if (!x_merge_type %chin% supported) stopf("%s is type %s which is not supported by data.table join", xname, x_merge_type) if (!i_merge_type %chin% supported) stopf("%s is type %s which is not supported by data.table join", iname, i_merge_type) @@ -68,49 +49,48 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos stopf("Attempting roll join on factor column when joining %s to %s. Only integer, double or character columns may be roll joined.", xname, iname) if (x_merge_type=="factor" && i_merge_type=="factor") { if (verbose) catf("Matching %s factor levels to %s factor levels.\n", iname, xname) - set(i, j=icol, value=chmatch(levels(i[[icol]]), levels(x[[xcol]]), nomatch=0L)[i[[icol]]]) # nomatch=0L otherwise a level that is missing would match to NA values + set(i, j=icol, value=chmatch(levels(i[[icol]]), levels(x[[xcol]]), nomatch=0L)[i[[icol]]]) next } else { - if (x_merge_type=="character") { + if (x_merge_type=="character") { coerce_col(i, icol, "factor", "character", iname, xname, verbose=verbose) - set(callersi, j=icol, value=i[[icol]]) # factor in i joining to character in x will return character and not keep x's factor; e.g. for antaresRead #3581 + set(callersi, j=icol, value=i[[icol]]) next - } else if (i_merge_type=="character") { # i is character, x is factor + } else if (i_merge_type=="character") { if (verbose) catf("Matching character column %s to factor levels in %s.\n", iname, xname) newvalue = chmatch(i[[icol]], levels(x[[xcol]]), nomatch=0L) - if (anyNA(i[[icol]])) newvalue[is.na(i[[icol]])] = NA_integer_ # NA_character_ should match to NA in factor, #3809 + if (anyNA(i[[icol]])) newvalue[is.na(i[[icol]])] = NA_integer_ set(i, j=icol, value=newvalue) next } } + # Incompatible factor join: Factor vs (Not Factor and Not Character) + # The 'message' attribute must match the *old* error for direct calls to bmerge (e.g., DT[otherDT]) condition_message <- sprintf( - "Incompatible join types (from bmerge perspective): %s (%s) and %s (%s). Factor columns must join to factor or character columns.", - xname, x_merge_type, # xname is like "x.colA", x_merge_type is its type - iname, i_merge_type # iname is like "i.colB", i_merge_type is its type + "Incompatible join types: %s (%s) and %s (%s)", # Exact match for tests like 2044.24 + xname, x_merge_type, + iname, i_merge_type ) + condition <- structure( list( - message = condition_message, # Raw message from bmerge if not caught and rephrased - call = NULL, # Will be populated by stop() or handler - - # Details for 'x' argument bmerge.R received (e.g., table y from merge(x,y) or table x from x[i]) - c_bmerge_x_arg_bare_col_name = names(x)[xcol], # Bare column name from x, e.g., "a" - c_bmerge_x_arg_type = x_merge_type, # Type string for x's column, e.g., "integer" - - # Details for 'i' argument bmerge.R received (e.g., table x from merge(x,y) or table i from x[i]) - c_bmerge_i_arg_bare_col_name = names(i)[icol], # Bare column name from i, e.g., "a" - c_bmerge_i_arg_type = i_merge_type # Type string for i's column, e.g., "factor" + message = condition_message, + call = NULL, + c_bmerge_x_arg_bare_col_name = names(x)[xcol], + c_bmerge_x_arg_type = x_merge_type, + c_bmerge_i_arg_bare_col_name = names(i)[icol], + c_bmerge_i_arg_type = i_merge_type ), class = c("bmerge_incompatible_type_error", "data.table_error", "error", "condition") ) stop(condition) } - # we check factors first to cater for the case when trying to do rolling joins on factors + if (x_merge_type == i_merge_type) { if (verbose) catf("%s has same type (%s) as %s. No coercion needed.\n", iname, x_merge_type, xname) next } - cfl = c("character", "logical", "factor") # Note: 'factor' in cfl is effectively dead code here due to earlier factor handling + cfl = c("character", "logical", "factor") if (x_merge_type %chin% cfl || i_merge_type %chin% cfl) { if (anyNA(i[[icol]]) && allNA(i[[icol]])) { coerce_col(i, icol, i_merge_type, x_merge_type, iname, xname, from_detail=gettext(" (all-NA)"), verbose=verbose) @@ -120,25 +100,23 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos coerce_col(x, xcol, x_merge_type, i_merge_type, xname, iname, from_detail=gettext(" (all-NA)"), verbose=verbose) next } - # This stopf is for other incompatible types not covered by the factor-specific error above + # This 'stopf' might have been the one originally hit by the failing tests. + # Our custom error above now preempts this for incompatible factor joins. stopf("Incompatible join types: %s (%s) and %s (%s)", xname, x_merge_type, iname, i_merge_type) } if (x_merge_type=="integer64" || i_merge_type=="integer64") { nm = c(iname, xname) - if (x_merge_type=="integer64") { w=i; wc=icol; wclass=i_merge_type; } else { w=x; wc=xcol; wclass=x_merge_type; nm=rev(nm) } # w is which to coerce + if (x_merge_type=="integer64") { w=i; wc=icol; wclass=i_merge_type; } else { w=x; wc=xcol; wclass=x_merge_type; nm=rev(nm) } if (wclass=="integer" || (wclass=="double" && fitsInInt64(w[[wc]]))) { from_detail = if (wclass == "double") gettext(" (which has integer64 representation, e.g. no fractions)") else "" coerce_col(w, wc, wclass, "integer64", nm[1L], nm[2L], from_detail, verbose=verbose) } else stopf("Incompatible join types: %s is type integer64 but %s is type double and cannot be coerced to integer64 (e.g. has fractions)", nm[2L], nm[1L]) } else { - # just integer and double left - ic_idx = which(icol == icols) # check if on is joined on multiple conditions, #6602 + ic_idx = which(icol == icols) if (i_merge_type=="double") { coerce_x = FALSE if (fitsInInt32(i[[icol]])) { coerce_x = TRUE - # common case of ad hoc user-typed integers missing L postfix joining to correct integer keys - # we've always coerced to int and returned int, for convenience. if (length(ic_idx)>1L) { xc_idx = xcols[ic_idx] for (xb in xc_idx[which(vapply_1c(.shallow(x, xc_idx), mergeType) == "double")]) { @@ -151,7 +129,7 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos if (coerce_x) { from_detail = gettext(" (which contains no fractions)") coerce_col(i, icol, "double", "integer", iname, xname, from_detail, verbose=verbose) - set(callersi, j=icol, value=i[[icol]]) # change the shallow copy of i up in [.data.table to reflect in the result, too. + set(callersi, j=icol, value=i[[icol]]) if (length(ic_idx)>1L) { xc_idx = xcols[ic_idx] for (xb in xc_idx[which(vapply_1c(.shallow(x, xc_idx), mergeType) == "double")]) { @@ -174,11 +152,9 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos } } } - ## after all modifications of x, check if x has a proper key on all xcols. - ## If not, calculate the order. Also for non-equi joins, the order must be calculated. - non_equi = which.first(ops != 1L) # 1 is "==" operator + + non_equi = which.first(ops != 1L) if (is.na(non_equi)) { - # equi join. use existing key (#1825) or existing secondary index (#1439) if (identical(xcols, head(chmatch(key(x), names(x)), length(xcols)))) { xo = integer(0L) if (verbose) catf("on= matches existing key, using key\n") @@ -192,27 +168,21 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos if (verbose) {last.started.at=proc.time(); flush.console()} xo = forderv(x, by = xcols) if (verbose) {catf("Calculated ad hoc index in %s\n", timetaken(last.started.at)); flush.console()} - # TODO: use setindex() instead, so it's cached for future reuse } } - ## these variables are only needed for non-equi joins. Set them to default. nqgrp = integer(0L) nqmaxgrp = 1L } else { - # non-equi operators present.. investigate groups.. nqgrp = integer(0L) nqmaxgrp = 1L if (verbose) catf("Non-equi join operators detected ... \n") if (roll != FALSE) stopf("roll is not implemented for non-equi joins yet.") if (verbose) {last.started.at=proc.time();catf(" forder took ... ");flush.console()} - # TODO: could check/reuse secondary indices, but we need 'starts' attribute as well! xo = forderv(x, xcols, retGrp=TRUE) - if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()} # notranslate + if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()} xg = attr(xo, 'starts', exact=TRUE) resetcols = head(xcols, non_equi-1L) if (length(resetcols)) { - # TODO: can we get around having to reorder twice here? - # or at least reuse previous order? if (verbose) {last.started.at=proc.time();catf(" Generating group lengths ... ");flush.console()} resetlen = attr(forderv(x, resetcols, retGrp=TRUE), 'starts', exact=TRUE) resetlen = .Call(Cuniqlengths, resetlen, nrow(x)) @@ -221,8 +191,8 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos if (verbose) {last.started.at=proc.time();catf(" Generating non-equi group ids ... ");flush.console()} nqgrp = .Call(Cnestedid, x, xcols[non_equi:length(xcols)], xo, xg, resetlen, mult) if (verbose) {catf("done in %s\n",timetaken(last.started.at)); flush.console()} - if (length(nqgrp)) nqmaxgrp = max(nqgrp) # fix for #1986, when 'x' is 0-row table max(.) returns -Inf. - if (nqmaxgrp > 1L) { # got some non-equi join work to do + if (length(nqgrp)) nqmaxgrp = max(nqgrp) + if (nqmaxgrp > 1L) { if ("_nqgrp_" %in% names(x)) stopf("Column name '_nqgrp_' is reserved for non-equi joins.") if (verbose) {last.started.at=proc.time();catf(" Recomputing forder with non-equi ids ... ");flush.console()} set(nqx<-shallow(x), j="_nqgrp_", value=nqgrp) @@ -236,8 +206,7 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos if (verbose) {last.started.at=proc.time();catf("Starting bmerge ...\n");flush.console()} ans = .Call(Cbmerge, i, x, as.integer(icols), as.integer(xcols), xo, roll, rollends, nomatch, mult, ops, nqgrp, nqmaxgrp) if (verbose) {catf("bmerge done in %s\n",timetaken(last.started.at)); flush.console()} - # TO DO: xo could be moved inside Cbmerge - ans$xo = xo # for further use by [.data.table + ans$xo = xo ans } \ No newline at end of file From af1f0a7b9a052641694ac31aa24c84a0fa175f4e Mon Sep 17 00:00:00 2001 From: Mukul Kumar Date: Tue, 3 Jun 2025 15:54:25 +0530 Subject: [PATCH 09/23] hb --- R/bmerge.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/bmerge.R b/R/bmerge.R index 1d6ff4f71a..6e44a40b59 100644 --- a/R/bmerge.R +++ b/R/bmerge.R @@ -67,7 +67,7 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos # Incompatible factor join: Factor vs (Not Factor and Not Character) # The 'message' attribute must match the *old* error for direct calls to bmerge (e.g., DT[otherDT]) condition_message <- sprintf( - "Incompatible join types: %s (%s) and %s (%s)", # Exact match for tests like 2044.24 + "Incompatible join types: %s (%s) and %s (%s). Factor columns must join to factor or character columns.", # Exact match for tests like 2044.24 xname, x_merge_type, iname, i_merge_type ) From 5d0b166ed867c3867003f7f04b302ce3622c9db7 Mon Sep 17 00:00:00 2001 From: Mukul Kumar Date: Tue, 3 Jun 2025 17:38:23 +0530 Subject: [PATCH 10/23] hjg --- R/bmerge.R | 1 - R/merge.R | 75 ++++++++++++++++++++++-------------------------------- 2 files changed, 30 insertions(+), 46 deletions(-) diff --git a/R/bmerge.R b/R/bmerge.R index 6e44a40b59..c3d5402db0 100644 --- a/R/bmerge.R +++ b/R/bmerge.R @@ -75,7 +75,6 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos condition <- structure( list( message = condition_message, - call = NULL, c_bmerge_x_arg_bare_col_name = names(x)[xcol], c_bmerge_x_arg_type = x_merge_type, c_bmerge_i_arg_bare_col_name = names(i)[icol], diff --git a/R/merge.R b/R/merge.R index 35872a6389..0afa5f17ac 100644 --- a/R/merge.R +++ b/R/merge.R @@ -1,7 +1,8 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FALSE, all.x = all, all.y = all, sort = TRUE, suffixes = c(".x", ".y"), no.dups = TRUE, allow.cartesian=getOption("datatable.allow.cartesian"), incomparables=NULL, ...) { - # NO user_x_name / user_y_name here to maintain original variable environment for tests + # NO user_x_name / user_y_name at the top to maintain original variable environment for most of the function + # They will be fetched *only* within the error handler if needed. if (!sort %in% c(TRUE, FALSE)) stopf("Argument 'sort' should be logical TRUE/FALSE") @@ -39,33 +40,32 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL if (length(by.x) == 0L || !is.character(by.x) || !is.character(by.y)) stopf("A non-empty vector of column names is required for `by.x` and `by.y`.") if (!all(idx <- by.x %chin% nm_x)) { - stopf("The following columns listed in `%s` are missing from %s: %s", "by.x", "x", brackify(by.x[!idx])) # Original: literal "x" + stopf("The following columns listed in `%s` are missing from %s: %s", "by.x", "x", brackify(by.x[!idx])) } if (!all(idx <- by.y %chin% nm_y)) { - stopf("The following columns listed in `%s` are missing from %s: %s", "by.y", "y", brackify(by.y[!idx])) # Original: literal "y" + stopf("The following columns listed in `%s` are missing from %s: %s", "by.y", "y", brackify(by.y[!idx])) } by = by.x names(by) = by.y } else { if (is.null(by)) by = intersect(key(x), key(y)) - if (!length(by)) # was is.null() before PR#5183 changed to !length() + if (!length(by)) by = key(x) if (!length(by)) by = intersect(nm_x, nm_y) if (length(by) == 0L || !is.character(by)) stopf("A non-empty vector of column names for `by` is required.") if (!all(idx <- by %in% nm_x)) { - stopf("The following columns listed in `%s` are missing from %s: %s", "by", "x", brackify(by[!idx])) # Original: literal "x" + stopf("The following columns listed in `%s` are missing from %s: %s", "by", "x", brackify(by[!idx])) } if (!all(idx <- by %in% nm_y)) { - stopf("The following columns listed in `%s` are missing from %s: %s", "by", "y", brackify(by[!idx])) # Original: literal "y" + stopf("The following columns listed in `%s` are missing from %s: %s", "by", "y", brackify(by[!idx])) } by = unname(by) by.x = by.y = by } - # warn about unused arguments #2587 if (length(list(...))) { ell = as.list(substitute(list(...)))[-1L] for (n in setdiff(names(ell), "")) warningf("Unknown argument '%s' has been passed.", n) @@ -74,8 +74,8 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL warningf("Passed %d unknown and unnamed arguments.", unnamed_n) } - start = setdiff(nm_x, by.x) # Original logic - end = setdiff(nm_y, by.y) # Original logic + start = setdiff(nm_x, by.x) + end = setdiff(nm_y, by.y) dupnames = intersect(start, end) if (length(dupnames)) { start[chmatch(dupnames, start, 0L)] = paste0(dupnames, suffixes[1L]) @@ -86,62 +86,47 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL end[chmatch(dupkeyx, end, 0L)] = paste0(dupkeyx, suffixes[2L]) } - # implement incomparables argument #2587 if (!is.null(incomparables)) { - # %fin% to be replaced when #5232 is implemented/closed "%fin%" = function(x_val, table_val) if (is.character(x_val) && is.character(table_val)) x_val %chin% table_val else x_val %in% table_val - xind = rowSums(x[, lapply(.SD, function(x_col_val) !(x_col_val %fin% incomparables)), .SDcols=by.x]) == length(by.x) # Original by.x usage - yind = rowSums(y[, lapply(.SD, function(y_col_val) !(y_col_val %fin% incomparables)), .SDcols=by.y]) == length(by.y) # Original by.y usage - # subset both so later steps still work - x = x[xind] # Original modification of x - y = y[yind] # Original modification of y + xind = rowSums(x[, lapply(.SD, function(x_col_val) !(x_col_val %fin% incomparables)), .SDcols=by.x]) == length(by.x) + yind = rowSums(y[, lapply(.SD, function(y_col_val) !(y_col_val %fin% incomparables)), .SDcols=by.y]) == length(by.y) + x = x[xind] + y = y[yind] } - # ----- MINIMAL MODIFICATION: tryCatch around the main join call ----- dt <- tryCatch({ - # Original join call, using 'by' as constructed by original logic above y[x, nomatch=if (all.x) NA else NULL, on=by, allow.cartesian=allow.cartesian] }, bmerge_incompatible_type_error = function(e) { - # Get table names *locally* only when this specific error occurs. - # These x_arg_name and y_arg_name refer to the 'x' and 'y' arguments of merge.data.table - x_arg_name <- deparse1(substitute(x, env = parent.frame(2))) # Go up to merge.data.table's frame - y_arg_name <- deparse1(substitute(y, env = parent.frame(2))) # Go up to merge.data.table's frame - - # Ensure arg names are single, non-empty strings - final_x_arg_name <- if (is.null(x_arg_name) || length(x_arg_name) != 1L || !nzchar(x_arg_name[1L])) "arg 'x'" else x_arg_name[1L] - final_y_arg_name <- if (is.null(y_arg_name) || length(y_arg_name) != 1L || !nzchar(y_arg_name[1L])) "arg 'y'" else y_arg_name[1L] - - # Extract column/type details from error object 'e' - # Assumes attributes on 'e' are valid single character strings from R/bmerge.R - # Mapping: merge's 'x' is bmerge's 'i'; merge's 'y' is bmerge's 'x' - col_from_x_arg <- e$c_bmerge_i_arg_bare_col_name - type_from_x_arg <- e$c_bmerge_i_arg_type - col_from_y_arg <- e$c_bmerge_x_arg_bare_col_name - type_from_y_arg <- e$c_bmerge_x_arg_type + # For merge(x=DT1, y=DT2), DT1 (user's 'x') is bmerge's 'i' + # DT2 (user's 'y') is bmerge's 'x' + x_part_col_name <- e$c_bmerge_i_arg_bare_col_name + x_part_type <- e$c_bmerge_i_arg_type + y_part_col_name <- e$c_bmerge_x_arg_bare_col_name + y_part_type <- e$c_bmerge_x_arg_type - # Construct the concise error message + # Use literal "x." and "y." prefixes referring to the arguments of merge() msg <- sprintf( - "Incompatible join types for merge: table '%s' column '%s' (%s) and table '%s' column '%s' (%s). Factor columns must join to factor or character columns.", - final_x_arg_name, col_from_x_arg, type_from_x_arg, - final_y_arg_name, col_from_y_arg, type_from_y_arg + "Incompatible join types: x.%s (%s) and y.%s (%s). Factor columns must join to factor or character columns.", + x_part_col_name, x_part_type, # Corresponds to merge() argument 'x' + y_part_col_name, y_part_type # Corresponds to merge() argument 'y' ) - stop(errorCondition(message = msg, class = c("datatable_merge_type_error", "data.table_error", "error", "condition"), call = NULL)) + # Remove call = NULL to get "Error in merge.data.table(...): " prefix. + stop(errorCondition(message = msg, class = c("datatable_merge_type_error", "data.table_error", "error", "condition"))) } ) - # ----- END MINIMAL MODIFICATION ----- - if (all.y && nrow(y)) { # Original logic + if (all.y && nrow(y)) { missingyidx = y[!x, which=TRUE, on=by, allow.cartesian=allow.cartesian] if (length(missingyidx)) dt = rbind(dt, y[missingyidx], use.names=FALSE, fill=TRUE, ignore.attr=TRUE) } - newend = setdiff(nm_y, by.y) # Original logic - setcolorder(dt, c(by.y, setdiff(names(dt), c(by.y, newend)), newend)) # Original logic - setnames(dt, c(by.x, start, end)) # Original logic + newend = setdiff(nm_y, by.y) + setcolorder(dt, c(by.y, setdiff(names(dt), c(by.y, newend)), newend)) + setnames(dt, c(by.x, start, end)) if (nrow(dt) > 0L) { - setkeyv(dt, if (sort) by.x else NULL) # Original logic + setkeyv(dt, if (sort) by.x else NULL) } resultdupnames = names(dt)[duplicated(names(dt))] From 56d696d78121ca706e40d1d0d06f5293d5a753b3 Mon Sep 17 00:00:00 2001 From: Mukul Kumar Date: Tue, 3 Jun 2025 20:09:39 +0530 Subject: [PATCH 11/23] jhdf --- R/bmerge.R | 74 +++++++++++++++++++++++++++++++++++++----------------- R/merge.R | 47 +++++++++++++++++++--------------- 2 files changed, 78 insertions(+), 43 deletions(-) diff --git a/R/bmerge.R b/R/bmerge.R index c3d5402db0..255d7dbdfe 100644 --- a/R/bmerge.R +++ b/R/bmerge.R @@ -1,12 +1,17 @@ + + mergeType = function(x) { ans = typeof(x) if (ans=="integer") { if (is.factor(x)) ans = "factor" } else if (ans=="double") { if (inherits(x, "integer64")) ans = "integer64" } + # do not call fitsInInt*(x) yet because i) if both types are double we don't need to coerce even if one or both sides + # are int-as-double, and ii) to save calling it until we really need it ans } cast_with_attrs = function(x, cast_fun) { ans = cast_fun(x) + # do not copy attributes when coercing factor (to character) if (!is.factor(x) && !is.null(attributes(x))) attributes(ans) = attributes(x) ans } @@ -24,6 +29,18 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos { callersi = i i = shallow(i) + # Just before the call to bmerge() in [.data.table there is a shallow() copy of i to prevent coercions here + # by bmerge changing the type of the user's input object by reference. We now shallow copy i again. If we then + # coerce a column in i only, we are just changing the temporary coercion used for the merge operation. If we + # set callersi too then we are keeping that coerced i column in the merge result returned to user. + # The type of the i column is always returned (i.e. just i set not callersi too), other than: + # i) to convert int-as-double to int, useful for ad hoc joins when the L postfix is often forgotten. + # ii) to coerce i.factor to character when joining to x.character + # So those are the only two uses of callersi below. + # Careful to only use plonk syntax (full column) on i and x from now on, otherwise user's i and x would + # change. This is why shallow() is very importantly internal only, currently. + + # Using .SD in j to join could fail due to being locked and set() being used here, #1926 .Call(C_unlock, i) x = shallow(x) .Call(C_unlock, x) @@ -35,6 +52,10 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos supported = c(ORDERING_TYPES, "factor", "integer64") if (nrow(i)) for (a in seq_along(icols)) { + # - check that join columns have compatible types + # - do type coercions if necessary on just the shallow local copies for the purpose of join + # - handle factor columns appropriately + # Note that if i is keyed, if this coerces i's key gets dropped by set() icol = icols[a] xcol = xcols[a] x_merge_type = mergeType(x[[xcol]]) @@ -43,48 +64,44 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos iname = paste0("i.", names(i)[icol]) if (!x_merge_type %chin% supported) stopf("%s is type %s which is not supported by data.table join", xname, x_merge_type) if (!i_merge_type %chin% supported) stopf("%s is type %s which is not supported by data.table join", iname, i_merge_type) - if (x_merge_type=="factor" || i_merge_type=="factor") { if (roll!=0.0 && a==length(icols)) stopf("Attempting roll join on factor column when joining %s to %s. Only integer, double or character columns may be roll joined.", xname, iname) if (x_merge_type=="factor" && i_merge_type=="factor") { if (verbose) catf("Matching %s factor levels to %s factor levels.\n", iname, xname) - set(i, j=icol, value=chmatch(levels(i[[icol]]), levels(x[[xcol]]), nomatch=0L)[i[[icol]]]) + set(i, j=icol, value=chmatch(levels(i[[icol]]), levels(x[[xcol]]), nomatch=0L)[i[[icol]]]) # nomatch=0L otherwise a level that is missing would match to NA values next } else { if (x_merge_type=="character") { coerce_col(i, icol, "factor", "character", iname, xname, verbose=verbose) - set(callersi, j=icol, value=i[[icol]]) + set(callersi, j=icol, value=i[[icol]]) # factor in i joining to character in x will return character and not keep x's factor; e.g. for antaresRead #3581 next } else if (i_merge_type=="character") { if (verbose) catf("Matching character column %s to factor levels in %s.\n", iname, xname) newvalue = chmatch(i[[icol]], levels(x[[xcol]]), nomatch=0L) - if (anyNA(i[[icol]])) newvalue[is.na(i[[icol]])] = NA_integer_ + if (anyNA(i[[icol]])) newvalue[is.na(i[[icol]])] = NA_integer_ # NA_character_ should match to NA in factor, #3809 set(i, j=icol, value=newvalue) next } } - # Incompatible factor join: Factor vs (Not Factor and Not Character) - # The 'message' attribute must match the *old* error for direct calls to bmerge (e.g., DT[otherDT]) condition_message <- sprintf( "Incompatible join types: %s (%s) and %s (%s). Factor columns must join to factor or character columns.", # Exact match for tests like 2044.24 xname, x_merge_type, iname, i_merge_type ) - condition <- structure( list( message = condition_message, - c_bmerge_x_arg_bare_col_name = names(x)[xcol], - c_bmerge_x_arg_type = x_merge_type, - c_bmerge_i_arg_bare_col_name = names(i)[icol], - c_bmerge_i_arg_type = i_merge_type + bmerge_x_arg_col_name = names(x)[xcol], + bmerge_x_arg_type = x_merge_type, + bmerge_i_arg_col_name = names(i)[icol], + bmerge_i_arg_type = i_merge_type ), class = c("bmerge_incompatible_type_error", "data.table_error", "error", "condition") ) stop(condition) } - + # we check factors first to cater for the case when trying to do rolling joins on factors if (x_merge_type == i_merge_type) { if (verbose) catf("%s has same type (%s) as %s. No coercion needed.\n", iname, x_merge_type, xname) next @@ -99,23 +116,24 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos coerce_col(x, xcol, x_merge_type, i_merge_type, xname, iname, from_detail=gettext(" (all-NA)"), verbose=verbose) next } - # This 'stopf' might have been the one originally hit by the failing tests. - # Our custom error above now preempts this for incompatible factor joins. stopf("Incompatible join types: %s (%s) and %s (%s)", xname, x_merge_type, iname, i_merge_type) } if (x_merge_type=="integer64" || i_merge_type=="integer64") { nm = c(iname, xname) - if (x_merge_type=="integer64") { w=i; wc=icol; wclass=i_merge_type; } else { w=x; wc=xcol; wclass=x_merge_type; nm=rev(nm) } + if (x_merge_type=="integer64") { w=i; wc=icol; wclass=i_merge_type; } else { w=x; wc=xcol; wclass=x_merge_type; nm=rev(nm) } # w is which to coerce if (wclass=="integer" || (wclass=="double" && fitsInInt64(w[[wc]]))) { from_detail = if (wclass == "double") gettext(" (which has integer64 representation, e.g. no fractions)") else "" coerce_col(w, wc, wclass, "integer64", nm[1L], nm[2L], from_detail, verbose=verbose) } else stopf("Incompatible join types: %s is type integer64 but %s is type double and cannot be coerced to integer64 (e.g. has fractions)", nm[2L], nm[1L]) } else { - ic_idx = which(icol == icols) + # just integer and double left + ic_idx = which(icol == icols) # check if on is joined on multiple conditions, #6602 if (i_merge_type=="double") { coerce_x = FALSE if (fitsInInt32(i[[icol]])) { coerce_x = TRUE + # common case of ad hoc user-typed integers missing L postfix joining to correct integer keys + # we've always coerced to int and returned int, for convenience. if (length(ic_idx)>1L) { xc_idx = xcols[ic_idx] for (xb in xc_idx[which(vapply_1c(.shallow(x, xc_idx), mergeType) == "double")]) { @@ -128,7 +146,7 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos if (coerce_x) { from_detail = gettext(" (which contains no fractions)") coerce_col(i, icol, "double", "integer", iname, xname, from_detail, verbose=verbose) - set(callersi, j=icol, value=i[[icol]]) + set(callersi, j=icol, value=i[[icol]]) # change the shallow copy of i up in [.data.table to reflect in the result, too. if (length(ic_idx)>1L) { xc_idx = xcols[ic_idx] for (xb in xc_idx[which(vapply_1c(.shallow(x, xc_idx), mergeType) == "double")]) { @@ -152,8 +170,11 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos } } - non_equi = which.first(ops != 1L) + ## after all modifications of x, check if x has a proper key on all xcols. + ## If not, calculate the order. Also for non-equi joins, the order must be calculated. + non_equi = which.first(ops != 1L) # 1 is "==" operator if (is.na(non_equi)) { + # equi join. use existing key (#1825) or existing secondary index (#1439) if (identical(xcols, head(chmatch(key(x), names(x)), length(xcols)))) { xo = integer(0L) if (verbose) catf("on= matches existing key, using key\n") @@ -167,21 +188,27 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos if (verbose) {last.started.at=proc.time(); flush.console()} xo = forderv(x, by = xcols) if (verbose) {catf("Calculated ad hoc index in %s\n", timetaken(last.started.at)); flush.console()} + # TODO: use setindex() instead, so it's cached for future reuse } } + ## these variables are only needed for non-equi joins. Set them to default. nqgrp = integer(0L) nqmaxgrp = 1L } else { + # non-equi operators present.. investigate groups.. nqgrp = integer(0L) nqmaxgrp = 1L if (verbose) catf("Non-equi join operators detected ... \n") if (roll != FALSE) stopf("roll is not implemented for non-equi joins yet.") if (verbose) {last.started.at=proc.time();catf(" forder took ... ");flush.console()} + # TODO: could check/reuse secondary indices, but we need 'starts' attribute as well! xo = forderv(x, xcols, retGrp=TRUE) - if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()} + if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()} # notranslate xg = attr(xo, 'starts', exact=TRUE) resetcols = head(xcols, non_equi-1L) if (length(resetcols)) { + # TODO: can we get around having to reorder twice here? + # or at least reuse previous order? if (verbose) {last.started.at=proc.time();catf(" Generating group lengths ... ");flush.console()} resetlen = attr(forderv(x, resetcols, retGrp=TRUE), 'starts', exact=TRUE) resetlen = .Call(Cuniqlengths, resetlen, nrow(x)) @@ -190,8 +217,8 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos if (verbose) {last.started.at=proc.time();catf(" Generating non-equi group ids ... ");flush.console()} nqgrp = .Call(Cnestedid, x, xcols[non_equi:length(xcols)], xo, xg, resetlen, mult) if (verbose) {catf("done in %s\n",timetaken(last.started.at)); flush.console()} - if (length(nqgrp)) nqmaxgrp = max(nqgrp) - if (nqmaxgrp > 1L) { + if (length(nqgrp)) nqmaxgrp = max(nqgrp) # fix for #1986, when 'x' is 0-row table max(.) returns -Inf. + if (nqmaxgrp > 1L) { # got some non-equi join work to do if ("_nqgrp_" %in% names(x)) stopf("Column name '_nqgrp_' is reserved for non-equi joins.") if (verbose) {last.started.at=proc.time();catf(" Recomputing forder with non-equi ids ... ");flush.console()} set(nqx<-shallow(x), j="_nqgrp_", value=nqgrp) @@ -205,7 +232,8 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos if (verbose) {last.started.at=proc.time();catf("Starting bmerge ...\n");flush.console()} ans = .Call(Cbmerge, i, x, as.integer(icols), as.integer(xcols), xo, roll, rollends, nomatch, mult, ops, nqgrp, nqmaxgrp) if (verbose) {catf("bmerge done in %s\n",timetaken(last.started.at)); flush.console()} + # TO DO: xo could be moved inside Cbmerge - ans$xo = xo + ans$xo = xo # for further use by [.data.table ans -} \ No newline at end of file +} diff --git a/R/merge.R b/R/merge.R index 0afa5f17ac..6cbb67dd62 100644 --- a/R/merge.R +++ b/R/merge.R @@ -1,9 +1,5 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FALSE, all.x = all, all.y = all, sort = TRUE, suffixes = c(".x", ".y"), no.dups = TRUE, allow.cartesian=getOption("datatable.allow.cartesian"), incomparables=NULL, ...) { - - # NO user_x_name / user_y_name at the top to maintain original variable environment for most of the function - # They will be fetched *only* within the error handler if needed. - if (!sort %in% c(TRUE, FALSE)) stopf("Argument 'sort' should be logical TRUE/FALSE") if (!no.dups %in% c(TRUE, FALSE)) @@ -12,7 +8,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL if (!is.data.table(y)) { y = as.data.table(y) if (missing(by) && missing(by.x)) { - by = key(x) # Original logic + by = key(x) } } x0 = length(x) == 0L @@ -21,17 +17,17 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL if (x0 && y0) warningf("Neither of the input data.tables to join have columns.") else if (x0) - warningf("Input data.table '%s' has no columns.", "x") # Original: literal "x" + warningf("Input data.table '%s' has no columns.", "x") else - warningf("Input data.table '%s' has no columns.", "y") # Original: literal "y" + warningf("Input data.table '%s' has no columns.", "y") } check_duplicate_names(x) check_duplicate_names(y) - nm_x = names(x) # Original logic - nm_y = names(y) # Original logic + nm_x = names(x) + nm_y = names(y) - ## set up 'by'/'by.x'/'by.y' - RETAIN ORIGINAL LOGIC EXACTLY + ## set up 'by'/'by.x'/'by.y' if ((!is.null(by.x) || !is.null(by.y)) && length(by.x) != length(by.y)) stopf("`by.x` and `by.y` must be of same length.") if (!missing(by) && !missing(by.x)) @@ -50,7 +46,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL } else { if (is.null(by)) by = intersect(key(x), key(y)) - if (!length(by)) + if (!length(by)) # was is.null() before PR#5183 changed to !length() by = key(x) if (!length(by)) by = intersect(nm_x, nm_y) @@ -66,6 +62,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL by.x = by.y = by } + # warn about unused arguments #2587 if (length(list(...))) { ell = as.list(substitute(list(...)))[-1L] for (n in setdiff(names(ell), "")) warningf("Unknown argument '%s' has been passed.", n) @@ -73,7 +70,10 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL if (unnamed_n) warningf("Passed %d unknown and unnamed arguments.", unnamed_n) } - + # with i. prefix in v1.9.3, this goes away. Left here for now ... + ## sidestep the auto-increment column number feature-leading-to-bug by + ## ensuring no names end in ".1", see unit test + ## "merge and auto-increment columns in y[x]" in test-data.frame.like.R start = setdiff(nm_x, by.x) end = setdiff(nm_y, by.y) dupnames = intersect(start, end) @@ -81,11 +81,14 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL start[chmatch(dupnames, start, 0L)] = paste0(dupnames, suffixes[1L]) end[chmatch(dupnames, end, 0L)] = paste0(dupnames, suffixes[2L]) } + # If no.dups = TRUE we also need to added the suffix to columns in y + # that share a name with by.x dupkeyx = intersect(by.x, end) if (no.dups && length(dupkeyx)) { end[chmatch(dupkeyx, end, 0L)] = paste0(dupkeyx, suffixes[2L]) } + # implement incomparables argument #2587 if (!is.null(incomparables)) { "%fin%" = function(x_val, table_val) if (is.character(x_val) && is.character(table_val)) x_val %chin% table_val else x_val %in% table_val xind = rowSums(x[, lapply(.SD, function(x_col_val) !(x_col_val %fin% incomparables)), .SDcols=by.x]) == length(by.x) @@ -100,16 +103,16 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL bmerge_incompatible_type_error = function(e) { # For merge(x=DT1, y=DT2), DT1 (user's 'x') is bmerge's 'i' # DT2 (user's 'y') is bmerge's 'x' - x_part_col_name <- e$c_bmerge_i_arg_bare_col_name - x_part_type <- e$c_bmerge_i_arg_type - y_part_col_name <- e$c_bmerge_x_arg_bare_col_name - y_part_type <- e$c_bmerge_x_arg_type + x_part_col_name <- e$bmerge_i_arg_col_name + x_part_type <- e$bmerge_i_arg_type + y_part_col_name <- e$bmerge_x_arg_col_name + y_part_type <- e$bmerge_x_arg_type # Use literal "x." and "y." prefixes referring to the arguments of merge() msg <- sprintf( - "Incompatible join types: x.%s (%s) and y.%s (%s). Factor columns must join to factor or character columns.", - x_part_col_name, x_part_type, # Corresponds to merge() argument 'x' - y_part_col_name, y_part_type # Corresponds to merge() argument 'y' + "Incompatible join types: x.%s (%s) and i.%s (%s). Factor columns must join to factor or character columns.", + x_part_col_name, x_part_type, + y_part_col_name, y_part_type ) # Remove call = NULL to get "Error in merge.data.table(...): " prefix. @@ -121,19 +124,23 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL missingyidx = y[!x, which=TRUE, on=by, allow.cartesian=allow.cartesian] if (length(missingyidx)) dt = rbind(dt, y[missingyidx], use.names=FALSE, fill=TRUE, ignore.attr=TRUE) } - + # X[Y] syntax puts JIS i columns at the end, merge likes them alongside i. newend = setdiff(nm_y, by.y) + # fix for #1290, make sure by.y order is set properly before naming setcolorder(dt, c(by.y, setdiff(names(dt), c(by.y, newend)), newend)) setnames(dt, c(by.x, start, end)) if (nrow(dt) > 0L) { setkeyv(dt, if (sort) by.x else NULL) } + # Throw warning if there are duplicate column names in 'dt' (i.e. if + # `suffixes=c("","")`, to match behaviour in base:::merge.data.frame) resultdupnames = names(dt)[duplicated(names(dt))] if (length(resultdupnames)) { warningf("column names %s are duplicated in the result", brackify(resultdupnames)) } + # retain custom classes of first argument that resulted in dispatch to this method, #1378 setattr(dt, "class", class_x) dt } \ No newline at end of file From bb0d6e6c72ca421e82454e39528f4aad778e9f38 Mon Sep 17 00:00:00 2001 From: Mukul Kumar Date: Thu, 5 Jun 2025 13:03:50 +0530 Subject: [PATCH 12/23] modification in bmerge and merge --- R/bmerge.R | 13 ++++++++++++- R/merge.R | 18 +++++++++++++++--- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/R/bmerge.R b/R/bmerge.R index dffca5e44f..8b6074f456 100644 --- a/R/bmerge.R +++ b/R/bmerge.R @@ -84,7 +84,18 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos next } } - stopf("Incompatible join types: %s (%s) and %s (%s). Factor columns must join to factor or character columns.", xname, x_merge_type, iname, i_merge_type) + condition_message = sprintf("Incompatible join types: %s (%s) and %s (%s). Factor columns must join to factor or character columns.", xname, x_merge_type, iname, i_merge_type) + condition <- structure( + list( + message = condition_message, + bmerge_x_arg_col_name = names(x)[xcol], + bmerge_x_arg_type = x_merge_type, + bmerge_i_arg_col_name = names(i)[icol], + bmerge_i_arg_type = i_merge_type + ), + class = c("bmerge_incompatible_type_error", "data.table_error", "error", "condition") + ) + stop(condition) } # we check factors first to cater for the case when trying to do rolling joins on factors if (x_merge_type == i_merge_type) { diff --git a/R/merge.R b/R/merge.R index 4d7245983e..f6394255b5 100644 --- a/R/merge.R +++ b/R/merge.R @@ -92,13 +92,25 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL if (!is.null(incomparables)) { # %fin% to be replaced when #5232 is implemented/closed "%fin%" = function(x, table) if (is.character(x) && is.character(table)) x %chin% table else x %in% table - xind = rowSums(x[, lapply(.SD, function(x) !(x %fin% incomparables)), .SDcols=by.x]) == length(by) - yind = rowSums(y[, lapply(.SD, function(x) !(x %fin% incomparables)), .SDcols=by.y]) == length(by) + xind = rowSums(x[, lapply(.SD, function(x) !(x %fin% incomparables)), .SDcols=by.x]) == length(by.x) + yind = rowSums(y[, lapply(.SD, function(x) !(x %fin% incomparables)), .SDcols=by.y]) == length(by.y) # subset both so later steps still work x = x[xind] y = y[yind] } - dt = y[x, nomatch=if (all.x) NA else NULL, on=by, allow.cartesian=allow.cartesian] # includes JIS columns (with a i. prefix if conflict with x names) + dt = tryCatch({ + y[x, nomatch=if (all.x) NA else NULL, on=by, allow.cartesian=allow.cartesian] + }, + bmerge_incompatible_type_error = function(e) { + x_part_col_name = e$bmerge_i_arg_col_name + x_part_type = e$bmerge_i_arg_type + y_part_col_name = e$bmerge_x_arg_col_name + y_part_type = e$bmerge_x_arg_type + + msg = sprintf("Incompatible join types: x.%s (%s) and i.%s (%s). Factor columns must join to factor or character columns.", x_part_col_name, x_part_type, y_part_col_name, y_part_type) + stop(errorCondition(message = msg, class = c("datatable_merge_type_error", "data.table_error", "error", "condition"))) + } + ) if (all.y && nrow(y)) { # If y does not have any rows, no need to proceed # Perhaps not very commonly used, so not a huge deal that the join is redone here. From ccd481f57ef21dd458be134f5f2fa7ac8921e377 Mon Sep 17 00:00:00 2001 From: Mukul Kumar Date: Fri, 6 Jun 2025 20:05:10 +0530 Subject: [PATCH 13/23] add test and appropriate changes --- R/bmerge.R | 11 ++++------- R/merge.R | 24 +++++++----------------- inst/tests/tests.Rraw | 5 +++++ 3 files changed, 16 insertions(+), 24 deletions(-) diff --git a/R/bmerge.R b/R/bmerge.R index 255d7dbdfe..13c34cee78 100644 --- a/R/bmerge.R +++ b/R/bmerge.R @@ -84,20 +84,17 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos next } } - condition_message <- sprintf( - "Incompatible join types: %s (%s) and %s (%s). Factor columns must join to factor or character columns.", # Exact match for tests like 2044.24 - xname, x_merge_type, - iname, i_merge_type - ) - condition <- structure( + condition_message = gettextf("Incompatible join types: %s (%s) and %s (%s). Factor columns must join to factor or character columns.", xname, x_merge_type, iname, i_merge_type) + condition = structure( list( message = condition_message, + call = sys.call(sys.nframe()-1L), bmerge_x_arg_col_name = names(x)[xcol], bmerge_x_arg_type = x_merge_type, bmerge_i_arg_col_name = names(i)[icol], bmerge_i_arg_type = i_merge_type ), - class = c("bmerge_incompatible_type_error", "data.table_error", "error", "condition") + class = c("dt_bmerge_incompatible_type_error", "error", "condition") ) stop(condition) } diff --git a/R/merge.R b/R/merge.R index 6cbb67dd62..63ce50aa7d 100644 --- a/R/merge.R +++ b/R/merge.R @@ -97,26 +97,16 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL y = y[yind] } - dt <- tryCatch({ + dt = tryCatch({ y[x, nomatch=if (all.x) NA else NULL, on=by, allow.cartesian=allow.cartesian] }, - bmerge_incompatible_type_error = function(e) { - # For merge(x=DT1, y=DT2), DT1 (user's 'x') is bmerge's 'i' - # DT2 (user's 'y') is bmerge's 'x' - x_part_col_name <- e$bmerge_i_arg_col_name - x_part_type <- e$bmerge_i_arg_type - y_part_col_name <- e$bmerge_x_arg_col_name - y_part_type <- e$bmerge_x_arg_type + dt_bmerge_incompatible_type_error = function(e) { + x_part_col_name = e$bmerge_i_arg_col_name + x_part_type = e$bmerge_i_arg_type + y_part_col_name = e$bmerge_x_arg_col_name + y_part_type = e$bmerge_x_arg_type - # Use literal "x." and "y." prefixes referring to the arguments of merge() - msg <- sprintf( - "Incompatible join types: x.%s (%s) and i.%s (%s). Factor columns must join to factor or character columns.", - x_part_col_name, x_part_type, - y_part_col_name, y_part_type - ) - - # Remove call = NULL to get "Error in merge.data.table(...): " prefix. - stop(errorCondition(message = msg, class = c("datatable_merge_type_error", "data.table_error", "error", "condition"))) + stopf("Incompatible join types: x.%s (%s) and i.%s (%s). Factor columns must join to factor or character columns.", x_part_col_name, x_part_type, y_part_col_name, y_part_type, class = "dt_merge_incompatible_type_error") } ) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index c9cfe6244b..722209d32c 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21157,3 +21157,8 @@ test(2317.6, DT1[DF1, on='a', .(d = x.a + i.d)]$d, 5) test(2317.7, DT1[DF2, on='a', e := i.e]$e, 5) test(2317.8, DT1[DF2, on='a', e2 := x.a + i.e]$e2, 6) test(2317.9, DT1[DF2, on='a', .(e = x.a + i.e)]$e, 6) + +# Test for incompatible factor joins rephrased by merge.data.table(#7048) +DT1=data.table(a=factor('a')) +DT2=data.table(a=1L) +test(2318, merge(DT1, DT2, by = "a"), error = "Incompatible join types: x.a (factor) and i.a (integer). Factor columns must join to factor or character columns.") From 495a06d02b37d7cced1920011e01b3084d54e3d0 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 9 Jun 2025 13:21:40 -0700 Subject: [PATCH 14/23] terminal newline --- R/merge.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/merge.R b/R/merge.R index 63ce50aa7d..c85cde7145 100644 --- a/R/merge.R +++ b/R/merge.R @@ -133,4 +133,4 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL # retain custom classes of first argument that resulted in dispatch to this method, #1378 setattr(dt, "class", class_x) dt -} \ No newline at end of file +} From 62b7c8a75ec4299b36f55732af26beb192516de0 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 9 Jun 2025 13:25:28 -0700 Subject: [PATCH 15/23] avoid structure() --- R/bmerge.R | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/R/bmerge.R b/R/bmerge.R index 13c34cee78..0331624200 100644 --- a/R/bmerge.R +++ b/R/bmerge.R @@ -85,17 +85,15 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos } } condition_message = gettextf("Incompatible join types: %s (%s) and %s (%s). Factor columns must join to factor or character columns.", xname, x_merge_type, iname, i_merge_type) - condition = structure( - list( - message = condition_message, - call = sys.call(sys.nframe()-1L), - bmerge_x_arg_col_name = names(x)[xcol], - bmerge_x_arg_type = x_merge_type, - bmerge_i_arg_col_name = names(i)[icol], - bmerge_i_arg_type = i_merge_type - ), - class = c("dt_bmerge_incompatible_type_error", "error", "condition") + condition = list( + message = condition_message, + call = sys.call(sys.nframe() - 1L), + bmerge_x_arg_col_name = names(x)[xcol], + bmerge_x_arg_type = x_merge_type, + bmerge_i_arg_col_name = names(i)[icol], + bmerge_i_arg_type = i_merge_type ) + class(condition) = c("dt_bmerge_incompatible_type_error", "error", "condition") stop(condition) } # we check factors first to cater for the case when trying to do rolling joins on factors From 1c7d2501715e2356b41da895b1287e0b065cde20 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 9 Jun 2025 13:28:14 -0700 Subject: [PATCH 16/23] disable irrelevant linter --- .ci/.lintr.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.ci/.lintr.R b/.ci/.lintr.R index f081763b41..c4f6d0c5e1 100644 --- a/.ci/.lintr.R +++ b/.ci/.lintr.R @@ -40,6 +40,9 @@ linters = c(dt_linters, all_linters( # TODO(lintr#2442): Use this once x[ , j, by] is supported. commas_linter = NULL, commented_code_linter = NULL, + # mostly, we just use stopf() & friends, but ignore this for the + # rare cases we need plain stop (e.g. #7048) + condition_call_linter = NULL, # TODO(linter->3.2.0): Activate this. consecutive_assertion_linter = NULL, cyclocomp_linter = NULL, From 3e3004663398a6a1a9c8750aa411cfecc0ee3cbe Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 9 Jun 2025 13:29:56 -0700 Subject: [PATCH 17/23] correct prefix :) --- R/merge.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/merge.R b/R/merge.R index c85cde7145..92162cb6cf 100644 --- a/R/merge.R +++ b/R/merge.R @@ -106,7 +106,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL y_part_col_name = e$bmerge_x_arg_col_name y_part_type = e$bmerge_x_arg_type - stopf("Incompatible join types: x.%s (%s) and i.%s (%s). Factor columns must join to factor or character columns.", x_part_col_name, x_part_type, y_part_col_name, y_part_type, class = "dt_merge_incompatible_type_error") + stopf("Incompatible join types: x.%s (%s) and y.%s (%s). Factor columns must join to factor or character columns.", x_part_col_name, x_part_type, y_part_col_name, y_part_type, class = "dt_merge_incompatible_type_error") } ) From 672bdbb4f81aa673b3551262ee325ee68ecbe47c Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 9 Jun 2025 13:30:13 -0700 Subject: [PATCH 18/23] correct prefix in tests --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 722209d32c..c8bb42ae0e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21161,4 +21161,4 @@ test(2317.9, DT1[DF2, on='a', .(e = x.a + i.e)]$e, 6) # Test for incompatible factor joins rephrased by merge.data.table(#7048) DT1=data.table(a=factor('a')) DT2=data.table(a=1L) -test(2318, merge(DT1, DT2, by = "a"), error = "Incompatible join types: x.a (factor) and i.a (integer). Factor columns must join to factor or character columns.") +test(2318, merge(DT1, DT2, by = "a"), error = "Incompatible join types: x.a (factor) and y.a (integer). Factor columns must join to factor or character columns.") From fe23e002256362bb8fff9151d3c9187f6f7bcdf9 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 9 Jun 2025 13:33:55 -0700 Subject: [PATCH 19/23] style: avoid {} if not needed --- R/merge.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/R/merge.R b/R/merge.R index 92162cb6cf..5e3d533a80 100644 --- a/R/merge.R +++ b/R/merge.R @@ -97,9 +97,8 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL y = y[yind] } - dt = tryCatch({ - y[x, nomatch=if (all.x) NA else NULL, on=by, allow.cartesian=allow.cartesian] - }, + dt = tryCatch( + y[x, nomatch=if (all.x) NA else NULL, on=by, allow.cartesian=allow.cartesian], dt_bmerge_incompatible_type_error = function(e) { x_part_col_name = e$bmerge_i_arg_col_name x_part_type = e$bmerge_i_arg_type From fc67d724981fa2a6cd9547a13b286c0962dc2d03 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 9 Jun 2025 13:34:19 -0700 Subject: [PATCH 20/23] ws --- R/merge.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/merge.R b/R/merge.R index 5e3d533a80..2af03a3b49 100644 --- a/R/merge.R +++ b/R/merge.R @@ -98,7 +98,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL } dt = tryCatch( - y[x, nomatch=if (all.x) NA else NULL, on=by, allow.cartesian=allow.cartesian], + y[x, nomatch=if (all.x) NA else NULL, on=by, allow.cartesian=allow.cartesian], dt_bmerge_incompatible_type_error = function(e) { x_part_col_name = e$bmerge_i_arg_col_name x_part_type = e$bmerge_i_arg_type From 18a0662d477c7b59b0058a44005047312a74ad28 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 9 Jun 2025 13:37:41 -0700 Subject: [PATCH 21/23] Apply prefix outside translation to get identical messages in gettext --- R/merge.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/merge.R b/R/merge.R index 2af03a3b49..a4d41b5031 100644 --- a/R/merge.R +++ b/R/merge.R @@ -100,12 +100,12 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL dt = tryCatch( y[x, nomatch=if (all.x) NA else NULL, on=by, allow.cartesian=allow.cartesian], dt_bmerge_incompatible_type_error = function(e) { - x_part_col_name = e$bmerge_i_arg_col_name + x_part_col_name = paste0("x.", e$bmerge_i_arg_col_name) x_part_type = e$bmerge_i_arg_type - y_part_col_name = e$bmerge_x_arg_col_name + y_part_col_name = paste0("y.", e$bmerge_x_arg_col_name) y_part_type = e$bmerge_x_arg_type - stopf("Incompatible join types: x.%s (%s) and y.%s (%s). Factor columns must join to factor or character columns.", x_part_col_name, x_part_type, y_part_col_name, y_part_type, class = "dt_merge_incompatible_type_error") + stopf("Incompatible join types: %s (%s) and %s (%s). Factor columns must join to factor or character columns.", x_part_col_name, x_part_type, y_part_col_name, y_part_type, class = "dt_merge_incompatible_type_error") } ) From ae572ce596c38e2c909281da42ceab47d1547b90 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 9 Jun 2025 13:42:19 -0700 Subject: [PATCH 22/23] restore comment --- R/merge.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/merge.R b/R/merge.R index a4d41b5031..37c1bd6641 100644 --- a/R/merge.R +++ b/R/merge.R @@ -109,7 +109,8 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL } ) - if (all.y && nrow(y)) { + if (all.y && nrow(y)) { # If y does not have any rows, no need to proceed + # Perhaps not very commonly used, so not a huge deal that the join is redone here. missingyidx = y[!x, which=TRUE, on=by, allow.cartesian=allow.cartesian] if (length(missingyidx)) dt = rbind(dt, y[missingyidx], use.names=FALSE, fill=TRUE, ignore.attr=TRUE) } From 4cbe8b9d11f7445c4be48a21319850a0e08f4d46 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 9 Jun 2025 13:42:59 -0700 Subject: [PATCH 23/23] drop custom class in merge() error for now --- R/merge.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/merge.R b/R/merge.R index 37c1bd6641..cc15b830bf 100644 --- a/R/merge.R +++ b/R/merge.R @@ -105,7 +105,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL y_part_col_name = paste0("y.", e$bmerge_x_arg_col_name) y_part_type = e$bmerge_x_arg_type - stopf("Incompatible join types: %s (%s) and %s (%s). Factor columns must join to factor or character columns.", x_part_col_name, x_part_type, y_part_col_name, y_part_type, class = "dt_merge_incompatible_type_error") + stopf("Incompatible join types: %s (%s) and %s (%s). Factor columns must join to factor or character columns.", x_part_col_name, x_part_type, y_part_col_name, y_part_type) } )