Skip to content

Commit afb99aa

Browse files
MarkusBonschmattdowle
authored andcommitted
Fixing bug in 'on' with complex variable names (#3094)
1 parent a6ac844 commit afb99aa

File tree

3 files changed

+140
-44
lines changed

3 files changed

+140
-44
lines changed

NEWS.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313

1414
1. Providing an `i` subset expression when attempting to delete a column correctly failed with helpful error, but when the column was missing too created a new column full of `NULL` values, [#3089](https://github.com/Rdatatable/data.table/issues/3089). Thanks to Michael Chirico for reporting.
1515

16+
2. Column names that look like expressions (e.g. `"a<=colB"`) caused an error when used in `on=` even when wrapped with backticks, [#3092](https://github.com/Rdatatable/data.table/issues/3092). Additionally, `on=` now supports white spaces around operators; e.g. `on = "colA == colB"`. Thanks to @mt1022 for reporting and to @MarkusBonsch for fixing.
17+
1618
#### NOTES
1719

1820
1. When data.table first loads it now checks the DLL's MD5. This is to detect installation issues on Windows when you upgrade and i) the DLL is in use by another R session and ii) the CRAN source version > CRAN binary binary which happens just after a new release (R prompts users to install from source until the CRAN binary is available). This situation can lead to a state where the package's new R code calls old C code in the old DLL; [R#17478](https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17478), [#3056](https://github.com/Rdatatable/data.table/issues/3056). This broken state can persist until, hopefully, you experience a strange error caused by the mismatch. Otherwise, wrong results may occur silently. This situation applies to any R package with compiled code not just data.table, is Windows-only, and is long-standing. It has only recently been understood as it typically only occurs during the few days after each new release until binaries are available on CRAN. Thanks to Gabor Csardi for the suggestion to use `tools::checkMD5sums()`.

R/data.table.R

Lines changed: 101 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -488,40 +488,7 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
488488
}
489489
if (!missing(on)) {
490490
# on = .() is now possible, #1257
491-
parse_on <- function(onsub) {
492-
ops = c("==", "<=", "<", ">=", ">", "!=")
493-
pat = paste0("(", ops, ")", collapse="|")
494-
if (is.call(onsub) && onsub[[1L]] == "eval") {
495-
onsub = eval(onsub[[2L]], parent.frame(2L), parent.frame(2L))
496-
if (is.call(onsub) && onsub[[1L]] == "eval") onsub = onsub[[2L]]
497-
}
498-
if (is.call(onsub) && as.character(onsub[[1L]]) %in% c("list", ".")) {
499-
spat = paste0("[ ]+(", pat, ")[ ]+")
500-
onsub = lapply(as.list(onsub)[-1L], function(x) gsub(spat, "\\1", deparse(x, width.cutoff=500L)))
501-
onsub = as.call(c(quote(c), onsub))
502-
}
503-
on = eval(onsub, parent.frame(2L), parent.frame(2L))
504-
if (!is.character(on))
505-
stop("'on' argument should be a named atomic vector of column names indicating which columns in 'i' should be joined with which columns in 'x'.")
506-
this_op = regmatches(on, gregexpr(pat, on))
507-
idx = (vapply(this_op, length, 0L) == 0L)
508-
this_op[idx] = "=="
509-
this_op = unlist(this_op, use.names=FALSE)
510-
idx_op = match(this_op, ops, nomatch=0L)
511-
if (any(idx_op %in% c(0L, 6L)))
512-
stop("Invalid operators ", paste(this_op[idx_op==0L], collapse=","), ". Only allowed operators are ", paste(ops[1:5], collapse=""), ".")
513-
if (is.null(names(on))) {
514-
on[idx] = if (isnull_inames) paste(on[idx], paste0("V", seq_len(sum(idx))), sep="==") else paste(on[idx], on[idx], sep="==")
515-
} else {
516-
on[idx] = paste(names(on)[idx], on[idx], sep="==")
517-
}
518-
split = tstrsplit(on, paste0("[ ]*", pat, "[ ]*"))
519-
on = setattr(split[[2L]], 'names', split[[1L]])
520-
if (length(empty_idx <- which(names(on) == "")))
521-
names(on)[empty_idx] = on[empty_idx]
522-
list(on = on, ops = idx_op)
523-
}
524-
on_ops = parse_on(substitute(on))
491+
on_ops = .parse_on(substitute(on), isnull_inames)
525492
on = on_ops[[1L]]
526493
ops = on_ops[[2L]]
527494
# TODO: collect all '==' ops first to speeden up Cnestedid
@@ -3056,3 +3023,103 @@ isReallyReal <- function(x) {
30563023
)
30573024
)
30583025
}
3026+
3027+
3028+
.parse_on <- function(onsub, isnull_inames) {
3029+
## helper that takes the 'on' string(s) and extracts comparison operators and column names from it.
3030+
#' @param onsub the substituted on
3031+
#' @param isnull_inames bool; TRUE if i has no names.
3032+
#' @return List with two entries:
3033+
#' 'on' : character vector providing the column names for the join.
3034+
#' Names correspond to columns in x, entries correspond to columns in i
3035+
#' 'ops': integer vector. Gives the indices of the operators that connect the columns in x and i.
3036+
ops = c("==", "<=", "<", ">=", ">", "!=")
3037+
pat = paste0("(", ops, ")", collapse="|")
3038+
if (is.call(onsub) && onsub[[1L]] == "eval") {
3039+
onsub = eval(onsub[[2L]], parent.frame(2L), parent.frame(2L))
3040+
if (is.call(onsub) && onsub[[1L]] == "eval") { onsub = onsub[[2L]] }
3041+
}
3042+
if (is.call(onsub) && as.character(onsub[[1L]]) %in% c("list", ".")) {
3043+
spat = paste0("[ ]+(", pat, ")[ ]+")
3044+
onsub = lapply(as.list(onsub)[-1L], function(x) gsub(spat, "\\1", deparse(x, width.cutoff=500L)))
3045+
onsub = as.call(c(quote(c), onsub))
3046+
}
3047+
on = eval(onsub, parent.frame(2L), parent.frame(2L))
3048+
if (!is.character(on))
3049+
stop("'on' argument should be a named atomic vector of column names indicating which columns in 'i' should be joined with which columns in 'x'.")
3050+
## extract the operators and potential variable names from 'on'.
3051+
## split at backticks to take care about variable names like `col1<=`.
3052+
pieces <- strsplit(on, "(?=[`])", perl = TRUE)
3053+
xCols <- character(length(on))
3054+
## if 'on' is named, the names are the xCols for sure
3055+
if(!is.null(names(on))){
3056+
xCols <- names(on)
3057+
}
3058+
iCols <- character(length(on))
3059+
operators <- character(length(on))
3060+
## loop over the elements and extract operators and column names.
3061+
for(i in seq_along(pieces)){
3062+
thisCols <- character(0)
3063+
thisOperators <- character(0)
3064+
j <- 1
3065+
while(j <= length(pieces[[i]])){
3066+
if(pieces[[i]][j] == "`"){
3067+
## start of a variable name with backtick.
3068+
thisCols <- c(thisCols, pieces[[i]][j+1])
3069+
j <- j+3 # +1 is the column name, +2 is delimiting "`", +3 is next relevant entry.`
3070+
} else {
3071+
## no backtick
3072+
## search for operators
3073+
thisOperators <- c(thisOperators,
3074+
unlist(regmatches(pieces[[i]][j], gregexpr(pat, pieces[[i]][j])),
3075+
use.names = FALSE))
3076+
## search for column names
3077+
thisCols <- c(thisCols, trimws(strsplit(pieces[[i]][j], pat)[[1]]))
3078+
## there can be empty string column names because of trimws, remove them
3079+
thisCols <- thisCols[thisCols != ""]
3080+
j <- j+1
3081+
}
3082+
}
3083+
if (length(thisOperators) == 0) {
3084+
## if no operator is given, it must be ==
3085+
operators[i] <- "=="
3086+
} else if (length(thisOperators) == 1) {
3087+
operators[i] <- thisOperators
3088+
} else {
3089+
## multiple operators found in one 'on' part. Something is wrong.
3090+
stop("Found more than one operator in one 'on' statement: ", on[i], ". Please specify a single operator.")
3091+
}
3092+
if (length(thisCols) == 2){
3093+
## two column names found, first is xCol, second is iCol for sure
3094+
xCols[i] <- thisCols[1]
3095+
iCols[i] <- thisCols[2]
3096+
} else if (length(thisCols) == 1){
3097+
## a single column name found. Can mean different things
3098+
if(xCols[i] != ""){
3099+
## xCol is given by names(on). thisCols must be iCol
3100+
iCols[i] <- thisCols[1]
3101+
} else if (isnull_inames){
3102+
## i has no names. It will be given the names V1, V2, ... automatically.
3103+
## The single column name is the x column. It will match to the ith column in i.
3104+
xCols[i] <- thisCols[1]
3105+
iCols[i] <- paste0("V", i)
3106+
} else {
3107+
## i has names and one single column name is given by on.
3108+
## This means that xCol and iCol have the same name.
3109+
xCols[i] <- thisCols[1]
3110+
iCols[i] <- thisCols[1]
3111+
}
3112+
} else if (length(thisCols) == 0){
3113+
stop("'on' contains no column name: ", on[i], ". Each 'on' clause must contain one or two column names.")
3114+
} else {
3115+
stop("'on' contains more than 2 column names: ", on[i], ". Each 'on' clause must contain one or two column names.")
3116+
}
3117+
}
3118+
idx_op = match(operators, ops, nomatch=0L)
3119+
if (any(idx_op %in% c(0L, 6L)))
3120+
stop("Invalid operators ", paste(operators[idx_op %in% c(0L, 6L)], collapse=","), ". Only allowed operators are ", paste(ops[1:5], collapse=""), ".")
3121+
## the final on will contain the xCol as name, the iCol as value
3122+
on <- iCols
3123+
names(on) <- xCols
3124+
return(list(on = on, ops = idx_op))
3125+
}

inst/tests/tests.Rraw

Lines changed: 37 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -12278,28 +12278,55 @@ DT = data.table(A=1:5)
1227812278
test(1947.1, DT[A<0, c('A','B'):=.(NULL, A)], error="When deleting columns, i should not be provided")
1227912279
test(1947.2, DT, data.table(A=1:5))
1228012280

12281+
## tests for backticks and spaces in column names of on=, #2931
12282+
DT <- data.table(id = 1:3, `counts(a>=0)` = 1:3, sameName = 1:3)
12283+
i <- data.table(idi = 1:3, ` weirdName>=` = 1:3, sameName = 1:3)
12284+
## test white spaces around operator
12285+
test(1948.1, DT[i, on = "id >= idi"], DT[i, on = "id>=idi"])
12286+
test(1948.2, DT[i, on = "id>= idi"], DT[i, on = "id>=idi"])
12287+
test(1948.3, DT[i, on = "id >=idi"], DT[i, on = "id>=idi"])
12288+
## test column names containing operators
12289+
test(1948.4, setnames(DT[i, on = "id>=` weirdName>=`"], c("id","counts(a>=0)", "sameName", " weirdName>=", "i.sameName")),
12290+
DT[i, on = "id>=idi"])
12291+
test(1948.5, setnames(DT[i, on = "id>=` weirdName>=`"], c("id","counts(a>=0)", "sameName", " weirdName>=", "i.sameName")),
12292+
DT[i, on = "id>=idi"])
12293+
test(1948.6, setnames(DT[i, on = "id >= ` weirdName>=`"], c("id","counts(a>=0)", "sameName", " weirdName>=", "i.sameName")),
12294+
DT[i, on = "id>=idi"])
12295+
test(1948.7, setnames(DT[i, on = "`counts(a>=0)`==` weirdName>=`"], c("id","counts(a>=0)", "sameName", " weirdName>=", "i.sameName")),
12296+
DT[i, on = "id==idi"])
12297+
## mixed example
12298+
test(1948.8, DT[i, on = c( id = "idi", "sameName", "`counts(a>=0)`==` weirdName>=`")], DT[i, on = "id==idi", c("id", "counts(a>=0)", "sameName")])
12299+
## testing 'eval' in on clause
12300+
test(1948.9, DT[i, on = eval(eval("id<=idi"))], DT[i, on = "id<=idi"])
12301+
## testing for errors
12302+
test(1948.11, DT[i, on = ""], error = "'on' contains no column name: . Each 'on' clause must contain one or two column names.")
12303+
test(1948.12, DT[i, on = "id>=idi>=1"], error = "Found more than one operator in one 'on' statement: id>=idi>=1. Please specify a single operator.")
12304+
test(1948.13, DT[i, on = "`id``idi`<=id"], error = "'on' contains more than 2 column names: `id``idi`<=id. Each 'on' clause must contain one or two column names.")
12305+
test(1948.14, DT[i, on = "id != idi"], error = "Invalid operators !=. Only allowed operators are ==<=<>=>.")
12306+
test(1948.15, DT[i, on = 1L], error = "'on' argument should be a named atomic vector of column names indicating which columns in 'i' should be joined with which columns in 'x'.")
12307+
1228112308
# helpful error when on= is provided but not i, rather than silently ignoring on=
12282-
test(1948.1, DT[,,on=A], error="When i and j are both missing, no other argument should be used.")
12283-
test(1948.2, DT[,1,on=A], error="i must be provided when on= is provided")
12284-
test(1948.3, DT[1,,with=FALSE], error="j must be provided when with=FALSE")
12309+
test(1949.1, DT[,,on=A], error="When i and j are both missing, no other argument should be used.")
12310+
test(1949.2, DT[,1,on=A], error="i must be provided when on= is provided")
12311+
test(1949.3, DT[1,,with=FALSE], error="j must be provided when with=FALSE")
1228512312

1228612313
if (test_bit64) {
1228712314
# explicit coverage of 2-column real case in uniqlist. Keeps coming up in codecov checks in PRs that don't touch uniqlist.c
1228812315
DT = data.table(id=c("A","A","B","B","B"), v=as.integer64(c(1,2,3,3,4)))
12289-
test(1949, uniqlist(DT), INT(1,2,3,5))
12316+
test(1950, uniqlist(DT), INT(1,2,3,5))
1229012317
}
1229112318

1229212319
# allow nomatch=NULL to work same as nomatch=0L, #857
1229312320
d1 = data.table(a=1:3, b=2:4)
1229412321
d2 = data.table(a=2:4, b=3:5)
12295-
test(1950.1, d1[d2, on="a", nomatch=NULL], d1[d2, on="a", nomatch=0L])
12296-
test(1950.2, d1[d2, on="b", nomatch=NULL], d1[d2, on="b", nomatch=0L])
12297-
test(1950.3, d1[d2, on=c("a","b"), nomatch=NULL], d1[d2, on=c("a","b"), nomatch=0L])
12298-
test(1950.4, d1[d2, nomatch=3], error="nomatch= must be either NA or NULL .or 0 for backwards compatibility")
12322+
test(1951.1, d1[d2, on="a", nomatch=NULL], d1[d2, on="a", nomatch=0L])
12323+
test(1951.2, d1[d2, on="b", nomatch=NULL], d1[d2, on="b", nomatch=0L])
12324+
test(1951.3, d1[d2, on=c("a","b"), nomatch=NULL], d1[d2, on=c("a","b"), nomatch=0L])
12325+
test(1951.4, d1[d2, nomatch=3], error="nomatch= must be either NA or NULL .or 0 for backwards compatibility")
1229912326

1230012327
# coverage of which= checks
12301-
test(1951.1, d1[a==2, which=3], error="which= must be a logical vector length 1. Either FALSE, TRUE or NA.")
12302-
test(1951.2, d1[a==2, 2, which=TRUE], error="which==TRUE.*but j is also supplied")
12328+
test(1952.1, d1[a==2, which=3], error="which= must be a logical vector length 1. Either FALSE, TRUE or NA.")
12329+
test(1952.2, d1[a==2, 2, which=TRUE], error="which==TRUE.*but j is also supplied")
1230312330

1230412331

1230512332
###################################

0 commit comments

Comments
 (0)