diff --git a/NEWS.md b/NEWS.md index 81dd36e1de..d418908b32 100644 --- a/NEWS.md +++ b/NEWS.md @@ -78,6 +78,8 @@ 17. A data.table with a column of class `vctrs_list_of` (from package {vctrs}) prints as expected, [#5948](https://github.com/Rdatatable/data.table/issues/5948). Before, they could be printed messily, e.g. printing every entry in a nested data.frame. Thanks @jesse-smith for the report, @DavisVaughan and @r2evans for contributing, and @MichaelChirico for the PR. +18. Fixed incorrect sorting of merges where the first column of a key is a factor with non-`sort()`-ed levels (e.g. `factor(1:2, 2:1)` and it is joined to a character column, [#5361](https://github.com/Rdatatable/data.table/issues/5361). Thanks to @gbrunick for the report and Benjamin Schwendinger for the fix. + ### NOTES 1. Continued work to remove non-API C functions, [#6180](https://github.com/Rdatatable/data.table/issues/6180). Thanks Ivan Krylov for the PRs and for writing a clear and concise guide about the R API: https://aitap.codeberg.page/R-api/. diff --git a/R/data.table.R b/R/data.table.R index ef8969a7f4..9fd092beda 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1348,21 +1348,8 @@ replace_dot_alias = function(e) { ans[icolsAns] = .Call(CsubsetDT, i, ii, icols) ans[xcolsAns] = .Call(CsubsetDT, x, irows, xcols) setattr(ans, "names", ansvars) - if (haskey(x)) { - keylen = which.first(!key(x) %chin% ansvars)-1L - if (is.na(keylen)) keylen = length(key(x)) - len = length(rightcols) - # fix for #1268, #1704, #1766 and #1823 - chk = if (len && !missing(on)) !identical(head(key(x), len), names(on)) else FALSE - if ( (keylen>len || chk) && !.Call(CisOrderedSubset, irows, nrow(x))) { - keylen = if (!chk) len else 0L # fix for #1268 - } - ## check key on i as well! - ichk = is.data.table(i) && haskey(i) && - identical(head(key(i), length(leftcols)), names_i[leftcols]) # i has the correct key, #3061 - if (keylen && (ichk || is.logical(i) || (.Call(CisOrderedSubset, irows, nrow(x)) && ((roll == FALSE) || length(irows) == 1L)))) # see #1010. don't set key when i has no key, but irows is ordered and roll != FALSE - setattr(ans,"sorted",head(key(x),keylen)) - } + # NB: could be NULL + setattr(ans, "sorted", .join_result_key(x, i, ans, if (!missing(on)) names(on), ansvars, leftcols, rightcols, names_i, irows, roll)) setattr(ans, "class", class(x)) # retain class that inherits from data.table, #64 setattr(ans, "row.names", .set_row_names(length(ans[[1L]]))) setalloccol(ans) @@ -2034,6 +2021,48 @@ replace_dot_alias = function(e) { setalloccol(ans) # TODO: overallocate in dogroups in the first place and remove this line } +# can the specified merge of x and i be marked as sorted? return the columns for which this is true, otherwise NULL +.join_result_key <- function(x, i, ans, on_lhs, ansvars, leftcols, rightcols, names_i, irows, roll) { + x_key <- key(x) + if (is.null(x_key)) + return(NULL) + + key_length = which.first(!x_key %chin% ansvars) - 1L + if (is.na(key_length)) + key_length = length(x_key) + + rhs_length = length(rightcols) + # fix for #1268, #1704, #1766 and #1823 + chk = rhs_length && !is.null(on_lhs) && !identical(head(x_key, rhs_length), on_lhs) + if ( (key_length > rhs_length || chk) && !.Call(CisOrderedSubset, irows, nrow(x))) { + key_length = if (chk) 0L else rhs_length # fix for #1268 + } + + if (!key_length) + return(NULL) + + # i has the correct key, #3061 + if (identical(head(key(i), length(leftcols)), names_i[leftcols])) + return(head(x_key, key_length)) + + if (!.Call(CisOrderedSubset, irows, nrow(x))) + return(NULL) + + # see #1010. don't set key when i has no key, but irows is ordered and !roll + if (roll && length(irows) != 1L) + return(NULL) + + new_key <- head(x_key, key_length) + + #5361 merging on keyed factor with character, check if resulting character is really sorted + if (identical(vapply_1c(.shallow(i, leftcols), typeof), vapply_1c(.shallow(x, rightcols), typeof))) + return(new_key) + + if (!is.sorted(ans, by=new_key)) + return(NULL) + new_key +} + # What's the name of the top-level call in 'j'? # NB: earlier, we used 'as.character()' but that fails for closures/builtins (#6026). root_name = function(jsub) if (is.call(jsub)) paste(deparse(jsub[[1L]]), collapse = " ") else "" diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 989fcb0ec7..cf2564e558 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -7077,6 +7077,50 @@ test(1483.3, merge(x,y,by="country",all=TRUE), data.table(country=factor(c("US", setkey(y) test(1483.4, y[x], data.table(country="US", key="country")) +# 5361 merge on character and factor should only have key(x) if result is really sorted +lett = c("a", "b", "c") +rlet = c("c", "b", "a") +x = data.table(i=rlet) +y = data.table(i=factor(lett, levels=rlet), key="i") +test(1483.51, x[y, on="i"], x) +test(1483.52, y[x, on="i"], x) +test(1483.53, merge(x, y, by="i"), data.table(i=lett, key="i")) +test(1483.54, merge(y, x, by="i"), data.table(i=lett, key="i")) +x = data.table(i1=1:3, i2=rlet) +y = data.table(i1=1:3, i2=factor(lett, levels=rlet), key=c("i1", "i2")) +test(1483.55, x[y, on=c("i1", "i2")], data.table(i1=1:3, i2=lett)) +test(1483.56, y[x, on=c("i1", "i2")], x) +test(1483.57, merge(x, y, by=c("i1", "i2")), data.table(i1=2L, i2="b", key=c("i1", "i2"))) +test(1483.58, merge(y, x, by=c("i1", "i2")), data.table(i1=2L, i2="b", key=c("i1", "i2"))) + +x = data.table(i=rlet, key="i") +y = data.table(i=factor(lett, levels=rlet)) +test(1483.61, x[y, on="i"], x) +test(1483.62, y[x, on="i"], data.table(i=lett)) +test(1483.63, merge(x, y, by="i"), data.table(i=lett, key="i")) +test(1483.64, merge(y, x, by="i"), data.table(i=lett, key="i")) +x = data.table(i1=1:3, i2=rlet, key=c("i1", "i2")) +y = data.table(i1=1:3, i2=factor(lett, levels=rlet)) +test(1483.65, x[y, on=c("i1", "i2")], data.table(i1=1:3, i2=lett)) +test(1483.66, y[x, on=c("i1", "i2")], data.table(i1=1:3, i2=rlet)) +test(1483.67, merge(x, y, by=c("i1", "i2")), data.table(i1=2L, i2="b", key=c("i1", "i2"))) +test(1483.68, merge(y, x, by=c("i1", "i2")), data.table(i1=2L, i2="b", key=c("i1", "i2"))) + +x = data.table(i=rlet, a=rlet) +y = data.table(i=factor(lett, levels=rlet), b=lett, key="i") +test(1483.71, x[y, on="i"], data.table(i=rlet, a=rlet, b=rlet)) +test(1483.72, y[x, on="i"], data.table(i=rlet, b=rlet, a=rlet)) +test(1483.73, merge(x, y, by="i"), data.table(i=lett, a=lett, b=lett, key="i")) +test(1483.74, merge(y, x, by="i"), data.table(i=lett, b=lett, a=lett, key="i")) + +some_letters <- c("c", "b", "a") +some_more_letters <- rep(c("a", "b", "c"), 2L) +dt1 <- data.table(x = some_letters, y=1:3) +dt2 <- data.table(x = factor(some_more_letters, levels=some_letters), z=1:6, key=c("x", "z")) +dt3 <- merge(dt1, dt2, by="x") +test(1483.81, key(dt3), "x") +test(1483.82, nrow(dt3[x %in% "c", ]), 2L) + # NULL items should be removed when making data.table from list, #842 # Original fix for #842 added a branch in as.data.table.list() using point() # Then PR#3471 moved logic from data.table() into as.data.table.list() and now removes NULL items up front, so longer need for the branch