Skip to content

Commit c806849

Browse files
Fix incorrect keying after merge of keyed, non-alphabetic factor and character columns (#5362)
* add fix * add test * add more tests * add NEWS * typo * sentence structure * state bug more precisely * unwield lengthy if with ws * more tests * extend NEWS for mirror casse * update NEWS * vapply on single columns instead of whole subset * use .shallow * suggested NEWS wording * add OPs original example more exactly to the regression test * add some tests of multiple join columns * avoid using column x in table x * attempt to refactor into huge helper (🤞) * trailing ws * fix (?) tests * need to pass 'ans' too * don't reuse overloaded name 'let' * typo * remove apparently vestigial check --------- Co-authored-by: Michael Chirico <[email protected]> Co-authored-by: Michael Chirico <[email protected]>
1 parent c16f320 commit c806849

File tree

3 files changed

+90
-15
lines changed

3 files changed

+90
-15
lines changed

NEWS.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,8 @@
7878
7979
17. A data.table with a column of class `vctrs_list_of` (from package {vctrs}) prints as expected, [#5948](https://github.com/Rdatatable/data.table/issues/5948). Before, they could be printed messily, e.g. printing every entry in a nested data.frame. Thanks @jesse-smith for the report, @DavisVaughan and @r2evans for contributing, and @MichaelChirico for the PR.
8080
81+
18. Fixed incorrect sorting of merges where the first column of a key is a factor with non-`sort()`-ed levels (e.g. `factor(1:2, 2:1)` and it is joined to a character column, [#5361](https://github.com/Rdatatable/data.table/issues/5361). Thanks to @gbrunick for the report and Benjamin Schwendinger for the fix.
82+
8183
### NOTES
8284
8385
1. Continued work to remove non-API C functions, [#6180](https://github.com/Rdatatable/data.table/issues/6180). Thanks Ivan Krylov for the PRs and for writing a clear and concise guide about the R API: https://aitap.codeberg.page/R-api/.

R/data.table.R

Lines changed: 44 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1348,21 +1348,8 @@ replace_dot_alias = function(e) {
13481348
ans[icolsAns] = .Call(CsubsetDT, i, ii, icols)
13491349
ans[xcolsAns] = .Call(CsubsetDT, x, irows, xcols)
13501350
setattr(ans, "names", ansvars)
1351-
if (haskey(x)) {
1352-
keylen = which.first(!key(x) %chin% ansvars)-1L
1353-
if (is.na(keylen)) keylen = length(key(x))
1354-
len = length(rightcols)
1355-
# fix for #1268, #1704, #1766 and #1823
1356-
chk = if (len && !missing(on)) !identical(head(key(x), len), names(on)) else FALSE
1357-
if ( (keylen>len || chk) && !.Call(CisOrderedSubset, irows, nrow(x))) {
1358-
keylen = if (!chk) len else 0L # fix for #1268
1359-
}
1360-
## check key on i as well!
1361-
ichk = is.data.table(i) && haskey(i) &&
1362-
identical(head(key(i), length(leftcols)), names_i[leftcols]) # i has the correct key, #3061
1363-
if (keylen && (ichk || is.logical(i) || (.Call(CisOrderedSubset, irows, nrow(x)) && ((roll == FALSE) || length(irows) == 1L)))) # see #1010. don't set key when i has no key, but irows is ordered and roll != FALSE
1364-
setattr(ans,"sorted",head(key(x),keylen))
1365-
}
1351+
# NB: could be NULL
1352+
setattr(ans, "sorted", .join_result_key(x, i, ans, if (!missing(on)) names(on), ansvars, leftcols, rightcols, names_i, irows, roll))
13661353
setattr(ans, "class", class(x)) # retain class that inherits from data.table, #64
13671354
setattr(ans, "row.names", .set_row_names(length(ans[[1L]])))
13681355
setalloccol(ans)
@@ -2034,6 +2021,48 @@ replace_dot_alias = function(e) {
20342021
setalloccol(ans) # TODO: overallocate in dogroups in the first place and remove this line
20352022
}
20362023

2024+
# can the specified merge of x and i be marked as sorted? return the columns for which this is true, otherwise NULL
2025+
.join_result_key <- function(x, i, ans, on_lhs, ansvars, leftcols, rightcols, names_i, irows, roll) {
2026+
x_key <- key(x)
2027+
if (is.null(x_key))
2028+
return(NULL)
2029+
2030+
key_length = which.first(!x_key %chin% ansvars) - 1L
2031+
if (is.na(key_length))
2032+
key_length = length(x_key)
2033+
2034+
rhs_length = length(rightcols)
2035+
# fix for #1268, #1704, #1766 and #1823
2036+
chk = rhs_length && !is.null(on_lhs) && !identical(head(x_key, rhs_length), on_lhs)
2037+
if ( (key_length > rhs_length || chk) && !.Call(CisOrderedSubset, irows, nrow(x))) {
2038+
key_length = if (chk) 0L else rhs_length # fix for #1268
2039+
}
2040+
2041+
if (!key_length)
2042+
return(NULL)
2043+
2044+
# i has the correct key, #3061
2045+
if (identical(head(key(i), length(leftcols)), names_i[leftcols]))
2046+
return(head(x_key, key_length))
2047+
2048+
if (!.Call(CisOrderedSubset, irows, nrow(x)))
2049+
return(NULL)
2050+
2051+
# see #1010. don't set key when i has no key, but irows is ordered and !roll
2052+
if (roll && length(irows) != 1L)
2053+
return(NULL)
2054+
2055+
new_key <- head(x_key, key_length)
2056+
2057+
#5361 merging on keyed factor with character, check if resulting character is really sorted
2058+
if (identical(vapply_1c(.shallow(i, leftcols), typeof), vapply_1c(.shallow(x, rightcols), typeof)))
2059+
return(new_key)
2060+
2061+
if (!is.sorted(ans, by=new_key))
2062+
return(NULL)
2063+
new_key
2064+
}
2065+
20372066
# What's the name of the top-level call in 'j'?
20382067
# NB: earlier, we used 'as.character()' but that fails for closures/builtins (#6026).
20392068
root_name = function(jsub) if (is.call(jsub)) paste(deparse(jsub[[1L]]), collapse = " ") else ""

inst/tests/tests.Rraw

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7077,6 +7077,50 @@ test(1483.3, merge(x,y,by="country",all=TRUE), data.table(country=factor(c("US",
70777077
setkey(y)
70787078
test(1483.4, y[x], data.table(country="US", key="country"))
70797079

7080+
# 5361 merge on character and factor should only have key(x) if result is really sorted
7081+
lett = c("a", "b", "c")
7082+
rlet = c("c", "b", "a")
7083+
x = data.table(i=rlet)
7084+
y = data.table(i=factor(lett, levels=rlet), key="i")
7085+
test(1483.51, x[y, on="i"], x)
7086+
test(1483.52, y[x, on="i"], x)
7087+
test(1483.53, merge(x, y, by="i"), data.table(i=lett, key="i"))
7088+
test(1483.54, merge(y, x, by="i"), data.table(i=lett, key="i"))
7089+
x = data.table(i1=1:3, i2=rlet)
7090+
y = data.table(i1=1:3, i2=factor(lett, levels=rlet), key=c("i1", "i2"))
7091+
test(1483.55, x[y, on=c("i1", "i2")], data.table(i1=1:3, i2=lett))
7092+
test(1483.56, y[x, on=c("i1", "i2")], x)
7093+
test(1483.57, merge(x, y, by=c("i1", "i2")), data.table(i1=2L, i2="b", key=c("i1", "i2")))
7094+
test(1483.58, merge(y, x, by=c("i1", "i2")), data.table(i1=2L, i2="b", key=c("i1", "i2")))
7095+
7096+
x = data.table(i=rlet, key="i")
7097+
y = data.table(i=factor(lett, levels=rlet))
7098+
test(1483.61, x[y, on="i"], x)
7099+
test(1483.62, y[x, on="i"], data.table(i=lett))
7100+
test(1483.63, merge(x, y, by="i"), data.table(i=lett, key="i"))
7101+
test(1483.64, merge(y, x, by="i"), data.table(i=lett, key="i"))
7102+
x = data.table(i1=1:3, i2=rlet, key=c("i1", "i2"))
7103+
y = data.table(i1=1:3, i2=factor(lett, levels=rlet))
7104+
test(1483.65, x[y, on=c("i1", "i2")], data.table(i1=1:3, i2=lett))
7105+
test(1483.66, y[x, on=c("i1", "i2")], data.table(i1=1:3, i2=rlet))
7106+
test(1483.67, merge(x, y, by=c("i1", "i2")), data.table(i1=2L, i2="b", key=c("i1", "i2")))
7107+
test(1483.68, merge(y, x, by=c("i1", "i2")), data.table(i1=2L, i2="b", key=c("i1", "i2")))
7108+
7109+
x = data.table(i=rlet, a=rlet)
7110+
y = data.table(i=factor(lett, levels=rlet), b=lett, key="i")
7111+
test(1483.71, x[y, on="i"], data.table(i=rlet, a=rlet, b=rlet))
7112+
test(1483.72, y[x, on="i"], data.table(i=rlet, b=rlet, a=rlet))
7113+
test(1483.73, merge(x, y, by="i"), data.table(i=lett, a=lett, b=lett, key="i"))
7114+
test(1483.74, merge(y, x, by="i"), data.table(i=lett, b=lett, a=lett, key="i"))
7115+
7116+
some_letters <- c("c", "b", "a")
7117+
some_more_letters <- rep(c("a", "b", "c"), 2L)
7118+
dt1 <- data.table(x = some_letters, y=1:3)
7119+
dt2 <- data.table(x = factor(some_more_letters, levels=some_letters), z=1:6, key=c("x", "z"))
7120+
dt3 <- merge(dt1, dt2, by="x")
7121+
test(1483.81, key(dt3), "x")
7122+
test(1483.82, nrow(dt3[x %in% "c", ]), 2L)
7123+
70807124
# NULL items should be removed when making data.table from list, #842
70817125
# Original fix for #842 added a branch in as.data.table.list() using point()
70827126
# Then PR#3471 moved logic from data.table() into as.data.table.list() and now removes NULL items up front, so longer need for the branch

0 commit comments

Comments
 (0)