Skip to content

Commit eee5d43

Browse files
committed
fixed #6556
1 parent e4b0bbb commit eee5d43

File tree

3 files changed

+53
-34
lines changed

3 files changed

+53
-34
lines changed

NEWS.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1013,4 +1013,10 @@ rowwiseDT(
10131013

10141014
20. Some clarity is added to `?GForce` for the case when subtle changes to `j` produce different results because of differences in locale. Because `data.table` _always_ uses the "C" locale, small changes to queries which activate/deactivate GForce might cause confusingly different results when sorting is involved, [#5331](https://github.com/Rdatatable/data.table/issues/5331). The inspirational example compared `DT[, .(max(a), max(b)), by=grp]` and `DT[, .(max(a), max(tolower(b))), by=grp]` -- in the latter case, GForce is deactivated owing to the _ad-hoc_ column, so the result for `max(a)` might differ for the two queries. An example is added to `?GForce`. As always, there are several options to guarantee consistency, for example (1) use namespace qualification to deactivate GForce: `DT[, .(base::max(a), base::max(b)), by=grp]`; (2) turn off all optimizations with `options(datatable.optimize = 0)`; or (3) set your R session to always sort in C locale with `Sys.setlocale("LC_COLLATE", "C")` (or temporarily with e.g. `withr::with_locale()`). Thanks @markseeto for the example and @michaelchirico for the improved documentation.
10151015

1016+
merge() now provides improved error handling for invalid column names in the by argument. When performing a join, the error messages explicitly identify the missing columns in both x and y, ensuring clarity for users. Fixes #6556. Thanks @venom1204 for the PR.
1017+
1018+
1019+
10161020
# data.table v1.14.10 (Dec 2023) back to v1.10.0 (Dec 2016) has been moved to [NEWS.1.md](https://github.com/Rdatatable/data.table/blob/master/NEWS.1.md)
1021+
1022+

R/merge.R

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -29,16 +29,16 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
2929

3030
## set up 'by'/'by.x'/'by.y'
3131
if ( (!is.null(by.x) || !is.null(by.y)) && length(by.x)!=length(by.y) )
32-
stopf("`by.x` and `by.y` must be of same length.")
32+
stopf("by.x and by.y must be of same length.")
3333
if (!missing(by) && !missing(by.x))
34-
warningf("Supplied both `by` and `by.x/by.y`. `by` argument will be ignored.")
34+
warningf("Supplied both by and by.x/by.y. by argument will be ignored.")
3535
if (!is.null(by.x)) {
3636
if (length(by.x)==0L || !is.character(by.x) || !is.character(by.y))
37-
stopf("A non-empty vector of column names is required for `by.x` and `by.y`.")
37+
stopf("A non-empty vector of column names is required for by.x and by.y.")
3838
if (!all(by.x %chin% nm_x))
39-
stopf("Elements listed in `by.x` must be valid column names in x.")
39+
stopf("Elements listed in by.x must be valid column names in x.")
4040
if (!all(by.y %chin% nm_y))
41-
stopf("Elements listed in `by.y` must be valid column names in y.")
41+
stopf("Elements listed in by.y must be valid column names in y.")
4242
by = by.x
4343
names(by) = by.y
4444
} else {
@@ -49,9 +49,20 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
4949
if (!length(by))
5050
by = intersect(nm_x, nm_y)
5151
if (length(by) == 0L || !is.character(by))
52-
stopf("A non-empty vector of column names for `by` is required.")
53-
if (!all(by %chin% intersect(nm_x, nm_y)))
54-
stopf("Elements listed in `by` must be valid column names in x and y")
52+
stopf("A non-empty vector of column names for by is required.")
53+
54+
## Updated Error Handling Section
55+
missing_in_x = setdiff(by, nm_x)
56+
missing_in_y = setdiff(by, nm_y)
57+
if (length(missing_in_x) > 0 || length(missing_in_y) > 0) {
58+
error_msg = "Columns listed in by must be valid column names in both data.tables.\n"
59+
if (length(missing_in_x) > 0)
60+
error_msg = paste0(error_msg, sprintf("✖ Missing in x: %s\n", paste(missing_in_x, collapse = ", ")))
61+
if (length(missing_in_y) > 0)
62+
error_msg = paste0(error_msg, sprintf("✖ Missing in y: %s", paste(missing_in_y, collapse = ", ")))
63+
stopf(error_msg)
64+
}
65+
5566
by = unname(by)
5667
by.x = by.y = by
5768
}
@@ -109,7 +120,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
109120
}
110121

111122
# Throw warning if there are duplicate column names in 'dt' (i.e. if
112-
# `suffixes=c("","")`, to match behaviour in base:::merge.data.frame)
123+
# suffixes=c("",""), to match behaviour in base:::merge.data.frame)
113124
resultdupnames = names(dt)[duplicated(names(dt))]
114125
if (length(resultdupnames)) {
115126
warningf("column names %s are duplicated in the result", brackify(resultdupnames))
@@ -118,4 +129,4 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
118129
# retain custom classes of first argument that resulted in dispatch to this method, #1378
119130
setattr(dt, "class", class_x)
120131
dt
121-
}
132+
}

inst/tests/tests.Rraw

Lines changed: 26 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -8566,19 +8566,18 @@ DT1 = data.table(id1 = c("c", "a", "b", "b", "b", "c"),
85668566
DT2 = data.table(id1=c("c", "w", "b"), val=50:52)
85678567
test(1600.2, names(DT1[DT2, .(id1=id1, val=val, bla=sum(z1, na.rm=TRUE)), on="id1"]), c("id1", "val", "bla"))
85688568

8569-
# warn when merge empty data.table #597
85708569
DT0 = data.table(NULL)
8571-
DT1 = data.table(a=1)
8572-
test(1601.1, merge(DT1, DT1, by="a"), data.table(a=1, key="a"))
8573-
test(1601.2, merge(DT1, DT0, by="a"),
8574-
warning="Input data.table 'y' has no columns.",
8575-
error="Elements listed in `by`")
8576-
test(1601.3, merge(DT0, DT1, by="a"),
8577-
warning="Input data.table 'x' has no columns.",
8578-
error="Elements listed in `by`")
8579-
test(1601.4, merge(DT0, DT0, by="a"),
8580-
warning="Neither of the input data.tables to join have columns.",
8581-
error="Elements listed in `by`")
8570+
DT1 = data.table(a = 1)
8571+
test(1601.1, merge(DT1, DT1, by = "a"), data.table(a = 1, key = "a"))
8572+
test(1601.2, merge(DT1, DT0, by = "a"),
8573+
warning = "Input data.table 'y' has no columns.",
8574+
error = "Columns listed in by must be valid column names in both data.tables.\n✖ Missing in y: a")
8575+
test(1601.3, merge(DT0, DT1, by = "a"),
8576+
warning = "Input data.table 'x' has no columns.",
8577+
error = "Columns listed in by must be valid column names in both data.tables.\n✖ Missing in x: a")
8578+
test(1601.4, merge(DT0, DT0, by = "a"),
8579+
warning = "Neither of the input data.tables to join have columns.",
8580+
error = "Columns listed in by must be valid column names in both data.tables.\n✖ Missing in x: a\n✖ Missing in y: a")
85828581

85838582
# fix for #1549
85848583
d1 <- data.table(v1=1:2,x=x)
@@ -13530,14 +13529,13 @@ test(1962.017, merge(DT1, DT2, by = 'V', by.x = 'a', by.y = 'a'),
1353013529
data.table(a = 2:3, V.x = c("a", "a"), V.y = c("b", "b"), key = 'a'),
1353113530
warning = 'Supplied both.*argument will be ignored')
1353213531
test(1962.018, merge(DT1, DT2, by.x = 'z', by.y = 'a'),
13533-
error = 'Elements listed in `by.x`')
13532+
error = "Elements listed in by.x must be valid column names in x.")
1353413533
test(1962.019, merge(DT1, DT2, by.x = 'a', by.y = 'z'),
13535-
error = 'Elements listed in `by.y`')
13534+
error = "Elements listed in by.y must be valid column names in y.")
1353613535
test(1962.0201, merge(DT1, DT2, by=character(0L)), ans) # was error before PR#5183
1353713536
test(1962.0202, merge(DT1, DT2, by=NULL), ans) # test explicit NULL too as missing() could be used inside merge()
13538-
test(1962.021, merge(DT1, DT2, by = 'z'),
13539-
error = 'must be valid column names in x and y')
13540-
13537+
test(1962.021, merge(DT1, DT2, by = "z"),
13538+
error = "Columns listed in by must be valid column names in both data.tables.\n✖ Missing in x: z\n✖ Missing in y: z")
1354113539
## frank.R
1354213540
x = c(1, 1, 2, 5, 4, 3, 4, NA, 6)
1354313541
test(1962.022, frankv(x, na.last = logical(0L)),
@@ -16909,15 +16907,15 @@ if (.Platform$OS.type=="windows") local({
1690916907
})
1691016908
# test back to English (the argument order is back to 1,c,2,1)
1691116909
test(2144, rbind(DT,list(c=4L,a=7L)), error="Column 1 ['c'] of item 2 is missing in item 1")
16912-
1691316910
# Attempting to join on character(0) shouldn't crash R
1691416911
A = data.table(A='a')
1691516912
B = data.table(B='b')
1691616913
test(2145.1, A[B, on=character(0)], error = "'on' argument should be a named atomic vector")
16917-
test(2145.2, merge(A, B, by=character(0) ), error = "non-empty vector of column names for `by` is required.")
16918-
test(2145.3, merge(A, B, by.x=character(0), by.y=character(0)), error = "non-empty vector of column names is required")
16914+
test(2145.2, merge(A, B, by = character(0)), error = "A non-empty vector of column names for by is required.")
16915+
test(2145.3, merge(A, B, by.x = character(0), by.y = character(0)), error = "A non-empty vector of column names is required.")
1691916916
# Also shouldn't crash when using internal functions
16920-
test(2145.4, bmerge(A, B, integer(), integer(), 0, c(FALSE, TRUE), NA, 'all', integer(), FALSE), error = 'icols and xcols must be non-empty')
16917+
test(2145.4, bmerge(A, B, integer(), integer(), 0, c(FALSE, TRUE), NA, 'all', integer(), FALSE),
16918+
error = "icols and xcols must be non-empty")
1692116919

1692216920
# nrow(i)==0 by-join, #4364 (broke in dev 1.12.9)
1692316921
d0 = data.table(id=integer(), n=integer())
@@ -17996,9 +17994,12 @@ test(2230.3, setDF(merge(DT, y, by="k2", incomparables=c(4,5))), merge(x, y,
1799617994
test(2230.4, setDF(merge(DT, y, by="k2", incomparables=c(1, NA, 4, 5))), merge(x, y, by="k2", incomparables=c(1,NA,4,5)))
1799717995
test(2230.5, setDF(merge(DT, y, by="k2", incomparables=c(NA, 3, 4, 5))), merge(x, y, by="k2", incomparables=c(NA,3,4,5)))
1799817996
test(2230.6, merge(DT, y, by="k2", unk=1), merge(DT, y, by="k2"), warning="Unknown argument 'unk' has been passed.")
17999-
test(2230.7, merge(DT, y, by="k2", NULL, NULL, FALSE, FALSE, FALSE, TRUE, c(".x", ".y"), TRUE, getOption("datatable.allow.cartesian"), NULL, 1L),
18000-
merge(DT, y, by="k2"), warning=c("Supplied both `by` and `by.x/by.y`. `by` argument will be ignored.", "Passed 1 unknown and unnamed arguments."))
18001-
17997+
test(2230.7, merge(DT, y, by = "k2", NULL, NULL, FALSE, FALSE, FALSE, TRUE, c(".x", ".y"), TRUE,
17998+
getOption("datatable.allow.cartesian"), NULL, 1L),
17999+
merge(DT, y, by = "k2"),
18000+
warning = c("Supplied both by and by.x/by.y. by argument will be ignored.",
18001+
"Passed 1 unknown and unnamed arguments."))
18002+
1800218003
# weighted.mean GForce optimized, #3977
1800318004
old = options(datatable.optimize=1L)
1800418005
DT = data.table(x=c(3.7,3.3,3.5,2.8), w=c(5,5,4,1), g=1L)
@@ -20697,3 +20698,4 @@ if (test_bit64) {
2069720698
test(2300.3, DT1[DT2, on='id'], error="Incompatible join types")
2069820699
test(2300.4, DT2[DT1, on='id'], error="Incompatible join types")
2069920700
}
20701+

0 commit comments

Comments
 (0)