Skip to content

Commit 23dac21

Browse files
fix type coercion in bmerge (#6603)
* fix type coercion in bmerge * fix bracket * add test cases * fix lint * fix old test case * rename x/i class * add minimal test * indent loop * add fix in one direction * remove indent to cater for diff * Revert "remove indent to cater for diff" This reverts commit 562a9fd. * remove indent * add 2nd case * remove trailing ws * update all cases * fix typo * fix test cases * update testcases * update copying attributes from int to dbl * start modularize * fix cases * ensure same types for test * add test for codecov * simplify * fix test on windows * simplify * add coerce function * modularize more * Use gettext() on character strings directly * rename getClass helper: mergeType * rename: {i,x}c --> {i,x}col I found myself wondering `ic`... "`i` character? `i` class?". Simpler to encode more info in the name * comment ref. issue * exchange subset with .shallow * undo test * Revert "undo test" This reverts commit c9d3d74. * update tests * add comment * add non right join testcase * move helper outside bmerge * update comment * add NEWS * update numbering * tweak NEWS --------- Co-authored-by: Michael Chirico <[email protected]>
1 parent 546259d commit 23dac21

File tree

3 files changed

+127
-58
lines changed

3 files changed

+127
-58
lines changed

NEWS.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,8 @@ rowwiseDT(
107107
108108
11. `tables()` now returns the correct size for data.tables over 2GiB, [#6607](https://github.com/Rdatatable/data.table/issues/6607). Thanks to @vlulla for the report and the PR.
109109
110+
12. Joins on multiple columns, such as `x[y, on=c("x1==y1", "x2==y1")]`, could fail during implicit type coercions if `x1` and `x2` had different but still compatible types, [#6602](https://github.com/Rdatatable/data.table/issues/6602). This was particularly unexpected when columns `x1`, `x2`, and `y1` were all of the same class, e.g. `Date`, but differed in their underlying storage types. Thanks to Benjamin Schwendinger for the report and the fix.
111+
110112
## NOTES
111113
112114
1. Tests run again when some Suggests packages are missing, [#6411](https://github.com/Rdatatable/data.table/issues/6411). Thanks @aadler for the note and @MichaelChirico for the fix.

R/bmerge.R

Lines changed: 90 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,25 @@
11

2+
3+
mergeType = function(x) {
4+
ans = typeof(x)
5+
if (ans=="integer") { if (is.factor(x)) ans = "factor" }
6+
else if (ans=="double") { if (inherits(x, "integer64")) ans = "integer64" }
7+
# do not call isReallyReal(x) yet because i) if both types are double we don't need to coerce even if one or both sides
8+
# are int-as-double, and ii) to save calling it until we really need it
9+
ans
10+
}
11+
12+
cast_with_atts = function(x, as.f) {
13+
ans = as.f(x)
14+
if (!is.null(attributes(x))) attributes(ans) = attributes(x)
15+
ans
16+
}
17+
18+
coerce_col = function(dt, col, from_type, to_type, from_name, to_name, verbose_msg=NULL) {
19+
if (!is.null(verbose_msg)) catf(verbose_msg, from_type, from_name, to_type, to_name, domain=NULL)
20+
set(dt, j=col, value=cast_with_atts(dt[[col]], match.fun(paste0("as.", to_type))))
21+
}
22+
223
bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbose)
324
{
425
callersi = i
@@ -25,95 +46,110 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
2546

2647
supported = c(ORDERING_TYPES, "factor", "integer64")
2748

28-
getClass = function(x) {
29-
ans = typeof(x)
30-
if (ans=="integer") { if (is.factor(x)) ans = "factor" }
31-
else if (ans=="double") { if (inherits(x, "integer64")) ans = "integer64" }
32-
# do not call isReallyReal(x) yet because i) if both types are double we don't need to coerce even if one or both sides
33-
# are int-as-double, and ii) to save calling it until we really need it
34-
ans
35-
}
36-
3749
if (nrow(i)) for (a in seq_along(icols)) {
3850
# - check that join columns have compatible types
3951
# - do type coercions if necessary on just the shallow local copies for the purpose of join
4052
# - handle factor columns appropriately
4153
# Note that if i is keyed, if this coerces i's key gets dropped by set()
42-
ic = icols[a]
43-
xc = xcols[a]
44-
xclass = getClass(x[[xc]])
45-
iclass = getClass(i[[ic]])
46-
xname = paste0("x.", names(x)[xc])
47-
iname = paste0("i.", names(i)[ic])
48-
if (!xclass %chin% supported) stopf("%s is type %s which is not supported by data.table join", xname, xclass)
49-
if (!iclass %chin% supported) stopf("%s is type %s which is not supported by data.table join", iname, iclass)
50-
if (xclass=="factor" || iclass=="factor") {
54+
icol = icols[a]
55+
xcol = xcols[a]
56+
x_merge_type = mergeType(x[[xcol]])
57+
i_merge_type = mergeType(i[[icol]])
58+
xname = paste0("x.", names(x)[xcol])
59+
iname = paste0("i.", names(i)[icol])
60+
if (!x_merge_type %chin% supported) stopf("%s is type %s which is not supported by data.table join", xname, x_merge_type)
61+
if (!i_merge_type %chin% supported) stopf("%s is type %s which is not supported by data.table join", iname, i_merge_type)
62+
if (x_merge_type=="factor" || i_merge_type=="factor") {
5163
if (roll!=0.0 && a==length(icols))
5264
stopf("Attempting roll join on factor column when joining %s to %s. Only integer, double or character columns may be roll joined.", xname, iname)
53-
if (xclass=="factor" && iclass=="factor") {
65+
if (x_merge_type=="factor" && i_merge_type=="factor") {
5466
if (verbose) catf("Matching %s factor levels to %s factor levels.\n", iname, xname)
55-
set(i, j=ic, value=chmatch(levels(i[[ic]]), levels(x[[xc]]), nomatch=0L)[i[[ic]]]) # nomatch=0L otherwise a level that is missing would match to NA values
67+
set(i, j=icol, value=chmatch(levels(i[[icol]]), levels(x[[xcol]]), nomatch=0L)[i[[icol]]]) # nomatch=0L otherwise a level that is missing would match to NA values
5668
next
5769
} else {
58-
if (xclass=="character") {
70+
if (x_merge_type=="character") {
5971
if (verbose) catf("Coercing factor column %s to type character to match type of %s.\n", iname, xname)
60-
set(i, j=ic, value=val<-as.character(i[[ic]]))
61-
set(callersi, j=ic, value=val) # factor in i joining to character in x will return character and not keep x's factor; e.g. for antaresRead #3581
72+
set(i, j=icol, value=val<-as.character(i[[icol]]))
73+
set(callersi, j=icol, value=val) # factor in i joining to character in x will return character and not keep x's factor; e.g. for antaresRead #3581
6274
next
63-
} else if (iclass=="character") {
75+
} else if (i_merge_type=="character") {
6476
if (verbose) catf("Matching character column %s to factor levels in %s.\n", iname, xname)
65-
newvalue = chmatch(i[[ic]], levels(x[[xc]]), nomatch=0L)
66-
if (anyNA(i[[ic]])) newvalue[is.na(i[[ic]])] = NA_integer_ # NA_character_ should match to NA in factor, #3809
67-
set(i, j=ic, value=newvalue)
77+
newvalue = chmatch(i[[icol]], levels(x[[xcol]]), nomatch=0L)
78+
if (anyNA(i[[icol]])) newvalue[is.na(i[[icol]])] = NA_integer_ # NA_character_ should match to NA in factor, #3809
79+
set(i, j=icol, value=newvalue)
6880
next
6981
}
7082
}
71-
stopf("Incompatible join types: %s (%s) and %s (%s). Factor columns must join to factor or character columns.", xname, xclass, iname, iclass)
83+
stopf("Incompatible join types: %s (%s) and %s (%s). Factor columns must join to factor or character columns.", xname, x_merge_type, iname, i_merge_type)
7284
}
73-
if (xclass == iclass) {
74-
if (verbose) catf("%s has same type (%s) as %s. No coercion needed.\n", iname, xclass, xname)
85+
# we check factors first to cater for the case when trying to do rolling joins on factors
86+
if (x_merge_type == i_merge_type) {
87+
if (verbose) catf("%s has same type (%s) as %s. No coercion needed.\n", iname, x_merge_type, xname)
7588
next
7689
}
77-
if (xclass=="character" || iclass=="character" ||
78-
xclass=="logical" || iclass=="logical" ||
79-
xclass=="factor" || iclass=="factor") {
80-
if (anyNA(i[[ic]]) && allNA(i[[ic]])) {
81-
if (verbose) catf("Coercing all-NA %s (%s) to type %s to match type of %s.\n", iname, iclass, xclass, xname)
82-
set(i, j=ic, value=match.fun(paste0("as.", xclass))(i[[ic]]))
90+
cfl = c("character", "logical", "factor")
91+
if (x_merge_type %chin% cfl || i_merge_type %chin% cfl) {
92+
msg = if(verbose) gettext("Coercing all-NA %s column %s to type %s to match type of %s.\n") else NULL
93+
if (anyNA(i[[icol]]) && allNA(i[[icol]])) {
94+
coerce_col(i, icol, i_merge_type, x_merge_type, iname, xname, msg)
8395
next
8496
}
85-
else if (anyNA(x[[xc]]) && allNA(x[[xc]])) {
86-
if (verbose) catf("Coercing all-NA %s (%s) to type %s to match type of %s.\n", xname, xclass, iclass, iname)
87-
set(x, j=xc, value=match.fun(paste0("as.", iclass))(x[[xc]]))
97+
if (anyNA(x[[xcol]]) && allNA(x[[xcol]])) {
98+
coerce_col(x, xcol, x_merge_type, i_merge_type, xname, iname, msg)
8899
next
89100
}
90-
stopf("Incompatible join types: %s (%s) and %s (%s)", xname, xclass, iname, iclass)
101+
stopf("Incompatible join types: %s (%s) and %s (%s)", xname, x_merge_type, iname, i_merge_type)
91102
}
92-
if (xclass=="integer64" || iclass=="integer64") {
103+
if (x_merge_type=="integer64" || i_merge_type=="integer64") {
93104
nm = c(iname, xname)
94-
if (xclass=="integer64") { w=i; wc=ic; wclass=iclass; } else { w=x; wc=xc; wclass=xclass; nm=rev(nm) } # w is which to coerce
105+
if (x_merge_type=="integer64") { w=i; wc=icol; wclass=i_merge_type; } else { w=x; wc=xcol; wclass=x_merge_type; nm=rev(nm) } # w is which to coerce
95106
if (wclass=="integer" || (wclass=="double" && !isReallyReal(w[[wc]]))) {
96107
if (verbose) catf("Coercing %s column %s%s to type integer64 to match type of %s.\n", wclass, nm[1L], if (wclass=="double") " (which contains no fractions)" else "", nm[2L])
97108
set(w, j=wc, value=bit64::as.integer64(w[[wc]]))
98109
} else stopf("Incompatible join types: %s is type integer64 but %s is type double and contains fractions", nm[2L], nm[1L])
99110
} else {
100111
# just integer and double left
101-
if (iclass=="double") {
102-
if (!isReallyReal(i[[ic]])) {
112+
ic_idx = which(icol == icols) # check if on is joined on multiple conditions, #6602
113+
if (i_merge_type=="double") {
114+
coerce_x = FALSE
115+
if (!isReallyReal(i[[icol]])) {
116+
coerce_x = TRUE
103117
# common case of ad hoc user-typed integers missing L postfix joining to correct integer keys
104118
# we've always coerced to int and returned int, for convenience.
105-
if (verbose) catf("Coercing double column %s (which contains no fractions) to type integer to match type of %s.\n", iname, xname)
106-
val = as.integer(i[[ic]])
107-
if (!is.null(attributes(i[[ic]]))) attributes(val) = attributes(i[[ic]]) # to retain Date for example; 3679
108-
set(i, j=ic, value=val)
109-
set(callersi, j=ic, value=val) # change the shallow copy of i up in [.data.table to reflect in the result, too.
110-
} else {
111-
if (verbose) catf("Coercing integer column %s to type double to match type of %s which contains fractions.\n", xname, iname)
112-
set(x, j=xc, value=as.double(x[[xc]]))
119+
if (length(ic_idx)>1L) {
120+
xc_idx = xcols[ic_idx]
121+
for (xb in xc_idx[which(vapply_1c(.shallow(x, xc_idx), mergeType) == "double")]) {
122+
if (isReallyReal(x[[xb]])) {
123+
coerce_x = FALSE
124+
break
125+
}
126+
}
127+
}
128+
if (coerce_x) {
129+
msg = if (verbose) gettext("Coercing %s column %s (which contains no fractions) to type %s to match type of %s.\n") else NULL
130+
coerce_col(i, icol, "double", "integer", iname, xname, msg)
131+
set(callersi, j=icol, value=i[[icol]]) # change the shallow copy of i up in [.data.table to reflect in the result, too.
132+
if (length(ic_idx)>1L) {
133+
xc_idx = xcols[ic_idx]
134+
for (xb in xc_idx[which(vapply_1c(.shallow(x, xc_idx), mergeType) == "double")]) {
135+
coerce_col(x, xb, "double", "integer", paste0("x.", names(x)[xb]), xname, msg)
136+
}
137+
}
138+
}
139+
}
140+
if (!coerce_x) {
141+
msg = if (verbose) gettext("Coercing %s column %s to type %s to match type of %s which contains fractions.\n") else NULL
142+
coerce_col(x, xcol, "integer", "double", xname, iname, msg)
113143
}
114144
} else {
115-
if (verbose) catf("Coercing integer column %s to type double for join to match type of %s.\n", iname, xname)
116-
set(i, j=ic, value=as.double(i[[ic]]))
145+
msg = if (verbose) gettext("Coercing %s column %s to type %s for join to match type of %s.\n") else NULL
146+
coerce_col(i, icol, "integer", "double", iname, xname, msg)
147+
if (length(ic_idx)>1L) {
148+
xc_idx = xcols[ic_idx]
149+
for (xb in xc_idx[which(vapply_1c(.shallow(x, xc_idx), mergeType) == "integer")]) {
150+
coerce_col(x, xb, "integer", "double", paste0("x.", names(x)[xb]), xname, msg)
151+
}
152+
}
117153
}
118154
}
119155
}

inst/tests/tests.Rraw

Lines changed: 35 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15156,15 +15156,15 @@ if (test_bit64) {
1515615156
dt1 = data.table(a=1, b=NA_character_)
1515715157
dt2 = data.table(a=2L, b=NA)
1515815158
test(2044.80, dt1[dt2, on="a==b", verbose=TRUE], data.table(a=NA, b=NA_character_, i.a=2L),
15159-
output=msg<-"Coercing all-NA i.b (logical) to type double to match type of x.a")
15159+
output=msg<-"Coercing all-NA logical column i.b to type double to match type of x.a")
1516015160
test(2044.81, dt1[dt2, on="a==b", nomatch=0L, verbose=TRUE], data.table(a=logical(), b=character(), i.a=integer()),
1516115161
output=msg)
1516215162
test(2044.82, dt1[dt2, on="b==b", verbose=TRUE], data.table(a=1, b=NA, i.a=2L),
15163-
output=msg<-"Coercing all-NA i.b (logical) to type character to match type of x.b")
15163+
output=msg<-"Coercing all-NA logical column i.b to type character to match type of x.b")
1516415164
test(2044.83, dt1[dt2, on="b==b", nomatch=0L, verbose=TRUE], data.table(a=1, b=NA, i.a=2L),
1516515165
output=msg)
1516615166
test(2044.84, dt1[dt2, on="b==a", verbose=TRUE], data.table(a=NA_real_, b=2L, i.b=NA),
15167-
output=msg<-"Coercing all-NA x.b (character) to type integer to match type of i.a")
15167+
output=msg<-"Coercing all-NA character column x.b to type integer to match type of i.a")
1516815168
test(2044.85, dt1[dt2, on="b==a", nomatch=0L, verbose=TRUE], data.table(a=double(), b=integer(), i.b=logical()),
1516915169
output=msg)
1517015170

@@ -15727,7 +15727,8 @@ DT = data.table(z = 1i)
1572715727
test(2069.33, DT[DT, on = 'z'], error = "Type 'complex' is not supported for joining/merging")
1572815728

1572915729
# forder verbose message when !isReallyReal Date, #1738
15730-
DT = data.table(d=sample(seq(as.Date("2015-01-01"), as.Date("2015-01-05"), by="days"), 20, replace=TRUE))
15730+
date_dbl = as.Date(as.double(seq(as.Date("2015-01-01"), as.Date("2015-01-05"), by="days")), origin="1970-01-01")
15731+
DT = data.table(d=sample(date_dbl, 20, replace=TRUE))
1573115732
test(2070.01, typeof(DT$d), "double")
1573215733
test(2070.02, DT[, .N, keyby=d, verbose=TRUE], output="Column 1.*date.*8 byte double.*no fractions are present.*4 byte integer.*to save space and time")
1573315734

@@ -20596,3 +20597,33 @@ test(2295.3, is.data.table(d2))
2059620597

2059720598
# #6588: .checkTypos used to give arbitrary strings to stopf as the first argument
2059820599
test(2296, d2[x %no such operator% 1], error = '%no such operator%')
20600+
20601+
# fix coercing integer/double for joins on multiple columns, #6602
20602+
x = data.table(a=1L)
20603+
y = data.table(c=1L, d=1)
20604+
test(2297.01, y[x, on=.(c == a, d == a), verbose=TRUE], data.table(c=1L, d=1L), output="Coercing .*a to type double.*Coercing .*c to type double")
20605+
test(2297.02, y[x, on=.(d == a, c == a), verbose=TRUE], data.table(c=1L, d=1L), output="Coercing .*a to type double.*Coercing .*c to type double")
20606+
x = data.table(a=1)
20607+
y = data.table(c=1, d=1L)
20608+
test(2297.03, y[x, on=.(c == a, d == a), verbose=TRUE], data.table(c=1L, d=1L), output="Coercing .*a .*no fractions.* to type integer.*Coercing .*c .*no fractions.* to type integer")
20609+
test(2297.04, y[x, on=.(d == a, c == a), verbose=TRUE], data.table(c=1L, d=1L), output="Coercing .*a .*no fractions.* to type integer.*Coercing .*c .*no fractions.* to type integer")
20610+
# dates
20611+
d_int = .Date(1L)
20612+
d_dbl = .Date(1)
20613+
x = data.table(a=d_int)
20614+
y = data.table(c=d_int, d=d_dbl)
20615+
test(2297.11, y[x, on=.(c == a, d == a)], data.table(c=d_int, d=d_int))
20616+
test(2297.12, y[x, on=.(d == a, c == a)], data.table(c=d_int, d=d_int))
20617+
x = data.table(a=d_dbl)
20618+
y = data.table(c=d_dbl, d=d_int)
20619+
test(2297.13, y[x, on=.(c == a, d == a)], data.table(c=d_int, d=d_int))
20620+
test(2297.14, y[x, on=.(d == a, c == a)], data.table(c=d_int, d=d_int))
20621+
# real double
20622+
x = data.table(a=1)
20623+
y = data.table(c=1.5, d=1L)
20624+
test(2297.21, y[x, on=.(c == a, d == a)], data.table(c=1, d=1))
20625+
test(2297.22, y[x, on=.(d == a, c == a)], data.table(c=1, d=1))
20626+
# non right join
20627+
x = data.table(a=1, b=2L)
20628+
y = data.table(c=1.5, d=1L)
20629+
test(2297.31, y[x, on=.(c == a, d == a), nomatch=NULL], output="Empty data.table (0 rows and 3 cols): c,d,b")

0 commit comments

Comments
 (0)