Skip to content

Commit f05893e

Browse files
rbindlist(l, use.names=TRUE) handle different encodings for column names (#5453)
* fix handling of different encodings for column names
* improve comment
* write special chars in unicode
* simplify tests
* Fix NEWS numbering
* Update NEWS.md

  Co-authored-by: Michael Chirico <[email protected]>
* add comments
* fix lint

---------

Co-authored-by: Michael Chirico <[email protected]>
1 parent 014dafb commit f05893e

File tree

3 files changed

+22
-8
lines changed

3 files changed

+22
-8
lines changed

NEWS.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,8 @@ rowwiseDT(
111111
112112
12. Joins on multiple columns, such as `x[y, on=c("x1==y1", "x2==y1")]`, could fail during implicit type coercions if `x1` and `x2` had different but still compatible types, [#6602](https://github.com/Rdatatable/data.table/issues/6602). This was particularly unexpected when columns `x1`, `x2`, and `y1` were all of the same class, e.g. `Date`, but differed in their underlying storage types. Thanks to Benjamin Schwendinger for the report and the fix.
113113
114+
13. `rbindlist(l, use.names=TRUE)` can now handle different encodings for the column names in different entries of `l`, [#5452](https://github.com/Rdatatable/data.table/issues/5452). Thanks to @MEO265 for the report, and Benjamin Schwendinger for the fix.
115+
114116
## NOTES
115117
116118
1. Tests run again when some Suggests packages are missing, [#6411](https://github.com/Rdatatable/data.table/issues/6411). Thanks @aadler for the note and @MichaelChirico for the fix.

inst/tests/tests.Rraw

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20639,3 +20639,15 @@ test(2297.22, y[x, on=.(d == a, c == a)], data.table(c=1, d=1))
2063920639
x = data.table(a=1, b=2L)
2064020640
y = data.table(c=1.5, d=1L)
2064120641
test(2297.31, y[x, on=.(c == a, d == a), nomatch=NULL], output="Empty data.table (0 rows and 3 cols): c,d,b")
20642+
20643+
# rbindlist(l, use.names=TRUE) should handle different colnames encodings #5452
20644+
x = data.table(a = 1, b = 2, c = 3)
20645+
y = data.table(x = 4, y = 5, z = 6)
20646+
# a-umlaut, o-umlaut, u-umlaut
20647+
setnames(x , c("\u00e4", "\u00f6", "\u00fc"))
20648+
setnames(y , iconv(c("\u00f6", "\u00fc", "\u00e4"), from = "UTF-8", to = "latin1"))
20649+
test(2298.1, rbindlist(list(x,y), use.names=TRUE), data.table("\u00e4"=c(1,6), "\u00f6"=c(2,4), "\u00fc"=c(3,5)))
20650+
test(2298.2, rbindlist(list(y,x), use.names=TRUE), data.table("\u00f6"=c(4,2), "\u00fc"=c(5,3), "\u00e4"=c(6,1)))
20651+
set(y, j="\u00e4", value=NULL)
20652+
test(2298.3, rbindlist(list(x,y), use.names=TRUE, fill=TRUE), data.table("\u00e4"=c(1,NA), "\u00f6"=c(2,4), "\u00fc"=c(3,5)))
20653+
test(2298.4, rbindlist(list(y,x), use.names=TRUE, fill=TRUE), data.table("\u00f6"=c(4,2), "\u00fc"=c(5,3), "\u00e4"=c(NA,1)))

src/rbindlist.c

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
7676
error(_("Failed to allocate upper bound of %"PRId64" unique column names [sum(lapply(l,ncol))]"), (int64_t)upperBoundUniqueNames); // # nocov
7777
savetl_init();
7878
int nuniq=0;
79+
// first pass - gather unique column names
7980
for (int i=0; i<LENGTH(l); i++) {
8081
SEXP li = VECTOR_ELT(l, i);
8182
int thisncol=LENGTH(li);
@@ -84,18 +85,15 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
8485
if (!length(cn)) continue;
8586
const SEXP *cnp = STRING_PTR_RO(cn);
8687
for (int j=0; j<thisncol; j++) {
87-
SEXP s = cnp[j];
88+
SEXP s = ENC2UTF8(cnp[j]); // convert different encodings for use.names #5452
8889
if (TRUELENGTH(s)<0) continue; // seen this name before
8990
if (TRUELENGTH(s)>0) savetl(s);
9091
uniq[nuniq++] = s;
9192
SET_TRUELENGTH(s,-nuniq);
9293
}
9394
}
94-
if (nuniq>0) {
95-
SEXP *tt = realloc(uniq, nuniq*sizeof(SEXP)); // shrink to only what we need to release the spare
96-
if (!tt) free(uniq); // shrink never fails; just keep codacy happy
97-
uniq = tt;
98-
}
95+
if (nuniq>0) uniq = realloc(uniq, nuniq*sizeof(SEXP)); // shrink to only what we need to release the spare
96+
9997
// now count the dups (if any) and how they're distributed across the items
10098
int *counts = (int *)calloc(nuniq, sizeof(int)); // counts of names for each colnames
10199
int *maxdup = (int *)calloc(nuniq, sizeof(int)); // the most number of dups for any name within one colname vector
@@ -107,6 +105,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
107105
error(_("Failed to allocate nuniq=%d items working memory in rbindlist.c"), nuniq);
108106
// # nocov end
109107
}
108+
// second pass - count duplicates
110109
for (int i=0; i<LENGTH(l); i++) {
111110
SEXP li = VECTOR_ELT(l, i);
112111
int thisncol=length(li);
@@ -116,7 +115,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
116115
const SEXP *cnp = STRING_PTR_RO(cn);
117116
memset(counts, 0, nuniq*sizeof(int));
118117
for (int j=0; j<thisncol; j++) {
119-
SEXP s = cnp[j];
118+
SEXP s = ENC2UTF8(cnp[j]); // convert different encodings for use.names #5452
120119
counts[ -TRUELENGTH(s)-1 ]++;
121120
}
122121
for (int u=0; u<nuniq; u++) {
@@ -145,6 +144,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
145144
for (int i=0; i<ncol; ++i) {uniqMap[i] = dupLink[i] = -1;}
146145
int nextCol=0, lastDup=ncol-1;
147146

147+
// third pass - create final column mapping colMapRaw
148148
for (int i=0; i<LENGTH(l); ++i) {
149149
SEXP li = VECTOR_ELT(l, i);
150150
int thisncol=length(li);
@@ -156,7 +156,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
156156
const SEXP *cnp = STRING_PTR_RO(cn);
157157
memset(counts, 0, nuniq*sizeof(int));
158158
for (int j=0; j<thisncol; j++) {
159-
SEXP s = cnp[j];
159+
SEXP s = ENC2UTF8(cnp[j]); // convert different encodings for use.names #5452
160160
int w = -TRUELENGTH(s)-1;
161161
int wi = counts[w]++; // how many dups have we seen before of this name within this item
162162
if (uniqMap[w]==-1) {

0 commit comments

Comments
 (0)