Skip to content

Commit dc1dbe9

Browse files
authored
rbindlist: pre-convert column names to UTF-8 (#7461)
Fixes: #7452
1 parent 1ba8785 commit dc1dbe9

File tree

3 files changed

+25
-7
lines changed

3 files changed

+25
-7
lines changed

NEWS.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -354,6 +354,8 @@ See [#2611](https://github.com/Rdatatable/data.table/issues/2611) for details. T
354354

355355
27. `dogroups()` no longer reads beyond the resized end of over-allocated data.table list columns, [#7486](https://github.com/Rdatatable/data.table/issues/7486). While this didn't crash in practice, it is now explicitly checked for in recent R versions (r89198+). Thanks @TimTaylor and @aitap for the report and @aitap for the fix.
356356

357+
28. `rbindlist()` now avoids the crash when working with many non-UTF-8 column names, [#7452](https://github.com/Rdatatable/data.table/issues/7452). Thanks @aitap for the report and the fix.
358+
357359
### NOTES
358360

359361
1. The following in-progress deprecations have proceeded:

inst/tests/tests.Rraw

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21900,3 +21900,12 @@ rm(DT, strings)
2190021900
DT = unserialize(serialize(as.data.table(mtcars), NULL))
2190121901
test(2351, DT[,carb:=NULL], as.data.table(mtcars)[,carb:=NULL])
2190221902
rm(DT)
21903+
21904+
# rbindlist did not protect the temporary UTF-8 strings, #7452
21905+
DTn = apply(matrix(as.raw(rep(0xa1:0xff, length.out = 100)), 10), 2, rawToChar)
21906+
Encoding(DTn) = 'latin1' # will need conversion to UTF-8
21907+
DTl = lapply(DTn, function(n) setNames(list(42), n))[c(1, rep(2:length(DTn), length.out = 3e5), 1)]
21908+
DT = suppressMessages(rbindlist(DTl)) # used to crash
21909+
test(2352.1, dim(DT), c(300002L, 1L))
21910+
test(2352.2, DT[[1]], rep(42, nrow(DT)))
21911+
rm(DTn, DTl, DT)

src/rbindlist.c

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,13 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
6868

6969
int *colMap=NULL; // maps each column in final result to the column of each list item
7070
if (usenames==TRUE || usenames==NA_LOGICAL) {
71+
// zeroth pass - convert all names to UTF-8
72+
SEXP cnl = PROTECT(allocVector(VECSXP, XLENGTH(l)));
73+
for (R_xlen_t i = 0; i < XLENGTH(l); ++i) {
74+
const SEXP cn = getAttrib(VECTOR_ELT(l, i), R_NamesSymbol);
75+
if (xlength(cn)) SET_VECTOR_ELT(cnl, i, coerceUtf8IfNeeded(cn));
76+
}
77+
const SEXP *cnlp = SEXPPTR_RO(cnl);
7178
// here we proceed as if fill=true for brevity (accounting for dups is tricky) and then catch any missings after this branch
7279
// when use.names==NA we also proceed here as if use.names was TRUE to save new code and then check afterwards the map is 1:ncol for every item
7380
// first find number of unique column names present; i.e. length(unique(unlist(lapply(l,names))))
@@ -79,11 +86,11 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
7986
SEXP li = VECTOR_ELT(l, i);
8087
int thisncol=LENGTH(li);
8188
if (isNull(li) || !LENGTH(li)) continue;
82-
const SEXP cn = getAttrib(li, R_NamesSymbol);
89+
const SEXP cn = cnlp[i];
8390
if (!length(cn)) continue;
8491
const SEXP *cnp = STRING_PTR_RO(cn);
8592
for (int j=0; j<thisncol; j++) {
86-
SEXP s = ENC2UTF8(cnp[j]); // convert different encodings for use.names #5452
93+
SEXP s = cnp[j]; // convert different encodings for use.names #5452
8794
if (hash_lookup(marks, s, 0)<0) continue; // seen this name before
8895
nuniq++;
8996
hash_set(marks, s,-nuniq);
@@ -104,12 +111,12 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
104111
SEXP li = VECTOR_ELT(l, i);
105112
int thisncol=length(li);
106113
if (thisncol==0) continue;
107-
const SEXP cn = getAttrib(li, R_NamesSymbol);
114+
const SEXP cn = cnlp[i];
108115
if (!length(cn)) continue;
109116
const SEXP *cnp = STRING_PTR_RO(cn);
110117
memset(counts, 0, nuniq*sizeof(*counts));
111118
for (int j=0; j<thisncol; j++) {
112-
SEXP s = ENC2UTF8(cnp[j]); // convert different encodings for use.names #5452
119+
SEXP s = cnp[j]; // convert different encodings for use.names #5452
113120
counts[ -hash_lookup(marks, s, 0)-1 ]++;
114121
}
115122
for (int u=0; u<nuniq; u++) {
@@ -141,14 +148,14 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
141148
SEXP li = VECTOR_ELT(l, i);
142149
int thisncol=length(li);
143150
if (thisncol==0) continue;
144-
const SEXP cn = getAttrib(li, R_NamesSymbol);
151+
const SEXP cn = cnlp[i];
145152
if (!length(cn)) {
146153
for (int j=0; j<thisncol; j++) colMapRaw[i*ncol + j] = j;
147154
} else {
148155
const SEXP *cnp = STRING_PTR_RO(cn);
149156
memset(counts, 0, nuniq*sizeof(*counts));
150157
for (int j=0; j<thisncol; j++) {
151-
SEXP s = ENC2UTF8(cnp[j]); // convert different encodings for use.names #5452
158+
SEXP s = cnp[j]; // convert different encodings for use.names #5452
152159
int w = -hash_lookup(marks, s, 0)-1;
153160
int wi = counts[w]++; // how many dups have we seen before of this name within this item
154161
if (uniqMap[w]==-1) {
@@ -167,8 +174,8 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
167174
}
168175
}
169176
free(counts); free(uniqMap); free(dupLink); // all local scope so no need to set to NULL
177+
UNPROTECT(2); // cnl, marks
170178

171-
UNPROTECT(1); // marks
172179
// colMapRaw is still allocated. It was allocated with malloc because we needed to catch if the alloc failed.
173180
// move it to R's heap so it gets automatically free'd on exit, and on any error between now and the end of rbindlist.
174181
colMap = (int *)R_alloc(LENGTH(l)*ncol, sizeof(*colMap));

0 commit comments

Comments
 (0)