Skip to content

Commit 5944324

Browse files
committed
rbindlist(): pre-convert names to UTF-8
Fixes: #7452
1 parent 7083a23 commit 5944324

File tree

1 file changed

+14
-6
lines changed

1 file changed

+14
-6
lines changed

src/rbindlist.c

Lines changed: 14 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -68,6 +68,13 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
6868

6969
int *colMap=NULL; // maps each column in final result to the column of each list item
7070
if (usenames==TRUE || usenames==NA_LOGICAL) {
71+
// zeroth pass - convert all names to UTF-8
72+
SEXP cnl = PROTECT(allocVector(VECSXP, XLENGTH(l)));
73+
for (R_xlen_t i = 0; i < XLENGTH(l); ++i) {
74+
const SEXP cn = getAttrib(VECTOR_ELT(l, i), R_NamesSymbol);
75+
if (xlength(cn)) SET_VECTOR_ELT(cnl, i, coerceUtf8IfNeeded(cn));
76+
}
77+
const SEXP *cnlp = SEXPPTR_RO(cnl);
7178
// here we proceed as if fill=true for brevity (accounting for dups is tricky) and then catch any missings after this branch
7279
// when use.names==NA we also proceed here as if use.names was TRUE to save new code and then check afterwards the map is 1:ncol for every item
7380
// first find number of unique column names present; i.e. length(unique(unlist(lapply(l,names))))
@@ -81,11 +88,11 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
8188
SEXP li = VECTOR_ELT(l, i);
8289
int thisncol=LENGTH(li);
8390
if (isNull(li) || !LENGTH(li)) continue;
84-
const SEXP cn = getAttrib(li, R_NamesSymbol);
91+
const SEXP cn = cnlp[i];
8592
if (!length(cn)) continue;
8693
const SEXP *cnp = STRING_PTR_RO(cn);
8794
for (int j=0; j<thisncol; j++) {
88-
SEXP s = ENC2UTF8(cnp[j]); // convert different encodings for use.names #5452
95+
SEXP s = cnp[j]; // already UTF-8: names were converted in the zeroth pass above so different encodings match for use.names #5452
8996
if (TRUELENGTH(s)<0) continue; // seen this name before
9097
if (TRUELENGTH(s)>0) savetl(s);
9198
uniq[nuniq++] = s;
@@ -110,12 +117,12 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
110117
SEXP li = VECTOR_ELT(l, i);
111118
int thisncol=length(li);
112119
if (thisncol==0) continue;
113-
const SEXP cn = getAttrib(li, R_NamesSymbol);
120+
const SEXP cn = cnlp[i];
114121
if (!length(cn)) continue;
115122
const SEXP *cnp = STRING_PTR_RO(cn);
116123
memset(counts, 0, nuniq*sizeof(*counts));
117124
for (int j=0; j<thisncol; j++) {
118-
SEXP s = ENC2UTF8(cnp[j]); // convert different encodings for use.names #5452
125+
SEXP s = cnp[j]; // already UTF-8: names were converted in the zeroth pass so different encodings match for use.names #5452
119126
counts[ -TRUELENGTH(s)-1 ]++;
120127
}
121128
for (int u=0; u<nuniq; u++) {
@@ -149,14 +156,14 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
149156
SEXP li = VECTOR_ELT(l, i);
150157
int thisncol=length(li);
151158
if (thisncol==0) continue;
152-
const SEXP cn = getAttrib(li, R_NamesSymbol);
159+
const SEXP cn = cnlp[i];
153160
if (!length(cn)) {
154161
for (int j=0; j<thisncol; j++) colMapRaw[i*ncol + j] = j;
155162
} else {
156163
const SEXP *cnp = STRING_PTR_RO(cn);
157164
memset(counts, 0, nuniq*sizeof(*counts));
158165
for (int j=0; j<thisncol; j++) {
159-
SEXP s = ENC2UTF8(cnp[j]); // convert different encodings for use.names #5452
166+
SEXP s = cnp[j]; // already UTF-8: names were converted in the zeroth pass so different encodings match for use.names #5452
160167
int w = -TRUELENGTH(s)-1;
161168
int wi = counts[w]++; // how many dups have we seen before of this name within this item
162169
if (uniqMap[w]==-1) {
@@ -177,6 +184,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
177184
for (int i=0; i<nuniq; ++i) SET_TRUELENGTH(uniq[i], 0); // zero out our usage of tl
178185
free(uniq); free(counts); free(uniqMap); free(dupLink); // all local scope so no need to set to NULL
179186
savetl_end(); // restore R's usage
187+
UNPROTECT(1);
180188

181189
// colMapRaw is still allocated. It was allocated with malloc because we needed to catch if the alloc failed.
182190
// move it to R's heap so it gets automatically free'd on exit, and on any error between now and the end of rbindlist.

0 commit comments

Comments
 (0)