Skip to content

Commit b700bf6

Browse files
committed
rbindlist(): replace 1/2 TRUELENGTH with hashing
Also avoid crashing when creating a 0-size hash.
1 parent 3846cc9 commit b700bf6

File tree

2 files changed

+10
-13
lines changed

2 files changed

+10
-13
lines changed

src/hash.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,8 @@ hashtab * hash_create_(size_t n, double load_factor) {
2727
// precondition: sizeof hashtab + hash_pair[n_full] < SIZE_MAX
2828
// n_full * sizeof hash_pair < SIZE_MAX - sizeof hashtab
2929
// sizeof hash_pair < (SIZE_MAX - sizeof hashtab) / n_full
30-
if (sizeof(struct hash_pair) >= (SIZE_MAX - sizeof(hashtab)) / n_full) internal_error(
30+
// (note that sometimes n is 0)
31+
if (n_full && sizeof(struct hash_pair) >= (SIZE_MAX - sizeof(hashtab)) / n_full) internal_error(
3132
"hash_create", "n=%zu with load_factor=%g would overflow total allocation size",
3233
n, load_factor
3334
);

src/rbindlist.c

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,10 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
7474
SEXP *uniq = (SEXP *)malloc(upperBoundUniqueNames * sizeof(SEXP)); // upperBoundUniqueNames was initialized with 1 to ensure this is defined (otherwise 0 when no item has names)
7575
if (!uniq)
7676
error(_("Failed to allocate upper bound of %"PRId64" unique column names [sum(lapply(l,ncol))]"), (int64_t)upperBoundUniqueNames); // # nocov
77-
savetl_init();
77+
R_xlen_t lh = 0;
78+
for (R_xlen_t i=0; i<xlength(l); i++)
79+
lh += xlength(getAttrib(VECTOR_ELT(l, i), R_NamesSymbol));
80+
hashtab * marks = hash_create(lh);
7881
int nuniq=0;
7982
// first pass - gather unique column names
8083
for (int i=0; i<LENGTH(l); i++) {
@@ -86,10 +89,9 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
8689
const SEXP *cnp = STRING_PTR_RO(cn);
8790
for (int j=0; j<thisncol; j++) {
8891
SEXP s = ENC2UTF8(cnp[j]); // convert different encodings for use.names #5452
89-
if (TRUELENGTH(s)<0) continue; // seen this name before
90-
if (TRUELENGTH(s)>0) savetl(s);
92+
if (hash_lookup(marks, s, 0)<0) continue; // seen this name before
9193
uniq[nuniq++] = s;
92-
SET_TRUELENGTH(s,-nuniq);
94+
hash_set(marks, s,-nuniq);
9395
}
9496
}
9597
if (nuniq>0) uniq = realloc(uniq, nuniq*sizeof(SEXP)); // shrink to only what we need to release the spare
@@ -99,9 +101,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
99101
int *maxdup = (int *)calloc(nuniq, sizeof(int)); // the most number of dups for any name within one colname vector
100102
if (!counts || !maxdup) {
101103
// # nocov start
102-
for (int i=0; i<nuniq; ++i) SET_TRUELENGTH(uniq[i], 0);
103104
free(uniq); free(counts); free(maxdup);
104-
savetl_end();
105105
error(_("Failed to allocate nuniq=%d items working memory in rbindlist.c"), nuniq);
106106
// # nocov end
107107
}
@@ -116,7 +116,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
116116
memset(counts, 0, nuniq*sizeof(int));
117117
for (int j=0; j<thisncol; j++) {
118118
SEXP s = ENC2UTF8(cnp[j]); // convert different encodings for use.names #5452
119-
counts[ -TRUELENGTH(s)-1 ]++;
119+
counts[ -hash_lookup(marks, s, 0)-1 ]++;
120120
}
121121
for (int u=0; u<nuniq; u++) {
122122
if (counts[u] > maxdup[u]) maxdup[u] = counts[u];
@@ -134,9 +134,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
134134
int *dupLink = (int *)malloc(ncol * sizeof(int)); // if a colname has occurred before (a dup) links from the 1st to the 2nd time in the final result, 2nd to 3rd, etc
135135
if (!colMapRaw || !uniqMap || !dupLink) {
136136
// # nocov start
137-
for (int i=0; i<nuniq; ++i) SET_TRUELENGTH(uniq[i], 0);
138137
free(uniq); free(counts); free(colMapRaw); free(uniqMap); free(dupLink);
139-
savetl_end();
140138
error(_("Failed to allocate ncol=%d items working memory in rbindlist.c"), ncol);
141139
// # nocov end
142140
}
@@ -157,7 +155,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
157155
memset(counts, 0, nuniq*sizeof(int));
158156
for (int j=0; j<thisncol; j++) {
159157
SEXP s = ENC2UTF8(cnp[j]); // convert different encodings for use.names #5452
160-
int w = -TRUELENGTH(s)-1;
158+
int w = -hash_lookup(marks, s, 0)-1;
161159
int wi = counts[w]++; // how many dups have we seen before of this name within this item
162160
if (uniqMap[w]==-1) {
163161
// first time seen this name across all items
@@ -174,9 +172,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
174172
}
175173
}
176174
}
177-
for (int i=0; i<nuniq; ++i) SET_TRUELENGTH(uniq[i], 0); // zero out our usage of tl
178175
free(uniq); free(counts); free(uniqMap); free(dupLink); // all local scope so no need to set to NULL
179-
savetl_end(); // restore R's usage
180176

181177
// colMapRaw is still allocated. It was allocated with malloc because we needed to catch if the alloc failed.
182178
// move it to R's heap so it gets automatically free'd on exit, and on any error between now and the end of rbindlist.

0 commit comments

Comments
 (0)