Skip to content

Commit 86511ab

Browse files
committed
rbindlist(): replace 2/2 TRUELENGTH with hashing
This may likely require a dynamically growing hash of TRUELENGTHs instead of the current pre-allocation approach with a very conservative over-estimate.
1 parent b700bf6 commit 86511ab

File tree

2 files changed

+27
-23
lines changed

2 files changed

+27
-23
lines changed

src/hash.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,15 +59,15 @@ void hash_set(hashtab * h, SEXP key, R_xlen_t value) {
5959
return;
6060
} else if (!cell->key) {
6161
if (!h->free) internal_error(
62-
"hash_insert", "no free slots left (size=%zu after the load factor)", h->size
62+
"hash_insert", "no free slots left (full size=%zu)", h->size
6363
);
6464
--h->free;
6565
*cell = (struct hash_pair){.key = key, .value = value};
6666
return;
6767
}
6868
}
6969
internal_error(
70-
"hash_insert", "did not find a free slot for key %p despite size=%zu, free=%zu",
70+
"hash_insert", "did not find a free slot for key %p; size=%zu, free=%zu",
7171
(void*)key, h->size, h->free
7272
);
7373
}

src/rbindlist.c

Lines changed: 25 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -345,7 +345,6 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
345345
if (factor && anyNotStringOrFactor) {
346346
// in future warn, or use list column instead ... warning(_("Column %d contains a factor but not all items for the column are character or factor"), idcol+j+1);
347347
// some coercing from (likely) integer/numeric to character will be needed. But this coerce can feasibly fail with out-of-memory, so we have to do it up-front
348-
// before the savetl_init() because we have no hook to clean up tl if coerceVector fails.
349348
if (coercedForFactor==NULL) { coercedForFactor=PROTECT(allocVector(VECSXP, LENGTH(l))); nprotect++; }
350349
for (int i=0; i<LENGTH(l); ++i) {
351350
SEXP li = VECTOR_ELT(l, i);
@@ -360,7 +359,18 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
360359
int ansloc=0;
361360
if (factor) {
362361
char warnStr[1000] = "";
363-
savetl_init(); // no error from now (or warning given options(warn=2)) until savetl_end
362+
// FIXME: this is a very conservative estimate of the number of distinct elements.
363+
// Will probably need a dynamically growing hash instead.
364+
R_xlen_t hl = longestLen > 0 ? longestLen : 0;
365+
for (R_xlen_t i = 0; i < xlength(l); ++i) {
366+
SEXP li = VECTOR_ELT(l, i);
367+
for (R_xlen_t w = 0; w < xlength(li); ++w) {
368+
SEXP thisCol = VECTOR_ELT(li, w);
369+
SEXP thisColStr = isFactor(thisCol) ? getAttrib(thisCol, R_LevelsSymbol) : thisCol;
370+
hl += xlength(thisColStr);
371+
}
372+
}
373+
hashtab * marks = hash_create(hl);
364374
int nLevel=0, allocLevel=0;
365375
SEXP *levelsRaw = NULL; // growing list of SEXP pointers. Raw since managed with raw realloc.
366376
if (orderedFactor) {
@@ -375,14 +385,12 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
375385
nLevel = allocLevel = longestLen;
376386
levelsRaw = (SEXP *)malloc(nLevel * sizeof(SEXP));
377387
if (!levelsRaw) {
378-
savetl_end(); // # nocov
379388
error(_("Failed to allocate working memory for %d ordered factor levels of result column %d"), nLevel, idcol+j+1); // # nocov
380389
}
381390
for (int k=0; k<longestLen; ++k) {
382391
SEXP s = sd[k];
383-
if (TRUELENGTH(s)>0) savetl(s);
384392
levelsRaw[k] = s;
385-
SET_TRUELENGTH(s,-k-1);
393+
hash_set(marks, s, -k-1);
386394
}
387395
for (int i=0; i<LENGTH(l); ++i) {
388396
SEXP li = VECTOR_ELT(l, i);
@@ -395,7 +403,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
395403
const int n = length(levels);
396404
for (int k=0, last=0; k<n; ++k) {
397405
SEXP s = levelsD[k];
398-
const int tl = TRUELENGTH(s);
406+
const int tl = hash_lookup(marks, s, 0);
399407
if (tl>=last) { // if tl>=0 then also tl>=last because last<=0
400408
if (tl>=0) {
401409
snprintf(warnStr, 1000, // not direct warning as we're inside tl region
@@ -434,8 +442,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
434442
for (int k=0; k<n; ++k) {
435443
SEXP s = thisColStrD[k];
436444
if (s==NA_STRING || // remove NA from levels; test 1979 found by package emil when revdep testing 1.12.2 (#3473)
437-
TRUELENGTH(s)<0) continue; // seen this level before; handles removing dups from levels as well as finding unique of character columns
438-
if (TRUELENGTH(s)>0) savetl(s);
445+
hash_lookup(marks, s, 0)<0) continue; // seen this level before; handles removing dups from levels as well as finding unique of character columns
439446
if (allocLevel==nLevel) { // including initial time when allocLevel==nLevel==0
440447
SEXP *tt = NULL;
441448
if (allocLevel<INT_MAX) {
@@ -446,15 +453,14 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
446453
if (tt==NULL) {
447454
// # nocov start
448455
// C spec states that if realloc() fails (above) the original block (levelsRaw) is left untouched: it is not freed or moved. We ...
449-
for (int k=0; k<nLevel; k++) SET_TRUELENGTH(levelsRaw[k], 0); // ... rely on that in this loop which uses levelsRaw.
456+
for (int k=0; k<nLevel; k++) hash_set(marks, levelsRaw[k], 0); // ... rely on that in this loop which uses levelsRaw.
450457
free(levelsRaw);
451-
savetl_end();
452458
error(_("Failed to allocate working memory for %d factor levels of result column %d when reading item %d of item %d"), allocLevel, idcol+j+1, w+1, i+1);
453459
// # nocov end
454460
}
455461
levelsRaw = tt;
456462
}
457-
SET_TRUELENGTH(s,-(++nLevel));
463+
hash_set(marks,s,-(++nLevel));
458464
levelsRaw[nLevel-1] = s;
459465
}
460466
int *targetd = INTEGER(target);
@@ -463,7 +469,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
463469
if (length(thisCol)<=1) {
464470
// recycle length-1, or NA-fill length-0
465471
SEXP lev;
466-
const int val = (length(thisCol)==1 && id[0]!=NA_INTEGER && (lev=thisColStrD[id[0]-1])!=NA_STRING) ? -TRUELENGTH(lev) : NA_INTEGER;
472+
const int val = (length(thisCol)==1 && id[0]!=NA_INTEGER && (lev=thisColStrD[id[0]-1])!=NA_STRING) ? -hash_lookup(marks,lev,0) : NA_INTEGER;
467473
// ^^ #3915 and tests 2015.2-5
468474
for (int r=0; r<thisnrow; ++r) targetd[ansloc+r] = val;
469475
} else {
@@ -474,22 +480,22 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
474480
// retain the position of NA level (if any) and the integer mappings to it
475481
for (int k=0; k<n; ++k) {
476482
SEXP s = thisColStrD[k];
477-
if (s!=NA_STRING && -TRUELENGTH(s)!=k+1) { hop=true; break; }
483+
if (s!=NA_STRING && -hash_lookup(marks,s,0)!=k+1) { hop=true; break; }
478484
}
479485
} else {
480486
for (int k=0; k<n; ++k) {
481487
SEXP s = thisColStrD[k];
482-
if (s==NA_STRING || -TRUELENGTH(s)!=k+1) { hop=true; break; }
488+
if (s==NA_STRING || -hash_lookup(marks,s,0)!=k+1) { hop=true; break; }
483489
}
484490
}
485491
if (hop) {
486492
if (orderedFactor) {
487493
for (int r=0; r<thisnrow; ++r)
488-
targetd[ansloc+r] = id[r]==NA_INTEGER ? NA_INTEGER : -TRUELENGTH(thisColStrD[id[r]-1]);
494+
targetd[ansloc+r] = id[r]==NA_INTEGER ? NA_INTEGER : -hash_lookup(marks,thisColStrD[id[r]-1],0);
489495
} else {
490496
for (int r=0; r<thisnrow; ++r) {
491497
SEXP lev;
492-
targetd[ansloc+r] = id[r]==NA_INTEGER || (lev=thisColStrD[id[r]-1])==NA_STRING ? NA_INTEGER : -TRUELENGTH(lev);
498+
targetd[ansloc+r] = id[r]==NA_INTEGER || (lev=thisColStrD[id[r]-1])==NA_STRING ? NA_INTEGER : -hash_lookup(marks,lev,0);
493499
}
494500
}
495501
} else {
@@ -499,18 +505,16 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
499505
} else {
500506
const SEXP *sd = STRING_PTR_RO(thisColStr);
501507
if (length(thisCol)<=1) {
502-
const int val = (length(thisCol)==1 && sd[0]!=NA_STRING) ? -TRUELENGTH(sd[0]) : NA_INTEGER;
508+
const int val = (length(thisCol)==1 && sd[0]!=NA_STRING) ? -hash_lookup(marks,sd[0],0) : NA_INTEGER;
503509
for (int r=0; r<thisnrow; ++r) targetd[ansloc+r] = val;
504510
} else {
505-
for (int r=0; r<thisnrow; ++r) targetd[ansloc+r] = sd[r]==NA_STRING ? NA_INTEGER : -TRUELENGTH(sd[r]);
511+
for (int r=0; r<thisnrow; ++r) targetd[ansloc+r] = sd[r]==NA_STRING ? NA_INTEGER : -hash_lookup(marks,sd[r],0);
506512
}
507513
}
508514
}
509515
ansloc += thisnrow;
510516
}
511-
for (int k=0; k<nLevel; ++k) SET_TRUELENGTH(levelsRaw[k], 0);
512-
savetl_end();
513-
if (warnStr[0]) warning("%s", warnStr); // now savetl_end() has happened it's safe to call warning (could error if options(warn=2))
517+
if (warnStr[0]) warning("%s", warnStr); // previously had to wait until savetl_end() for it to be safe to call warning (could error if options(warn=2))
514518
SEXP levelsSxp;
515519
setAttrib(target, R_LevelsSymbol, levelsSxp=allocVector(STRSXP, nLevel));
516520
for (int k=0; k<nLevel; ++k) SET_STRING_ELT(levelsSxp, k, levelsRaw[k]);

0 commit comments

Comments
 (0)