Skip to content

Commit 2c10c37

Browse files
committed
assign: factor out index fixup
Instead of walking the attribute list directly, use R_mapAttrib(). Create a hash table of index names instead of relying on chin() and a temporary string vector. Move all temporary allocations onto the R heap.
1 parent 3c66757 commit 2c10c37

File tree

1 file changed

+106
-89
lines changed

1 file changed

+106
-89
lines changed

src/assign.c

Lines changed: 106 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,99 @@ SEXP selfrefokwrapper(SEXP x, SEXP verbose) {
256256
return ScalarInteger(_selfrefok(x,FALSE,LOGICAL(verbose)[0]));
257257
}
258258

259+
struct attrib_name_ctx {
260+
hashtab *indexNames;
261+
SEXP index, assignedNames;
262+
R_xlen_t len;
263+
bool verbose;
264+
};
265+
static SEXP getOneAttribName(SEXP key, SEXP val, void *ctx_) {
266+
(void)val;
267+
struct attrib_name_ctx *ctx = ctx_;
268+
if (ctx->indexNames)
269+
hash_set(ctx->indexNames, PRINTNAME(key), 1);
270+
else
271+
ctx->len++;
272+
return NULL;
273+
}
274+
275+
static SEXP fixIndexAttrib(SEXP tag, SEXP value, void *ctx_) {
276+
const struct attrib_name_ctx *ctx = ctx_;
277+
278+
hashtab *indexNames = ctx->indexNames;
279+
SEXP index = ctx->index, assignedNames = ctx->assignedNames;
280+
R_xlen_t indexLength = xlength(value);
281+
bool verbose = ctx->verbose;
282+
283+
const char *tc1, *c1;
284+
tc1 = c1 = CHAR(PRINTNAME(tag)); // the index name; e.g. "__col1__col2"
285+
286+
if (*tc1!='_' || *(tc1+1)!='_') {
287+
// fix for #1396
288+
if (verbose) {
289+
Rprintf(_("Dropping index '%s' as it doesn't have '__' at the beginning of its name. It was very likely created by v1.9.4 of data.table.\n"), tc1);
290+
}
291+
setAttrib(index, tag, R_NilValue);
292+
return NULL;
293+
}
294+
295+
tc1 += 2; // tc1 always marks the start of a key column
296+
if (!*tc1) internal_error(__func__, "index name ends with trailing __"); // # nocov
297+
298+
void *vmax = vmaxget();
299+
// check the position of the first appearance of an assigned column in the index.
300+
// the new index will be truncated to this position.
301+
size_t newKeyLength = strlen(c1);
302+
char *s4 = R_alloc(newKeyLength + 3, 1);
303+
memcpy(s4, c1, newKeyLength);
304+
memcpy(s4 + newKeyLength, "__", 3);
305+
306+
for(int i = 0; i < xlength(assignedNames); i++){
307+
const char *tc2 = CHAR(STRING_ELT(assignedNames, i));
308+
void *vmax2 = vmaxget();
309+
size_t tc2_len = strlen(tc2);
310+
char *s5 = R_alloc(tc2_len + 5, 1); //4 * '_' + \0
311+
memcpy(s5, "__", 2);
312+
memcpy(s5 + 2, tc2, tc2_len);
313+
memcpy(s5 + 2 + tc2_len, "__", 3);
314+
tc2 = strstr(s4, s5);
315+
if(tc2 && (tc2 - s4 < newKeyLength)){ // new column is part of key; match is before last match
316+
newKeyLength = tc2 - s4;
317+
}
318+
vmaxset(vmax2);
319+
}
320+
321+
s4[newKeyLength] = '\0'; // truncate the new key to the new length
322+
if(newKeyLength == 0){ // no valid key column remains. Drop the key
323+
setAttrib(index, tag, R_NilValue);
324+
hash_set(indexNames, PRINTNAME(tag), 0);
325+
if (verbose) {
326+
Rprintf(_("Dropping index '%s' due to an update on a key column\n"), c1+2);
327+
}
328+
} else if(newKeyLength < strlen(c1)) {
329+
SEXP s4Str = PROTECT(mkChar(s4));
330+
if(indexLength == 0 && // shortened index can be kept since it is just information on the order (see #2372)
331+
!hash_lookup(indexNames, s4Str, 0)) { // index with shortened name not present yet
332+
setAttrib(index, installChar(s4Str), value);
333+
hash_set(indexNames, PRINTNAME(tag), 0);
334+
setAttrib(index, tag, R_NilValue);
335+
hash_set(indexNames, s4Str, 1);
336+
if (verbose)
337+
Rprintf(_("Shortening index '%s' to '%s' due to an update on a key column\n"), c1+2, s4+2);
338+
} else { // indexLength > 0 || shortened name present already
339+
// indexLength > 0 indicates reordering. Drop it to avoid spurious reordering in non-indexed columns (#2372)
340+
// shortened name already present indicates that index needs to be dropped to avoid duplicate indices.
341+
setAttrib(index, tag, R_NilValue);
342+
hash_set(indexNames, tag, 0);
343+
if (verbose)
344+
Rprintf(_("Dropping index '%s' due to an update on a key column\n"), c1+2);
345+
}
346+
UNPROTECT(1); // s4Str
347+
} //else: index is not affected by assign: nothing to be done
348+
vmaxset(vmax);
349+
return NULL;
350+
}
351+
259352
int *_Last_updated = NULL;
260353

261354
SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values)
@@ -265,11 +358,11 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values)
265358
// cols : column names or numbers corresponding to the values to set
266359
// rows : row numbers to assign
267360
R_len_t numToDo, targetlen, vlen, oldncol, oldtncol, coln, protecti=0, newcolnum, indexLength;
268-
SEXP targetcol, nullint, s, colnam, tmp, key, index, a, assignedNames, indexNames;
361+
SEXP targetcol, nullint, s, colnam, tmp, key, index, a, assignedNames;
269362
bool verbose=GetVerbose();
270363
int ndelete=0; // how many columns are being deleted
271364
const char *c1, *tc1, *tc2;
272-
int *buf, indexNo;
365+
int *buf;
273366
if (isNull(dt)) error(_("assign has been passed a NULL dt"));
274367
if (TYPEOF(dt) != VECSXP) error(_("dt passed to assign isn't type VECSXP"));
275368
if (islocked(dt))
@@ -549,93 +642,17 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values)
549642
}
550643
index = getAttrib(dt, install("index"));
551644
if (index != R_NilValue) {
552-
s = ATTRIB(index);
553-
indexNo = 0;
554-
// get a vector with all index names
555-
PROTECT(indexNames = allocVector(STRSXP, xlength(s))); protecti++;
556-
while(s != R_NilValue){
557-
SET_STRING_ELT(indexNames, indexNo, PRINTNAME(TAG(s)));
558-
indexNo++;
559-
s = CDR(s);
560-
}
561-
s = ATTRIB(index); // reset to first element
562-
indexNo = 0;
563-
while(s != R_NilValue) {
564-
a = TAG(s);
565-
indexLength = xlength(CAR(s));
566-
tc1 = c1 = CHAR(PRINTNAME(a)); // the index name; e.g. "__col1__col2"
567-
if (*tc1!='_' || *(tc1+1)!='_') {
568-
// fix for #1396
569-
if (verbose) {
570-
Rprintf(_("Dropping index '%s' as it doesn't have '__' at the beginning of its name. It was very likely created by v1.9.4 of data.table.\n"), tc1);
571-
}
572-
setAttrib(index, a, R_NilValue);
573-
indexNo++;
574-
s = CDR(s);
575-
continue; // with next index
576-
}
577-
tc1 += 2; // tc1 always marks the start of a key column
578-
if (!*tc1) internal_error(__func__, "index name ends with trailing __"); // # nocov
579-
// check the position of the first appearance of an assigned column in the index.
580-
// the new index will be truncated to this position.
581-
char *s4 = malloc(strlen(c1) + 3);
582-
if (!s4) {
583-
internal_error(__func__, "Couldn't allocate memory for s4"); // # nocov
584-
}
585-
memcpy(s4, c1, strlen(c1));
586-
memset(s4 + strlen(c1), '\0', 1);
587-
strcat(s4, "__"); // add trailing '__' to newKey so we can search for pattern '__colName__' also at the end of the index.
588-
int newKeyLength = strlen(c1);
589-
for(int i = 0; i < xlength(assignedNames); i++){
590-
tc2 = CHAR(STRING_ELT(assignedNames, i));
591-
char *s5 = malloc(strlen(tc2) + 5); //4 * '_' + \0
592-
if (!s5) {
593-
free(s4); // # nocov
594-
internal_error(__func__, "Couldn't allocate memory for s5"); // # nocov
595-
}
596-
memset(s5, '_', 2);
597-
memset(s5 + 2, '\0', 1);
598-
strcat(s5, tc2);
599-
strcat(s5, "__");
600-
tc2 = strstr(s4, s5);
601-
if(tc2 == NULL){ // column is not part of key
602-
free(s5);
603-
continue;
604-
}
605-
if(tc2 - s4 < newKeyLength){ // new column match is before last match
606-
newKeyLength = tc2 - s4;
607-
}
608-
free(s5);
609-
}
610-
memset(s4 + newKeyLength, '\0', 1); // truncate the new key to the new length
611-
if(newKeyLength == 0){ // no valid key column remains. Drop the key
612-
setAttrib(index, a, R_NilValue);
613-
SET_STRING_ELT(indexNames, indexNo, NA_STRING);
614-
if (verbose) {
615-
Rprintf(_("Dropping index '%s' due to an update on a key column\n"), c1+2);
616-
}
617-
} else if(newKeyLength < strlen(c1)) {
618-
SEXP s4Str = PROTECT(mkString(s4));
619-
if(indexLength == 0 && // shortened index can be kept since it is just information on the order (see #2372)
620-
LOGICAL(chin(s4Str, indexNames))[0] == 0) {// index with shortened name not present yet
621-
SET_TAG(s, install(s4));
622-
SET_STRING_ELT(indexNames, indexNo, mkChar(s4));
623-
if (verbose)
624-
Rprintf(_("Shortening index '%s' to '%s' due to an update on a key column\n"), c1+2, s4 + 2);
625-
} else { // indexLength > 0 || shortened name present already
626-
// indexLength > 0 indicates reordering. Drop it to avoid spurious reordering in non-indexed columns (#2372)
627-
// shortened name already present indicates that index needs to be dropped to avoid duplicate indices.
628-
setAttrib(index, a, R_NilValue);
629-
SET_STRING_ELT(indexNames, indexNo, NA_STRING);
630-
if (verbose)
631-
Rprintf(_("Dropping index '%s' due to an update on a key column\n"), c1+2);
632-
}
633-
UNPROTECT(1); // s4Str
634-
} //else: index is not affected by assign: nothing to be done
635-
free(s4);
636-
indexNo ++;
637-
s = CDR(s);
638-
}
645+
struct attrib_name_ctx ctx = { 0, };
646+
R_mapAttrib(index, getOneAttribName, &ctx); // how many attributes?
647+
hashtab *h = hash_create(ctx.len);
648+
PROTECT(h->prot);
649+
ctx.indexNames = h;
650+
R_mapAttrib(index, getOneAttribName, &ctx); // now remember the names
651+
ctx.index = index;
652+
ctx.assignedNames = assignedNames;
653+
ctx.verbose = verbose;
654+
R_mapAttrib(index, fixIndexAttrib, &ctx); // adjust indices as needed
655+
UNPROTECT(1); // h
639656
}
640657
if (ndelete) {
641658
// delete any columns assigned NULL (there was a 'continue' earlier in loop above)

0 commit comments

Comments
 (0)