Skip to content

Commit 4d30c7f

Browse files
aitapHughParsonageben-schwenMichaelChirico
authored
Replace ATTRIB, SET_ATTRIB (#7487)
* frev: drop SET_ATTRIB Instead, backport and use CLEAR_ATTRIB (R >= 4.5). * mergeIndexAttrib: drop SET_ATTRIB Use SHALLOW_DUPLICATE_ATTRIB (R >= 3.3) for the simple case. Also, Backport ANY_ATTRIB (R >= 4.5) instead of testing !isNull(ATTRIB(.)). * cbindlist: use ANY_ATTRIB * nafillR: use ANY_ATTRIB * Backport R_mapAttrib * anySpecialStatic: switch to R_mapAttrib * dogroups: construct rownames anew Instead of trying to walk ATTRIB in search of the compact 'rownames' attribute to modify, install it anew, take note of the returned reference to the value being installed (a different one!) and modify that. * mergeIndexAttrib: switch to R_mapAttrib * assign: factor out index fixup Instead of walking the attribute list directly, use R_mapAttrib(). Create a hash table of index names instead of relying on chin() and a temporary string vector. Move all temporary allocations onto the R heap. * assign: drop indexLength * assign: fix index unmarking * Comments, better field names * Update src/dogroups.c Co-authored-by: Benjamin Schwendinger <[email protected]> * mapAttrib: protect the attribute value Otherwise the callback could remove the attribute and end up with the value unprotected. Protect the attribute tag as well for uniformity. Co-Authored-By: HughParsonage <[email protected]> * dogroups: look up rownames using mapAttrib This solution is closer to the working approach previously taken by the code. * Fix comment, function name * Protect the newly found rownames attribute * add NEWS entry --------- Co-authored-by: HughParsonage <[email protected]> Co-authored-by: Benjamin Schwendinger <[email protected]> Co-authored-by: Michael Chirico <[email protected]> Co-authored-by: Michael Chirico <[email protected]>
1 parent bf91b3a commit 4d30c7f

File tree

7 files changed

+174
-110
lines changed

7 files changed

+174
-110
lines changed

NEWS.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626

2727
3. Vignettes are now built using `litedown` instead of `knitr`, [#6394](https://github.com/Rdatatable/data.table/issues/6394). Thanks @jangorecki for the suggestion and @ben-schwen and @aitap for the implementation.
2828

29+
4. Removed use of non-API macros `ATTRIB`, `SET_ATTRIB`, [#6180](https://github.com/Rdatatable/data.table/issues/6180). Thanks @aitap for the continued assiduous work here.
30+
2931
### BUG FIXES
3032

3133
1. `fread()` with `skip=0` and `(header=TRUE|FALSE)` no longer skips the first row when it has fewer fields than subsequent rows, [#7463](https://github.com/Rdatatable/data.table/issues/7463). Thanks @emayerhofer for the report and @ben-schwen for the fix.

src/assign.c

Lines changed: 111 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,103 @@ SEXP selfrefokwrapper(SEXP x, SEXP verbose) {
256256
return ScalarInteger(_selfrefok(x,FALSE,LOGICAL(verbose)[0]));
257257
}
258258

259+
struct attrib_name_ctx {
260+
hashtab *indexNames; // stores a 1 for every CHARSXP index name in use, 0 for removed
261+
R_xlen_t indexNamesLen; // how much memory to allocate for the hash?
262+
SEXP index; // attr(DT, "index")
263+
SEXP assignedNames; // STRSXP vector of variable names just assigned
264+
bool verbose;
265+
};
266+
267+
// Mark each CHARSXP attribute name with a 1 inside the hash, or count them to find out the allocation size.
268+
static SEXP getOneAttribName(SEXP key, SEXP val, void *ctx_) {
269+
(void)val;
270+
struct attrib_name_ctx *ctx = ctx_;
271+
if (ctx->indexNames)
272+
hash_set(ctx->indexNames, PRINTNAME(key), 1);
273+
else
274+
ctx->indexNamesLen++;
275+
return NULL;
276+
}
277+
278+
// For a given index, find out if it sorts a column that has just been assigned. If so, shorten the index (if an equivalent one doesn't already exist) or remove it altogether.
279+
static SEXP fixIndexAttrib(SEXP tag, SEXP value, void *ctx_) {
280+
const struct attrib_name_ctx *ctx = ctx_;
281+
282+
hashtab *indexNames = ctx->indexNames;
283+
SEXP index = ctx->index, assignedNames = ctx->assignedNames;
284+
R_xlen_t indexLength = xlength(value);
285+
bool verbose = ctx->verbose;
286+
287+
const char *tc1, *c1;
288+
tc1 = c1 = CHAR(PRINTNAME(tag)); // the index name; e.g. "__col1__col2"
289+
290+
if (*tc1!='_' || *(tc1+1)!='_') {
291+
// fix for #1396
292+
if (verbose) {
293+
Rprintf(_("Dropping index '%s' as it doesn't have '__' at the beginning of its name. It was very likely created by v1.9.4 of data.table.\n"), tc1);
294+
}
295+
setAttrib(index, tag, R_NilValue);
296+
return NULL;
297+
}
298+
299+
tc1 += 2; // tc1 always marks the start of a key column
300+
if (!*tc1) internal_error(__func__, "index name ends with trailing __"); // # nocov
301+
302+
void *vmax = vmaxget();
303+
// check the position of the first appearance of an assigned column in the index.
304+
// the new index will be truncated to this position.
305+
size_t newKeyLength = strlen(c1);
306+
char *s4 = R_alloc(newKeyLength + 3, 1);
307+
memcpy(s4, c1, newKeyLength);
308+
memcpy(s4 + newKeyLength, "__", 3);
309+
310+
for(int i = 0; i < xlength(assignedNames); i++){
311+
const char *tc2 = CHAR(STRING_ELT(assignedNames, i));
312+
void *vmax2 = vmaxget();
313+
size_t tc2_len = strlen(tc2);
314+
char *s5 = R_alloc(tc2_len + 5, 1); //4 * '_' + \0
315+
memcpy(s5, "__", 2);
316+
memcpy(s5 + 2, tc2, tc2_len);
317+
memcpy(s5 + 2 + tc2_len, "__", 3);
318+
tc2 = strstr(s4, s5);
319+
if(tc2 && (tc2 - s4 < newKeyLength)){ // new column is part of key; match is before last match
320+
newKeyLength = tc2 - s4;
321+
}
322+
vmaxset(vmax2);
323+
}
324+
325+
s4[newKeyLength] = '\0'; // truncate the new key to the new length
326+
if(newKeyLength == 0){ // no valid key column remains. Drop the key
327+
setAttrib(index, tag, R_NilValue);
328+
hash_set(indexNames, PRINTNAME(tag), 0);
329+
if (verbose) {
330+
Rprintf(_("Dropping index '%s' due to an update on a key column\n"), c1+2);
331+
}
332+
} else if(newKeyLength < strlen(c1)) {
333+
SEXP s4Str = PROTECT(mkChar(s4));
334+
if(indexLength == 0 && // shortened index can be kept since it is just information on the order (see #2372)
335+
!hash_lookup(indexNames, s4Str, 0)) { // index with shortened name not present yet
336+
setAttrib(index, installChar(s4Str), value);
337+
hash_set(indexNames, PRINTNAME(tag), 0);
338+
setAttrib(index, tag, R_NilValue);
339+
hash_set(indexNames, s4Str, 1);
340+
if (verbose)
341+
Rprintf(_("Shortening index '%s' to '%s' due to an update on a key column\n"), c1+2, s4+2);
342+
} else { // indexLength > 0 || shortened name present already
343+
// indexLength > 0 indicates reordering. Drop it to avoid spurious reordering in non-indexed columns (#2372)
344+
// shortened name already present indicates that index needs to be dropped to avoid duplicate indices.
345+
setAttrib(index, tag, R_NilValue);
346+
hash_set(indexNames, PRINTNAME(tag), 0);
347+
if (verbose)
348+
Rprintf(_("Dropping index '%s' due to an update on a key column\n"), c1+2);
349+
}
350+
UNPROTECT(1); // s4Str
351+
} //else: index is not affected by assign: nothing to be done
352+
vmaxset(vmax);
353+
return NULL;
354+
}
355+
259356
int *_Last_updated = NULL;
260357

261358
SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values)
@@ -264,12 +361,12 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values)
264361
// newcolnames : add these columns (if any)
265362
// cols : column names or numbers corresponding to the values to set
266363
// rows : row numbers to assign
267-
R_len_t numToDo, targetlen, vlen, oldncol, oldtncol, coln, protecti=0, newcolnum, indexLength;
268-
SEXP targetcol, nullint, s, colnam, tmp, key, index, a, assignedNames, indexNames;
364+
R_len_t numToDo, targetlen, vlen, oldncol, oldtncol, coln, protecti=0, newcolnum;
365+
SEXP targetcol, nullint, s, colnam, tmp, key, index, a, assignedNames;
269366
bool verbose=GetVerbose();
270367
int ndelete=0; // how many columns are being deleted
271368
const char *c1, *tc1, *tc2;
272-
int *buf, indexNo;
369+
int *buf;
273370
if (isNull(dt)) error(_("assign has been passed a NULL dt"));
274371
if (TYPEOF(dt) != VECSXP) error(_("dt passed to assign isn't type VECSXP"));
275372
if (islocked(dt))
@@ -549,93 +646,17 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values)
549646
}
550647
index = getAttrib(dt, install("index"));
551648
if (index != R_NilValue) {
552-
s = ATTRIB(index);
553-
indexNo = 0;
554-
// get a vector with all index names
555-
PROTECT(indexNames = allocVector(STRSXP, xlength(s))); protecti++;
556-
while(s != R_NilValue){
557-
SET_STRING_ELT(indexNames, indexNo, PRINTNAME(TAG(s)));
558-
indexNo++;
559-
s = CDR(s);
560-
}
561-
s = ATTRIB(index); // reset to first element
562-
indexNo = 0;
563-
while(s != R_NilValue) {
564-
a = TAG(s);
565-
indexLength = xlength(CAR(s));
566-
tc1 = c1 = CHAR(PRINTNAME(a)); // the index name; e.g. "__col1__col2"
567-
if (*tc1!='_' || *(tc1+1)!='_') {
568-
// fix for #1396
569-
if (verbose) {
570-
Rprintf(_("Dropping index '%s' as it doesn't have '__' at the beginning of its name. It was very likely created by v1.9.4 of data.table.\n"), tc1);
571-
}
572-
setAttrib(index, a, R_NilValue);
573-
indexNo++;
574-
s = CDR(s);
575-
continue; // with next index
576-
}
577-
tc1 += 2; // tc1 always marks the start of a key column
578-
if (!*tc1) internal_error(__func__, "index name ends with trailing __"); // # nocov
579-
// check the position of the first appearance of an assigned column in the index.
580-
// the new index will be truncated to this position.
581-
char *s4 = malloc(strlen(c1) + 3);
582-
if (!s4) {
583-
internal_error(__func__, "Couldn't allocate memory for s4"); // # nocov
584-
}
585-
memcpy(s4, c1, strlen(c1));
586-
memset(s4 + strlen(c1), '\0', 1);
587-
strcat(s4, "__"); // add trailing '__' to newKey so we can search for pattern '__colName__' also at the end of the index.
588-
int newKeyLength = strlen(c1);
589-
for(int i = 0; i < xlength(assignedNames); i++){
590-
tc2 = CHAR(STRING_ELT(assignedNames, i));
591-
char *s5 = malloc(strlen(tc2) + 5); //4 * '_' + \0
592-
if (!s5) {
593-
free(s4); // # nocov
594-
internal_error(__func__, "Couldn't allocate memory for s5"); // # nocov
595-
}
596-
memset(s5, '_', 2);
597-
memset(s5 + 2, '\0', 1);
598-
strcat(s5, tc2);
599-
strcat(s5, "__");
600-
tc2 = strstr(s4, s5);
601-
if(tc2 == NULL){ // column is not part of key
602-
free(s5);
603-
continue;
604-
}
605-
if(tc2 - s4 < newKeyLength){ // new column match is before last match
606-
newKeyLength = tc2 - s4;
607-
}
608-
free(s5);
609-
}
610-
memset(s4 + newKeyLength, '\0', 1); // truncate the new key to the new length
611-
if(newKeyLength == 0){ // no valid key column remains. Drop the key
612-
setAttrib(index, a, R_NilValue);
613-
SET_STRING_ELT(indexNames, indexNo, NA_STRING);
614-
if (verbose) {
615-
Rprintf(_("Dropping index '%s' due to an update on a key column\n"), c1+2);
616-
}
617-
} else if(newKeyLength < strlen(c1)) {
618-
SEXP s4Str = PROTECT(mkString(s4));
619-
if(indexLength == 0 && // shortened index can be kept since it is just information on the order (see #2372)
620-
LOGICAL(chin(s4Str, indexNames))[0] == 0) {// index with shortened name not present yet
621-
SET_TAG(s, install(s4));
622-
SET_STRING_ELT(indexNames, indexNo, mkChar(s4));
623-
if (verbose)
624-
Rprintf(_("Shortening index '%s' to '%s' due to an update on a key column\n"), c1+2, s4 + 2);
625-
} else { // indexLength > 0 || shortened name present already
626-
// indexLength > 0 indicates reordering. Drop it to avoid spurious reordering in non-indexed columns (#2372)
627-
// shortened name already present indicates that index needs to be dropped to avoid duplicate indices.
628-
setAttrib(index, a, R_NilValue);
629-
SET_STRING_ELT(indexNames, indexNo, NA_STRING);
630-
if (verbose)
631-
Rprintf(_("Dropping index '%s' due to an update on a key column\n"), c1+2);
632-
}
633-
UNPROTECT(1); // s4Str
634-
} //else: index is not affected by assign: nothing to be done
635-
free(s4);
636-
indexNo ++;
637-
s = CDR(s);
638-
}
649+
struct attrib_name_ctx ctx = { 0, };
650+
R_mapAttrib(index, getOneAttribName, &ctx); // how many attributes?
651+
hashtab *h = hash_create(ctx.indexNamesLen);
652+
PROTECT(h->prot);
653+
ctx.indexNames = h;
654+
R_mapAttrib(index, getOneAttribName, &ctx); // now remember the names
655+
ctx.index = index;
656+
ctx.assignedNames = assignedNames;
657+
ctx.verbose = verbose;
658+
R_mapAttrib(index, fixIndexAttrib, &ctx); // adjust indices as needed
659+
UNPROTECT(1); // h
639660
}
640661
if (ndelete) {
641662
// delete any columns assigned NULL (there was a 'continue' earlier in loop above)

src/data.table.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
#endif
1616
#if R_VERSION < R_Version(4, 5, 0)
1717
# define isDataFrame(x) isFrame(x) // #6180
18+
# define CLEAR_ATTRIB(x) SET_ATTRIB(x, R_NilValue)
19+
# define ANY_ATTRIB(x) (!(isNull(ATTRIB(x))))
1820
#endif
1921
#include <Rinternals.h>
2022
#define SEXPPTR_RO(x) ((const SEXP *)DATAPTR_RO(x)) // to avoid overhead of looped STRING_ELT and VECTOR_ELT
@@ -103,6 +105,11 @@
103105
}
104106
# define R_resizeVector(x, newlen) R_resizeVector_(x, newlen)
105107
#endif
108+
// TODO(R>=4.6.0): remove the SVN revision check
109+
#if R_VERSION < R_Version(4, 6, 0) || R_SVN_REVISION < 89194
110+
# define BACKPORT_MAP_ATTRIB
111+
# define R_mapAttrib(x, fun, ctx) R_mapAttrib_(x, fun, ctx)
112+
#endif
106113

107114
// init.c
108115
extern SEXP char_integer64;
@@ -343,6 +350,9 @@ SEXP R_allocResizableVector_(SEXPTYPE type, R_xlen_t maxlen);
343350
SEXP R_duplicateAsResizable_(SEXP x);
344351
void R_resizeVector_(SEXP x, R_xlen_t newlen);
345352
#endif
353+
#ifdef BACKPORT_MAP_ATTRIB
354+
SEXP R_mapAttrib_(SEXP x, SEXP (*fun)(SEXP key, SEXP val, void *ctx), void *ctx);
355+
#endif
346356
SEXP is_direct_child(SEXP pids);
347357

348358
// types.c

src/dogroups.c

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
#include <fcntl.h>
44
#include <time.h>
55

6+
static SEXP anySpecialAttribute(SEXP key, SEXP val, void *ctx);
7+
68
static bool anySpecialStatic(SEXP x, hashtab * specials) {
79
// Special refers to special symbols .BY, .I, .N, and .GRP; see special-symbols.Rd
810
// Static because these are like C static arrays which are the same memory for each group; e.g., dogroups
@@ -39,7 +41,7 @@ static bool anySpecialStatic(SEXP x, hashtab * specials) {
3941
// with PR#4164 started to copy input list columns too much. Hence PR#4655 in v1.13.2 moved that copy here just where it is needed.
4042
// Currently the marker is negative truelength. These specials are protected by us here and before we release them
4143
// we restore the true truelength for when R starts to use vector truelength.
42-
SEXP attribs, list_el;
44+
SEXP list_el;
4345
const int n = length(x);
4446
// use length() not LENGTH() because isNewList() is true for NULL
4547
if (n==0)
@@ -53,20 +55,29 @@ static bool anySpecialStatic(SEXP x, hashtab * specials) {
5355
list_el = VECTOR_ELT(x,i);
5456
if (anySpecialStatic(list_el, specials))
5557
return true;
56-
for(attribs = ATTRIB(list_el); attribs != R_NilValue; attribs = CDR(attribs)) {
57-
if (anySpecialStatic(CAR(attribs), specials))
58-
return true; // #4936
59-
}
58+
if (R_mapAttrib(list_el, anySpecialAttribute, specials))
59+
return true; // #4936
6060
}
6161
}
6262
return false;
6363
}
6464

65+
static SEXP anySpecialAttribute(SEXP key, SEXP val, void *specials) {
66+
(void)key;
67+
return anySpecialStatic(val, specials) ? R_NilValue : NULL;
68+
}
69+
70+
static SEXP findRowNames(SEXP key, SEXP val, void *data) {
71+
(void)data;
72+
if (key == R_RowNamesSymbol) return val;
73+
return NULL;
74+
}
75+
6576
SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEXP xjiscols, SEXP grporder, SEXP order, SEXP starts, SEXP lens, SEXP jexp, SEXP env, SEXP lhs, SEXP newnames, SEXP on, SEXP verboseArg, SEXP showProgressArg)
6677
{
6778
R_len_t ngrp, nrowgroups, njval=0, ngrpcols, ansloc=0, maxn, estn=-1, thisansloc, grpn, thislen, igrp;
6879
int nprotect=0;
69-
SEXP ans=NULL, jval, thiscol, BY, N, I, GRP, iSD, xSD, rownames, s, RHS, target, source;
80+
SEXP ans=NULL, jval, thiscol, BY, N, I, GRP, iSD, xSD, s, RHS, target, source;
7081
Rboolean wasvector, firstalloc=FALSE, NullWarnDone=FALSE;
7182
const bool verbose = LOGICAL(verboseArg)[0]==1;
7283
double tstart=0, tblock[10]={0}; int nblock[10]={0}; // For verbose printing, tstart is updated each block
@@ -130,11 +141,11 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX
130141
R_LockBinding(install(".I"), env);
131142

132143
SEXP dtnames = PROTECT(getAttrib(dt, R_NamesSymbol)); nprotect++; // added here to fix #91 - `:=` did not issue recycling warning during "by"
133-
// fetch rownames of .SD. rownames[1] is set to -thislen for each group, in case .SD is passed to
144+
145+
// override rownames of .SD. rownames[1] is set to -thislen for each group, in case .SD is passed to
134146
// non data.table aware package that uses rownames
135-
for (s = ATTRIB(SD); s != R_NilValue && TAG(s)!=R_RowNamesSymbol; s = CDR(s)); // getAttrib0 basically but that's hidden in attrib.c; #loop_counter_not_local_scope_ok
136-
if (s==R_NilValue) error(_("row.names attribute of .SD not found"));
137-
rownames = CAR(s);
147+
SEXP rownames = PROTECT(R_mapAttrib(SD, findRowNames, NULL)); nprotect++;
148+
if (rownames == NULL) error(_("row.names attribute of .SD not found"));
138149
if (!isInteger(rownames) || LENGTH(rownames)!=2 || INTEGER(rownames)[0]!=NA_INTEGER) error(_("row.names of .SD isn't integer length 2 with NA as first item; i.e., .set_row_names(). [%s %d %d]"),type2char(TYPEOF(rownames)),LENGTH(rownames),INTEGER(rownames)[0]);
139150

140151
// fetch names of .SD and prepare symbols. In case they are copied-on-write by user assigning to those variables

src/mergelist.c

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,18 +17,21 @@ SEXP copyCols(SEXP x, SEXP cols) {
1717
return R_NilValue;
1818
}
1919

20+
static SEXP setDuplicateOneAttrib(SEXP key, SEXP val, void *x) {
21+
setAttrib(x, PROTECT(key), PROTECT(shallow_duplicate(val)));
22+
UNPROTECT(2);
23+
return NULL; // continue
24+
}
25+
2026
void mergeIndexAttrib(SEXP to, SEXP from) {
2127
if (!isInteger(to) || LENGTH(to)!=0)
2228
internal_error(__func__, "'to' must be integer() already"); // # nocov
2329
if (isNull(from))
2430
return;
25-
SEXP t = ATTRIB(to), f = ATTRIB(from);
26-
if (isNull(t)) // target has no attributes -> overwrite
27-
SET_ATTRIB(to, shallow_duplicate(f));
28-
else {
29-
for (t = ATTRIB(to); CDR(t) != R_NilValue; t = CDR(t)); // traverse to end of attributes list of to
30-
SETCDR(t, shallow_duplicate(f));
31-
}
31+
if (!ANY_ATTRIB(to)) // target has no attributes -> overwrite
32+
SHALLOW_DUPLICATE_ATTRIB(to, from);
33+
else
34+
R_mapAttrib(from, setDuplicateOneAttrib, to);
3235
}
3336

3437
SEXP cbindlist(SEXP x, SEXP copyArg) {
@@ -84,7 +87,7 @@ SEXP cbindlist(SEXP x, SEXP copyArg) {
8487
key = getAttrib(thisx, sym_sorted);
8588
UNPROTECT(protecti); // thisnames, thisxcol
8689
}
87-
if (isNull(ATTRIB(index)))
90+
if (!ANY_ATTRIB(index))
8891
setAttrib(ans, sym_index, R_NilValue);
8992
setAttrib(ans, R_NamesSymbol, names);
9093
setAttrib(ans, sym_sorted, key);

src/nafill.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -218,7 +218,7 @@ SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, S
218218

219219
if (!binplace) {
220220
for (R_len_t i=0; i<nx; i++) {
221-
if (!isNull(ATTRIB(VECTOR_ELT(x, i))))
221+
if (ANY_ATTRIB(VECTOR_ELT(x, i)))
222222
copyMostAttrib(VECTOR_ELT(x, i), VECTOR_ELT(ans, i));
223223
}
224224
SEXP obj_names = getAttrib(obj, R_NamesSymbol); // copy names

0 commit comments

Comments
 (0)