Skip to content

Commit 4e9c989

Browse files
aitapjangoreckiben-schwen
authored
Migrate most uses of SETLENGTH to the resizable API (#7451)
* Use the experimental resizable vectors API Thanks to Luke Tierney for introducing the API and helping with the migration. * Backport the resizable API Make sure to set the GROWABLE_BIT on the resizable vectors to avoid problems when they are duplicated or garbage-collected. * test 2291.1: misleading TRUELENGTH now impossible Now that data.table objects have the GROWABLE_BIT set, R will reset TRUELENGTH when duplicating them, causing our code to take a different branch. * Drop the finalizer Now that (1) we depend on R >= 3.4 and (2) data.table objects have the GROWABLE_BIT set, there is no need to adjust allocated memory counts by hand. * frollapply(adaptive=TRUE): resizable temporaries Since adaptive application of rolling functions requires us to resize the argument to match the window size, make sure to allocate it as such. * Drop remaining uses of TRUELENGTH from assign.c - Don't SET_TRUELENGTH by hand. All of our resizable vectors now have the GROWABLE_BIT set, so when they are duplicated, TRUELENGTH is reset to 0. - Use a combination of R_isResizable and R_maxLength to replace other uses of TRUELENGTH. * Drop test for TRUELENGTH from init.c * Better backport of R_isResizable() * Placate rchk * copyAsGrowable: don't crash on 0-len argument * Move the resizable allocation functions to utils.c * duplicateAsResizable: refuse ALTREP objects * maxLength: return xlength if non-resizable That's what the function does in R-devel. * adjust to previous code * Safety checks in R_resizeVector() backport * fix comment * Mark internal errors as # nocov --------- Co-authored-by: Jan Gorecki <[email protected]> Co-authored-by: Benjamin Schwendinger <[email protected]>
1 parent 41e1123 commit 4e9c989

File tree

10 files changed

+108
-147
lines changed

10 files changed

+108
-147
lines changed

R/frollapply.R

Lines changed: 6 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -297,38 +297,22 @@ frollapply = function(X, N, FUN, ..., by.column=TRUE, fill=NA, align=c("right","
297297
tight = function(i, dest, src, n) FUN(.Call(CmemcpyDT, dest, src, i, n), ...)
298298
}
299299
} else {
300-
#has.growable = base::getRversion() >= "3.4.0"
301-
## this is now always TRUE
302-
## we keep this branch, it may be useful when getting rid of SET_GROWABLE_BIT and SETLENGTH #6180
303-
has.growable = TRUE
304-
cpy = if (has.growable) function(x) .Call(Csetgrowable, copy(x)) else copy
300+
cpy = function(x) .Call(CcopyAsGrowable, x)
305301
ansMask = function(len, n) {
306302
mask = seq_len(len) >= n
307303
mask[is.na(mask)] = FALSE ## test 6010.206
308304
mask
309305
}
310306
if (by.column) {
311-
allocWindow = function(x, n) x[seq_len(max(n, na.rm=TRUE))]
312-
if (has.growable) {
313-
tight = function(i, dest, src, n) FUN(.Call(CmemcpyVectoradaptive, dest, src, i, n), ...) # CmemcpyVectoradaptive handles k[i]==0
314-
} else {
315-
tight = function(i, dest, src, n) {stopf("internal error: has.growable should be TRUE, implement support for n==0"); FUN(src[(i-n[i]+1L):i], ...)} # nocov
316-
}
307+
allocWindow = function(x, n) cpy(x[seq_len(max(n, na.rm=TRUE))])
308+
tight = function(i, dest, src, n) FUN(.Call(CmemcpyVectoradaptive, dest, src, i, n), ...) # CmemcpyVectoradaptive handles k[i]==0
317309
} else {
318310
if (!list.df) {
319-
allocWindow = function(x, n) x[seq_len(max(n, na.rm=TRUE)), , drop=FALSE]
320-
} else {
321-
allocWindow = function(x, n) lapply(x, `[`, seq_len(max(n)))
322-
}
323-
if (has.growable) {
324-
tight = function(i, dest, src, n) FUN(.Call(CmemcpyDTadaptive, dest, src, i, n), ...) # CmemcpyDTadaptive handles k[i]==0
311+
allocWindow = function(x, n) cpy(x[seq_len(max(n, na.rm=TRUE)), , drop=FALSE])
325312
} else {
326-
if (!list.df) { # nocov
327-
tight = function(i, dest, src, n) {stopf("internal error: has.growable should be TRUE, implement support for n==0"); FUN(src[(i-n[i]+1L):i, , drop=FALSE], ...)} # nocov
328-
} else {
329-
tight = function(i, dest, src, n) {stopf("internal error: has.growable should be TRUE, implement support for n==0"); FUN(lapply(src, `[`, (i-n[i]+1L):i), ...)} # nocov
330-
}
313+
allocWindow = function(x, n) cpy(lapply(x, `[`, seq_len(max(n))))
331314
}
315+
tight = function(i, dest, src, n) FUN(.Call(CmemcpyDTadaptive, dest, src, i, n), ...) # CmemcpyDTadaptive handles k[i]==0
332316
}
333317
}
334318
## prepare templates for errors and warnings

inst/tests/tests.Rraw

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19453,7 +19453,7 @@ test(2290.4, DT[, `:=`(a = 2, c := 3)], error="It looks like you re-used `:=` in
1945319453
df = data.frame(a=1:3)
1945419454
setDT(df)
1945519455
attr(df, "att") = 1
19456-
test(2291.1, set(df, NULL, "new", "new"), error="attributes .* have been reassigned")
19456+
test(2291.1, set(df, NULL, "new", "new"), error="either been loaded from disk.*or constructed manually.*Please run setDT.*setalloccol.*on it first")
1945719457

1945819458
# ns-qualified bysub error, #6493
1945919459
DT = data.table(a = 1)

src/assign.c

Lines changed: 19 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,5 @@
11
#include "data.table.h"
22

3-
static void finalizer(SEXP p)
4-
{
5-
SEXP x;
6-
R_len_t n, l, tl;
7-
if(!R_ExternalPtrAddr(p)) internal_error(__func__, "didn't receive an ExternalPtr"); // # nocov
8-
p = R_ExternalPtrTag(p);
9-
if (!isString(p)) internal_error(__func__, "ExternalPtr doesn't see names in tag"); // # nocov
10-
l = LENGTH(p);
11-
tl = TRUELENGTH(p);
12-
if (l<0 || tl<l) internal_error(__func__, "l=%d, tl=%d", l, tl); // # nocov
13-
n = tl-l;
14-
if (n==0) {
15-
// gc's ReleaseLargeFreeVectors() will have reduced R_LargeVallocSize by the correct amount
16-
// already, so nothing to do (but almost never the case).
17-
return;
18-
}
19-
x = PROTECT(allocVector(INTSXP, 50)); // 50 so it's big enough to be on LargeVector heap. See NodeClassSize in memory.c:allocVector
20-
// INTSXP rather than VECSXP so that GC doesn't inspect contents after LENGTH (thanks to Karl Miller, Jul 2015)
21-
SETLENGTH(x,50+n*2*sizeof(void *)/4); // 1*n for the names, 1*n for the VECSXP itself (both are over allocated).
22-
UNPROTECT(1);
23-
return;
24-
}
25-
263
void setselfref(SEXP x) {
274
if(!INHERITS(x, char_datatable)) return; // #5286
285
SEXP p;
@@ -38,7 +15,6 @@ void setselfref(SEXP x) {
3815
R_NilValue
3916
))
4017
));
41-
R_RegisterCFinalizerEx(p, finalizer, FALSE);
4218
UNPROTECT(2);
4319

4420
/*
@@ -64,39 +40,8 @@ void setselfref(SEXP x) {
6440
*/
6541
}
6642

67-
/* There are two reasons the finalizer doesn't restore the LENGTH to TRUELENGTH. i) The finalizer
68-
happens too late after GC has already released the memory, and ii) copies by base R (e.g.
69-
[<- in write.table just before test 894) allocate at length LENGTH but copy the TRUELENGTH over.
70-
If the finalizer sets LENGTH to TRUELENGTH, that's a fail as it wasn't really allocated at
71-
TRUELENGTH when R did the copy.
72-
Karl Miller suggested an ENVSXP so that restoring LENGTH in finalizer should work. This is the
73-
closest I got to getting it to pass all tests :
74-
75-
SEXP env = PROTECT(allocSExp(ENVSXP));
76-
defineVar(SelfRefSymbol, x, env);
77-
defineVar(R_NamesSymbol, getAttrib(x, R_NamesSymbol), env);
78-
setAttrib(x, SelfRefSymbol, p = R_MakeExternalPtr(
79-
R_NilValue, // for identical() to return TRUE. identical() doesn't look at tag and prot
80-
R_NilValue, //getAttrib(x, R_NamesSymbol), // to detect if names has been replaced and its tl lost, e.g. setattr(DT,"names",...)
81-
PROTECT( // needed when --enable-strict-barrier it seems, iiuc. TO DO: test under that flag and remove if not needed.
82-
env // wrap x in env to avoid an infinite loop in object.size() if prot=x were here
83-
)
84-
));
85-
R_RegisterCFinalizerEx(p, finalizer, FALSE);
86-
UNPROTECT(2);
87-
88-
Then in finalizer:
89-
SETLENGTH(names, tl)
90-
SETLENGTH(dt, tl)
91-
92-
and that finalizer indeed now happens before the GC releases memory (thanks to the env wrapper).
93-
94-
However, we still have problem (ii) above and it didn't pass tests involving base R copies.
95-
96-
We really need R itself to start setting TRUELENGTH to be the allocated length and then
97-
for GC to release TRUELENGTH not LENGTH. Would really tidy this up.
98-
99-
Moved out of ?setkey Details section in 1.12.2 (Mar 2019). Revisit this w.r.t. to recent versions of R.
43+
/*
44+
Moved out of ?setkey Details section in 1.12.2 (Mar 2019). Revisit this w.r.t. to recent versions of R.
10045
The problem (for \code{data.table}) with the copy by \code{key<-} (other than
10146
being slower) is that \R doesn't maintain the over-allocated truelength, but it
10247
looks as though it has. Adding a column by reference using \code{:=} after a
@@ -126,15 +71,9 @@ static int _selfrefok(SEXP x, Rboolean checkNames, Rboolean verbose) {
12671
tag = R_ExternalPtrTag(v);
12772
if (!(isNull(tag) || isString(tag))) internal_error(__func__, ".internal.selfref tag is neither NULL nor a character vector"); // # nocov
12873
names = getAttrib(x, R_NamesSymbol);
129-
if (names!=tag && isString(names) && !ALTREP(names)) // !ALTREP for #4734
130-
SET_TRUELENGTH(names, LENGTH(names));
131-
// R copied this vector not data.table; it's not actually over-allocated. It looks over-allocated
132-
// because R copies the original vector's tl over despite allocating length.
13374
prot = R_ExternalPtrProtected(v);
13475
if (TYPEOF(prot) != EXTPTRSXP) // Very rare. Was error(_(".internal.selfref prot is not itself an extptr")).
13576
return 0; // # nocov ; see http://stackoverflow.com/questions/15342227/getting-a-random-internal-selfref-error-in-data-table-for-r
136-
if (x!=R_ExternalPtrAddr(prot) && !ALTREP(x))
137-
SET_TRUELENGTH(x, LENGTH(x)); // R copied this vector not data.table, it's not actually over-allocated
13877
return checkNames ? names==tag : x==R_ExternalPtrAddr(prot);
13978
}
14079

@@ -151,7 +90,7 @@ static SEXP shallow(SEXP dt, SEXP cols, R_len_t n)
15190
// called from alloccol where n is checked carefully, or from shallow() at R level
15291
// where n is set to truelength (i.e. a shallow copy only with no size change)
15392
int protecti=0;
154-
SEXP newdt = PROTECT(allocVector(VECSXP, n)); protecti++; // to do, use growVector here?
93+
SEXP newdt = PROTECT(R_allocResizableVector(VECSXP, n)); protecti++; // to do, use growVector here?
15594
SHALLOW_DUPLICATE_ATTRIB(newdt, dt);
15695

15796
// TO DO: keepattr() would be faster, but can't because shallow isn't merely a shallow copy. It
@@ -169,7 +108,7 @@ static SEXP shallow(SEXP dt, SEXP cols, R_len_t n)
169108
setAttrib(newdt, sym_sorted, duplicate(sorted));
170109

171110
SEXP names = PROTECT(getAttrib(dt, R_NamesSymbol)); protecti++;
172-
SEXP newnames = PROTECT(allocVector(STRSXP, n)); protecti++;
111+
SEXP newnames = PROTECT(R_allocResizableVector(STRSXP, n)); protecti++;
173112
const int l = isNull(cols) ? LENGTH(dt) : length(cols);
174113
if (isNull(cols)) {
175114
for (int i=0; i<l; ++i) SET_VECTOR_ELT(newdt, i, VECTOR_ELT(dt,i));
@@ -186,13 +125,9 @@ static SEXP shallow(SEXP dt, SEXP cols, R_len_t n)
186125
for (int i=0; i<l; ++i) SET_STRING_ELT( newnames, i, STRING_ELT(names,INTEGER(cols)[i]-1) );
187126
}
188127
}
128+
R_resizeVector(newdt,l);
129+
R_resizeVector(newnames,l);
189130
setAttrib(newdt, R_NamesSymbol, newnames);
190-
// setAttrib appears to change length and truelength, so need to do that first _then_ SET next,
191-
// otherwise (if the SET were were first) the 100 tl is assigned to length.
192-
SETLENGTH(newnames,l);
193-
SET_TRUELENGTH(newnames,n);
194-
SETLENGTH(newdt,l);
195-
SET_TRUELENGTH(newdt,n);
196131
setselfref(newdt);
197132
UNPROTECT(protecti);
198133
return(newdt);
@@ -260,7 +195,7 @@ SEXP alloccol(SEXP dt, R_len_t n, Rboolean verbose)
260195
// if (TRUELENGTH(getAttrib(dt,R_NamesSymbol))!=tl)
261196
// internal_error(__func__, "tl of dt passes checks, but tl of names (%d) != tl of dt (%d)", tl, TRUELENGTH(getAttrib(dt,R_NamesSymbol))); // # nocov
262197

263-
tl = TRUELENGTH(dt);
198+
tl = R_maxLength(dt);
264199
// R <= 2.13.2 and we didn't catch uninitialized tl somehow
265200
if (tl<0) internal_error(__func__, "tl of class is marked but tl<0"); // # nocov
266201
if (tl>0 && tl<l) internal_error(__func__, "tl (%d) < l (%d) but tl of class is marked", tl, l); // # nocov
@@ -310,11 +245,11 @@ SEXP shallowwrapper(SEXP dt, SEXP cols) {
310245
if (!selfrefok(dt, FALSE)) {
311246
int n = isNull(cols) ? length(dt) : length(cols);
312247
return(shallow(dt, cols, n));
313-
} else return(shallow(dt, cols, TRUELENGTH(dt)));
248+
} else return(shallow(dt, cols, R_maxLength(dt)));
314249
}
315250

316251
SEXP truelength(SEXP x) {
317-
return ScalarInteger(isNull(x) ? 0 : TRUELENGTH(x));
252+
return ScalarInteger(isNull(x) || !R_isResizable(x) ? 0 : R_maxLength(x));
318253
}
319254

320255
SEXP selfrefokwrapper(SEXP x, SEXP verbose) {
@@ -509,10 +444,10 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values)
509444
// modify DT by reference. Other than if new columns are being added and the allocVec() fails with
510445
// out-of-memory. In that case the user will receive hard halt and know to rerun.
511446
if (length(newcolnames)) {
512-
oldtncol = TRUELENGTH(dt); // TO DO: oldtncol can be just called tl now, as we won't realloc here any more.
447+
if (!R_isResizable(dt)) error(_("This data.table has either been loaded from disk (e.g. using readRDS()/load()) or constructed manually (e.g. using structure()). Please run setDT() or setalloccol() on it first (to pre-allocate space for new columns) before assigning by reference to it.")); // #2996
448+
oldtncol = R_maxLength(dt); // TO DO: oldtncol can be just called tl now, as we won't realloc here any more.
513449

514450
if (oldtncol<oldncol) {
515-
if (oldtncol==0) error(_("This data.table has either been loaded from disk (e.g. using readRDS()/load()) or constructed manually (e.g. using structure()). Please run setDT() or setalloccol() on it first (to pre-allocate space for new columns) before assigning by reference to it.")); // #2996
516451
internal_error(__func__, "oldtncol(%d) < oldncol(%d)", oldtncol, oldncol); // # nocov
517452
}
518453
if (oldtncol>oldncol+10000L) warning(_("truelength (%d) is greater than 10,000 items over-allocated (length = %d). See ?truelength. If you didn't set the datatable.alloccol option very large, please report to data.table issue tracker including the result of sessionInfo()."),oldtncol, oldncol);
@@ -522,13 +457,14 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values)
522457
error(_("It appears that at some earlier point, names of this data.table have been reassigned. Please ensure to use setnames() rather than names<- or colnames<-. Otherwise, please report to data.table issue tracker.")); // # nocov
523458
// Can growVector at this point easily enough, but it shouldn't happen in first place so leave it as
524459
// strong error message for now.
525-
else if (TRUELENGTH(names) != oldtncol)
460+
else if (R_maxLength(names) != oldtncol)
526461
// Use (long long) to cast R_xlen_t to a fixed type to robustly avoid -Wformat compiler warnings, see #5768, PRId64 didn't work
527-
internal_error(__func__, "selfrefnames is ok but tl names [%lld] != tl [%d]", (long long)TRUELENGTH(names), oldtncol); // # nocov
462+
internal_error(__func__, "selfrefnames is ok but maxLength(names) [%lld] != maxLength(dt) [%d]", (long long)R_maxLength(names), oldtncol); // # nocov
528463
if (!selfrefok(dt, verbose)) // #6410 setDT(dt) and subsequent attr<- can lead to invalid selfref
529464
error(_("It appears that at some earlier point, attributes of this data.table have been reassigned. Please use setattr(DT, name, value) rather than attr(DT, name) <- value. If that doesn't apply to you, please report your case to the data.table issue tracker."));
530-
SETLENGTH(dt, oldncol+LENGTH(newcolnames));
531-
SETLENGTH(names, oldncol+LENGTH(newcolnames));
465+
R_resizeVector(dt, oldncol+LENGTH(newcolnames));
466+
R_resizeVector(names, oldncol+LENGTH(newcolnames));
467+
setAttrib(dt, R_NamesSymbol, names);
532468
for (int i=0; i<LENGTH(newcolnames); ++i)
533469
SET_STRING_ELT(names,oldncol+i,STRING_ELT(newcolnames,i));
534470
// truelengths of both already set by alloccol
@@ -726,8 +662,9 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values)
726662
SET_VECTOR_ELT(dt, i, R_NilValue);
727663
SET_STRING_ELT(names, i, NA_STRING); // release reference to the CHARSXP
728664
}
729-
SETLENGTH(dt, ndt-ndelete);
730-
SETLENGTH(names, ndt-ndelete);
665+
R_resizeVector(dt, ndt-ndelete);
666+
R_resizeVector(names, ndt-ndelete);
667+
setAttrib(dt, R_NamesSymbol, names);
731668
if (LENGTH(names)==0) {
732669
// That was last column deleted, leaving NULL data.table, so we need to reset .row_names, so that it really is the NULL data.table.
733670
PROTECT(nullint=allocVector(INTSXP, 0)); protecti++;

src/data.table.h

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,23 @@
8787
# endif
8888
#endif
8989

90+
// TODO(R>=4.6.0): remove the SVN revision check
91+
#if R_VERSION < R_Version(4, 6, 0) || R_SVN_REVISION < 89077
92+
# define BACKPORT_RESIZABLE_API
93+
# define R_allocResizableVector(type, maxlen) R_allocResizableVector_(type, maxlen)
94+
# define R_duplicateAsResizable(x) R_duplicateAsResizable_(x)
95+
# define R_maxLength(x) R_maxLength_(x)
96+
static inline R_xlen_t R_maxLength_(SEXP x) {
97+
return IS_GROWABLE(x) ? TRUELENGTH(x) : XLENGTH(x);
98+
}
99+
# define R_isResizable(x) R_isResizable_(x)
100+
static inline bool R_isResizable_(SEXP x) {
101+
// IS_GROWABLE checks for XLENGTH < TRUELENGTH instead
102+
return (LEVELS(x) & 0x20) && XLENGTH(x) <= TRUELENGTH(x);
103+
}
104+
# define R_resizeVector(x, newlen) R_resizeVector_(x, newlen)
105+
#endif
106+
90107
// init.c
91108
extern SEXP char_integer64;
92109
extern SEXP char_ITime;
@@ -282,7 +299,7 @@ SEXP memcpyVector(SEXP dest, SEXP src, SEXP offset, SEXP size);
282299
SEXP memcpyDT(SEXP dest, SEXP src, SEXP offset, SEXP size);
283300
SEXP memcpyVectoradaptive(SEXP dest, SEXP src, SEXP offset, SEXP size);
284301
SEXP memcpyDTadaptive(SEXP dest, SEXP src, SEXP offset, SEXP size);
285-
SEXP setgrowable(SEXP x);
302+
SEXP copyAsGrowable(SEXP x);
286303

287304
// nafill.c
288305
void nafillDouble(double *x, uint_fast64_t nx, unsigned int type, double fill, bool nan_is_na, ans_t *ans, bool verbose);
@@ -322,6 +339,11 @@ bool perhapsDataTable(SEXP x);
322339
SEXP perhapsDataTableR(SEXP x);
323340
SEXP frev(SEXP x, SEXP copyArg);
324341
NORET void internal_error(const char *call_name, const char *format, ...);
342+
#ifdef BACKPORT_RESIZABLE_API
343+
SEXP R_allocResizableVector_(SEXPTYPE type, R_xlen_t maxlen);
344+
SEXP R_duplicateAsResizable_(SEXP x);
345+
void R_resizeVector_(SEXP x, R_xlen_t newlen);
346+
#endif
325347

326348
// types.c
327349
char *end(char *start);

src/dogroups.c

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -322,8 +322,9 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX
322322
// Even if we could know reliably to switch from allocNAVectorLike to allocVector for slight speedup, user code could still
323323
// contain a switched halt, and in that case we'd want the groups not yet done to have NA rather than 0 or uninitialized.
324324
// Increment length only if the allocation passes, #1676. But before SET_VECTOR_ELT otherwise attempt-to-set-index-n/n R error
325-
SETLENGTH(dtnames, LENGTH(dtnames)+1);
326-
SETLENGTH(dt, LENGTH(dt)+1);
325+
R_resizeVector(dtnames, LENGTH(dtnames)+1);
326+
R_resizeVector(dt, LENGTH(dt)+1);
327+
setAttrib(dt, R_NamesSymbol, dtnames);
327328
SET_VECTOR_ELT(dt, colj, target);
328329
UNPROTECT(1);
329330
SET_STRING_ELT(dtnames, colj, STRING_ELT(newnames, colj-origncol));
@@ -519,7 +520,7 @@ SEXP growVector(SEXP x, const R_len_t newlen)
519520
SEXP newx;
520521
R_len_t len = length(x);
521522
if (isNull(x)) error(_("growVector passed NULL"));
522-
PROTECT(newx = allocVector(TYPEOF(x), newlen)); // TO DO: R_realloc(?) here?
523+
PROTECT(newx = R_allocResizableVector(TYPEOF(x), newlen)); // TO DO: R_realloc(?) here?
523524
if (newlen < len) len=newlen; // i.e. shrink
524525
if (!len) { // cannot memcpy invalid pointer, #6819
525526
SHALLOW_DUPLICATE_ATTRIB(newx, x);

0 commit comments

Comments
 (0)