Skip to content

Commit b3a42a6

Browse files
authored
Merge branch 'master' into macroRemoval
2 parents 95cace5 + 131af20 commit b3a42a6

File tree

9 files changed

+96
-37
lines changed

9 files changed

+96
-37
lines changed

NEWS.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,10 @@
4444

4545
12. Internal functions used to signal errors are now marked as non-returning, silencing a compiler warning about potentially unchecked allocation failure. Thanks to Prof. Brian D. Ripley for the report and @aitap for the fix, [#7070](https://github.com/Rdatatable/data.table/pull/7070).
4646

47+
13. In rare cases, `data.table` failed to expand ALTREP columns when assigning a full column by reference. This could result in the target column getting modified unintentionally if the next call to the data.table was a modification by reference of the source column. E.g. in `DT[, b := as.character(a)]` the string conversion gets deferred and subsequent modification of column `a` would also modify column `b`, [#5400](https://github.com/Rdatatable/data.table/issues/5400). Thanks to @aquasync for the report and Václav Tlapák for the PR.
48+
49+
14. `data.table()` function is now more aligned with `data.frame()` with respect to the names of the output when one of its inputs is a single-column matrix object, [#4124](https://github.com/Rdatatable/data.table/issues/4124). Thanks @PavoDive for the report and @jangorecki for the PR.
50+
4751
### NOTES
4852

4953
1. Continued work to remove non-API C functions, [#6180](https://github.com/Rdatatable/data.table/issues/6180). Thanks Ivan Krylov for the PRs and for writing a clear and concise guide about the R API: https://aitap.codeberg.page/R-api/.

R/as.data.table.R

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,11 @@ as.data.table.list = function(x,
142142
xi = x[[i]] = as.POSIXct(xi)
143143
} else if (is.matrix(xi) || is.data.frame(xi)) {
144144
if (!is.data.table(xi)) {
145-
xi = x[[i]] = as.data.table(xi, keep.rownames=keep.rownames) # we will never allow a matrix to be a column; always unpack the columns
145+
if (is.matrix(xi) && NCOL(xi)<=1L && is.null(colnames(xi))) { # 1 column matrix naming #4124
146+
xi = x[[i]] = c(xi)
147+
} else {
148+
xi = x[[i]] = as.data.table(xi, keep.rownames=keep.rownames) # we will never allow a matrix to be a column; always unpack the columns
149+
}
146150
}
147151
# else avoid dispatching to as.data.table.data.table (which exists and copies)
148152
} else if (is.table(xi)) {

R/fread.R

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -70,9 +70,13 @@ yaml=FALSE, tmpdir=tempdir(), tz="UTC")
7070
}
7171
}
7272
if (!is.null(cmd)) {
73-
(if (.Platform$OS.type == "unix") system else shell)(paste0('(', cmd, ') > ', tmpFile<-tempfile(tmpdir=tmpdir)))
74-
file = tmpFile
73+
tmpFile = tempfile(tmpdir=tmpdir)
7574
on.exit(unlink(tmpFile), add=TRUE)
75+
status = (if (.Platform$OS.type == "unix") system else shell)(paste0('(', cmd, ') > ', tmpFile))
76+
if (status != 0) {
77+
stopf("External command failed with exit code %d. This can happen when the disk is full in the temporary directory ('%s'). See ?fread for the tmpdir argument.", status, tmpdir)
78+
}
79+
file = tmpFile
7680
}
7781
if (!is.null(file)) {
7882
if (!is.character(file) || length(file)!=1L)
@@ -116,9 +120,14 @@ yaml=FALSE, tmpdir=tempdir(), tz="UTC")
116120
if (!requireNamespace("R.utils", quietly = TRUE))
117121
stopf("To read %s files directly, fread() requires 'R.utils' package which cannot be found. Please install 'R.utils' using 'install.packages('R.utils')'.", if (w<=2L || gzsig) "gz" else "bz2") # nocov
118122
FUN = if (w<=2L || gzsig) gzfile else bzfile
119-
R.utils::decompressFile(file, decompFile<-tempfile(tmpdir=tmpdir), ext=NULL, FUN=FUN, remove=FALSE) # ext is not used by decompressFile when destname is supplied, but isn't optional
120-
file = decompFile # don't use 'tmpFile' symbol again, as tmpFile might be the http://domain.org/file.csv.gz download
123+
decompFile = tempfile(tmpdir=tmpdir)
121124
on.exit(unlink(decompFile), add=TRUE)
125+
tryCatch({
126+
R.utils::decompressFile(file, decompFile, ext=NULL, FUN=FUN, remove=FALSE) # ext is not used by decompressFile when destname is supplied, but isn't optional
127+
}, error = function(e) {
128+
stopf("R.utils::decompressFile failed to decompress file '%s':\n %s\n. This can happen when the disk is full in the temporary directory ('%s'). See ?fread for the tmpdir argument.", file, conditionMessage(e), tmpdir)
129+
})
130+
file = decompFile # don't use 'tmpFile' symbol again, as tmpFile might be the http://domain.org/file.csv.gz download
122131
}
123132
file = enc2native(file) # CfreadR cannot handle UTF-8 if that is not the native encoding, see #3078.
124133

R/like.R

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
# Don't use * or % like SQL's like. Uses regexpr syntax - more powerful.
33
# returns 'logical' so can be combined with other where clauses.
44
like = function(vector, pattern, ignore.case = FALSE, fixed = FALSE, perl = FALSE) {
5+
# TODO(R>=4.1.0): grepl itself has "smarter" logic for factor input.
56
if (is.factor(vector)) {
67
# indexing by factors is equivalent to indexing by the numeric codes, see ?`[` #4748
78
ret = grepl(pattern, levels(vector), ignore.case = ignore.case, fixed = fixed, perl = perl)[vector]

inst/tests/tests.Rraw

Lines changed: 58 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2281,15 +2281,38 @@ test(753.1, DT[,c("x1","x2"):=4:6, verbose = TRUE], data.table(a=letters[1:3],x=
22812281
output = "RHS for item 2 has been duplicated")
22822282
test(753.2, DT[2,x2:=7L], data.table(a=letters[1:3],x=3:1,x1=4:6,x2=c(4L,7L,6L),key="a"))
22832283
DT = data.table(a=letters[3:1],x=1:3,y=4:6)
2284-
test(754.1, DT[,c("x1","y1","x2"):=list(x,y)], error="Supplied 3 columns to be assigned 2 items. Please see NEWS for v1.12.2")
2285-
test(754.2, DT[,c("x1","y1","x2"):=list(x,y,x)], data.table(a=letters[3:1],x=1:3,y=4:6,x1=1:3,y1=4:6,x2=1:3))
2284+
test(754.01, DT[,c("x1","y1","x2"):=list(x,y)], error="Supplied 3 columns to be assigned 2 items. Please see NEWS for v1.12.2")
2285+
test(754.02, DT[,c("x1","y1","x2"):=list(x,y,x)], data.table(a=letters[3:1],x=1:3,y=4:6,x1=1:3,y1=4:6,x2=1:3))
22862286
# And non-recycling i.e. that a single column copy does copy the column
22872287
DT = data.table(a=1:3)
2288-
test(754.3, DT[,b:=a][1,a:=4L][2,b:=5L], data.table(a=INT(4,2,3),b=INT(1,5,3)))
2289-
test(754.4, DT[,b:=a][3,b:=6L], data.table(a=INT(4,2,3),b=INT(4,2,6)))
2290-
test(754.5, DT[,a:=as.character(a),verbose=TRUE], output="Direct plonk.*no copy")
2288+
test(754.03, DT[, b := a][1, a := 4L][2, b := 5L], data.table(a=INT(4,2,3),b=INT(1,5,3)))
2289+
test(754.04, DT[, b := a][3, b := 6L], data.table(a=INT(4,2,3),b=INT(4,2,6)))
2290+
test(754.05, DT[, a := as.numeric(a), verbose=TRUE], output="Direct plonk.*no copy")
22912291
RHS = as.integer(DT$a)
2292-
test(754.6, DT[,a:=RHS,verbose=TRUE], output="RHS for item 1 has been duplicated")
2292+
test(754.06, DT[, a:= RHS, verbose=TRUE], output="RHS for item 1 has been duplicated")
2293+
# Expand ALTREPS in assign.c, #5400
2294+
# String conversion gets deferred
2295+
## first, a regression test of R itself -- we want to make sure our own test continues to be useful & testing its intended purpose
2296+
test(754.07, {a = 1:10; .Internal(inspect(a)); b = as.character(a); .Internal(inspect(b))}, output = "\\bcompact\\b.*\\bdeferred string conversion\\b")
2297+
test(754.08, DT[, a := as.character(a), verbose=TRUE], output="RHS for item 1 has been duplicated")
2298+
# Executing the code inside of test expands the ALTREP so we repeat the code
2299+
# in order to check the result after a further assignment
2300+
DT = data.table(a=1:3)
2301+
DT[, b := as.character(a)]
2302+
DT[, a := 5L]
2303+
test(754.09, DT, data.table(a=5L, b=as.character(1:3)))
2304+
# This function returns an ALTREP wrapper if the input is at least length 64
2305+
testFun = function(x) {
2306+
x[FALSE] = 1
2307+
x
2308+
}
2309+
DT = data.table(id=1:64, col1=0, col2=0)
2310+
test(754.10, DT[, col1 := testFun(col2), verbose = TRUE], output="RHS for item 1 has been duplicated")
2311+
DT = data.table(id=1:64, col1=0, col2=0)
2312+
DT[, col1 := testFun(col2)]
2313+
DT[, col2 := 999]
2314+
test(754.11, DT, data.table(id=1:64, col1=0, col2=999))
2315+
rm(testFun)
22932316

22942317
# Used to test warning on redundant by (#2282) but by=.EACHI has now superseded
22952318
DT = data.table(a=letters[1:3],b=rep(c("d","e"),each=3),x=1:6,key=c('a', 'b'))
@@ -21107,16 +21130,16 @@ class(DF) = c("tbl_df", "tbl", "data.frame")
2110721130
test(2309.02, key(as.data.table(DF, key="t")), "t")
2110821131

2110921132
# data.table keyed with "b"
21110-
DT = data.table(a = 1:5, b = 1:5, x = 1:5, key = "b")
21111-
test(2309.03, key(as.data.table(DT, key="a")), "a")
21112-
test(2309.04, key(as.data.table(DT)), "b")
21113-
test(2309.05, key(as.data.table(DT, key=NULL)), NULL)
21133+
DT = data.table(a = 1:5, b = 1:5, x = 1:5, key = "b")
21134+
test(2309.03, key(as.data.table(DT, key="a")), "a")
21135+
test(2309.04, key(as.data.table(DT)), "b")
21136+
test(2309.05, key(as.data.table(DT, key=NULL)), NULL)
2111421137

2111521138
# non-keyed data.table
2111621139
DT = data.table(a = 1:5, b = 1:5, x = 1:5)
21117-
test(2309.06, key(as.data.table(DT, key="a")), "a")
21118-
test(2309.07, key(as.data.table(DT)), NULL)
21119-
test(2309.08, key(as.data.table(DT, key=NULL)), NULL)
21140+
test(2309.06, key(as.data.table(DT, key="a")), "a")
21141+
test(2309.07, key(as.data.table(DT)), NULL)
21142+
test(2309.08, key(as.data.table(DT, key=NULL)), NULL)
2112021143

2112121144
# as.data.table(x, keep.rownames=TRUE) keeps rownames for class(x)==c("*", "data.frame")
2112221145
df = structure(list(i = 1:2), class = c("tbl", "data.frame"), row.names = c("a","b"))
@@ -21192,3 +21215,25 @@ test(2319.1, !is.null(attr(dt_get, ".internal.selfref")))
2119221215
dt_get0 = data.frame(a = 1:3, b = letters[1:3])
2119321216
setDT(get0("dt_get0"))
2119421217
test(2319.2, !is.null(attr(dt_get0, ".internal.selfref")))
21218+
21219+
# Improved fread error handling for cmd exe and decompression #5415
21220+
test(2320.1, fread(cmd="false"), error="External command failed with exit code", warning = if (.Platform$OS.type=="windows") "execution failed")
21221+
21222+
if (test_R.utils) local({
21223+
tmp <- tempfile(fileext=".gz")
21224+
file.create(tmp); on.exit(unlink(tmp))
21225+
local({
21226+
conn <- file(tmp, 'wb'); on.exit(close(conn))
21227+
writeBin(as.raw(c(31L, 139L)), conn) # Gzip header magic numbers to trigger that read path
21228+
})
21229+
test(2320.2, fread(tmp), error="R.utils::decompressFile failed to decompress", warning="invalid")
21230+
})
21231+
21232+
# Create a data.table when one vector is transposed doesn't respect the name defined by user #4124
21233+
test(2321.1, DT <- data.table(a=1:2, b=matrix(1:2)), data.table(a=1:2, b=1:2))
21234+
test(2321.2, names(DT), names(data.frame(a=1:2, b=matrix(1:2))))
21235+
test(2321.3, DT <- data.table(a=integer(), b=matrix(1L, nrow=0L, ncol=1L)), data.table(a=integer(), b=integer()))
21236+
test(2321.4, names(DT), names(data.frame(a=integer(), b=matrix(1L, nrow=0L, ncol=1L))))
21237+
## but respect named column vectors
21238+
test(2321.5, DT <- data.table(a=1:2, cbind(b=3:4)), data.table(a=1:2, b=3:4))
21239+
test(2321.6, names(DT), names(data.frame(a=1:2, cbind(b=3:4))))

src/assign.c

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -550,11 +550,12 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values)
550550
if (length(rows)==0 && targetlen==vlen && (vlen>0 || nrow==0)) {
551551
if ( MAYBE_SHARED(thisvalue) || // set() protects the NAMED of atomic vectors from .Call setting arguments to 2 by wrapping with list
552552
(TYPEOF(values)==VECSXP && i>LENGTH(values)-1) || // recycled RHS would have columns pointing to others, #185.
553-
(TYPEOF(values)!=VECSXP && i>0) // assigning the same values to a second column. Have to ensure a copy #2540
553+
(TYPEOF(values)!=VECSXP && i>0) || // assigning the same values to a second column. Have to ensure a copy #2540
554+
ALTREP(thisvalue) // Some ALTREP wrappers have survived to here, e.g. #5400
554555
) {
555556
if (verbose) {
556-
Rprintf(_("RHS for item %d has been duplicated because MAYBE_REFERENCED==%d MAYBE_SHARED==%d, but then is being plonked. length(values)==%d; length(cols)==%d\n"),
557-
i+1, MAYBE_REFERENCED(thisvalue), MAYBE_SHARED(thisvalue), length(values), length(cols));
557+
Rprintf(_("RHS for item %d has been duplicated because MAYBE_REFERENCED==%d MAYBE_SHARED==%d ALTREP==%d, but then is being plonked. length(values)==%d; length(cols)==%d\n"),
558+
i+1, MAYBE_REFERENCED(thisvalue), MAYBE_SHARED(thisvalue), ALTREP(thisvalue), length(values), length(cols));
558559
}
559560
thisvalue = copyAsPlain(thisvalue); // PROTECT not needed as assigned as element to protected list below.
560561
} else {

src/fread.c

Lines changed: 9 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -211,19 +211,14 @@ static inline int64_t clamp_i64t(int64_t x, int64_t lower, int64_t upper) {
211211
* is constructed manually (using say snprintf) that warning(), stop()
212212
* and Rprintf() are all called as warning(_("%s"), msg) and not warning(msg).
213213
*/
214-
static const char* strlim(const char *ch, size_t limit) {
215-
static char buf[1002];
216-
static int flip = 0;
217-
char *ptr = buf + 501 * flip;
218-
flip = 1 - flip;
219-
char *ch2 = ptr;
220-
limit = imin(limit, 500);
214+
static const char* strlim(const char *ch, char buf[static 500], size_t limit) {
215+
char *ch2 = buf;
221216
size_t width = 0;
222217
while ((*ch > '\r' || (*ch != '\0' && *ch != '\r' && *ch != '\n')) && width++ < limit) {
223218
*ch2++ = *ch++;
224219
}
225220
*ch2 = '\0';
226-
return ptr;
221+
return buf;
227222
}
228223

229224
static const char *typeLetter = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
@@ -1639,7 +1634,7 @@ int freadMain(freadMainArgs _args) {
16391634
if (ch >= eof) STOP(_("Input is either empty, fully whitespace, or skip has been set after the last non-whitespace."));
16401635
if (verbose) {
16411636
if (lineStart > ch) DTPRINT(_(" Moved forward to first non-blank line (%d)\n"), row1line);
1642-
DTPRINT(_(" Positioned on line %d starting: <<%s>>\n"), row1line, strlim(lineStart, 30));
1637+
DTPRINT(_(" Positioned on line %d starting: <<%s>>\n"), row1line, strlim(lineStart, (char[500]) {}, 30));
16431638
}
16441639
ch = pos = lineStart;
16451640
}
@@ -1830,7 +1825,7 @@ int freadMain(freadMainArgs _args) {
18301825
if (!fill && tt != ncol) INTERNAL_STOP("first line has field count %d but expecting %d", tt, ncol); // # nocov
18311826
if (verbose) {
18321827
DTPRINT(_(" Detected %d columns on line %d. This line is either column names or first data row. Line starts as: <<%s>>\n"),
1833-
tt, row1line, strlim(pos, 30));
1828+
tt, row1line, strlim(pos, (char[500]) {}, 30));
18341829
DTPRINT(_(" Quote rule picked = %d\n"), quoteRule);
18351830
DTPRINT(_(" fill=%s and the most number of columns found is %d\n"), fill ? "true" : "false", ncol);
18361831
}
@@ -2761,23 +2756,23 @@ int freadMain(freadMainArgs _args) {
27612756
while (ch < eof && *ch != '\n' && *ch != '\r') ch++;
27622757
while (ch < eof && isspace(*ch)) ch++;
27632758
if (ch == eof) {
2764-
DTWARN(_("Discarded single-line footer: <<%s>>"), strlim(skippedFooter, 500));
2759+
DTWARN(_("Discarded single-line footer: <<%s>>"), strlim(skippedFooter, (char[500]) {}, 500));
27652760
}
27662761
else {
27672762
ch = headPos;
27682763
int tt = countfields(&ch);
27692764
if (fill > 0) {
27702765
DTWARN(_("Stopped early on line %"PRId64". Expected %d fields but found %d. Consider fill=%d or even more based on your knowledge of the input file. Use fill=Inf for reading the whole file for detecting the number of fields. First discarded non-empty line: <<%s>>"),
2771-
DTi+row1line, ncol, tt, tt, strlim(skippedFooter, 500));
2766+
DTi+row1line, ncol, tt, tt, strlim(skippedFooter, (char[500]) {}, 500));
27722767
} else {
27732768
DTWARN(_("Stopped early on line %"PRId64". Expected %d fields but found %d. Consider fill=TRUE. First discarded non-empty line: <<%s>>"),
2774-
DTi+row1line, ncol, tt, strlim(skippedFooter, 500));
2769+
DTi+row1line, ncol, tt, strlim(skippedFooter, (char[500]) {}, 500));
27752770
}
27762771
}
27772772
}
27782773
}
27792774
if (quoteRuleBumpedCh != NULL && quoteRuleBumpedCh<headPos) {
2780-
DTWARN(_("Found and resolved improper quoting out-of-sample. First healed line %"PRId64": <<%s>>. If the fields are not quoted (e.g. field separator does not appear within any field), try quote=\"\" to avoid this warning."), quoteRuleBumpedLine, strlim(quoteRuleBumpedCh, 500));
2775+
DTWARN(_("Found and resolved improper quoting out-of-sample. First healed line %"PRId64": <<%s>>. If the fields are not quoted (e.g. field separator does not appear within any field), try quote=\"\" to avoid this warning."), quoteRuleBumpedLine, strlim(quoteRuleBumpedCh, (char[500]) {}, 500));
27812776
}
27822777

27832778
if (verbose) {

tests/autoprint.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ local({
5858

5959
# child class of data.table doesn't induce unintended print, #3029
6060
dt = data.table(x = 1)
61-
class(dt) = c("foo", "data.table", "data.frame")
61+
setattr(dt, "class", c("foo", "data.table", "data.frame"))
6262
print.foo = function(x, ...) {
6363
NextMethod("print")
6464
}

tests/autoprint.Rout.save

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ NULL
172172
>
173173
> # child class of data.table doesn't induce unintended print, #3029
174174
> dt = data.table(x = 1)
175-
> class(dt) = c("foo", "data.table", "data.frame")
175+
> setattr(dt, "class", c("foo", "data.table", "data.frame"))
176176
> print.foo = function(x, ...) {
177177
+ NextMethod("print")
178178
+ }

0 commit comments

Comments
 (0)