Skip to content

Commit 71219c6

Browse files
Merge branch 'master' into cbindlist
2 parents 633206f + 96c3e6a commit 71219c6

File tree

9 files changed

+91
-24
lines changed

9 files changed

+91
-24
lines changed

NEWS.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@
4040
# 2: 2 6 4 5
4141
```
4242

43+
8. `groupingsets()` gets a new argument `enclos` for use together with the `jj` argument in functions wrapping `groupingsets()`, including the existing wrappers `rollup()` and `cube()`. When forwarding a `j`-expression as `groupingsets(jj = substitute(j))`, make sure to pass `enclos = parent.frame()` as well, so that the `j`-expression will be evaluated in the right context. This makes it possible for `j` to refer to variables outside the `data.table`.
44+
4345
### BUG FIXES
4446

4547
1. Custom binary operators from the `lubridate` package now work with objects of class `IDate` as with a `Date` subclass, [#6839](https://github.com/Rdatatable/data.table/issues/6839). Thanks @emallickhossain for the report and @aitap for the fix.
@@ -72,6 +74,10 @@
7274
7375
15. Including an `ITime` object as a named input to `data.frame()` respects the provided name, i.e. `data.frame(a = as.ITime(...))` will have column `a`, [#4673](https://github.com/Rdatatable/data.table/issues/4673). Thanks @shrektan for the report and @MichaelChirico for the fix.
7476
77+
16. `fread()` now handles the `na.strings` argument for quoted text columns, making it possible to specify `na.strings = '""'` and read empty quoted strings as `NA`s, [#6974](https://github.com/Rdatatable/data.table/issues/6974). Thanks to @AngelFelizR for the report and @aitap for the PR.
78+
79+
17. A data.table with a column of class `vctrs_list_of` (from package {vctrs}) prints as expected, [#5948](https://github.com/Rdatatable/data.table/issues/5948). Before, they could be printed messily, e.g. printing every entry in a nested data.frame. Thanks @jesse-smith for the report, @DavisVaughan and @r2evans for contributing, and @MichaelChirico for the PR.
80+
7581
### NOTES
7682
7783
1. Continued work to remove non-API C functions, [#6180](https://github.com/Rdatatable/data.table/issues/6180). Thanks Ivan Krylov for the PRs and for writing a clear and concise guide about the R API: https://aitap.codeberg.page/R-api/.

R/groupingsets.R

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ rollup.data.table = function(x, j, by, .SDcols, id = FALSE, label = NULL, ...) {
1313
sets = lapply(length(by):0L, function(i) by[0L:i])
1414
# redirect to workhorse function
1515
jj = substitute(j)
16-
groupingsets.data.table(x, by=by, sets=sets, .SDcols=.SDcols, id=id, jj=jj, label=label)
16+
groupingsets.data.table(x, by=by, sets=sets, .SDcols=.SDcols, id=id, jj=jj, label=label, enclos = parent.frame())
1717
}
1818

1919
cube = function(x, ...) {
@@ -35,13 +35,13 @@ cube.data.table = function(x, j, by, .SDcols, id = FALSE, label = NULL, ...) {
3535
sets = lapply((2L^n):1L, function(jj) by[keepBool[jj, ]])
3636
# redirect to workhorse function
3737
jj = substitute(j)
38-
groupingsets.data.table(x, by=by, sets=sets, .SDcols=.SDcols, id=id, jj=jj, label=label)
38+
groupingsets.data.table(x, by=by, sets=sets, .SDcols=.SDcols, id=id, jj=jj, label=label, enclos = parent.frame())
3939
}
4040

4141
groupingsets = function(x, ...) {
4242
UseMethod("groupingsets")
4343
}
44-
groupingsets.data.table = function(x, j, by, sets, .SDcols, id = FALSE, jj, label = NULL, ...) {
44+
groupingsets.data.table = function(x, j, by, sets, .SDcols, id = FALSE, jj, label = NULL, enclos = parent.frame(), ...) {
4545
# input data type basic validation
4646
if (!is.data.table(x))
4747
stopf("Argument 'x' must be a data.table object")
@@ -112,7 +112,10 @@ groupingsets.data.table = function(x, j, by, sets, .SDcols, id = FALSE, jj, labe
112112
.SDcols = if (".SD" %chin% av) setdiff(names(x), by) else NULL
113113
if (length(names(by))) by = unname(by)
114114
# 0 rows template data.table to keep colorder and type
115-
empty = if (length(.SDcols)) x[0L, eval(jj), by, .SDcols=.SDcols] else x[0L, eval(jj), by]
115+
# inline all arguments that might clash with enclosing environment
116+
pcall = substitute(x[0L, jj, by], list(x = x, jj = jj, by = by))
117+
if (length(.SDcols)) pcall$.SDcols = .SDcols
118+
empty = eval(pcall, list(.datatable.aware = TRUE), enclos)
116119
if (id && "grouping" %chin% names(empty)) # `j` could have been evaluated to `grouping` field
117120
stopf("When using `id=TRUE` the 'j' expression must not evaluate to a column named 'grouping'.")
118121
if (anyDuplicated(names(empty)) > 0L)
@@ -150,8 +153,12 @@ groupingsets.data.table = function(x, j, by, sets, .SDcols, id = FALSE, jj, labe
150153
stopf("Using integer64 class columns require to have 'bit64' package installed.") # nocov
151154
int64.by.cols = intersect(int64.cols, by)
152155
# aggregate function called for each grouping set
156+
# inline all arguments that might clash with enclosing environment
157+
pcall = substitute(x[, jj], list(x = x, jj = jj))
158+
if (length(.SDcols)) pcall$.SDcols = .SDcols
153159
aggregate.set = function(by.set) {
154-
r = if (length(.SDcols)) x[, eval(jj), by.set, .SDcols=.SDcols] else x[, eval(jj), by.set]
160+
pcall$by = by.set
161+
r = eval(pcall, list(.datatable.aware = TRUE), enclos)
155162
if (id) {
156163
# integer bit mask of aggregation levels: http://www.postgresql.org/docs/9.5/static/functions-aggregate.html#FUNCTIONS-GROUPING-TABLE
157164
# 3267: strtoi("", base = 2L) output apparently unstable across platforms

R/print.data.table.R

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -199,8 +199,6 @@ has_format_method = function(x) {
199199
format_col.default = function(x, ...) {
200200
if (!is.null(dim(x)))
201201
"<multi-column>"
202-
else if (has_format_method(x) && length(formatted<-format(x, ...))==length(x))
203-
formatted #PR5224 motivated by package sf where column class is c("sfc_MULTIPOLYGON","sfc") and sf:::format.sfc exists
204202
else if (is.list(x))
205203
vapply_1c(x, format_list_item, ...)
206204
else

inst/tests/other.Rraw

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
pkgs = c("ggplot2", "hexbin", "plyr", "dplyr", "caret", "zoo", "xts", "gdata", "nlme", "bit64", "knitr", "parallel", "sf", "nanotime", "R.utils", "yaml")
1+
pkgs = c("bit64", "caret", "dplyr", "gdata", "ggplot2", "hexbin", "knitr", "nanotime", "nlme", "parallel", "plyr", "R.utils", "sf", "vctrs", "xts", "yaml", "zoo")
22
# First expression of this file must be as above: .gitlab-ci.yml uses parse(,n=1L) to read one expression from this file and installs pkgs.
33
# So that these dependencies of other.Rraw are maintained in a single place.
44
# TEST_DATA_TABLE_WITH_OTHER_PACKAGES is off by default so this other.Rraw doesn't run on CRAN. It is run by GLCI, locally in dev, and by
@@ -221,6 +221,7 @@ test(14.2, {example('CJ', package='data.table', local=TRUE, echo=FALSE); TRUE})
221221
if (loaded[["sf"]]) { #2273
222222
DT = as.data.table(st_read(system.file("shape/nc.shp", package = "sf"), quiet=TRUE))
223223
test(15, DT[1:3, .(NAME, FIPS, geometry)], output="Ashe.*-81.4.*Surry.*-80.4")
224+
test(15.1, DT, output="MULTIPOLYGON (((") # make sure individual list items are formatted, #6637, #5224
224225

225226
dsf = sf::st_as_sf(data.table(x=1:10, y=1:10, s=sample(1:2, 10, TRUE)), coords=1:2)
226227
test(16, split(dsf, dsf$s), list(`1` = dsf[dsf$s == 1, ], `2` = dsf[dsf$s == 2, ]))
@@ -774,3 +775,9 @@ if (loaded[["nanotime"]]) {
774775
res <- tables(env=.e)
775776
test(32, res[, .(NAME,NROW,NCOL,MB)], data.table(NAME="DT",NROW=20000000L,NCOL=15L,MB=2288.0))
776777
rm(.e, res)
778+
779+
if (loaded[["vctrs"]]) {
780+
# vctrs::list_of() columns are treated the same as other list() columns
781+
DT = data.table(a = 1, b = list_of(mtcars))
782+
test(33, DT, output="<vctrs_list_of>.*<data\\.frame\\[32x11\\]>")
783+
}

inst/tests/tests.Rraw

Lines changed: 44 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16800,15 +16800,15 @@ registerS3method("format_col", "complex", format_col.default)
1680016800
# then i) test 1610.1 fails if test.data.table() is rerun, ii) user display of complex data would be affected
1680116801
# did try wrapping with on.exit(,add=TRUE) but perhaps because this is a script that is sys.source'd, it ran straight away
1680216802

16803-
# format method for column takes predecedence over format method for each list item
16803+
# as of #6637, format individual list items but not the whole list
1680416804
registerS3method("format", "myclass2130", function(x, ...) paste0("<", class(x)[1L], ":", x$id, ">"))
1680516805
DT = data.table(row=1:2, objs=list(structure(list(id="foo"), class="myclass2130"), structure(list(id="bar"), class="myclass2130")))
1680616806
test(2130.13, print(DT), output="myclass2130:foo.*myclass2130:bar")
1680716807
setattr(DT$objs, "class", "foo2130")
1680816808
registerS3method("format", "foo2130", function(x, ...) "All hail foo")
1680916809
test(2130.14, print(DT), output="myclass2130:foo.*myclass2130:bar") # because length 1 from format but needs to be length(x)
1681016810
registerS3method("format", "foo2130", function(x, ...) rep("All hail foo",length(x)))
16811-
test(2130.15, print(DT), output="All hail foo") # e.g. sf:::format.sfc rather than sf:::format.sfg on each item
16811+
test(2130.15, print(DT), output="myclass2130:foo.*myclass2130:bar") # used to call format(column), not vapply_1c(column, format)
1681216812
setattr(DT$objs, "class", "bar2130_with_no_method")
1681316813
test(2130.16, print(DT), output="myclass2130:foo.*myclass2130:bar")
1681416814
registerS3method("format", "myclass2130", format.default)
@@ -21253,14 +21253,53 @@ test(2323.1, names(data.frame(COL = it)), "COL")
2125321253
test(2323.2, names(data.frame(b = 1, COL = it)), c("b", "COL"))
2125421254
test(2323.3, names(as.data.frame(it, optional=TRUE)), NULL)
2125521255

21256+
# 'sets' is a local variable in groupingsets(), cube(), rollup() and shouldn't leak into the 'j' expression
21257+
n = 24L
21258+
set.seed(25)
21259+
DT = data.table(
21260+
color = sample(c("green","yellow","red"), n, TRUE),
21261+
year = as.Date(sample(paste0(2011:2015,"-01-01"), n, TRUE)),
21262+
status = as.factor(sample(c("removed","active","inactive","archived"), n, TRUE)),
21263+
amount = sample(1:5, n, TRUE),
21264+
value = sample(c(3, 3.5, 2.5, 2), n, TRUE)
21265+
)
21266+
sets = 0
21267+
test(2324.0,
21268+
groupingsets(DT, j = c(list(count=.N + ..sets)), by = c("color","year","status"), sets = list("color", c("year","status"), character()), id=TRUE),
21269+
groupingsets(DT, j = c(list(count=.N + 0)), by = c("color","year","status"), sets = list("color", c("year","status"), character()), id=TRUE)
21270+
)
21271+
test(2324.1,
21272+
cube(DT, j = sum(value) + ..sets, by = c("color","year","status"), id=TRUE),
21273+
cube(DT, j = sum(value), by = c("color","year","status"), id=TRUE)
21274+
)
21275+
test(2324.2,
21276+
rollup(DT, j = sum(value) + ..sets, by=c("color","year","status"), label="total"),
21277+
rollup(DT, j = sum(value), by=c("color","year","status"), label="total")
21278+
)
21279+
21280+
# allow na.strings to be quoted, #6974
21281+
f = tempfile()
21282+
DT = data.table(
21283+
"Date Example"=c("12/5/2012", NA),
21284+
"Question 1"=c("Yes", NA),
21285+
"Question 2"=c("Yes", NA),
21286+
"Site: Country"=c("Chile", "Virgin Islands, British")
21287+
)
21288+
fwrite(DT, f, na='""')
21289+
test(2325.1, fread(f, na.strings='""'), DT)
21290+
unlink(f)
21291+
test(2325.2,
21292+
fread('"foo","bar","baz"\n"a","b","c"', na.strings=c('"foo"', '"bar"', '"baz"'), header=FALSE),
21293+
data.table(V1=c(NA, "a"), V2=c(NA, "b"), V3=c(NA, "c")))
21294+
2125621295
## ensure setDT will retain key and indices when it is called on the list (cbindlist assumes this)
2125721296
local({
2125821297
d = data.table(x=1:2, y=2:1, z=2:1, v1=1:2)
2125921298
setkeyv(d, "x"); setindexv(d, list("y", "z"))
2126021299
a = attributes(d)
2126121300
attributes(d) = a[!names(a) %in% c("class", ".internal.selfref", "row.names")]
21262-
test(2324.01, class(d), "list")
21301+
test(2326.1, class(d), "list")
2126321302
setDT(d)
21264-
test(2324.02, key(d), "x")
21265-
test(2324.03, hasindex(d, "y") && hasindex(d, "z"))
21303+
test(2326.2, key(d), "x")
21304+
test(2326.3, hasindex(d, "y") && hasindex(d, "z"))
2126621305
})

man/fctr.Rd

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
\alias{factor}
44
\title{Create a factor retaining original ordering}
55
\description{
6-
Creates a code{\link[base:factor]{factor}}.
6+
Creates a \code{\link[base]{factor}}.
77

88
By default, the output will have its levels in the original order, i.e., \code{levels = unique(x)}, as opposed to \code{factor}'s default where \code{levels = sort(unique(x))}.
99
}
@@ -13,7 +13,7 @@ fctr(x, levels=unique(x), ..., sort=FALSE, rev=FALSE)
1313
\arguments{
1414
\item{x}{ Object to be turned into a factor. }
1515
\item{levels}{ Levels for the new factor; \code{unique(x)} by default. }
16-
\item{\dots}{ Other arguments passed to code{\link[base:factor]{factor}}. }
16+
\item{\dots}{ Other arguments passed to \code{\link[base]{factor}}. }
1717
\item{sort}{ Logical, default \code{FALSE}. Should \code{levels} be sorted? }
1818
\item{rev}{ Logical, default \code{FALSE}. Should \code{levels} be reversed? Applied \emph{after} \code{sort}. }
1919
}

man/groupingsets.Rd

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ rollup(x, \dots)
1515
cube(x, \dots)
1616
\method{cube}{data.table}(x, j, by, .SDcols, id = FALSE, label = NULL, \dots)
1717
groupingsets(x, \dots)
18-
\method{groupingsets}{data.table}(x, j, by, sets, .SDcols, id = FALSE, jj, label = NULL, \dots)
18+
\method{groupingsets}{data.table}(x, j, by, sets, .SDcols, id = FALSE, jj, label = NULL, enclos = parent.frame(), \dots)
1919
}
2020
\arguments{
2121
\item{x}{\code{data.table}.}
@@ -27,6 +27,7 @@ groupingsets(x, \dots)
2727
\item{id}{logical default \code{FALSE}. If \code{TRUE} it will add leading column with bit mask of grouping sets.}
2828
\item{jj}{quoted version of \code{j} argument, for convenience. When provided function will ignore \code{j} argument.}
2929
\item{label}{label(s) to be used in the 'total' rows in the grouping variable columns of the output, that is, in rows where the grouping variable has been aggregated. Can be a named list of scalars, or a scalar, or \code{NULL}. Defaults to \code{NULL}, which results in the grouping variables having \code{NA} in their 'total' rows. See Details.}
30+
\item{enclos}{the environment containing the symbols referenced by \code{jj}. When writing functions that accept a \code{j} environment for non-standard evaluation by \pkg{data.table}, \code{\link[base]{substitute}()} it and forward it to \code{groupingsets} using the \code{jj} argument, set this to the \code{\link[base]{parent.frame}()} of the function that captures \code{j}.}
3031
}
3132
\details{
3233
All three functions \code{rollup, cube, groupingsets} are generic methods, \code{data.table} methods are provided.

src/fread.c

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -515,6 +515,8 @@ static void Field(FieldParseContext *ctx)
515515
// the field is quoted and quotes are correctly escaped (quoteRule 0 and 1)
516516
// or the field is quoted but quotes are not escaped (quoteRule 2)
517517
// or the field is not quoted but the data contains a quote at the start (quoteRule 2 too)
518+
// What if this string signifies an NA? Will find out after we're done parsing quotes
519+
const char *field_after_NA = end_NA_string(fieldStart);
518520
fieldStart++; // step over opening quote
519521
switch(quoteRule) {
520522
case 0: // quoted with embedded quotes doubled; the final unescaped " must be followed by sep|eol
@@ -573,6 +575,8 @@ static void Field(FieldParseContext *ctx)
573575
if (ch == eof && quoteRule != 2) { target->off--; target->len++; } // test 1324 where final field has open quote but not ending quote; include the open quote like quote rule 2
574576
while(target->len > 0 && ((ch[-1] == ' ' && stripWhite) || ch[-1] == '\0')) { target->len--; ch--; } // test 1551.6; trailing whitespace in field [67,V37] == "\"\"A\"\" ST "
575577
}
578+
// Does end-of-field correspond to end-of-possible-NA?
579+
if (field_after_NA == ch) target->len = INT32_MIN;
576580
}
577581

578582
static void str_to_i32_core(const char **pch, int32_t *target, bool parse_date)
@@ -770,7 +774,7 @@ static void parse_double_regular_core(const char **pch, double *target)
770774
// Not a single digit after "E"? Invalid number
771775
return;
772776
}
773-
e += Eneg? -E : E;
777+
e += Eneg ? -E : E;
774778
}
775779
if (e < -350 || e > 350) return;
776780

@@ -1418,7 +1422,7 @@ int freadMain(freadMainArgs _args) {
14181422
// Mac doesn't appear to support MAP_POPULATE anyway (failed on CRAN when I tried).
14191423
// TO DO?: MAP_HUGETLB for Linux but seems to need admin to setup first. My Hugepagesize is 2MB (>>2KB, so promising)
14201424
// https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt
1421-
mmp = mmap(NULL, fileSize, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0); // COW for last page lastEOLreplaced
1425+
mmp = mmap(NULL, fileSize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); // COW for last page lastEOLreplaced
14221426
#ifdef __EMSCRIPTEN__
14231427
mmp_fd = fd;
14241428
#else
@@ -1447,11 +1451,11 @@ int freadMain(freadMainArgs _args) {
14471451
STOP(_("File size [%s] exceeds the address space: %s"), filesize_to_str(liFileSize.QuadPart), fnam); // # nocov
14481452
}
14491453
fileSize = (size_t)liFileSize.QuadPart;
1450-
if (fileSize==0) { CloseHandle(hFile); STOP(_("File is empty: %s"), fnam); }
1454+
if (fileSize == 0) { CloseHandle(hFile); STOP(_("File is empty: %s"), fnam); }
14511455
if (verbose) DTPRINT(_(" File opened, size = %s.\n"), filesize_to_str(fileSize));
14521456
HANDLE hMap = CreateFileMapping(hFile, NULL, PAGE_WRITECOPY, 0, 0, NULL);
14531457
if (hMap == NULL) { CloseHandle(hFile); STOP(_("This is Windows, CreateFileMapping returned error %lu for file %s"), GetLastError(), fnam); }
1454-
mmp = MapViewOfFile(hMap,FILE_MAP_COPY,0,0,fileSize); // fileSize must be <= hilo passed to CreateFileMapping above.
1458+
mmp = MapViewOfFile(hMap, FILE_MAP_COPY, 0, 0, fileSize); // fileSize must be <= hilo passed to CreateFileMapping above.
14551459
CloseHandle(hMap); // we don't need to keep the file open; the MapView keeps an internal reference;
14561460
CloseHandle(hFile); // see https://msdn.microsoft.com/en-us/library/windows/desktop/aa366537(v=vs.85).aspx
14571461
if (mmp == NULL) {
@@ -1500,7 +1504,7 @@ int freadMain(freadMainArgs _args) {
15001504
if (verbose) DTPRINT(_(" Last byte(s) of input found to be %s and removed.\n"),
15011505
c ? "0x1A (Ctrl+Z)" : "0x00 (NUL)");
15021506
}
1503-
if (eof<=sof) STOP(_("Input is empty or only contains BOM or terminal control characters"));
1507+
if (eof <= sof) STOP(_("Input is empty or only contains BOM or terminal control characters"));
15041508
}
15051509

15061510
//*********************************************************************************************
@@ -2270,7 +2274,7 @@ int freadMain(freadMainArgs _args) {
22702274
chunkBytes = bytesRead / nJumps;
22712275
} else {
22722276
ASSERT(nJumps == 1 /*when nrowLimit supplied*/ || nJumps == 2 /*small files*/, "nJumps (%d) != 1|2", nJumps);
2273-
nJumps=1;
2277+
nJumps = 1;
22742278
}
22752279
int64_t initialBuffRows = allocnrow / nJumps;
22762280

@@ -2421,7 +2425,7 @@ int freadMain(freadMainArgs _args) {
24212425
if (eol(&tch) && skipEmptyLines) { tch++; continue; }
24222426
tch = tLineStart; // in case white space at the beginning may need to be including in field
24232427
}
2424-
else if (eol(&tch) && j<ncol) { // j<ncol needed for #2523 (erroneous extra comma after last field)
2428+
else if (eol(&tch) && j < ncol) { // j<ncol needed for #2523 (erroneous extra comma after last field)
24252429
int8_t thisSize = size[j];
24262430
if (thisSize) ((char **) targets)[thisSize] += thisSize;
24272431
j++;

vignettes/datatable-intro.Rmd

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -653,13 +653,18 @@ flights[, `:=`(makeup = dep_delay - arr_delay)]
653653
makeup.models <- flights[, .(fit = list(lm(makeup ~ distance))), by = .(month)]
654654
makeup.models[, .(coefdist = coef(fit[[1]])[2], rsq = summary(fit[[1]])$r.squared), by = .(month)]
655655
```
656+
656657
Using data.frames, we need more complicated code to obtain same result.
658+
657659
```{r}
658660
setDF(flights)
659661
flights.split <- split(flights, f = flights$month)
660662
makeup.models.list <- lapply(flights.split, function(df) c(month = df$month[1], fit = list(lm(makeup ~ distance, data = df))))
661663
makeup.models.df <- do.call(rbind, makeup.models.list)
662-
sapply(makeup.models.df[, "fit"], function(model) c(coefdist = coef(model)[2], rsq = summary(model)$r.squared)) |> t() |> data.frame()
664+
data.frame(t(sapply(
665+
makeup.models.df[, "fit"],
666+
function(model) c(coefdist = coef(model)[2L], rsq = summary(model)$r.squared)
667+
)))
663668
setDT(flights)
664669
```
665670

0 commit comments

Comments
 (0)