Merge branch 'master' into cbindlist

MichaelChirico · MichaelChirico · commit 71219c6d2592 · 2025-07-01T16:53:39.000Z
diff --git a/NEWS.md b/NEWS.md
@@ -40,6 +40,8 @@
     # 2:     2     6     4     5
     ```
 
+8. `groupingsets()` gets a new argument `enclos` for use together with the `jj` argument in functions wrapping `groupingsets()`, including the existing wrappers `rollup()` and `cube()`. When forwarding a `j`-expression as `groupingsets(jj = substitute(j))`, make sure to pass `enclos = parent.frame()` as well, so that the `j`-expression will be evaluated in the right context. This makes it possible for `j` to refer to variables outside the `data.table`.
+
 ### BUG FIXES
 
 1. Custom binary operators from the `lubridate` package now work with objects of class `IDate` as with a `Date` subclass, [#6839](https://github.com/Rdatatable/data.table/issues/6839). Thanks @emallickhossain for the report and @aitap for the fix.
@@ -72,6 +74,10 @@
 
 15. Including an `ITime` object as a named input to `data.frame()` respects the provided name, i.e. `data.frame(a = as.ITime(...))` will have column `a`, [#4673](https://github.com/Rdatatable/data.table/issues/4673). Thanks @shrektan for the report and @MichaelChirico for the fix.
 
+16. `fread()` now handles the `na.strings` argument for quoted text columns, making it possible to specify `na.strings = '""'` and read empty quoted strings as `NA`s, [#6974](https://github.com/Rdatatable/data.table/issues/6974). Thanks to @AngelFelizR for the report and @aitap for the PR.
+
+17. A data.table with a column of class `vctrs_list_of` (from package {vctrs}) prints as expected, [#5948](https://github.com/Rdatatable/data.table/issues/5948). Previously, such columns could be printed messily, e.g. by printing every entry of a nested data.frame. Thanks @jesse-smith for the report, @DavisVaughan and @r2evans for contributing, and @MichaelChirico for the PR.
+
 ### NOTES
 
 1. Continued work to remove non-API C functions, [#6180](https://github.com/Rdatatable/data.table/issues/6180). Thanks Ivan Krylov for the PRs and for writing a clear and concise guide about the R API: https://aitap.codeberg.page/R-api/.
diff --git a/R/groupingsets.R b/R/groupingsets.R
@@ -13,7 +13,7 @@ rollup.data.table = function(x, j, by, .SDcols, id = FALSE, label = NULL, ...) {
   sets = lapply(length(by):0L, function(i) by[0L:i])
   # redirect to workhorse function
   jj = substitute(j)
-  groupingsets.data.table(x, by=by, sets=sets, .SDcols=.SDcols, id=id, jj=jj, label=label)
+  groupingsets.data.table(x, by=by, sets=sets, .SDcols=.SDcols, id=id, jj=jj, label=label, enclos = parent.frame())
 }
 
 cube = function(x, ...) {
@@ -35,13 +35,13 @@ cube.data.table = function(x, j, by, .SDcols, id = FALSE, label = NULL, ...) {
   sets = lapply((2L^n):1L, function(jj) by[keepBool[jj, ]])
   # redirect to workhorse function
   jj = substitute(j)
-  groupingsets.data.table(x, by=by, sets=sets, .SDcols=.SDcols, id=id, jj=jj, label=label)
+  groupingsets.data.table(x, by=by, sets=sets, .SDcols=.SDcols, id=id, jj=jj, label=label, enclos = parent.frame())
 }
 
 groupingsets = function(x, ...) {
   UseMethod("groupingsets")
 }
-groupingsets.data.table = function(x, j, by, sets, .SDcols, id = FALSE, jj, label = NULL, ...) {
+groupingsets.data.table = function(x, j, by, sets, .SDcols, id = FALSE, jj, label = NULL, enclos = parent.frame(), ...) {
   # input data type basic validation
   if (!is.data.table(x))
     stopf("Argument 'x' must be a data.table object")
@@ -112,7 +112,10 @@ groupingsets.data.table = function(x, j, by, sets, .SDcols, id = FALSE, jj, labe
     .SDcols = if (".SD" %chin% av) setdiff(names(x), by) else NULL
   if (length(names(by))) by = unname(by)
   # 0 rows template data.table to keep colorder and type
-  empty = if (length(.SDcols)) x[0L, eval(jj), by, .SDcols=.SDcols] else x[0L, eval(jj), by]
+  # inline all arguments that might clash with enclosing environment
+  pcall = substitute(x[0L, jj, by], list(x = x, jj = jj, by = by))
+  if (length(.SDcols)) pcall$.SDcols = .SDcols
+  empty = eval(pcall, list(.datatable.aware = TRUE), enclos)
   if (id && "grouping" %chin% names(empty)) # `j` could have been evaluated to `grouping` field
     stopf("When using `id=TRUE` the 'j' expression must not evaluate to a column named 'grouping'.")
   if (anyDuplicated(names(empty)) > 0L)
@@ -150,8 +153,12 @@ groupingsets.data.table = function(x, j, by, sets, .SDcols, id = FALSE, jj, labe
     stopf("Using integer64 class columns require to have 'bit64' package installed.") # nocov
   int64.by.cols = intersect(int64.cols, by)
   # aggregate function called for each grouping set
+  # inline all arguments that might clash with enclosing environment
+  pcall = substitute(x[, jj], list(x = x, jj = jj))
+  if (length(.SDcols)) pcall$.SDcols = .SDcols
   aggregate.set = function(by.set) {
-    r = if (length(.SDcols)) x[, eval(jj), by.set, .SDcols=.SDcols] else x[, eval(jj), by.set]
+    pcall$by = by.set
+    r = eval(pcall, list(.datatable.aware = TRUE), enclos)
     if (id) {
       # integer bit mask of aggregation levels: http://www.postgresql.org/docs/9.5/static/functions-aggregate.html#FUNCTIONS-GROUPING-TABLE
       # 3267: strtoi("", base = 2L) output apparently unstable across platforms
diff --git a/R/print.data.table.R b/R/print.data.table.R
@@ -199,8 +199,6 @@ has_format_method = function(x) {
 format_col.default = function(x, ...) {
   if (!is.null(dim(x)))
     "<multi-column>"
-  else if (has_format_method(x) && length(formatted<-format(x, ...))==length(x))
-    formatted  #PR5224 motivated by package sf where column class is c("sfc_MULTIPOLYGON","sfc") and sf:::format.sfc exists
   else if (is.list(x))
     vapply_1c(x, format_list_item, ...)
   else
diff --git a/inst/tests/other.Rraw b/inst/tests/other.Rraw
@@ -1,4 +1,4 @@
-pkgs = c("ggplot2", "hexbin", "plyr", "dplyr", "caret", "zoo", "xts", "gdata", "nlme", "bit64", "knitr", "parallel", "sf", "nanotime", "R.utils", "yaml")
+pkgs = c("bit64", "caret", "dplyr", "gdata", "ggplot2", "hexbin", "knitr", "nanotime", "nlme", "parallel", "plyr", "R.utils", "sf", "vctrs", "xts", "yaml", "zoo")
 # First expression of this file must be as above: .gitlab-ci.yml uses parse(,n=1L) to read one expression from this file and installs pkgs.
 # So that these dependencies of other.Rraw are maintained in a single place.
 # TEST_DATA_TABLE_WITH_OTHER_PACKAGES is off by default so this other.Rraw doesn't run on CRAN. It is run by GLCI, locally in dev, and by
@@ -221,6 +221,7 @@ test(14.2, {example('CJ', package='data.table', local=TRUE, echo=FALSE); TRUE})
 if (loaded[["sf"]]) {  #2273
   DT = as.data.table(st_read(system.file("shape/nc.shp", package = "sf"), quiet=TRUE))
   test(15, DT[1:3, .(NAME, FIPS, geometry)], output="Ashe.*-81.4.*Surry.*-80.4")
+  test(15.1, DT, output="MULTIPOLYGON (((") # make sure individual list items are formatted, #6637, #5224
 
   dsf = sf::st_as_sf(data.table(x=1:10, y=1:10, s=sample(1:2, 10, TRUE)), coords=1:2)
   test(16, split(dsf, dsf$s), list(`1` = dsf[dsf$s == 1, ], `2` = dsf[dsf$s == 2, ]))
@@ -774,3 +775,9 @@ if (loaded[["nanotime"]]) {
 res <- tables(env=.e)
 test(32, res[, .(NAME,NROW,NCOL,MB)], data.table(NAME="DT",NROW=20000000L,NCOL=15L,MB=2288.0))
 rm(.e, res)
+
+if (loaded[["vctrs"]]) {
+  # vctrs::list_of() columns are treated the same as other list() columns
+  DT = data.table(a = 1, b = list_of(mtcars))
+  test(33, DT, output="<vctrs_list_of>.*<data\\.frame\\[32x11\\]>")
+}
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -16800,15 +16800,15 @@ registerS3method("format_col", "complex", format_col.default)
 # then i) test 1610.1 fails if test.data.table() is rerun, ii) user display of complex data would be affected
 # did try wrapping with on.exit(,add=TRUE) but perhaps because this is a script that is sys.source'd, it ran straight away
 
-# format method for column takes predecedence over format method for each list item
+# as of #6637, format individual list items but not the whole list
 registerS3method("format", "myclass2130", function(x, ...) paste0("<", class(x)[1L], ":", x$id, ">"))
 DT = data.table(row=1:2, objs=list(structure(list(id="foo"), class="myclass2130"), structure(list(id="bar"), class="myclass2130")))
 test(2130.13, print(DT), output="myclass2130:foo.*myclass2130:bar")
 setattr(DT$objs, "class", "foo2130")
 registerS3method("format", "foo2130", function(x, ...) "All hail foo")
 test(2130.14, print(DT), output="myclass2130:foo.*myclass2130:bar") # because length 1 from format but needs to be length(x)
 registerS3method("format", "foo2130", function(x, ...) rep("All hail foo",length(x)))
-test(2130.15, print(DT), output="All hail foo")  # e.g. sf:::format.sfc rather than sf:::format.sfg on each item
+test(2130.15, print(DT), output="myclass2130:foo.*myclass2130:bar") # used to call format(column), not vapply_1c(column, format)
 setattr(DT$objs, "class", "bar2130_with_no_method")
 test(2130.16, print(DT), output="myclass2130:foo.*myclass2130:bar")
 registerS3method("format", "myclass2130", format.default)
@@ -21253,14 +21253,53 @@ test(2323.1, names(data.frame(COL = it)), "COL")
 test(2323.2, names(data.frame(b = 1, COL = it)), c("b", "COL"))
 test(2323.3, names(as.data.frame(it, optional=TRUE)), NULL)
 
+# 'sets' is a local variable in groupingsets(), cube(), rollup() and shouldn't leak into the 'j' expression
+n = 24L
+set.seed(25)
+DT = data.table(
+    color = sample(c("green","yellow","red"), n, TRUE),
+    year = as.Date(sample(paste0(2011:2015,"-01-01"), n, TRUE)),
+    status = as.factor(sample(c("removed","active","inactive","archived"), n, TRUE)),
+    amount = sample(1:5, n, TRUE),
+    value = sample(c(3, 3.5, 2.5, 2), n, TRUE)
+)
+sets = 0
+test(2324.0,
+  groupingsets(DT, j = c(list(count=.N + ..sets)), by = c("color","year","status"), sets = list("color", c("year","status"), character()), id=TRUE),
+  groupingsets(DT, j = c(list(count=.N + 0)), by = c("color","year","status"), sets = list("color", c("year","status"), character()), id=TRUE)
+)
+test(2324.1,
+  cube(DT, j = sum(value) + ..sets, by = c("color","year","status"), id=TRUE),
+  cube(DT, j = sum(value), by = c("color","year","status"), id=TRUE)
+)
+test(2324.2,
+  rollup(DT, j = sum(value) + ..sets, by=c("color","year","status"), label="total"),
+  rollup(DT, j = sum(value), by=c("color","year","status"), label="total")
+)
+
+# allow na.strings to be quoted, #6974
+f = tempfile()
+DT = data.table(
+  "Date Example"=c("12/5/2012", NA),
+  "Question 1"=c("Yes", NA),
+  "Question 2"=c("Yes", NA),
+  "Site: Country"=c("Chile", "Virgin Islands, British")
+)
+fwrite(DT, f, na='""')
+test(2325.1, fread(f, na.strings='""'), DT)
+unlink(f)
+test(2325.2,
+     fread('"foo","bar","baz"\n"a","b","c"', na.strings=c('"foo"', '"bar"', '"baz"'), header=FALSE),
+     data.table(V1=c(NA, "a"), V2=c(NA, "b"), V3=c(NA, "c")))
+
 ## ensure setDT will retain key and indices when it is called on the list (cbindlist assumes this)
 local({
   d = data.table(x=1:2, y=2:1, z=2:1, v1=1:2)
   setkeyv(d, "x"); setindexv(d, list("y", "z"))
   a = attributes(d)
   attributes(d) = a[!names(a) %in% c("class", ".internal.selfref", "row.names")]
-  test(2324.01, class(d), "list")
+  test(2326.1, class(d), "list")
   setDT(d)
-  test(2324.02, key(d), "x")
-  test(2324.03, hasindex(d, "y") && hasindex(d, "z"))
+  test(2326.2, key(d), "x")
+  test(2326.3, hasindex(d, "y") && hasindex(d, "z"))
 })
diff --git a/man/fctr.Rd b/man/fctr.Rd
@@ -3,7 +3,7 @@
 \alias{factor}
 \title{Create a factor retaining original ordering}
 \description{
-  Creates a code{\link[base:factor]{factor}}.
+  Creates a \code{\link[base]{factor}}.
 
   By default, the output will have its levels in the original order, i.e., \code{levels = unique(x)}, as opposed to \code{factor}'s default where \code{levels = sort(unique(x))}.
 }
@@ -13,7 +13,7 @@ fctr(x, levels=unique(x), ..., sort=FALSE, rev=FALSE)
 \arguments{
   \item{x}{ Object to be turned into a factor. }
   \item{levels}{ Levels for the new factor; \code{unique(x)} by default. }
-  \item{\dots}{ Other arguments passed to code{\link[base:factor]{factor}}. }
+  \item{\dots}{ Other arguments passed to \code{\link[base]{factor}}. }
   \item{sort}{ Logical, default \code{FALSE}. Should \code{levels} be sorted? }
   \item{rev}{ Logical, default \code{FALSE}. Should \code{levels} be reversed? Applied \emph{after} \code{sort}. }
 }
diff --git a/man/groupingsets.Rd b/man/groupingsets.Rd
@@ -15,7 +15,7 @@ rollup(x, \dots)
 cube(x, \dots)
 \method{cube}{data.table}(x, j, by, .SDcols, id = FALSE, label = NULL, \dots)
 groupingsets(x, \dots)
-\method{groupingsets}{data.table}(x, j, by, sets, .SDcols, id = FALSE, jj, label = NULL, \dots)
+\method{groupingsets}{data.table}(x, j, by, sets, .SDcols, id = FALSE, jj, label = NULL, enclos = parent.frame(), \dots)
 }
 \arguments{
 	\item{x}{\code{data.table}.}
@@ -27,6 +27,7 @@ groupingsets(x, \dots)
 	\item{id}{logical default \code{FALSE}. If \code{TRUE} it will add leading column with bit mask of grouping sets.}
 	\item{jj}{quoted version of \code{j} argument, for convenience. When provided function will ignore \code{j} argument.}
 	\item{label}{label(s) to be used in the 'total' rows in the grouping variable columns of the output, that is, in rows where the grouping variable has been aggregated. Can be a named list of scalars, or a scalar, or \code{NULL}. Defaults to \code{NULL}, which results in the grouping variables having \code{NA} in their 'total' rows. See Details.}
+	\item{enclos}{the environment containing the symbols referenced by \code{jj}. When writing a function that accepts a \code{j} expression for non-standard evaluation by \pkg{data.table}, capture the expression with \code{\link[base]{substitute}()}, forward it to \code{groupingsets} via the \code{jj} argument, and set \code{enclos} to the \code{\link[base]{parent.frame}()} of the function that captures \code{j}.}
 }
 \details{
     All three functions \code{rollup, cube, groupingsets} are generic methods, \code{data.table} methods are provided.
diff --git a/src/fread.c b/src/fread.c
@@ -515,6 +515,8 @@ static void Field(FieldParseContext *ctx)
   //    the field is quoted and quotes are correctly escaped (quoteRule 0 and 1)
   // or the field is quoted but quotes are not escaped (quoteRule 2)
   // or the field is not quoted but the data contains a quote at the start (quoteRule 2 too)
+  // What if this string signifies an NA? Will find out after we're done parsing quotes
+  const char *field_after_NA = end_NA_string(fieldStart);
   fieldStart++;  // step over opening quote
   switch(quoteRule) {
   case 0:  // quoted with embedded quotes doubled; the final unescaped " must be followed by sep|eol
@@ -573,6 +575,8 @@ static void Field(FieldParseContext *ctx)
     if (ch == eof && quoteRule != 2) { target->off--; target->len++; }   // test 1324 where final field has open quote but not ending quote; include the open quote like quote rule 2
     while(target->len > 0 && ((ch[-1] == ' ' && stripWhite) || ch[-1] == '\0')) { target->len--; ch--; }  // test 1551.6; trailing whitespace in field [67,V37] == "\"\"A\"\" ST       "
   }
+  // Does end-of-field correspond to end-of-possible-NA?
+  if (field_after_NA == ch) target->len = INT32_MIN;
 }
 
 static void str_to_i32_core(const char **pch, int32_t *target, bool parse_date)
@@ -770,7 +774,7 @@ static void parse_double_regular_core(const char **pch, double *target)
       // Not a single digit after "E"? Invalid number
         return;
     }
-    e += Eneg? -E : E;
+    e += Eneg ? -E : E;
   }
   if (e < -350 || e > 350) return;
 
@@ -1418,7 +1422,7 @@ int freadMain(freadMainArgs _args) {
       // Mac doesn't appear to support MAP_POPULATE anyway (failed on CRAN when I tried).
       // TO DO?: MAP_HUGETLB for Linux but seems to need admin to setup first. My Hugepagesize is 2MB (>>2KB, so promising)
       //         https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt
-      mmp = mmap(NULL, fileSize, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);  // COW for last page lastEOLreplaced
+      mmp = mmap(NULL, fileSize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);  // COW for last page lastEOLreplaced
       #ifdef __EMSCRIPTEN__
         mmp_fd = fd;
       #else
@@ -1447,11 +1451,11 @@ int freadMain(freadMainArgs _args) {
         STOP(_("File size [%s] exceeds the address space: %s"), filesize_to_str(liFileSize.QuadPart), fnam); // # nocov
       }
       fileSize = (size_t)liFileSize.QuadPart;
-      if (fileSize==0) { CloseHandle(hFile); STOP(_("File is empty: %s"), fnam); }
+      if (fileSize == 0) { CloseHandle(hFile); STOP(_("File is empty: %s"), fnam); }
       if (verbose) DTPRINT(_("  File opened, size = %s.\n"), filesize_to_str(fileSize));
       HANDLE hMap = CreateFileMapping(hFile, NULL, PAGE_WRITECOPY, 0, 0, NULL);
       if (hMap == NULL) { CloseHandle(hFile); STOP(_("This is Windows, CreateFileMapping returned error %lu for file %s"), GetLastError(), fnam); }
-      mmp = MapViewOfFile(hMap,FILE_MAP_COPY,0,0,fileSize);  // fileSize must be <= hilo passed to CreateFileMapping above.
+      mmp = MapViewOfFile(hMap, FILE_MAP_COPY, 0, 0, fileSize);  // fileSize must be <= hilo passed to CreateFileMapping above.
       CloseHandle(hMap);  // we don't need to keep the file open; the MapView keeps an internal reference;
       CloseHandle(hFile); //   see https://msdn.microsoft.com/en-us/library/windows/desktop/aa366537(v=vs.85).aspx
       if (mmp == NULL) {
@@ -1500,7 +1504,7 @@ int freadMain(freadMainArgs _args) {
     if (verbose) DTPRINT(_("  Last byte(s) of input found to be %s and removed.\n"),
                          c ? "0x1A (Ctrl+Z)" : "0x00 (NUL)");
   }
-  if (eof<=sof) STOP(_("Input is empty or only contains BOM or terminal control characters"));
+  if (eof <= sof) STOP(_("Input is empty or only contains BOM or terminal control characters"));
   }
 
   //*********************************************************************************************
@@ -2270,7 +2274,7 @@ int freadMain(freadMainArgs _args) {
     chunkBytes = bytesRead / nJumps;
   } else {
     ASSERT(nJumps == 1 /*when nrowLimit supplied*/ || nJumps == 2 /*small files*/, "nJumps (%d) != 1|2", nJumps);
-    nJumps=1;
+    nJumps = 1;
   }
   int64_t initialBuffRows = allocnrow / nJumps;
 
@@ -2421,7 +2425,7 @@ int freadMain(freadMainArgs _args) {
               if (eol(&tch) && skipEmptyLines) { tch++; continue; }
               tch = tLineStart;  // in case white space at the beginning may need to be including in field
             }
-            else if (eol(&tch) && j<ncol) {   // j<ncol needed for #2523 (erroneous extra comma after last field)
+            else if (eol(&tch) && j < ncol) {   // j<ncol needed for #2523 (erroneous extra comma after last field)
               int8_t thisSize = size[j];
               if (thisSize) ((char **) targets)[thisSize] += thisSize;
               j++;
diff --git a/vignettes/datatable-intro.Rmd b/vignettes/datatable-intro.Rmd
@@ -653,13 +653,18 @@ flights[, `:=`(makeup = dep_delay - arr_delay)]
 makeup.models <- flights[, .(fit = list(lm(makeup ~ distance))), by = .(month)]
 makeup.models[, .(coefdist = coef(fit[[1]])[2], rsq = summary(fit[[1]])$r.squared), by = .(month)]
 ```
+
 Using data.frames, we need more complicated code to obtain same result.
+
 ```{r}
 setDF(flights)
 flights.split <- split(flights, f = flights$month)
 makeup.models.list <- lapply(flights.split, function(df) c(month = df$month[1], fit = list(lm(makeup ~ distance, data = df))))
 makeup.models.df <- do.call(rbind, makeup.models.list)
-sapply(makeup.models.df[, "fit"], function(model) c(coefdist = coef(model)[2], rsq =  summary(model)$r.squared)) |> t() |> data.frame()
+data.frame(t(sapply(
+  makeup.models.df[, "fit"],
+  function(model) c(coefdist = coef(model)[2L], rsq =  summary(model)$r.squared)
+)))
 setDT(flights)
 ```