diff --git a/.ci/atime/tests.R b/.ci/atime/tests.R index 761ba3c2bb..e52369b665 100644 --- a/.ci/atime/tests.R +++ b/.ci/atime/tests.R @@ -285,6 +285,43 @@ test.list <- atime::atime_test_list( expr = data.table::isoweek(x), Slow = "548410d23dd74b625e8ea9aeb1a5d2e9dddd2927", # Parent of the first commit in the PR (https://github.com/Rdatatable/data.table/commit/548410d23dd74b625e8ea9aeb1a5d2e9dddd2927) Fast = "c0b32a60466bed0e63420ec105bc75c34590865e"), # Commit in the PR (https://github.com/Rdatatable/data.table/pull/7144/commits) that uses a much faster implementation + + # Test case adapted from https://github.com/Rdatatable/data.table/issues/4177 which is where the issue was reported. + # Fixed in https://github.com/Rdatatable/data.table/pull/7236 + "fwrite(select) #4177 Nx5" = atime::atime_test( + setup = { + set.seed(1L) + DT = data.table(a=rnorm(N), b=rnorm(N), c=rnorm(N), d=rnorm(N), e=rnorm(N)) + temp_file = tempfile() + }, + expr = { + has_select = "select" %chin% names(formals(data.table::fwrite)) + if (has_select) { + data.table::fwrite(DT, temp_file, select = c("a","b","c")) + } else { + data.table::fwrite(DT[, c("a","b","c"), with = FALSE], temp_file) + } + }, + Slow = "66cb6d2393cef30083b444346a7600a079207806", # Parent of the first commit in the PR (https://github.com/Rdatatable/data.table/commit/66cb6d2393cef30083b444346a7600a079207806) + Fast = "1887699fe965b5aa1fb8cb16b5507b7a5cbf5c85"), # Commit in the PR (https://github.com/Rdatatable/data.table/pull/7236/commits) that adds select parameter + + # Wide case: 1 row, N columns; select a single column + "fwrite(select) #4177 1xN" = atime::atime_test( + setup = { + DT = data.table(t(1:N)) + temp_file = tempfile() + select_idx = 1L + }, + expr = { + has_select = "select" %chin% names(formals(data.table::fwrite)) + if (has_select) { + data.table::fwrite(DT, temp_file, select = select_idx) + } else { + data.table::fwrite(data.table:::`[.data.table`(DT, , select_idx, with = FALSE), temp_file) + } + }, + 
Slow = "66cb6d2393cef30083b444346a7600a079207806", # Parent of the first commit in the PR (https://github.com/Rdatatable/data.table/commit/66cb6d2393cef30083b444346a7600a079207806) + Fast = "1887699fe965b5aa1fb8cb16b5507b7a5cbf5c85"), # Commit in the PR (https://github.com/Rdatatable/data.table/pull/7236/commits) that adds select parameter tests=extra.test.list) # nolint end: undesirable_operator_linter. diff --git a/NEWS.md b/NEWS.md index fc69449438..1a5e167bd8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -69,6 +69,8 @@ 15. New function `isoyear()` has been implemented as a complement to `isoweek()`, returning the ISO 8601 year corresponding to a given date, [#7154](https://github.com/Rdatatable/data.table/issues/7154). Thanks to @ben-schwen and @MichaelChirico for the suggestion and @venom1204 for the implementation. +16. `fwrite()` gains `select` argument to write only specified columns, avoiding temporary object creation for memory efficiency, [#4177](https://github.com/Rdatatable/data.table/issues/4177). For `data.table` objects, this uses `.shallow()` to create shallow copies without data duplication. Thanks to @artidataio for feature request, @ColeMiller1 for suggesting implementation and @Mukulyadav2004 for the implementation. + ### BUG FIXES 1. `fread()` no longer warns on certain systems on R 4.5.0+ where the file owner can't be resolved, [#6918](https://github.com/Rdatatable/data.table/issues/6918). Thanks @ProfFancyPants for the report and PR. 
diff --git a/R/fwrite.R b/R/fwrite.R index 5d91b4e347..6c25d74b1f 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -14,7 +14,8 @@ fwrite = function(x, file="", append=FALSE, quote="auto", bom = FALSE, verbose=getOption("datatable.verbose", FALSE), encoding = "", - forceDecimal = FALSE) { + forceDecimal = FALSE, + select = NULL) { na = as.character(na[1L]) # fix for #1725 if (length(encoding) != 1L || !encoding %chin% c("", "UTF-8", "native")) { stopf("Argument 'encoding' must be '', 'UTF-8' or 'native'.") @@ -27,6 +28,7 @@ fwrite = function(x, file="", append=FALSE, quote="auto", buffMB = as.integer(buffMB) nThread = as.integer(nThread) compressLevel = as.integer(compressLevel) + # write.csv default is 'double' so fwrite follows suit. write.table's default is 'escape' # validate arguments if (is.matrix(x)) { # coerce to data.table if input object is matrix @@ -39,6 +41,19 @@ fwrite = function(x, file="", append=FALSE, quote="auto", x = as.data.table(x) } } + # Handle select argument using .shallow() + if (!is.null(select)) { + # Resolve names and positions alike through colnamesInt() instead of a bare as.integer(), so an unchecked index is never handed to .shallow(). NOTE(review): assumes colnamesInt() rejects NA / out-of-range positions and returns integer - confirm + cols = colnamesInt(x, select) + if (is.data.table(x)) { + if (!identical(cols, seq_len(NCOL(x)))) { # only build a shallow view when columns are reduced or reordered + x = .shallow(x, cols) + } + } else { + x = x[cols] + } + } + stopifnot( is.list(x), identical(quote,"auto") || isTRUEorFALSE(quote), diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index d7ffc476bb..8a288fcda0 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21620,3 +21620,20 @@ local({ test(2338.9, {fwrite(dd, f, forceDecimal=FALSE); fread(f)}, di) }) +# test for select parameter #4177 +DT = data.table(a=1:2, b=3:4) +f = tempfile() +fwrite(DT, f, select = "a") +test(2339.1, names(fread(f)), "a") +df = as.data.frame(DT) +fwrite(df, f, select = "a") +test(2339.2, names(fread(f)), "a") +l = as.list(DT) +fwrite(l, 
f, select = "a") +test(2339.3, names(fread(f)), "a") +m = as.matrix(DT) +fwrite(m, f, select = "a") +test(2339.4, names(fread(f)), "a") +fwrite(DT, f, select = c(2L, 1L)) +test(2339.5, names(fread(f)), c("b", "a")) +unlink(f) diff --git a/man/fwrite.Rd b/man/fwrite.Rd index c3796aba07..ef14f45400 100644 --- a/man/fwrite.Rd +++ b/man/fwrite.Rd @@ -22,7 +22,8 @@ fwrite(x, file = "", append = FALSE, quote = "auto", bom = FALSE, verbose = getOption("datatable.verbose", FALSE), encoding = "", - forceDecimal = FALSE) + forceDecimal = FALSE, + select = NULL) } \arguments{ \item{x}{Any \code{list} of same length vectors; e.g. \code{data.frame} and \code{data.table}. If \code{matrix}, it gets internally coerced to \code{data.table} preserving col names but not row names} @@ -64,6 +65,7 @@ fwrite(x, file = "", append = FALSE, quote = "auto", \item{verbose}{Be chatty and report timings?} \item{encoding}{ The encoding of the strings written to the CSV file. Default is \code{""}, which means writing raw bytes without considering the encoding. Other possible options are \code{"UTF-8"} and \code{"native"}. } \item{forceDecimal}{ Should decimal points be forced for whole numbers in numeric columns? When \code{FALSE}, the default, whole numbers like \code{c(1.0, 2.0, 3.0)} will be written as \samp{1, 2, 3} i.e., dropping \code{dec}. } + \item{select}{Vector of column names or column numbers specifying which columns to write, in the order given. When \code{NULL} (default), all columns are written. Using \code{select} avoids creating a temporary subset of \code{x}, which saves memory.} } \details{ \code{fwrite} began as a community contribution with \href{https://github.com/Rdatatable/data.table/pull/1613}{pull request #1613} by Otto Seiskari. This gave Matt Dowle the impetus to specialize the numeric formatting and to parallelize: \url{https://h2o.ai/blog/2016/fast-csv-writing-for-r/}. 
Final items were tracked in \href{https://github.com/Rdatatable/data.table/issues/1664}{issue #1664} such as automatic quoting, \code{bit64::integer64} support, decimal/scientific formatting exactly matching \code{write.csv} between 2.225074e-308 and 1.797693e+308 to 15 significant figures, \code{row.names}, dates (between 0000-03-01 and 9999-12-31), times and \code{sep2} for \code{list} columns where each cell can itself be a vector. diff --git a/vignettes/datatable-fread-and-fwrite.Rmd b/vignettes/datatable-fread-and-fwrite.Rmd index 49cfa60db8..8a19d3e1ec 100644 --- a/vignettes/datatable-fread-and-fwrite.Rmd +++ b/vignettes/datatable-fread-and-fwrite.Rmd @@ -270,14 +270,23 @@ if (requireNamespace("bit64", quietly = TRUE)) { ### 2.4 Column Order and Subset Control -To control the order and subset of columns written to file, subset the data.table before calling `fwrite()`. The `col.names` argument in `fwrite()` is a logical (TRUE/FALSE) that controls whether the header row is written, not which columns are written. +To control the order and subset of columns written to file, you can use `[.data.table` to make a new table before calling `fwrite()`, but it is more efficient to use the `select` argument, which avoids making a copy. ```{r} dt = data.table(A = 1:3, B = 4:6, C = 7:9) - # Write only columns C and A, in that order fwrite(dt[, .(C, A)], "out.csv") cat(readLines("out.csv"), sep = "\n") +fwrite(dt, "out.csv", select=c("C","A")) +cat(readLines("out.csv"), sep = "\n") +file.remove("out.csv") +``` + +The `col.names` argument in `fwrite()` is a logical (TRUE/FALSE) that controls whether the header row is written, not which columns are written. 
+ +```{r} +fwrite(dt, "out.csv", col.names=FALSE) +cat(readLines("out.csv"), sep = "\n") file.remove("out.csv") ``` @@ -292,4 +301,4 @@ For users interested in detailed, up-to-date performance comparisons, we recomme These benchmarks consistently show that `fread` and `fwrite` are highly competitive and often state-of-the-art for performance in the R ecosystem. -*** \ No newline at end of file +***