From 1fb7294d24545467991eff1ca25375ecdf62e633 Mon Sep 17 00:00:00 2001 From: Mukul Kumar Date: Tue, 5 Aug 2025 01:03:47 +0530 Subject: [PATCH 01/29] added logic to handle select argumnt using .shallow() --- R/fwrite.R | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/R/fwrite.R b/R/fwrite.R index e94d00c3f3..d5897d0936 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -13,7 +13,8 @@ fwrite = function(x, file="", append=FALSE, quote="auto", yaml = FALSE, bom = FALSE, verbose=getOption("datatable.verbose", FALSE), - encoding = "") { + encoding = "", + select) { na = as.character(na[1L]) # fix for #1725 if (length(encoding) != 1L || !encoding %chin% c("", "UTF-8", "native")) { stopf("Argument 'encoding' must be '', 'UTF-8' or 'native'.") @@ -26,6 +27,16 @@ fwrite = function(x, file="", append=FALSE, quote="auto", buffMB = as.integer(buffMB) nThread = as.integer(nThread) compressLevel = as.integer(compressLevel) + + # Handle select argument using .shallow() + if (!missing(select)) { + if (is.data.table(x)) { + cols = colnamesInt(x, select) + shallow_x = .shallow(x, cols) + } else { + shallow_x = x[select] + } + } # write.csv default is 'double' so fwrite follows suit. write.table's default is 'escape' # validate arguments if (is.matrix(x)) { # coerce to data.table if input object is matrix From ee3e85f6b678a2b0de3b74096e89d85c668909af Mon Sep 17 00:00:00 2001 From: Mukul Kumar Date: Tue, 5 Aug 2025 01:19:29 +0530 Subject: [PATCH 02/29] add conditional passing --- R/fwrite.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/fwrite.R b/R/fwrite.R index d5897d0936..94ec079e5f 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -131,7 +131,7 @@ fwrite = function(x, file="", append=FALSE, quote="auto", x }) } - .Call(CfwriteR, x, file, sep, sep2, eol, na, dec, quote, qmethod=="escape", append, + .Call(CfwriteR, if (missing(select)) x else shallow_x, file, sep, sep2, eol, na, dec, quote, qmethod=="escape", append, row.names, col.names, logical01, scipen, dateTimeAs, buffMB, nThread, showProgress, is_gzip, compressLevel, bom, yaml, verbose, encoding) invisible() From 01d8bbeccbb48cd3e6e14d974e2b594ca71d71e0 Mon Sep 17 00:00:00 2001 From: Mukul Kumar Date: Tue, 5 Aug 2025 01:37:26 +0530 Subject: [PATCH 03/29] assign x when selct is used --- R/fwrite.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/fwrite.R b/R/fwrite.R index 94ec079e5f..f5d38de753 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -32,9 +32,9 @@ fwrite = function(x, file="", append=FALSE, quote="auto", if (!missing(select)) { if (is.data.table(x)) { cols = colnamesInt(x, select) - shallow_x = .shallow(x, cols) + x = .shallow(x, cols) } else { - shallow_x = x[select] + x = x[select] } } # write.csv default is 'double' so fwrite follows suit. write.table's default is 'escape' @@ -131,7 +131,7 @@ fwrite = function(x, file="", append=FALSE, quote="auto", x }) } - .Call(CfwriteR, if (missing(select)) x else shallow_x, file, sep, sep2, eol, na, dec, quote, qmethod=="escape", append, + .Call(CfwriteR, x, file, sep, sep2, eol, na, dec, quote, qmethod=="escape", append, row.names, col.names, logical01, scipen, dateTimeAs, buffMB, nThread, showProgress, is_gzip, compressLevel, bom, yaml, verbose, encoding) invisible() From 97cb32bd8ba421320874fafca2d5a8b0a802bce9 Mon Sep 17 00:00:00 2001 From: Mukul Kumar Date: Tue, 5 Aug 2025 02:14:53 +0530 Subject: [PATCH 04/29] added coverage tests --- R/fwrite.R | 2 +- inst/tests/tests.Rraw | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/R/fwrite.R b/R/fwrite.R index f5d38de753..77acb1c733 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -28,7 +28,7 @@ fwrite = function(x, file="", append=FALSE, quote="auto", nThread = as.integer(nThread) compressLevel = as.integer(compressLevel) - # Handle select argument using .shallow() + # Handle select argument using .shallow() if (!missing(select)) { if (is.data.table(x)) { cols = colnamesInt(x, select) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index a2549ccf6d..581fd6d0e0 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21549,3 +21549,10 @@ f = tempfile() writeLines(c('a', rep('0x1.ffffp0', 10000L), '0x1.ff\x9fp0', rep('0x1.ffffp0', 20000L)), f) test(2334, names(fread(f)), "a") unlink(f) + +# test for select parameter #4177 +DT = data.table(a=1:2, b=3:4) +f = tempfile() +fwrite(DT, f, select = "a") +test(2335.1, names(result = fread(f)), "a") +unlink(f) From 4f9928a4393ec79df2c222e0e264fdc9b88eadc5 Mon Sep 17 00:00:00 2001 From: Mukul Kumar Date: Tue, 5 Aug 2025 02:22:05 +0530 Subject: [PATCH 05/29] typo --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 581fd6d0e0..ed7ca2ccc7 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21554,5 +21554,5 @@ unlink(f) DT = data.table(a=1:2, b=3:4) f = tempfile() fwrite(DT, f, select = "a") -test(2335.1, names(result = fread(f)), "a") +test(2335.1, names(fread(f)), "a") unlink(f) From 8424176d38b77e7cacd9fd6912794273defa473a Mon Sep 17 00:00:00 2001 From: Mukul Kumar Date: Tue, 5 Aug 2025 02:27:46 +0530 Subject: [PATCH 06/29] right number --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index dae734a4f8..89707b06f4 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21607,5 +21607,5 @@ test(2337.5, is.null(fwrite(data.table(a=numeric()), dec=",", sep=","))) DT = data.table(a=1:2, b=3:4) f = tempfile() fwrite(DT, f, select = "a") -test(2335.1, names(fread(f)), "a") +test(2338.1, names(fread(f)), "a") unlink(f) From d0956f3abfd091a287ff44941fac2a4acaa5329d Mon Sep 17 00:00:00 2001 From: Mukul Kumar Date: Tue, 5 Aug 2025 07:01:31 +0530 Subject: [PATCH 07/29] manual entry + news --- NEWS.md | 2 ++ R/fwrite.R | 4 ++-- man/fwrite.Rd | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index fc69449438..1a5e167bd8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -69,6 +69,8 @@ 15. New function `isoyear()` has been implemented as a complement to `isoweek()`, returning the ISO 8601 year corresponding to a given date, [#7154](https://github.com/Rdatatable/data.table/issues/7154). Thanks to @ben-schwen and @MichaelChirico for the suggestion and @venom1204 for the implementation. +16. `fwrite()` gains `select` argument to write only specified columns, avoiding temporary object creation for memory efficiency, [#4177](https://github.com/Rdatatable/data.table/issues/4177). For `data.table` objects, this uses `.shallow()` to create shallow copies without data duplication. Thanks to @artidataio for feature request, @ColeMiller1 for suggesting implementation and @Mukulyadav2004 for the implementation. + ### BUG FIXES 1. `fread()` no longer warns on certain systems on R 4.5.0+ where the file owner can't be resolved, [#6918](https://github.com/Rdatatable/data.table/issues/6918). Thanks @ProfFancyPants for the report and PR. diff --git a/R/fwrite.R b/R/fwrite.R index 8edd206bd5..6177e95ede 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -14,7 +14,7 @@ fwrite = function(x, file="", append=FALSE, quote="auto", bom = FALSE, verbose=getOption("datatable.verbose", FALSE), encoding = "", - select) { + select = NULL) { na = as.character(na[1L]) # fix for #1725 if (length(encoding) != 1L || !encoding %chin% c("", "UTF-8", "native")) { stopf("Argument 'encoding' must be '', 'UTF-8' or 'native'.") @@ -29,7 +29,7 @@ fwrite = function(x, file="", append=FALSE, quote="auto", compressLevel = as.integer(compressLevel) # Handle select argument using .shallow() - if (!missing(select)) { + if (!null(select)) { if (is.data.table(x)) { cols = colnamesInt(x, select) x = .shallow(x, cols) diff --git a/man/fwrite.Rd b/man/fwrite.Rd index f16e4c9bd0..4e9efe0e96 100644 --- a/man/fwrite.Rd +++ b/man/fwrite.Rd @@ -62,6 +62,7 @@ fwrite(x, file = "", append = FALSE, quote = "auto", \item{bom}{If \code{TRUE} a BOM (Byte Order Mark) sequence (EF BB BF) is added at the beginning of the file; format 'UTF-8 with BOM'.} \item{verbose}{Be chatty and report timings?} \item{encoding}{ The encoding of the strings written to the CSV file. Default is \code{""}, which means writing raw bytes without considering the encoding. Other possible options are \code{"UTF-8"} and \code{"native"}. } + \item{select}{Vector of column names or column numbers specifying which columns to include. When \code{NULL} (default), all columns are selected. This avoids creating temporary subsets for memory efficiency.} } \details{ \code{fwrite} began as a community contribution with \href{https://github.com/Rdatatable/data.table/pull/1613}{pull request #1613} by Otto Seiskari. This gave Matt Dowle the impetus to specialize the numeric formatting and to parallelize: \url{https://h2o.ai/blog/2016/fast-csv-writing-for-r/}. Final items were tracked in \href{https://github.com/Rdatatable/data.table/issues/1664}{issue #1664} such as automatic quoting, \code{bit64::integer64} support, decimal/scientific formatting exactly matching \code{write.csv} between 2.225074e-308 and 1.797693e+308 to 15 significant figures, \code{row.names}, dates (between 0000-03-01 and 9999-12-31), times and \code{sep2} for \code{list} columns where each cell can itself be a vector. From bb8320d111ac3526c2c713e9fd0e2c999a4d5b4c Mon Sep 17 00:00:00 2001 From: Mukul <145585624+Mukulyadav2004@users.noreply.github.com> Date: Tue, 5 Aug 2025 18:47:16 +0530 Subject: [PATCH 08/29] moved select logic to address matrix inputs also --- R/fwrite.R | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/R/fwrite.R b/R/fwrite.R index 6177e95ede..7010dd25e9 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -27,16 +27,7 @@ fwrite = function(x, file="", append=FALSE, quote="auto", buffMB = as.integer(buffMB) nThread = as.integer(nThread) compressLevel = as.integer(compressLevel) - - # Handle select argument using .shallow() - if (!null(select)) { - if (is.data.table(x)) { - cols = colnamesInt(x, select) - x = .shallow(x, cols) - } else { - x = x[select] - } - } + # write.csv default is 'double' so fwrite follows suit. write.table's default is 'escape' # validate arguments if (is.matrix(x)) { # coerce to data.table if input object is matrix @@ -49,6 +40,16 @@ fwrite = function(x, file="", append=FALSE, quote="auto", x = as.data.table(x) } } + # Handle select argument using .shallow() + if (!null(select)) { + cols = colnamesInt(x, select) + if (is.data.table(x)) { + x = .shallow(x, cols) + } else { + x = x[select] + } + } + stopifnot( is.list(x), identical(quote,"auto") || isTRUEorFALSE(quote), From 8e639919da262cbf45a92b5d1f7c6336bcb7a202 Mon Sep 17 00:00:00 2001 From: Mukul Kumar Date: Tue, 5 Aug 2025 20:23:16 +0530 Subject: [PATCH 09/29] for matrix handling --- R/fwrite.R | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/R/fwrite.R b/R/fwrite.R index 6177e95ede..f8ac90aecc 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -28,15 +28,6 @@ fwrite = function(x, file="", append=FALSE, quote="auto", nThread = as.integer(nThread) compressLevel = as.integer(compressLevel) - # Handle select argument using .shallow() - if (!null(select)) { - if (is.data.table(x)) { - cols = colnamesInt(x, select) - x = .shallow(x, cols) - } else { - x = x[select] - } - } # write.csv default is 'double' so fwrite follows suit. write.table's default is 'escape' # validate arguments if (is.matrix(x)) { # coerce to data.table if input object is matrix @@ -49,6 +40,15 @@ fwrite = function(x, file="", append=FALSE, quote="auto", x = as.data.table(x) } } + # Handle select argument using .shallow() + if (!null(select)) { + cols = colnamesInt(x, select) + if (is.data.table(x)) { + x = .shallow(x, cols) + } else { + x = x[select] + } + } stopifnot( is.list(x), identical(quote,"auto") || isTRUEorFALSE(quote), From d237f5793234bf941b470203689b79751a9d4dcb Mon Sep 17 00:00:00 2001 From: Mukul <145585624+Mukulyadav2004@users.noreply.github.com> Date: Tue, 5 Aug 2025 20:37:58 +0530 Subject: [PATCH 10/29] add tests for other classes too --- inst/tests/tests.Rraw | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 89707b06f4..9dbabc0419 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21608,4 +21608,13 @@ DT = data.table(a=1:2, b=3:4) f = tempfile() fwrite(DT, f, select = "a") test(2338.1, names(fread(f)), "a") +df = as.data.frame(DT) +fwrite(df, f, select = "a") +test(2338.2, names(fread(f)), "a") +l = as.list(DT) +fwrite(l, f, select = "a") +test(2338.3, names(fread(f)), "a") +m = as.matrix(DT) +fwrite(m, f, select = "a") +test(2338.4, names(fread(f)), "a") unlink(f) From 3a718c08e22858a50492104799db529c221b92c4 Mon Sep 17 00:00:00 2001 From: Mukul <145585624+Mukulyadav2004@users.noreply.github.com> Date: Tue, 5 Aug 2025 20:40:35 +0530 Subject: [PATCH 11/29] remove trail whit space --- R/fwrite.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/fwrite.R b/R/fwrite.R index 7010dd25e9..c537821d62 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -27,7 +27,7 @@ fwrite = function(x, file="", append=FALSE, quote="auto", buffMB = as.integer(buffMB) nThread = as.integer(nThread) compressLevel = as.integer(compressLevel) - + # write.csv default is 'double' so fwrite follows suit. write.table's default is 'escape' # validate arguments if (is.matrix(x)) { # coerce to data.table if input object is matrix @@ -42,14 +42,14 @@ fwrite = function(x, file="", append=FALSE, quote="auto", } # Handle select argument using .shallow() if (!null(select)) { - cols = colnamesInt(x, select) + cols = colnamesInt(x, select) if (is.data.table(x)) { x = .shallow(x, cols) } else { x = x[select] } } - + stopifnot( is.list(x), identical(quote,"auto") || isTRUEorFALSE(quote), From 23214d146b375ca066b024bd82c103ebb238a3d3 Mon Sep 17 00:00:00 2001 From: Mukul <145585624+Mukulyadav2004@users.noreply.github.com> Date: Tue, 5 Aug 2025 22:17:47 +0530 Subject: [PATCH 12/29] is.null --- R/fwrite.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/fwrite.R b/R/fwrite.R index c537821d62..6314069f37 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -41,7 +41,7 @@ fwrite = function(x, file="", append=FALSE, quote="auto", } } # Handle select argument using .shallow() - if (!null(select)) { + if (!is.null(select)) { cols = colnamesInt(x, select) if (is.data.table(x)) { x = .shallow(x, cols) From 2744a89e364c5a2b2c4af6e1ba28078b58e3e814 Mon Sep 17 00:00:00 2001 From: Mukul Kumar Date: Tue, 5 Aug 2025 22:22:41 +0530 Subject: [PATCH 13/29] add tests --- R/fwrite.R | 2 +- inst/tests/tests.Rraw | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/R/fwrite.R b/R/fwrite.R index f8ac90aecc..6052fb0584 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -41,7 +41,7 @@ fwrite = function(x, file="", append=FALSE, quote="auto", } } # Handle select argument using .shallow() - if (!null(select)) { + if (!is.null(select)) { cols = colnamesInt(x, select) if (is.data.table(x)) { x = .shallow(x, cols) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 89707b06f4..9dbabc0419 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21608,4 +21608,13 @@ DT = data.table(a=1:2, b=3:4) f = tempfile() fwrite(DT, f, select = "a") test(2338.1, names(fread(f)), "a") +df = as.data.frame(DT) +fwrite(df, f, select = "a") +test(2338.2, names(fread(f)), "a") +l = as.list(DT) +fwrite(l, f, select = "a") +test(2338.3, names(fread(f)), "a") +m = as.matrix(DT) +fwrite(m, f, select = "a") +test(2338.4, names(fread(f)), "a") unlink(f) From 66d37442f2a42257438cf1f97651c25921c8ffce Mon Sep 17 00:00:00 2001 From: Mukul <145585624+Mukulyadav2004@users.noreply.github.com> Date: Tue, 5 Aug 2025 22:27:23 +0530 Subject: [PATCH 14/29] trailing space --- R/fwrite.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/fwrite.R b/R/fwrite.R index 6314069f37..3fe6abdf33 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -42,7 +42,7 @@ fwrite = function(x, file="", append=FALSE, quote="auto", } # Handle select argument using .shallow() if (!is.null(select)) { - cols = colnamesInt(x, select) + cols = colnamesInt(x, select) if (is.data.table(x)) { x = .shallow(x, cols) } else { From d5c933bc64d94addb3b8206a879ba679bf734532 Mon Sep 17 00:00:00 2001 From: Mukul Kumar Date: Fri, 8 Aug 2025 21:48:56 +0530 Subject: [PATCH 15/29] added atime performance test --- .ci/atime/tests.R | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.ci/atime/tests.R b/.ci/atime/tests.R index 761ba3c2bb..d783db964f 100644 --- a/.ci/atime/tests.R +++ b/.ci/atime/tests.R @@ -286,5 +286,17 @@ test.list <- atime::atime_test_list( Slow = "548410d23dd74b625e8ea9aeb1a5d2e9dddd2927", # Parent of the first commit in the PR (https://github.com/Rdatatable/data.table/commit/548410d23dd74b625e8ea9aeb1a5d2e9dddd2927) Fast = "c0b32a60466bed0e63420ec105bc75c34590865e"), # Commit in the PR (https://github.com/Rdatatable/data.table/pull/7144/commits) that uses a much faster implementation + # Test case adapted from https://github.com/Rdatatable/data.table/issues/4177 which is where the issue was reported. + # Fixed in https://github.com/Rdatatable/data.table/pull/7236 + "fwrite select parameter improved in #4177" = atime::atime_test( + setup = { + set.seed(1) + DT = data.table(a = rnorm(N), b = rnorm(N), c = rnorm(N), d = rnorm(N), e = rnorm(N)) + temp_file = tempfile() + }, + expr = data.table::fwrite(DT, temp_file, select = c("a", "b", "c")), + Slow = "66cb6d2393cef30083b444346a7600a079207806", # Parent of the first commit in the PR (https://github.com/Rdatatable/data.table/commit/66cb6d2393cef30083b444346a7600a079207806) + Fast = "1887699fe965b5aa1fb8cb16b5507b7a5cbf5c85"), # Commit in the PR (https://github.com/Rdatatable/data.table/pull/4177/commits) that adds select parameter + tests=extra.test.list) # nolint end: undesirable_operator_linter. From b0e7f8263664f942fef9af32c3cc895f40e4a18d Mon Sep 17 00:00:00 2001 From: Mukul <145585624+Mukulyadav2004@users.noreply.github.com> Date: Fri, 8 Aug 2025 22:49:17 +0530 Subject: [PATCH 16/29] add condition to when to use select --- .ci/atime/tests.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.ci/atime/tests.R b/.ci/atime/tests.R index d783db964f..ff43c3e311 100644 --- a/.ci/atime/tests.R +++ b/.ci/atime/tests.R @@ -293,8 +293,10 @@ test.list <- atime::atime_test_list( set.seed(1) DT = data.table(a = rnorm(N), b = rnorm(N), c = rnorm(N), d = rnorm(N), e = rnorm(N)) temp_file = tempfile() + has_select = "select" %chin% names(formals(data.table::fwrite)) }, - expr = data.table::fwrite(DT, temp_file, select = c("a", "b", "c")), + expr = if (has_select) + data.table::fwrite(DT, temp_file, select=c("a","b","c")), Slow = "66cb6d2393cef30083b444346a7600a079207806", # Parent of the first commit in the PR (https://github.com/Rdatatable/data.table/commit/66cb6d2393cef30083b444346a7600a079207806) Fast = "1887699fe965b5aa1fb8cb16b5507b7a5cbf5c85"), # Commit in the PR (https://github.com/Rdatatable/data.table/pull/4177/commits) that adds select parameter From 8fe700249bc1994195711047bc62c189c58c091b Mon Sep 17 00:00:00 2001 From: Mukul <145585624+Mukulyadav2004@users.noreply.github.com> Date: Thu, 14 Aug 2025 21:47:29 +0530 Subject: [PATCH 17/29] added select parameter to docs also --- man/fwrite.Rd | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/man/fwrite.Rd b/man/fwrite.Rd index 99b7529d08..ef14f45400 100644 --- a/man/fwrite.Rd +++ b/man/fwrite.Rd @@ -22,7 +22,8 @@ fwrite(x, file = "", append = FALSE, quote = "auto", bom = FALSE, verbose = getOption("datatable.verbose", FALSE), encoding = "", - forceDecimal = FALSE) + forceDecimal = FALSE, + select = NULL) } \arguments{ \item{x}{Any \code{list} of same length vectors; e.g. \code{data.frame} and \code{data.table}. If \code{matrix}, it gets internally coerced to \code{data.table} preserving col names but not row names} From ec25903036141951e205e0aeef2ad12e6daa805f Mon Sep 17 00:00:00 2001 From: Mukul <145585624+Mukulyadav2004@users.noreply.github.com> Date: Fri, 15 Aug 2025 22:01:52 +0530 Subject: [PATCH 18/29] setup and branching only in expr and added cases --- .ci/atime/tests.R | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/.ci/atime/tests.R b/.ci/atime/tests.R index ff43c3e311..273321f3f5 100644 --- a/.ci/atime/tests.R +++ b/.ci/atime/tests.R @@ -289,16 +289,37 @@ test.list <- atime::atime_test_list( # Test case adapted from https://github.com/Rdatatable/data.table/issues/4177 which is where the issue was reported. # Fixed in https://github.com/Rdatatable/data.table/pull/7236 "fwrite select parameter improved in #4177" = atime::atime_test( + # N row, 5 column setup = { - set.seed(1) - DT = data.table(a = rnorm(N), b = rnorm(N), c = rnorm(N), d = rnorm(N), e = rnorm(N)) + set.seed(1L) + DT = data.table(a=rnorm(N), b=rnorm(N), c=rnorm(N), d=rnorm(N), e=rnorm(N)) + temp_file = tempfile() + }, + expr = { + has_select = "select" %chin% names(formals(data.table::fwrite)) + if (has_select) { + data.table::fwrite(DT, temp_file, select = c("a","b","c")) + } else { + data.table::fwrite(DT[, c("a","b","c"), with = FALSE], temp_file) + } + }, + Slow = "66cb6d2393cef30083b444346a7600a079207806", # Parent of the first commit in the PR (https://github.com/Rdatatable/data.table/commit/66cb6d2393cef30083b444346a7600a079207806) + Fast = "1887699fe965b5aa1fb8cb16b5507b7a5cbf5c85" # Commit in the PR (https://github.com/Rdatatable/data.table/pull/4177/commits) that adds select parameter + # 1 row, N columns + setup = { + DT = data.table(t(1:N)) temp_file = tempfile() + }, + expr = { has_select = "select" %chin% names(formals(data.table::fwrite)) + if (has_select) { + data.table::fwrite(DT, temp_file, select = 1L) + } else { + data.table::fwrite(DT[, 1L, with = FALSE], temp_file) + } }, - expr = if (has_select) - data.table::fwrite(DT, temp_file, select=c("a","b","c")), - Slow = "66cb6d2393cef30083b444346a7600a079207806", # Parent of the first commit in the PR (https://github.com/Rdatatable/data.table/commit/66cb6d2393cef30083b444346a7600a079207806) - Fast = "1887699fe965b5aa1fb8cb16b5507b7a5cbf5c85"), # Commit in the PR (https://github.com/Rdatatable/data.table/pull/4177/commits) that adds select parameter + Slow = "66cb6d2393cef30083b444346a7600a079207806", # Parent of the first commit in the PR (https://github.com/Rdatatable/data.table/commit/66cb6d2393cef30083b444346a7600a079207806) + Fast = "1887699fe965b5aa1fb8cb16b5507b7a5cbf5c85" # Commit in the PR (https://github.com/Rdatatable/data.table/pull/4177/commits) that adds select parameter tests=extra.test.list) # nolint end: undesirable_operator_linter. From e54bd7135b85d5e5092ef0689e85c906595505be Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Fri, 15 Aug 2025 12:36:24 -0400 Subject: [PATCH 19/29] document select arg --- vignettes/datatable-fread-and-fwrite.Rmd | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/vignettes/datatable-fread-and-fwrite.Rmd b/vignettes/datatable-fread-and-fwrite.Rmd index 49cfa60db8..8a19d3e1ec 100644 --- a/vignettes/datatable-fread-and-fwrite.Rmd +++ b/vignettes/datatable-fread-and-fwrite.Rmd @@ -270,14 +270,23 @@ if (requireNamespace("bit64", quietly = TRUE)) { ### 2.4 Column Order and Subset Control -To control the order and subset of columns written to file, subset the data.table before calling `fwrite()`. The `col.names` argument in `fwrite()` is a logical (TRUE/FALSE) that controls whether the header row is written, not which columns are written. +To control the order and subset of columns written to file, you can use `[.data.table` to make a new table before calling `fwrite()`, but it is more efficient to use the `select` argument, which avoids making a copy. ```{r} dt = data.table(A = 1:3, B = 4:6, C = 7:9) - # Write only columns C and A, in that order fwrite(dt[, .(C, A)], "out.csv") cat(readLines("out.csv"), sep = "\n") +fwrite(dt, "out.csv", select=c("C","A")) +cat(readLines("out.csv"), sep = "\n") +file.remove("out.csv") +``` + +The `col.names` argument in `fwrite()` is a logical (TRUE/FALSE) that controls whether the header row is written, not which columns are written. + +```{r} +fwrite(dt, "out.csv", col.names=FALSE) +cat(readLines("out.csv"), sep = "\n") file.remove("out.csv") ``` @@ -292,4 +301,4 @@ For users interested in detailed, up-to-date performance comparisons, we recomme These benchmarks consistently show that `fread` and `fwrite` are highly competitive and often state-of-the-art for performance in the R ecosystem. -*** \ No newline at end of file +*** From c2c5d89a7a3d75c5cf2da5beda9f643146309c15 Mon Sep 17 00:00:00 2001 From: Mukul <145585624+Mukulyadav2004@users.noreply.github.com> Date: Fri, 15 Aug 2025 22:15:42 +0530 Subject: [PATCH 20/29] bypass name res for num select, avoid no-op shallow, and cols in place of select(keep x as list/dt) --- R/fwrite.R | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/R/fwrite.R b/R/fwrite.R index e28f6e9567..3e72f56fff 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -43,11 +43,17 @@ fwrite = function(x, file="", append=FALSE, quote="auto", } # Handle select argument using .shallow() if (!is.null(select)) { - cols = colnamesInt(x, select) + if (is.integer(select) || is.numeric(select)) { # numeric/integer avoids O(#cols) name-match overhead + cols = as.integer(select) + } else { + cols = colnamesInt(x, select) + } if (is.data.table(x)) { - x = .shallow(x, cols) + if (length(cols) < NCOL(x) || !identical(cols, seq_len(NCOL(x)))) { # only build a shallow view when columns are reduced or reordered + x = .shallow(x, cols) + } } else { - x = x[select] + x = x[cols] } } From 926bb0e6e004058815b9a6bbcc4d15e57bf04f44 Mon Sep 17 00:00:00 2001 From: Mukul <145585624+Mukulyadav2004@users.noreply.github.com> Date: Fri, 15 Aug 2025 22:19:53 +0530 Subject: [PATCH 21/29] better writing style Co-authored-by: Toby Dylan Hocking --- R/fwrite.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/fwrite.R b/R/fwrite.R index 3e72f56fff..cb3e724ab4 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -43,10 +43,10 @@ fwrite = function(x, file="", append=FALSE, quote="auto", } # Handle select argument using .shallow() if (!is.null(select)) { - if (is.integer(select) || is.numeric(select)) { # numeric/integer avoids O(#cols) name-match overhead - cols = as.integer(select) + cols = if (is.integer(select) || is.numeric(select)) { # numeric/integer avoids O(#cols) name-match overhead + as.integer(select) } else { - cols = colnamesInt(x, select) + colnamesInt(x, select) } if (is.data.table(x)) { if (length(cols) < NCOL(x) || !identical(cols, seq_len(NCOL(x)))) { # only build a shallow view when columns are reduced or reordered From cbffb5bc93b60641d0057dc1eac8e6731b3cddb4 Mon Sep 17 00:00:00 2001 From: Mukul <145585624+Mukulyadav2004@users.noreply.github.com> Date: Fri, 15 Aug 2025 22:50:56 +0530 Subject: [PATCH 22/29] seperate both cases --- .ci/atime/tests.R | 70 ++++++++++++++++++++++++----------------------- 1 file changed, 36 insertions(+), 34 deletions(-) diff --git a/.ci/atime/tests.R b/.ci/atime/tests.R index 273321f3f5..f96619fb18 100644 --- a/.ci/atime/tests.R +++ b/.ci/atime/tests.R @@ -285,41 +285,43 @@ test.list <- atime::atime_test_list( expr = data.table::isoweek(x), Slow = "548410d23dd74b625e8ea9aeb1a5d2e9dddd2927", # Parent of the first commit in the PR (https://github.com/Rdatatable/data.table/commit/548410d23dd74b625e8ea9aeb1a5d2e9dddd2927) Fast = "c0b32a60466bed0e63420ec105bc75c34590865e"), # Commit in the PR (https://github.com/Rdatatable/data.table/pull/7144/commits) that uses a much faster implementation - + # Test case adapted from https://github.com/Rdatatable/data.table/issues/4177 which is where the issue was reported. - # Fixed in https://github.com/Rdatatable/data.table/pull/7236 - "fwrite select parameter improved in #4177" = atime::atime_test( - # N row, 5 column - setup = { - set.seed(1L) - DT = data.table(a=rnorm(N), b=rnorm(N), c=rnorm(N), d=rnorm(N), e=rnorm(N)) - temp_file = tempfile() - }, - expr = { - has_select = "select" %chin% names(formals(data.table::fwrite)) - if (has_select) { - data.table::fwrite(DT, temp_file, select = c("a","b","c")) - } else { - data.table::fwrite(DT[, c("a","b","c"), with = FALSE], temp_file) - } - }, - Slow = "66cb6d2393cef30083b444346a7600a079207806", # Parent of the first commit in the PR (https://github.com/Rdatatable/data.table/commit/66cb6d2393cef30083b444346a7600a079207806) - Fast = "1887699fe965b5aa1fb8cb16b5507b7a5cbf5c85" # Commit in the PR (https://github.com/Rdatatable/data.table/pull/4177/commits) that adds select parameter - # 1 row, N columns - setup = { - DT = data.table(t(1:N)) - temp_file = tempfile() - }, - expr = { - has_select = "select" %chin% names(formals(data.table::fwrite)) - if (has_select) { - data.table::fwrite(DT, temp_file, select = 1L) - } else { - data.table::fwrite(DT[, 1L, with = FALSE], temp_file) - } - }, - Slow = "66cb6d2393cef30083b444346a7600a079207806", # Parent of the first commit in the PR (https://github.com/Rdatatable/data.table/commit/66cb6d2393cef30083b444346a7600a079207806) - Fast = "1887699fe965b5aa1fb8cb16b5507b7a5cbf5c85" # Commit in the PR (https://github.com/Rdatatable/data.table/pull/4177/commits) that adds select parameter + # Fixed in https://github.com/Rdatatable/data.table/pull/7236 + "fwrite select parameter improved in #4177 (N rows and 5 columns, select a,b,c)" = atime::atime_test( + setup = { + set.seed(1L) + DT = data.table(a=rnorm(N), b=rnorm(N), c=rnorm(N), d=rnorm(N), e=rnorm(N)) + temp_file = tempfile() + }, + expr = { + has_select = "select" %chin% names(formals(data.table::fwrite)) + if (has_select) { + data.table::fwrite(DT, temp_file, select = c("a","b","c")) + } else { + data.table::fwrite(DT[, c("a","b","c"), with = FALSE], temp_file) + } + }, + Slow = "66cb6d2393cef30083b444346a7600a079207806", # Parent of the first commit in the PR (https://github.com/Rdatatable/data.table/commit/66cb6d2393cef30083b444346a7600a079207806) + Fast = "1887699fe965b5aa1fb8cb16b5507b7a5cbf5c85", # Commit in the PR (https://github.com/Rdatatable/data.table/pull/4177/commits) that adds select parameter + + # Wide case: 1 row, N columns; select a single column + "fwrite select parameter improved in #4177 (N columns and 1 row, select 1)" = atime::atime_test( + setup = { + DT = data.table(t(1:N)) + temp_file = tempfile() + select_idx = 1L + }, + expr = { + has_select = "select" %chin% names(formals(data.table::fwrite)) + if (has_select) { + data.table::fwrite(DT, temp_file, select = select_idx) + } else { + data.table::fwrite(DT[, select_idx, with = FALSE], temp_file) + } + }, + Slow = "66cb6d2393cef30083b444346a7600a079207806", # Parent of the first commit in the PR (https://github.com/Rdatatable/data.table/commit/66cb6d2393cef30083b444346a7600a079207806) + Fast = "1887699fe965b5aa1fb8cb16b5507b7a5cbf5c85", # Commit in the PR (https://github.com/Rdatatable/data.table/pull/4177/commits) that adds select parameter tests=extra.test.list) # nolint end: undesirable_operator_linter. From ea91282210ba9dc09b71a7c494f32626fb216ae3 Mon Sep 17 00:00:00 2001 From: Mukul <145585624+Mukulyadav2004@users.noreply.github.com> Date: Fri, 15 Aug 2025 22:56:18 +0530 Subject: [PATCH 23/29] use only is.numeric(x) --- R/fwrite.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/fwrite.R b/R/fwrite.R index cb3e724ab4..ee04dfe7ca 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -43,7 +43,7 @@ fwrite = function(x, file="", append=FALSE, quote="auto", } # Handle select argument using .shallow() if (!is.null(select)) { - cols = if (is.integer(select) || is.numeric(select)) { # numeric/integer avoids O(#cols) name-match overhead + cols = if (is.numeric(x)) { # numeric/integer avoids O(#cols) name-match overhead as.integer(select) } else { colnamesInt(x, select) From 28705de3b7d1f2fa1f611ce8d5fe8a6c612ecf87 Mon Sep 17 00:00:00 2001 From: Mukul <145585624+Mukulyadav2004@users.noreply.github.com> Date: Fri, 15 Aug 2025 23:00:45 +0530 Subject: [PATCH 24/29] better manual docs From 94a9b596c038d411ecf021805dc4827a3997c0fb Mon Sep 17 00:00:00 2001 From: Mukul <145585624+Mukulyadav2004@users.noreply.github.com> Date: Wed, 20 Aug 2025 18:01:26 +0530 Subject: [PATCH 25/29] closed parenthisis --- .ci/atime/tests.R | 70 +++++++++++++++++++++++------------------------ 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/.ci/atime/tests.R b/.ci/atime/tests.R index f96619fb18..322d8ff68f 100644 --- a/.ci/atime/tests.R +++ b/.ci/atime/tests.R @@ -286,42 +286,42 @@ test.list <- atime::atime_test_list( Slow = "548410d23dd74b625e8ea9aeb1a5d2e9dddd2927", # Parent of the first commit in the PR (https://github.com/Rdatatable/data.table/commit/548410d23dd74b625e8ea9aeb1a5d2e9dddd2927) Fast = "c0b32a60466bed0e63420ec105bc75c34590865e"), # Commit in the PR (https://github.com/Rdatatable/data.table/pull/7144/commits) that uses a much faster implementation - # Test case adapted from https://github.com/Rdatatable/data.table/issues/4177 which is where the issue was reported. - # Fixed in https://github.com/Rdatatable/data.table/pull/7236 - "fwrite select parameter improved in #4177 (N rows and 5 columns, select a,b,c)" = atime::atime_test( - setup = { - set.seed(1L) - DT = data.table(a=rnorm(N), b=rnorm(N), c=rnorm(N), d=rnorm(N), e=rnorm(N)) - temp_file = tempfile() - }, - expr = { - has_select = "select" %chin% names(formals(data.table::fwrite)) - if (has_select) { - data.table::fwrite(DT, temp_file, select = c("a","b","c")) - } else { - data.table::fwrite(DT[, c("a","b","c"), with = FALSE], temp_file) - } - }, - Slow = "66cb6d2393cef30083b444346a7600a079207806", # Parent of the first commit in the PR (https://github.com/Rdatatable/data.table/commit/66cb6d2393cef30083b444346a7600a079207806) - Fast = "1887699fe965b5aa1fb8cb16b5507b7a5cbf5c85", # Commit in the PR (https://github.com/Rdatatable/data.table/pull/4177/commits) that adds select parameter + # Test case adapted from https://github.com/Rdatatable/data.table/issues/4177 which is where the issue was reported. + # Fixed in https://github.com/Rdatatable/data.table/pull/7236 + "fwrite select parameter improved in #4177 (N rows and 5 columns, select a,b,c)" = atime::atime_test( + setup = { + set.seed(1L) + DT = data.table(a=rnorm(N), b=rnorm(N), c=rnorm(N), d=rnorm(N), e=rnorm(N)) + temp_file = tempfile() + }, + expr = { + has_select = "select" %chin% names(formals(data.table::fwrite)) + if (has_select) { + data.table::fwrite(DT, temp_file, select = c("a","b","c")) + } else { + data.table::fwrite(DT[, c("a","b","c"), with = FALSE], temp_file) + } + }, + Slow = "66cb6d2393cef30083b444346a7600a079207806", # Parent of the first commit in the PR (https://github.com/Rdatatable/data.table/commit/66cb6d2393cef30083b444346a7600a079207806) + Fast = "1887699fe965b5aa1fb8cb16b5507b7a5cbf5c85"), # Commit in the PR (https://github.com/Rdatatable/data.table/pull/4177/commits) that adds select parameter - # Wide case: 1 row, N columns; select a single column - "fwrite select parameter improved in #4177 (N columns and 1 row, select 1)" = atime::atime_test( - setup = { - DT = data.table(t(1:N)) - temp_file = tempfile() - select_idx = 1L - }, - expr = { - has_select = "select" %chin% names(formals(data.table::fwrite)) - if (has_select) { - data.table::fwrite(DT, temp_file, select = select_idx) - } else { - data.table::fwrite(DT[, select_idx, with = FALSE], temp_file) - } - }, - Slow = "66cb6d2393cef30083b444346a7600a079207806", # Parent of the first commit in the PR (https://github.com/Rdatatable/data.table/commit/66cb6d2393cef30083b444346a7600a079207806) - Fast = "1887699fe965b5aa1fb8cb16b5507b7a5cbf5c85", # Commit in the PR (https://github.com/Rdatatable/data.table/pull/4177/commits) that adds select parameter + # Wide case: 1 row, N columns; select a single column + "fwrite select parameter improved in #4177 (N columns and 1 row, select 1)" = atime::atime_test( + setup = { + DT = data.table(t(1:N)) + temp_file = tempfile() + select_idx = 1L + }, + expr = { + has_select = "select" %chin% names(formals(data.table::fwrite)) + if (has_select) { + data.table::fwrite(DT, temp_file, select = select_idx) + } else { + data.table::fwrite(DT[, select_idx, with = FALSE], temp_file) + } + }, + Slow = "66cb6d2393cef30083b444346a7600a079207806", # Parent of the first commit in the PR (https://github.com/Rdatatable/data.table/commit/66cb6d2393cef30083b444346a7600a079207806) + Fast = "1887699fe965b5aa1fb8cb16b5507b7a5cbf5c85"), # Commit in the PR (https://github.com/Rdatatable/data.table/pull/4177/commits) that adds select parameter tests=extra.test.list) # nolint end: undesirable_operator_linter. From 377620ee8ab80699786a8bf36aa4367fe0d3ac9b Mon Sep 17 00:00:00 2001 From: Mukul <145585624+Mukulyadav2004@users.noreply.github.com> Date: Wed, 20 Aug 2025 21:51:12 +0530 Subject: [PATCH 26/29] improved test stmt Co-authored-by: Toby Dylan Hocking --- .ci/atime/tests.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/atime/tests.R b/.ci/atime/tests.R index 322d8ff68f..bf34f7a1b7 100644 --- a/.ci/atime/tests.R +++ b/.ci/atime/tests.R @@ -288,7 +288,7 @@ test.list <- atime::atime_test_list( # Test case adapted from https://github.com/Rdatatable/data.table/issues/4177 which is where the issue was reported. # Fixed in https://github.com/Rdatatable/data.table/pull/7236 - "fwrite select parameter improved in #4177 (N rows and 5 columns, select a,b,c)" = atime::atime_test( + "fwrite(select) #4177 Nx5" = atime::atime_test( setup = { set.seed(1L) DT = data.table(a=rnorm(N), b=rnorm(N), c=rnorm(N), d=rnorm(N), e=rnorm(N)) From d8348118dcd425c844b017f22c75e4480dd4bef9 Mon Sep 17 00:00:00 2001 From: Mukul <145585624+Mukulyadav2004@users.noreply.github.com> Date: Wed, 20 Aug 2025 21:51:31 +0530 Subject: [PATCH 27/29] imprvd tst stmt Co-authored-by: Toby Dylan Hocking --- .ci/atime/tests.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/atime/tests.R b/.ci/atime/tests.R index bf34f7a1b7..81f52d12a7 100644 --- a/.ci/atime/tests.R +++ b/.ci/atime/tests.R @@ -306,7 +306,7 @@ test.list <- atime::atime_test_list( Fast = "1887699fe965b5aa1fb8cb16b5507b7a5cbf5c85"), # Commit in the PR (https://github.com/Rdatatable/data.table/pull/4177/commits) that adds select parameter # Wide case: 1 row, N columns; select a single column - "fwrite select parameter improved in #4177 (N columns and 1 row, select 1)" = atime::atime_test( + "fwrite(select) #4177 1xN" = atime::atime_test( setup = { DT = data.table(t(1:N)) temp_file = tempfile() From 08f6fee012ce03322497e4292e63a7ec0354cfb6 Mon Sep 17 00:00:00 2001 From: Mukul <145585624+Mukulyadav2004@users.noreply.github.com> Date: Sat, 23 Aug 2025 03:02:14 +0530 Subject: [PATCH 28/29] use select parm instead of x in is.numeric --- R/fwrite.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/fwrite.R b/R/fwrite.R index ee04dfe7ca..6c25d74b1f 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -43,7 +43,7 @@ fwrite = function(x, file="", append=FALSE, quote="auto", } # Handle select argument using .shallow() if (!is.null(select)) { - cols = if (is.numeric(x)) { # numeric/integer avoids O(#cols) name-match overhead + cols = if (is.numeric(select)) { # numeric/integer avoids O(#cols) name-match overhead as.integer(select) } else { colnamesInt(x, select) From b072be59d73179b4ec8c4052fefa8f0e44a51709 Mon Sep 17 00:00:00 2001 From: Mukul <145585624+Mukulyadav2004@users.noreply.github.com> Date: Sat, 23 Aug 2025 21:45:48 +0530 Subject: [PATCH 29/29] fwrite call to use data.table subsetting --- .ci/atime/tests.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/atime/tests.R b/.ci/atime/tests.R index 81f52d12a7..e52369b665 100644 --- a/.ci/atime/tests.R +++ b/.ci/atime/tests.R @@ -317,7 +317,7 @@ test.list <- atime::atime_test_list( if (has_select) { data.table::fwrite(DT, temp_file, select = select_idx) } else { - data.table::fwrite(DT[, select_idx, with = FALSE], temp_file) + data.table::fwrite(data.table:::`[.data.table`(DT, , select_idx, with = FALSE), temp_file) } }, Slow = "66cb6d2393cef30083b444346a7600a079207806", # Parent of the first commit in the PR (https://github.com/Rdatatable/data.table/commit/66cb6d2393cef30083b444346a7600a079207806)