Rdatatable
diff --git a/‎.ci/atime/tests.R‎
Lines changed: 26 additions & 2 deletions b/‎.ci/atime/tests.R‎
Lines changed: 26 additions & 2 deletions
diff --git a/‎.github/CONTRIBUTING.md‎
Lines changed: 4 additions & 0 deletions b/‎.github/CONTRIBUTING.md‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎NEWS.md‎
Lines changed: 8 additions & 0 deletions b/‎NEWS.md‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎R/as.data.table.R‎
Lines changed: 4 additions & 3 deletions b/‎R/as.data.table.R‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎src/assign.c‎
Lines changed: 0 additions & 1 deletion b/‎src/assign.c‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎src/chmatch.c‎
Lines changed: 0 additions & 1 deletion b/‎src/chmatch.c‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎src/cj.c‎
Lines changed: 0 additions & 1 deletion b/‎src/cj.c‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎src/coalesce.c‎
Lines changed: 0 additions & 1 deletion b/‎src/coalesce.c‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎src/data.table.h‎
Lines changed: 0 additions & 1 deletion b/‎src/data.table.h‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎src/fastmean.c‎
Lines changed: 0 additions & 1 deletion b/‎src/fastmean.c‎
Lines changed: 0 additions & 1 deletion
@@ -13,6 +13,7 @@ for (extra.arg in extra.args.6107){
       tmp_csv = tempfile()
       fwrite(DT, tmp_csv)
     },
+    FasterIO = "60a01fa65191c44d7997de1843e9a1dfe5be9f72", # First commit of the PR (https://github.com/Rdatatable/data.table/pull/6925/commits) that reduced time usage
     Slow = "e9087ce9860bac77c51467b19e92cf4b72ca78c7", # Parent of the merge commit (https://github.com/Rdatatable/data.table/commit/a77e8c22e44e904835d7b34b047df2eff069d1f2) of the PR (https://github.com/Rdatatable/data.table/pull/6107) that fixes the issue
     Fast = "a77e8c22e44e904835d7b34b047df2eff069d1f2") # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/6107) that fixes the issue
   this.test$expr = str2lang(sprintf("data.table::fread(tmp_csv, %s)", extra.arg))
@@ -128,6 +129,18 @@ test.list <- atime::atime_test_list(
       paste0('useDynLib(', new.Package_))
   },
 
+  # Constant overhead improvement https://github.com/Rdatatable/data.table/pull/6925
+  # Test case adapted from https://github.com/Rdatatable/data.table/pull/7022#discussion_r2107900643
+  "fread disk overhead improved in #6925" = atime::atime_test(
+    N = 2^seq(0, 20), # smaller N because we are doing multiple fread calls.
+    setup = {
+      fwrite(iris[1], iris.csv <- tempfile())
+    },
+    expr = replicate(N, data.table::fread(iris.csv)),
+    Fast = "60a01fa65191c44d7997de1843e9a1dfe5be9f72", # First commit of the PR (https://github.com/Rdatatable/data.table/pull/6925/commits) that reduced time usage
+    Slow = "e25ea80b793165094cea87d946d2bab5628f70a6" # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/60a01fa65191c44d7997de1843e9a1dfe5be9f72)
+  ),
+
   # Performance regression discussed in https://github.com/Rdatatable/data.table/issues/4311
   # Test case adapted from https://github.com/Rdatatable/data.table/pull/4440#issuecomment-632842980 which is the fix PR.
   "shallow regression fixed in #4440" = atime::atime_test(
@@ -177,8 +190,9 @@ test.list <- atime::atime_test_list(
   # Fixed in https://github.com/Rdatatable/data.table/pull/4558
   "DT[by] fixed in #4558" = atime::atime_test(
     setup = {
+      N9 <- as.integer(N * 0.9)
       d <- data.table(
-        id = sample(c(seq.int(N * 0.9), sample(N * 0.9, N * 0.1, TRUE))),
+        id = sample(c(seq.int(N9), sample(N9, N-N9, TRUE))),
         v1 = sample(5L, N, TRUE),
         v2 = sample(5L, N, TRUE)
       )
@@ -251,5 +265,15 @@ test.list <- atime::atime_test_list(
     Before = "f339aa64c426a9cd7cf2fcb13d91fc4ed353cd31", # Parent of the first commit https://github.com/Rdatatable/data.table/commit/fcc10d73a20837d0f1ad3278ee9168473afa5ff1 in the PR https://github.com/Rdatatable/data.table/pull/6393/commits with major change to fwrite with gzip.
     PR = "3630413ae493a5a61b06c50e80d166924d2ef89a"), # Close-to-last merge commit in the PR.
 
-  tests=extra.test.list)
+  # Test case created directly using the atime code below (not adapted from any other benchmark), based on the PR that removes an unnecessary data.table() call from as.data.table.array: https://github.com/Rdatatable/data.table/pull/7010
+  "as.data.table.array improved in #7010" = atime::atime_test(
+    setup = {
+      dims = c(N, 1, 1)
+      arr = array(seq_len(prod(dims)), dim=dims)
+    },
+    expr = data.table:::as.data.table.array(arr, na.rm=FALSE),
+    Slow = "73d79edf8ff8c55163e90631072192301056e336",   # Parent of the first commit in the PR (https://github.com/Rdatatable/data.table/commit/8397dc3c993b61a07a81c786ca68c22bc589befc)
+    Fast = "8397dc3c993b61a07a81c786ca68c22bc589befc"),  # Commit in the PR (https://github.com/Rdatatable/data.table/pull/7019/commits) that removes inefficiency
+
+    tests=extra.test.list)
 # nolint end: undesirable_operator_linter.
@@ -76,6 +76,10 @@ See [`?test`](https://rdatatable.gitlab.io/data.table/reference/test.html).
 1. **[Squashing Github pull requests into a single commit](http://eli.thegreenplace.net/2014/02/19/squashing-github-pull-requests-into-a-single-commit)**.
 1. **[Github help](https://help.github.com/articles/using-pull-requests/)** - you'll need the *fork and pull* model.
 
+#### Performance testing
+
+If your PR may have an effect on time/memory usage, please consider adding a performance test, either in the same PR, or a follow-up PR. Note that first-time contributors _must_ do so in a follow-up PR, since the tests are only run on PRs from branches created directly in the Rdatatable/data.table repo. See the [Performance testing](https://github.com/Rdatatable/data.table/wiki/Performance-testing) wiki page for details.
+
 Minimal first time PR
 ---------------------
 
 
@@ -14,6 +14,8 @@
 
 4. `as.Date()` method for `IDate` no longer coerces to `double` [#6922](https://github.com/Rdatatable/data.table/issues/6922). Thanks @MichaelChirico for the report and PR. The only effect should be on overly-strict tests that assert `Date` objects have `double` storage, which is not in general true, especially from R 4.5.0.
 
+5. `as.data.table()` is slightly more efficient at converting arrays to data.tables, [#7019](https://github.com/Rdatatable/data.table/pull/7019). Thanks @eliocamp.
+
 ### BUG FIXES
 
 1. Custom binary operators from the `lubridate` package now work with objects of class `IDate` as with a `Date` subclass, [#6839](https://github.com/Rdatatable/data.table/issues/6839). Thanks @emallickhossain for the report and @aitap for the fix.
@@ -45,6 +47,12 @@
 
 3. {data.table} now depends on R 3.4.0 (2017).
 
+4. Changes to `fread()` output and errors:
+
+   + When the size of the file exceeds the size of the address space, `fread()` now signals an informative error instead of trying to map its size modulo the address space.
+   + On non-Windows systems, `fread()` now prints the reason why the file couldn't be opened, which could also be due to it being too large to map.
+   + With `verbose=TRUE`, file sizes are now printed using correct binary (IEC) prefixes (the sizes have always been reported as bytes denominated in powers of `2^10`, so e.g. `1024*1024` bytes was reported as `1 MB` where `1 MiB` or `1.05 MB` is correct).
+
 ## data.table [v1.17.0](https://github.com/Rdatatable/data.table/milestone/34)  (20 Feb 2025)
 
 ### POTENTIALLY BREAKING CHANGES
 
@@ -96,9 +96,9 @@ as.data.table.array = function(x, keep.rownames=FALSE, key=NULL, sorted=TRUE, va
   dnx = dimnames(x)
   # NULL dimnames will create integer keys, not character as in table method
   val = if (is.null(dnx)) {
-    lapply(dx, seq.int)
+    lapply(dx, seq_len)
   } else if (any(nulldnx <- vapply_1b(dnx, is.null))) {
-    dnx[nulldnx] = lapply(dx[nulldnx], seq.int) #3636
+    dnx[nulldnx] = lapply(dx[nulldnx], seq_len) #3636
     dnx
   } else dnx
   val = rev(val)
@@ -107,7 +107,8 @@ as.data.table.array = function(x, keep.rownames=FALSE, key=NULL, sorted=TRUE, va
   if (value.name %chin% names(val))
     stopf("Argument 'value.name' should not overlap with column names in result: %s", brackify(rev(names(val))))
   N = NULL
-  ans = data.table(do.call(CJ, c(val, sorted=FALSE)), N=as.vector(x))
+  ans = do.call(CJ, c(val, sorted=FALSE))
+  set(ans, j="N", value=as.vector(x))
   if (isTRUE(na.rm))
     ans = ans[!is.na(N)]
   setnames(ans, "N", value.name)
 
@@ -1335,4 +1335,3 @@ SEXP setcharvec(SEXP x, SEXP which, SEXP newx)
   }
   return R_NilValue;
 }
-
@@ -169,4 +169,3 @@ system.time(ans2 <- .Call("Cchmatch2", x,y,0L))     # 0.17sec  as of 1.12.0 and
 system.time(ans3 <- chmatchdup(x,y,0L))             # 0.09sec  from 1.12.2; but goal wasn't speed rather simplified code; e.g. rbindlist.c down from 960 to 360 lines
 identical(ans2,ans3)  # test 2000
 */
-
@@ -99,4 +99,3 @@ SEXP cj(SEXP base_list) {
   UNPROTECT(1);
   return out;
 }
-
@@ -172,4 +172,3 @@ SEXP coalesce(SEXP x, SEXP inplaceArg) {
   UNPROTECT(nprotect);
   return first;
 }
-
@@ -356,4 +356,3 @@ SEXP dt_has_zlib(void);
 SEXP startsWithAny(SEXP, SEXP, SEXP);
 SEXP convertDate(SEXP, SEXP);
 SEXP fastmean(SEXP);
-
@@ -134,4 +134,3 @@ SEXP fastmean(SEXP args)
       COMPLEX(ans)[0].i = (double) si;
       break;
 */
-
Original file line number	Diff line number	Diff line change
`@@ -1335,4 +1335,3 @@ SEXP setcharvec(SEXP x, SEXP which, SEXP newx)`
`1335`	`1335`	`}`
`1336`	`1336`	`return R_NilValue;`
`1337`	`1337`	`}`
`1338`		`-`
Original file line number	Diff line number	Diff line change
`@@ -99,4 +99,3 @@ SEXP cj(SEXP base_list) {`
`99`	`99`	`UNPROTECT(1);`
`100`	`100`	`return out;`
`101`	`101`	`}`
`102`		`-`
Original file line number	Diff line number	Diff line change
`@@ -172,4 +172,3 @@ SEXP coalesce(SEXP x, SEXP inplaceArg) {`
`172`	`172`	`UNPROTECT(nprotect);`
`173`	`173`	`return first;`
`174`	`174`	`}`
`175`		`-`
-Original file line number
+Diff line change
       COMPLEX(ans)[0].i = (double) si;
       break;
 */
+-