Rdatatable
diff --git a/‎.ci/atime/tests.R‎
Lines changed: 26 additions & 2 deletions b/‎.ci/atime/tests.R‎
Lines changed: 26 additions & 2 deletions
diff --git a/‎.ci/linters/cocci/malloc_return_value_cast.cocci‎
Lines changed: 6 additions & 0 deletions b/‎.ci/linters/cocci/malloc_return_value_cast.cocci‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎.github/workflows/performance-tests.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/performance-tests.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎NEWS.md‎
Lines changed: 12 additions & 0 deletions b/‎NEWS.md‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎R/as.data.table.R‎
Lines changed: 4 additions & 3 deletions b/‎R/as.data.table.R‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎R/between.R‎
Lines changed: 3 additions & 6 deletions b/‎R/between.R‎
Lines changed: 3 additions & 6 deletions
diff --git a/‎R/data.table.R‎
Lines changed: 4 additions & 2 deletions b/‎R/data.table.R‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎inst/tests/tests.Rraw‎
Lines changed: 24 additions & 0 deletions b/‎inst/tests/tests.Rraw‎
Lines changed: 24 additions & 0 deletions
diff --git a/‎man/between.Rd‎
Lines changed: 2 additions & 1 deletion b/‎man/between.Rd‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎src/assign.c‎
Lines changed: 7 additions & 8 deletions b/‎src/assign.c‎
Lines changed: 7 additions & 8 deletions
@@ -15,6 +15,7 @@ for (extra.arg in extra.args.6107){
       tmp_csv = tempfile()
       fwrite(DT, tmp_csv)
     },
+    FasterIO = "60a01fa65191c44d7997de1843e9a1dfe5be9f72", # First commit of the PR (https://github.com/Rdatatable/data.table/pull/6925/commits) that reduced time usage
     Slow = "e9087ce9860bac77c51467b19e92cf4b72ca78c7", # Parent of the merge commit (https://github.com/Rdatatable/data.table/commit/a77e8c22e44e904835d7b34b047df2eff069d1f2) of the PR (https://github.com/Rdatatable/data.table/pull/6107) that fixes the issue
     Fast = "a77e8c22e44e904835d7b34b047df2eff069d1f2") # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/6107) that fixes the issue
   this.test$expr = str2lang(sprintf("data.table::fread(tmp_csv, %s)", extra.arg))
@@ -130,6 +131,18 @@ test.list <- atime::atime_test_list(
       paste0('useDynLib(', new.Package_))
   },
 
+  # Constant overhead improvement https://github.com/Rdatatable/data.table/pull/6925
+  # Test case adapted from https://github.com/Rdatatable/data.table/pull/7022#discussion_r2107900643
+  "fread disk overhead improved in #6925" = atime::atime_test(
+    N = 2^seq(0, 20), # smaller N because we are doing multiple fread calls.
+    setup = {
+      fwrite(iris[1], iris.csv <- tempfile())
+    },
+    expr = replicate(N, data.table::fread(iris.csv)),
+    Fast = "60a01fa65191c44d7997de1843e9a1dfe5be9f72", # First commit of the PR (https://github.com/Rdatatable/data.table/pull/6925/commits) that reduced time usage
+    Slow = "e25ea80b793165094cea87d946d2bab5628f70a6" # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/60a01fa65191c44d7997de1843e9a1dfe5be9f72)
+  ),
+
   # Performance regression discussed in https://github.com/Rdatatable/data.table/issues/4311
   # Test case adapted from https://github.com/Rdatatable/data.table/pull/4440#issuecomment-632842980 which is the fix PR.
   "shallow regression fixed in #4440" = atime::atime_test(
@@ -179,8 +192,9 @@ test.list <- atime::atime_test_list(
   # Fixed in https://github.com/Rdatatable/data.table/pull/4558
   "DT[by] fixed in #4558" = atime::atime_test(
     setup = {
+      N9 <- as.integer(N * 0.9)
       d <- data.table(
-        id = sample(c(seq.int(N * 0.9), sample(N * 0.9, N * 0.1, TRUE))),
+        id = sample(c(seq.int(N9), sample(N9, N-N9, TRUE))),
         v1 = sample(5L, N, TRUE),
         v2 = sample(5L, N, TRUE)
       )
@@ -253,5 +267,15 @@ test.list <- atime::atime_test_list(
     Before = "f339aa64c426a9cd7cf2fcb13d91fc4ed353cd31", # Parent of the first commit https://github.com/Rdatatable/data.table/commit/fcc10d73a20837d0f1ad3278ee9168473afa5ff1 in the PR https://github.com/Rdatatable/data.table/pull/6393/commits with major change to fwrite with gzip.
     PR = "3630413ae493a5a61b06c50e80d166924d2ef89a"), # Close-to-last merge commit in the PR.
 
-  tests=extra.test.list)
+  # Test case created directly using the atime code below (not adapted from any other benchmark), based on the PR "Removes unnecessary data.table call from as.data.table.array" (https://github.com/Rdatatable/data.table/pull/7010).
+  "as.data.table.array improved in #7010" = atime::atime_test(
+    setup = {
+      dims = c(N, 1, 1)
+      arr = array(seq_len(prod(dims)), dim=dims)
+    },
+    expr = data.table:::as.data.table.array(arr, na.rm=FALSE),
+    Slow = "73d79edf8ff8c55163e90631072192301056e336",   # Parent of the first commit in the PR (https://github.com/Rdatatable/data.table/commit/8397dc3c993b61a07a81c786ca68c22bc589befc)
+    Fast = "8397dc3c993b61a07a81c786ca68c22bc589befc"),  # Commit in the PR (https://github.com/Rdatatable/data.table/pull/7019/commits) that removes inefficiency
+
+    tests=extra.test.list)
 # nolint end: undesirable_operator_linter.
@@ -4,3 +4,9 @@ expression E;
@@
 - (T)
   malloc(E)
+
+- (T)
+  calloc(_, E)
+
+- (T)
+  realloc(_, E)
@@ -20,4 +20,4 @@ jobs:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
       repo_token: ${{ secrets.GITHUB_TOKEN }}
     steps:
-      - uses: Anirban166/[email protected]
+      - uses: Anirban166/[email protected]
@@ -14,6 +14,10 @@
 
 4. `as.Date()` method for `IDate` no longer coerces to `double` [#6922](https://github.com/Rdatatable/data.table/issues/6922). Thanks @MichaelChirico for the report and PR. The only effect should be on overly-strict tests that assert `Date` objects have `double` storage, which is not in general true, especially from R 4.5.0.
 
+5. `as.data.table()` is slightly more efficient at converting arrays to data.tables, [#7019](https://github.com/Rdatatable/data.table/pull/7019). Thanks @eliocamp.
+
+6. `between()` gains the argument `ignore_tzone=FALSE`. Normally, a difference in time zone between `lower` and `upper` will produce an error, and a difference in time zone between `x` and either of the others will produce a message. Setting `ignore_tzone=TRUE` bypasses the checks, allowing both comparisons to proceed without error or message about time zones.
+
 ### BUG FIXES
 
 1. Custom binary operators from the `lubridate` package now work with objects of class `IDate` as with a `Date` subclass, [#6839](https://github.com/Rdatatable/data.table/issues/6839). Thanks @emallickhossain for the report and @aitap for the fix.
@@ -32,6 +36,8 @@
 
 8. `fread()` no longer warns on certain systems on R 4.5.0+ where the file owner can't be resolved, [#6918](https://github.com/Rdatatable/data.table/issues/6918). Thanks @ProfFancyPants for the report and PR.
 
+9. Joins to extended data.frames, e.g. `x[i, col := x.col1 + i.col2]` where `i` is a `tbl`, can use the `x.` and `i.` prefix forms, [#6998](https://github.com/Rdatatable/data.table/issues/6998). Thanks @MichaelChirico for the report and PR.
+
 ### NOTES
 
 1. Continued work to remove non-API C functions, [#6180](https://github.com/Rdatatable/data.table/issues/6180). Thanks Ivan Krylov for the PRs and for writing a clear and concise guide about the R API: https://aitap.codeberg.page/R-api/.
@@ -45,6 +51,12 @@
 
 3. {data.table} now depends on R 3.4.0 (2017).
 
+4. Changes to `fread()` output and errors:
+
+   + When the size of the file exceeds the size of the address space, `fread()` now signals an informative error instead of trying to map its size modulo the address space.
+   + On non-Windows systems, `fread()` now prints the reason why the file couldn't be opened, which could also be due to it being too large to map.
+   + With `verbose=TRUE`, file sizes are now printed using correct binary (IEC) prefixes (the sizes have always been reported as bytes denominated in powers of `2^10`, so e.g. `1024*1024` bytes was reported as `1 MB` where `1 MiB` or `1.05 MB` is correct).
+
 ## data.table [v1.17.0](https://github.com/Rdatatable/data.table/milestone/34)  (20 Feb 2025)
 
 ### POTENTIALLY BREAKING CHANGES
 
@@ -96,9 +96,9 @@ as.data.table.array = function(x, keep.rownames=FALSE, key=NULL, sorted=TRUE, va
   dnx = dimnames(x)
   # NULL dimnames will create integer keys, not character as in table method
   val = if (is.null(dnx)) {
-    lapply(dx, seq.int)
+    lapply(dx, seq_len)
   } else if (any(nulldnx <- vapply_1b(dnx, is.null))) {
-    dnx[nulldnx] = lapply(dx[nulldnx], seq.int) #3636
+    dnx[nulldnx] = lapply(dx[nulldnx], seq_len) #3636
     dnx
   } else dnx
   val = rev(val)
@@ -107,7 +107,8 @@ as.data.table.array = function(x, keep.rownames=FALSE, key=NULL, sorted=TRUE, va
   if (value.name %chin% names(val))
     stopf("Argument 'value.name' should not overlap with column names in result: %s", brackify(rev(names(val))))
   N = NULL
-  ans = data.table(do.call(CJ, c(val, sorted=FALSE)), N=as.vector(x))
+  ans = do.call(CJ, c(val, sorted=FALSE))
+  set(ans, j="N", value=as.vector(x))
   if (isTRUE(na.rm))
     ans = ans[!is.na(N)]
   setnames(ans, "N", value.name)
 
@@ -1,5 +1,5 @@
 # is x[i] in between lower[i] and upper[i] ?
-between = function(x, lower, upper, incbounds=TRUE, NAbounds=TRUE, check=FALSE) {
+between = function(x, lower, upper, incbounds=TRUE, NAbounds=TRUE, check=FALSE, ignore_tzone=FALSE) {
   if (is.logical(x)) stopf("between has been passed an argument x of type logical")
   if (is.logical(lower)) lower = as.integer(lower)   # typically NA (which is logical type)
   if (is.logical(upper)) upper = as.integer(upper)   # typically NA (which is logical type)
@@ -16,15 +16,12 @@ between = function(x, lower, upper, incbounds=TRUE, NAbounds=TRUE, check=FALSE)
     stopifnot(is.px(x), is.px(lower), is.px(upper)) # nocov # internal
   }
   # POSIX check timezone match
-  if (is.px(x) && is.px(lower) && is.px(upper)) {
-    tzs = sapply(list(x,lower,upper), function(x) {
-      attr(x, "tzone", exact=TRUE) %||% ""
-    })
+  if (!ignore_tzone && is.px(x) && is.px(lower) && is.px(upper)) {
+    tzs = vapply_1c(list(x, lower, upper), function(x) attr(x, "tzone", exact=TRUE) %||% "")
     # lower/upper should be more tightly linked than x/lower, so error
     #   if the former don't match but only inform if the latter don't
     if (tzs[2L]!=tzs[3L]) {
       stopf("'between' lower= and upper= are both POSIXct but have different tzone attributes: %s. Please align their time zones.", brackify(tzs[2:3], quote=TRUE))
-      # otherwise the check in between.c that lower<=upper can (correctly) fail for this reason
     }
     if (tzs[1L]!=tzs[2L]) {
       messagef("'between' arguments are all POSIXct but have mismatched tzone attributes: %s. The UTC times will be compared.", brackify(tzs, quote=TRUE))
 
@@ -677,10 +677,12 @@ replace_dot_alias = function(e) {
     }
     ansvals = chmatch(ansvars, nx)
   } else {
-    if (is.data.table(i)) {
+    if (is.data.frame(i)) {
       idotprefix = paste0("i.", names_i)
       xdotprefix = paste0("x.", names_x)
-    } else idotprefix = xdotprefix = character(0L)
+    } else {
+      idotprefix = xdotprefix = character(0L)
+    }
 
     # j was substituted before dealing with i so that := can set allow.cartesian=FALSE (#800) (used above in i logic)
     if (is.null(jsub)) return(NULL)
 
@@ -21133,3 +21133,27 @@ test(2315.1, tail(DT[order(i), i], 2L), 1:2)
 # wider range of numbers needed for further coverage
 DT[1L, i := 1000L]
 test(2315.2, tail(DT[order(i), i], 2L), c(1L, 1000L))
+
+# issue #6898, test that tzone behavior changes with ignore_tzone=TRUE
+tms = list(.POSIXct(1), .POSIXct(1.0, "UTC"))
+test(2316.1, between(tms[[1]], tms[[1L]], tms[[2L]]), error = "different tzone attributes")
+test(2316.2, between(tms[[1]], tms[[1L]], tms[[2L]], ignore_tzone=TRUE))
+test(2316.3, between(tms[[1]], tms[[2L]], tms[[2L]]), message = "mismatched tzone attributes")
+test(2316.4, between(tms[[1]], tms[[2L]], tms[[2L]], ignore_tzone=TRUE))
+
+# tbl in i still allows 'i.' prefix reference for update join, #6998
+DT1 = data.table(a=1, b=2)
+DT2 = data.table(a=1, c=3)
+DF1 = data.frame(a=1, d=4)
+DF2 = data.frame(a=1, e=5)
+class(DF2) = c("tbl_df", "tbl", "data.frame")
+
+test(2317.1, DT1[DT2, on='a', c := i.c]$c, 3)
+test(2317.2, DT1[DT2, on='a', c2 := x.a + i.c]$c2, 4)
+test(2317.3, DT1[DT2, on='a', .(c = x.a + i.c)]$c, 4)
+test(2317.4, DT1[DF1, on='a', d := i.d]$d, 4)
+test(2317.5, DT1[DF1, on='a', d2 := x.a + i.d]$d2, 5)
+test(2317.6, DT1[DF1, on='a', .(d = x.a + i.d)]$d, 5)
+test(2317.7, DT1[DF2, on='a', e := i.e]$e, 5)
+test(2317.8, DT1[DF2, on='a', e2 := x.a + i.e]$e2, 6)
+test(2317.9, DT1[DF2, on='a', .(e = x.a + i.e)]$e, 6)
@@ -16,7 +16,7 @@ This can be changed by setting \code{NAbounds} to \code{NA}.
 the intervals provided in \code{lower,upper}.
 }
 \usage{
-between(x, lower, upper, incbounds=TRUE, NAbounds=TRUE, check=FALSE)
+between(x, lower, upper, incbounds=TRUE, NAbounds=TRUE, check=FALSE, ignore_tzone=FALSE)
 x \%between\% y
 
 inrange(x, lower, upper, incbounds=TRUE)
@@ -35,6 +35,7 @@ interpreted as \code{lower} and \code{y[[2]]} as \code{upper}.}
 It is set to \code{TRUE} by default for infix notations.}
 \item{NAbounds}{ If \code{lower} (\code{upper}) contains an \code{NA} what should \code{lower<=x} (\code{x<=upper}) return? By default \code{TRUE} so that a missing bound is interpreted as unlimited. }
 \item{check}{ Produce error if \code{any(lower>upper)}? \code{FALSE} by default for efficiency, in particular type \code{character}. }
+\item{ignore_tzone}{ \code{TRUE} means skip timezone checks among \code{x}, \code{lower}, and \code{upper}. }
 }
 \details{
 
 
@@ -400,7 +400,7 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values)
   // FR #2077 - set able to add new cols by reference
   if (isString(cols)) {
     PROTECT(tmp = chmatch(cols, names, 0)); protecti++;
-    buf = (int *) R_alloc(length(cols), sizeof(int));
+    buf = (int *) R_alloc(length(cols), sizeof(*buf));
     int k=0;
     for (int i=0; i<length(cols); ++i) {
       if (INTEGER(tmp)[i] == 0) buf[k++] = i;
@@ -699,7 +699,7 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values)
   }
   if (ndelete) {
     // delete any columns assigned NULL (there was a 'continue' earlier in loop above)
-    int *tt = (int *)R_alloc(ndelete, sizeof(int));
+    int *tt = (int *)R_alloc(ndelete, sizeof(*tt));
     const int *colsd=INTEGER(cols), ncols=length(cols), ndt=length(dt);
     for (int i=0, k=0; i<ncols; ++i) {   // find which ones to delete and put them in tt
       // Aside: a new column being assigned NULL (something odd to do) would have been warned above, added above, and now deleted. Just
@@ -1055,7 +1055,7 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con
     case RAWSXP:    BODY(Rbyte, RAW,    int, val!=0,                                    td[i]=cval)
     case LGLSXP:
       if (mc) {
-                    memcpy(td, LOGICAL_RO(source), slen*sizeof(int)); break;
+                    memcpy(td, LOGICAL_RO(source), slen*sizeof(*td)); break;
       } else        BODY(int, LOGICAL,  int, val,                                       td[i]=cval)
     case INTSXP:    BODY(int, INTEGER,  int, val==NA_INTEGER ? NA_LOGICAL : val!=0,     td[i]=cval)
     case REALSXP:
@@ -1072,7 +1072,7 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con
     case  LGLSXP:   // same as INTSXP ...
     case  INTSXP:
       if (mc) {
-                    memcpy(td, INTEGER_RO(source), slen*sizeof(int)); break;
+                    memcpy(td, INTEGER_RO(source), slen*sizeof(*td)); break;
       } else        BODY(int, INTEGER,  int, val,                                       td[i]=cval)
     case REALSXP:
       if (sourceIsI64)
@@ -1092,7 +1092,7 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con
       case REALSXP:
         if (sourceIsI64) {
           if (mc) {
-                    memcpy(td, (const int64_t *)REAL_RO(source), slen*sizeof(int64_t)); break;
+                    memcpy(td, (const int64_t *)REAL_RO(source), slen*sizeof(*td)); break;
           } else    BODY(int64_t, REAL, int64_t, val,                                   td[i]=cval)
         } else      BODY(double, REAL,  int64_t, within_int64_repres(val) ? val : NA_INTEGER64,    td[i]=cval)
       case CPLXSXP: BODY(Rcomplex, COMPLEX, int64_t, ISNAN(val.r) ? NA_INTEGER64 : (int64_t)val.r, td[i]=cval)
@@ -1291,14 +1291,14 @@ void savetl(SEXP s)
       internal_error(__func__, "reached maximum %d items for savetl", nalloc); // # nocov
     }
     nalloc = nalloc>(INT_MAX/2) ? INT_MAX : nalloc*2;
-    char *tmp = (char *)realloc(saveds, nalloc*sizeof(SEXP));
+    char *tmp = realloc(saveds, sizeof(SEXP)*nalloc);
     if (tmp==NULL) {
       // C spec states that if realloc() fails the original block is left untouched; it is not freed or moved. We rely on that here.
       savetl_end();                                                      // # nocov  free(saveds) happens inside savetl_end
       error(_("Failed to realloc saveds to %d items in savetl"), nalloc);   // # nocov
     }
     saveds = (SEXP *)tmp;
-    tmp = (char *)realloc(savedtl, nalloc*sizeof(R_len_t));
+    tmp = realloc(savedtl, sizeof(R_len_t)*nalloc);
     if (tmp==NULL) {
       savetl_end();                                                      // # nocov
       error(_("Failed to realloc savedtl to %d items in savetl"), nalloc);  // # nocov
@@ -1335,4 +1335,3 @@ SEXP setcharvec(SEXP x, SEXP which, SEXP newx)
   }
   return R_NilValue;
 }
-
Original file line number	Diff line number	Diff line change
`@@ -16,7 +16,7 @@ This can be changed by setting \code{NAbounds} to \code{NA}.`
`16`	`16`	`the intervals provided in \code{lower,upper}.`
`17`	`17`	`}`
`18`	`18`	`\usage{`
`19`		`-between(x, lower, upper, incbounds=TRUE, NAbounds=TRUE, check=FALSE)`
	`19`	`+between(x, lower, upper, incbounds=TRUE, NAbounds=TRUE, check=FALSE, ignore_tzone=FALSE)`
`20`	`20`	`x \%between\% y`
`21`	`21`
`22`	`22`	`inrange(x, lower, upper, incbounds=TRUE)`
`@@ -35,6 +35,7 @@ interpreted as \code{lower} and \code{y[[2]]} as \code{upper}.}`
`35`	`35`	`It is set to \code{TRUE} by default for infix notations.}`
`36`	`36`	`\item{NAbounds}{ If \code{lower} (\code{upper}) contains an \code{NA} what should \code{lower<=x} (\code{x<=upper}) return? By default \code{TRUE} so that a missing bound is interpreted as unlimited. }`
`37`	`37`	`\item{check}{ Produce error if \code{any(lower>upper)}? \code{FALSE} by default for efficiency, in particular type \code{character}. }`
	`38`	`+\item{ignore_tzone}{ \code{TRUE} means skip timezone checks among \code{x}, \code{lower}, and \code{upper}. }`
`38`	`39`	`}`
`39`	`40`	`\details{`
`40`	`41`