Rdatatable
diff --git a/‎.ci/atime/tests.R‎
Lines changed: 28 additions & 2 deletions b/‎.ci/atime/tests.R‎
Lines changed: 28 additions & 2 deletions
diff --git a/‎.ci/linters/c/cocci_linter.R‎
Lines changed: 14 additions & 11 deletions b/‎.ci/linters/c/cocci_linter.R‎
Lines changed: 14 additions & 11 deletions
diff --git a/‎.ci/linters/cocci/malloc_return_value_cast.cocci‎
Lines changed: 8 additions & 0 deletions b/‎.ci/linters/cocci/malloc_return_value_cast.cocci‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎.github/CODE_OF_CONDUCT.md‎
Lines changed: 9 additions & 28 deletions b/‎.github/CODE_OF_CONDUCT.md‎
Lines changed: 9 additions & 28 deletions
diff --git a/‎.github/workflows/performance-tests.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/performance-tests.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎GOVERNANCE.md‎
Lines changed: 6 additions & 13 deletions b/‎GOVERNANCE.md‎
Lines changed: 6 additions & 13 deletions
diff --git a/‎NEWS.md‎
Lines changed: 13 additions & 1 deletion b/‎NEWS.md‎
Lines changed: 13 additions & 1 deletion
diff --git a/‎R/as.data.table.R‎
Lines changed: 4 additions & 3 deletions b/‎R/as.data.table.R‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎R/between.R‎
Lines changed: 3 additions & 6 deletions b/‎R/between.R‎
Lines changed: 3 additions & 6 deletions
@@ -1,3 +1,5 @@
+pval.thresh <- 0.001 # to reduce false positives.
+
 # Test case adapted from https://github.com/Rdatatable/data.table/issues/6105#issue-2268691745 which is where the issue was reported.
 # https://github.com/Rdatatable/data.table/pull/6107 fixed performance across 3 ways to specify a column as Date, and we test each individually.
 extra.args.6107 <- c(
@@ -13,6 +15,7 @@ for (extra.arg in extra.args.6107){
       tmp_csv = tempfile()
       fwrite(DT, tmp_csv)
     },
+    FasterIO = "60a01fa65191c44d7997de1843e9a1dfe5be9f72", # First commit of the PR (https://github.com/Rdatatable/data.table/pull/6925/commits) that reduced time usage
     Slow = "e9087ce9860bac77c51467b19e92cf4b72ca78c7", # Parent of the merge commit (https://github.com/Rdatatable/data.table/commit/a77e8c22e44e904835d7b34b047df2eff069d1f2) of the PR (https://github.com/Rdatatable/data.table/pull/6107) that fixes the issue
     Fast = "a77e8c22e44e904835d7b34b047df2eff069d1f2") # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/6107) that fixes the issue
   this.test$expr = str2lang(sprintf("data.table::fread(tmp_csv, %s)", extra.arg))
@@ -128,6 +131,18 @@ test.list <- atime::atime_test_list(
       paste0('useDynLib(', new.Package_))
   },
 
+  # Constant overhead improvement https://github.com/Rdatatable/data.table/pull/6925
+  # Test case adapted from https://github.com/Rdatatable/data.table/pull/7022#discussion_r2107900643
+  "fread disk overhead improved in #6925" = atime::atime_test(
+    N = 2^seq(0, 20), # smaller N because we are doing multiple fread calls.
+    setup = {
+      fwrite(iris[1], iris.csv <- tempfile())
+    },
+    expr = replicate(N, data.table::fread(iris.csv)),
+    Fast = "60a01fa65191c44d7997de1843e9a1dfe5be9f72", # First commit of the PR (https://github.com/Rdatatable/data.table/pull/6925/commits) that reduced time usage
+    Slow = "e25ea80b793165094cea87d946d2bab5628f70a6" # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/60a01fa65191c44d7997de1843e9a1dfe5be9f72)
+  ),
+
   # Performance regression discussed in https://github.com/Rdatatable/data.table/issues/4311
   # Test case adapted from https://github.com/Rdatatable/data.table/pull/4440#issuecomment-632842980 which is the fix PR.
   "shallow regression fixed in #4440" = atime::atime_test(
@@ -177,8 +192,9 @@ test.list <- atime::atime_test_list(
   # Fixed in https://github.com/Rdatatable/data.table/pull/4558
   "DT[by] fixed in #4558" = atime::atime_test(
     setup = {
+      N9 <- as.integer(N * 0.9)
       d <- data.table(
-        id = sample(c(seq.int(N * 0.9), sample(N * 0.9, N * 0.1, TRUE))),
+        id = sample(c(seq.int(N9), sample(N9, N-N9, TRUE))),
         v1 = sample(5L, N, TRUE),
         v2 = sample(5L, N, TRUE)
       )
@@ -251,5 +267,15 @@ test.list <- atime::atime_test_list(
     Before = "f339aa64c426a9cd7cf2fcb13d91fc4ed353cd31", # Parent of the first commit https://github.com/Rdatatable/data.table/commit/fcc10d73a20837d0f1ad3278ee9168473afa5ff1 in the PR https://github.com/Rdatatable/data.table/pull/6393/commits with major change to fwrite with gzip.
     PR = "3630413ae493a5a61b06c50e80d166924d2ef89a"), # Close-to-last merge commit in the PR.
 
-  tests=extra.test.list)
+  # Test case written directly for atime (not adapted from any other benchmark), based on the PR that removes an unnecessary data.table() call from as.data.table.array: https://github.com/Rdatatable/data.table/pull/7019
+  "as.data.table.array improved in #7010" = atime::atime_test(
+    setup = {
+      dims = c(N, 1, 1)
+      arr = array(seq_len(prod(dims)), dim=dims)
+    },
+    expr = data.table:::as.data.table.array(arr, na.rm=FALSE),
+    Slow = "73d79edf8ff8c55163e90631072192301056e336",   # Parent of the first commit in the PR (https://github.com/Rdatatable/data.table/commit/8397dc3c993b61a07a81c786ca68c22bc589befc)
+    Fast = "8397dc3c993b61a07a81c786ca68c22bc589befc"),  # Commit in the PR (https://github.com/Rdatatable/data.table/pull/7019/commits) that removes inefficiency
+
+    tests=extra.test.list)
 # nolint end: undesirable_operator_linter.
@@ -1,20 +1,23 @@
 cocci_linter = if (!nzchar(Sys.which("spatch"))) function(...) {} else function(c_obj) {
-  bad <- FALSE
+  bad = FALSE
+  tmp = tempfile(fileext = '.c')
+  on.exit(unlink(tmp))
+  writeLines(c_obj$preprocessed, tmp)
   for (spfile in list.files(".ci/linters/cocci", full.names = TRUE)) {
-    # Coccinelle parser gets confused sometimes, so ignore stderr and the exit code
-    out = suppressWarnings(system2(
+    out = system2(
       "spatch",
-      shQuote(c(
-        "--sp-file", spfile, c_obj$path, "--recursive-includes",
-        "-I", R.home("include"), "-I", "src"
-      )),
+      shQuote(c("--sp-file", spfile, tmp)),
       stdout = TRUE, stderr = FALSE
-    ))
+    )
     if (length(out) > 0) {
-      cat(sprintf("In file '%s', Coccinelle patch '%s' recommends the following changes:\n", c_obj$path, spfile))
+      cat(sprintf("In file '%s', Coccinelle linter '%s' located the following problems:\n", c_obj$path, spfile))
       writeLines(out)
-      bad <- TRUE
+      bad = TRUE
+    }
+    if (!is.null(status <- attr(out, 'status'))) {
+      cat(sprintf("While working on file '%s', Coccinelle linter '%s' failed with exit code %d:\n", c_obj$path, spfile, status))
+      bad = TRUE
     }
   }
-  if (bad) stop("Please apply the changes above or fix the linter")
+  if (bad) stop("Please investigate the problems above.")
 }
@@ -4,3 +4,11 @@ expression E;
@@
 - (T)
   malloc(E)
+
+@calloc_realloc_return_value_cast expression@
+type T;
+expression E1, E2;
+identifier alloc =~ "^(c|re)alloc$";
+@@
+- (T)
+  alloc(E1, E2)
@@ -1,35 +1,16 @@
-As contributors and maintainers of this project, and in the interest of fostering an open and welcoming community, we pledge to respect all people who contribute through reporting issues, posting feature requests, updating documentation, submitting pull requests or patches, and other activities.
+The R data.table project adheres to NumFOCUS's Code of Conduct. 
 
-We are committed to making participation in this project a harassment-free experience for everyone, regardless of level of experience, gender, gender identity and expression, sexual orientation, disability, personal appearance, body size, race, ethnicity, age, religion, or nationality.
+# The NumFOCUS Code of Conduct
 
-Examples of unacceptable behavior by participants include:
+## The Short Version
 
-* The use of sexualized language or imagery
-* Personal attacks
-* Trolling or insulting/derogatory comments
-* Public or private harassment
-* Publishing other's private information, such as physical or electronic addresses, without explicit permission
-* Other unethical or unprofessional conduct
+Be kind to others. Do not insult or put down others. Behave professionally. Remember that harassment and sexist, racist, or exclusionary jokes are not appropriate for NumFOCUS.
 
-Project members with the Committer role have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful.
+All communication should be appropriate for a professional audience including people of many different backgrounds. Sexual language and imagery is not appropriate.
 
-By adopting this Code of Conduct, project members commit themselves to fairly and consistently apply these principles to every aspect of managing this project. Project maintainers who do not follow or enforce the Code of Conduct may be permanently removed from the project team.
+NumFOCUS is dedicated to providing a harassment-free community for everyone, regardless of gender, sexual orientation, gender identity and expression, disability, physical appearance, body size, race, or religion. We do not tolerate harassment of community members in any form.
+Thank you for helping make this a welcoming, friendly community for all.
 
-This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community.
+[Code of Conduct Reporting Form](https://numfocus.typeform.com/to/ynjGdT)
 
-
-## Reporting
-
-Project members with the Committer role or the CRAN Maintainer role are pledged to promptly address any reported issues. Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to any individual with this role.
-
-Those who prefer to report in a way that is independent of the current Committers and Maintainer may instead contact the Community Engagement Coordinator by e-mailing [r.data.table\@gmail.com](mailto:r.data.table@gmail.com). Messages sent to this e-mail address will be visible only to the current Community Engagement Coordinator, a position always held by an individual who is not a Committer or CRAN Maintainer of the package.
-
-The current Committers are Toby Dylan Hocking (@tdhock), Matt Dowle (@mattdowle), Arun Srinivasan (@arunsrinivasan), Jan Gorecki (@jangorecki), Michael Chirico (@MichaelChirico), Benjamin Schwendinger (@ben-schwen), and Ivan Krylov (@aitap).
-
-The current CRAN Maintainer is Tyson Barrett (@tysonstanley).
-
-The current Community Engagement Coordinator is Kelly Bodwin (@kbodwin).
-
-All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. Complaint respondents are obligated to maintain confidentiality with regard to the reporter of an incident.
-
-This Code of Conduct is adapted from the [Contributor Covenant, version 1.3.0](https://www.contributor-covenant.org/version/1/3/0/code-of-conduct/), available at [https://www.contributor-covenant.org/version/1/3/0/](https://www.contributor-covenant.org/version/1/3/0/), and the Swift Code of Conduct.
+For the full version of the Code of Conduct, please visit: [https://numfocus.org/code-of-conduct](https://numfocus.org/code-of-conduct).
@@ -20,4 +20,4 @@ jobs:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
       repo_token: ${{ secrets.GITHUB_TOKEN }}
     steps:
-      - uses: Anirban166/Autocomment-atime-results@v1.4.1
+      - uses: Anirban166/Autocomment-atime-results@v1.4.3
@@ -108,23 +108,14 @@ Please also make a note in the change log under [`# Governance history`](#govern
 
 # Finances and Funding
 
-There is currently no mechanism for the data.table project to receive funding as an entity.  
+data.table is a [NumFOCUS](https://numfocus.org/) project.  Donations to data.table can be made at [https://numfocus.org/donate-to-data-table](https://app.hubspot.com/payments/FFWKWTTvKFdzqH?referrer=PAYMENT_LINK); see also the [NumFOCUS project page](https://numfocus.org/project/data-table).
 
-Funding support for this project therefore may come in two forms:
+*NumFOCUS is a 501(c)(3) non-profit charity in the United States; as such, donations to NumFOCUS are tax-deductible as allowed by law. As with any donation, you should consult with your personal tax adviser or the IRS about your particular tax situation.*
 
-## Individual external funding
 
-Any individual developer or community member of data.table may apply for and receive funding for their work on the project.  Individuals or groups seeking funding support are strongly encouraged to consult directly with the data.table Project Members (by initiating an Issue on GitHub) to ensure funds are used meaningfully. Formally, however, decisions about use of funds are governed by the individual grantee(s) and their contract with the funding agency. 
+## Decision-making for funding use
 
-There is no guarantee that funded work will be incorporated into the data.table package; any contributions, whether funded or unfunded, are subject to the same review process as outlined above.
-
-## Direct donations
-
-Direct donations to the project may be made via GitHub Sponsorships, which allow individuals to fund a specific developer.  If the current CRAN Maintainer offers a personal sponsorship option, donations may be made to them to support the project in general.
-
-## Decision-making for future opportunities
-
-We here outline a procedure for disbursing funds, should this project in the future become a directly fundable entity (e.g. an LLC or a subsidiary of an umbrella LLC).
+We here outline a procedure for disbursing funds acquired through direct donations via NumFOCUS or grant-style research funding.
 
 Funds acquired by the data.table project will be disbursed at the discretion of the **Committers**, defined as above.  The **CRAN Maintainer** will have authority to make final decisions in the event that no consensus is reached among committers prior to deadlines for use of funds, and will be responsible for disbursement logistics.
 
@@ -148,6 +139,8 @@ data.table Version line in DESCRIPTION typically has the following meanings
 
 # Governance history
 
+May 2025: update Finance and CoC language for NumFOCUS incorporation.
+
 Feb 2025: add Finances and Funding section, update Code of Conduct section to be a brief summary and reference the broader CoC document.
 
 Jan 2025: clarify that edits to governance should notify all committers, and that role names are proper nouns (i.e., upper-case) throughout.
 
@@ -27,7 +27,11 @@ frollsum(c(1,2,3,Inf,5,6), 2)
 
 4. `as.Date()` method for `IDate` no longer coerces to `double` [#6922](https://github.com/Rdatatable/data.table/issues/6922). Thanks @MichaelChirico for the report and PR. The only effect should be on overly-strict tests that assert `Date` objects have `double` storage, which is not in general true, especially from R 4.5.0.
 
-5. Multiple improvements has been added to rolling functions. Request came from @gpierard who needed left aligned, adaptive, rolling max, [#5438](https://github.com/Rdatatable/data.table/issues/5438). There was no `frollmax` function yet. Adaptive rolling functions did not have support for `align="left"`. `frollapply` did not support `adaptive=TRUE`. Available alternatives were base R `mapply` or self-join using `max` and grouping `by=.EACHI`. As a follow up of his request, following features has been added:
+5. `as.data.table()` is slightly more efficient at converting arrays to data.tables, [#7019](https://github.com/Rdatatable/data.table/pull/7019). Thanks @eliocamp.
+
+6. `between()` gains the argument `ignore_tzone=FALSE`. Normally, a difference in time zone between `lower` and `upper` will produce an error, and a difference in time zone between `x` and either of the others will produce a message. Setting `ignore_tzone=TRUE` bypasses the checks, allowing both comparisons to proceed without error or message about time zones.
+
+7. Multiple improvements have been added to rolling functions. Request came from @gpierard who needed left aligned, adaptive, rolling max, [#5438](https://github.com/Rdatatable/data.table/issues/5438). There was no `frollmax` function yet. Adaptive rolling functions did not have support for `align="left"`. `frollapply` did not support `adaptive=TRUE`. Available alternatives were base R `mapply` or self-join using `max` and grouping `by=.EACHI`. As a follow-up to his request, the following features have been added:
 - new function `frollmax`, applies `max` over a rolling window.
 - support for `align="left"` for adaptive rolling function.
 - support for `adaptive=TRUE` in `frollapply`.
@@ -85,6 +89,8 @@ As of now, adaptive rolling max has no _on-line_ implemention (`algo="fast"`), i
 
 8. `fread()` no longer warns on certain systems on R 4.5.0+ where the file owner can't be resolved, [#6918](https://github.com/Rdatatable/data.table/issues/6918). Thanks @ProfFancyPants for the report and PR.
 
+9. Joins to extended data.frames, e.g. `x[i, col := x.col1 + i.col2]` where `i` is a `tbl`, can use the `x.` and `i.` prefix forms, [#6998](https://github.com/Rdatatable/data.table/issues/6998). Thanks @MichaelChirico for the bug report and PR.
+
 ### NOTES
 
 1. Continued work to remove non-API C functions, [#6180](https://github.com/Rdatatable/data.table/issues/6180). Thanks Ivan Krylov for the PRs and for writing a clear and concise guide about the R API: https://aitap.codeberg.page/R-api/.
@@ -98,6 +104,12 @@ As of now, adaptive rolling max has no _on-line_ implemention (`algo="fast"`), i
 
 3. {data.table} now depends on R 3.4.0 (2017).
 
+4. Changes to `fread()` output and errors:
+
+   + When the size of the file exceeds the size of the address space, `fread()` now signals an informative error instead of trying to map its size modulo the address space.
+   + On non-Windows systems, `fread()` now prints the reason why the file couldn't be opened, which could also be due to it being too large to map.
+   + With `verbose=TRUE`, file sizes are now printed using correct binary SI prefixes (the sizes have always been reported as bytes denominated in powers of `2^10`, so e.g. `1024*1024` bytes was reported as `1 MB` where `1 MiB` or `1.05 MB` is correct).
+
 ## data.table [v1.17.0](https://github.com/Rdatatable/data.table/milestone/34)  (20 Feb 2025)
 
 ### POTENTIALLY BREAKING CHANGES
 
@@ -96,9 +96,9 @@ as.data.table.array = function(x, keep.rownames=FALSE, key=NULL, sorted=TRUE, va
   dnx = dimnames(x)
   # NULL dimnames will create integer keys, not character as in table method
   val = if (is.null(dnx)) {
-    lapply(dx, seq.int)
+    lapply(dx, seq_len)
   } else if (any(nulldnx <- vapply_1b(dnx, is.null))) {
-    dnx[nulldnx] = lapply(dx[nulldnx], seq.int) #3636
+    dnx[nulldnx] = lapply(dx[nulldnx], seq_len) #3636
     dnx
   } else dnx
   val = rev(val)
@@ -107,7 +107,8 @@ as.data.table.array = function(x, keep.rownames=FALSE, key=NULL, sorted=TRUE, va
   if (value.name %chin% names(val))
     stopf("Argument 'value.name' should not overlap with column names in result: %s", brackify(rev(names(val))))
   N = NULL
-  ans = data.table(do.call(CJ, c(val, sorted=FALSE)), N=as.vector(x))
+  ans = do.call(CJ, c(val, sorted=FALSE))
+  set(ans, j="N", value=as.vector(x))
   if (isTRUE(na.rm))
     ans = ans[!is.na(N)]
   setnames(ans, "N", value.name)
 
@@ -1,5 +1,5 @@
 # is x[i] in between lower[i] and upper[i] ?
-between = function(x, lower, upper, incbounds=TRUE, NAbounds=TRUE, check=FALSE) {
+between = function(x, lower, upper, incbounds=TRUE, NAbounds=TRUE, check=FALSE, ignore_tzone=FALSE) {
   if (is.logical(x)) stopf("between has been passed an argument x of type logical")
   if (is.logical(lower)) lower = as.integer(lower)   # typically NA (which is logical type)
   if (is.logical(upper)) upper = as.integer(upper)   # typically NA (which is logical type)
@@ -16,15 +16,12 @@ between = function(x, lower, upper, incbounds=TRUE, NAbounds=TRUE, check=FALSE)
     stopifnot(is.px(x), is.px(lower), is.px(upper)) # nocov # internal
   }
   # POSIX check timezone match
-  if (is.px(x) && is.px(lower) && is.px(upper)) {
-    tzs = sapply(list(x,lower,upper), function(x) {
-      attr(x, "tzone", exact=TRUE) %||% ""
-    })
+  if (!ignore_tzone && is.px(x) && is.px(lower) && is.px(upper)) {
+    tzs = vapply_1c(list(x, lower, upper), function(x) attr(x, "tzone", exact=TRUE) %||% "")
     # lower/upper should be more tightly linked than x/lower, so error
     #   if the former don't match but only inform if the latter don't
     if (tzs[2L]!=tzs[3L]) {
       stopf("'between' lower= and upper= are both POSIXct but have different tzone attributes: %s. Please align their time zones.", brackify(tzs[2:3], quote=TRUE))
-      # otherwise the check in between.c that lower<=upper can (correctly) fail for this reason
     }
     if (tzs[1L]!=tzs[2L]) {
       messagef("'between' arguments are all POSIXct but have mismatched tzone attributes: %s. The UTC times will be compared.", brackify(tzs, quote=TRUE))