Rdatatable
diff --git a/‎.Rbuildignore‎
Lines changed: 1 addition & 0 deletions b/‎.Rbuildignore‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎.ci/.lintr.R‎
Lines changed: 3 additions & 2 deletions b/‎.ci/.lintr.R‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎.ci/README.md‎
Lines changed: 20 additions & 1 deletion b/‎.ci/README.md‎
Lines changed: 20 additions & 1 deletion
diff --git a/‎.ci/atime/tests.R‎
Lines changed: 75 additions & 27 deletions b/‎.ci/atime/tests.R‎
Lines changed: 75 additions & 27 deletions
diff --git a/‎.ci/ci.R‎
Lines changed: 4 additions & 3 deletions b/‎.ci/ci.R‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎.ci/linters/c/alloc_linter.R‎
Lines changed: 3 additions & 2 deletions b/‎.ci/linters/c/alloc_linter.R‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎.ci/linters/r/class1_linter.R‎
Lines changed: 10 additions & 0 deletions b/‎.ci/linters/r/class1_linter.R‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎.ci/publish.R‎
Lines changed: 1 addition & 1 deletion b/‎.ci/publish.R‎
Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,5 @@
 .dir-locals.el
+.check.translations.R
 ^\.Rprofile$
 ^data\.table_.*\.tar\.gz$
 ^config\.log$
 
@@ -9,10 +9,10 @@ linters = c(dt_linters, all_linters(
   packages = "lintr", # TODO(lintr->3.2.0): Remove this.
   # eq_assignment_linter(),
   brace_linter(allow_single_line = TRUE),
+  implicit_integer_linter(allow_colon = TRUE),
   # TODO(michaelchirico): Activate these incrementally. These are the
   #   parameterizations that match our style guide.
   # implicit_assignment_linter(allow_lazy = TRUE, allow_scoped = TRUE),
-  # implicit_integer_linter(allow_colon = TRUE),
   # system_time_linter = undesirable_function_linter(c(
   #   system.time = "Only run timings in benchmark.Rraw"
   # )),
@@ -84,7 +84,8 @@ exclusions = c(local({
       infix_spaces_linter = Inf,
       undesirable_function_linter = Inf
     )),
-    exclusion_for_dir("vignettes", list(
+    exclusion_for_dir(c("vignettes", "vignettes/fr"), list(
+      implicit_integer_linter = Inf,
       quotes_linter = Inf,
       sample_int_linter = Inf
       # strings_as_factors_linter = Inf
 
@@ -1,6 +1,6 @@
 # data.table continuous integration and deployment
 
-On each Pull Request opened in GitHub we run GitHub Actions test jobs to provide prompt feedback about the status of PR. Our main CI pipeline runs on GitLab CI nightly. GitLab repository automatically mirrors our GitHub repository and runs pipeline on `master` branch every night. It tests more environments and different configurations. It publish variety of artifacts.
+On each Pull Request opened on GitHub, we run GitHub Actions test jobs to provide prompt feedback about the status of the PR. Our more thorough main CI pipeline runs nightly on GitLab CI. The GitLab repository automatically mirrors our GitHub repository and runs the pipeline on the `master` branch every night. It tests more environments and different configurations. It publishes a variety of artifacts such as our [homepage](https://rdatatable.gitlab.io/data.table/) and [CRAN-like website for dev version](https://rdatatable.gitlab.io/data.table/web/packages/data.table/index.html), including Windows binaries for the dev version.
 
 ## Environments
 
@@ -44,3 +44,22 @@ Base R implemented helper script, [originally proposed to base R](https://svn.r-
 ### [`publish.R`](./publish.R)
 
 Base R implemented helper script to orchestrate generation of most artifacts and to arrange them nicely. It is being used only in [_integration_ stage in GitLab CI pipeline](./../.gitlab-ci.yml).
+
+## GitLab Open Source Program
+
+We are currently part of the [GitLab for Open Source Program](https://about.gitlab.com/solutions/open-source/). This gives us 50,000 compute minutes per month for our GitLab CI. Our license needs to be renewed yearly (around July) and is currently managed by @ben-schwen.
+
+## Updating CI pipeline
+
+Basic CI checks are also run on every push to the GitLab repository. This can **and should** be used for PRs changing the CI pipeline before merging them to master.
+
+```shell
+# fetch changes from remote (GitHub) and push them to GitLab
+git fetch git@github.com:Rdatatable/data.table.git new_branch:new_branch
+git push
+# after updating on GitHub, pull changes from remote and push to GitLab
+git pull git@github.com:Rdatatable/data.table.git new_branch
+git push
+```
+
+Make sure to include a link to the pipeline results in your PR. 
@@ -1,12 +1,12 @@
-# #6107 fixed performance across 3 ways to specify a column as Date, test each individually
+# Test case adapted from https://github.com/Rdatatable/data.table/issues/6105#issue-2268691745 which is where the issue was reported.
+# https://github.com/Rdatatable/data.table/pull/6107 fixed performance across 3 ways to specify a column as Date, and we test each individually.
 extra.args.6107 <- c(
   "colClasses=list(Date='date')",
   "colClasses='Date'",
   "select=list(Date='date')")
 extra.test.list <- list()
 for (extra.arg in extra.args.6107){
   this.test <- atime::atime_test(
-    N = 10^seq(1, 7, by=0.25),
     setup = {
       set.seed(1)
       DT = data.table(date=.Date(sample(20000, N, replace=TRUE)))
@@ -19,6 +19,31 @@ for (extra.arg in extra.args.6107){
   extra.test.list[[sprintf("fread(%s) improved in #6107", extra.arg)]] <- this.test
 }
 
+# Test case adapted from https://github.com/Rdatatable/data.table/pull/4386#issue-602528139 which is where the performance was improved.
+for(retGrp_chr in c("T","F"))extra.test.list[[sprintf(
+  "forderv(retGrp=%s) improved in #4386", retGrp_chr
+)]] <- list(
+  setup = quote({
+    dt <- data.table(group = rep(1:2, l=N))
+  }),
+  expr = substitute({
+    old.opt <- options(datatable.forder.auto.index = TRUE) # required for test, un-documented, comments in forder.c say it is for debugging only.
+    data.table:::forderv(dt, "group", retGrp = RETGRP)
+    options(old.opt) # so the option does not affect other tests.
+  }, list(RETGRP=eval(str2lang(retGrp_chr)))),
+  ## From ?bench::mark, "Each expression will always run at least twice,
+  ## once to measure the memory allocation and store results
+  ## and one or more times to measure timing."
+  ## So for atime(times=10) that means 11 times total.
+  ## First time for memory allocation measurement,
+  ## (also sets the index of dt in this example),
+  ## then 10 more times for time measurement.
+  ## Timings should be constant if the cached index is used (Fast),
+  ## and (log-)linear if the index is re-computed (Slow).
+  Slow = "b1b1832b0d2d4032b46477d9fe6efb15006664f4", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/b0efcf59442a7d086c6df17fa6a45c81b082322e) in the PR (https://github.com/Rdatatable/data.table/pull/4386/commits) where the performance was improved.
+  Fast = "ffe431fbc1fe2d52ed9499f78e7e16eae4d71a93" # Last commit of the PR (https://github.com/Rdatatable/data.table/pull/4386/commits) where the performance was improved.
+)
+
 # A list of performance tests.
 #
 # See documentation in https://github.com/Rdatatable/data.table/wiki/Performance-testing for best practices.
@@ -40,6 +65,8 @@ for (extra.arg in extra.args.6107){
 # @note Please check https://github.com/tdhock/atime/blob/main/vignettes/data.table.Rmd for more information.
 # nolint start: undesirable_operator_linter. ':::' needed+appropriate here.
 test.list <- atime::atime_test_list(
+  # Common N and pkg.edit.fun are defined here, and inherited in all test cases below which do not re-define them.
+  N = as.integer(10^seq(1, 7, by=0.25)),
   # A function to customize R package metadata and source files to facilitate version-specific installation and testing.
   #
   # This is specifically tailored for handling data.table which requires specific changes in non-standard files (such as the object file name in Makevars and version checking code in onLoad.R)
@@ -96,10 +123,9 @@ test.list <- atime::atime_test_list(
       paste0('useDynLib(', new.Package_))
   },
 
-  # Performance regression discussed in: https://github.com/Rdatatable/data.table/issues/4311
-  # Fixed in: https://github.com/Rdatatable/data.table/pull/4440
+  # Performance regression discussed in https://github.com/Rdatatable/data.table/issues/4311
+  # Test case adapted from https://github.com/Rdatatable/data.table/pull/4440#issuecomment-632842980 which is the fix PR.
   "shallow regression fixed in #4440" = atime::atime_test(
-    N = 10^seq(3, 8),
     setup = {
       set.seed(1L)
       dt <- data.table(a = sample.int(N))
@@ -110,17 +136,16 @@ test.list <- atime::atime_test_list(
     Regression = "b1b1832b0d2d4032b46477d9fe6efb15006664f4", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/0f0e7127b880df8459b0ed064dc841acd22f5b73) in the PR (https://github.com/Rdatatable/data.table/pull/4440/commits) that fixes the regression
     Fixed = "9d3b9202fddb980345025a4f6ac451ed26a423be"), # Merge commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/4440)
 
-  # Test based on: https://github.com/Rdatatable/data.table/issues/5424
-  # Performance regression introduced from a commit in: https://github.com/Rdatatable/data.table/pull/4491
-  # Fixed in: https://github.com/Rdatatable/data.table/pull/5463
+  # Test based on https://github.com/Rdatatable/data.table/issues/5424
+  # Performance regression introduced from a commit in https://github.com/Rdatatable/data.table/pull/4491
+  # Test case adapted from https://github.com/Rdatatable/data.table/pull/5463#issue-1373642456 which is the fix PR.
   "memrecycle regression fixed in #5463" = atime::atime_test(
-    N = 10^seq(3, 8),
     setup = {
-      n <- N/100
+      bigN <- N*100
       set.seed(2L)
       dt <- data.table(
-        g = sample(seq_len(n), N, TRUE),
-        x = runif(N),
+        g = sample(seq_len(N), bigN, TRUE),
+        x = runif(bigN),
         key = "g")
       dt_mod <- copy(dt)
     },
@@ -129,10 +154,9 @@ test.list <- atime::atime_test_list(
     Regression = "e793f53466d99f86e70fc2611b708ae8c601a451", # Commit responsible for regression in the PR (https://github.com/Rdatatable/data.table/pull/4491/commits) that introduced the issue
     Fixed = "58409197426ced4714af842650b0cc3b9e2cb842"), # Last commit in the PR (https://github.com/Rdatatable/data.table/pull/5463/commits) that fixed the regression
 
-  # Issue reported in: https://github.com/Rdatatable/data.table/issues/5426
-  # To be fixed in: https://github.com/Rdatatable/data.table/pull/5427
+  # Issue reported in https://github.com/Rdatatable/data.table/issues/5426
+  # Test case adapted from https://github.com/Rdatatable/data.table/pull/5427#issue-1323678063 which is the fix PR.
   "setDT improved in #5427" = atime::atime_test(
-    N = 10^seq(1, 7),
     setup = {
       L <- replicate(N, 1, simplify = FALSE)
       setDT(L)
@@ -144,10 +168,9 @@ test.list <- atime::atime_test_list(
     Slow = "c4a2085e35689a108d67dacb2f8261e4964d7e12", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/7cc4da4c1c8e568f655ab5167922dcdb75953801) in the PR (https://github.com/Rdatatable/data.table/pull/5427/commits) that fixes the issue
     Fast = "af48a805e7a5026a0c2d0a7fd9b587fea5cfa3c4"), # Last commit in the PR (https://github.com/Rdatatable/data.table/pull/5427/commits) that fixes the issue
 
-  # Issue reported in: https://github.com/Rdatatable/data.table/issues/4200
-  # To be fixed in: https://github.com/Rdatatable/data.table/pull/4558
+  # Test case adapted from https://github.com/Rdatatable/data.table/issues/4200#issuecomment-645980224 which is where the issue was reported.
+  # Fixed in https://github.com/Rdatatable/data.table/pull/4558
   "DT[by] fixed in #4558" = atime::atime_test(
-    N = 10^seq(1, 20),
     setup = {
       d <- data.table(
         id = sample(c(seq.int(N * 0.9), sample(N * 0.9, N * 0.1, TRUE))),
@@ -160,10 +183,9 @@ test.list <- atime::atime_test_list(
     Regression = "c152ced0e5799acee1589910c69c1a2c6586b95d", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/15f0598b9828d3af2eb8ddc9b38e0356f42afe4f) in the PR (https://github.com/Rdatatable/data.table/pull/4558/commits) that fixes the regression
     Fixed = "f750448a2efcd258b3aba57136ee6a95ce56b302"), # Second commit of the PR (https://github.com/Rdatatable/data.table/pull/4558/commits) that fixes the regression
 
-  # Issue with sorting again when already sorted: https://github.com/Rdatatable/data.table/issues/4498
-  # Fixed in: https://github.com/Rdatatable/data.table/pull/4501
+  # Issue with sorting again when already sorted, as reported in https://github.com/Rdatatable/data.table/issues/4498
+  # Test case adapted from https://github.com/Rdatatable/data.table/pull/4501#issue-625311918 which is the fix PR.
   "DT[,.SD] improved in #4501" = atime::atime_test(
-    N = 10^seq(1, 10, by=0.5),
     setup = {
       set.seed(1)
       L = as.data.table(as.character(rnorm(N, 1, 0.5)))
@@ -175,10 +197,9 @@ test.list <- atime::atime_test_list(
     Slow = "3ca83738d70d5597d9e168077f3768e32569c790", # Circa 2024 master parent of close-to-last merge commit (https://github.com/Rdatatable/data.table/commit/353dc7a6b66563b61e44b2fa0d7b73a0f97ca461) in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue 
     Slower = "cacdc92df71b777369a217b6c902c687cf35a70d"), # Circa 2020 parent of the first commit (https://github.com/Rdatatable/data.table/commit/74636333d7da965a11dad04c322c752a409db098) in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue 
 
-  # Issue reported in: https://github.com/Rdatatable/data.table/issues/6286
-  # Fixed in: https://github.com/Rdatatable/data.table/pull/6296
-  "DT[by, verbose = TRUE] improved in #6296" = atime::atime_test(
-    N = 10^seq(1, 9),
+  # Test case adapted from https://github.com/Rdatatable/data.table/issues/6286#issue-2412141289 which is where the issue was reported.
+  # Fixed in https://github.com/Rdatatable/data.table/pull/6296
+  "DT[by,verbose=TRUE] improved in #6296" = atime::atime_test(
     setup = {
       dt = data.table(a = 1:N)
       dt_mod <- copy(dt)
@@ -187,9 +208,9 @@ test.list <- atime::atime_test_list(
     Slow = "a01f00f7438daf4612280d6886e6929fa8c8f76e", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/fc0c1e76408c34a8482f16f7421d262c7f1bde32) in the PR (https://github.com/Rdatatable/data.table/pull/6296/commits) that fixes the issue
     Fast = "f248bbe6d1204dfc8def62328788eaadcc8e17a1"), # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/6296) that fixes the issue
 
-  # Issue mentioned and fixed in: https://github.com/Rdatatable/data.table/pull/5493
+  # Test case adapted from https://github.com/Rdatatable/data.table/issues/5492#issue-1416598382 which is where the issue was reported,
+  # and from https://github.com/Rdatatable/data.table/pull/5493#issue-1416656788 which is the fix PR.
   "transform improved in #5493" = atime::atime_test(
-    N = 10^seq(1, 7),
     setup = {
       df <- data.frame(x = runif(N))
       dt <- as.data.table(df)
@@ -198,5 +219,32 @@ test.list <- atime::atime_test_list(
     Slow = "0895fa247afcf6b38044bd5f56c0d209691ddb31", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/93ce3ce1373bf733ebd2036e2883d2ffe377ab58) in the PR (https://github.com/Rdatatable/data.table/pull/5493/commits) that fixes the issue
     Fast = "2d1a0575f87cc50e90f64825c30d7a6cb6b05dd7"), # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/5493) that fixes the issue
 
+  # Test case created directly using the atime code below (not adapted from any other benchmark), based on the issue/fix PR https://github.com/Rdatatable/data.table/pull/5054#issue-930603663 "melt should be more efficient when there are missing input columns."
+  "melt improved in #5054" = atime::atime_test(
+    setup = {
+      DT <- as.data.table(as.list(1:N))
+      measure.vars <- lapply(1:N, function(i) {
+        x = rep(NA, N)
+        x[i] = i
+        x
+      })  
+    },
+    expr = data.table:::melt(DT, measure.vars = measure.vars),
+    Slow = "fd24a3105953f7785ea7414678ed8e04524e6955", # Parent of the merge commit (https://github.com/Rdatatable/data.table/commit/ed72e398df76a0fcfd134a4ad92356690e4210ea) of the PR (https://github.com/Rdatatable/data.table/pull/5054) that fixes the issue
+    Fast = "ed72e398df76a0fcfd134a4ad92356690e4210ea"), # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/5054) that fixes the issue
+
+  # Test case created from @tdhock's comment https://github.com/Rdatatable/data.table/pull/6393#issuecomment-2327396833, in turn adapted from @philippechataignon's comment https://github.com/Rdatatable/data.table/pull/6393#issuecomment-2326714012
+  "fwrite refactored in #6393" = atime::atime_test(
+    setup = {
+      set.seed(1)
+      NC = 10L
+      L <- data.table(i=1:N)
+      L[, paste0("V", 1:NC) := replicate(NC, rnorm(N), simplify=FALSE)]
+      out.csv <- tempfile()
+    },
+    expr = data.table::fwrite(L, out.csv, compress="gzip"),
+    Before = "f339aa64c426a9cd7cf2fcb13d91fc4ed353cd31", # Parent of the first commit https://github.com/Rdatatable/data.table/commit/fcc10d73a20837d0f1ad3278ee9168473afa5ff1 in the PR https://github.com/Rdatatable/data.table/pull/6393/commits with major change to fwrite with gzip.
+    PR = "3630413ae493a5a61b06c50e80d166924d2ef89a"), # Close-to-last merge commit in the PR.
+
   tests=extra.test.list)
 # nolint end: undesirable_operator_linter.
@@ -111,7 +111,7 @@ mirror.packages <-
 function(pkgs,
          which = c("Depends", "Imports", "LinkingTo"),
          repos = getOption("repos"),
-         type = c("source", "mac.binary", "win.binary"),
+         type = c("source", "mac.binary.big-sur-arm64", "win.binary"),
          repodir,
          except.repodir = repodir,
          except.priority = "base",
@@ -169,7 +169,8 @@ function(pkgs,
     newpkgs <- newpkgs[availpkgs]
   }
 
-  pkgsext <- switch(type,
+  typeshort <- if (startsWith(type, "mac.binary.")) "mac.binary" else type
+  pkgsext <- switch(typeshort,
                     "source" = "tar.gz",
                     "mac.binary" = "tgz",
                     "win.binary" = "zip")
@@ -181,7 +182,7 @@ function(pkgs,
   dp <- utils::download.packages(pkgs = newpkgs, destdir = destdir,
                                  available = db, contriburl = repos.url,
                                  type = type, method = method, quiet = quiet)
-  tools::write_PACKAGES(dir = destdir, type = type, ...)
+  tools::write_PACKAGES(dir = destdir, type = typeshort, ...)
   dp
 }
 
@@ -4,8 +4,9 @@
 #   2. Check the next line for a check like 'if (!x || !y)'
 alloc_linter = function(c_obj) {
   lines = c_obj$lines
-  # Be a bit more precise to avoid mentions in comments
-  alloc_lines = grep(R"{=\s*([(]\w+\s*[*][)])?[mc]alloc[(]}", lines)
+  # Be a bit more precise to avoid mentions in comments, and allow
+  #   malloc(0) to be used for convenience (e.g. #6757)
+  alloc_lines = grep(R"{=\s*([(]\w+\s*[*][)])?[mc]alloc[(][^0]}", lines)
   if (!length(alloc_lines)) return()
   # int *tmp=(int*)malloc(...); or just int tmp=malloc(...);
   alloc_keys = lines[alloc_lines] |>
 
@@ -0,0 +1,10 @@
+class1_linter = lintr::make_linter_from_xpath(
+  "
+    //OP-LEFT-BRACKET[
+      preceding-sibling::expr/expr/SYMBOL_FUNCTION_CALL[text() = 'class']
+      and following-sibling::expr/NUM_CONST[text() = '1' or text() = '1L']
+    ]
+      /parent::expr
+  ",
+  "Use class1(x) to get class(x)[1L], or classes1(x) to do so for a full list/data.table"
+)
@@ -27,7 +27,7 @@ format.bins <- function(ver, bin_ver, cran.home, os.type, pkg, version, repodir)
     plat.path = "windows"
   } else if (os.type=="macosx") {
     ext = "tgz"
-    plat.path = "macosx/el-capitan"
+    plat.path = "macosx/big-sur-arm64"
   } else stop("format.bins only valid for 'windows' or 'macosx' os.type")
   file = sprintf("bin/%s/contrib/%s/%s_%s.%s", plat.path, bin_ver, pkg, version, ext)
   fe = file.exists(file.path(repodir, file))
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,5 @@`
`1`	`1`	`.dir-locals.el`
	`2`	`+.check.translations.R`
`2`	`3`	`^\.Rprofile$`
`3`	`4`	`^data\.table_.*\.tar\.gz$`
`4`	`5`	`^config\.log$`