Skip to content

Commit 51666c4

Browse files
Merge branch 'master' into consistentcolreplacement
2 parents 3ab49af + 6641ca0 commit 51666c4

File tree

193 files changed

+39105
-6814
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

193 files changed

+39105
-6814
lines changed

.Rbuildignore

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
.dir-locals.el
2+
.check.translations.R
23
^\.Rprofile$
34
^data\.table_.*\.tar\.gz$
45
^config\.log$
@@ -16,6 +17,8 @@
1617
^\.devcontainer$
1718
^\.graphics$
1819
^\.github$
20+
^\.vscode$
21+
^\.zed$
1922

2023
^\.gitlab-ci\.yml$
2124

@@ -26,6 +29,7 @@
2629
^src/Makevars$
2730
^CODEOWNERS$
2831
^GOVERNANCE\.md$
32+
^Seal_of_Approval\.md$
2933

3034
^\.RData$
3135
^\.Rhistory$
@@ -45,3 +49,6 @@
4549
^lib$
4650
^library$
4751
^devwd$
52+
53+
# only the inst/po compressed files are needed, not raw .pot/.po
54+
^po$

.ci/.lintr.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
dt_linters = new.env()
2-
for (f in list.files('.ci/linters', full.names=TRUE)) sys.source(f, dt_linters)
2+
for (f in list.files('.ci/linters/r', full.names=TRUE)) sys.source(f, dt_linters)
33
rm(f)
44

55
# NB: Could do this inside the linter definition, this separation makes those files more standardized
@@ -84,7 +84,7 @@ exclusions = c(local({
8484
infix_spaces_linter = Inf,
8585
undesirable_function_linter = Inf
8686
)),
87-
exclusion_for_dir("vignettes", list(
87+
exclusion_for_dir(c("vignettes", "vignettes/fr"), list(
8888
quotes_linter = Inf,
8989
sample_int_linter = Inf
9090
# strings_as_factors_linter = Inf

.ci/README.md

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# data.table continuous integration and deployment
22

3-
On each Pull Request opened in GitHub we run GitHub Actions test jobs to provide prompt feedback about the status of PR. Our main CI pipeline runs on GitLab CI nightly. GitLab repository automatically mirrors our GitHub repository and runs pipeline on `master` branch every night. It tests more environments and different configurations. It publish variety of artifacts.
3+
On each Pull Request opened in GitHub we run GitHub Actions test jobs to provide prompt feedback about the status of PR. Our more thorough main CI pipeline runs nightly on GitLab CI. GitLab repository automatically mirrors our GitHub repository and runs pipeline on `master` branch every night. It tests more environments and different configurations. It publishes a variety of artifacts such as our [homepage](https://rdatatable.gitlab.io/data.table/) and [CRAN-like website for dev version](https://rdatatable.gitlab.io/data.table/web/packages/data.table/index.html), including windows binaries for the dev version.
44

55
## Environments
66

@@ -44,3 +44,22 @@ Base R implemented helper script, [originally proposed to base R](https://svn.r-
4444
### [`publish.R`](./publish.R)
4545

4646
Base R implemented helper script to orchestrate generation of most artifacts and to arrange them nicely. It is being used only in [_integration_ stage in GitLab CI pipeline](./../.gitlab-ci.yml).
47+
48+
## GitLab Open Source Program
49+
50+
We are currently part of the [GitLab for Open Source Program](https://about.gitlab.com/solutions/open-source/). This gives us 50,000 compute minutes per month for our GitLab CI. Our license needs to be renewed yearly (around July) and is currently managed by @ben-schwen.
51+
52+
## Updating CI pipeline
53+
54+
Basic CI checks are also run on every push to the GitLab repository. This can **and should** be used for PRs changing the CI pipeline before merging them to master.
55+
56+
```shell
57+
# fetch changes from remote (GitHub) and push them to GitLab
58+
git fetch [email protected]:Rdatatable/data.table.git new_branch:new_branch
59+
git push
60+
# after updating on GitHub, pull changes from remote and push to GitLab
61+
git pull [email protected]:Rdatatable/data.table.git new_branch
62+
git push
63+
```
64+
65+
Make sure to include a link to the pipeline results in your PR.

.ci/atime/tests.R

Lines changed: 145 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,53 @@
1+
# Test case adapted from https://github.com/Rdatatable/data.table/issues/6105#issue-2268691745 which is where the issue was reported.
2+
# https://github.com/Rdatatable/data.table/pull/6107 fixed performance across 3 ways to specify a column as Date, and we test each individually.
3+
extra.args.6107 <- c(
4+
"colClasses=list(Date='date')",
5+
"colClasses='Date'",
6+
"select=list(Date='date')")
7+
extra.test.list <- list()
8+
for (extra.arg in extra.args.6107){
9+
this.test <- atime::atime_test(
10+
setup = {
11+
set.seed(1)
12+
DT = data.table(date=.Date(sample(20000, N, replace=TRUE)))
13+
tmp_csv = tempfile()
14+
fwrite(DT, tmp_csv)
15+
},
16+
Slow = "e9087ce9860bac77c51467b19e92cf4b72ca78c7", # Parent of the merge commit (https://github.com/Rdatatable/data.table/commit/a77e8c22e44e904835d7b34b047df2eff069d1f2) of the PR (https://github.com/Rdatatable/data.table/pull/6107) that fixes the issue
17+
Fast = "a77e8c22e44e904835d7b34b047df2eff069d1f2") # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/6107) that fixes the issue
18+
this.test$expr = str2lang(sprintf("data.table::fread(tmp_csv, %s)", extra.arg))
19+
extra.test.list[[sprintf("fread(%s) improved in #6107", extra.arg)]] <- this.test
20+
}
21+
22+
# Test case adapted from https://github.com/Rdatatable/data.table/pull/4386#issue-602528139 which is where the performance was improved.
23+
for(retGrp_chr in c("T","F"))extra.test.list[[sprintf(
24+
"forderv(retGrp=%s) improved in #4386", retGrp_chr
25+
)]] <- list(
26+
setup = quote({
27+
dt <- data.table(group = rep(1:2, l=N))
28+
}),
29+
expr = substitute({
30+
old.opt <- options(datatable.forder.auto.index = TRUE) # required for test, un-documented, comments in forder.c say it is for debugging only.
31+
data.table:::forderv(dt, "group", retGrp = RETGRP)
32+
options(old.opt) # so the option does not affect other tests.
33+
}, list(RETGRP=eval(str2lang(retGrp_chr)))),
34+
## From ?bench::mark, "Each expression will always run at least twice,
35+
## once to measure the memory allocation and store results
36+
## and one or more times to measure timing."
37+
## So for atime(times=10) that means 11 times total.
38+
## First time for memory allocation measurement,
39+
## (also sets the index of dt in this example),
40+
## then 10 more times for time measurement.
41+
## Timings should be constant if the cached index is used (Fast),
42+
## and (log-)linear if the index is re-computed (Slow).
43+
Slow = "b1b1832b0d2d4032b46477d9fe6efb15006664f4", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/b0efcf59442a7d086c6df17fa6a45c81b082322e) in the PR (https://github.com/Rdatatable/data.table/pull/4386/commits) where the performance was improved.
44+
Fast = "ffe431fbc1fe2d52ed9499f78e7e16eae4d71a93" # Last commit of the PR (https://github.com/Rdatatable/data.table/pull/4386/commits) where the performance was improved.
45+
)
46+
147
# A list of performance tests.
248
#
49+
# See documentation in https://github.com/Rdatatable/data.table/wiki/Performance-testing for best practices.
50+
#
351
# Each entry in this list corresponds to a performance test and contains a sublist with three mandatory arguments:
452
# - N: A numeric sequence of data sizes to vary.
553
# - setup: An expression evaluated for every data size before measuring time/memory.
@@ -17,6 +65,8 @@
1765
# @note Please check https://github.com/tdhock/atime/blob/main/vignettes/data.table.Rmd for more information.
1866
# nolint start: undesirable_operator_linter. ':::' needed+appropriate here.
1967
test.list <- atime::atime_test_list(
68+
# Common N and pkg.edit.fun are defined here, and inherited in all test cases below which do not re-define them.
69+
N = as.integer(10^seq(1, 7, by=0.25)),
2070
# A function to customize R package metadata and source files to facilitate version-specific installation and testing.
2171
#
2272
# This is specifically tailored for handling data.table which requires specific changes in non-standard files (such as the object file name in Makevars and version checking code in onLoad.R)
@@ -73,10 +123,9 @@ test.list <- atime::atime_test_list(
73123
paste0('useDynLib(', new.Package_))
74124
},
75125

76-
# Performance regression discussed in: https://github.com/Rdatatable/data.table/issues/4311
77-
# Fixed in: https://github.com/Rdatatable/data.table/pull/4440
126+
# Performance regression discussed in https://github.com/Rdatatable/data.table/issues/4311
127+
# Test case adapted from https://github.com/Rdatatable/data.table/pull/4440#issuecomment-632842980 which is the fix PR.
78128
"shallow regression fixed in #4440" = atime::atime_test(
79-
N = 10^seq(3,8),
80129
setup = {
81130
set.seed(1L)
82131
dt <- data.table(a = sample.int(N))
@@ -87,29 +136,27 @@ test.list <- atime::atime_test_list(
87136
Regression = "b1b1832b0d2d4032b46477d9fe6efb15006664f4", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/0f0e7127b880df8459b0ed064dc841acd22f5b73) in the PR (https://github.com/Rdatatable/data.table/pull/4440/commits) that fixes the regression
88137
Fixed = "9d3b9202fddb980345025a4f6ac451ed26a423be"), # Merge commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/4440)
89138

90-
# Test based on: https://github.com/Rdatatable/data.table/issues/5424
91-
# Performance regression introduced from a commit in: https://github.com/Rdatatable/data.table/pull/4491
92-
# Fixed in: https://github.com/Rdatatable/data.table/pull/5463
139+
# Test based on https://github.com/Rdatatable/data.table/issues/5424
140+
# Performance regression introduced from a commit in https://github.com/Rdatatable/data.table/pull/4491
141+
# Test case adapted from https://github.com/Rdatatable/data.table/pull/5463#issue-1373642456 which is the fix PR.
93142
"memrecycle regression fixed in #5463" = atime::atime_test(
94-
N = 10^seq(3, 8),
95143
setup = {
96-
n <- N/100
144+
bigN <- N*100
97145
set.seed(2L)
98146
dt <- data.table(
99-
g = sample(seq_len(n), N, TRUE),
100-
x = runif(N),
147+
g = sample(seq_len(N), bigN, TRUE),
148+
x = runif(bigN),
101149
key = "g")
102150
dt_mod <- copy(dt)
103151
},
104152
expr = data.table:::`[.data.table`(dt_mod, , N := .N, by = g),
105-
Before = "be2f72e6f5c90622fe72e1c315ca05769a9dc854", # Parent of the regression causing commit (https://github.com/Rdatatable/data.table/commit/e793f53466d99f86e70fc2611b708ae8c601a451) in the PR that introduced the issue (https://github.com/Rdatatable/data.table/pull/4491/commits)
106-
Regression = "e793f53466d99f86e70fc2611b708ae8c601a451", # Commit responsible for regression in the PR that introduced the issue (https://github.com/Rdatatable/data.table/pull/4491/commits)
107-
Fixed = "58409197426ced4714af842650b0cc3b9e2cb842"), # Last commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/5463/commits)
153+
Before = "be2f72e6f5c90622fe72e1c315ca05769a9dc854", # Parent of the regression causing commit (https://github.com/Rdatatable/data.table/commit/e793f53466d99f86e70fc2611b708ae8c601a451) in the PR (https://github.com/Rdatatable/data.table/pull/4491/commits) that introduced the issue
154+
Regression = "e793f53466d99f86e70fc2611b708ae8c601a451", # Commit responsible for regression in the PR (https://github.com/Rdatatable/data.table/pull/4491/commits) that introduced the issue
155+
Fixed = "58409197426ced4714af842650b0cc3b9e2cb842"), # Last commit in the PR (https://github.com/Rdatatable/data.table/pull/5463/commits) that fixed the regression
108156

109-
# Issue reported in: https://github.com/Rdatatable/data.table/issues/5426
110-
# To be fixed in: https://github.com/Rdatatable/data.table/pull/5427
157+
# Issue reported in https://github.com/Rdatatable/data.table/issues/5426
158+
# Test case adapted from https://github.com/Rdatatable/data.table/pull/5427#issue-1323678063 which is the fix PR.
111159
"setDT improved in #5427" = atime::atime_test(
112-
N = 10^seq(1, 7),
113160
setup = {
114161
L <- replicate(N, 1, simplify = FALSE)
115162
setDT(L)
@@ -118,7 +165,86 @@ test.list <- atime::atime_test_list(
118165
data.table:::setattr(L, "class", NULL)
119166
data.table:::setDT(L)
120167
},
121-
Slow = "c4a2085e35689a108d67dacb2f8261e4964d7e12", # Parent of the first commit in the PR that fixes the issue (https://github.com/Rdatatable/data.table/commit/7cc4da4c1c8e568f655ab5167922dcdb75953801)
122-
Fast = "1872f473b20fdcddc5c1b35d79fe9229cd9a1d15") # Last commit in the PR that fixes the issue (https://github.com/Rdatatable/data.table/pull/5427/commits)
123-
)
168+
Slow = "c4a2085e35689a108d67dacb2f8261e4964d7e12", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/7cc4da4c1c8e568f655ab5167922dcdb75953801) in the PR (https://github.com/Rdatatable/data.table/pull/5427/commits) that fixes the issue
169+
Fast = "af48a805e7a5026a0c2d0a7fd9b587fea5cfa3c4"), # Last commit in the PR (https://github.com/Rdatatable/data.table/pull/5427/commits) that fixes the issue
170+
171+
# Test case adapted from https://github.com/Rdatatable/data.table/issues/4200#issuecomment-645980224 which is where the issue was reported.
172+
# Fixed in https://github.com/Rdatatable/data.table/pull/4558
173+
"DT[by] fixed in #4558" = atime::atime_test(
174+
setup = {
175+
d <- data.table(
176+
id = sample(c(seq.int(N * 0.9), sample(N * 0.9, N * 0.1, TRUE))),
177+
v1 = sample(5L, N, TRUE),
178+
v2 = sample(5L, N, TRUE)
179+
)
180+
},
181+
expr = data.table:::`[.data.table`(d, , max(v1) - min(v2), by = id),
182+
Before = "7a9eaf62ede487625200981018d8692be8c6f134", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/515de90a6068911a148e54343a3503043b8bb87c) in the PR (https://github.com/Rdatatable/data.table/pull/4164/commits) that introduced the regression
183+
Regression = "c152ced0e5799acee1589910c69c1a2c6586b95d", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/15f0598b9828d3af2eb8ddc9b38e0356f42afe4f) in the PR (https://github.com/Rdatatable/data.table/pull/4558/commits) that fixes the regression
184+
Fixed = "f750448a2efcd258b3aba57136ee6a95ce56b302"), # Second commit of the PR (https://github.com/Rdatatable/data.table/pull/4558/commits) that fixes the regression
185+
186+
# Issue with sorting again when already sorted, as reported in https://github.com/Rdatatable/data.table/issues/4498
187+
# Test case adapted from https://github.com/Rdatatable/data.table/pull/4501#issue-625311918 which is the fix PR.
188+
"DT[,.SD] improved in #4501" = atime::atime_test(
189+
setup = {
190+
set.seed(1)
191+
L = as.data.table(as.character(rnorm(N, 1, 0.5)))
192+
setkey(L, V1)
193+
},
194+
## New DT can safely retain key.
195+
expr = data.table:::`[.data.table`(L, , .SD),
196+
Fast = "353dc7a6b66563b61e44b2fa0d7b73a0f97ca461", # Close-to-last merge commit in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue
197+
Slow = "3ca83738d70d5597d9e168077f3768e32569c790", # Circa 2024 master parent of close-to-last merge commit (https://github.com/Rdatatable/data.table/commit/353dc7a6b66563b61e44b2fa0d7b73a0f97ca461) in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue
198+
Slower = "cacdc92df71b777369a217b6c902c687cf35a70d"), # Circa 2020 parent of the first commit (https://github.com/Rdatatable/data.table/commit/74636333d7da965a11dad04c322c752a409db098) in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue
199+
200+
# Test case adapted from https://github.com/Rdatatable/data.table/issues/6286#issue-2412141289 which is where the issue was reported.
201+
# Fixed in https://github.com/Rdatatable/data.table/pull/6296
202+
"DT[by,verbose=TRUE] improved in #6296" = atime::atime_test(
203+
setup = {
204+
dt = data.table(a = 1:N)
205+
dt_mod <- copy(dt)
206+
},
207+
expr = data.table:::`[.data.table`(dt_mod, , 1, by = a, verbose = TRUE),
208+
Slow = "a01f00f7438daf4612280d6886e6929fa8c8f76e", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/fc0c1e76408c34a8482f16f7421d262c7f1bde32) in the PR (https://github.com/Rdatatable/data.table/pull/6296/commits) that fixes the issue
209+
Fast = "f248bbe6d1204dfc8def62328788eaadcc8e17a1"), # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/6296) that fixes the issue
210+
211+
# Test case adapted from https://github.com/Rdatatable/data.table/issues/5492#issue-1416598382 which is where the issue was reported,
212+
# and from https://github.com/Rdatatable/data.table/pull/5493#issue-1416656788 which is the fix PR.
213+
"transform improved in #5493" = atime::atime_test(
214+
setup = {
215+
df <- data.frame(x = runif(N))
216+
dt <- as.data.table(df)
217+
},
218+
expr = data.table:::transform.data.table(dt, y = round(x)),
219+
Slow = "0895fa247afcf6b38044bd5f56c0d209691ddb31", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/93ce3ce1373bf733ebd2036e2883d2ffe377ab58) in the PR (https://github.com/Rdatatable/data.table/pull/5493/commits) that fixes the issue
220+
Fast = "2d1a0575f87cc50e90f64825c30d7a6cb6b05dd7"), # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/5493) that fixes the issue
221+
222+
# Test case created directly using the atime code below (not adapted from any other benchmark), based on the issue/fix PR https://github.com/Rdatatable/data.table/pull/5054#issue-930603663 "melt should be more efficient when there are missing input columns."
223+
"melt improved in #5054" = atime::atime_test(
224+
setup = {
225+
DT <- as.data.table(as.list(1:N))
226+
measure.vars <- lapply(1:N, function(i) {
227+
x = rep(NA, N)
228+
x[i] = i
229+
x
230+
})
231+
},
232+
expr = data.table:::melt(DT, measure.vars = measure.vars),
233+
Slow = "fd24a3105953f7785ea7414678ed8e04524e6955", # Parent of the merge commit (https://github.com/Rdatatable/data.table/commit/ed72e398df76a0fcfd134a4ad92356690e4210ea) of the PR (https://github.com/Rdatatable/data.table/pull/5054) that fixes the issue
234+
Fast = "ed72e398df76a0fcfd134a4ad92356690e4210ea"), # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/5054) that fixes the issue # Test case created directly using the atime code below (not adapted from any other benchmark), based on the issue/fix PR https://github.com/Rdatatable/data.table/pull/5054#issue-930603663 "melt should be more efficient when there are missing input columns."
235+
236+
# Test case created from @tdhock's comment https://github.com/Rdatatable/data.table/pull/6393#issuecomment-2327396833, in turn adapted from @philippechataignon's comment https://github.com/Rdatatable/data.table/pull/6393#issuecomment-2326714012
237+
"fwrite refactored in #6393" = atime::atime_test(
238+
setup = {
239+
set.seed(1)
240+
NC = 10L
241+
L <- data.table(i=1:N)
242+
L[, paste0("V", 1:NC) := replicate(NC, rnorm(N), simplify=FALSE)]
243+
out.csv <- tempfile()
244+
},
245+
expr = data.table::fwrite(L, out.csv, compress="gzip"),
246+
Before = "f339aa64c426a9cd7cf2fcb13d91fc4ed353cd31", # Parent of the first commit https://github.com/Rdatatable/data.table/commit/fcc10d73a20837d0f1ad3278ee9168473afa5ff1 in the PR https://github.com/Rdatatable/data.table/pull/6393/commits with major change to fwrite with gzip.
247+
PR = "3630413ae493a5a61b06c50e80d166924d2ef89a"), # Close-to-last merge commit in the PR.
248+
249+
tests=extra.test.list)
124250
# nolint end: undesirable_operator_linter.

.ci/ci.R

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ mirror.packages <-
111111
function(pkgs,
112112
which = c("Depends", "Imports", "LinkingTo"),
113113
repos = getOption("repos"),
114-
type = c("source", "mac.binary", "win.binary"),
114+
type = c("source", "mac.binary.big-sur-arm64", "win.binary"),
115115
repodir,
116116
except.repodir = repodir,
117117
except.priority = "base",
@@ -169,7 +169,8 @@ function(pkgs,
169169
newpkgs <- newpkgs[availpkgs]
170170
}
171171

172-
pkgsext <- switch(type,
172+
typeshort <- if (startsWith(type, "mac.binary.")) "mac.binary" else type
173+
pkgsext <- switch(typeshort,
173174
"source" = "tar.gz",
174175
"mac.binary" = "tgz",
175176
"win.binary" = "zip")
@@ -181,7 +182,7 @@ function(pkgs,
181182
dp <- utils::download.packages(pkgs = newpkgs, destdir = destdir,
182183
available = db, contriburl = repos.url,
183184
type = type, method = method, quiet = quiet)
184-
tools::write_PACKAGES(dir = destdir, type = type, ...)
185+
tools::write_PACKAGES(dir = destdir, type = typeshort, ...)
185186
dp
186187
}
187188

0 commit comments

Comments
 (0)