Skip to content

Commit 0358f99

Browse files
Merge branch 'master' into rchk
2 parents 860ef17 + 71ac34a commit 0358f99

File tree

155 files changed

+33081
-5748
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

155 files changed

+33081
-5748
lines changed

.Rbuildignore

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
^\.devcontainer$
1717
^\.graphics$
1818
^\.github$
19+
^\.vscode$
20+
^\.zed$
1921

2022
^\.gitlab-ci\.yml$
2123

@@ -26,6 +28,7 @@
2628
^src/Makevars$
2729
^CODEOWNERS$
2830
^GOVERNANCE\.md$
31+
^Seal_of_Approval\.md$
2932

3033
^\.RData$
3134
^\.Rhistory$
@@ -45,3 +48,6 @@
4548
^lib$
4649
^library$
4750
^devwd$
51+
52+
# only the inst/po compressed files are needed, not raw .pot/.po
53+
^po$

.ci/.lintr.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
dt_linters = new.env()
2-
for (f in list.files('.ci/linters', full.names=TRUE)) sys.source(f, dt_linters)
2+
for (f in list.files('.ci/linters/r', full.names=TRUE)) sys.source(f, dt_linters)
33
rm(f)
44

55
# NB: Could do this inside the linter definition, this separation makes those files more standardized

.ci/atime/tests.R

Lines changed: 132 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,53 @@
1+
# Test case adapted from https://github.com/Rdatatable/data.table/issues/6105#issue-2268691745 which is where the issue was reported.
2+
# https://github.com/Rdatatable/data.table/pull/6107 fixed performance across 3 ways to specify a column as Date, and we test each individually.
3+
extra.args.6107 <- c(
4+
"colClasses=list(Date='date')",
5+
"colClasses='Date'",
6+
"select=list(Date='date')")
7+
extra.test.list <- list()
8+
for (extra.arg in extra.args.6107){
9+
this.test <- atime::atime_test(
10+
setup = {
11+
set.seed(1)
12+
DT = data.table(date=.Date(sample(20000, N, replace=TRUE)))
13+
tmp_csv = tempfile()
14+
fwrite(DT, tmp_csv)
15+
},
16+
Slow = "e9087ce9860bac77c51467b19e92cf4b72ca78c7", # Parent of the merge commit (https://github.com/Rdatatable/data.table/commit/a77e8c22e44e904835d7b34b047df2eff069d1f2) of the PR (https://github.com/Rdatatable/data.table/pull/6107) that fixes the issue
17+
Fast = "a77e8c22e44e904835d7b34b047df2eff069d1f2") # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/6107) that fixes the issue
18+
this.test$expr = str2lang(sprintf("data.table::fread(tmp_csv, %s)", extra.arg))
19+
extra.test.list[[sprintf("fread(%s) improved in #6107", extra.arg)]] <- this.test
20+
}
21+
22+
# Test case adapted from https://github.com/Rdatatable/data.table/pull/4386#issue-602528139 which is where the performance was improved.
23+
for(retGrp_chr in c("T","F"))extra.test.list[[sprintf(
24+
"forderv(retGrp=%s) improved in #4386", retGrp_chr
25+
)]] <- list(
26+
setup = quote({
27+
dt <- data.table(group = rep(1:2, l=N))
28+
}),
29+
expr = substitute({
30+
old.opt <- options(datatable.forder.auto.index = TRUE) # required for test, un-documented, comments in forder.c say it is for debugging only.
31+
data.table:::forderv(dt, "group", retGrp = RETGRP)
32+
options(old.opt) # so the option does not affect other tests.
33+
}, list(RETGRP=eval(str2lang(retGrp_chr)))),
34+
## From ?bench::mark, "Each expression will always run at least twice,
35+
## once to measure the memory allocation and store results
36+
## and one or more times to measure timing."
37+
## So for atime(times=10) that means 11 times total.
38+
## First time for memory allocation measurement,
39+
## (also sets the index of dt in this example),
40+
## then 10 more times for time measurement.
41+
## Timings should be constant if the cached index is used (Fast),
42+
## and (log-)linear if the index is re-computed (Slow).
43+
Slow = "b1b1832b0d2d4032b46477d9fe6efb15006664f4", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/b0efcf59442a7d086c6df17fa6a45c81b082322e) in the PR (https://github.com/Rdatatable/data.table/pull/4386/commits) where the performance was improved.
44+
Fast = "ffe431fbc1fe2d52ed9499f78e7e16eae4d71a93" # Last commit of the PR (https://github.com/Rdatatable/data.table/pull/4386/commits) where the performance was improved.
45+
)
46+
147
# A list of performance tests.
248
#
49+
# See documentation in https://github.com/Rdatatable/data.table/wiki/Performance-testing for best practices.
50+
#
351
# Each entry in this list corresponds to a performance test and contains a sublist with three mandatory arguments:
452
# - N: A numeric sequence of data sizes to vary.
553
# - setup: An expression evaluated for every data size before measuring time/memory.
@@ -17,6 +65,8 @@
1765
# @note Please check https://github.com/tdhock/atime/blob/main/vignettes/data.table.Rmd for more information.
1866
# nolint start: undesirable_operator_linter. ':::' needed+appropriate here.
1967
test.list <- atime::atime_test_list(
68+
# Common N and pkg.edit.fun are defined here, and inherited in all test cases below which do not re-define them.
69+
N = as.integer(10^seq(1, 7, by=0.25)),
2070
# A function to customize R package metadata and source files to facilitate version-specific installation and testing.
2171
#
2272
# This is specifically tailored for handling data.table which requires specific changes in non-standard files (such as the object file name in Makevars and version checking code in onLoad.R)
@@ -73,10 +123,9 @@ test.list <- atime::atime_test_list(
73123
paste0('useDynLib(', new.Package_))
74124
},
75125

76-
# Performance regression discussed in: https://github.com/Rdatatable/data.table/issues/4311
77-
# Fixed in: https://github.com/Rdatatable/data.table/pull/4440
126+
# Performance regression discussed in https://github.com/Rdatatable/data.table/issues/4311
127+
# Test case adapted from https://github.com/Rdatatable/data.table/pull/4440#issuecomment-632842980 which is the fix PR.
78128
"shallow regression fixed in #4440" = atime::atime_test(
79-
N = 10^seq(3,8),
80129
setup = {
81130
set.seed(1L)
82131
dt <- data.table(a = sample.int(N))
@@ -87,29 +136,27 @@ test.list <- atime::atime_test_list(
87136
Regression = "b1b1832b0d2d4032b46477d9fe6efb15006664f4", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/0f0e7127b880df8459b0ed064dc841acd22f5b73) in the PR (https://github.com/Rdatatable/data.table/pull/4440/commits) that fixes the regression
88137
Fixed = "9d3b9202fddb980345025a4f6ac451ed26a423be"), # Merge commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/4440)
89138

90-
# Test based on: https://github.com/Rdatatable/data.table/issues/5424
91-
# Performance regression introduced from a commit in: https://github.com/Rdatatable/data.table/pull/4491
92-
# Fixed in: https://github.com/Rdatatable/data.table/pull/5463
139+
# Test based on https://github.com/Rdatatable/data.table/issues/5424
140+
# Performance regression introduced from a commit in https://github.com/Rdatatable/data.table/pull/4491
141+
# Test case adapted from https://github.com/Rdatatable/data.table/pull/5463#issue-1373642456 which is the fix PR.
93142
"memrecycle regression fixed in #5463" = atime::atime_test(
94-
N = 10^seq(3, 8),
95143
setup = {
96-
n <- N/100
144+
bigN <- N*100
97145
set.seed(2L)
98146
dt <- data.table(
99-
g = sample(seq_len(n), N, TRUE),
100-
x = runif(N),
147+
g = sample(seq_len(N), bigN, TRUE),
148+
x = runif(bigN),
101149
key = "g")
102150
dt_mod <- copy(dt)
103151
},
104152
expr = data.table:::`[.data.table`(dt_mod, , N := .N, by = g),
105-
Before = "be2f72e6f5c90622fe72e1c315ca05769a9dc854", # Parent of the regression causing commit (https://github.com/Rdatatable/data.table/commit/e793f53466d99f86e70fc2611b708ae8c601a451) in the PR that introduced the issue (https://github.com/Rdatatable/data.table/pull/4491/commits)
106-
Regression = "e793f53466d99f86e70fc2611b708ae8c601a451", # Commit responsible for regression in the PR that introduced the issue (https://github.com/Rdatatable/data.table/pull/4491/commits)
107-
Fixed = "58409197426ced4714af842650b0cc3b9e2cb842"), # Last commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/5463/commits)
153+
Before = "be2f72e6f5c90622fe72e1c315ca05769a9dc854", # Parent of the regression causing commit (https://github.com/Rdatatable/data.table/commit/e793f53466d99f86e70fc2611b708ae8c601a451) in the PR (https://github.com/Rdatatable/data.table/pull/4491/commits) that introduced the issue
154+
Regression = "e793f53466d99f86e70fc2611b708ae8c601a451", # Commit responsible for regression in the PR (https://github.com/Rdatatable/data.table/pull/4491/commits) that introduced the issue
155+
Fixed = "58409197426ced4714af842650b0cc3b9e2cb842"), # Last commit in the PR (https://github.com/Rdatatable/data.table/pull/5463/commits) that fixed the regression
108156

109-
# Issue reported in: https://github.com/Rdatatable/data.table/issues/5426
110-
# To be fixed in: https://github.com/Rdatatable/data.table/pull/5427
157+
# Issue reported in https://github.com/Rdatatable/data.table/issues/5426
158+
# Test case adapted from https://github.com/Rdatatable/data.table/pull/5427#issue-1323678063 which is the fix PR.
111159
"setDT improved in #5427" = atime::atime_test(
112-
N = 10^seq(1, 7),
113160
setup = {
114161
L <- replicate(N, 1, simplify = FALSE)
115162
setDT(L)
@@ -118,7 +165,73 @@ test.list <- atime::atime_test_list(
118165
data.table:::setattr(L, "class", NULL)
119166
data.table:::setDT(L)
120167
},
121-
Slow = "c4a2085e35689a108d67dacb2f8261e4964d7e12", # Parent of the first commit in the PR that fixes the issue (https://github.com/Rdatatable/data.table/commit/7cc4da4c1c8e568f655ab5167922dcdb75953801)
122-
Fast = "1872f473b20fdcddc5c1b35d79fe9229cd9a1d15") # Last commit in the PR that fixes the issue (https://github.com/Rdatatable/data.table/pull/5427/commits)
123-
)
168+
Slow = "c4a2085e35689a108d67dacb2f8261e4964d7e12", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/7cc4da4c1c8e568f655ab5167922dcdb75953801) in the PR (https://github.com/Rdatatable/data.table/pull/5427/commits) that fixes the issue
169+
Fast = "af48a805e7a5026a0c2d0a7fd9b587fea5cfa3c4"), # Last commit in the PR (https://github.com/Rdatatable/data.table/pull/5427/commits) that fixes the issue
170+
171+
# Test case adapted from https://github.com/Rdatatable/data.table/issues/4200#issuecomment-645980224 which is where the issue was reported.
172+
# Fixed in https://github.com/Rdatatable/data.table/pull/4558
173+
"DT[by] fixed in #4558" = atime::atime_test(
174+
setup = {
175+
d <- data.table(
176+
id = sample(c(seq.int(N * 0.9), sample(N * 0.9, N * 0.1, TRUE))),
177+
v1 = sample(5L, N, TRUE),
178+
v2 = sample(5L, N, TRUE)
179+
)
180+
},
181+
expr = data.table:::`[.data.table`(d, , max(v1) - min(v2), by = id),
182+
Before = "7a9eaf62ede487625200981018d8692be8c6f134", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/515de90a6068911a148e54343a3503043b8bb87c) in the PR (https://github.com/Rdatatable/data.table/pull/4164/commits) that introduced the regression
183+
Regression = "c152ced0e5799acee1589910c69c1a2c6586b95d", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/15f0598b9828d3af2eb8ddc9b38e0356f42afe4f) in the PR (https://github.com/Rdatatable/data.table/pull/4558/commits) that fixes the regression
184+
Fixed = "f750448a2efcd258b3aba57136ee6a95ce56b302"), # Second commit of the PR (https://github.com/Rdatatable/data.table/pull/4558/commits) that fixes the regression
185+
186+
# Issue with sorting again when already sorted, as reported in https://github.com/Rdatatable/data.table/issues/4498
187+
# Test case adapted from https://github.com/Rdatatable/data.table/pull/4501#issue-625311918 which is the fix PR.
188+
"DT[,.SD] improved in #4501" = atime::atime_test(
189+
setup = {
190+
set.seed(1)
191+
L = as.data.table(as.character(rnorm(N, 1, 0.5)))
192+
setkey(L, V1)
193+
},
194+
## New DT can safely retain key.
195+
expr = data.table:::`[.data.table`(L, , .SD),
196+
Fast = "353dc7a6b66563b61e44b2fa0d7b73a0f97ca461", # Close-to-last merge commit in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue
197+
Slow = "3ca83738d70d5597d9e168077f3768e32569c790", # Circa 2024 master parent of close-to-last merge commit (https://github.com/Rdatatable/data.table/commit/353dc7a6b66563b61e44b2fa0d7b73a0f97ca461) in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue
198+
Slower = "cacdc92df71b777369a217b6c902c687cf35a70d"), # Circa 2020 parent of the first commit (https://github.com/Rdatatable/data.table/commit/74636333d7da965a11dad04c322c752a409db098) in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue
199+
200+
# Test case adapted from https://github.com/Rdatatable/data.table/issues/6286#issue-2412141289 which is where the issue was reported.
201+
# Fixed in https://github.com/Rdatatable/data.table/pull/6296
202+
"DT[by,verbose=TRUE] improved in #6296" = atime::atime_test(
203+
setup = {
204+
dt = data.table(a = 1:N)
205+
dt_mod <- copy(dt)
206+
},
207+
expr = data.table:::`[.data.table`(dt_mod, , 1, by = a, verbose = TRUE),
208+
Slow = "a01f00f7438daf4612280d6886e6929fa8c8f76e", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/fc0c1e76408c34a8482f16f7421d262c7f1bde32) in the PR (https://github.com/Rdatatable/data.table/pull/6296/commits) that fixes the issue
209+
Fast = "f248bbe6d1204dfc8def62328788eaadcc8e17a1"), # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/6296) that fixes the issue
210+
211+
# Test case adapted from https://github.com/Rdatatable/data.table/issues/5492#issue-1416598382 which is where the issue was reported,
212+
# and from https://github.com/Rdatatable/data.table/pull/5493#issue-1416656788 which is the fix PR.
213+
"transform improved in #5493" = atime::atime_test(
214+
setup = {
215+
df <- data.frame(x = runif(N))
216+
dt <- as.data.table(df)
217+
},
218+
expr = data.table:::transform.data.table(dt, y = round(x)),
219+
Slow = "0895fa247afcf6b38044bd5f56c0d209691ddb31", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/93ce3ce1373bf733ebd2036e2883d2ffe377ab58) in the PR (https://github.com/Rdatatable/data.table/pull/5493/commits) that fixes the issue
220+
Fast = "2d1a0575f87cc50e90f64825c30d7a6cb6b05dd7"), # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/5493) that fixes the issue
221+
222+
# Test case created directly using the atime code below (not adapted from any other benchmark), based on the issue/fix PR https://github.com/Rdatatable/data.table/pull/5054#issue-930603663 "melt should be more efficient when there are missing input columns."
223+
"melt improved in #5054" = atime::atime_test(
224+
setup = {
225+
DT <- as.data.table(as.list(1:N))
226+
measure.vars <- lapply(1:N, function(i) {
227+
x = rep(NA, N)
228+
x[i] = i
229+
x
230+
})
231+
},
232+
expr = data.table:::melt(DT, measure.vars = measure.vars),
233+
Slow = "fd24a3105953f7785ea7414678ed8e04524e6955", # Parent of the merge commit (https://github.com/Rdatatable/data.table/commit/ed72e398df76a0fcfd134a4ad92356690e4210ea) of the PR (https://github.com/Rdatatable/data.table/pull/5054) that fixes the issue
234+
Fast = "ed72e398df76a0fcfd134a4ad92356690e4210ea"), # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/5054) that fixes the issue
235+
236+
tests=extra.test.list)
124237
# nolint end: undesirable_operator_linter.

.ci/ci.R

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ mirror.packages <-
111111
function(pkgs,
112112
which = c("Depends", "Imports", "LinkingTo"),
113113
repos = getOption("repos"),
114-
type = c("source", "mac.binary", "win.binary"),
114+
type = c("source", "mac.binary.big-sur-arm64", "win.binary"),
115115
repodir,
116116
except.repodir = repodir,
117117
except.priority = "base",
@@ -169,7 +169,8 @@ function(pkgs,
169169
newpkgs <- newpkgs[availpkgs]
170170
}
171171

172-
pkgsext <- switch(type,
172+
typeshort <- if (startsWith(type, "mac.binary.")) "mac.binary" else type
173+
pkgsext <- switch(typeshort,
173174
"source" = "tar.gz",
174175
"mac.binary" = "tgz",
175176
"win.binary" = "zip")
@@ -181,7 +182,7 @@ function(pkgs,
181182
dp <- utils::download.packages(pkgs = newpkgs, destdir = destdir,
182183
available = db, contriburl = repos.url,
183184
type = type, method = method, quiet = quiet)
184-
tools::write_PACKAGES(dir = destdir, type = type, ...)
185+
tools::write_PACKAGES(dir = destdir, type = typeshort, ...)
185186
dp
186187
}
187188

.ci/linters/c/alloc_linter.R

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# Ensure that we check the result of malloc()/calloc() for success
2+
# More specifically, given that this is an AST-ignorant checker,
3+
# 1. Find groups of malloc()/calloc() calls
4+
# 2. Check the next line for a check like 'if (!x || !y)'
5+
alloc_linter = function(c_obj) {
6+
lines = c_obj$lines
7+
# Be a bit more precise to avoid mentions in comments
8+
alloc_lines = grep(R"{=\s*([(]\w+\s*[*][)])?[mc]alloc[(]}", lines)
9+
if (!length(alloc_lines)) return()
10+
# int *tmp=(int*)malloc(...); or just int tmp=malloc(...);
11+
alloc_keys = lines[alloc_lines] |>
12+
strsplit(R"(\s*=\s*)") |>
13+
vapply(head, 1L, FUN.VALUE="") |>
14+
trimws() |>
15+
# just normalize the more exotic assignments, namely 'type *restrict key = ...'
16+
gsub(pattern = "[*]\\s*(restrict\\s*)?", replacement = "*") |>
17+
strsplit("*", fixed=TRUE) |>
18+
vapply(tail, 1L, FUN.VALUE="")
19+
alloc_grp_id = cumsum(c(TRUE, diff(alloc_lines) != 1L))
20+
21+
# execute by group
22+
tapply(seq_along(alloc_lines), alloc_grp_id, function(grp_idx) {
23+
keys_regex = paste0("\\s*!\\s*", alloc_keys[grp_idx], "\\s*", collapse = "[|][|]")
24+
check_regex = paste0("if\\s*\\(", keys_regex)
25+
check_line = lines[alloc_lines[tail(grp_idx, 1L)] + 1L]
26+
# Rarely (once in fread.c as of initialization), error checking is handled
27+
# but not immediately, so use 'NOCHECK' to escape.
28+
if (!grepl(check_regex, check_line) && !grepl("NOCHECK", check_line, fixed=TRUE)) {
29+
bad_lines_idx = seq(alloc_lines[grp_idx[1L]], length.out=length(grp_idx)+1L)
30+
cat("FILE: ", c_obj$path, "; LINES: ", head(bad_lines_idx, 1L), "-", tail(bad_lines_idx, 1L), "\n", sep="")
31+
writeLines(lines[bad_lines_idx])
32+
cat(strrep("-", max(nchar(lines[bad_lines_idx]))), "\n", sep="")
33+
stop("Expected the malloc()/calloc() usage above to be followed immediately by error checking.", call.=FALSE)
34+
}
35+
})
36+
}
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Ensure no calls to omp_set_nested() as
2+
# i) it's hard to fully honor OMP_THREAD_LIMIT as required by CRAN, and
3+
# ii) a simpler non-nested approach is always preferable if possible, as has been the case so far
4+
omp_set_nested_linter = function(c_obj) {
5+
idx = grep("omp_set_nested", c_obj$lines, fixed=TRUE)
6+
if (!length(idx)) return()
7+
stop(sprintf(
8+
"In %s, found omp_set_nested() usage, please reconsider:\n%s",
9+
c_obj$path, paste0(" ", format(idx), ":", c_obj$lines[idx], collapse = "\n")
10+
))
11+
}
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Ensure no calls to omp_set_num_threads() [to avoid affecting other packages and base R]
2+
# Only comments referring to it should be in openmp-utils.c
3+
omp_set_num_threads_linter = function(c_obj) {
4+
# strip comments, we only care if the function appears in actual code.
5+
idx = grep("omp_set_num_threads", c_obj$preprocessed, fixed = TRUE)
6+
if (!length(idx)) return()
7+
stop(sprintf(
8+
"In %s, found omp_set_num_threads() usage, which could affect other packages and base R:\n%s",
9+
c_obj$path, paste0(" ", format(idx), ":", c_obj$preprocessed[idx], collapse = "\n")
10+
))
11+
}

0 commit comments

Comments
 (0)