Skip to content

Commit d9ec26d

Browse files
Merge branch 'master' into openmp-doc
2 parents 653fc7a + c3ba954 commit d9ec26d

File tree

139 files changed

+21509
-5677
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

139 files changed

+21509
-5677
lines changed

.Rbuildignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
^\.devcontainer$
1717
^\.graphics$
1818
^\.github$
19+
^\.zed$
1920

2021
^\.gitlab-ci\.yml$
2122

@@ -26,6 +27,7 @@
2627
^src/Makevars$
2728
^CODEOWNERS$
2829
^GOVERNANCE\.md$
30+
^Seal_of_Approval\.md$
2931

3032
^\.RData$
3133
^\.Rhistory$
@@ -45,3 +47,6 @@
4547
^lib$
4648
^library$
4749
^devwd$
50+
51+
# only the inst/po compressed files are needed, not raw .pot/.po
52+
^po$

.ci/.lintr.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
dt_linters = new.env()
2-
for (f in list.files('.ci/linters', full.names=TRUE)) sys.source(f, dt_linters)
2+
for (f in list.files('.ci/linters/r', full.names=TRUE)) sys.source(f, dt_linters)
33
rm(f)
44

55
# NB: Could do this inside the linter definition, this separation makes those files more standardized

.ci/atime/tests.R

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
# A list of performance tests.
22
#
3+
# See documentation in https://github.com/Rdatatable/data.table/wiki/Performance-testing for best practices.
4+
#
35
# Each entry in this list corresponds to a performance test and contains a sublist with three mandatory arguments:
46
# - N: A numeric sequence of data sizes to vary.
57
# - setup: An expression evaluated for every data size before measuring time/memory.
@@ -119,6 +121,42 @@ test.list <- atime::atime_test_list(
119121
data.table:::setDT(L)
120122
},
121123
Slow = "c4a2085e35689a108d67dacb2f8261e4964d7e12", # Parent of the first commit in the PR that fixes the issue (https://github.com/Rdatatable/data.table/commit/7cc4da4c1c8e568f655ab5167922dcdb75953801)
122-
Fast = "1872f473b20fdcddc5c1b35d79fe9229cd9a1d15") # Last commit in the PR that fixes the issue (https://github.com/Rdatatable/data.table/pull/5427/commits)
123-
)
124+
Fast = "1872f473b20fdcddc5c1b35d79fe9229cd9a1d15"), # Last commit in the PR that fixes the issue (https://github.com/Rdatatable/data.table/pull/5427/commits)
125+
126+
# Issue reported in: https://github.com/Rdatatable/data.table/issues/4200
127+
# To be fixed in: https://github.com/Rdatatable/data.table/pull/4558
128+
"DT[by] fixed in #4558" = atime::atime_test(
129+
N = 10^seq(1, 20),
130+
setup = {
131+
d <- data.table(
132+
id3 = sample(c(seq.int(N*0.9), sample( N*0.9, N*0.1, TRUE))),
133+
v1 = sample(5L, N, TRUE),
134+
v2 = sample(5L, N, TRUE)
135+
)
136+
},
137+
expr = {
138+
expr=data.table:::`[.data.table`(d, , max(v1) - min(v2), by = id3)
139+
},
140+
Before = "7a9eaf62ede487625200981018d8692be8c6f134", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/515de90a6068911a148e54343a3503043b8bb87c) in the PR (https://github.com/Rdatatable/data.table/pull/4164/commits) that introduced the regression
141+
Regression = "c152ced0e5799acee1589910c69c1a2c6586b95d", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/15f0598b9828d3af2eb8ddc9b38e0356f42afe4f) in the PR (https://github.com/Rdatatable/data.table/pull/4558/commits) that fixes the regression
142+
Fixed = "f750448a2efcd258b3aba57136ee6a95ce56b302"), # Second commit of the PR (https://github.com/Rdatatable/data.table/pull/4558/commits) that fixes the regression
143+
144+
# Issue with sorting again when already sorted: https://github.com/Rdatatable/data.table/issues/4498
145+
# Fixed in: https://github.com/Rdatatable/data.table/pull/4501
146+
"DT[,.SD] improved in #4501" = atime::atime_test(
147+
N = 10^seq(1, 10, by=0.5),
148+
setup = {
149+
set.seed(1)
150+
L = as.data.table(as.character(rnorm(N, 1, 0.5)))
151+
setkey(L, V1)
152+
},
153+
## New DT can safely retain key.
154+
expr = {
155+
data.table:::`[.data.table`(L, , .SD)
156+
},
157+
Fast = "353dc7a6b66563b61e44b2fa0d7b73a0f97ca461", # Close-to-last merge commit in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue
158+
Slow = "3ca83738d70d5597d9e168077f3768e32569c790", # Circa 2024 master parent of close-to-last merge commit (https://github.com/Rdatatable/data.table/commit/353dc7a6b66563b61e44b2fa0d7b73a0f97ca461) in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue
159+
Slower = "cacdc92df71b777369a217b6c902c687cf35a70d"), # Circa 2020 parent of the first commit (https://github.com/Rdatatable/data.table/commit/74636333d7da965a11dad04c322c752a409db098) in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue
160+
161+
NULL)
124162
# nolint end: undesirable_operator_linter.

.ci/linters/c/alloc_linter.R

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# Ensure that we check the result of malloc()/calloc() for success.
# More specifically, given that this is an AST-ignorant, line-based checker:
#   1. Find groups of consecutive lines assigning from malloc()/calloc()
#   2. Check the line right after each group for a test like 'if (!x || !y)'
#      covering every assigned name, in order.
#
# c_obj: list with at least 'lines' (character vector, the lines of a C source
#   file) and 'path' (only used in the failure report).
# Returns NULL when no allocation is found, otherwise the result of tapply()
#   over the allocation groups. Prints the offending lines and signals an error
#   for the first group that is not immediately followed by a check.
alloc_linter = function(c_obj) {
  lines = c_obj$lines
  # Be a bit more precise to avoid mentions in comments
  alloc_lines = grep(R"{=\s*([(]\w+\s*[*][)])?[mc]alloc[(]}", lines)
  if (!length(alloc_lines)) return()
  # int *tmp=(int*)malloc(...); or just int tmp=malloc(...);
  alloc_keys = lines[alloc_lines] |>
    strsplit(R"(\s*=\s*)") |>
    vapply(head, 1L, FUN.VALUE="") |>
    trimws() |>
    # just normalize the more exotic assignments, namely 'type *restrict key = ...'
    gsub(pattern = "[*]\\s*(restrict\\s*)?", replacement = "*") |>
    strsplit("*", fixed=TRUE) |>
    vapply(tail, 1L, FUN.VALUE="")
  # The keys are spliced into a regex below, so quote regex metacharacters;
  #   otherwise an lvalue such as 'buf[i]' is read as a character class and
  #   can never match its own 'if (!buf[i])' check.
  alloc_keys = gsub("([\\\\.|()\\[\\]{}^$*+?])", "\\\\\\1", alloc_keys, perl=TRUE)
  # consecutive allocation lines share one group id
  alloc_grp_id = cumsum(c(TRUE, diff(alloc_lines) != 1L))

  # execute by group
  tapply(seq_along(alloc_lines), alloc_grp_id, function(grp_idx) {
    keys_regex = paste0("\\s*!\\s*", alloc_keys[grp_idx], "\\s*", collapse = "[|][|]")
    check_regex = paste0("if\\s*\\(", keys_regex)
    check_line = lines[alloc_lines[tail(grp_idx, 1L)] + 1L]
    # Rarely (once in fread.c as of initialization), error checking is handled
    #   but not immediately, so use 'NOCHECK' to escape.
    if (!grepl(check_regex, check_line) && !grepl("NOCHECK", check_line, fixed=TRUE)) {
      bad_lines_idx = seq(alloc_lines[grp_idx[1L]], length.out=length(grp_idx)+1L)
      # an allocation on the very last line has no following line to show
      bad_lines_idx = bad_lines_idx[bad_lines_idx <= length(lines)]
      cat("FILE: ", c_obj$path, "; LINES: ", head(bad_lines_idx, 1L), "-", tail(bad_lines_idx, 1L), "\n", sep="")
      writeLines(lines[bad_lines_idx])
      cat(strrep("-", max(nchar(lines[bad_lines_idx]))), "\n", sep="")
      stop("Expected the malloc()/calloc() usage above to be followed immediately by error checking.", call.=FALSE)
    }
  })
}
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Forbid any mention of omp_set_nested(), because
#   i) it's hard to fully honor OMP_THREAD_LIMIT as required by CRAN, and
#   ii) a simpler non-nested approach is always preferable if possible, as has been the case so far
#
# c_obj: list with 'path' and 'lines' (character vector of source lines).
# Returns NULL when nothing is found; otherwise signals an error that lists
#   every matching line together with its line number.
omp_set_nested_linter = function(c_obj) {
  src = c_obj$lines
  hits = which(grepl("omp_set_nested", src, fixed=TRUE))
  if (length(hits) == 0L) return()
  listing = paste0(" ", format(hits), ":", src[hits], collapse = "\n")
  stop(sprintf(
    "In %s, found omp_set_nested() usage, please reconsider:\n%s",
    c_obj$path, listing
  ))
}
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Forbid calls to omp_set_num_threads() [to avoid affecting other packages and base R]
# Only comments referring to it should be in openmp-utils.c
#
# c_obj: list with 'path' and 'preprocessed' (character vector; presumably the
#   source with comments already stripped -- that step is not done here).
# Returns NULL when nothing is found; otherwise signals an error that lists
#   every matching line together with its index in 'preprocessed'.
omp_set_num_threads_linter = function(c_obj) {
  # search 'preprocessed' rather than raw lines: we only care if the function appears in actual code.
  code = c_obj$preprocessed
  hits = which(grepl("omp_set_num_threads", code, fixed = TRUE))
  if (length(hits) == 0L) return()
  listing = paste0(" ", format(hits), ":", code[hits], collapse = "\n")
  stop(sprintf(
    "In %s, found omp_set_num_threads() usage, which could affect other packages and base R:\n%s",
    c_obj$path, listing
  ))
}

.ci/linters/md/news_linter.R

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
any_mismatch = FALSE
2+
3+
# Ensure that the numbered list in each section is in sequence (1, 2, 3, ...).
#
# news: character vector, the lines of a NEWS file.
# Returns TRUE if any section contains an out-of-sequence entry, after printing
#   one report per offending section; FALSE otherwise.
check_section_numbering = function(news) {
  # plain '#' catches some examples; 'd' for 'data.table'
  sections = grep("^#+ [A-Zd]", news)
  entries = grep("^[0-9]+[.]", news)
  entry_value = as.integer(gsub("^([0-9]+)[.].*", "\\1", news[entries]))
  # id 0 means the entry sits above the first section header
  section_id = findInterval(entries, sections)

  any_mismatch = FALSE
  for (id in unique(section_id)) {
    section_entries = entry_value[section_id == id]
    intended_value = seq_along(section_entries)
    matched = section_entries == intended_value
    if (all(matched)) next
    any_mismatch = TRUE
    # sections[0L] is empty, which would make sprintf() return character(0) and
    #   silently drop the report, so header-less entries get their own label.
    where = if (id == 0L) {
      "Before the first section header"
    } else {
      sprintf("In section '%s' (line %d)", news[sections[id]], sections[id])
    }
    cat(sprintf(
      "%s, bad numbering:\n%s\n",
      where,
      paste0(" [", section_entries[!matched], " --> ", intended_value[!matched], "]", collapse="\n")
    ))
  }
  return(any_mismatch)
}
27+
28+
# Ensure that GitHub link text & URL actually agree, i.e. that
#   '[#123](https://github.com/Rdatatable/data.table/<type>/456)' is flagged.
#
# news: character vector, the lines of a NEWS file.
# Returns FALSE when every link's '#number' equals the number in its URL (or
#   when there are no such links); otherwise prints one message per
#   disagreeing link and returns TRUE.
check_gh_links = function(news) {
  link_regex = "\\[#(?<md_number>[0-9]+)\\]\\(https://github.com/Rdatatable/data.table/(?<link_type>[^/]+)/(?<link_number>[0-9]+)\\)"
  # perl=TRUE is required for the within-group (capture.*) indices
  hits = gregexpr(link_regex, news, perl=TRUE)

  # one data.frame per line holding links, NULL for lines without any
  extract_line = function(line_no) {
    hit = hits[[line_no]]
    if (hit[1L] <= 0L) return(NULL)
    starts = attr(hit, "capture.start")
    stops = starts + attr(hit, "capture.length") - 1L
    captured = starts # reuse the matrix for its dimensions and column names
    captured[] = substring(news[line_no], starts, stops)
    line_df = data.frame(captured)
    line_df$line_number = line_no
    line_df
  }
  link_info = do.call(rbind, lapply(seq_along(hits), extract_line))

  agrees = link_info$md_number == link_info$link_number
  if (all(agrees)) return(FALSE)

  wrong = link_info[!agrees, ]
  cat(sep = "", sprintf(
    "In line %d, link pointing to %s %s is written #%s\n",
    wrong$line_number, wrong$link_type, wrong$link_number, wrong$md_number
  ))
  return(TRUE)
}
53+
54+
# Run both checks on every file in the working directory whose name contains
#   "NEWS"; only fail after all files are processed so every problem is printed.
any_error = FALSE
for (news in list.files(pattern = "NEWS")) {
  cat(sprintf("Checking NEWS file %s...\n", news))
  news_lines = readLines(news)
  # evaluate both checks unconditionally so neither report is skipped
  numbering_bad = check_section_numbering(news_lines)
  links_bad = check_gh_links(news_lines)
  any_error = any_error || numbering_bad || links_bad
}
if (any_error) stop("Please fix the NEWS issues above.")

.ci/linters/po/msgfmt_linter.R

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# Use msgfmt to check for untranslated/fuzzy messages, and for whether
#   the implied .mo compiled form matches that which is already checked in.
#
# po_file: path to a .po translation file. A file name starting with 'R-'
#   is compared against 'R-data.table.mo', any other against 'data.table.mo',
#   under inst/<po_file without 'R-' and '.po'>/LC_MESSAGES/.
# Returns invisible NULL when the checked-in .mo matches a fresh compilation;
#   otherwise prints diagnostics and signals an error.
# Relies on the external programs msgfmt and, on the failure path, git,
#   msgunfmt and diff.
msgfmt_linter <- function(po_file) {
  mo_tmp <- tempfile()
  on.exit(unlink(mo_tmp))

  res = system2("msgfmt", c("--statistics", po_file, "-o", mo_tmp), stdout=TRUE, stderr=TRUE)
  if (any(grepl("untranslated message|fuzzy translation", res))) {
    cat(sprintf("In %s, found incomplete translations:\n%s\n", po_file, paste(res, collapse="\n")))
    stop("Please fix.")
  }
  # Without an output file, md5sum() below gives NA and if() would fail with
  #   an unhelpful 'missing value where TRUE/FALSE needed'.
  if (!file.exists(mo_tmp)) {
    stop("msgfmt did not produce a compiled file for ", po_file, ":\n", paste(res, collapse="\n"))
  }

  # Drop '.po', then drop 'R-' from the file name itself. Anchoring 'R-' at the
  #   start of the whole path (as "^R-" does) misses inputs like 'po/R-xx.po',
  #   even though the basename() test below does treat them as R-level files.
  po_stem = sub("(^|/)R-([^/]*)$", "\\1\\2", sub("[.]po$", "", po_file))
  mo_ref = sprintf(
    "inst/%s/LC_MESSAGES/%sdata.table.mo",
    po_stem,
    if (startsWith(basename(po_file), "R-")) "R-" else ""
  )

  if (!file.exists(mo_ref)) {
    stop(po_file, " has not been compiled as ", mo_ref, ". Please fix.")
  }
  if (tools::md5sum(mo_ref) == tools::md5sum(mo_tmp)) return(invisible())

  # NB: file.mtime() will probably be wrong, it will reflect the check-out time of the git repo.
  last_edit_time = system2("git",
    c("log", "-1", '--format="%ad"', "--date=format:'%Y-%m-%d %H:%M:%S'", "--", mo_ref),
    stdout=TRUE
  )
  cat(sprintf(
    ".mo compilation %s of .po translation %s appears out of date! It was last updated %s\n",
    mo_ref, po_file, last_edit_time
  ))

  unmo_tmp = tempfile()
  unmo_ref = tempfile()
  # add=TRUE is an argument of on.exit(), not unlink(): it keeps the earlier
  #   mo_tmp cleanup registered instead of replacing it.
  on.exit(unlink(c(unmo_tmp, unmo_ref)), add=TRUE)
  system2("msgunfmt", c(mo_tmp, "-o", unmo_tmp))
  system2("msgunfmt", c(mo_ref, "-o", unmo_ref))
  cat("Here are the observed differences after converting back to .po:\n\n")
  system2("diff", c(unmo_tmp, unmo_ref))
  stop("Please fix.")
}
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# Run tools::checkPoFile() with strict plural checking on one .po file.
#
# po_file: path to the .po file.
# Returns invisible NULL when no problems are reported; otherwise prints the
#   reported problems and signals an error.
tools_check_linter = function(po_file) {
  problems = tools::checkPoFile(po_file, strictPlural=TRUE)
  if (NROW(problems) == 0L) return(invisible())
  print(problems)
  stop("Fix the above .po file issues.")
}

.ci/linters/po/utf8_linter.R

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# Require that a .po file mentions 'charset=UTF-8' on at least one line.
#
# po_file: path to the .po file.
# Returns invisible NULL when the declaration is found, errors otherwise.
utf8_linter <- function(po_file) {
  po_lines = readLines(po_file)
  declares_utf8 = length(grep("charset=UTF-8", po_lines, fixed=TRUE)) > 0L
  if (declares_utf8) return(invisible())
  stop("In ", po_file, ", please use charset=UTF-8.")
}

0 commit comments

Comments
 (0)