Skip to content

Commit cdf4277

Browse files
Merge branch 'master' into fix_fwrite_length
2 parents 255f1ce + 3734726 commit cdf4277

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

79 files changed

+11731
-508
lines changed

.Rbuildignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
^\.devcontainer$
1717
^\.graphics$
1818
^\.github$
19+
^\.vscode$
1920
^\.zed$
2021

2122
^\.gitlab-ci\.yml$

.ci/atime/tests.R

Lines changed: 82 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,24 @@
1+
# Test case adapted from https://github.com/Rdatatable/data.table/issues/6105#issue-2268691745 which is where the issue was reported.
2+
# https://github.com/Rdatatable/data.table/pull/6107 fixed performance across 3 ways to specify a column as Date, and we test each individually.
3+
extra.args.6107 <- c(
4+
"colClasses=list(Date='date')",
5+
"colClasses='Date'",
6+
"select=list(Date='date')")
7+
extra.test.list <- list()
8+
for (extra.arg in extra.args.6107){
9+
this.test <- atime::atime_test(
10+
setup = {
11+
set.seed(1)
12+
DT = data.table(date=.Date(sample(20000, N, replace=TRUE)))
13+
tmp_csv = tempfile()
14+
fwrite(DT, tmp_csv)
15+
},
16+
Slow = "e9087ce9860bac77c51467b19e92cf4b72ca78c7", # Parent of the merge commit (https://github.com/Rdatatable/data.table/commit/a77e8c22e44e904835d7b34b047df2eff069d1f2) of the PR (https://github.com/Rdatatable/data.table/pull/6107) that fixes the issue
17+
Fast = "a77e8c22e44e904835d7b34b047df2eff069d1f2") # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/6107) that fixes the issue
18+
this.test$expr = str2lang(sprintf("data.table::fread(tmp_csv, %s)", extra.arg))
19+
extra.test.list[[sprintf("fread(%s) improved in #6107", extra.arg)]] <- this.test
20+
}
21+
122
# A list of performance tests.
223
#
324
# See documentation in https://github.com/Rdatatable/data.table/wiki/Performance-testing for best practices.
@@ -19,6 +40,8 @@
1940
# @note Please check https://github.com/tdhock/atime/blob/main/vignettes/data.table.Rmd for more information.
2041
# nolint start: undesirable_operator_linter. ':::' needed+appropriate here.
2142
test.list <- atime::atime_test_list(
43+
# Common N and pkg.edit.fun are defined here, and inherited in all test cases below which do not re-define them.
44+
N = as.integer(10^seq(1, 7, by=0.25)),
2245
# A function to customize R package metadata and source files to facilitate version-specific installation and testing.
2346
#
2447
# This is specifically tailored for handling data.table which requires specific changes in non-standard files (such as the object file name in Makevars and version checking code in onLoad.R)
@@ -75,10 +98,9 @@ test.list <- atime::atime_test_list(
7598
paste0('useDynLib(', new.Package_))
7699
},
77100

78-
# Performance regression discussed in: https://github.com/Rdatatable/data.table/issues/4311
79-
# Fixed in: https://github.com/Rdatatable/data.table/pull/4440
101+
# Performance regression discussed in https://github.com/Rdatatable/data.table/issues/4311
102+
# Test case adapted from https://github.com/Rdatatable/data.table/pull/4440#issuecomment-632842980 which is the fix PR.
80103
"shallow regression fixed in #4440" = atime::atime_test(
81-
N = 10^seq(3,8),
82104
setup = {
83105
set.seed(1L)
84106
dt <- data.table(a = sample.int(N))
@@ -89,29 +111,27 @@ test.list <- atime::atime_test_list(
89111
Regression = "b1b1832b0d2d4032b46477d9fe6efb15006664f4", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/0f0e7127b880df8459b0ed064dc841acd22f5b73) in the PR (https://github.com/Rdatatable/data.table/pull/4440/commits) that fixes the regression
90112
Fixed = "9d3b9202fddb980345025a4f6ac451ed26a423be"), # Merge commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/4440)
91113

92-
# Test based on: https://github.com/Rdatatable/data.table/issues/5424
93-
# Performance regression introduced from a commit in: https://github.com/Rdatatable/data.table/pull/4491
94-
# Fixed in: https://github.com/Rdatatable/data.table/pull/5463
114+
# Test based on https://github.com/Rdatatable/data.table/issues/5424
115+
# Performance regression introduced from a commit in https://github.com/Rdatatable/data.table/pull/4491
116+
# Test case adapted from https://github.com/Rdatatable/data.table/pull/5463#issue-1373642456 which is the fix PR.
95117
"memrecycle regression fixed in #5463" = atime::atime_test(
96-
N = 10^seq(3, 8),
97118
setup = {
98-
n <- N/100
119+
bigN <- N*100
99120
set.seed(2L)
100121
dt <- data.table(
101-
g = sample(seq_len(n), N, TRUE),
102-
x = runif(N),
122+
g = sample(seq_len(N), bigN, TRUE),
123+
x = runif(bigN),
103124
key = "g")
104125
dt_mod <- copy(dt)
105126
},
106127
expr = data.table:::`[.data.table`(dt_mod, , N := .N, by = g),
107-
Before = "be2f72e6f5c90622fe72e1c315ca05769a9dc854", # Parent of the regression causing commit (https://github.com/Rdatatable/data.table/commit/e793f53466d99f86e70fc2611b708ae8c601a451) in the PR that introduced the issue (https://github.com/Rdatatable/data.table/pull/4491/commits)
108-
Regression = "e793f53466d99f86e70fc2611b708ae8c601a451", # Commit responsible for regression in the PR that introduced the issue (https://github.com/Rdatatable/data.table/pull/4491/commits)
109-
Fixed = "58409197426ced4714af842650b0cc3b9e2cb842"), # Last commit in the PR that fixed the regression (https://github.com/Rdatatable/data.table/pull/5463/commits)
128+
Before = "be2f72e6f5c90622fe72e1c315ca05769a9dc854", # Parent of the regression causing commit (https://github.com/Rdatatable/data.table/commit/e793f53466d99f86e70fc2611b708ae8c601a451) in the PR (https://github.com/Rdatatable/data.table/pull/4491/commits) that introduced the issue
129+
Regression = "e793f53466d99f86e70fc2611b708ae8c601a451", # Commit responsible for regression in the PR (https://github.com/Rdatatable/data.table/pull/4491/commits) that introduced the issue
130+
Fixed = "58409197426ced4714af842650b0cc3b9e2cb842"), # Last commit in the PR (https://github.com/Rdatatable/data.table/pull/5463/commits) that fixed the regression
110131

111-
# Issue reported in: https://github.com/Rdatatable/data.table/issues/5426
112-
# To be fixed in: https://github.com/Rdatatable/data.table/pull/5427
132+
# Issue reported in https://github.com/Rdatatable/data.table/issues/5426
133+
# Test case adapted from https://github.com/Rdatatable/data.table/pull/5427#issue-1323678063 which is the fix PR.
113134
"setDT improved in #5427" = atime::atime_test(
114-
N = 10^seq(1, 7),
115135
setup = {
116136
L <- replicate(N, 1, simplify = FALSE)
117137
setDT(L)
@@ -120,43 +140,73 @@ test.list <- atime::atime_test_list(
120140
data.table:::setattr(L, "class", NULL)
121141
data.table:::setDT(L)
122142
},
123-
Slow = "c4a2085e35689a108d67dacb2f8261e4964d7e12", # Parent of the first commit in the PR that fixes the issue (https://github.com/Rdatatable/data.table/commit/7cc4da4c1c8e568f655ab5167922dcdb75953801)
124-
Fast = "1872f473b20fdcddc5c1b35d79fe9229cd9a1d15"), # Last commit in the PR that fixes the issue (https://github.com/Rdatatable/data.table/pull/5427/commits)
143+
Slow = "c4a2085e35689a108d67dacb2f8261e4964d7e12", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/7cc4da4c1c8e568f655ab5167922dcdb75953801) in the PR (https://github.com/Rdatatable/data.table/pull/5427/commits) that fixes the issue
144+
Fast = "af48a805e7a5026a0c2d0a7fd9b587fea5cfa3c4"), # Last commit in the PR (https://github.com/Rdatatable/data.table/pull/5427/commits) that fixes the issue
125145

126-
# Issue reported in: https://github.com/Rdatatable/data.table/issues/4200
127-
# To be fixed in: https://github.com/Rdatatable/data.table/pull/4558
146+
# Test case adapted from https://github.com/Rdatatable/data.table/issues/4200#issuecomment-645980224 which is where the issue was reported.
147+
# Fixed in https://github.com/Rdatatable/data.table/pull/4558
128148
"DT[by] fixed in #4558" = atime::atime_test(
129-
N = 10^seq(1, 20),
130149
setup = {
131150
d <- data.table(
132-
id3 = sample(c(seq.int(N*0.9), sample( N*0.9, N*0.1, TRUE))),
151+
id = sample(c(seq.int(N * 0.9), sample(N * 0.9, N * 0.1, TRUE))),
133152
v1 = sample(5L, N, TRUE),
134153
v2 = sample(5L, N, TRUE)
135154
)
136155
},
137-
expr = {
138-
expr=data.table:::`[.data.table`(d, , max(v1) - min(v2), by = id3)
139-
},
156+
expr = data.table:::`[.data.table`(d, , max(v1) - min(v2), by = id),
140157
Before = "7a9eaf62ede487625200981018d8692be8c6f134", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/515de90a6068911a148e54343a3503043b8bb87c) in the PR (https://github.com/Rdatatable/data.table/pull/4164/commits) that introduced the regression
141158
Regression = "c152ced0e5799acee1589910c69c1a2c6586b95d", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/15f0598b9828d3af2eb8ddc9b38e0356f42afe4f) in the PR (https://github.com/Rdatatable/data.table/pull/4558/commits) that fixes the regression
142159
Fixed = "f750448a2efcd258b3aba57136ee6a95ce56b302"), # Second commit of the PR (https://github.com/Rdatatable/data.table/pull/4558/commits) that fixes the regression
143160

144-
# Issue with sorting again when already sorted: https://github.com/Rdatatable/data.table/issues/4498
145-
# Fixed in: https://github.com/Rdatatable/data.table/pull/4501
161+
# Issue with sorting again when already sorted, as reported in https://github.com/Rdatatable/data.table/issues/4498
162+
# Test case adapted from https://github.com/Rdatatable/data.table/pull/4501#issue-625311918 which is the fix PR.
146163
"DT[,.SD] improved in #4501" = atime::atime_test(
147-
N = 10^seq(1, 10, by=0.5),
148164
setup = {
149165
set.seed(1)
150166
L = as.data.table(as.character(rnorm(N, 1, 0.5)))
151167
setkey(L, V1)
152168
},
153169
## New DT can safely retain key.
154-
expr = {
155-
data.table:::`[.data.table`(L, , .SD)
156-
},
170+
expr = data.table:::`[.data.table`(L, , .SD),
157171
Fast = "353dc7a6b66563b61e44b2fa0d7b73a0f97ca461", # Close-to-last merge commit in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue
158172
Slow = "3ca83738d70d5597d9e168077f3768e32569c790", # Circa 2024 master parent of close-to-last merge commit (https://github.com/Rdatatable/data.table/commit/353dc7a6b66563b61e44b2fa0d7b73a0f97ca461) in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue
159173
Slower = "cacdc92df71b777369a217b6c902c687cf35a70d"), # Circa 2020 parent of the first commit (https://github.com/Rdatatable/data.table/commit/74636333d7da965a11dad04c322c752a409db098) in the PR (https://github.com/Rdatatable/data.table/pull/4501/commits) that fixes the issue
160174

161-
NULL)
175+
# Test case adapted from https://github.com/Rdatatable/data.table/issues/6286#issue-2412141289 which is where the issue was reported.
176+
# Fixed in https://github.com/Rdatatable/data.table/pull/6296
177+
"DT[by,verbose=TRUE] improved in #6296" = atime::atime_test(
178+
setup = {
179+
dt = data.table(a = 1:N)
180+
dt_mod <- copy(dt)
181+
},
182+
expr = data.table:::`[.data.table`(dt_mod, , 1, by = a, verbose = TRUE),
183+
Slow = "a01f00f7438daf4612280d6886e6929fa8c8f76e", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/fc0c1e76408c34a8482f16f7421d262c7f1bde32) in the PR (https://github.com/Rdatatable/data.table/pull/6296/commits) that fixes the issue
184+
Fast = "f248bbe6d1204dfc8def62328788eaadcc8e17a1"), # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/6296) that fixes the issue
185+
186+
# Test case adapted from https://github.com/Rdatatable/data.table/issues/5492#issue-1416598382 which is where the issue was reported,
187+
# and from https://github.com/Rdatatable/data.table/pull/5493#issue-1416656788 which is the fix PR.
188+
"transform improved in #5493" = atime::atime_test(
189+
setup = {
190+
df <- data.frame(x = runif(N))
191+
dt <- as.data.table(df)
192+
},
193+
expr = data.table:::transform.data.table(dt, y = round(x)),
194+
Slow = "0895fa247afcf6b38044bd5f56c0d209691ddb31", # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/93ce3ce1373bf733ebd2036e2883d2ffe377ab58) in the PR (https://github.com/Rdatatable/data.table/pull/5493/commits) that fixes the issue
195+
Fast = "2d1a0575f87cc50e90f64825c30d7a6cb6b05dd7"), # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/5493) that fixes the issue
196+
197+
# Test case created directly using the atime code below (not adapted from any other benchmark), based on the issue/fix PR https://github.com/Rdatatable/data.table/pull/5054#issue-930603663 "melt should be more efficient when there are missing input columns."
198+
"melt improved in #5054" = atime::atime_test(
199+
setup = {
200+
DT <- as.data.table(as.list(1:N))
201+
measure.vars <- lapply(1:N, function(i) {
202+
x = rep(NA, N)
203+
x[i] = i
204+
x
205+
})
206+
},
207+
expr = data.table:::melt(DT, measure.vars = measure.vars),
208+
Slow = "fd24a3105953f7785ea7414678ed8e04524e6955", # Parent of the merge commit (https://github.com/Rdatatable/data.table/commit/ed72e398df76a0fcfd134a4ad92356690e4210ea) of the PR (https://github.com/Rdatatable/data.table/pull/5054) that fixes the issue
209+
Fast = "ed72e398df76a0fcfd134a4ad92356690e4210ea"), # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/5054) that fixes the issue
210+
211+
tests=extra.test.list)
162212
# nolint end: undesirable_operator_linter.

.ci/linters/po/msgfmt_linter.R

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ msgfmt_linter <- function(po_file) {
2020
stop(po_file, " has not been compiled as ", mo_ref, ". Please fix.")
2121
}
2222
if (tools::md5sum(mo_ref) == tools::md5sum(mo_tmp)) return(invisible())
23+
# TODO(#6517): Re-activate this part of the check to ensure .mo is up to date.
24+
cat(sprintf("Note: MD5 sum of msgfmt output for %s does not match %s.\n", po_file, mo_ref))
25+
return(invisible())
2326

2427
# NB: file.mtime() will probably be wrong, it will reflect the check-out time of the git repo.
2528
last_edit_time = system2("git",
@@ -33,7 +36,7 @@ msgfmt_linter <- function(po_file) {
3336

3437
unmo_tmp = tempfile()
3538
unmo_ref = tempfile()
36-
on.exit(unlink(c(unmo_tmp, unmo_ref), add=TRUE))
39+
on.exit(unlink(c(unmo_tmp, unmo_ref)), add=TRUE)
3740
system2("msgunfmt", c(mo_tmp, "-o", unmo_tmp))
3841
system2("msgunfmt", c(mo_ref, "-o", unmo_ref))
3942
cat("Here are the observed differences after converting back to .po:\n\n")

.ci/linters/r/class1_linter.R

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
class1_linter = lintr::make_linter_from_xpath(
2+
"
3+
//OP-LEFT-BRACKET[
4+
preceding-sibling::expr/expr/SYMBOL_FUNCTION_CALL[text() = 'class']
5+
and following-sibling::expr/NUM_CONST[text() = '1' or text() = '1L']
6+
]
7+
/parent::expr
8+
",
9+
"Use class1(x) to get class(x)[1L], or classes1(x) to do so for a full list/data.table"
10+
)

.dev/cc.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -91,9 +91,9 @@ cc = function(test=FALSE, clean=FALSE, debug=FALSE, omp=!debug, path=Sys.getenv(
9191
if (clean) system("rm *.o *.so")
9292
OMP = if (omp) "openmp" else "no-openmp"
9393
if (debug) {
94-
cmd = sprintf(R"(MAKEFLAGS='-j CC=%s PKG_CFLAGS=-f% CFLAGS=-std=c99\ -O0\ -ggdb\ %s\ -pedantic' R CMD SHLIB -d -o data_table.so *.c)", CC, OMP, W32)
94+
cmd = sprintf(R"(MAKEFLAGS='-j CC=%s PKG_CFLAGS=-f% CFLAGS=-std=c11\ -O0\ -ggdb\ %s\ -pedantic' R CMD SHLIB -d -o data_table.so *.c)", CC, OMP, W32)
9595
} else {
96-
cmd = sprintf(R"(MAKEFLAGS='-j CC=%s CFLAGS=-f%s\ -std=c99\ -O3\ -pipe\ -Wall\ -pedantic\ -Wstrict-prototypes\ -isystem\ /usr/share/R/include\ %s\ -fno-common' R CMD SHLIB -o data_table.so *.c)", CC, OMP, W32)
96+
cmd = sprintf(R"(MAKEFLAGS='-j CC=%s CFLAGS=-f%s\ -std=c11\ -O3\ -pipe\ -Wall\ -pedantic\ -Wstrict-prototypes\ -isystem\ /usr/share/R/include\ %s\ -fno-common' R CMD SHLIB -o data_table.so *.c)", CC, OMP, W32)
9797
# the -isystem suppresses strict-prototypes warnings from R's headers, #5477. Look at the output to see what -I is and pass the same path to -isystem.
9898
# TODO add -Wextra too?
9999
}

.github/workflows/performance-tests.yml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
name: Autocomment atime-based performance regression analysis on PRs
1+
name: atime performance tests
22

33
on:
44
pull_request:
@@ -15,10 +15,11 @@ on:
1515

1616
jobs:
1717
comment:
18+
if: github.repository == 'Rdatatable/data.table'
1819
runs-on: ubuntu-latest
1920
container: ghcr.io/iterative/cml:0-dvc2-base1
2021
env:
2122
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
2223
repo_token: ${{ secrets.GITHUB_TOKEN }}
2324
steps:
24-
- uses: Anirban166/[email protected].0
25+
- uses: Anirban166/[email protected].1
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
2+
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
3+
on:
4+
push:
5+
branches: [master]
6+
pull_request:
7+
branches: [master]
8+
9+
name: test-coverage.yaml
10+
11+
permissions: read-all
12+
13+
jobs:
14+
test-coverage:
15+
runs-on: ubuntu-latest
16+
env:
17+
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
18+
19+
steps:
20+
- uses: actions/checkout@v4
21+
22+
- uses: r-lib/actions/setup-r@v2
23+
with:
24+
use-public-rspm: true
25+
r-version: '4.3' # TODO(r-lib/covr#567): Go back to using r-devel
26+
27+
- uses: r-lib/actions/setup-r-dependencies@v2
28+
with:
29+
extra-packages: any::covr, any::xml2
30+
needs: coverage
31+
32+
- name: Test coverage
33+
run: |
34+
cov <- covr::package_coverage(
35+
quiet = FALSE,
36+
clean = FALSE,
37+
install_path = file.path(normalizePath(Sys.getenv("RUNNER_TEMP"), winslash = "/"), "package")
38+
)
39+
covr::to_cobertura(cov)
40+
shell: Rscript {0}
41+
42+
- uses: codecov/codecov-action@v4
43+
with:
44+
fail_ci_if_error: ${{ github.event_name != 'pull_request' && true || false }}
45+
file: ./cobertura.xml
46+
plugin: noop
47+
disable_search: true
48+
token: ${{ secrets.CODECOV_TOKEN }}
49+
50+
- name: Upload test results
51+
if: failure()
52+
uses: actions/upload-artifact@v4
53+
with:
54+
name: coverage-test-failures
55+
path: ${{ runner.temp }}/package

CODEOWNERS

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,10 @@
3535
/src/shift.c @ben-schwen @michaelchirico
3636
/man/shift.Rd @ben-schwen @michaelchirico
3737

38+
# rbind
39+
/src/rbindlist.c @ben-schwen
40+
/man/rbindlist.Rd @ben-schwen
41+
3842
# translations
3943
/inst/po/ @michaelchirico
4044
/po/ @michaelchirico

DESCRIPTION

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,5 +89,8 @@ Authors@R: c(
8989
person("Iago", "Giné-Vázquez", role="ctb"),
9090
person("Anirban", "Chetia", role="ctb"),
9191
person("Doris", "Amoakohene", role="ctb"),
92-
person("Ivan", "Krylov", role="ctb")
92+
person("Ivan", "Krylov", role="ctb"),
93+
person("Michael","Young", role="ctb"),
94+
person("Mark", "Seeto", role="ctb"),
95+
NULL
9396
)

NAMESPACE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ exportClasses(data.table, IDate, ITime)
66
##
77

88
export(data.table, tables, setkey, setkeyv, key, "key<-", haskey, CJ, SJ, copy)
9+
export(rowwiseDT)
910
export(setindex, setindexv, indices)
1011
export(as.data.table,is.data.table,test.data.table)
1112
export(last,first,like,"%like%","%ilike%","%flike%","%plike%",between,"%between%",inrange,"%inrange%", "%notin%")

0 commit comments

Comments
 (0)