Skip to content

Commit 7c39648

Browse files
committed
Merge branch 'master' into froll2025
2 parents 898857b + 5bbc4d5 commit 7c39648

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

47 files changed

+765
-704
lines changed

.ci/atime/tests.R

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
pval.thresh <- 0.001 # to reduce false positives.
2+
13
# Test case adapted from https://github.com/Rdatatable/data.table/issues/6105#issue-2268691745 which is where the issue was reported.
24
# https://github.com/Rdatatable/data.table/pull/6107 fixed performance across 3 ways to specify a column as Date, and we test each individually.
35
extra.args.6107 <- c(
@@ -13,6 +15,7 @@ for (extra.arg in extra.args.6107){
1315
tmp_csv = tempfile()
1416
fwrite(DT, tmp_csv)
1517
},
18+
FasterIO = "60a01fa65191c44d7997de1843e9a1dfe5be9f72", # First commit of the PR (https://github.com/Rdatatable/data.table/pull/6925/commits) that reduced time usage
1619
Slow = "e9087ce9860bac77c51467b19e92cf4b72ca78c7", # Parent of the merge commit (https://github.com/Rdatatable/data.table/commit/a77e8c22e44e904835d7b34b047df2eff069d1f2) of the PR (https://github.com/Rdatatable/data.table/pull/6107) that fixes the issue
1720
Fast = "a77e8c22e44e904835d7b34b047df2eff069d1f2") # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/6107) that fixes the issue
1821
this.test$expr = str2lang(sprintf("data.table::fread(tmp_csv, %s)", extra.arg))
@@ -128,6 +131,18 @@ test.list <- atime::atime_test_list(
128131
paste0('useDynLib(', new.Package_))
129132
},
130133

134+
# Constant overhead improvement https://github.com/Rdatatable/data.table/pull/6925
135+
# Test case adapted from https://github.com/Rdatatable/data.table/pull/7022#discussion_r2107900643
136+
"fread disk overhead improved in #6925" = atime::atime_test(
137+
N = 2^seq(0, 20), # smaller N because we are doing multiple fread calls.
138+
setup = {
139+
fwrite(iris[1], iris.csv <- tempfile())
140+
},
141+
expr = replicate(N, data.table::fread(iris.csv)),
142+
Fast = "60a01fa65191c44d7997de1843e9a1dfe5be9f72", # First commit of the PR (https://github.com/Rdatatable/data.table/pull/6925/commits) that reduced time usage
143+
Slow = "e25ea80b793165094cea87d946d2bab5628f70a6" # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/60a01fa65191c44d7997de1843e9a1dfe5be9f72)
144+
),
145+
131146
# Performance regression discussed in https://github.com/Rdatatable/data.table/issues/4311
132147
# Test case adapted from https://github.com/Rdatatable/data.table/pull/4440#issuecomment-632842980 which is the fix PR.
133148
"shallow regression fixed in #4440" = atime::atime_test(
@@ -177,8 +192,9 @@ test.list <- atime::atime_test_list(
177192
# Fixed in https://github.com/Rdatatable/data.table/pull/4558
178193
"DT[by] fixed in #4558" = atime::atime_test(
179194
setup = {
195+
N9 <- as.integer(N * 0.9)
180196
d <- data.table(
181-
id = sample(c(seq.int(N * 0.9), sample(N * 0.9, N * 0.1, TRUE))),
197+
id = sample(c(seq.int(N9), sample(N9, N-N9, TRUE))),
182198
v1 = sample(5L, N, TRUE),
183199
v2 = sample(5L, N, TRUE)
184200
)
@@ -251,5 +267,15 @@ test.list <- atime::atime_test_list(
251267
Before = "f339aa64c426a9cd7cf2fcb13d91fc4ed353cd31", # Parent of the first commit https://github.com/Rdatatable/data.table/commit/fcc10d73a20837d0f1ad3278ee9168473afa5ff1 in the PR https://github.com/Rdatatable/data.table/pull/6393/commits with major change to fwrite with gzip.
252268
PR = "3630413ae493a5a61b06c50e80d166924d2ef89a"), # Close-to-last merge commit in the PR.
253269

254-
tests=extra.test.list)
270+
# Test case created directly using the atime code below (not adapted from any other benchmark), based on the PR, Removes unnecessary data.table call from as.data.table.array https://github.com/Rdatatable/data.table/pull/7010
271+
"as.data.table.array improved in #7010" = atime::atime_test(
272+
setup = {
273+
dims = c(N, 1, 1)
274+
arr = array(seq_len(prod(dims)), dim=dims)
275+
},
276+
expr = data.table:::as.data.table.array(arr, na.rm=FALSE),
277+
Slow = "73d79edf8ff8c55163e90631072192301056e336", # Parent of the first commit in the PR (https://github.com/Rdatatable/data.table/commit/8397dc3c993b61a07a81c786ca68c22bc589befc)
278+
Fast = "8397dc3c993b61a07a81c786ca68c22bc589befc"), # Commit in the PR (https://github.com/Rdatatable/data.table/pull/7019/commits) that removes inefficiency
279+
280+
tests=extra.test.list)
255281
# nolint end: undesirable_operator_linter.

.ci/linters/c/cocci_linter.R

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,23 @@
11
cocci_linter = if (!nzchar(Sys.which("spatch"))) function(...) {} else function(c_obj) {
2-
bad <- FALSE
2+
bad = FALSE
3+
tmp = tempfile(fileext = '.c')
4+
on.exit(unlink(tmp))
5+
writeLines(c_obj$preprocessed, tmp)
36
for (spfile in list.files(".ci/linters/cocci", full.names = TRUE)) {
4-
# Coccinelle parser gets confused sometimes, so ignore stderr and the exit code
5-
out = suppressWarnings(system2(
7+
out = system2(
68
"spatch",
7-
shQuote(c(
8-
"--sp-file", spfile, c_obj$path, "--recursive-includes",
9-
"-I", R.home("include"), "-I", "src"
10-
)),
9+
shQuote(c("--sp-file", spfile, tmp)),
1110
stdout = TRUE, stderr = FALSE
12-
))
11+
)
1312
if (length(out) > 0) {
14-
cat(sprintf("In file '%s', Coccinelle patch '%s' recommends the following changes:\n", c_obj$path, spfile))
13+
cat(sprintf("In file '%s', Coccinelle linter '%s' located the following problems:\n", c_obj$path, spfile))
1514
writeLines(out)
16-
bad <- TRUE
15+
bad = TRUE
16+
}
17+
if (!is.null(status <- attr(out, 'status'))) {
18+
cat(sprintf("While working on file '%s', Coccinelle linter '%s' failed with exit code %d:\n", c_obj$path, spfile, status))
19+
bad = TRUE
1720
}
1821
}
19-
if (bad) stop("Please apply the changes above or fix the linter")
22+
if (bad) stop("Please investigate the problems above.")
2023
}

.ci/linters/cocci/malloc_return_value_cast.cocci

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,11 @@ expression E;
44
@@
55
- (T)
66
malloc(E)
7+
8+
@calloc_realloc_return_value_cast expression@
9+
type T;
10+
expression E1, E2;
11+
identifier alloc =~ "^(c|re)alloc$";
12+
@@
13+
- (T)
14+
alloc(E1, E2)

.github/CODE_OF_CONDUCT.md

Lines changed: 9 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,16 @@
1-
As contributors and maintainers of this project, and in the interest of fostering an open and welcoming community, we pledge to respect all people who contribute through reporting issues, posting feature requests, updating documentation, submitting pull requests or patches, and other activities.
1+
The R data.table project adheres to NumFOCUS's Code of Conduct.
22

3-
We are committed to making participation in this project a harassment-free experience for everyone, regardless of level of experience, gender, gender identity and expression, sexual orientation, disability, personal appearance, body size, race, ethnicity, age, religion, or nationality.
3+
# The NumFOCUS Code of Conduct
44

5-
Examples of unacceptable behavior by participants include:
5+
## The Short Version
66

7-
* The use of sexualized language or imagery
8-
* Personal attacks
9-
* Trolling or insulting/derogatory comments
10-
* Public or private harassment
11-
* Publishing other's private information, such as physical or electronic addresses, without explicit permission
12-
* Other unethical or unprofessional conduct
7+
Be kind to others. Do not insult or put down others. Behave professionally. Remember that harassment and sexist, racist, or exclusionary jokes are not appropriate for NumFOCUS.
138

14-
Project members with the Committer role have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful.
9+
All communication should be appropriate for a professional audience including people of many different backgrounds. Sexual language and imagery is not appropriate.
1510

16-
By adopting this Code of Conduct, project members commit themselves to fairly and consistently apply these principles to every aspect of managing this project. Project maintainers who do not follow or enforce the Code of Conduct may be permanently removed from the project team.
11+
NumFOCUS is dedicated to providing a harassment-free community for everyone, regardless of gender, sexual orientation, gender identity and expression, disability, physical appearance, body size, race, or religion. We do not tolerate harassment of community members in any form.
12+
Thank you for helping make this a welcoming, friendly community for all.
1713

18-
This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community.
14+
[Code of Conduct Reporting Form](https://numfocus.typeform.com/to/ynjGdT)
1915

20-
21-
## Reporting
22-
23-
Project members with the Committer role or the CRAN Maintainer role are pledged to promptly address any reported issues. Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to any individual with this role.
24-
25-
Those who prefer to report in a way that is independent of the current Committers and Maintainer may instead contact the Community Engagement Coordinator by e-mailing [r.data.table\@gmail.com](mailto:r.data.table@gmail.com). Messages sent to this e-mail address will be visible only to the current Community Engagement Coordinator, a position always held by an individual who is not a Committer or CRAN Maintainer of the package.
26-
27-
The current Committers are Toby Dylan Hocking (@tdhock), Matt Dowle (@mattdowle), Arun Srinivasan (@arunsrinivasan), Jan Gorecki (@jangorecki), Michael Chirico (@MichaelChirico), Benjamin Schwendinger (@ben-schwen), and Ivan Krylov (@aitap).
28-
29-
The current CRAN Maintainer is Tyson Barrett (@tysonstanley).
30-
31-
The current Community Engagement Coordinator is Kelly Bodwin (@kbodwin).
32-
33-
All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. Complaint respondents are obligated to maintain confidentiality with regard to the reporter of an incident.
34-
35-
This Code of Conduct is adapted from the [Contributor Covenant, version 1.3.0](https://www.contributor-covenant.org/version/1/3/0/code-of-conduct/), available at [https://www.contributor-covenant.org/version/1/3/0/](https://www.contributor-covenant.org/version/1/3/0/), and the Swift Code of Conduct.
16+
For the full version of the Code of Conduct, please visit: [https://numfocus.org/code-of-conduct](https://numfocus.org/code-of-conduct).

.github/workflows/performance-tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,4 +20,4 @@ jobs:
2020
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
2121
repo_token: ${{ secrets.GITHUB_TOKEN }}
2222
steps:
23-
- uses: Anirban166/Autocomment-atime-results@v1.4.1
23+
- uses: Anirban166/Autocomment-atime-results@v1.4.3

GOVERNANCE.md

Lines changed: 6 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -108,23 +108,14 @@ Please also make a note in the change log under [`# Governance history`](#govern
108108

109109
# Finances and Funding
110110

111-
There is currently no mechanism for the data.table project to receive funding as an entity.
111+
data.table is a [NumFOCUS](https://numfocus.org/) project. Donations to data.table can be made via the [NumFOCUS donation page for data.table](https://app.hubspot.com/payments/FFWKWTTvKFdzqH?referrer=PAYMENT_LINK); see also the project page at <https://numfocus.org/project/data-table>.
112112

113-
Funding support for this project therefore may come in two forms:
113+
*NumFOCUS is a 501(c)(3) non-profit charity in the United States; as such, donations to NumFOCUS are tax-deductible as allowed by law. As with any donation, you should consult with your personal tax adviser or the IRS about your particular tax situation.*
114114

115-
## Individual external funding
116115

117-
Any individual developer or community member of data.table may apply for and receive funding for their work on the project. Individuals or groups seeking funding support are strongly encouraged to consult directly with the data.table Project Members (by initiating an Issue on GitHub) to ensure funds are used meaningfully. Formally, however, decisions about use of funds are governed by the individual grantee(s) and their contract with the funding agency.
116+
## Decision-making for funding use
118117

119-
There is no guarantee that funded work will be incorporated into the data.table package; any contributions, whether funded or unfunded, are subject to the same review process as outlined above.
120-
121-
## Direct donations
122-
123-
Direct donations to the project may be made via GitHub Sponsorships, which allow individuals to fund a specific developer. If the current CRAN Maintainer offers a personal sponsorship option, donations may be made to them to support the project in general.
124-
125-
## Decision-making for future opportunities
126-
127-
We here outline a procedure for disbursing funds, should this project in the future become a directly fundable entity (e.g. an LLC or a subsidiary of an umbrella LLC).
118+
We here outline a procedure for disbursing funds acquired through direct donations via NumFOCUS or grant-style research funding.
128119

129120
Funds acquired by the data.table project will be disbursed at the discretion of the **Committers**, defined as above. The **CRAN Maintainer** will have authority to make final decisions in the event that no consensus is reached among committers prior to deadlines for use of funds, and will be responsible for disbursement logistics.
130121

@@ -148,6 +139,8 @@ data.table Version line in DESCRIPTION typically has the following meanings
148139

149140
# Governance history
150141

142+
May 2025: update Finance and CoC language for NumFOCUS incorporation.
143+
151144
Feb 2025: add Finances and Funding section, update Code of Conduct section to be a brief summary and reference the broader CoC document.
152145

153146
Jan 2025: clarify that edits to governance should notify all committers, and that role names are proper nouns (i.e., upper-case) throughout.

NEWS.md

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,11 @@ frollsum(c(1,2,3,Inf,5,6), 2)
2727

2828
4. `as.Date()` method for `IDate` no longer coerces to `double` [#6922](https://github.com/Rdatatable/data.table/issues/6922). Thanks @MichaelChirico for the report and PR. The only effect should be on overly-strict tests that assert `Date` objects have `double` storage, which is not in general true, especially from R 4.5.0.
2929

30-
5. Multiple improvements has been added to rolling functions. Request came from @gpierard who needed left aligned, adaptive, rolling max, [#5438](https://github.com/Rdatatable/data.table/issues/5438). There was no `frollmax` function yet. Adaptive rolling functions did not have support for `align="left"`. `frollapply` did not support `adaptive=TRUE`. Available alternatives were base R `mapply` or self-join using `max` and grouping `by=.EACHI`. As a follow up of his request, following features has been added:
30+
5. `as.data.table()` is slightly more efficient at converting arrays to data.tables, [#7019](https://github.com/Rdatatable/data.table/pull/7019). Thanks @eliocamp.
31+
32+
6. `between()` gains the argument `ignore_tzone=FALSE`. Normally, a difference in time zone between `lower` and `upper` will produce an error, and a difference in time zone between `x` and either of the others will produce a message. Setting `ignore_tzone=TRUE` bypasses the checks, allowing both comparisons to proceed without error or message about time zones.
33+
34+
7. Multiple improvements have been added to rolling functions. The request came from @gpierard, who needed a left-aligned, adaptive, rolling max, [#5438](https://github.com/Rdatatable/data.table/issues/5438). There was no `frollmax` function yet. Adaptive rolling functions did not have support for `align="left"`. `frollapply` did not support `adaptive=TRUE`. Available alternatives were base R `mapply` or self-join using `max` and grouping `by=.EACHI`. As a follow-up to his request, the following features have been added:
3135
- new function `frollmax`, applies `max` over a rolling window.
3236
- support for `align="left"` for adaptive rolling function.
3337
- support for `adaptive=TRUE` in `frollapply`.
@@ -85,6 +89,8 @@ As of now, adaptive rolling max has no _on-line_ implemention (`algo="fast"`), i
8589

8690
8. `fread()` no longer warns on certain systems on R 4.5.0+ where the file owner can't be resolved, [#6918](https://github.com/Rdatatable/data.table/issues/6918). Thanks @ProfFancyPants for the report and PR.
8791

92+
9. Joins to extended data.frames, e.g. `x[i, col := x.col1 + i.col2]` where `i` is a `tbl`, can use the `x.` and `i.` prefix forms, [#6998](https://github.com/Rdatatable/data.table/issues/6998). Thanks @MichaelChirico for the bug and PR.
93+
8894
### NOTES
8995

9096
1. Continued work to remove non-API C functions, [#6180](https://github.com/Rdatatable/data.table/issues/6180). Thanks Ivan Krylov for the PRs and for writing a clear and concise guide about the R API: https://aitap.codeberg.page/R-api/.
@@ -98,6 +104,12 @@ As of now, adaptive rolling max has no _on-line_ implemention (`algo="fast"`), i
98104

99105
3. {data.table} now depends on R 3.4.0 (2017).
100106

107+
4. Changes to `fread()` output and errors:
108+
109+
+ When the size of the file exceeds the size of the address space, `fread()` now signals an informative error instead of trying to map its size modulo the address space.
110+
+ On non-Windows systems, `fread()` now prints the reason why the file couldn't be opened, which could also be due to it being too large to map.
111+
+ With `verbose=TRUE`, file sizes are now printed using correct binary SI prefixes (the sizes have always been reported as bytes denominated in powers of `2^10`, so e.g. `1024*1024` bytes was reported as `1 MB` where `1 MiB` or `1.05 MB` is correct).
112+
101113
## data.table [v1.17.0](https://github.com/Rdatatable/data.table/milestone/34) (20 Feb 2025)
102114

103115
### POTENTIALLY BREAKING CHANGES

R/as.data.table.R

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -96,9 +96,9 @@ as.data.table.array = function(x, keep.rownames=FALSE, key=NULL, sorted=TRUE, va
9696
dnx = dimnames(x)
9797
# NULL dimnames will create integer keys, not character as in table method
9898
val = if (is.null(dnx)) {
99-
lapply(dx, seq.int)
99+
lapply(dx, seq_len)
100100
} else if (any(nulldnx <- vapply_1b(dnx, is.null))) {
101-
dnx[nulldnx] = lapply(dx[nulldnx], seq.int) #3636
101+
dnx[nulldnx] = lapply(dx[nulldnx], seq_len) #3636
102102
dnx
103103
} else dnx
104104
val = rev(val)
@@ -107,7 +107,8 @@ as.data.table.array = function(x, keep.rownames=FALSE, key=NULL, sorted=TRUE, va
107107
if (value.name %chin% names(val))
108108
stopf("Argument 'value.name' should not overlap with column names in result: %s", brackify(rev(names(val))))
109109
N = NULL
110-
ans = data.table(do.call(CJ, c(val, sorted=FALSE)), N=as.vector(x))
110+
ans = do.call(CJ, c(val, sorted=FALSE))
111+
set(ans, j="N", value=as.vector(x))
111112
if (isTRUE(na.rm))
112113
ans = ans[!is.na(N)]
113114
setnames(ans, "N", value.name)

R/between.R

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# is x[i] in between lower[i] and upper[i] ?
2-
between = function(x, lower, upper, incbounds=TRUE, NAbounds=TRUE, check=FALSE) {
2+
between = function(x, lower, upper, incbounds=TRUE, NAbounds=TRUE, check=FALSE, ignore_tzone=FALSE) {
33
if (is.logical(x)) stopf("between has been passed an argument x of type logical")
44
if (is.logical(lower)) lower = as.integer(lower) # typically NA (which is logical type)
55
if (is.logical(upper)) upper = as.integer(upper) # typically NA (which is logical type)
@@ -16,15 +16,12 @@ between = function(x, lower, upper, incbounds=TRUE, NAbounds=TRUE, check=FALSE)
1616
stopifnot(is.px(x), is.px(lower), is.px(upper)) # nocov # internal
1717
}
1818
# POSIX check timezone match
19-
if (is.px(x) && is.px(lower) && is.px(upper)) {
20-
tzs = sapply(list(x,lower,upper), function(x) {
21-
attr(x, "tzone", exact=TRUE) %||% ""
22-
})
19+
if (!ignore_tzone && is.px(x) && is.px(lower) && is.px(upper)) {
20+
tzs = vapply_1c(list(x, lower, upper), function(x) attr(x, "tzone", exact=TRUE) %||% "")
2321
# lower/upper should be more tightly linked than x/lower, so error
2422
# if the former don't match but only inform if they latter don't
2523
if (tzs[2L]!=tzs[3L]) {
2624
stopf("'between' lower= and upper= are both POSIXct but have different tzone attributes: %s. Please align their time zones.", brackify(tzs[2:3], quote=TRUE))
27-
# otherwise the check in between.c that lower<=upper can (correctly) fail for this reason
2825
}
2926
if (tzs[1L]!=tzs[2L]) {
3027
messagef("'between' arguments are all POSIXct but have mismatched tzone attributes: %s. The UTC times will be compared.", brackify(tzs, quote=TRUE))

0 commit comments

Comments
 (0)