Skip to content

Commit b7fce75

Browse files
Merge branch 'master' into tbl-i
2 parents ffde66d + b689cd2 commit b7fce75

39 files changed

+634
-606
lines changed

.ci/atime/tests.R

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ for (extra.arg in extra.args.6107){
1313
tmp_csv = tempfile()
1414
fwrite(DT, tmp_csv)
1515
},
16+
FasterIO = "60a01fa65191c44d7997de1843e9a1dfe5be9f72", # First commit of the PR (https://github.com/Rdatatable/data.table/pull/6925/commits) that reduced time usage
1617
Slow = "e9087ce9860bac77c51467b19e92cf4b72ca78c7", # Parent of the merge commit (https://github.com/Rdatatable/data.table/commit/a77e8c22e44e904835d7b34b047df2eff069d1f2) of the PR (https://github.com/Rdatatable/data.table/pull/6107) that fixes the issue
1718
Fast = "a77e8c22e44e904835d7b34b047df2eff069d1f2") # Merge commit of the PR (https://github.com/Rdatatable/data.table/pull/6107) that fixes the issue
1819
this.test$expr = str2lang(sprintf("data.table::fread(tmp_csv, %s)", extra.arg))
@@ -128,6 +129,18 @@ test.list <- atime::atime_test_list(
128129
paste0('useDynLib(', new.Package_))
129130
},
130131

132+
# Constant overhead improvement https://github.com/Rdatatable/data.table/pull/6925
133+
# Test case adapted from https://github.com/Rdatatable/data.table/pull/7022#discussion_r2107900643
134+
"fread disk overhead improved in #6925" = atime::atime_test(
135+
N = 2^seq(0, 20), # smaller N because we are doing multiple fread calls.
136+
setup = {
137+
fwrite(iris[1], iris.csv <- tempfile())
138+
},
139+
expr = replicate(N, data.table::fread(iris.csv)),
140+
Fast = "60a01fa65191c44d7997de1843e9a1dfe5be9f72", # First commit of the PR (https://github.com/Rdatatable/data.table/pull/6925/commits) that reduced time usage
141+
Slow = "e25ea80b793165094cea87d946d2bab5628f70a6" # Parent of the first commit (https://github.com/Rdatatable/data.table/commit/60a01fa65191c44d7997de1843e9a1dfe5be9f72)
142+
),
143+
131144
# Performance regression discussed in https://github.com/Rdatatable/data.table/issues/4311
132145
# Test case adapted from https://github.com/Rdatatable/data.table/pull/4440#issuecomment-632842980 which is the fix PR.
133146
"shallow regression fixed in #4440" = atime::atime_test(

.ci/linters/cocci/malloc_return_value_cast.cocci

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,9 @@ expression E;
44
@@
55
- (T)
66
malloc(E)
7+
8+
- (T)
9+
calloc(_, E)
10+
11+
- (T)
12+
realloc(_, E)

.github/workflows/performance-tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,4 +20,4 @@ jobs:
2020
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
2121
repo_token: ${{ secrets.GITHUB_TOKEN }}
2222
steps:
23-
- uses: Anirban166/Autocomment-atime-results@v1.4.1
23+
- uses: Anirban166/Autocomment-atime-results@v1.4.2

NEWS.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616

1717
5. `as.data.table()` is slightly more efficient at converting arrays to data.tables, [#7019](https://github.com/Rdatatable/data.table/pull/7019). Thanks @eliocamp.
1818

19+
6. `between()` gains the argument `ignore_tzone=FALSE`. Normally, a difference in time zone between `lower` and `upper` will produce an error, and a difference in time zone between `x` and either of the others will produce a message. Setting `ignore_tzone=TRUE` bypasses the checks, allowing both comparisons to proceed without error or message about time zones.
20+
1921
### BUG FIXES
2022

2123
1. Custom binary operators from the `lubridate` package now work with objects of class `IDate` as with a `Date` subclass, [#6839](https://github.com/Rdatatable/data.table/issues/6839). Thanks @emallickhossain for the report and @aitap for the fix.
@@ -49,6 +51,12 @@
4951

5052
3. {data.table} now depends on R 3.4.0 (2017).
5153

54+
4. Changes to `fread()` output and errors:
55+
56+
+ When the size of the file exceeds the size of the address space, `fread()` now signals an informative error instead of trying to map its size modulo the address space.
57+
+ On non-Windows systems, `fread()` now prints the reason why the file couldn't be opened, which could also be due to it being too large to map.
58+
+ With `verbose=TRUE`, file sizes are now printed using correct binary (IEC) prefixes (the sizes have always been reported as bytes denominated in powers of `2^10`, so e.g. `1024*1024` bytes was reported as `1 MB` where `1 MiB` or `1.05 MB` is correct).
59+
5260
## data.table [v1.17.0](https://github.com/Rdatatable/data.table/milestone/34) (20 Feb 2025)
5361

5462
### POTENTIALLY BREAKING CHANGES

R/between.R

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# is x[i] in between lower[i] and upper[i] ?
2-
between = function(x, lower, upper, incbounds=TRUE, NAbounds=TRUE, check=FALSE) {
2+
between = function(x, lower, upper, incbounds=TRUE, NAbounds=TRUE, check=FALSE, ignore_tzone=FALSE) {
33
if (is.logical(x)) stopf("between has been passed an argument x of type logical")
44
if (is.logical(lower)) lower = as.integer(lower) # typically NA (which is logical type)
55
if (is.logical(upper)) upper = as.integer(upper) # typically NA (which is logical type)
@@ -16,15 +16,12 @@ between = function(x, lower, upper, incbounds=TRUE, NAbounds=TRUE, check=FALSE)
1616
stopifnot(is.px(x), is.px(lower), is.px(upper)) # nocov # internal
1717
}
1818
# POSIX check timezone match
19-
if (is.px(x) && is.px(lower) && is.px(upper)) {
20-
tzs = sapply(list(x,lower,upper), function(x) {
21-
attr(x, "tzone", exact=TRUE) %||% ""
22-
})
19+
if (!ignore_tzone && is.px(x) && is.px(lower) && is.px(upper)) {
20+
tzs = vapply_1c(list(x, lower, upper), function(x) attr(x, "tzone", exact=TRUE) %||% "")
2321
# lower/upper should be more tightly linked than x/lower, so error
2422
# if the former don't match but only inform if the latter don't
2523
if (tzs[2L]!=tzs[3L]) {
2624
stopf("'between' lower= and upper= are both POSIXct but have different tzone attributes: %s. Please align their time zones.", brackify(tzs[2:3], quote=TRUE))
27-
# otherwise the check in between.c that lower<=upper can (correctly) fail for this reason
2825
}
2926
if (tzs[1L]!=tzs[2L]) {
3027
messagef("'between' arguments are all POSIXct but have mismatched tzone attributes: %s. The UTC times will be compared.", brackify(tzs, quote=TRUE))

inst/tests/tests.Rraw

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -21134,19 +21134,26 @@ test(2315.1, tail(DT[order(i), i], 2L), 1:2)
2113421134
DT[1L, i := 1000L]
2113521135
test(2315.2, tail(DT[order(i), i], 2L), c(1L, 1000L))
2113621136

21137+
# issue #6898, test that tzone behavior changes with ignore_tzone=TRUE
21138+
tms = list(.POSIXct(1), .POSIXct(1.0, "UTC"))
21139+
test(2316.1, between(tms[[1]], tms[[1L]], tms[[2L]]), error = "different tzone attributes")
21140+
test(2316.2, between(tms[[1]], tms[[1L]], tms[[2L]], ignore_tzone=TRUE))
21141+
test(2316.3, between(tms[[1]], tms[[2L]], tms[[2L]]), message = "mismatched tzone attributes")
21142+
test(2316.4, between(tms[[1]], tms[[2L]], tms[[2L]], ignore_tzone=TRUE))
21143+
2113721144
# tbl in i still allows 'i.' prefix reference for update join, #6998
2113821145
DT1 = data.table(a=1, b=2)
2113921146
DT2 = data.table(a=1, c=3)
2114021147
DF1 = data.frame(a=1, d=4)
2114121148
DF2 = data.frame(a=1, e=5)
2114221149
class(DF2) = c("tbl_df", "tbl", "data.frame")
2114321150

21144-
test(2316.1, DT1[DT2, on='a', c := i.c]$c, 3)
21145-
test(2316.2, DT1[DT2, on='a', c2 := x.a + i.c]$c2, 4)
21146-
test(2316.3, DT1[DT2, on='a', .(c = x.a + i.c)]$c, 4)
21147-
test(2316.4, DT1[DF1, on='a', d := i.d]$d, 4)
21148-
test(2316.5, DT1[DF1, on='a', d2 := x.a + i.d]$d2, 5)
21149-
test(2316.6, DT1[DF1, on='a', .(d = x.a + i.d)]$d, 5)
21150-
test(2316.7, DT1[DF2, on='a', e := i.e]$e, 5)
21151-
test(2316.8, DT1[DF2, on='a', e2 := x.a + i.e]$e2, 6)
21152-
test(2316.9, DT1[DF2, on='a', .(e = x.a + i.e)]$e, 6)
21151+
test(2317.1, DT1[DT2, on='a', c := i.c]$c, 3)
21152+
test(2317.2, DT1[DT2, on='a', c2 := x.a + i.c]$c2, 4)
21153+
test(2317.3, DT1[DT2, on='a', .(c = x.a + i.c)]$c, 4)
21154+
test(2317.4, DT1[DF1, on='a', d := i.d]$d, 4)
21155+
test(2317.5, DT1[DF1, on='a', d2 := x.a + i.d]$d2, 5)
21156+
test(2317.6, DT1[DF1, on='a', .(d = x.a + i.d)]$d, 5)
21157+
test(2317.7, DT1[DF2, on='a', e := i.e]$e, 5)
21158+
test(2317.8, DT1[DF2, on='a', e2 := x.a + i.e]$e2, 6)
21159+
test(2317.9, DT1[DF2, on='a', .(e = x.a + i.e)]$e, 6)

man/between.Rd

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ This can be changed by setting \code{NAbounds} to \code{NA}.
1616
the intervals provided in \code{lower,upper}.
1717
}
1818
\usage{
19-
between(x, lower, upper, incbounds=TRUE, NAbounds=TRUE, check=FALSE)
19+
between(x, lower, upper, incbounds=TRUE, NAbounds=TRUE, check=FALSE, ignore_tzone=FALSE)
2020
x \%between\% y
2121

2222
inrange(x, lower, upper, incbounds=TRUE)
@@ -35,6 +35,7 @@ interpreted as \code{lower} and \code{y[[2]]} as \code{upper}.}
3535
It is set to \code{TRUE} by default for infix notations.}
3636
\item{NAbounds}{ If \code{lower} (\code{upper}) contains an \code{NA} what should \code{lower<=x} (\code{x<=upper}) return? By default \code{TRUE} so that a missing bound is interpreted as unlimited. }
3737
\item{check}{ Produce error if \code{any(lower>upper)}? \code{FALSE} by default for efficiency, in particular type \code{character}. }
38+
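\item{ignore_tzone}{ \code{TRUE} means skip the timezone checks among \code{x}, \code{lower}, and \code{upper}. By default (\code{FALSE}), when all three are \code{POSIXct}, mismatched \code{tzone} attributes on \code{lower} and \code{upper} raise an error, and a mismatch between \code{x} and \code{lower} produces a message. }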
3839
}
3940
\details{
4041

src/assign.c

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -400,7 +400,7 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values)
400400
// FR #2077 - set able to add new cols by reference
401401
if (isString(cols)) {
402402
PROTECT(tmp = chmatch(cols, names, 0)); protecti++;
403-
buf = (int *) R_alloc(length(cols), sizeof(int));
403+
buf = (int *) R_alloc(length(cols), sizeof(*buf));
404404
int k=0;
405405
for (int i=0; i<length(cols); ++i) {
406406
if (INTEGER(tmp)[i] == 0) buf[k++] = i;
@@ -699,7 +699,7 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values)
699699
}
700700
if (ndelete) {
701701
// delete any columns assigned NULL (there was a 'continue' earlier in loop above)
702-
int *tt = (int *)R_alloc(ndelete, sizeof(int));
702+
int *tt = (int *)R_alloc(ndelete, sizeof(*tt));
703703
const int *colsd=INTEGER(cols), ncols=length(cols), ndt=length(dt);
704704
for (int i=0, k=0; i<ncols; ++i) { // find which ones to delete and put them in tt
705705
// Aside: a new column being assigned NULL (something odd to do) would have been warned above, added above, and now deleted. Just
@@ -1055,7 +1055,7 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con
10551055
case RAWSXP: BODY(Rbyte, RAW, int, val!=0, td[i]=cval)
10561056
case LGLSXP:
10571057
if (mc) {
1058-
memcpy(td, LOGICAL_RO(source), slen*sizeof(int)); break;
1058+
memcpy(td, LOGICAL_RO(source), slen*sizeof(*td)); break;
10591059
} else BODY(int, LOGICAL, int, val, td[i]=cval)
10601060
case INTSXP: BODY(int, INTEGER, int, val==NA_INTEGER ? NA_LOGICAL : val!=0, td[i]=cval)
10611061
case REALSXP:
@@ -1072,7 +1072,7 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con
10721072
case LGLSXP: // same as INTSXP ...
10731073
case INTSXP:
10741074
if (mc) {
1075-
memcpy(td, INTEGER_RO(source), slen*sizeof(int)); break;
1075+
memcpy(td, INTEGER_RO(source), slen*sizeof(*td)); break;
10761076
} else BODY(int, INTEGER, int, val, td[i]=cval)
10771077
case REALSXP:
10781078
if (sourceIsI64)
@@ -1092,7 +1092,7 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con
10921092
case REALSXP:
10931093
if (sourceIsI64) {
10941094
if (mc) {
1095-
memcpy(td, (const int64_t *)REAL_RO(source), slen*sizeof(int64_t)); break;
1095+
memcpy(td, (const int64_t *)REAL_RO(source), slen*sizeof(*td)); break;
10961096
} else BODY(int64_t, REAL, int64_t, val, td[i]=cval)
10971097
} else BODY(double, REAL, int64_t, within_int64_repres(val) ? val : NA_INTEGER64, td[i]=cval)
10981098
case CPLXSXP: BODY(Rcomplex, COMPLEX, int64_t, ISNAN(val.r) ? NA_INTEGER64 : (int64_t)val.r, td[i]=cval)
@@ -1291,14 +1291,14 @@ void savetl(SEXP s)
12911291
internal_error(__func__, "reached maximum %d items for savetl", nalloc); // # nocov
12921292
}
12931293
nalloc = nalloc>(INT_MAX/2) ? INT_MAX : nalloc*2;
1294-
char *tmp = (char *)realloc(saveds, nalloc*sizeof(SEXP));
1294+
char *tmp = realloc(saveds, sizeof(SEXP)*nalloc);
12951295
if (tmp==NULL) {
12961296
// C spec states that if realloc() fails the original block is left untouched; it is not freed or moved. We rely on that here.
12971297
savetl_end(); // # nocov free(saveds) happens inside savetl_end
12981298
error(_("Failed to realloc saveds to %d items in savetl"), nalloc); // # nocov
12991299
}
13001300
saveds = (SEXP *)tmp;
1301-
tmp = (char *)realloc(savedtl, nalloc*sizeof(R_len_t));
1301+
tmp = realloc(savedtl, sizeof(R_len_t)*nalloc);
13021302
if (tmp==NULL) {
13031303
savetl_end(); // # nocov
13041304
error(_("Failed to realloc savedtl to %d items in savetl"), nalloc); // # nocov
@@ -1335,4 +1335,3 @@ SEXP setcharvec(SEXP x, SEXP which, SEXP newx)
13351335
}
13361336
return R_NilValue;
13371337
}
1338-

src/chmatch.c

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -96,8 +96,8 @@ static SEXP chmatchMain(SEXP x, SEXP table, int nomatch, bool chin, bool chmatch
9696
// For example: A,B,C,B,D,E,A,A => A(TL=1),B(2),C(3),D(4),E(5) => dupMap 1 2 3 5 6 | 8 7 4
9797
// dupLink 7 8 | 6 (blank=0)
9898
unsigned int mapsize = tablelen+nuniq; // lto compilation warning #5760 // +nuniq to store a 0 at the end of each group
99-
int *counts = (int *)calloc(nuniq, sizeof(int));
100-
int *map = (int *)calloc(mapsize, sizeof(int));
99+
int *counts = calloc(nuniq, sizeof(*counts));
100+
int *map = calloc(mapsize, sizeof(*map));
101101
if (!counts || !map) {
102102
// # nocov start
103103
free(counts); free(map);
@@ -169,4 +169,3 @@ system.time(ans2 <- .Call("Cchmatch2", x,y,0L)) # 0.17sec as of 1.12.0 and
169169
system.time(ans3 <- chmatchdup(x,y,0L)) # 0.09sec from 1.12.2; but goal wasn't speed rather simplified code; e.g. rbindlist.c down from 960 to 360 lines
170170
identical(ans2,ans3) # test 2000
171171
*/
172-

src/cj.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ SEXP cj(SEXP base_list) {
3636
}
3737
#pragma omp parallel for num_threads(getDTthreads(ncopy*blocklen, true))
3838
for (int i=1; i<ncopy; ++i) {
39-
memcpy(targetP + i*blocklen, targetP, blocklen*sizeof(int));
39+
memcpy(targetP + i*blocklen, targetP, blocklen*sizeof(*targetP));
4040
}
4141
} break;
4242
case REALSXP: {
@@ -99,4 +99,3 @@ SEXP cj(SEXP base_list) {
9999
UNPROTECT(1);
100100
return out;
101101
}
102-

0 commit comments

Comments
 (0)