Commit 4ffc262 ("2nd pass")

Parent: a01ac2f

9 files changed, 17 insertions(+), 17 deletions(-)


.dev/CRAN_Release.cmd

Lines changed: 2 additions & 2 deletions
@@ -562,9 +562,9 @@ ls -1 *.tar.gz | grep -E 'Chicago|dada2|flowWorkspace|LymphoSeq' | TZ='UTC' para
 # 3) dllVersion() at the end of init.c
 # DO NOT push to GitHub's master branch. Prevents even a slim possibility of user getting premature version.
 # Even release numbers must have been obtained from CRAN and only CRAN. There were too many support problems in the past before this procedure was brought in.
-du -k inst/tests # 1.5MB before
+du -k inst/tests # 1.5MiB before
 bzip2 inst/tests/*.Rraw # compress *.Rraw just for release to CRAN; do not commit compressed *.Rraw to git
-du -k inst/tests # 0.75MB after
+du -k inst/tests # 0.75MiB after
 R CMD build .
 export GITHUB_PAT="f1c.. github personal access token ..7ad"
 Rdevel -q -e "packageVersion('xml2')" # ensure installed
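
Aside on the unit change driving this commit: `du -k` counts 1024-byte blocks, so the sizes in these comments are binary units and MiB is the accurate spelling. A quick arithmetic check, in illustrative R (not part of the release script; the 1536 reading is hypothetical):

bytes = 1536 * 1024  # a `du -k` total of 1536 means 1536 KiB
bytes / 1024^2       # 1.5   -> "1.5MiB", matching the new comment
bytes / 1000^2       # ~1.57 -> the old "1.5MB" was the looser decimal unit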

.dev/revdep.R

Lines changed: 1 addition & 1 deletion
@@ -67,7 +67,7 @@ options(repos = BiocManager::repositories())
 
 options(warn=1) # warning at the time so we can more easily see what's going on package by package when we scroll through output
 cat("options()$timeout==", options()$timeout," set by R_DEFAULT_INTERNET_TIMEOUT in .dev/.bash_aliases revdepsh\n",sep="")
-# R's default is 60. Before Dec 2020, we used 300 but that wasn't enough to download Bioc package BSgenome.Hsapiens.UCSC.hg19 (677GB) which is
+# R's default is 60. Before Dec 2020, we used 300 but that wasn't enough to download Bioc package BSgenome.Hsapiens.UCSC.hg19 (677GiB) which is
 # suggested by CRAN package CNVScope which imports data.table. From Dec 2020 we use 3600.
 
 if (is.null(utils::old.packages(.libPaths()[2]))) {
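
The comment above concerns R's download timeout. A minimal sketch of the same setting applied directly in a session (illustrative; the revdep scripts set it via the R_DEFAULT_INTERNET_TIMEOUT environment variable instead):

getOption("timeout")     # 60 in a vanilla R session, per the comment above
options(timeout = 3600)  # the value used since Dec 2020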

.gitlab-ci.yml

Lines changed: 1 addition & 1 deletion
@@ -151,7 +151,7 @@ test-lin-rel-cran:
     _R_CHECK_CRAN_INCOMING_: "TRUE" ## stricter --as-cran checks should run in dev pipelines continuously (not sure what they are though)
     _R_CHECK_CRAN_INCOMING_REMOTE_: "FALSE" ## Other than no URL checking (takes many minutes) or 'Days since last update 0' NOTEs needed, #3284
     _R_CHECK_CRAN_INCOMING_TARBALL_THRESHOLD_: "7500000" ## bytes
-    _R_CHECK_PKG_SIZES_THRESHOLD_: "10" ## MB 'checking installed package size' NOTE increased due to po
+    _R_CHECK_PKG_SIZES_THRESHOLD_: "10" ## MiB 'checking installed package size' NOTE increased due to po
   script:
     - *install-deps
     - echo 'CFLAGS=-g -O2 -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' > ~/.R/Makevars
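
These _R_CHECK_* settings are plain environment variables that R CMD check reads. A sketch of mirroring the CI values in a local R session before launching check from that same session, e.g. via system() (illustrative only; the pipeline sets them in the YAML above):

Sys.setenv(
  "_R_CHECK_CRAN_INCOMING_" = "TRUE",
  "_R_CHECK_CRAN_INCOMING_REMOTE_" = "FALSE",
  "_R_CHECK_CRAN_INCOMING_TARBALL_THRESHOLD_" = "7500000",  # bytes
  "_R_CHECK_PKG_SIZES_THRESHOLD_" = "10"                    # MiB, per the change above
)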

DESCRIPTION

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ Title: Extension of `data.frame`
 Depends: R (>= 3.4.0)
 Imports: methods
 Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), R.utils, xts, zoo (>= 1.8-1), yaml, knitr, markdown
-Description: Fast aggregation of large data (e.g. 100GB in RAM), fast ordered joins, fast add/modify/delete of columns by group using no copies at all, list columns, friendly and fast character-separated-value read/write. Offers a natural and flexible syntax, for faster development.
+Description: Fast aggregation of large data (e.g. 100GiB in RAM), fast ordered joins, fast add/modify/delete of columns by group using no copies at all, list columns, friendly and fast character-separated-value read/write. Offers a natural and flexible syntax, for faster development.
 License: MPL-2.0 | file LICENSE
 URL: https://r-datatable.com, https://Rdatatable.gitlab.io/data.table, https://github.com/Rdatatable/data.table
 BugReports: https://github.com/Rdatatable/data.table/issues
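
A toy-scale sketch of the grouped aggregation the Description advertises (the names g, v and total are illustrative, not from the package):

library(data.table)
DT = data.table(g = sample(letters, 1e6, TRUE), v = runif(1e6))
DT[, .(total = sum(v)), by = g]  # grouped sum; the same syntax scales to RAM-sized data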

R/data.table.R

Lines changed: 1 addition & 1 deletion
@@ -2237,7 +2237,7 @@ tail.data.table = function(x, n=6L, ...) {
 
 "[<-.data.table" = function(x, i, j, value) {
   # [<- is provided for consistency, but := is preferred as it allows by group and by reference to subsets of columns
-  # with no copy of the (very large, say 10GB) columns at all. := is like an UPDATE in SQL and we like and want two symbols to change.
+  # with no copy of the (very large, say 10GiB) columns at all. := is like an UPDATE in SQL and we like and want two symbols to change.
   if (!cedta()) {
     x = if (nargs()<4L) `[<-.data.frame`(x, i, value=value)
         else `[<-.data.frame`(x, i, j, value)
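
To make the comment concrete, a small sketch contrasting the two forms (toy table; column names illustrative):

library(data.table)
DT = data.table(x = 1:5, y = 6:10)
DT[x > 2L, y := 0L]  # := assigns by reference within the subset; no column is copied
DT[2, "y"] <- 99L    # [<- also works, but copies, which is why := is preferred above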

R/duplicated.R

Lines changed: 1 addition & 1 deletion
@@ -49,7 +49,7 @@ unique.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=seq_alon
 
 # Test for #2013 unique() memory efficiency improvement in v1.10.5
 # set.seed(1)
-# Create unique 7.6GB DT on 16GB laptop
+# Create unique 7.6GiB DT on 16GiB laptop
 # DT = data.table(
 #   A = sample(1e8, 2e8, TRUE),
 #   B = sample(1e8, 2e8, TRUE),
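
The commented-out test above exercises unique() at 7.6GiB scale; the same calls at toy scale, using the by argument visible in the hunk header:

library(data.table)
DT = data.table(A = sample(5L, 20L, TRUE), B = sample(5L, 20L, TRUE))
unique(DT)            # unique rows judged on all columns (the default)
unique(DT, by = "A")  # uniqueness judged on column A only; first occurrence kept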

README.md

Lines changed: 1 addition & 1 deletion
@@ -45,7 +45,7 @@ pay for developer time, professional services, travel, workshops, and a variety
 * fast and friendly delimited **file reader**: **[`?fread`](https://rdatatable.gitlab.io/data.table/reference/fread.html)**, see also [convenience features for _small_ data](https://github.com/Rdatatable/data.table/wiki/Convenience-features-of-fread)
 * fast and feature rich delimited **file writer**: **[`?fwrite`](https://rdatatable.gitlab.io/data.table/reference/fwrite.html)**
 * low-level **parallelism**: many common operations are internally parallelized to use multiple CPU threads
-* fast and scalable aggregations; e.g. 100GB in RAM (see [benchmarks](https://duckdblabs.github.io/db-benchmark/) on up to **two billion rows**)
+* fast and scalable aggregations; e.g. 100GiB in RAM (see [benchmarks](https://duckdblabs.github.io/db-benchmark/) on up to **two billion rows**)
 * fast and feature rich joins: **ordered joins** (e.g. rolling forwards, backwards, nearest and limited staleness), **[overlapping range joins](https://github.com/Rdatatable/data.table/wiki/talks/EARL2014_OverlapRangeJoin_Arun.pdf)** (similar to `IRanges::findOverlaps`), **[non-equi joins](https://github.com/Rdatatable/data.table/wiki/talks/ArunSrinivasanUseR2016.pdf)** (i.e. joins using operators `>, >=, <, <=`), **aggregate on join** (`by=.EACHI`), **update on join**
 * fast add/update/delete columns **by reference** by group using no copies at all
 * fast and feature rich **reshaping** data: **[`?dcast`](https://rdatatable.gitlab.io/data.table/reference/dcast.data.table.html)** (_pivot/wider/spread_) and **[`?melt`](https://rdatatable.gitlab.io/data.table/reference/melt.data.table.html)** (_unpivot/longer/gather_)
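
A minimal sketch of the reshaping pair the last bullet names (toy data; column names illustrative):

library(data.table)
DT = data.table(id = 1:2, a = 3:4, b = 5:6)
long = melt(DT, id.vars = "id")  # unpivot/longer: one row per id/variable pair
dcast(long, id ~ variable)       # pivot/wider: back to the original shape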

inst/tests/benchmark.Rraw

Lines changed: 8 additions & 8 deletions
@@ -24,7 +24,7 @@ test(476, nrow(as.matrix(ans)), 2L*N)
 
 # Test that as.list.data.table no longer copies via unclass, so speeding up sapply(DT,class) and lapply(.SD,...) etc, #2000
 N = 1e6
-DT = data.table(a=1:N,b=1:N,c=1:N,d=1:N) # 15MB in dev testing, but test with N=1e7
+DT = data.table(a=1:N,b=1:N,c=1:N,d=1:N) # 15MiB in dev testing, but test with N=1e7
 test(603, system.time(sapply(DT,class))["user.self"] < 0.1)
 
 
@@ -96,7 +96,7 @@ local({
 
 # fwrite showProgress test 1735. Turned off as too long/big for CRAN.
 if (FALSE) {
-  N = 6e8 # apx 6GB
+  N = 6e8 # apx 6GiB
   DT = data.table(C1=sample(100000,N,replace=TRUE), C2=sample(paste0(LETTERS,LETTERS,LETTERS), N, replace=TRUE))
   gc()
   d = "/dev/shm/"
@@ -232,15 +232,15 @@ DT = data.table(A=rep(1:2,c(100000,1)), B=runif(100001))
 before = gc()["Vcells",2]
 for (i in 1:50) DT[, sum(B), by=A]
 after = gc()["Vcells",2]
-test(1157, after < before+3) # +3 = 3MB
-# Before the patch, Vcells grew dramatically from 6MB to 60MB. Now stable at 6MB. Increase 50 to 1000 and it grew to over 1GB for this case.
+test(1157, after < before+3) # +3 = 3MiB
+# Before the patch, Vcells grew dramatically from 6MiB to 60MiB. Now stable at 6MiB. Increase 50 to 1000 and it grew to over 1GiB for this case.
 
 # Similar for when dogroups writes less rows than allocated, #2648.
 DT = data.table(k = 1:50, g = 1:20, val = rnorm(1e4))
 before = gc()["Vcells",2]
 for (i in 1:50) DT[ , unlist(.SD), by = 'k']
 after = gc()["Vcells",2]
-test(1158, after < before+3) # 177.6MB => 179.2MB. Needs to be +3 now from v1.9.8 with alloccol up from 100 to 1024
+test(1158, after < before+3) # 177.6MiB => 179.2MiB. Needs to be +3 now from v1.9.8 with alloccol up from 100 to 1024
 
 # fix DT[TRUE, :=] using too much working memory for i, #1249
 if (!inherits(try(Rprofmem(NULL), silent=TRUE), "try-error")) { # in case R not compiled with memory profiling enabled
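
Tests 1157 and 1158 above share one leak-test pattern: snapshot gc()'s Vcells figure, run the suspect operation in a loop, snapshot again, and assert bounded growth. Condensed (reusing the DT with columns A and B defined in the hunk header):

before = gc()["Vcells", 2]          # the '(Mb)' column, after a collection
for (i in 1:50) DT[, sum(B), by=A]  # the operation under suspicion
after = gc()["Vcells", 2]
stopifnot(after < before + 3)       # ~3MiB slack; a real leak grows far more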
@@ -311,7 +311,7 @@ unlink(f)
 # test no memory leak, #2191 and #2284
 # These take a few seconds each, and it's important to run these on CRAN to check no leak
 gc(); before = gc()["Vcells","(Mb)"]
-for (i in 1:2000) { DT = data.table(1:3); rm(DT) } # in 1.8.2 would leak 3MB
+for (i in 1:2000) { DT = data.table(1:3); rm(DT) } # in 1.8.2 would leak 3MiB
 gc(); after = gc()["Vcells","(Mb)"]
 test(861, after < before+0.5) # close to 0.0 difference, but 0.5 for safe margin
 gc(); before = gc()["Vcells","(Mb)"]
@@ -327,7 +327,7 @@ test(863, after < before+0.5)
 
 # fread should use multiple threads on single column input.
 # tests 2 threads; the very reasonable limit on CRAN
-# file needs to be reasonably large for threads to kick in (minimum chunkSize is 1MB currently)
+# file needs to be reasonably large for threads to kick in (minimum chunkSize is 1MiB currently)
 if (getDTthreads() == 1L) {
   cat("Test 1760 not run because this session either has no OpenMP or has been limited to one thread (e.g. under UBSAN and ASAN)\n")
 } else {
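
For reference, the guard above uses data.table's thread controls:

library(data.table)
getDTthreads()    # how many OpenMP threads data.table will use in this session
setDTthreads(2L)  # cap at 2 threads, the limit the comment calls reasonable on CRAN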
@@ -369,7 +369,7 @@ for(i in 1:100) {
 gc() # extra gc() (i.e. two including the one on next line) seems to reduce `after`
 # from 29.7 to 27.2 (exactly `before`). Keeping the extra gc() as no harm.
 after = sum(gc()[, 2])
-test(1912.1, after < before + 10) # 10MB very wide margin. With the gc race, heap usage grew much more which is all we're testing here (no blow up).
+test(1912.1, after < before + 10) # 10MiB very wide margin. With the gc race, heap usage grew much more which is all we're testing here (no blow up).
 #
 before = sum(gc()[, 2])
 fff = function(aref) {

inst/tests/tests.Rraw

Lines changed: 1 addition & 1 deletion
@@ -5468,7 +5468,7 @@ test(1333.2, fread('A,B\nfoo,1\n"Analyst\\" ,2\nbar,3', strip.white=FALSE), data
 test(1334, fread('A,B\nfoo,1\n"Analyst\\" ,",2\nbar,3'), data.table(A=c('foo', 'Analyst\\" ,', 'bar'), B=1:3))
 test(1335, fread('A,B\nfoo,1\n"Analyst\\\\",2\nbar,3'), data.table(A=c('foo','Analyst\\\\','bar'), B=1:3))
 
-# data from 12GB file in comments on http://stackoverflow.com/a/23858323/403310 ...
+# data from 12GiB file in comments on http://stackoverflow.com/a/23858323/403310 ...
 # note that read.csv gets this wrong and puts jacoleman high school into the previous field, then fills the rest of the line silently.
 cat('A,B,C,D,E,F
 "12",0,"teacher private nfp\\\\\\\\"",""jacoleman high school","",""
