Skip to content

Commit 1e2e3ba

Browse files
Use IEC memory unit abbreviations (#7126)
* first pass * 2nd pass * finish * j/k, now finished * Corrected Bioc package size * fix test broken on head * revert GiB->GB in DESCRIPTION * might as well keep README in sync too
1 parent 4773ef9 commit 1e2e3ba

20 files changed

+61
-59
lines changed

.dev/CRAN_Release.cmd

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -562,9 +562,9 @@ ls -1 *.tar.gz | grep -E 'Chicago|dada2|flowWorkspace|LymphoSeq' | TZ='UTC' para
562562
# 3) dllVersion() at the end of init.c
563563
# DO NOT push to GitHub's master branch. Prevents even a slim possibility of user getting premature version.
564564
# Even release numbers must have been obtained from CRAN and only CRAN. There were too many support problems in the past before this procedure was brought in.
565-
du -k inst/tests # 1.5MB before
565+
du -k inst/tests # 1.5MiB before
566566
bzip2 inst/tests/*.Rraw # compress *.Rraw just for release to CRAN; do not commit compressed *.Rraw to git
567-
du -k inst/tests # 0.75MB after
567+
du -k inst/tests # 0.75MiB after
568568
R CMD build .
569569
export GITHUB_PAT="f1c.. github personal access token ..7ad"
570570
Rdevel -q -e "packageVersion('xml2')" # ensure installed

.dev/revdep.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ options(repos = BiocManager::repositories())
6767

6868
options(warn=1) # warning at the time so we can more easily see what's going on package by package when we scroll through output
6969
cat("options()$timeout==", options()$timeout," set by R_DEFAULT_INTERNET_TIMEOUT in .dev/.bash_aliases revdepsh\n",sep="")
70-
# R's default is 60. Before Dec 2020, we used 300 but that wasn't enough to download Bioc package BSgenome.Hsapiens.UCSC.hg19 (677GB) which is
70+
# R's default is 60. Before Dec 2020, we used 300 but that wasn't enough to download Bioc package BSgenome.Hsapiens.UCSC.hg19 (677MiB) which is
7171
# suggested by CRAN package CNVScope which imports data.table. From Dec 2020 we use 3600.
7272

7373
if (is.null(utils::old.packages(.libPaths()[2]))) {

.gitlab-ci.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ test-lin-rel-cran:
151151
_R_CHECK_CRAN_INCOMING_: "TRUE" ## stricter --as-cran checks should run in dev pipelines continuously (not sure what they are though)
152152
_R_CHECK_CRAN_INCOMING_REMOTE_: "FALSE" ## Other than no URL checking (takes many minutes) or 'Days since last update 0' NOTEs needed, #3284
153153
_R_CHECK_CRAN_INCOMING_TARBALL_THRESHOLD_: "7500000" ## bytes
154-
_R_CHECK_PKG_SIZES_THRESHOLD_: "10" ## MB 'checking installed package size' NOTE increased due to po
154+
_R_CHECK_PKG_SIZES_THRESHOLD_: "10" ## MiB 'checking installed package size' NOTE increased due to po
155155
script:
156156
- *install-deps
157157
- echo 'CFLAGS=-g -O2 -fopenmp -Wall -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2' > ~/.R/Makevars

R/data.table.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2237,7 +2237,7 @@ tail.data.table = function(x, n=6L, ...) {
22372237

22382238
"[<-.data.table" = function(x, i, j, value) {
22392239
# [<- is provided for consistency, but := is preferred as it allows by group and by reference to subsets of columns
2240-
# with no copy of the (very large, say 10GB) columns at all. := is like an UPDATE in SQL and we like and want two symbols to change.
2240+
# with no copy of the (very large, say 10GiB) columns at all. := is like an UPDATE in SQL and we like and want two symbols to change.
22412241
if (!cedta()) {
22422242
x = if (nargs()<4L) `[<-.data.frame`(x, i, value=value)
22432243
else `[<-.data.frame`(x, i, j, value)

R/duplicated.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ unique.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=seq_alon
4949

5050
# Test for #2013 unique() memory efficiency improvement in v1.10.5
5151
# set.seed(1)
52-
# Create unique 7.6GB DT on 16GB laptop
52+
# Create unique 7.6GiB DT on 16GiB laptop
5353
# DT = data.table(
5454
# A = sample(1e8, 2e8, TRUE),
5555
# B = sample(1e8, 2e8, TRUE),

R/test.data.table.R

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -277,13 +277,13 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F
277277
y = head(order(-diff(timings$RSS)), 10L)
278278
ans = timings[, diff := c(NA_real_, round(diff(RSS), 1L))][y + 1L]
279279
ans[, time:=NULL] # time is distracting and influenced by gc() calls; just focus on RAM usage here
280-
catf("10 largest RAM increases (MB); see plot for cumulative effect (if any)\n")
280+
catf("10 largest RAM increases (MiB); see plot for cumulative effect (if any)\n")
281281
print(ans, class=FALSE)
282282
get("dev.new")(width=14.0, height=7.0)
283283
get("par")(mfrow=1:2)
284-
get("plot")(timings$RSS, main=paste(basename(fn),"\nylim[0]=0 for context"), ylab="RSS (MB)", ylim=c(0.0, max(timings$RSS)))
284+
get("plot")(timings$RSS, main=paste(basename(fn),"\nylim[0]=0 for context"), ylab="RSS (MiB)", ylim=c(0.0, max(timings$RSS)))
285285
get("mtext")(lastRSS<-as.integer(ceiling(last(timings$RSS))), side=4L, at=lastRSS, las=1L, font=2L)
286-
get("plot")(timings$RSS, main=paste(basename(fn),"\nylim=range for inspection"), ylab="RSS (MB)")
286+
get("plot")(timings$RSS, main=paste(basename(fn),"\nylim=range for inspection"), ylab="RSS (MiB)")
287287
get("mtext")(lastRSS, side=4L, at=lastRSS, las=1L, font=2L)
288288
}
289289

@@ -316,7 +316,7 @@ INT = function(...) { as.integer(c(...)) } # utility used in tests.Rraw
316316

317317
gc_mem = function() {
318318
# nocov start
319-
# gc reports memory in MB
319+
# gc reports memory in MiB
320320
m = colSums(gc()[, c(2L, 4L, 6L)])
321321
names(m) = c("GC_used", "GC_gc_trigger", "GC_max_used")
322322
m

R/utils.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -212,10 +212,10 @@ edit.data.table = function(name, ...) {
212212

213213
rss = function() { #5515 #5517
214214
# nocov start
215-
cmd = paste0("ps -o rss --no-headers ", Sys.getpid()) # ps returns KB
215+
cmd = paste0("ps -o rss --no-headers ", Sys.getpid()) # ps returns KiB
216216
ans = tryCatch(as.numeric(system(cmd, intern=TRUE)), warning=function(w) NA_real_, error=function(e) NA_real_)
217217
if (length(ans)!=1L || !is.numeric(ans)) ans=NA_real_ # just in case
218-
round(ans / 1024.0, 1L) # return MB
218+
round(ans / 1024.0, 1L) # return MiB
219219
# nocov end
220220
}
221221

inst/tests/benchmark.Rraw

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ test(476, nrow(as.matrix(ans)), 2L*N)
2424

2525
# Test that as.list.data.table no longer copies via unclass, so speeding up sapply(DT,class) and lapply(.SD,...) etc, #2000
2626
N = 1e6
27-
DT = data.table(a=1:N,b=1:N,c=1:N,d=1:N) # 15MB in dev testing, but test with N=1e7
27+
DT = data.table(a=1:N,b=1:N,c=1:N,d=1:N) # 15MiB in dev testing, but test with N=1e7
2828
test(603, system.time(sapply(DT,class))["user.self"] < 0.1)
2929

3030

@@ -96,7 +96,7 @@ local({
9696

9797
# fwrite showProgress test 1735. Turned off as too long/big for CRAN.
9898
if (FALSE) {
99-
N = 6e8 # apx 6GB
99+
N = 6e8 # apx 6GiB
100100
DT = data.table(C1=sample(100000,N,replace=TRUE), C2=sample(paste0(LETTERS,LETTERS,LETTERS), N, replace=TRUE))
101101
gc()
102102
d = "/dev/shm/"
@@ -232,15 +232,15 @@ DT = data.table(A=rep(1:2,c(100000,1)), B=runif(100001))
232232
before = gc()["Vcells",2]
233233
for (i in 1:50) DT[, sum(B), by=A]
234234
after = gc()["Vcells",2]
235-
test(1157, after < before+3) # +3 = 3MB
236-
# Before the patch, Vcells grew dramatically from 6MB to 60MB. Now stable at 6MB. Increase 50 to 1000 and it grew to over 1GB for this case.
235+
test(1157, after < before+3) # +3 = 3MiB
236+
# Before the patch, Vcells grew dramatically from 6MiB to 60MiB. Now stable at 6MiB. Increase 50 to 1000 and it grew to over 1GiB for this case.
237237

238238
# Similar for when dogroups writes less rows than allocated, #2648.
239239
DT = data.table(k = 1:50, g = 1:20, val = rnorm(1e4))
240240
before = gc()["Vcells",2]
241241
for (i in 1:50) DT[ , unlist(.SD), by = 'k']
242242
after = gc()["Vcells",2]
243-
test(1158, after < before+3) # 177.6MB => 179.2MB. Needs to be +3 now from v1.9.8 with alloccol up from 100 to 1024
243+
test(1158, after < before+3) # 177.6MiB => 179.2MiB. Needs to be +3 now from v1.9.8 with alloccol up from 100 to 1024
244244

245245
# fix DT[TRUE, :=] using too much working memory for i, #1249
246246
if (!inherits(try(Rprofmem(NULL), silent=TRUE), "try-error")) { # in case R not compiled with memory profiling enabled
@@ -311,7 +311,7 @@ unlink(f)
311311
# test no memory leak, #2191 and #2284
312312
# These take a few seconds each, and it's important to run these on CRAN to check no leak
313313
gc(); before = gc()["Vcells","(Mb)"]
314-
for (i in 1:2000) { DT = data.table(1:3); rm(DT) } # in 1.8.2 would leak 3MB
314+
for (i in 1:2000) { DT = data.table(1:3); rm(DT) } # in 1.8.2 would leak 3MiB
315315
gc(); after = gc()["Vcells","(Mb)"]
316316
test(861, after < before+0.5) # close to 0.0 difference, but 0.5 for safe margin
317317
gc(); before = gc()["Vcells","(Mb)"]
@@ -327,7 +327,7 @@ test(863, after < before+0.5)
327327

328328
# fread should use multiple threads on single column input.
329329
# tests 2 threads; the very reasonable limit on CRAN
330-
# file needs to be reasonably large for threads to kick in (minimum chunkSize is 1MB currently)
330+
# file needs to be reasonably large for threads to kick in (minimum chunkSize is 1MiB currently)
331331
if (getDTthreads() == 1L) {
332332
cat("Test 1760 not run because this session either has no OpenMP or has been limited to one thread (e.g. under UBSAN and ASAN)\n")
333333
} else {
@@ -369,7 +369,7 @@ for(i in 1:100) {
369369
gc() # extra gc() (i.e. two including the one on next line) seems to reduce `after`
370370
# from 29.7 to 27.2 (exactly `before`). Keeping the extra gc() as no harm.
371371
after = sum(gc()[, 2])
372-
test(1912.1, after < before + 10) # 10MB very wide margin. With the gc race, heap usage grew much more which is all we're testing here (no blow up).
372+
test(1912.1, after < before + 10) # 10MiB very wide margin. With the gc race, heap usage grew much more which is all we're testing here (no blow up).
373373
#
374374
before = sum(gc()[, 2])
375375
fff = function(aref) {

inst/tests/tests.Rraw

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2290,7 +2290,7 @@ test(754.04, DT[, b := a][3, b := 6L], data.table(a=INT(4,2,3),b=INT(4,2,6)))
22902290
test(754.05, DT[, a := as.numeric(a), verbose=TRUE], output="Direct plonk.*no copy")
22912291
RHS = as.integer(DT$a)
22922292
test(754.06, DT[, a:= RHS, verbose=TRUE], output="RHS for item 1 has been duplicated")
2293-
if (getRversion() >= "3.5.0") { # TODO(R>=3.5.0): test unconditionally
2293+
if (base::getRversion() >= "3.5.0") { # TODO(R>=3.5.0): test unconditionally
22942294
# Expand ALTREPS in assign.c, #5400
22952295
# String conversion gets deferred
22962296
## first, a regression test of R itself -- we want to make sure our own test continues to be useful & testing its intended purpose
@@ -5468,7 +5468,7 @@ test(1333.2, fread('A,B\nfoo,1\n"Analyst\\" ,2\nbar,3', strip.white=FALSE), data
54685468
test(1334, fread('A,B\nfoo,1\n"Analyst\\" ,",2\nbar,3'), data.table(A=c('foo', 'Analyst\\" ,', 'bar'), B=1:3))
54695469
test(1335, fread('A,B\nfoo,1\n"Analyst\\\\",2\nbar,3'), data.table(A=c('foo','Analyst\\\\','bar'), B=1:3))
54705470

5471-
# data from 12GB file in comments on http://stackoverflow.com/a/23858323/403310 ...
5471+
# data from 12GiB file in comments on http://stackoverflow.com/a/23858323/403310 ...
54725472
# note that read.csv gets this wrong and puts jacoleman high school into the previous field, then fills the rest of the line silently.
54735473
cat('A,B,C,D,E,F
54745474
"12",0,"teacher private nfp\\\\\\\\"",""jacoleman high school","",""
@@ -9770,7 +9770,7 @@ test(1640.2, x[y, c(.SD, .(x.aa=x.aa)), on=c(aa="bb")], data.table(aa=3:5, cc=c(
97709770
nq_fun = function(n=100L) {
97719771
i1 = sample(sample.int(n, 10L), n, TRUE)
97729772
i2 = sample.int(n, n, TRUE) - as.integer(n/2) # this used to be type numeric before #5517 which didn't seem intentional
9773-
i3 = sample.int(2e6, n, TRUE) - as.integer(1e6) # used to sample from -1e6:1e6 which if allocated would be 8MB, #5517
9773+
i3 = sample.int(2e6, n, TRUE) - as.integer(1e6) # used to sample from -1e6:1e6 which if allocated would be 8MiB, #5517
97749774
i4 = sample(c(NA_integer_, sample.int(n*2L, 10L, FALSE)-n), n, TRUE)
97759775

97769776
d1 = sample(rnorm(10L), n, TRUE)
@@ -9861,7 +9861,7 @@ y = na.omit(dt2)
98619861

98629862
if (.Machine$sizeof.pointer>4) {
98639863

9864-
# temporarily off due to hitting 2GB limit on 32bit, #2767
9864+
# temporarily off due to hitting 2GiB limit on 32bit, #2767
98659865
# turn off temporarily using FALSE when using valgrind, too, as very slow
98669866

98679867
set.seed(1509611616L)
@@ -11964,7 +11964,7 @@ test(1800.2, fread("A\n1e55555555\n-1e+234056\n2e-59745"), data.table(A=c("1e555
1196411964
#
1196511965
# Tests thanks to Pasha copied verbatim from his PR#2200
1196611966
#
11967-
# Test files with "round" sizes (different multiples of 2, from 512B to 64KB)
11967+
# Test files with "round" sizes (different multiples of 2, from 512B to 64KiB)
1196811968
for (mul in c(16, 128, 512, 1024, 2048)) {
1196911969
ff = file(f<-tempfile(), open="wb")
1197011970
cat(strrep("1234,5678,9012,3456,7890,abcd,4\x0A", mul), file=ff)
@@ -12943,7 +12943,7 @@ test(1903.2, fread(",A,B\n1,0,1\n2,0,1\n3,1,1\n", logical01=TRUE), data.table(V1
1294312943
txt = 'A, B, C\n17, 34, 2.3\n3., NA, 1\nNA , 2, NA \n0,0.1,0'
1294412944
test(1904.1, fread(txt, na.strings="NA", verbose=TRUE),
1294512945
ans <- data.table(A=c(17,3,NA,0), B=c(34,NA,2,0.1), C=c(2.3,1.0,NA,0.0)),
12946-
output = c("Number of sampling jump points = 1 because.*Reading 1 chunks \\(0 swept\\) of 1.000MB \\(each chunk 4 rows\\) using 1 thread.*Rereading 0 columns"))
12946+
output = c("Number of sampling jump points = 1 because.*Reading 1 chunks \\(0 swept\\) of 1.000MiB \\(each chunk 4 rows\\) using 1 thread.*Rereading 0 columns"))
1294712947
test(1904.2, fread(txt, na.strings=c("NA", " ")), ans, warning='na.strings\\[2\\]==" " consists only of whitespace, ignoring. Since strip.white=TRUE.*use.*"".*<NA>')
1294812948
test(1904.3, fread(txt, na.strings=c("NA", "")), ans)
1294912949
test(1904.4, fread(txt, na.strings=c("NA", "", " ")), ans, warning='na.strings\\[3\\]==" ".*only.*whitespace.*will already be read as <NA>')
@@ -17973,7 +17973,7 @@ DT = data.table(x = sample(letters[1:5], 20, TRUE),
1797317973
c = sample(c(0+3i,1,-1-1i,NA), 20, TRUE),
1797417974
l = sample(c(TRUE, FALSE, NA), 20, TRUE),
1797517975
r = as.raw(sample(1:5, 20, TRUE)))
17976-
load(testDir("test2224.Rdata")) # 47KB array 24x8 where each cell contains a length-20 result
17976+
load(testDir("test2224.Rdata")) # 47KiB array 24x8 where each cell contains a length-20 result
1797717977
if (test_bit64) {
1797817978
DT[, i64:=as.integer64(sample(c(-2L,0L,2L,NA), 20, TRUE))]
1797917979
} else {
@@ -17984,7 +17984,7 @@ for (col in names(DT)[-1]) {
1798417984
for (n in list(1, 5, -1, -5, c(1,2), c(-1,1))) {
1798517985
for (type in c('lag','lead','shift','cyclic')) {
1798617986
# fill is tested by group in tests 2218.*; see comments in #5205
17987-
# sapply(sapply()) changed to for(for(for())) to save 29MB, #5517
17987+
# sapply(sapply()) changed to for(for(for())) to save 29MiB, #5517
1798817988
test(2224.1+i/10000, # 192 tests here when test_bit64=TRUE; 168 when FALSE
1798917989
EVAL(sprintf("DT[, shift(%s, %d, type='%s'), by=x]$V1", col, n, type)),
1799017990
ans[[i]])

man/datatable-optimize.Rd

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ old = options(datatable.optimize = Inf)
110110
set.seed(1L)
111111
DT = lapply(1:20, function(x) sample(c(-100:100), 5e6L, TRUE))
112112
setDT(DT)[, id := sample(1e5, 5e6, TRUE)]
113-
print(object.size(DT), units="Mb") # 400MB, not huge, but will do
113+
print(object.size(DT), units="MiB") # 400MiB, not huge, but will do
114114
115115
# 'order' optimisation
116116
options(datatable.optimize = 1L) # optimisation 'on'

0 commit comments

Comments (0)