finish

MichaelChirico · MichaelChirico · commit a5fae71810d2 · 2025-07-04T08:16:07.000-07:00
diff --git a/R/test.data.table.R b/R/test.data.table.R
@@ -277,13 +277,13 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F
     y = head(order(-diff(timings$RSS)), 10L)
     ans = timings[, diff := c(NA_real_, round(diff(RSS), 1L))][y + 1L]
     ans[, time:=NULL]  # time is distracting and influenced by gc() calls; just focus on RAM usage here
-    catf("10 largest RAM increases (MB); see plot for cumulative effect (if any)\n")
+    catf("10 largest RAM increases (MiB); see plot for cumulative effect (if any)\n")
     print(ans, class=FALSE)
     get("dev.new")(width=14.0, height=7.0)
     get("par")(mfrow=1:2)
-    get("plot")(timings$RSS, main=paste(basename(fn),"\nylim[0]=0 for context"), ylab="RSS (MB)", ylim=c(0.0, max(timings$RSS)))
+    get("plot")(timings$RSS, main=paste(basename(fn),"\nylim[0]=0 for context"), ylab="RSS (MiB)", ylim=c(0.0, max(timings$RSS)))
     get("mtext")(lastRSS<-as.integer(ceiling(last(timings$RSS))), side=4L, at=lastRSS, las=1L, font=2L)
-    get("plot")(timings$RSS, main=paste(basename(fn),"\nylim=range for inspection"), ylab="RSS (MB)")
+    get("plot")(timings$RSS, main=paste(basename(fn),"\nylim=range for inspection"), ylab="RSS (MiB)")
     get("mtext")(lastRSS, side=4L, at=lastRSS, las=1L, font=2L)
   }
 
@@ -316,7 +316,7 @@ INT = function(...) { as.integer(c(...)) }   # utility used in tests.Rraw
 
 gc_mem = function() {
   # nocov start
-  # gc reports memory in MB
+  # gc reports memory in MiB
   m = colSums(gc()[, c(2L, 4L, 6L)])
   names(m) = c("GC_used", "GC_gc_trigger", "GC_max_used")
   m
diff --git a/R/utils.R b/R/utils.R
@@ -212,10 +212,10 @@ edit.data.table = function(name, ...) {
 
 rss = function() {  #5515 #5517
   # nocov start
-  cmd = paste0("ps -o rss --no-headers ", Sys.getpid()) # ps returns KB
+  cmd = paste0("ps -o rss --no-headers ", Sys.getpid()) # ps returns KiB
   ans = tryCatch(as.numeric(system(cmd, intern=TRUE)), warning=function(w) NA_real_, error=function(e) NA_real_)
   if (length(ans)!=1L || !is.numeric(ans)) ans=NA_real_ # just in case
-  round(ans / 1024.0, 1L)  # return MB
+  round(ans / 1024.0, 1L)  # return MiB
   # nocov end
 }
 
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -9770,7 +9770,7 @@ test(1640.2, x[y, c(.SD, .(x.aa=x.aa)), on=c(aa="bb")], data.table(aa=3:5, cc=c(
 nq_fun = function(n=100L) {
   i1 = sample(sample.int(n, 10L), n, TRUE)
   i2 = sample.int(n, n, TRUE) - as.integer(n/2)    # this used to be type numeric before #5517 which didn't seem intentional
-  i3 = sample.int(2e6, n, TRUE) - as.integer(1e6)  # used to sample from -1e6:1e6 which if allocated would be 8MB, #5517
+  i3 = sample.int(2e6, n, TRUE) - as.integer(1e6)  # used to sample from -1e6:1e6 which if allocated would be 8MB (~7.6MiB), #5517
   i4 = sample(c(NA_integer_, sample.int(n*2L, 10L, FALSE)-n), n, TRUE)
 
   d1 = sample(rnorm(10L), n, TRUE)
@@ -9861,7 +9861,7 @@ y = na.omit(dt2)
 
 if (.Machine$sizeof.pointer>4) {
 
-  # temporarily off due to hitting 2GB limit on 32bit, #2767
+  # temporarily off due to hitting 2GiB limit on 32bit, #2767
   # turn off temporarily using FALSE when using valgrind, too, as very slow
 
   set.seed(1509611616L)
@@ -11964,7 +11964,7 @@ test(1800.2, fread("A\n1e55555555\n-1e+234056\n2e-59745"), data.table(A=c("1e555
 #
 # Tests thanks to Pasha copied verbatim from his PR#2200
 #
-# Test files with "round" sizes (different multiples of 2, from 512B to 64KB)
+# Test files with "round" sizes (different multiples of 2, from 512B to 64KiB)
 for (mul in c(16, 128, 512, 1024, 2048)) {
   ff = file(f<-tempfile(), open="wb")
   cat(strrep("1234,5678,9012,3456,7890,abcd,4\x0A", mul), file=ff)
@@ -12943,7 +12943,7 @@ test(1903.2, fread(",A,B\n1,0,1\n2,0,1\n3,1,1\n", logical01=TRUE), data.table(V1
 txt = 'A,   B,    C\n17,  34, 2.3\n3.,  NA,   1\nNA ,  2, NA \n0,0.1,0'
 test(1904.1, fread(txt, na.strings="NA", verbose=TRUE),
   ans <- data.table(A=c(17,3,NA,0), B=c(34,NA,2,0.1), C=c(2.3,1.0,NA,0.0)),
-  output = c("Number of sampling jump points = 1 because.*Reading 1 chunks \\(0 swept\\) of 1.000MB \\(each chunk 4 rows\\) using 1 thread.*Rereading 0 columns"))
+  output = c("Number of sampling jump points = 1 because.*Reading 1 chunks \\(0 swept\\) of 1.000MiB \\(each chunk 4 rows\\) using 1 thread.*Rereading 0 columns"))
 test(1904.2, fread(txt, na.strings=c("NA", " ")), ans, warning='na.strings\\[2\\]==" " consists only of whitespace, ignoring. Since strip.white=TRUE.*use.*"".*<NA>')
 test(1904.3, fread(txt, na.strings=c("NA", "")), ans)
 test(1904.4, fread(txt, na.strings=c("NA", "", " ")), ans, warning='na.strings\\[3\\]==" ".*only.*whitespace.*will already be read as <NA>')
@@ -17973,7 +17973,7 @@ DT = data.table(x = sample(letters[1:5], 20, TRUE),
                 c = sample(c(0+3i,1,-1-1i,NA), 20, TRUE),
                 l = sample(c(TRUE, FALSE, NA), 20, TRUE),
                 r = as.raw(sample(1:5, 20, TRUE)))
-load(testDir("test2224.Rdata")) # 47KB array 24x8 where each cell contains a length-20 result
+load(testDir("test2224.Rdata")) # 47KiB array 24x8 where each cell contains a length-20 result
 if (test_bit64) {
   DT[, i64:=as.integer64(sample(c(-2L,0L,2L,NA), 20, TRUE))]
 } else {
@@ -17984,7 +17984,7 @@ for (col in names(DT)[-1]) {
   for (n in list(1, 5, -1, -5, c(1,2), c(-1,1))) {
     for (type in c('lag','lead','shift','cyclic')) {
       # fill is tested by group in tests 2218.*; see comments in #5205
-      # sapply(sapply()) changed to for(for(for())) to save 29MB, #5517
+      # sapply(sapply()) changed to for(for(for())) to save 29MiB, #5517
       test(2224.1+i/10000,  # 192 tests here when test_bit64=TRUE; 168 when FALSE
            EVAL(sprintf("DT[, shift(%s, %d, type='%s'), by=x]$V1", col, n, type)),
            ans[[i]])
diff --git a/src/forder.c b/src/forder.c
@@ -318,7 +318,7 @@ static void range_str(const SEXP *x, int n, uint64_t *out_min, uint64_t *out_max
         savetl(s);           // afterwards. From R 2.14.0, tl is initialized to 0, prior to that it was random so this step saved too much.
       // now save unique SEXP in ustr so i) we can loop through them afterwards and reset TRUELENGTH to 0 and ii) sort uniques when sorting too
       if (ustr_alloc<=ustr_n) {
-        ustr_alloc = (ustr_alloc==0) ? 16384 : ustr_alloc*2;  // small initial guess, negligible time to alloc 128KB (32 pages)
+        ustr_alloc = (ustr_alloc==0) ? 16384 : ustr_alloc*2;  // small initial guess, negligible time to alloc 128KiB (32 pages)
         if (ustr_alloc>n) ustr_alloc = n;  // clamp at n. Reaches n when fully unique (no dups)
         ustr = realloc(ustr, sizeof(SEXP) * ustr_alloc);
         if (ustr==NULL) STOP(_("Unable to realloc %d * %d bytes in range_str"), ustr_alloc, (int)sizeof(SEXP));  // # nocov
diff --git a/src/fread.c b/src/fread.c
@@ -1420,7 +1420,7 @@ int freadMain(freadMainArgs _args) {
 
       // No MAP_POPULATE for faster nrows=10 and to make possible earlier progress bar in row count stage
       // Mac doesn't appear to support MAP_POPULATE anyway (failed on CRAN when I tried).
-      // TO DO?: MAP_HUGETLB for Linux but seems to need admin to setup first. My Hugepagesize is 2MB (>>2KB, so promising)
+      // TO DO?: MAP_HUGETLB for Linux but seems to need admin to setup first. My Hugepagesize is 2MiB (>>2KiB, so promising)
       //         https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt
       mmp = mmap(NULL, fileSize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);  // COW for last page lastEOLreplaced
       #ifdef __EMSCRIPTEN__
@@ -1901,7 +1901,7 @@ int freadMain(freadMainArgs _args) {
 
   const ptrdiff_t jump0size = firstJumpEnd - pos;  // the size in bytes of the first 100 lines from the start (jump point 0)
   // how many places in the file to jump to and test types there (the very end is added as 11th or 101th)
-  // not too many though so as not to slow down wide files; e.g. 10,000 columns.  But for such large files (50GB) it is
+  // not too many though so as not to slow down wide files; e.g. 10,000 columns.  But for such large files (50GiB) it is
   // worth spending a few extra seconds sampling 10,000 rows to decrease a chance of costly reread even further.
   nJumps = 1;
   const ptrdiff_t sz = eof - pos;
@@ -2254,10 +2254,10 @@ int freadMain(freadMainArgs _args) {
   int buffGrown = 0;
   // chunkBytes is the distance between each jump point; it decides the number of jumps
   // We may want each chunk to write to its own page of the final column, hence 1000*maxLen
-  // For the 44GB file with 12875 columns, the max line len is 108,497. We may want each chunk to write to its
+  // For the 44GiB file with 12875 columns, the max line len is 108,497. We may want each chunk to write to its
   // own page (4k) of the final column, hence 1000 rows of the smallest type (4 byte int) is just
   // under 4096 to leave space for R's header + malloc's header.
-  size_t chunkBytes = umax((uint64_t)(1000 * meanLineLen), 1ULL * 1024 * 1024/*MB*/);
+  size_t chunkBytes = umax((uint64_t)(1000 * meanLineLen), 1ULL * 1024 * 1024/*MiB*/);
   // Index of the first jump to read. May be modified if we ever need to restart
   // reading from the middle of the file.
   int jump0 = 0;
diff --git a/src/fsort.c b/src/fsort.c
@@ -93,7 +93,7 @@ int qsort_cmp(const void *a, const void *b) {
   uint64_t x = qsort_data[*(int *)a];
   uint64_t y = qsort_data[*(int *)b];
   // return x-y;  would like this, but this is long and the cast to int return may not preserve sign
-  // We have long vectors in mind (1e10(74GB), 1e11(740GB)) where extreme skew may feasibly mean the largest count
+  // We have long vectors in mind (1e10(74GiB), 1e11(740GiB)) where extreme skew may feasibly mean the largest count
   // is greater than 2^32. The first split is (currently) 16 bits so should be very rare but to be safe keep 64bit counts.
   return (x<y)-(x>y);   // largest first in a safe branchless way casting long to int
 }
@@ -233,7 +233,7 @@ SEXP fsort(SEXP x, SEXP verboseArg) {
       // This assignment to ans is not random access as it may seem, but cache efficient by
       // design since target pages are written to contiguously. MSBsize * 4k < cache.
       // TODO: therefore 16 bit MSB seems too big for this step. Time this step and reduce 16 a lot.
-      //       20MB cache / nth / 4k => MSBsize=160
+      //       20MiB cache / nth / 4k => MSBsize=160
       source++;
     }
   }
diff --git a/src/fwrite.c b/src/fwrite.c
@@ -791,7 +791,7 @@ void fwriteMain(fwriteMainArgs args)
   }
   char *buffPool = malloc(alloc_size);
   if (!buffPool) {
-    STOP(_("Unable to allocate %zu MB * %d thread buffers; '%d: %s'. Please read ?fwrite for nThread, buffMB and verbose options."), // # nocov
+    STOP(_("Unable to allocate %zu MiB * %d thread buffers; '%d: %s'. Please read ?fwrite for nThread, buffMB and verbose options."), // # nocov
          buffSize / MEGA, nth, errno, strerror(errno)); // # nocov
   }
 
diff --git a/src/fwrite.h b/src/fwrite.h
@@ -109,7 +109,7 @@ typedef struct fwriteMainArgs
                           //   iff scipen >= 3=8-5
   bool squashDateTime;
   bool append;
-  int buffMB;             // [1-1024] default 8MB
+  int buffMB;             // [1-1024] default 8MiB
   int nth;
   bool showProgress;
   bool is_gzip;
diff --git a/src/fwriteR.c b/src/fwriteR.c
@@ -163,7 +163,7 @@ SEXP fwriteR(
   SEXP logical01_Arg,      // TRUE|FALSE
   SEXP scipen_Arg,
   SEXP dateTimeAs_Arg,     // 0=ISO(yyyy-mm-dd),1=squash(yyyymmdd),2=epoch,3=write.csv
-  SEXP buffMB_Arg,         // [1-1024] default 8MB
+  SEXP buffMB_Arg,         // [1-1024] default 8MiB
   SEXP nThread_Arg,
   SEXP showProgress_Arg,
   SEXP is_gzip_Arg,
diff --git a/vignettes/datatable-keys-fast-subset.Rmd b/vignettes/datatable-keys-fast-subset.Rmd
@@ -416,10 +416,10 @@ N = 2e7L
 DT = data.table(x = sample(letters, N, TRUE),
                 y = sample(1000L, N, TRUE),
                 val = runif(N))
-print(object.size(DT), units = "Mb")
+print(object.size(DT), units = "MiB")
 ```
 
-`DT` is ~380MB. It is not really huge, but this will do to illustrate the point.
+`DT` is ~380MiB. It is not really huge, but this will do to illustrate the point.
 
 From what we have seen in the Introduction to data.table section, we can subset those rows where columns `x = "g"` and `y = 877` as follows:
 

Original file line number	Diff line number	Diff line change
`@@ -93,7 +93,7 @@ int qsort_cmp(const void *a, const void *b) {`
`93`	`93`	`uint64_t x = qsort_data[*(int *)a];`
`94`	`94`	`uint64_t y = qsort_data[*(int *)b];`
`95`	`95`	`// return x-y; would like this, but this is long and the cast to int return may not preserve sign`
`96`		`- // We have long vectors in mind (1e10(74GB), 1e11(740GB)) where extreme skew may feasibly mean the largest count`
	`96`	`+ // We have long vectors in mind (1e10(74GiB), 1e11(740GiB)) where extreme skew may feasibly mean the largest count`
`97`	`97`	`// is greater than 2^32. The first split is (currently) 16 bits so should be very rare but to be safe keep 64bit counts.`
`98`	`98`	`return (x<y)-(x>y); // largest first in a safe branchless way casting long to int`
`99`	`99`	`}`
`@@ -233,7 +233,7 @@ SEXP fsort(SEXP x, SEXP verboseArg) {`
`233`	`233`	`// This assignment to ans is not random access as it may seem, but cache efficient by`
`234`	`234`	`// design since target pages are written to contiguously. MSBsize * 4k < cache.`
`235`	`235`	`// TODO: therefore 16 bit MSB seems too big for this step. Time this step and reduce 16 a lot.`
`236`		`- // 20MB cache / nth / 4k => MSBsize=160`
	`236`	`+ // 20MiB cache / nth / 4k => MSBsize=160`
`237`	`237`	`source++;`
`238`	`238`	`}`
`239`	`239`	`}`
Original file line number	Diff line number	Diff line change
`@@ -791,7 +791,7 @@ void fwriteMain(fwriteMainArgs args)`
`791`	`791`	`}`
`792`	`792`	`char *buffPool = malloc(alloc_size);`
`793`	`793`	`if (!buffPool) {`
`794`		`- STOP(_("Unable to allocate %zu MB * %d thread buffers; '%d: %s'. Please read ?fwrite for nThread, buffMB and verbose options."), // # nocov`
	`794`	`+ STOP(_("Unable to allocate %zu MiB * %d thread buffers; '%d: %s'. Please read ?fwrite for nThread, buffMB and verbose options."), // # nocov`
`795`	`795`	`buffSize / MEGA, nth, errno, strerror(errno)); // # nocov`
`796`	`796`	`}`
`797`	`797`