Commit a01ac2f

first pass
1 parent 4773ef9 commit a01ac2f

4 files changed: +17 −15 lines

man/datatable-optimize.Rd

Lines changed: 1 addition & 1 deletion
@@ -110,7 +110,7 @@ old = options(datatable.optimize = Inf)
 set.seed(1L)
 DT = lapply(1:20, function(x) sample(c(-100:100), 5e6L, TRUE))
 setDT(DT)[, id := sample(1e5, 5e6, TRUE)]
-print(object.size(DT), units="Mb") # 400MB, not huge, but will do
+print(object.size(DT), units="MiB") # 400MiB, not huge, but will do
 
 # 'order' optimisation
 options(datatable.optimize = 1L) # optimisation 'on'
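The unit change here is the point of the whole commit: MB is the SI megabyte (1000^2 bytes), while MiB is the IEC mebibyte (1024^2 bytes). These examples divide byte counts by powers of 1024, so MiB is the accurate label. A small base-R illustration (the "MiB" units value is accepted by format.object_size in recent versions of R):

    sz = 4.2e8                  # a byte count, roughly the size of DT above
    sz / 1000^2                 # 420 MB (SI, powers of 1000)
    sz / 1024^2                 # ~400.5 MiB (IEC, powers of 1024) -- what these examples report
    print(object.size(1:1e6), units="MiB")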

man/fread.Rd

Lines changed: 12 additions & 10 deletions
@@ -72,7 +72,7 @@ yaml=FALSE, tmpdir=tempdir(), tz="UTC"
 
 A sample of 10,000 rows is used for a very good estimate of column types. 100 contiguous rows are read from 100 equally spaced points throughout the file including the beginning, middle and the very end. This results in a better guess when a column changes type later in the file (e.g. blank at the beginning/only populated near the end, or 001 at the start but 0A0 later on). This very good type guess enables a single allocation of the correct type up front once for speed, memory efficiency and convenience of avoiding the need to set \code{colClasses} after an error. Even though the sample is large and jumping over the file, it is almost instant regardless of the size of the file because a lazy on-demand memory map is used. If a jump lands inside a quoted field containing newlines, each newline is tested until 5 lines are found following it with the expected number of fields. The lowest type for each column is chosen from the ordered list: \code{logical}, \code{integer}, \code{integer64}, \code{double}, \code{character}. Rarely, the file may contain data of a higher type in rows outside the sample (referred to as an out-of-sample type exception). In this event \code{fread} will \emph{automatically} reread just those columns from the beginning so that you don't have the inconvenience of having to set \code{colClasses} yourself; particularly helpful if you have a lot of columns. Such columns must be read from the beginning to correctly distinguish "00" from "000" when those have both been interpreted as integer 0 due to the sample but 00A occurs out of sample. Set \code{verbose=TRUE} to see a detailed report of the logic deployed to read your file.
 
-There is no line length limit, not even a very large one. Since we are encouraging \code{list} columns (i.e. \code{sep2}) this has the potential to encourage longer line lengths. So the approach of scanning each line into a buffer first and then rescanning that buffer is not used. There are no buffers used in \code{fread}'s C code at all. The field width limit is limited by R itself: the maximum width of a character string (currently 2^31-1 bytes, 2GB).
+There is no line length limit, not even a very large one. Since we are encouraging \code{list} columns (i.e. \code{sep2}) this has the potential to encourage longer line lengths. So the approach of scanning each line into a buffer first and then rescanning that buffer is not used. There are no buffers used in \code{fread}'s C code at all. The field width limit is limited by R itself: the maximum width of a character string (currently 2^31-1 bytes, 2GiB).
 
 The filename extension (such as .csv) is irrelevant for "auto" \code{sep} and \code{sep2}. Separator detection is entirely driven by the file contents. This can be useful when loading a set of different files which may not be named consistently, or may not have the extension .csv despite being csv. Some datasets have been collected over many years, one file per day for example. Sometimes the file name format has changed at some point in the past or even the format of the file itself. So the idea is that you can loop \code{fread} through a set of files and as long as each file is regular and delimited, \code{fread} can read them all. Whether they all stack is another matter but at least each one is read quickly without you needing to vary \code{colClasses} in \code{read.table} or \code{read.csv}.
 
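The sampling and automatic-reread behaviour described in the context above can be watched with verbose=TRUE, and content-driven separator detection means a heterogeneous batch of files can be read in one loop. A minimal sketch, using only documented fread/rbindlist arguments (all file names here are hypothetical):

    library(data.table)
    DT = fread("big.csv", verbose=TRUE)   # the log reports the sample taken, the
                                          # type guessed per column, and any
                                          # out-of-sample rereads

    # extensions don't matter: fread sniffs the separator from the contents
    files = c("day1.csv", "day2.txt", "day3.dat")      # hypothetical daily files
    DTall = rbindlist(lapply(files, fread), use.names=TRUE, fill=TRUE)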
@@ -231,8 +231,8 @@ DT[2,e:=+Inf]
 DT[3,e:=-Inf]
 
 write.table(DT,"test.csv",sep=",",row.names=FALSE,quote=FALSE)
-cat("File size (MB):", round(file.info("test.csv")$size/1024^2),"\n")
-# 50 MB (1e6 rows x 6 columns)
+cat("File size (MiB):", round(file.info("test.csv")$size/1024^2),"\n")
+# 50 MiB (1e6 rows x 6 columns)
 
 system.time(DF1 <-read.csv("test.csv",stringsAsFactors=FALSE))
 # 5.4 sec (first time in fresh R session)
@@ -257,13 +257,15 @@ l = vector("list",10)
 for (i in 1:10) l[[i]] = DT
 DTbig = rbindlist(l)
 tables()
-write.table(DTbig,"testbig.csv",sep=",",row.names=FALSE,quote=FALSE)
-# 500MB csv (10 million rows x 6 columns)
-
-system.time(DF <- read.table("testbig.csv",header=TRUE,sep=",",
-  quote="",stringsAsFactors=FALSE,comment.char="",nrows=1e7,
-  colClasses=c("integer","integer","numeric",
-  "character","numeric","integer")))
+write.table(DTbig, "testbig.csv", sep=",", row.names=FALSE, quote=FALSE)
+# ~500MiB csv (10 million rows x 6 columns)
+
+system.time({
+  DF <- read.table("testbig.csv", header=TRUE, sep=",",
+    quote="", stringsAsFactors=FALSE, comment.char="", nrows=1e7,
+    colClasses=c("integer", "integer", "numeric",
+      "character", "numeric", "integer"))
+})
 # 17.0 sec (varies)
 
 system.time(DT <- fread("testbig.csv"))
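A note on the system.time({...}) form introduced above: system.time() lazily evaluates its expression in the calling frame, so DF is still defined afterwards in both the old single-expression form and the new braced form; the braces only make a multi-line call easier to read and indent. For example:

    system.time(x <- sum(runif(1e6)))   # single expression
    system.time({                       # braced block, same semantics
      y <- runif(1e6)
      z <- sum(y)
    })
    exists("z")                         # TRUE: assignments inside the block persist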

man/fwrite.Rd

Lines changed: 3 additions & 3 deletions
@@ -53,7 +53,7 @@ fwrite(x, file = "", append = FALSE, quote = "auto",
 This option applies to vectors of date/time in list column cells, too. \cr \cr
 A fully flexible format string (such as \code{"\%m/\%d/\%Y"}) is not supported. This is to encourage use of ISO standards and because that flexibility is not known how to make fast at C level. We may be able to support one or two more specific options if required.
 }
-\item{buffMB}{The buffer size (MB) per thread in the range 1 to 1024, default 8MB. Experiment to see what works best for your data on your hardware.}
+\item{buffMB}{The buffer size (MiB) per thread in the range 1 to 1024, default 8MiB. Experiment to see what works best for your data on your hardware.}
 \item{nThread}{The number of threads to use. Experiment to see what works best for your data on your hardware.}
 \item{showProgress}{ Display a progress meter on the console? Ignored when \code{file==""}. }
 \item{compress}{If \code{compress = "auto"} and if \code{file} ends in \code{.gz} then output format is gzipped csv else csv. If \code{compress = "none"}, output format is always csv. If \code{compress = "gzip"} then format is gzipped csv. Output to the console is never gzipped even if \code{compress = "gzip"}. By default, \code{compress = "auto"}.}
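The buffMB, nThread and compress arguments documented above can be combined when tuning fwrite; a small sketch (the values are illustrative starting points, not recommendations):

    library(data.table)
    DT = data.table(a=1:1e6, b=rnorm(1e6))
    f1 = tempfile(fileext=".csv")
    system.time(fwrite(DT, f1, buffMB=64, nThread=2))  # larger per-thread buffer,
                                                       # explicit thread count
    f2 = tempfile(fileext=".csv.gz")
    fwrite(DT, f2)   # compress="auto" (the default): .gz extension => gzipped csv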
@@ -115,7 +115,7 @@ fwrite(DT, sep="|", sep2=c("{",",","}"))
 
 set.seed(1)
 DT = as.data.table( lapply(1:10, sample,
-  x=as.numeric(1:5e7), size=5e6)) # 382MB
+  x=as.numeric(1:5e7), size=5e6)) # 382MiB
 system.time(fwrite(DT, "/dev/shm/tmp1.csv")) # 0.8s
 system.time(write.csv(DT, "/dev/shm/tmp2.csv", # 60.6s
   quote=FALSE, row.names=FALSE))
@@ -135,7 +135,7 @@ DT = data.table(
   str6=sample(c("M","F"),N,TRUE),
   int1=sample(ceiling(rexp(1e6)), N, replace=TRUE),
   int2=sample(N,N,replace=TRUE)-N/2
-) # 774MB
+) # 775MiB
 system.time(fwrite(DT,"/dev/shm/tmp1.csv")) # 1.1s
 system.time(write.csv(DT,"/dev/shm/tmp2.csv", # 63.2s
   row.names=FALSE, quote=FALSE))
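The /dev/shm paths in these benchmarks point at a Linux RAM-backed tmpfs, chosen so disk speed doesn't dominate the comparison. A portable variant of the same timing might fall back to a regular temp file:

    out = if (dir.exists("/dev/shm")) "/dev/shm/tmp1.csv" else tempfile(fileext=".csv")
    system.time(fwrite(DT, out))
    unlink(out)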

man/transform.data.table.Rd

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@ DT[,`:=`(b = rev(b),
   a = NULL)]
 identical(DT,DT2)
 
-DT$d = ave(DT$b, DT$c, FUN=max) # copies entire DT, even if it is 10GB in RAM
+DT$d = ave(DT$b, DT$c, FUN=max) # copies entire DT, even if it is 10GiB in RAM
 DT = DT[, transform(.SD, d=max(b)), by="c"] # same, but even worse as .SD is copied for each group
 DT[, d:=max(b), by="c"] # same result, but much faster, shorter and scales
 
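The copy-versus-reference contrast in the comments above can be checked with data.table's address() helper, which reports where an object lives in RAM; a minimal sketch:

    library(data.table)
    DT = data.table(b=1:6, c=rep(1:2, each=3))
    address(DT)                     # note the address
    DT[, d := max(b), by="c"]       # := adds the column by reference
    address(DT)                     # same address: no copy was made
    DT$e = ave(DT$b, DT$c, FUN=max)
    address(DT)                     # different address: $<- copied the whole table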
