Skip to content

Commit 622de2b

Browse files
committed
Merge branch 'master' into warn_encodings
2 parents e35d5ad + 59f966c commit 622de2b

File tree

8 files changed

+260
-18
lines changed

8 files changed

+260
-18
lines changed

NEWS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -293,6 +293,7 @@
293293
# user system elapsed
294294
# 0.028 0.000 0.005
295295
```
296+
20. `fread()` now supports the `comment.char` argument to skip trailing comments or comment-only lines, consistent with `read.table()`, [#856](https://github.com/Rdatatable/data.table/issues/856). The default remains `comment.char = ""` (no comment parsing) for backward compatibility and performance, in contrast to `read.table(comment.char = "#")`. Thanks to @arunsrinivasan and many others for the suggestion and @ben-schwen for the implementation.
296297

297298
### BUG FIXES
298299

R/fread.R

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ fread = function(
22
input="", file=NULL, text=NULL, cmd=NULL, sep="auto", sep2="auto", dec="auto", quote="\"", nrows=Inf, header="auto",
33
na.strings=getOption("datatable.na.strings","NA"), stringsAsFactors=FALSE, verbose=getOption("datatable.verbose",FALSE),
44
skip="__auto__", select=NULL, drop=NULL, colClasses=NULL, integer64=getOption("datatable.integer64","integer64"),
5-
col.names, check.names=FALSE, encoding="UTF-8", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, index=NULL,
5+
col.names, check.names=FALSE, encoding="UTF-8", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, comment.char="", key=NULL, index=NULL,
66
showProgress=getOption("datatable.showProgress",interactive()), data.table=getOption("datatable.fread.datatable",TRUE),
77
nThread=getDTthreads(verbose), logical01=getOption("datatable.logical01",FALSE),
88
logicalYN=getOption("datatable.logicalYN", FALSE),
@@ -30,6 +30,9 @@ yaml=FALSE, tmpdir=tempdir(), tz="UTC")
3030
isTRUEorFALSE(stringsAsFactors) || (is.double(stringsAsFactors) && length(stringsAsFactors)==1L && 0.0<=stringsAsFactors && stringsAsFactors<=1.0),
3131
is.numeric(nrows), length(nrows)==1L
3232
)
33+
if (!is.character(comment.char) || length(comment.char) != 1L || is.na(comment.char) || nchar(comment.char) > 1L) {
34+
stopf("comment.char= must be a single non-NA character.")
35+
}
3336
fill = if(identical(fill, Inf)) .Machine$integer.max else as.integer(fill)
3437
nrows=as.double(nrows) #4686
3538
if (is.na(nrows) || nrows<0L) nrows=Inf # accept -1 to mean Inf, as read.table does
@@ -289,7 +292,7 @@ yaml=FALSE, tmpdir=tempdir(), tz="UTC")
289292
if (identical(tt,"") || is_utc(tt)) # empty TZ env variable ("") means UTC in C library, unlike R; _unset_ TZ means local
290293
tz="UTC"
291294
}
292-
ans = .Call(CfreadR,input,identical(input,file),sep,dec,quote,header,nrows,skip,na.strings,strip.white,blank.lines.skip,
295+
ans = .Call(CfreadR,input,identical(input,file),sep,dec,quote,header,nrows,skip,na.strings,strip.white,blank.lines.skip,comment.char,
293296
fill,showProgress,nThread,verbose,warnings2errors,logical01,logicalYN,select,drop,colClasses,integer64,encoding,keepLeadingZeros,tz=="UTC")
294297
if (!length(ans)) return(null.data.table()) # test 1743.308 drops all columns
295298
nr = length(ans[[1L]])

inst/tests/tests.Rraw

Lines changed: 130 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21687,8 +21687,135 @@ test(2340.05, .selfref.ok(d3), FALSE)
2168721687
setDT(d3)
2168821688
test(2340.06, .selfref.ok(d3), TRUE)
2168921689

21690+
# implement comment.char argument in fread, #856
21691+
test(2341.01, fread('a,b
21692+
#a comment
21693+
1,2
21694+
#another comment
21695+
3,4', comment.char='#'), data.table(a=c(1L,3L), b=c(2L,4L)))
21696+
21697+
test(2341.02, fread('a,b #line-trailing comment
21698+
1,2', comment.char='#'), data.table(a=1L, b=2L))
21699+
21700+
test(2341.03, fread('a,b#line-trailing comment and no whitespace
21701+
1,2', comment.char='#'), data.table(a=1L, b=2L))
21702+
21703+
test(2341.04, fread('a,b
21704+
1,2 #trailing after numeric', comment.char='#'), data.table(a=1L, b=2L))
21705+
21706+
# comment char inside quotes
21707+
test(2341.05, fread('a
21708+
"#quotes#"', comment.char="#"), data.table(a="#quotes#"))
21709+
21710+
# multi line comments
21711+
test(2341.06, fread('# multi line
21712+
# comment
21713+
1,2 # line-end comment
21714+
# multi line
21715+
# comment
21716+
3,4 # line-end comment
21717+
# trailing comment', comment.char='#'), data.table(V1=c(1L,3L), V2=c(2L,4L)))
21718+
21719+
test(2341.07, fread('id;value
21720+
1;2,5! trailing comment
21721+
2;NA
21722+
!final comment', sep=';', dec=',', na.strings='NA', comment.char='!'), data.table(id=1:2, value=c(2.5, NA_real_)))
21723+
21724+
# skip
21725+
test(2341.08, fread('meta line
21726+
DATA STARTS
21727+
x,y
21728+
# skip this
21729+
1,2', skip="DATA", header=TRUE, comment.char='#'), data.table(x=1L, y=2L))
21730+
21731+
# weird comment chars like space or quote
21732+
test(2341.09, fread('a
21733+
inline comment
21734+
1', comment.char=' '), data.table(a=1L))
21735+
test(2341.10, fread('a,b
21736+
1,2" trailing"
21737+
"comment line"
21738+
3,4', comment.char='"', quote=""), data.table(a=c(1L,3L), b=c(2L,4L)))
21739+
21740+
# invalid comment chars
21741+
test(2341.11, fread('a,b
21742+
## multichar commentchar
21743+
1,2', comment.char = '##'), error = "comment.char= must be a single non-NA character")
21744+
21745+
test(2341.12, fread('a,b
21746+
NA,NA
21747+
1,2', comment.char = NA), error = "comment.char= must be a single non-NA character")
21748+
21749+
# CRLF line endings
21750+
test(2341.13, fread('a,b\r\n# cmt\r\n1,2\r\n3,4\r\n', comment.char='#'), data.table(a=c(1L,3L), b=c(2L,4L)))
21751+
21752+
# header comment
21753+
test(2341.14, fread('# hdr cmt
21754+
x,y
21755+
1,2', header=TRUE, comment.char='#'), data.table(x=1L, y=2L))
21756+
21757+
# nrows counts data rows only; comment lines are not counted
21758+
test(2341.15, fread('a,b
21759+
1,2
21760+
# cmt
21761+
3,4
21762+
5,6', nrows=2, comment.char='#'), data.table(a=c(1L,3L), b=c(2L,4L)))
21763+
21764+
# sep and comment char same
21765+
test(2341.16, fread('a#b
21766+
1#2
21767+
# only comment', sep="#", comment.char="#"), data.table(a=1L))
21768+
21769+
# na.strings
21770+
local({
21771+
txt = 'v
21772+
#NA
21773+
1
21774+
# comment'
21775+
writeLines(txt, tmp <- tempfile())
21776+
on.exit(unlink(tmp))
21777+
test(2341.170, fread(tmp, na.strings="#NA", comment.char='#'), data.table(v=1L))
21778+
test(2341.171, fread(tmp, na.strings="#NA", comment.char='#'), setDT(read.table(tmp, na.strings="#NA", comment.char='#', header=TRUE)))
21779+
})
21780+
21781+
test(2341.18, fread('a,b
21782+
"p#q",2 # tail
21783+
"r#s",3', comment.char='#'), data.table(a=c("p#q","r#s"), b=c(2L,3L)))
21784+
21785+
test(2341.19, fread(' # lead comment with padding
21786+
\t# and tab
21787+
a,b
21788+
1,2', comment.char='#'), data.table(a=1L, b=2L))
21789+
21790+
test(2341.20, fread('a,b # header cmt with padding # second cmt
21791+
1,2
21792+
3,4', comment.char='#'), data.table(a=c(1L,3L), b=c(2L,4L)))
21793+
21794+
test(2341.21, fread('# meta1 # meta2
21795+
a,b
21796+
1,2', comment.char = '#'), data.table(a=1L, b=2L))
21797+
test(2341.22, fread('a,b # inline header comment\r\n1,2\r\n', comment.char = '#'), data.table(a=1L, b=2L))
21798+
21799+
# control skipping white space before comments with strip.white
21800+
test(2341.230, fread('a
21801+
b # trailing cmnt
21802+
', comment.char = '#', strip.white = FALSE, sep = ","), data.table(a="b "))
21803+
test(2341.231, fread('a # trailing header cmnt
21804+
b
21805+
', comment.char = '#', strip.white = FALSE, sep = ","), data.table(`a `="b"))
21806+
test(2341.232, fread('a
21807+
# full line cmnt
21808+
# full line with leading ws cmnt
21809+
b
21810+
', comment.char = '#', strip.white = FALSE, sep = ","), data.table(a=c(" ", "b")))
21811+
21812+
test(2341.24, fread('a
21813+
# leading cmnt
21814+
b
21815+
', comment.char = '#', strip.white = FALSE, sep = ","), data.table(a=c(" ", "b")))
21816+
2169021817
# warn about different encodings in unique and duplicated, #469
2169121818
dt = data.table(x=c(iconv("\u00E9","UTF-8","latin1"), "\u00E9"))
21692-
test(2341.1, unique(dt), data.table(x="\u00E9"), warning="Mixed encodings.*")
21693-
test(2341.2, duplicated(dt), c(FALSE, TRUE), warning="Mixed encodings.*")
21694-
test(2341.3, unique(dt[c(2L,2L)]), data.table(x="\u00E9"))
21819+
test(2342.1, unique(dt), data.table(x="\u00E9"), warning="Mixed encodings.*")
21820+
test(2342.2, duplicated(dt), c(FALSE, TRUE), warning="Mixed encodings.*")
21821+
test(2342.3, unique(dt[c(2L,2L)]), data.table(x="\u00E9"))

man/fread.Rd

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ skip="__auto__", select=NULL, drop=NULL, colClasses=NULL,
1717
integer64=getOption("datatable.integer64", "integer64"),
1818
col.names,
1919
check.names=FALSE, encoding="unknown",
20-
strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE,
20+
strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, comment.char="",
2121
key=NULL, index=NULL,
2222
showProgress=getOption("datatable.showProgress", interactive()),
2323
data.table=getOption("datatable.fread.datatable", TRUE),
@@ -56,6 +56,7 @@ yaml=FALSE, tmpdir=tempdir(), tz="UTC"
5656
\item{strip.white}{ Logical, default \code{TRUE}, in which case leading and trailing whitespace is stripped from unquoted \code{"character"} fields. \code{"numeric"} fields are always stripped of leading and trailing whitespace.}
5757
\item{fill}{logical or integer (default is \code{FALSE}). If \code{TRUE} then in case the rows have unequal length, number of columns is estimated and blank fields are implicitly filled. If an integer is provided it is used as an upper bound for the number of columns. If \code{fill=Inf} then the whole file is read for detecting the number of columns. }
5858
\item{blank.lines.skip}{\code{logical}, default is \code{FALSE}. If \code{TRUE} blank lines in the input are ignored.}
59+
\item{comment.char}{Character vector of length one containing a single character or an empty string. Any text after the comment character on a line is ignored, and comment-only lines are skipped entirely. Use \code{""} (the default) to turn off the interpretation of comments altogether.}
5960
\item{key}{Character vector of one or more column names which is passed to \code{\link{setkey}}. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. }
6061
\item{index}{ Character vector or list of character vectors of one or more column names which is passed to \code{\link{setindexv}}. As with \code{key}, comma-separated notation like \code{index="x,y,z"} is accepted for convenience. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. }
6162
\item{showProgress}{ \code{TRUE} displays progress on the console if the ETA is greater than 3 seconds. It is produced in fread's C code where the very nice (but R level) txtProgressBar and tkProgressBar are not easily available. }

src/data.table.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -361,7 +361,7 @@ SEXP setcharvec(SEXP, SEXP, SEXP);
361361
SEXP chmatch_R(SEXP, SEXP, SEXP);
362362
SEXP chmatchdup_R(SEXP, SEXP, SEXP);
363363
SEXP chin_R(SEXP, SEXP);
364-
SEXP freadR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
364+
SEXP freadR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
365365
SEXP fwriteR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
366366
SEXP rbindlist(SEXP, SEXP, SEXP, SEXP, SEXP);
367367
SEXP setlistelt(SEXP, SEXP, SEXP);

0 commit comments

Comments
 (0)