Skip to content

Commit 59f966c

Browse files
implement comment.char argument for fread (#7375)
* implement comment.char argument for fread * remove handling of comment.char=NULL * update NEWS * update tests * change wording for error * remove unreachable code * add helper function * extend tests * update tests * use skip_line helper * add comments for helpers * fix test numbering * add more tests * simplify read * Revert "simplify read" This reverts commit a0a9525. * separate helpers * add comments * add coverage * simplify header handling * increase coverage * simplify code * control skipping white spaces before comments with strip.white * tighten helper * try improving readability with blank lines * include some line-end comments in the multi-line comment test * match read.table for na.strings and comment.char * add strip.white=FALSE header testcase * refactor end_of_field helper into more readable version * add example for strip.white * summarize line-skipping behavior * clean up tmp * don't introduce whitespace to string literal body --------- Co-authored-by: Michael Chirico <[email protected]>
1 parent 67129f0 commit 59f966c

File tree

8 files changed

+257
-15
lines changed

8 files changed

+257
-15
lines changed

NEWS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -291,6 +291,7 @@
291291
# user system elapsed
292292
# 0.028 0.000 0.005
293293
```
294+
20. `fread()` now supports the `comment.char` argument to skip trailing comments or comment-only lines, consistent with `read.table()`, [#856](https://github.com/Rdatatable/data.table/issues/856). The default remains `comment.char = ""` (no comment parsing) for backward compatibility and performance, in contrast to `read.table(comment.char = "#")`. Thanks to @arunsrinivasan and many others for the suggestion and @ben-schwen for the implementation.
294295

295296
### BUG FIXES
296297

R/fread.R

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ fread = function(
22
input="", file=NULL, text=NULL, cmd=NULL, sep="auto", sep2="auto", dec="auto", quote="\"", nrows=Inf, header="auto",
33
na.strings=getOption("datatable.na.strings","NA"), stringsAsFactors=FALSE, verbose=getOption("datatable.verbose",FALSE),
44
skip="__auto__", select=NULL, drop=NULL, colClasses=NULL, integer64=getOption("datatable.integer64","integer64"),
5-
col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, index=NULL,
5+
col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, comment.char="", key=NULL, index=NULL,
66
showProgress=getOption("datatable.showProgress",interactive()), data.table=getOption("datatable.fread.datatable",TRUE),
77
nThread=getDTthreads(verbose), logical01=getOption("datatable.logical01",FALSE),
88
logicalYN=getOption("datatable.logicalYN", FALSE),
@@ -30,6 +30,9 @@ yaml=FALSE, tmpdir=tempdir(), tz="UTC")
3030
isTRUEorFALSE(stringsAsFactors) || (is.double(stringsAsFactors) && length(stringsAsFactors)==1L && 0.0<=stringsAsFactors && stringsAsFactors<=1.0),
3131
is.numeric(nrows), length(nrows)==1L
3232
)
33+
if (!is.character(comment.char) || length(comment.char) != 1L || is.na(comment.char) || nchar(comment.char) > 1L) {
34+
stopf("comment.char= must be a single non-NA character.")
35+
}
3336
fill = if(identical(fill, Inf)) .Machine$integer.max else as.integer(fill)
3437
nrows=as.double(nrows) #4686
3538
if (is.na(nrows) || nrows<0L) nrows=Inf # accept -1 to mean Inf, as read.table does
@@ -289,7 +292,7 @@ yaml=FALSE, tmpdir=tempdir(), tz="UTC")
289292
if (identical(tt,"") || is_utc(tt)) # empty TZ env variable ("") means UTC in C library, unlike R; _unset_ TZ means local
290293
tz="UTC"
291294
}
292-
ans = .Call(CfreadR,input,identical(input,file),sep,dec,quote,header,nrows,skip,na.strings,strip.white,blank.lines.skip,
295+
ans = .Call(CfreadR,input,identical(input,file),sep,dec,quote,header,nrows,skip,na.strings,strip.white,blank.lines.skip,comment.char,
293296
fill,showProgress,nThread,verbose,warnings2errors,logical01,logicalYN,select,drop,colClasses,integer64,encoding,keepLeadingZeros,tz=="UTC")
294297
if (!length(ans)) return(null.data.table()) # test 1743.308 drops all columns
295298
nr = length(ans[[1L]])

inst/tests/tests.Rraw

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21688,3 +21688,130 @@ d3 = unserialize(serialize(d2, NULL))
2168821688
test(2340.05, .selfref.ok(d3), FALSE)
2168921689
setDT(d3)
2169021690
test(2340.06, .selfref.ok(d3), TRUE)
21691+
21692+
# implement comment.char argument in fread, #856
21693+
test(2341.01, fread('a,b
21694+
#a comment
21695+
1,2
21696+
#another comment
21697+
3,4', comment.char='#'), data.table(a=c(1L,3L), b=c(2L,4L)))
21698+
21699+
test(2341.02, fread('a,b #line-trailing comment
21700+
1,2', comment.char='#'), data.table(a=1L, b=2L))
21701+
21702+
test(2341.03, fread('a,b#line-trailing comment and no whitespace
21703+
1,2', comment.char='#'), data.table(a=1L, b=2L))
21704+
21705+
test(2341.04, fread('a,b
21706+
1,2 #trailing after numeric', comment.char='#'), data.table(a=1L, b=2L))
21707+
21708+
# comment char inside quotes
21709+
test(2341.05, fread('a
21710+
"#quotes#"', comment.char="#"), data.table(a="#quotes#"))
21711+
21712+
# multi line comments
21713+
test(2341.06, fread('# multi line
21714+
# comment
21715+
1,2 # line-end comment
21716+
# multi line
21717+
# comment
21718+
3,4 # line-end comment
21719+
# trailing comment', comment.char='#'), data.table(V1=c(1L,3L), V2=c(2L,4L)))
21720+
21721+
test(2341.07, fread('id;value
21722+
1;2,5! trailing comment
21723+
2;NA
21724+
!final comment', sep=';', dec=',', na.strings='NA', comment.char='!'), data.table(id=1:2, value=c(2.5, NA_real_)))
21725+
21726+
# skip
21727+
test(2341.08, fread('meta line
21728+
DATA STARTS
21729+
x,y
21730+
# skip this
21731+
1,2', skip="DATA", header=TRUE, comment.char='#'), data.table(x=1L, y=2L))
21732+
21733+
# weird comment chars like space or quote
21734+
test(2341.09, fread('a
21735+
inline comment
21736+
1', comment.char=' '), data.table(a=1L))
21737+
test(2341.10, fread('a,b
21738+
1,2" trailing"
21739+
"comment line"
21740+
3,4', comment.char='"', quote=""), data.table(a=c(1L,3L), b=c(2L,4L)))
21741+
21742+
# invalid comment chars
21743+
test(2341.11, fread('a,b
21744+
## multichar commentchar
21745+
1,2', comment.char = '##'), error = "comment.char= must be a single non-NA character")
21746+
21747+
test(2341.12, fread('a,b
21748+
NA,NA
21749+
1,2', comment.char = NA), error = "comment.char= must be a single non-NA character")
21750+
21751+
# CRLF line endings
21752+
test(2341.13, fread('a,b\r\n# cmt\r\n1,2\r\n3,4\r\n', comment.char='#'), data.table(a=c(1L,3L), b=c(2L,4L)))
21753+
21754+
# header comment
21755+
test(2341.14, fread('# hdr cmt
21756+
x,y
21757+
1,2', header=TRUE, comment.char='#'), data.table(x=1L, y=2L))
21758+
21759+
# nrows does not count comment lines
21760+
test(2341.15, fread('a,b
21761+
1,2
21762+
# cmt
21763+
3,4
21764+
5,6', nrows=2, comment.char='#'), data.table(a=c(1L,3L), b=c(2L,4L)))
21765+
21766+
# sep and comment char same
21767+
test(2341.16, fread('a#b
21768+
1#2
21769+
# only comment', sep="#", comment.char="#"), data.table(a=1L))
21770+
21771+
# na.strings
21772+
local({
21773+
txt = 'v
21774+
#NA
21775+
1
21776+
# comment'
21777+
writeLines(txt, tmp <- tempfile())
21778+
on.exit(unlink(tmp))
21779+
test(2341.170, fread(tmp, na.strings="#NA", comment.char='#'), data.table(v=1L))
21780+
test(2341.171, fread(tmp, na.strings="#NA", comment.char='#'), setDT(read.table(tmp, na.strings="#NA", comment.char='#', header=TRUE)))
21781+
})
21782+
21783+
test(2341.18, fread('a,b
21784+
"p#q",2 # tail
21785+
"r#s",3', comment.char='#'), data.table(a=c("p#q","r#s"), b=c(2L,3L)))
21786+
21787+
test(2341.19, fread(' # lead comment with padding
21788+
\t# and tab
21789+
a,b
21790+
1,2', comment.char='#'), data.table(a=1L, b=2L))
21791+
21792+
test(2341.20, fread('a,b # header cmt with padding # second cmt
21793+
1,2
21794+
3,4', comment.char='#'), data.table(a=c(1L,3L), b=c(2L,4L)))
21795+
21796+
test(2341.21, fread('# meta1 # meta2
21797+
a,b
21798+
1,2', comment.char = '#'), data.table(a=1L, b=2L))
21799+
test(2341.22, fread('a,b # inline header comment\r\n1,2\r\n', comment.char = '#'), data.table(a=1L, b=2L))
21800+
21801+
# control skipping white space before comments with strip.white
21802+
test(2341.230, fread('a
21803+
b # trailing cmnt
21804+
', comment.char = '#', strip.white = FALSE, sep = ","), data.table(a="b "))
21805+
test(2341.231, fread('a # trailing header cmnt
21806+
b
21807+
', comment.char = '#', strip.white = FALSE, sep = ","), data.table(`a `="b"))
21808+
test(2341.232, fread('a
21809+
# full line cmnt
21810+
# full line with leading ws cmnt
21811+
b
21812+
', comment.char = '#', strip.white = FALSE, sep = ","), data.table(a=c(" ", "b")))
21813+
21814+
test(2341.24, fread('a
21815+
# leading cmnt
21816+
b
21817+
', comment.char = '#', strip.white = FALSE, sep = ","), data.table(a=c(" ", "b")))

man/fread.Rd

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ skip="__auto__", select=NULL, drop=NULL, colClasses=NULL,
1717
integer64=getOption("datatable.integer64", "integer64"),
1818
col.names,
1919
check.names=FALSE, encoding="unknown",
20-
strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE,
20+
strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, comment.char="",
2121
key=NULL, index=NULL,
2222
showProgress=getOption("datatable.showProgress", interactive()),
2323
data.table=getOption("datatable.fread.datatable", TRUE),
@@ -56,6 +56,7 @@ yaml=FALSE, tmpdir=tempdir(), tz="UTC"
5656
\item{strip.white}{ Logical, default \code{TRUE}, in which case leading and trailing whitespace is stripped from unquoted \code{"character"} fields. \code{"numeric"} fields are always stripped of leading and trailing whitespace.}
5757
\item{fill}{logical or integer (default is \code{FALSE}). If \code{TRUE} then in case the rows have unequal length, number of columns is estimated and blank fields are implicitly filled. If an integer is provided it is used as an upper bound for the number of columns. If \code{fill=Inf} then the whole file is read for detecting the number of columns. }
5858
\item{blank.lines.skip}{\code{logical}, default is \code{FALSE}. If \code{TRUE} blank lines in the input are ignored.}
59+
\item{comment.char}{Character vector of length one containing a single character or an empty string. Any text after the comment character in a line is ignored, including skipping comment-only lines. Use \code{""} (the default) to turn off the interpretation of comments altogether.}
5960
\item{key}{Character vector of one or more column names which is passed to \code{\link{setkey}}. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. }
6061
\item{index}{ Character vector or list of character vectors of one or more column names which is passed to \code{\link{setindexv}}. As with \code{key}, comma-separated notation like \code{index="x,y,z"} is accepted for convenience. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. }
6162
\item{showProgress}{ \code{TRUE} displays progress on the console if the ETA is greater than 3 seconds. It is produced in fread's C code where the very nice (but R level) txtProgressBar and tkProgressBar are not easily available. }

src/data.table.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -361,7 +361,7 @@ SEXP setcharvec(SEXP, SEXP, SEXP);
361361
SEXP chmatch_R(SEXP, SEXP, SEXP);
362362
SEXP chmatchdup_R(SEXP, SEXP, SEXP);
363363
SEXP chin_R(SEXP, SEXP);
364-
SEXP freadR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
364+
SEXP freadR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
365365
SEXP fwriteR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
366366
SEXP rbindlist(SEXP, SEXP, SEXP, SEXP, SEXP);
367367
SEXP setlistelt(SEXP, SEXP, SEXP);

0 commit comments

Comments
 (0)