Skip to content
Merged
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
74bf37e
implement comment.char argument for fread
ben-schwen Oct 17, 2025
6f34ff2
remove handling of comment.char=NULL
ben-schwen Oct 18, 2025
2d4ae58
update NEWS
ben-schwen Oct 18, 2025
1d2f11a
update tests
ben-schwen Oct 18, 2025
a9851ef
change wording for error
ben-schwen Oct 18, 2025
9b05759
remove unreachable code
ben-schwen Oct 18, 2025
86b3b14
add helper function
ben-schwen Oct 18, 2025
5024949
extend tests
ben-schwen Oct 18, 2025
9d18827
update tests
ben-schwen Oct 18, 2025
efd8797
use skip_line helper
ben-schwen Oct 18, 2025
87ba151
add comments for helpers
ben-schwen Oct 18, 2025
b457e4a
fix test numbering
ben-schwen Oct 18, 2025
09b4932
add more tests
ben-schwen Oct 18, 2025
a0a9525
simplify read
ben-schwen Oct 18, 2025
79508a3
Revert "simplify read"
ben-schwen Oct 18, 2025
e5bbe96
separate helpers
ben-schwen Oct 18, 2025
cc57d27
add comments
ben-schwen Oct 18, 2025
3e73565
add coverage
ben-schwen Oct 18, 2025
e77137e
simplify header handling
ben-schwen Oct 19, 2025
09f9359
increase coverage
ben-schwen Oct 19, 2025
29bdb3d
simplify code
ben-schwen Oct 19, 2025
a25dd37
control skipping white spaces before comments with strip.white
ben-schwen Oct 19, 2025
01edb9a
tighten helper
ben-schwen Oct 19, 2025
21769cf
Merge branch 'master' into fread_commentChar
MichaelChirico Oct 20, 2025
f61989a
try improving readability with blank lines
MichaelChirico Oct 20, 2025
9416006
include some line-end comments in the multi-line comment test
MichaelChirico Oct 20, 2025
fe2b74b
match read.table for na.strings and comment.char
ben-schwen Oct 20, 2025
9288dd1
add strip.white=FALSE header testcase
ben-schwen Oct 20, 2025
73590c4
refactor end_of_field helper into more readable version
ben-schwen Oct 20, 2025
f925ce4
add example for strip.white
ben-schwen Oct 20, 2025
c77332d
summarize line-skipping behavior
MichaelChirico Oct 20, 2025
d73651d
clean up tmp
MichaelChirico Oct 20, 2025
ba0c68a
don't introduce whitespace to string literal body
MichaelChirico Oct 20, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,7 @@
# user system elapsed
# 0.028 0.000 0.005
```
20. `fread()` now supports the `comment.char` argument to skip trailing comments or comment-only lines, consistent with `read.table()`, [#856](https://github.com/Rdatatable/data.table/issues/856). The default remains `comment.char = ""` (no comment parsing) for backward compatibility and performance, in contrast to `read.table(comment.char = "#")`. Thanks to @arunsrinivasan and many others for the suggestion and @ben-schwen for the implementation.

### BUG FIXES

Expand Down
7 changes: 5 additions & 2 deletions R/fread.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ fread = function(
input="", file=NULL, text=NULL, cmd=NULL, sep="auto", sep2="auto", dec="auto", quote="\"", nrows=Inf, header="auto",
na.strings=getOption("datatable.na.strings","NA"), stringsAsFactors=FALSE, verbose=getOption("datatable.verbose",FALSE),
skip="__auto__", select=NULL, drop=NULL, colClasses=NULL, integer64=getOption("datatable.integer64","integer64"),
col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, index=NULL,
col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, comment.char="", key=NULL, index=NULL,
showProgress=getOption("datatable.showProgress",interactive()), data.table=getOption("datatable.fread.datatable",TRUE),
nThread=getDTthreads(verbose), logical01=getOption("datatable.logical01",FALSE),
logicalYN=getOption("datatable.logicalYN", FALSE),
Expand Down Expand Up @@ -30,6 +30,9 @@ yaml=FALSE, tmpdir=tempdir(), tz="UTC")
isTRUEorFALSE(stringsAsFactors) || (is.double(stringsAsFactors) && length(stringsAsFactors)==1L && 0.0<=stringsAsFactors && stringsAsFactors<=1.0),
is.numeric(nrows), length(nrows)==1L
)
if (!is.character(comment.char) || length(comment.char) != 1L || is.na(comment.char) || nchar(comment.char) > 1L) {
stopf("comment.char= must be a single non-NA character.")
}
fill = if(identical(fill, Inf)) .Machine$integer.max else as.integer(fill)
nrows=as.double(nrows) #4686
if (is.na(nrows) || nrows<0L) nrows=Inf # accept -1 to mean Inf, as read.table does
Expand Down Expand Up @@ -289,7 +292,7 @@ yaml=FALSE, tmpdir=tempdir(), tz="UTC")
if (identical(tt,"") || is_utc(tt)) # empty TZ env variable ("") means UTC in C library, unlike R; _unset_ TZ means local
tz="UTC"
}
ans = .Call(CfreadR,input,identical(input,file),sep,dec,quote,header,nrows,skip,na.strings,strip.white,blank.lines.skip,
ans = .Call(CfreadR,input,identical(input,file),sep,dec,quote,header,nrows,skip,na.strings,strip.white,blank.lines.skip,comment.char,
fill,showProgress,nThread,verbose,warnings2errors,logical01,logicalYN,select,drop,colClasses,integer64,encoding,keepLeadingZeros,tz=="UTC")
if (!length(ans)) return(null.data.table()) # test 1743.308 drops all columns
nr = length(ans[[1L]])
Expand Down
113 changes: 113 additions & 0 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -21688,3 +21688,116 @@ d3 = unserialize(serialize(d2, NULL))
test(2340.05, .selfref.ok(d3), FALSE)
setDT(d3)
test(2340.06, .selfref.ok(d3), TRUE)

# implement comment.char argument in fread, #856
test(2341.01, fread('a,b
#a comment
1,2
#another comment
3,4', comment.char='#'), data.table(a=c(1L,3L), b=c(2L,4L)))

test(2341.02, fread('a,b #line-trailing comment
1,2', comment.char='#'), data.table(a=1L, b=2L))

test(2341.03, fread('a,b#line-trailing comment and no whitespace
1,2', comment.char='#'), data.table(a=1L, b=2L))

test(2341.04, fread('a,b
1,2 #trailing after numeric', comment.char='#'), data.table(a=1L, b=2L))

# comment char inside quotes
test(2341.05, fread('a
"#quotes#"', comment.char="#"), data.table(a="#quotes#"))

# multi line comments
test(2341.06, fread('# multi line
# comment
1,2 # line-end comment
# multi line
# comment
3,4 # line-end comment
# trailing comment', comment.char='#'), data.table(V1=c(1L,3L), V2=c(2L,4L)))

test(2341.07, fread('id;value
1;2,5! trailing comment
2;NA
!final comment', sep=';', dec=',', na.strings='NA', comment.char='!'), data.table(id=1:2, value=c(2.5, NA_real_)))

# skip
test(2341.08, fread('meta line
DATA STARTS
x,y
# skip this
1,2', skip="DATA", header=TRUE, comment.char='#'), data.table(x=1L, y=2L))

# weird comment chars like space or quote
test(2341.09, fread('a
inline comment
1', comment.char=' '), data.table(a=1L))
test(2341.10, fread('a,b
1,2" trailing"
"comment line"
3,4', comment.char='"', quote=""), data.table(a=c(1L,3L), b=c(2L,4L)))

# invalid comment chars
test(2341.11, fread('a,b
## multichar commentchar
1,2', comment.char = '##'), error = "comment.char= must be a single non-NA character")

test(2341.12, fread('a,b
NA,NA
1,2', comment.char = NA), error = "comment.char= must be a single non-NA character")

# CRLF line endings
test(2341.13, fread('a,b\r\n# cmt\r\n1,2\r\n3,4\r\n', comment.char='#'), data.table(a=c(1L,3L), b=c(2L,4L)))

# header comment
test(2341.14, fread('# hdr cmt
x,y
1,2', header=TRUE, comment.char='#'), data.table(x=1L, y=2L))

# nrows does not count comment-only lines
test(2341.15, fread('a,b
1,2
# cmt
3,4
5,6', nrows=2, comment.char='#'), data.table(a=c(1L,3L), b=c(2L,4L)))

# sep and comment char same
test(2341.16, fread('a#b
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hm that's interesting. I don't know what I'd expect out of reading this file. E.g. data.table(a=1L, b=2L) also seems like a possible answer, and even data.table(a=c(1L, NA), b=c("2", "only comment")) could be justified too. Maybe let's just error if comment.char==sep?

1#2
# only comment', sep="#", comment.char="#"), data.table(a=1L))

# na.strings
test(2341.17, fread('v
#NA
1
# comment', na.strings="#NA", comment.char='#'), data.table(v=1L))

test(2341.18, fread('a,b
"p#q",2 # tail
"r#s",3', comment.char='#'), data.table(a=c("p#q","r#s"), b=c(2L,3L)))

test(2341.19, fread(' # lead comment with padding
\t# and tab
a,b
1,2', comment.char='#'), data.table(a=1L, b=2L))

test(2341.20, fread('a,b # header cmt with padding # second cmt
1,2
3,4', comment.char='#'), data.table(a=c(1L,3L), b=c(2L,4L)))

test(2341.21, fread('# meta1 # meta2
a,b
1,2', comment.char = '#'), data.table(a=1L, b=2L))
test(2341.22, fread('a,b # inline header comment\r\n1,2\r\n', comment.char = '#'), data.table(a=1L, b=2L))

# control skipping white space before comments with strip.white
test(2341.23, fread('a
b # trailing cmnt
', comment.char = '#', strip.white = FALSE, sep = ","), data.table(a="b "))

test(2341.24, fread('a
# leading cmnt
b
', comment.char = '#', strip.white = FALSE, sep = ","), data.table(a=c(" ", "b")))
3 changes: 2 additions & 1 deletion man/fread.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ skip="__auto__", select=NULL, drop=NULL, colClasses=NULL,
integer64=getOption("datatable.integer64", "integer64"),
col.names,
check.names=FALSE, encoding="unknown",
strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE,
strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, comment.char="",
key=NULL, index=NULL,
showProgress=getOption("datatable.showProgress", interactive()),
data.table=getOption("datatable.fread.datatable", TRUE),
Expand Down Expand Up @@ -56,6 +56,7 @@ yaml=FALSE, tmpdir=tempdir(), tz="UTC"
\item{strip.white}{ Logical, default \code{TRUE}, in which case leading and trailing whitespace is stripped from unquoted \code{"character"} fields. \code{"numeric"} fields are always stripped of leading and trailing whitespace.}
\item{fill}{logical or integer (default is \code{FALSE}). If \code{TRUE} then in case the rows have unequal length, number of columns is estimated and blank fields are implicitly filled. If an integer is provided it is used as an upper bound for the number of columns. If \code{fill=Inf} then the whole file is read for detecting the number of columns. }
\item{blank.lines.skip}{\code{logical}, default is \code{FALSE}. If \code{TRUE} blank lines in the input are ignored.}
\item{comment.char}{Character vector of length one containing a single character or an empty string. Any text after the comment character in a line is ignored. Use \code{""} to turn off the interpretation of comments altogether.}
\item{key}{Character vector of one or more column names which is passed to \code{\link{setkey}}. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. }
\item{index}{ Character vector or list of character vectors of one or more column names which is passed to \code{\link{setindexv}}. As with \code{key}, comma-separated notation like \code{index="x,y,z"} is accepted for convenience. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. }
\item{showProgress}{ \code{TRUE} displays progress on the console if the ETA is greater than 3 seconds. It is produced in fread's C code where the very nice (but R level) txtProgressBar and tkProgressBar are not easily available. }
Expand Down
2 changes: 1 addition & 1 deletion src/data.table.h
Original file line number Diff line number Diff line change
Expand Up @@ -361,7 +361,7 @@ SEXP setcharvec(SEXP, SEXP, SEXP);
SEXP chmatch_R(SEXP, SEXP, SEXP);
SEXP chmatchdup_R(SEXP, SEXP, SEXP);
SEXP chin_R(SEXP, SEXP);
SEXP freadR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
SEXP freadR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
SEXP fwriteR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
SEXP rbindlist(SEXP, SEXP, SEXP, SEXP, SEXP);
SEXP setlistelt(SEXP, SEXP, SEXP);
Expand Down
Loading
Loading