Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
74bf37e
implement comment.char argument for fread
ben-schwen Oct 17, 2025
6f34ff2
remove handling of comment.char=NULL
ben-schwen Oct 18, 2025
2d4ae58
update NEWS
ben-schwen Oct 18, 2025
1d2f11a
update tests
ben-schwen Oct 18, 2025
a9851ef
change wording for error
ben-schwen Oct 18, 2025
9b05759
remove unreachable code
ben-schwen Oct 18, 2025
86b3b14
add helper function
ben-schwen Oct 18, 2025
5024949
extend tests
ben-schwen Oct 18, 2025
9d18827
update tests
ben-schwen Oct 18, 2025
efd8797
use skip_line helper
ben-schwen Oct 18, 2025
87ba151
add comments for helpers
ben-schwen Oct 18, 2025
b457e4a
fix test numbering
ben-schwen Oct 18, 2025
09b4932
add more tests
ben-schwen Oct 18, 2025
a0a9525
simplify read
ben-schwen Oct 18, 2025
79508a3
Revert "simplify read"
ben-schwen Oct 18, 2025
e5bbe96
separate helpers
ben-schwen Oct 18, 2025
cc57d27
add comments
ben-schwen Oct 18, 2025
3e73565
add coverage
ben-schwen Oct 18, 2025
e77137e
simplify header handling
ben-schwen Oct 19, 2025
09f9359
increase coverage
ben-schwen Oct 19, 2025
29bdb3d
simplify code
ben-schwen Oct 19, 2025
a25dd37
control skipping white spaces before comments with strip.white
ben-schwen Oct 19, 2025
01edb9a
tighten helper
ben-schwen Oct 19, 2025
21769cf
Merge branch 'master' into fread_commentChar
MichaelChirico Oct 20, 2025
f61989a
try improving readability with blank lines
MichaelChirico Oct 20, 2025
9416006
include some line-end comments in the multi-line comment test
MichaelChirico Oct 20, 2025
fe2b74b
match read.table for na.strings and comment.char
ben-schwen Oct 20, 2025
9288dd1
add strip.white=FALSE header testcase
ben-schwen Oct 20, 2025
73590c4
refactor end_of_field helper into more readable version
ben-schwen Oct 20, 2025
f925ce4
add example for strip.white
ben-schwen Oct 20, 2025
c77332d
summarize line-skipping behavior
MichaelChirico Oct 20, 2025
d73651d
clean up tmp
MichaelChirico Oct 20, 2025
ba0c68a
don't introduce whitespace to string literal body
MichaelChirico Oct 20, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,7 @@
# user system elapsed
# 0.028 0.000 0.005
```
20. `fread()` gains a `comment.char` argument to skip trailing comments and comment-only lines, similar to `read.table`, [#856](https://github.com/Rdatatable/data.table/issues/856). Thanks to @arunsrinivasan and many others for the suggestion and @ben-schwen for the implementation.

### BUG FIXES

Expand Down
8 changes: 6 additions & 2 deletions R/fread.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ fread = function(
input="", file=NULL, text=NULL, cmd=NULL, sep="auto", sep2="auto", dec="auto", quote="\"", nrows=Inf, header="auto",
na.strings=getOption("datatable.na.strings","NA"), stringsAsFactors=FALSE, verbose=getOption("datatable.verbose",FALSE),
skip="__auto__", select=NULL, drop=NULL, colClasses=NULL, integer64=getOption("datatable.integer64","integer64"),
col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, index=NULL,
col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, comment.char="", key=NULL, index=NULL,
showProgress=getOption("datatable.showProgress",interactive()), data.table=getOption("datatable.fread.datatable",TRUE),
nThread=getDTthreads(verbose), logical01=getOption("datatable.logical01",FALSE),
logicalYN=getOption("datatable.logicalYN", FALSE),
Expand Down Expand Up @@ -30,6 +30,10 @@ yaml=FALSE, tmpdir=tempdir(), tz="UTC")
isTRUEorFALSE(stringsAsFactors) || (is.double(stringsAsFactors) && length(stringsAsFactors)==1L && 0.0<=stringsAsFactors && stringsAsFactors<=1.0),
is.numeric(nrows), length(nrows)==1L
)
if (is.null(comment.char)) comment.char = ""
if (!is.character(comment.char) || length(comment.char) != 1L || is.na(comment.char) || nchar(comment.char) > 1L) {
stopf("comment.char= must be a single non-NA character string.")
}
fill = if(identical(fill, Inf)) .Machine$integer.max else as.integer(fill)
nrows=as.double(nrows) #4686
if (is.na(nrows) || nrows<0L) nrows=Inf # accept -1 to mean Inf, as read.table does
Expand Down Expand Up @@ -289,7 +293,7 @@ yaml=FALSE, tmpdir=tempdir(), tz="UTC")
if (identical(tt,"") || is_utc(tt)) # empty TZ env variable ("") means UTC in C library, unlike R; _unset_ TZ means local
tz="UTC"
}
ans = .Call(CfreadR,input,identical(input,file),sep,dec,quote,header,nrows,skip,na.strings,strip.white,blank.lines.skip,
ans = .Call(CfreadR,input,identical(input,file),sep,dec,quote,header,nrows,skip,na.strings,strip.white,blank.lines.skip,comment.char,
fill,showProgress,nThread,verbose,warnings2errors,logical01,logicalYN,select,drop,colClasses,integer64,encoding,keepLeadingZeros,tz=="UTC")
if (!length(ans)) return(null.data.table()) # test 1743.308 drops all columns
nr = length(ans[[1L]])
Expand Down
7 changes: 7 additions & 0 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -21685,3 +21685,10 @@ d3 = unserialize(serialize(d2, NULL))
test(2340.05, .selfref.ok(d3), FALSE)
setDT(d3)
test(2340.06, .selfref.ok(d3), TRUE)

# implement comment.char argument in fread, #856
# 2341.1: comment-only lines between the header and data rows are dropped
test(2341.1, fread('a,b\n#a comment\n1,2\n#another comment\n3,4', comment.char='#'), data.table(a=c(1L,3L), b=c(2L,4L)))
# 2341.2: trailing comment on the header line, separated from the last name by whitespace
test(2341.2, fread('a,b #line-trailing comment\n1,2', comment.char='#'), data.table(a=c(1L), b=c(2L)))
# 2341.3: trailing comment on the header line immediately after the last name
test(2341.3, fread('a,b#line-trailing comment and no whitespace\n1,2', comment.char='#'), data.table(a=c(1L), b=c(2L)))
# 2341.4: trailing comment after a numeric field on a data row
test(2341.4, fread('a,b\n1,2 #trailing after numeric', comment.char='#'), data.table(a=c(1L), b=c(2L)))
# 2341.5: the comment character inside a quoted field is kept as data
test(2341.5, fread('a\n"#quotes#"\n', comment.char="#"), data.table(a=c("#quotes#")))
3 changes: 2 additions & 1 deletion man/fread.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ skip="__auto__", select=NULL, drop=NULL, colClasses=NULL,
integer64=getOption("datatable.integer64", "integer64"),
col.names,
check.names=FALSE, encoding="unknown",
strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE,
strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, comment.char="",
key=NULL, index=NULL,
showProgress=getOption("datatable.showProgress", interactive()),
data.table=getOption("datatable.fread.datatable", TRUE),
Expand Down Expand Up @@ -56,6 +56,7 @@ yaml=FALSE, tmpdir=tempdir(), tz="UTC"
\item{strip.white}{ Logical, default \code{TRUE}, in which case leading and trailing whitespace is stripped from unquoted \code{"character"} fields. \code{"numeric"} fields are always stripped of leading and trailing whitespace.}
\item{fill}{logical or integer (default is \code{FALSE}). If \code{TRUE} then in case the rows have unequal length, number of columns is estimated and blank fields are implicitly filled. If an integer is provided it is used as an upper bound for the number of columns. If \code{fill=Inf} then the whole file is read for detecting the number of columns. }
\item{blank.lines.skip}{\code{logical}, default is \code{FALSE}. If \code{TRUE} blank lines in the input are ignored.}
\item{comment.char}{Character vector of length one containing a single character or an empty string. Any text after the comment character in a line is ignored. Use \code{""} to turn off the interpretation of comments altogether.}
\item{key}{Character vector of one or more column names which is passed to \code{\link{setkey}}. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. }
\item{index}{ Character vector or list of character vectors of one or more column names which is passed to \code{\link{setindexv}}. As with \code{key}, comma-separated notation like \code{index="x,y,z"} is accepted for convenience. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. }
\item{showProgress}{ \code{TRUE} displays progress on the console if the ETA is greater than 3 seconds. It is produced in fread's C code where the very nice (but R level) txtProgressBar and tkProgressBar are not easily available. }
Expand Down
2 changes: 1 addition & 1 deletion src/data.table.h
Original file line number Diff line number Diff line change
Expand Up @@ -351,7 +351,7 @@ SEXP setcharvec(SEXP, SEXP, SEXP);
SEXP chmatch_R(SEXP, SEXP, SEXP);
SEXP chmatchdup_R(SEXP, SEXP, SEXP);
SEXP chin_R(SEXP, SEXP);
SEXP freadR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
SEXP freadR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
SEXP fwriteR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
SEXP rbindlist(SEXP, SEXP, SEXP, SEXP, SEXP);
SEXP setlistelt(SEXP, SEXP, SEXP);
Expand Down
79 changes: 75 additions & 4 deletions src/fread.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ static const char *sof, *eof;
static char sep;
static char whiteChar; // what to consider as whitespace to skip: ' ', '\t' or 0 means both (when sep!=' ' && sep!='\t')
static char quote, dec;
static char commentChar;
static int linesForDecDot; // when dec='auto', track the balance of fields in favor of dec='.' vs dec=',', ties go to '.'
static bool eol_one_r; // only true very rarely for \r-only files

Expand Down Expand Up @@ -188,7 +189,7 @@ bool freadCleanup(void)
}
free(mmp_copy); mmp_copy = NULL;
fileSize = 0;
sep = whiteChar = quote = dec = '\0';
sep = whiteChar = quote = dec = commentChar = '\0';
quoteRule = -1;
any_number_like_NAstrings = false;
blank_is_a_NAstring = false;
Expand Down Expand Up @@ -304,7 +305,8 @@ static inline bool end_of_field(const char *ch)
// default, and therefore characters in the range 0x80-0xFF are negative.
// We use eol() because that looks at eol_one_r inside it w.r.t. \r
// \0 (maybe more than one) before eof are part of field and do not end it; eol() returns false for \0 but the ch==eof will return true for the \0 at eof.
return *ch == sep || ((uint8_t)*ch <= 13 && (ch == eof || eol(&ch)));
// Comment characters terminate a field immediately and take precedence over separators.
return *ch == sep || ((uint8_t)*ch <= 13 && (ch == eof || eol(&ch))) || (commentChar && *ch == commentChar);
}

static inline const char *end_NA_string(const char *start)
Expand Down Expand Up @@ -336,8 +338,24 @@ static inline int countfields(const char **pch)
static void *targets[9];
targets[8] = (void*) &trash;
const char *ch = *pch;
if (sep == ' ') while (*ch == ' ') ch++; // multiple sep==' ' at the start does not mean sep
skip_white(&ch);
for (;;) {
if (ch >= eof) { *pch = ch; return 0; }
if (sep == ' ') while (*ch == ' ') ch++; // multiple sep==' ' at the start does not mean sep
skip_white(&ch);
if (commentChar && *ch == commentChar) {
while (ch < eof && *ch != '\n' && *ch != '\r') ch++;
if (ch < eof) {
if (*ch == '\r' || *ch == '\n') {
eol(&ch);
if (ch < eof) ch++;
}
continue; // rescan next line
}
*pch = ch;
return 0;
}
break;
}
if (eol(&ch) || ch == eof) {
*pch = ch + 1;
return 0;
Expand All @@ -350,6 +368,17 @@ static inline int countfields(const char **pch)
};
while (ch < eof) {
Field(&ctx);
if (commentChar && *ch == commentChar) {
while (ch < eof && *ch != '\n' && *ch != '\r') ch++;
if (ch < eof) {
if (*ch == '\r' || *ch == '\n') {
eol(&ch);
if (ch < eof) ch++;
}
}
*pch = ch;
return ncol;
}
// Field() leaves *ch resting on sep, \r, \n or *eof=='\0'
if (sep == ' ' && *ch == sep) {
while (ch[1] == ' ') ch++;
Expand Down Expand Up @@ -1422,6 +1451,7 @@ int freadMain(freadMainArgs _args)
fill = args.fill;
dec = args.dec;
quote = args.quote;
commentChar = args.comment;
if (args.sep == quote && quote!='\0') STOP(_("sep == quote ('%c') is not allowed"), quote);
if (args.sep == dec && dec != '\0') STOP(_("sep == dec ('%c') is not allowed"), dec);
if (quote == dec && dec != '\0') STOP(_("quote == dec ('%c') is not allowed"), dec);
Expand Down Expand Up @@ -2206,12 +2236,31 @@ int freadMain(freadMainArgs _args)
ch++;
Field(&fctx); // stores the string length and offset as <uint,uint> in colNames[i]
((lenOff**) fctx.targets)[8]++;
if (commentChar) {
// skip leading whitespace to detect inline comment marker in header row
const char *commentPos = ch;
while (commentPos < eof && (*commentPos == ' ' || *commentPos == '\t' || *commentPos == '\0')) commentPos++;
if (commentPos < eof && *commentPos == commentChar) {
ch = commentPos;
while (ch < eof && *ch != '\n' && *ch != '\r') ch++;
break; // stop header parsing after comment
}
}
if (*ch != sep) break;
if (sep == ' ') {
while (ch[1] == ' ') ch++;
if (ch[1] == '\r' || ch[1] == '\n' || ch[1] == '\0') { ch++; break; }
}
}
if (commentChar) {
// fast-trim trailing comment text after the header names
const char *commentPos = ch;
while (commentPos < eof && (*commentPos == ' ' || *commentPos == '\t' || *commentPos == '\0')) commentPos++;
if (commentPos < eof && *commentPos == commentChar) {
ch = commentPos;
while (ch < eof && *ch != '\n' && *ch != '\r') ch++;
}
}
if (eol(&ch)) pos = ++ch;
else if (*ch == '\0') pos = ch;
else INTERNAL_STOP("reading colnames ending on '%c'", *ch); // # nocov
Expand Down Expand Up @@ -2452,6 +2501,19 @@ int freadMain(freadMainArgs _args)
tLineStart = tch; // for error message
const char *fieldStart = tch;
int j = 0;

if (commentChar) {
// treat lines whose first non-space character is the comment marker as empty
const char *afterWhite = tLineStart;
while (afterWhite < eof && (*afterWhite == ' ' || *afterWhite == '\t' || *afterWhite == '\0')) afterWhite++;
if (afterWhite < eof && *afterWhite == commentChar) {
const char *skip = afterWhite;
while (skip < eof && *skip != '\n' && *skip != '\r') skip++;
if (skip < eof && eol(&skip)) skip++;
tch = skip;
continue;
}
}

//*** START HOT ***//
if (sep != ' ' && !any_number_like_NAstrings) { // TODO: can this 'if' be dropped somehow? Can numeric NAstrings be dealt with afterwards in one go as numeric comparison?
Expand Down Expand Up @@ -2596,6 +2658,15 @@ int freadMain(freadMainArgs _args)
int8_t thisSize = size[j];
if (thisSize) ((char**) targets)[size[j]] += size[j]; // 'if' to avoid undefined NULL+=0 when rereading
j++;
if (commentChar) {
const char *commentPtr = tch;
while (commentPtr < eof && (*commentPtr == ' ' || *commentPtr == '\t' || *commentPtr == '\0')) commentPtr++;
if (commentPtr < eof && *commentPtr == commentChar) {
tch = commentPtr;
while (tch < eof && *tch != '\n' && *tch != '\r') tch++;
break;
}
}
if (*tch == sep) { tch++; continue; }
if (fill && (*tch == '\n' || *tch == '\r' || tch == eof) && j < ncol) continue; // reuse processors to write appropriate NA to target; saves maintenance of a type switch down here
break;
Expand Down
4 changes: 4 additions & 0 deletions src/fread.h
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,10 @@ typedef struct freadMainArgs
// non-ASCII, or different open/closing quotation marks are not supported.
char quote;

// Character that marks the beginning of a comment. When '\0', comment
// parsing is disabled.
char comment;

// Is there a header at the beginning of the file?
// 0 = no, 1 = yes, -128 = autodetect
int8_t header;
Expand Down
8 changes: 8 additions & 0 deletions src/freadR.c
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ SEXP freadR(
SEXP NAstringsArg,
SEXP stripWhiteArg,
SEXP skipEmptyLinesArg,
SEXP commentCharArg,
SEXP fillArg,
SEXP showProgressArg,
SEXP nThreadArg,
Expand Down Expand Up @@ -158,6 +159,13 @@ SEXP freadR(
// here we use bool and rely on fread at R level to check these do not contain NA_LOGICAL
args.stripWhite = LOGICAL(stripWhiteArg)[0];
args.skipEmptyLines = LOGICAL(skipEmptyLinesArg)[0];
if (!isString(commentCharArg) || LENGTH(commentCharArg) != 1)
error(_("comment.char must be a single character vector of length 1")); // # notranslate
const char *commentStr = CHAR(STRING_ELT(commentCharArg, 0));
size_t commentLen = strlen(commentStr);
if (commentLen > 1)
error(_("comment.char must be a single character or \"\"")); // # notranslate
args.comment = commentLen == 0 ? '\0' : commentStr[0];
args.fill = INTEGER(fillArg)[0];
args.showProgress = LOGICAL(showProgressArg)[0];
if (INTEGER(nThreadArg)[0] < 1)
Expand Down
Loading