Skip to content

Commit 74bf37e

Browse files
committed
implement comment.char argument for fread
1 parent 55b0de6 commit 74bf37e

File tree

8 files changed

+104
-8
lines changed

8 files changed

+104
-8
lines changed

NEWS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -291,6 +291,7 @@
291291
# user system elapsed
292292
# 0.028 0.000 0.005
293293
```
294+
20. `fread()` now implements the `comment.char` argument to skip trailing comments and comment-only lines, similar to `read.table`, [#856](https://github.com/Rdatatable/data.table/issues/856). Thanks to @arunsrinivasan and many others for the suggestion and @ben-schwen for the implementation.
294295

295296
### BUG FIXES
296297

R/fread.R

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ fread = function(
22
input="", file=NULL, text=NULL, cmd=NULL, sep="auto", sep2="auto", dec="auto", quote="\"", nrows=Inf, header="auto",
33
na.strings=getOption("datatable.na.strings","NA"), stringsAsFactors=FALSE, verbose=getOption("datatable.verbose",FALSE),
44
skip="__auto__", select=NULL, drop=NULL, colClasses=NULL, integer64=getOption("datatable.integer64","integer64"),
5-
col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, index=NULL,
5+
col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, comment.char="", key=NULL, index=NULL,
66
showProgress=getOption("datatable.showProgress",interactive()), data.table=getOption("datatable.fread.datatable",TRUE),
77
nThread=getDTthreads(verbose), logical01=getOption("datatable.logical01",FALSE),
88
logicalYN=getOption("datatable.logicalYN", FALSE),
@@ -30,6 +30,10 @@ yaml=FALSE, tmpdir=tempdir(), tz="UTC")
3030
isTRUEorFALSE(stringsAsFactors) || (is.double(stringsAsFactors) && length(stringsAsFactors)==1L && 0.0<=stringsAsFactors && stringsAsFactors<=1.0),
3131
is.numeric(nrows), length(nrows)==1L
3232
)
33+
if (is.null(comment.char)) comment.char = ""
34+
if (!is.character(comment.char) || length(comment.char) != 1L || is.na(comment.char) || nchar(comment.char) > 1L) {
35+
stopf("comment.char= must be a single non-NA character string.")
36+
}
3337
fill = if(identical(fill, Inf)) .Machine$integer.max else as.integer(fill)
3438
nrows=as.double(nrows) #4686
3539
if (is.na(nrows) || nrows<0L) nrows=Inf # accept -1 to mean Inf, as read.table does
@@ -289,7 +293,7 @@ yaml=FALSE, tmpdir=tempdir(), tz="UTC")
289293
if (identical(tt,"") || is_utc(tt)) # empty TZ env variable ("") means UTC in C library, unlike R; _unset_ TZ means local
290294
tz="UTC"
291295
}
292-
ans = .Call(CfreadR,input,identical(input,file),sep,dec,quote,header,nrows,skip,na.strings,strip.white,blank.lines.skip,
296+
ans = .Call(CfreadR,input,identical(input,file),sep,dec,quote,header,nrows,skip,na.strings,strip.white,blank.lines.skip,comment.char,
293297
fill,showProgress,nThread,verbose,warnings2errors,logical01,logicalYN,select,drop,colClasses,integer64,encoding,keepLeadingZeros,tz=="UTC")
294298
if (!length(ans)) return(null.data.table()) # test 1743.308 drops all columns
295299
nr = length(ans[[1L]])

inst/tests/tests.Rraw

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21685,3 +21685,10 @@ d3 = unserialize(serialize(d2, NULL))
2168521685
test(2340.05, .selfref.ok(d3), FALSE)
2168621686
setDT(d3)
2168721687
test(2340.06, .selfref.ok(d3), TRUE)
21688+
21689+
# implement comment.char argument in fread, #856
21690+
test(2341.1, fread('a,b\n#a comment\n1,2\n#another comment\n3,4', comment.char='#'), data.table(a=c(1L,3L), b=c(2L,4L)))
21691+
test(2341.2, fread('a,b #line-trailing comment\n1,2', comment.char='#'), data.table(a=c(1L), b=c(2L)))
21692+
test(2341.3, fread('a,b#line-trailing comment and no whitespace\n1,2', comment.char='#'), data.table(a=c(1L), b=c(2L)))
21693+
test(2341.4, fread('a,b\n1,2 #trailing after numeric', comment.char='#'), data.table(a=c(1L), b=c(2L)))
21694+
test(2341.5, fread('a\n"#quotes#"\n', comment.char="#"), data.table(a=c("#quotes#")))

man/fread.Rd

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ skip="__auto__", select=NULL, drop=NULL, colClasses=NULL,
1717
integer64=getOption("datatable.integer64", "integer64"),
1818
col.names,
1919
check.names=FALSE, encoding="unknown",
20-
strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE,
20+
strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, comment.char="",
2121
key=NULL, index=NULL,
2222
showProgress=getOption("datatable.showProgress", interactive()),
2323
data.table=getOption("datatable.fread.datatable", TRUE),
@@ -56,6 +56,7 @@ yaml=FALSE, tmpdir=tempdir(), tz="UTC"
5656
\item{strip.white}{ Logical, default \code{TRUE}, in which case leading and trailing whitespace is stripped from unquoted \code{"character"} fields. \code{"numeric"} fields are always stripped of leading and trailing whitespace.}
5757
\item{fill}{logical or integer (default is \code{FALSE}). If \code{TRUE} then in case the rows have unequal length, number of columns is estimated and blank fields are implicitly filled. If an integer is provided it is used as an upper bound for the number of columns. If \code{fill=Inf} then the whole file is read for detecting the number of columns. }
5858
\item{blank.lines.skip}{\code{logical}, default is \code{FALSE}. If \code{TRUE} blank lines in the input are ignored.}
59+
\item{comment.char}{Character vector of length one containing a single character or an empty string. Any text after the comment character in a line is ignored. Use \code{""} to turn off the interpretation of comments altogether.}
5960
\item{key}{Character vector of one or more column names which is passed to \code{\link{setkey}}. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. }
6061
\item{index}{ Character vector or list of character vectors of one or more column names which is passed to \code{\link{setindexv}}. As with \code{key}, comma-separated notation like \code{index="x,y,z"} is accepted for convenience. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. }
6162
\item{showProgress}{ \code{TRUE} displays progress on the console if the ETA is greater than 3 seconds. It is produced in fread's C code where the very nice (but R level) txtProgressBar and tkProgressBar are not easily available. }

src/data.table.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -351,7 +351,7 @@ SEXP setcharvec(SEXP, SEXP, SEXP);
351351
SEXP chmatch_R(SEXP, SEXP, SEXP);
352352
SEXP chmatchdup_R(SEXP, SEXP, SEXP);
353353
SEXP chin_R(SEXP, SEXP);
354-
SEXP freadR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
354+
SEXP freadR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
355355
SEXP fwriteR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
356356
SEXP rbindlist(SEXP, SEXP, SEXP, SEXP, SEXP);
357357
SEXP setlistelt(SEXP, SEXP, SEXP);

src/fread.c

Lines changed: 75 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ static const char *sof, *eof;
3131
static char sep;
3232
static char whiteChar; // what to consider as whitespace to skip: ' ', '\t' or 0 means both (when sep!=' ' && sep!='\t')
3333
static char quote, dec;
34+
static char commentChar;
3435
static int linesForDecDot; // when dec='auto', track the balance of fields in favor of dec='.' vs dec=',', ties go to '.'
3536
static bool eol_one_r; // only true very rarely for \r-only files
3637

@@ -188,7 +189,7 @@ bool freadCleanup(void)
188189
}
189190
free(mmp_copy); mmp_copy = NULL;
190191
fileSize = 0;
191-
sep = whiteChar = quote = dec = '\0';
192+
sep = whiteChar = quote = dec = commentChar = '\0';
192193
quoteRule = -1;
193194
any_number_like_NAstrings = false;
194195
blank_is_a_NAstring = false;
@@ -304,7 +305,8 @@ static inline bool end_of_field(const char *ch)
304305
// default, and therefore characters in the range 0x80-0xFF are negative.
305306
// We use eol() because that looks at eol_one_r inside it w.r.t. \r
306307
// \0 (maybe more than one) before eof are part of field and do not end it; eol() returns false for \0 but the ch==eof will return true for the \0 at eof.
307-
return *ch == sep || ((uint8_t)*ch <= 13 && (ch == eof || eol(&ch)));
308+
// Comment characters terminate a field immediately and take precedence over separators.
309+
return *ch == sep || ((uint8_t)*ch <= 13 && (ch == eof || eol(&ch))) || (commentChar && *ch == commentChar);
308310
}
309311

310312
static inline const char *end_NA_string(const char *start)
@@ -336,8 +338,24 @@ static inline int countfields(const char **pch)
336338
static void *targets[9];
337339
targets[8] = (void*) &trash;
338340
const char *ch = *pch;
339-
if (sep == ' ') while (*ch == ' ') ch++; // multiple sep==' ' at the start does not mean sep
340-
skip_white(&ch);
341+
for (;;) {
342+
if (ch >= eof) { *pch = ch; return 0; }
343+
if (sep == ' ') while (*ch == ' ') ch++; // multiple sep==' ' at the start does not mean sep
344+
skip_white(&ch);
345+
if (commentChar && *ch == commentChar) {
346+
while (ch < eof && *ch != '\n' && *ch != '\r') ch++;
347+
if (ch < eof) {
348+
if (*ch == '\r' || *ch == '\n') {
349+
eol(&ch);
350+
if (ch < eof) ch++;
351+
}
352+
continue; // rescan next line
353+
}
354+
*pch = ch;
355+
return 0;
356+
}
357+
break;
358+
}
341359
if (eol(&ch) || ch == eof) {
342360
*pch = ch + 1;
343361
return 0;
@@ -350,6 +368,17 @@ static inline int countfields(const char **pch)
350368
};
351369
while (ch < eof) {
352370
Field(&ctx);
371+
if (commentChar && *ch == commentChar) {
372+
while (ch < eof && *ch != '\n' && *ch != '\r') ch++;
373+
if (ch < eof) {
374+
if (*ch == '\r' || *ch == '\n') {
375+
eol(&ch);
376+
if (ch < eof) ch++;
377+
}
378+
}
379+
*pch = ch;
380+
return ncol;
381+
}
353382
// Field() leaves *ch resting on sep, \r, \n or *eof=='\0'
354383
if (sep == ' ' && *ch == sep) {
355384
while (ch[1] == ' ') ch++;
@@ -1422,6 +1451,7 @@ int freadMain(freadMainArgs _args)
14221451
fill = args.fill;
14231452
dec = args.dec;
14241453
quote = args.quote;
1454+
commentChar = args.comment;
14251455
if (args.sep == quote && quote!='\0') STOP(_("sep == quote ('%c') is not allowed"), quote);
14261456
if (args.sep == dec && dec != '\0') STOP(_("sep == dec ('%c') is not allowed"), dec);
14271457
if (quote == dec && dec != '\0') STOP(_("quote == dec ('%c') is not allowed"), dec);
@@ -2206,12 +2236,31 @@ int freadMain(freadMainArgs _args)
22062236
ch++;
22072237
Field(&fctx); // stores the string length and offset as <uint,uint> in colNames[i]
22082238
((lenOff**) fctx.targets)[8]++;
2239+
if (commentChar) {
2240+
// skip leading whitespace to detect inline comment marker in header row
2241+
const char *commentPos = ch;
2242+
while (commentPos < eof && (*commentPos == ' ' || *commentPos == '\t' || *commentPos == '\0')) commentPos++;
2243+
if (commentPos < eof && *commentPos == commentChar) {
2244+
ch = commentPos;
2245+
while (ch < eof && *ch != '\n' && *ch != '\r') ch++;
2246+
break; // stop header parsing after comment
2247+
}
2248+
}
22092249
if (*ch != sep) break;
22102250
if (sep == ' ') {
22112251
while (ch[1] == ' ') ch++;
22122252
if (ch[1] == '\r' || ch[1] == '\n' || ch[1] == '\0') { ch++; break; }
22132253
}
22142254
}
2255+
if (commentChar) {
2256+
// fast-trim trailing comment text after the header names
2257+
const char *commentPos = ch;
2258+
while (commentPos < eof && (*commentPos == ' ' || *commentPos == '\t' || *commentPos == '\0')) commentPos++;
2259+
if (commentPos < eof && *commentPos == commentChar) {
2260+
ch = commentPos;
2261+
while (ch < eof && *ch != '\n' && *ch != '\r') ch++;
2262+
}
2263+
}
22152264
if (eol(&ch)) pos = ++ch;
22162265
else if (*ch == '\0') pos = ch;
22172266
else INTERNAL_STOP("reading colnames ending on '%c'", *ch); // # nocov
@@ -2452,6 +2501,19 @@ int freadMain(freadMainArgs _args)
24522501
tLineStart = tch; // for error message
24532502
const char *fieldStart = tch;
24542503
int j = 0;
2504+
2505+
if (commentChar) {
2506+
// treat lines whose first non-space character is the comment marker as empty
2507+
const char *afterWhite = tLineStart;
2508+
while (afterWhite < eof && (*afterWhite == ' ' || *afterWhite == '\t' || *afterWhite == '\0')) afterWhite++;
2509+
if (afterWhite < eof && *afterWhite == commentChar) {
2510+
const char *skip = afterWhite;
2511+
while (skip < eof && *skip != '\n' && *skip != '\r') skip++;
2512+
if (skip < eof && eol(&skip)) skip++;
2513+
tch = skip;
2514+
continue;
2515+
}
2516+
}
24552517

24562518
//*** START HOT ***//
24572519
if (sep != ' ' && !any_number_like_NAstrings) { // TODO: can this 'if' be dropped somehow? Can numeric NAstrings be dealt with afterwards in one go as numeric comparison?
@@ -2596,6 +2658,15 @@ int freadMain(freadMainArgs _args)
25962658
int8_t thisSize = size[j];
25972659
if (thisSize) ((char**) targets)[size[j]] += size[j]; // 'if' to avoid undefined NULL+=0 when rereading
25982660
j++;
2661+
if (commentChar) {
2662+
const char *commentPtr = tch;
2663+
while (commentPtr < eof && (*commentPtr == ' ' || *commentPtr == '\t' || *commentPtr == '\0')) commentPtr++;
2664+
if (commentPtr < eof && *commentPtr == commentChar) {
2665+
tch = commentPtr;
2666+
while (tch < eof && *tch != '\n' && *tch != '\r') tch++;
2667+
break;
2668+
}
2669+
}
25992670
if (*tch == sep) { tch++; continue; }
26002671
if (fill && (*tch == '\n' || *tch == '\r' || tch == eof) && j < ncol) continue; // reuse processors to write appropriate NA to target; saves maintenance of a type switch down here
26012672
break;

src/fread.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,10 @@ typedef struct freadMainArgs
123123
// non-ASCII, or different open/closing quotation marks are not supported.
124124
char quote;
125125

126+
// Character that marks the beginning of a comment. When '\0', comment
127+
// parsing is disabled.
128+
char comment;
129+
126130
// Is there a header at the beginning of the file?
127131
// 0 = no, 1 = yes, -128 = autodetect
128132
int8_t header;

src/freadR.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ SEXP freadR(
6161
SEXP NAstringsArg,
6262
SEXP stripWhiteArg,
6363
SEXP skipEmptyLinesArg,
64+
SEXP commentCharArg,
6465
SEXP fillArg,
6566
SEXP showProgressArg,
6667
SEXP nThreadArg,
@@ -158,6 +159,13 @@ SEXP freadR(
158159
// here we use bool and rely on fread at R level to check these do not contain NA_LOGICAL
159160
args.stripWhite = LOGICAL(stripWhiteArg)[0];
160161
args.skipEmptyLines = LOGICAL(skipEmptyLinesArg)[0];
162+
if (!isString(commentCharArg) || LENGTH(commentCharArg) != 1)
163+
error(_("comment.char must be a single character vector of length 1")); // # notranslate
164+
const char *commentStr = CHAR(STRING_ELT(commentCharArg, 0));
165+
size_t commentLen = strlen(commentStr);
166+
if (commentLen > 1)
167+
error(_("comment.char must be a single character or \"\"")); // # notranslate
168+
args.comment = commentLen == 0 ? '\0' : commentStr[0];
161169
args.fill = INTEGER(fillArg)[0];
162170
args.showProgress = LOGICAL(showProgressArg)[0];
163171
if (INTEGER(nThreadArg)[0] < 1)

0 commit comments

Comments
 (0)