You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Copy file name to clipboardExpand all lines: NEWS.md
+1Lines changed: 1 addition & 0 deletions
Display the source diff
Display the rich diff
Original file line number
Diff line number
Diff line change
@@ -291,6 +291,7 @@
291
291
# user system elapsed
292
292
# 0.028 0.000 0.005
293
293
```
294
+
20. `fread()` now has a `comment.char` argument to skip trailing comments or comment-only lines, similar to `read.table`, [#856](https://github.com/Rdatatable/data.table/issues/856). Thanks to @arunsrinivasan and many others for the suggestion and @ben-schwen for the implementation.
\item{strip.white}{ Logical, default \code{TRUE}, in which case leading and trailing whitespace is stripped from unquoted \code{"character"} fields. \code{"numeric"} fields are always stripped of leading and trailing whitespace.}
57
57
\item{fill}{logical or integer (default is \code{FALSE}). If \code{TRUE} then in case the rows have unequal length, number of columns is estimated and blank fields are implicitly filled. If an integer is provided it is used as an upper bound for the number of columns. If \code{fill=Inf} then the whole file is read for detecting the number of columns. }
58
58
\item{blank.lines.skip}{\code{logical}, default is \code{FALSE}. If \code{TRUE} blank lines in the input are ignored.}
59
+
\item{comment.char}{Character vector of length one containing a single character or an empty string. Any text after the comment character in a line is ignored. Use \code{""} to turn off the interpretation of comments altogether.}
59
60
\item{key}{Character vector of one or more column names which is passed to \code{\link{setkey}}. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. }
60
61
\item{index}{ Character vector or list of character vectors of one or more column names which is passed to \code{\link{setindexv}}. As with \code{key}, comma-separated notation like \code{index="x,y,z"} is accepted for convenience. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. }
61
62
\item{showProgress}{ \code{TRUE} displays progress on the console if the ETA is greater than 3 seconds. It is produced in fread's C code where the very nice (but R level) txtProgressBar and tkProgressBar are not easily available. }
// default, and therefore characters in the range 0x80-0xFF are negative.
305
306
// We use eol() because that looks at eol_one_r inside it w.r.t. \r
306
307
// \0 (maybe more than one) before eof are part of field and do not end it; eol() returns false for \0 but the ch==eof will return true for the \0 at eof.
@@ -336,8 +338,24 @@ static inline int countfields(const char **pch)
336
338
staticvoid*targets[9];
337
339
targets[8] = (void*) &trash;
338
340
constchar*ch=*pch;
339
-
if (sep==' ') while (*ch==' ') ch++; // multiple sep==' ' at the start does not mean sep
340
-
skip_white(&ch);
341
+
for (;;) {
342
+
if (ch >= eof) { *pch=ch; return0; }
343
+
if (sep==' ') while (*ch==' ') ch++; // multiple sep==' ' at the start does not mean sep
344
+
skip_white(&ch);
345
+
if (commentChar&&*ch==commentChar) {
346
+
while (ch<eof&&*ch!='\n'&&*ch!='\r') ch++;
347
+
if (ch<eof) {
348
+
if (*ch=='\r'||*ch=='\n') {
349
+
eol(&ch);
350
+
if (ch<eof) ch++;
351
+
}
352
+
continue; // rescan next line
353
+
}
354
+
*pch=ch;
355
+
return0;
356
+
}
357
+
break;
358
+
}
341
359
if (eol(&ch) ||ch==eof) {
342
360
*pch=ch+1;
343
361
return0;
@@ -350,6 +368,17 @@ static inline int countfields(const char **pch)
350
368
};
351
369
while (ch<eof) {
352
370
Field(&ctx);
371
+
if (commentChar&&*ch==commentChar) {
372
+
while (ch<eof&&*ch!='\n'&&*ch!='\r') ch++;
373
+
if (ch<eof) {
374
+
if (*ch=='\r'||*ch=='\n') {
375
+
eol(&ch);
376
+
if (ch<eof) ch++;
377
+
}
378
+
}
379
+
*pch=ch;
380
+
returnncol;
381
+
}
353
382
// Field() leaves *ch resting on sep, \r, \n or *eof=='\0'
354
383
if (sep==' '&&*ch==sep) {
355
384
while (ch[1] ==' ') ch++;
@@ -1422,6 +1451,7 @@ int freadMain(freadMainArgs _args)
1422
1451
fill=args.fill;
1423
1452
dec=args.dec;
1424
1453
quote=args.quote;
1454
+
commentChar=args.comment;
1425
1455
if (args.sep==quote&"e!='\0') STOP(_("sep == quote ('%c') is not allowed"), quote);
1426
1456
if (args.sep==dec&&dec!='\0') STOP(_("sep == dec ('%c') is not allowed"), dec);
1427
1457
if (quote==dec&&dec!='\0') STOP(_("quote == dec ('%c') is not allowed"), dec);
@@ -2206,12 +2236,31 @@ int freadMain(freadMainArgs _args)
2206
2236
ch++;
2207
2237
Field(&fctx); // stores the string length and offset as <uint,uint> in colNames[i]
2208
2238
((lenOff**) fctx.targets)[8]++;
2239
+
if (commentChar) {
2240
+
// skip leading whitespace to detect inline comment marker in header row
2241
+
constchar*commentPos=ch;
2242
+
while (commentPos<eof&& (*commentPos==' '||*commentPos=='\t'||*commentPos=='\0')) commentPos++;
2243
+
if (commentPos<eof&&*commentPos==commentChar) {
2244
+
ch=commentPos;
2245
+
while (ch<eof&&*ch!='\n'&&*ch!='\r') ch++;
2246
+
break; // stop header parsing after comment
2247
+
}
2248
+
}
2209
2249
if (*ch!=sep) break;
2210
2250
if (sep==' ') {
2211
2251
while (ch[1] ==' ') ch++;
2212
2252
if (ch[1] =='\r'||ch[1] =='\n'||ch[1] =='\0') { ch++; break; }
2213
2253
}
2214
2254
}
2255
+
if (commentChar) {
2256
+
// fast-trim trailing comment text after the header names
2257
+
constchar*commentPos=ch;
2258
+
while (commentPos<eof&& (*commentPos==' '||*commentPos=='\t'||*commentPos=='\0')) commentPos++;
2259
+
if (commentPos<eof&&*commentPos==commentChar) {
2260
+
ch=commentPos;
2261
+
while (ch<eof&&*ch!='\n'&&*ch!='\r') ch++;
2262
+
}
2263
+
}
2215
2264
if (eol(&ch)) pos=++ch;
2216
2265
elseif (*ch=='\0') pos=ch;
2217
2266
elseINTERNAL_STOP("reading colnames ending on '%c'", *ch); // # nocov
@@ -2452,6 +2501,19 @@ int freadMain(freadMainArgs _args)
2452
2501
tLineStart=tch; // for error message
2453
2502
constchar*fieldStart=tch;
2454
2503
intj=0;
2504
+
2505
+
if (commentChar) {
2506
+
// treat lines whose first non-space character is the comment marker as empty
2507
+
constchar*afterWhite=tLineStart;
2508
+
while (afterWhite<eof&& (*afterWhite==' '||*afterWhite=='\t'||*afterWhite=='\0')) afterWhite++;
2509
+
if (afterWhite<eof&&*afterWhite==commentChar) {
2510
+
constchar*skip=afterWhite;
2511
+
while (skip<eof&&*skip!='\n'&&*skip!='\r') skip++;
2512
+
if (skip<eof&&eol(&skip)) skip++;
2513
+
tch=skip;
2514
+
continue;
2515
+
}
2516
+
}
2455
2517
2456
2518
//*** START HOT ***//
2457
2519
if (sep!=' '&& !any_number_like_NAstrings) { // TODO: can this 'if' be dropped somehow? Can numeric NAstrings be dealt with afterwards in one go as numeric comparison?
@@ -2596,6 +2658,15 @@ int freadMain(freadMainArgs _args)
2596
2658
int8_tthisSize=size[j];
2597
2659
if (thisSize) ((char**) targets)[size[j]] +=size[j]; // 'if' to avoid undefined NULL+=0 when rereading
2598
2660
j++;
2661
+
if (commentChar) {
2662
+
constchar*commentPtr=tch;
2663
+
while (commentPtr<eof&& (*commentPtr==' '||*commentPtr=='\t'||*commentPtr=='\0')) commentPtr++;
2664
+
if (commentPtr<eof&&*commentPtr==commentChar) {
2665
+
tch=commentPtr;
2666
+
while (tch<eof&&*tch!='\n'&&*tch!='\r') tch++;
2667
+
break;
2668
+
}
2669
+
}
2599
2670
if (*tch==sep) { tch++; continue; }
2600
2671
if (fill&& (*tch=='\n'||*tch=='\r'||tch==eof) &&j<ncol) continue; // reuse processors to write appropriate NA to target; saves maintenance of a type switch down here
0 commit comments