Skip to content

Commit efd8797

Browse files
committed
use skip_line helper
1 parent 9d18827 commit efd8797

File tree

1 file changed

+59
-28
lines changed

1 file changed

+59
-28
lines changed

src/fread.c

Lines changed: 59 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -301,6 +301,15 @@ static inline bool eol(const char **pch)
301301
return eol_one_r && **pch == '\r';
302302
}
303303

304+
305+
/**
 * Advance `ch` past the remainder of the current line.
 *
 * Scans forward from `ch` until the first '\n' or '\r' byte, or until `eof`,
 * whichever comes first. If a line-ending byte was found before `eof` and
 * eol() accepts it, the pointer is stepped one byte further so that the
 * result points just after the line-ending sequence.
 *
 * @param ch   current position in the input buffer
 * @param eof  end of the input buffer; this function does not dereference
 *             `ch` once it has reached `eof`
 * @return     the position after the line ending, or `eof` if no '\n' / '\r'
 *             was found before the end of the buffer
 *
 * NOTE(review): when eol() returns false, the returned pointer is still on
 * the '\r' / '\n' byte that stopped the scan, i.e. no progress is made past
 * it. This looks like the lone-'\r' case when `eol_one_r` is false; confirm
 * that every caller that loops on skip_line() tolerates this.
 */
static inline const char *skip_line(const char *ch, const char *eof) {
306+
while (ch < eof && *ch != '\n' && *ch != '\r')
307+
ch++;
308+
if (ch < eof && eol(&ch)) ch++;  // eol() moves ch within the line-ending sequence; ++ then steps past it
309+
return ch;
310+
}
311+
312+
304313
/**
305314
* Return True iff `ch` is a valid field terminator character: either a field
306315
* separator or a newline.
@@ -352,15 +361,12 @@ static inline int countfields(const char **pch)
352361
if (sep == ' ') while (*ch == ' ') ch++; // multiple sep==' ' at the start does not mean sep
353362
skip_white(&ch);
354363
if (commentChar && *ch == commentChar) {
355-
while (ch < eof && *ch != '\n' && *ch != '\r') ch++;
356-
if (ch < eof) {
357-
if (*ch == '\r' || *ch == '\n') {
358-
eol(&ch);
359-
if (ch < eof) ch++;
360-
}
364+
const char *next = skip_line(ch, eof);
365+
if (next < eof) {
366+
ch = next;
361367
continue; // rescan next line
362368
}
363-
*pch = ch;
369+
*pch = next;
364370
return 0;
365371
}
366372
break;
@@ -378,13 +384,7 @@ static inline int countfields(const char **pch)
378384
while (ch < eof) {
379385
Field(&ctx);
380386
if (commentChar && *ch == commentChar) {
381-
while (ch < eof && *ch != '\n' && *ch != '\r') ch++;
382-
if (ch < eof) {
383-
if (*ch == '\r' || *ch == '\n') {
384-
eol(&ch);
385-
if (ch < eof) ch++;
386-
}
387-
}
387+
ch = skip_line(ch, eof);
388388
*pch = ch;
389389
return ncol;
390390
}
@@ -416,19 +416,16 @@ static inline const char *nextGoodLine(const char *ch, int ncol)
416416
// If this doesn't return the true line start, no matter. The previous thread will run-on and
417417
// resolve it. A good guess is all we need here. Being wrong will just be a bit slower.
418418
// If there are no embedded newlines, all newlines are true, and this guess will never be wrong.
419-
while (*ch != '\n' && *ch != '\r' && (*ch != '\0' || ch < eof)) ch++;
419+
ch = skip_line(ch, eof);
420420
if (ch == eof) return eof;
421-
if (eol(&ch)) // move to last byte of the line ending sequence (e.g. \r\r\n would be +2).
422-
ch++; // and then move to first byte of next line
423421
const char *simpleNext = ch; // simply the first newline after the jump
424422
// if a better one can't be found, return this one (simpleNext). This will be the case when
425423
// fill=TRUE and the jump lands before 5 too-short lines, for example.
426424

427425
for (int attempts = 0; attempts < 5 && ch < eof; attempts++) {
428426
const char *ch2 = ch;
429427
if (countfields(&ch2) == ncol) return ch; // returns simpleNext here on first attempt, almost all the time
430-
while (*ch != '\n' && *ch != '\r' && (*ch != '\0' || ch < eof)) ch++;
431-
if (eol(&ch)) ch++;
428+
ch = skip_line(ch, eof);
432429
}
433430
return simpleNext;
434431
}
@@ -2021,6 +2018,21 @@ int freadMain(freadMainArgs _args)
20212018
for (int jump = 0; jump < nJumps; jump++) {
20222019
if (jump == 0) {
20232020
ch = pos;
2021+
// Skip leading comment lines before processing header
2022+
if (commentChar) {
2023+
while (ch < eof) {
2024+
const char *lineStart = ch;
2025+
ch = skip_to_comment_or_nonwhite(ch);
2026+
if (ch < eof && *ch == commentChar) {
2027+
ch = skip_line(ch, eof);
2028+
row1line++;
2029+
continue;
2030+
}
2031+
ch = lineStart;
2032+
break;
2033+
}
2034+
pos = ch;
2035+
}
20242036
if (args.header != false) {
20252037
countfields(&ch); // skip first row for type guessing as it's probably column names
20262038
row1line++;
@@ -2225,7 +2237,17 @@ int freadMain(freadMainArgs _args)
22252237
if (verbose) DTPRINT(_("[08] Assign column names\n"));
22262238

22272239
ch = pos; // back to start of first row (column names if header==true)
2228-
2240+
// Skip leading comment lines before parsing header
2241+
if (args.header != false && commentChar) {
2242+
while (ch < eof) {
2243+
ch = skip_to_comment_or_nonwhite(ch);
2244+
if (ch < eof && *ch == commentChar) {
2245+
ch = skip_line(ch, eof);
2246+
} else break;
2247+
}
2248+
pos = ch;
2249+
colNamesAnchor = pos;
2250+
}
22292251
if (args.header == false) {
22302252
colNames = NULL; // userOverride will assign V1, V2, etc
22312253
} else {
@@ -2249,8 +2271,7 @@ int freadMain(freadMainArgs _args)
22492271
// skip leading whitespace to detect inline comment marker in header row
22502272
const char *commentPos = skip_to_comment_or_nonwhite(ch);
22512273
if (commentPos < eof && *commentPos == commentChar) {
2252-
ch = commentPos;
2253-
while (ch < eof && *ch != '\n' && *ch != '\r') ch++;
2274+
ch = skip_line(commentPos, eof);
22542275
break; // stop header parsing after comment
22552276
}
22562277
}
@@ -2264,13 +2285,23 @@ int freadMain(freadMainArgs _args)
22642285
// fast-trim trailing comment text after the header names
22652286
const char *commentPos = skip_to_comment_or_nonwhite(ch);
22662287
if (commentPos < eof && *commentPos == commentChar) {
2267-
ch = commentPos;
2268-
while (ch < eof && *ch != '\n' && *ch != '\r') ch++;
2288+
ch = skip_line(commentPos, eof);
2289+
}
2290+
}
2291+
if (ch == eof || *ch == '\0') {
2292+
pos = ch;
2293+
} else if (*ch == '\n' || *ch == '\r') {
2294+
if (eol(&ch)) {
2295+
if (ch < eof) ch++;
2296+
pos = ch;
2297+
} else {
2298+
INTERNAL_STOP("reading colnames ending on '%c'", *ch); // # nocov
22692299
}
2300+
} else if (ch > sof && (ch[-1] == '\n' || ch[-1] == '\r')) {
2301+
pos = ch;
2302+
} else {
2303+
INTERNAL_STOP("reading colnames ending on '%c'", *ch); // # nocov
22702304
}
2271-
if (eol(&ch)) pos = ++ch;
2272-
else if (*ch == '\0') pos = ch;
2273-
else INTERNAL_STOP("reading colnames ending on '%c'", *ch); // # nocov
22742305
// now on first data row (row after column names)
22752306
// when fill=TRUE and column names shorter (test 1635.2), leave calloc initialized lenOff.len==0
22762307
}
@@ -2882,7 +2913,7 @@ int freadMain(freadMainArgs _args)
28822913
} else {
28832914
const char *skippedFooter = ENC2NATIVE(ch);
28842915
// detect if it's a single line footer. Commonly the row count from SQL queries.
2885-
while (ch < eof && *ch != '\n' && *ch != '\r') ch++;
2916+
ch = skip_line(ch, eof);
28862917
while (ch < eof && isspace(*ch)) ch++;
28872918
if (ch == eof) {
28882919
DTWARN(_("Discarded single-line footer: <<%s>>"), strlim(skippedFooter, (char[500]) {0}, 500));

0 commit comments

Comments
 (0)