@@ -301,6 +301,15 @@ static inline bool eol(const char **pch)
301301 return eol_one_r && * * pch == '\r' ;
302302}
303303
304+
/**
 * Advance past the remainder of the current line.
 *
 * Scans forward from `ch`, never reading at or beyond `eof`, until it reaches
 * a line ending that eol() accepts, and returns a pointer to the first byte
 * after that line ending. If no accepted line ending occurs before `eof`,
 * returns `eof`.
 *
 * A '\n' or '\r' that eol() rejects (eol() can return false on a '\r' when
 * eol_one_r is off) is treated as ordinary line content and scanning
 * continues. Stopping there would hand callers a pointer into the middle of
 * the line, and callers that loop on skip_line() would make no progress.
 *
 * @param ch   current position; must satisfy ch <= eof
 * @param eof  one past the last readable byte of the input
 * @return     start of the next line, or `eof` if the input ends first
 */
static inline const char *skip_line(const char *ch, const char *eof) {
  while (ch < eof) {
    if ((*ch == '\n' || *ch == '\r') && eol(&ch)) {
      // eol() moved ch to the last byte of the line-ending sequence
      // (e.g. the '\n' of "\r\r\n"); the next line starts one byte later.
      return ch + 1;
    }
    ch++;  // ordinary byte, or a '\r' that is not a line ending here
  }
  return ch;
}
311+
312+
304313/**
305314 * Return True iff `ch` is a valid field terminator character: either a field
306315 * separator or a newline.
@@ -352,15 +361,12 @@ static inline int countfields(const char **pch)
352361 if (sep == ' ' ) while (* ch == ' ' ) ch ++ ; // multiple sep==' ' at the start does not mean sep
353362 skip_white (& ch );
354363 if (commentChar && * ch == commentChar ) {
355- while (ch < eof && * ch != '\n' && * ch != '\r' ) ch ++ ;
356- if (ch < eof ) {
357- if (* ch == '\r' || * ch == '\n' ) {
358- eol (& ch );
359- if (ch < eof ) ch ++ ;
360- }
364+ const char * next = skip_line (ch , eof );
365+ if (next < eof ) {
366+ ch = next ;
361367 continue ; // rescan next line
362368 }
363- * pch = ch ;
369+ * pch = next ;
364370 return 0 ;
365371 }
366372 break ;
@@ -378,13 +384,7 @@ static inline int countfields(const char **pch)
378384 while (ch < eof ) {
379385 Field (& ctx );
380386 if (commentChar && * ch == commentChar ) {
381- while (ch < eof && * ch != '\n' && * ch != '\r' ) ch ++ ;
382- if (ch < eof ) {
383- if (* ch == '\r' || * ch == '\n' ) {
384- eol (& ch );
385- if (ch < eof ) ch ++ ;
386- }
387- }
387+ ch = skip_line (ch , eof );
388388 * pch = ch ;
389389 return ncol ;
390390 }
@@ -416,19 +416,16 @@ static inline const char *nextGoodLine(const char *ch, int ncol)
416416 // If this doesn't return the true line start, no matter. The previous thread will run-on and
417417 // resolve it. A good guess is all we need here. Being wrong will just be a bit slower.
418418 // If there are no embedded newlines, all newlines are true, and this guess will never be wrong.
419- while ( * ch != '\n' && * ch != '\r' && ( * ch != '\0' || ch < eof )) ch ++ ;
419+ ch = skip_line ( ch , eof );
420420 if (ch == eof ) return eof ;
421- if (eol (& ch )) // move to last byte of the line ending sequence (e.g. \r\r\n would be +2).
422- ch ++ ; // and then move to first byte of next line
423421 const char * simpleNext = ch ; // simply the first newline after the jump
424422 // if a better one can't be found, return this one (simpleNext). This will be the case when
425423 // fill=TRUE and the jump lands before 5 too-short lines, for example.
426424
427425 for (int attempts = 0 ; attempts < 5 && ch < eof ; attempts ++ ) {
428426 const char * ch2 = ch ;
429427 if (countfields (& ch2 ) == ncol ) return ch ; // returns simpleNext here on first attempt, almost all the time
430- while (* ch != '\n' && * ch != '\r' && (* ch != '\0' || ch < eof )) ch ++ ;
431- if (eol (& ch )) ch ++ ;
428+ ch = skip_line (ch , eof );
432429 }
433430 return simpleNext ;
434431}
@@ -2021,6 +2018,21 @@ int freadMain(freadMainArgs _args)
20212018 for (int jump = 0 ; jump < nJumps ; jump ++ ) {
20222019 if (jump == 0 ) {
20232020 ch = pos ;
2021+ // Skip leading comment lines before processing header
2022+ if (commentChar ) {
2023+ while (ch < eof ) {
2024+ const char * lineStart = ch ;
2025+ ch = skip_to_comment_or_nonwhite (ch );
2026+ if (ch < eof && * ch == commentChar ) {
2027+ ch = skip_line (ch , eof );
2028+ row1line ++ ;
2029+ continue ;
2030+ }
2031+ ch = lineStart ;
2032+ break ;
2033+ }
2034+ pos = ch ;
2035+ }
20242036 if (args .header != false) {
20252037 countfields (& ch ); // skip first row for type guessing as it's probably column names
20262038 row1line ++ ;
@@ -2225,7 +2237,17 @@ int freadMain(freadMainArgs _args)
22252237 if (verbose ) DTPRINT (_ ("[08] Assign column names\n" ));
22262238
22272239 ch = pos ; // back to start of first row (column names if header==true)
2228-
2240+ // Skip leading comment lines before parsing header
2241+ if (args .header != false && commentChar ) {
2242+ while (ch < eof ) {
2243+ ch = skip_to_comment_or_nonwhite (ch );
2244+ if (ch < eof && * ch == commentChar ) {
2245+ ch = skip_line (ch , eof );
2246+ } else break ;
2247+ }
2248+ pos = ch ;
2249+ colNamesAnchor = pos ;
2250+ }
22292251 if (args .header == false) {
22302252 colNames = NULL ; // userOverride will assign V1, V2, etc
22312253 } else {
@@ -2249,8 +2271,7 @@ int freadMain(freadMainArgs _args)
22492271 // skip leading whitespace to detect inline comment marker in header row
22502272 const char * commentPos = skip_to_comment_or_nonwhite (ch );
22512273 if (commentPos < eof && * commentPos == commentChar ) {
2252- ch = commentPos ;
2253- while (ch < eof && * ch != '\n' && * ch != '\r' ) ch ++ ;
2274+ ch = skip_line (commentPos , eof );
22542275 break ; // stop header parsing after comment
22552276 }
22562277 }
@@ -2264,13 +2285,23 @@ int freadMain(freadMainArgs _args)
22642285 // fast-trim trailing comment text after the header names
22652286 const char * commentPos = skip_to_comment_or_nonwhite (ch );
22662287 if (commentPos < eof && * commentPos == commentChar ) {
2267- ch = commentPos ;
2268- while (ch < eof && * ch != '\n' && * ch != '\r' ) ch ++ ;
2288+ ch = skip_line (commentPos , eof );
2289+ }
2290+ }
2291+ if (ch == eof || * ch == '\0' ) {
2292+ pos = ch ;
2293+ } else if (* ch == '\n' || * ch == '\r' ) {
2294+ if (eol (& ch )) {
2295+ if (ch < eof ) ch ++ ;
2296+ pos = ch ;
2297+ } else {
2298+ INTERNAL_STOP ("reading colnames ending on '%c'" , * ch ); // # nocov
22692299 }
2300+ } else if (ch > sof && (ch [-1 ] == '\n' || ch [-1 ] == '\r' )) {
2301+ pos = ch ;
2302+ } else {
2303+ INTERNAL_STOP ("reading colnames ending on '%c'" , * ch ); // # nocov
22702304 }
2271- if (eol (& ch )) pos = ++ ch ;
2272- else if (* ch == '\0' ) pos = ch ;
2273- else INTERNAL_STOP ("reading colnames ending on '%c'" , * ch ); // # nocov
22742305 // now on first data row (row after column names)
22752306 // when fill=TRUE and column names shorter (test 1635.2), leave calloc initialized lenOff.len==0
22762307 }
@@ -2882,7 +2913,7 @@ int freadMain(freadMainArgs _args)
28822913 } else {
28832914 const char * skippedFooter = ENC2NATIVE (ch );
28842915 // detect if it's a single line footer. Commonly the row count from SQL queries.
2885- while ( ch < eof && * ch != '\n' && * ch != '\r' ) ch ++ ;
2916+ ch = skip_line ( ch , eof ) ;
28862917 while (ch < eof && isspace (* ch )) ch ++ ;
28872918 if (ch == eof ) {
28882919 DTWARN (_ ("Discarded single-line footer: <<%s>>" ), strlim (skippedFooter , (char [500 ]) {0 }, 500 ));
0 commit comments