Skip to content

Commit 00eb637

Browse files
turned quote rule into enumeration + minor formatting improvements (#7222)
* turned quote rule into enumeration + minor formatting improvements * updated comments * restored overridden changes * Update src/fread.c Co-authored-by: Michael Chirico <[email protected]> * implemented suggested changes * reverted removal of braces --------- Co-authored-by: Michael Chirico <[email protected]>
1 parent b596eb1 commit 00eb637

File tree

1 file changed

+62
-49
lines changed

1 file changed

+62
-49
lines changed

src/fread.c

Lines changed: 62 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -34,22 +34,33 @@ static char quote, dec;
3434
static int linesForDecDot; // when dec='auto', track the balance of fields in favor of dec='.' vs dec=',', ties go to '.'
3535
static bool eol_one_r; // only true very rarely for \r-only files
3636

37-
// Quote rule:
38-
// 0 = Fields may be quoted, any quote inside the field is doubled. This is
39-
// the CSV standard. For example: <<...,"hello ""world""",...>>
40-
// 1 = Fields may be quoted, any quotes inside are escaped with a backslash.
41-
// For example: <<...,"hello \"world\"",...>>
42-
// 2 = Fields may be quoted, but any quotes inside will appear verbatim and
43-
// not escaped in any way. It is not always possible to parse the file
44-
// unambiguously, but we give it a try anyways. A quote will be presumed
45-
// to mark the end of the field iff it is followed by the field separator.
46-
// Under this rule eol characters cannot appear inside the field.
47-
// For example: <<...,"hello "world"",...>>
48-
// 3 = Fields are not quoted at all. Any quote characters appearing anywhere
49-
// inside the field will be treated as any other regular characters.
50-
// Example: <<...,hello "world",...>>
51-
//
52-
static int quoteRule;
37+
enum quote_rule_t
38+
{
39+
// Fields may be quoted, any quote inside the field is doubled.This is
40+
// the CSV standard. For example: <<...,"hello ""world""",...>>
41+
QUOTE_RULE_EMBEDDED_QUOTES_DOUBLED,
42+
43+
// Fields may be quoted, any quotes inside are escaped with a backslash.
44+
// For example: <<...,"hello \"world\"",...>>
45+
QUOTE_RULE_EMBEDDED_QUOTES_ESCAPED,
46+
47+
// Fields may be quoted, but any quotes inside will appear verbatim and
48+
// not escaped in any way. It is not always possible to parse the file
49+
// unambiguously, but we give it a try anyways. A quote will be presumed
50+
// to mark the end of the field iff it is followed by the field separator.
51+
// Under this rule eol characters cannot appear inside the field.
52+
// For example: <<...,"hello "world"",...>>
53+
QUOTE_RULE_EMBEDDED_QUOTES_NOT_ESCAPED,
54+
55+
// Fields are not quoted at all. Any quote characters appearing anywhere
56+
// inside the field will be treated as any other regular characters.
57+
// Example: <<...,hello "world",...>>
58+
QUOTE_RULE_IGNORE_QUOTES,
59+
60+
QUOTE_RULE_COUNT
61+
};
62+
63+
static enum quote_rule_t quoteRule;
5364
static const char* const* NAstrings;
5465
static bool any_number_like_NAstrings = false;
5566
static bool blank_is_a_NAstring = false;
@@ -75,7 +86,7 @@ static freadMainArgs args = { 0 }; // global for use by DTPRINT; static implies
7586

7687
// See header for more explanation.
7788
const char typeName[NUMTYPE][10] = { "drop", "bool8", "bool8", "bool8", "bool8", "bool8", "bool8", "int32", "int64", "float64", "float64", "float64", "int32", "float64", "string" };
78-
int8_t typeSize[NUMTYPE] = { 0, 1, 1, 1, 1, 1, 1, 4, 8, 8, 8, 8, 4, 8 , 8 };
89+
int8_t typeSize[NUMTYPE] = { 0, 1, 1, 1, 1, 1, 1, 4, 8, 8, 8, 8, 4, 8, 8 };
7990

8091
// In AIX, NAN and INFINITY don't qualify as constant literals. Refer: PR #3043
8192
// So we assign them through below init_const_literals function.
@@ -200,7 +211,7 @@ static inline int iminInt( int a, int b) { return a < b ? a : b; }
200211
/** Return value of `x` clamped to the range [upper, lower] */
201212
static inline int64_t clamp_i64t(int64_t x, int64_t lower, int64_t upper)
202213
{
203-
return x < lower ? lower : x > upper? upper : x;
214+
return x < lower ? lower : x > upper ? upper : x;
204215
}
205216

206217

@@ -449,7 +460,7 @@ double copyFile(size_t fileSize) // only called in very very rare cases
449460
return -1.0; // # nocov
450461
memcpy(mmp_copy, mmp, fileSize);
451462
sof = mmp_copy;
452-
eof = (char *)OFFSET_POINTER(mmp_copy, fileSize);
463+
eof = (char*)OFFSET_POINTER(mmp_copy, fileSize);
453464
return wallclock() - tt;
454465
}
455466

@@ -504,7 +515,7 @@ static void Field(FieldParseContext *ctx)
504515
if ((*ch == ' ' && stripWhite) || (*ch == '\0' && ch < eof))
505516
while(*++ch == ' ' || (*ch == '\0' && ch < eof)); // if sep==' ' the space would have been skipped already and we wouldn't be on space now.
506517
const char *fieldStart = ch;
507-
if (*ch != quote || quoteRule == 3 || quote == '\0') {
518+
if (*ch != quote || quoteRule == QUOTE_RULE_IGNORE_QUOTES || quote == '\0') {
508519
// Most common case. Unambiguously not quoted. Simply search for sep|eol. If field contains sep|eol then it should have been quoted and we do not try to heal that.
509520
while(!end_of_field(ch)) ch++; // sep, \r, \n or eof will end
510521
*ctx->ch = ch;
@@ -519,28 +530,28 @@ static void Field(FieldParseContext *ctx)
519530
return;
520531
}
521532
// else *ch==quote (we don't mind that quoted fields are a little slower e.g. no desire to save switch)
522-
// the field is quoted and quotes are correctly escaped (quoteRule 0 and 1)
523-
// or the field is quoted but quotes are not escaped (quoteRule 2)
524-
// or the field is not quoted but the data contains a quote at the start (quoteRule 2 too)
533+
// the field is quoted and quotes are correctly escaped (QUOTE_RULE_EMBEDDED_QUOTES_DOUBLED and QUOTE_RULE_EMBEDDED_QUOTES_ESCAPED)
534+
// or the field is quoted but quotes are not escaped (QUOTE_RULE_EMBEDDED_QUOTES_NOT_ESCAPED)
535+
// or the field is not quoted but the data contains a quote at the start (QUOTE_RULE_EMBEDDED_QUOTES_NOT_ESCAPED too)
525536
// What if this string signifies an NA? Will find out after we're done parsing quotes
526537
const char *field_after_NA = end_NA_string(fieldStart);
527538
fieldStart++; // step over opening quote
528539
switch(quoteRule) {
529-
case 0: // quoted with embedded quotes doubled; the final unescaped " must be followed by sep|eol
540+
case QUOTE_RULE_EMBEDDED_QUOTES_DOUBLED: // quoted with embedded quotes doubled; the final unescaped " must be followed by sep|eol
530541
while (*++ch || ch < eof) {
531542
if (*ch == quote) {
532543
if (ch[1] == quote) { ch++; continue; }
533544
break; // found undoubled closing quote
534545
}
535546
}
536547
break;
537-
case 1: // quoted with embedded quotes escaped; the final unescaped " must be followed by sep|eol
548+
case QUOTE_RULE_EMBEDDED_QUOTES_ESCAPED: // quoted with embedded quotes escaped; the final unescaped " must be followed by sep|eol
538549
while (*++ch || ch < eof) {
539550
if (*ch == '\\' && (ch[1] == quote || ch[1] == '\\')) { ch++; continue; }
540551
if (*ch == quote) break;
541552
}
542553
break;
543-
case 2:
554+
case QUOTE_RULE_EMBEDDED_QUOTES_NOT_ESCAPED:
544555
// (i) quoted (perhaps because the source system knows sep is present) but any quotes were not escaped at all,
545556
// so look for ", to define the end. (There might not be any quotes present to worry about, anyway).
546557
// (ii) not-quoted but there is a quote at the beginning so it should have been; look for , at the end
@@ -550,7 +561,7 @@ static void Field(FieldParseContext *ctx)
550561
{
551562
const char *ch2 = ch;
552563
while ((*++ch || ch < eof) && *ch != '\n' && *ch != '\r') {
553-
if (*ch == quote && end_of_field(ch + 1)) {ch2 = ch; break;} // (*1) regular ", ending; leave *ch on closing quote
564+
if (*ch == quote && end_of_field(ch + 1)) { ch2 = ch; break; } // (*1) regular ", ending; leave *ch on closing quote
554565
if (*ch == sep) {
555566
// first sep in this field
556567
// if there is a ", afterwards but before the next \n, use that; the field was quoted and it's still case (i) above.
@@ -579,7 +590,7 @@ static void Field(FieldParseContext *ctx)
579590
*ctx->ch = ch;
580591
} else {
581592
*ctx->ch = ch;
582-
if (ch == eof && quoteRule != 2) { target->off--; target->len++; } // test 1324 where final field has open quote but not ending quote; include the open quote like quote rule 2
593+
if (ch == eof && quoteRule != QUOTE_RULE_EMBEDDED_QUOTES_NOT_ESCAPED) { target->off--; target->len++; } // test 1324 where final field has open quote but not ending quote; include the open quote like QUOTE_RULE_EMBEDDED_QUOTES_NOT_ESCAPED
583594
while(target->len > 0 && ((ch[-1] == ' ' && stripWhite) || ch[-1] == '\0')) { target->len--; ch--; } // test 1551.6; trailing whitespace in field [67,V37] == "\"\"A\"\" ST "
584595
}
585596
// Does end-of-field correspond to end-of-possible-NA?
@@ -1341,7 +1352,7 @@ int freadMain(freadMainArgs _args)
13411352
if (verbose) DTPRINT(_(" Using %d threads (omp_get_max_threads()=%d, nth=%d)\n"), nth, maxth, args.nth);
13421353
}
13431354

1344-
const uint64_t ui64 = NA_FLOAT64_I64;
1355+
static const uint64_t ui64 = NA_FLOAT64_I64;
13451356
memcpy(&NA_FLOAT64, &ui64, 8);
13461357

13471358
const int64_t nrowLimit = args.nrowLimit;
@@ -1378,7 +1389,9 @@ int freadMain(freadMainArgs _args)
13781389
}
13791390
disabled_parsers[CT_BOOL8_N] = !args.logical01;
13801391
disabled_parsers[CT_BOOL8_Y] = !args.logicalYN;
1381-
disabled_parsers[CT_ISO8601_DATE] = disabled_parsers[CT_ISO8601_TIME] = args.oldNoDateTime; // temporary new option in v1.13.0; see NEWS
1392+
disabled_parsers[CT_ISO8601_DATE] = args.oldNoDateTime; // temporary new option in v1.13.0; see NEWS
1393+
disabled_parsers[CT_ISO8601_TIME] = args.oldNoDateTime;
1394+
13821395
if (verbose) {
13831396
if (*NAstrings == NULL) {
13841397
DTPRINT(_(" No NAstrings provided.\n"));
@@ -1438,7 +1451,7 @@ int freadMain(freadMainArgs _args)
14381451
}
14391452
else if (args.filename) {
14401453
if (verbose) DTPRINT(_(" Opening file %s\n"), args.filename);
1441-
const char* fnam = args.filename;
1454+
const char *fnam = args.filename;
14421455
#ifndef WIN32
14431456
int fd = open(fnam, O_RDONLY);
14441457
if (fd == -1) STOP(_("Couldn't open file %s: %s"), fnam, strerror(errno));
@@ -1497,7 +1510,7 @@ int freadMain(freadMainArgs _args)
14971510
CloseHandle(hFile); // see https://msdn.microsoft.com/en-us/library/windows/desktop/aa366537(v=vs.85).aspx
14981511
if (mmp == NULL) {
14991512
#endif
1500-
int nbit = 8 * sizeof(char *); // #nocov
1513+
int nbit = 8 * sizeof(char*); // #nocov
15011514
STOP(_("Opened %s file ok but could not memory map it. This is a %dbit process. %s."), filesize_to_str(fileSize), nbit, // # nocov
15021515
nbit <= 32 ? _("Please upgrade to 64bit") : _("There is probably not enough contiguous virtual memory available")); // # nocov
15031516
}
@@ -1705,7 +1718,7 @@ int freadMain(freadMainArgs _args)
17051718
if (verbose) DTPRINT(_(" sep='\\n' passed in meaning read lines as single character column\n"));
17061719
sep = 127; // ASCII DEL: a character different from \r, \n and \0 that isn't in the data
17071720
whiteChar = 0;
1708-
quoteRule = 3; // Ignore quoting
1721+
quoteRule = QUOTE_RULE_IGNORE_QUOTES;
17091722
ncol = 1;
17101723
int thisLine = 0;
17111724
while (ch < eof && thisLine++ < jumpLines) {
@@ -1730,13 +1743,13 @@ int freadMain(freadMainArgs _args)
17301743
//topSep = args.sep;
17311744
if (verbose) DTPRINT(_(" Using supplied sep '%s'\n"), args.sep == '\t' ? "\\t" : seps);
17321745
}
1733-
int topNumLines = 0; // the most number of lines with the same number of fields, so far
1734-
int topNumFields = 1; // how many fields that was, to resolve ties
1735-
int topQuoteRule = -1; // which quote rule that was
1736-
int topSkip = 0; // how many rows to auto-skip
1746+
int topNumLines = 0; // the most number of lines with the same number of fields, so far
1747+
int topNumFields = 1; // how many fields that was, to resolve ties
1748+
enum quote_rule_t topQuoteRule = -1; // which quote rule that was
1749+
int topSkip = 0; // how many rows to auto-skip
17371750
const char *topStart = NULL;
17381751

1739-
for (quoteRule = quote ? 0 : 3; quoteRule < 4; quoteRule++) { // #loop_counter_not_local_scope_ok
1752+
for (quoteRule = quote ? QUOTE_RULE_EMBEDDED_QUOTES_DOUBLED : QUOTE_RULE_IGNORE_QUOTES; quoteRule < QUOTE_RULE_COUNT; quoteRule++) { // #loop_counter_not_local_scope_ok
17401753
// quote rule in order of preference.
17411754
// when top is tied the first wins, so do all seps for the first quoteRule, then all seps for the second quoteRule, etc
17421755
for (int s = 0; s < nseps; s++) {
@@ -1746,7 +1759,7 @@ int freadMain(freadMainArgs _args)
17461759
// if (verbose) DTPRINT(_(" Trying sep='%c' with quoteRule %d ...\n"), sep, quoteRule);
17471760

17481761
if (fill) {
1749-
if (quoteRule > 1 && quote) continue; // turn off self-healing quote rule when filling
1762+
if (quoteRule > QUOTE_RULE_EMBEDDED_QUOTES_ESCAPED && quote) continue; // turn off self-healing quote rule when filling
17501763
int firstRowNcol = countfields(&ch);
17511764
int thisncol = 0, maxncol = firstRowNcol, thisRow = 0;
17521765
while (ch < eof && ++thisRow < jumpLines) { // TODO: rename 'jumpLines' to 'jumpRows'
@@ -1801,7 +1814,7 @@ int freadMain(freadMainArgs _args)
18011814
if ((thisBlockLines > topNumLines && lastncol > 1) || // more lines wins even with fewer fields, so long as number of fields >= 2
18021815
(thisBlockLines == topNumLines &&
18031816
lastncol > topNumFields && // when number of lines is tied, choose the sep which separates it into more columns
1804-
(quoteRule < 2 || quoteRule <= topQuoteRule) && // for test 1834 where every line contains a correctly quoted field contain sep
1817+
(quoteRule < QUOTE_RULE_EMBEDDED_QUOTES_NOT_ESCAPED || quoteRule <= topQuoteRule) && // for test 1834 where every line contains a correctly quoted field contain sep
18051818
(topNumFields <= 1 || sep != ' '))) {
18061819
topNumLines = thisBlockLines;
18071820
topNumFields = lastncol;
@@ -1828,8 +1841,8 @@ int freadMain(freadMainArgs _args)
18281841
ASSERT(topSep == 127, "Single column input has topSep=%d", topSep);
18291842
sep = topSep;
18301843
// no self healing quote rules, as we don't have >1 field to disambiguate
1831-
// choose quote rule 0 or 1 based on for which 100 rows gets furthest into file
1832-
for (quoteRule = 0; quoteRule <= 1; quoteRule++) { // #loop_counter_not_local_scope_ok
1844+
// choose QUOTE_RULE_EMBEDDED_QUOTES_DOUBLED or QUOTE_RULE_EMBEDDED_QUOTES_ESCAPED based on for which 100 rows gets furthest into file
1845+
for (quoteRule = QUOTE_RULE_EMBEDDED_QUOTES_DOUBLED; quoteRule <= QUOTE_RULE_EMBEDDED_QUOTES_ESCAPED; quoteRule++) { // #loop_counter_not_local_scope_ok
18331846
int thisRow = 0, thisncol = 0;
18341847
ch = pos;
18351848
while (ch < eof && ++thisRow < jumpLines && (thisncol = countfields(&ch)) >= 0) {};
@@ -1843,7 +1856,7 @@ int freadMain(freadMainArgs _args)
18431856
}
18441857

18451858
quoteRule = topQuoteRule;
1846-
if (quoteRule > 1 && quote) {
1859+
if (quoteRule > QUOTE_RULE_EMBEDDED_QUOTES_ESCAPED && quote) {
18471860
DTWARN(_("Found and resolved improper quoting in first %d rows. If the fields are not quoted (e.g. field separator does not appear within any field), try quote=\"\" to avoid this warning."), jumpLines);
18481861
// TODO: include line number and text in warning. Could loop again with the standard quote rule to find the line that fails.
18491862
}
@@ -1890,8 +1903,8 @@ int freadMain(freadMainArgs _args)
18901903
DTPRINT(_(" File copy in RAM took %.3f seconds.\n"), time_taken);
18911904
else if (tt > 0.5) // # nocov
18921905
DTPRINT(_("Avoidable file copy in RAM took %.3f seconds. %s.\n"), time_taken, msg); // # nocov. not warning as that could feasibly cause CRAN tests to fail, say, if test machine is heavily loaded
1893-
pos = sof + (pos - (const char *)mmp);
1894-
firstJumpEnd = sof + (firstJumpEnd - (const char *)mmp);
1906+
pos = sof + (pos - (const char*)mmp);
1907+
firstJumpEnd = sof + (firstJumpEnd - (const char*)mmp);
18951908
} else {
18961909
if (verbose) DTPRINT(_(" 1-column file ends with 2 or more end-of-line. Restoring last eol using extra byte in cow page.\n"));
18971910
eof++;
@@ -2241,7 +2254,7 @@ int freadMain(freadMainArgs _args)
22412254
if (type[j] < tmpType[j]) {
22422255
if (strcmp(typeName[tmpType[j]], typeName[type[j]]) != 0) {
22432256
DTWARN(_("Attempt to override column %d%s%.*s%s of inherent type '%s' down to '%s' ignored. Only overrides to a higher type are currently supported. If this was intended, please coerce to the lower type afterwards."),
2244-
j + 1, colNames ? " <<" : "", colNames?(colNames[j].len) : 0, colNames ? (colNamesAnchor + colNames[j].off) : "", colNames ? ">>" : "", // #4644
2257+
j + 1, colNames ? " <<" : "", colNames ? (colNames[j].len) : 0, colNames ? (colNamesAnchor + colNames[j].off) : "", colNames ? ">>" : "", // #4644
22452258
typeName[tmpType[j]], typeName[type[j]]);
22462259
}
22472260
type[j] = tmpType[j];
@@ -2449,7 +2462,7 @@ int freadMain(freadMainArgs _args)
24492462
fun[IGNORE_BUMP(thisType)](&fctx);
24502463
if (*tch != sep) break;
24512464
int8_t thisSize = size[j];
2452-
if (thisSize) ((char **) targets)[thisSize] += thisSize; // 'if' for when rereading to avoid undefined NULL+0
2465+
if (thisSize) ((char**) targets)[thisSize] += thisSize; // 'if' for when rereading to avoid undefined NULL+0
24532466
tch++;
24542467
j++;
24552468
}
@@ -2463,7 +2476,7 @@ int freadMain(freadMainArgs _args)
24632476
}
24642477
else if (eol(&tch) && j < ncol) { // j<ncol needed for #2523 (erroneous extra comma after last field)
24652478
int8_t thisSize = size[j];
2466-
if (thisSize) ((char **) targets)[thisSize] += thisSize;
2479+
if (thisSize) ((char**) targets)[thisSize] += thisSize;
24672480
j++;
24682481
if (j > max_col) max_col = j;
24692482
if (j == ncol) { tch++; myNrow++; continue; } // next line. Back up to while (tch<nextJumpStart). Usually happens, fastest path
@@ -2633,7 +2646,7 @@ int freadMain(freadMainArgs _args)
26332646
ctx.nRows = myNrow;
26342647
orderBuffer(&ctx);
26352648
if (myStopEarly) {
2636-
if (quoteRule < 3) {
2649+
if (quoteRule < QUOTE_RULE_IGNORE_QUOTES) {
26372650
quoteRule++;
26382651
if (quoteRuleBumpedCh == NULL) {
26392652
// for warning message if the quote rule bump does in fact manage to heal it, e.g. test 1881

0 commit comments

Comments
 (0)