Skip to content

Commit 13d9b50

Browse files
committed
turned quote rule into enumeration + minor formatting improvements
1 parent cfa9f49 commit 13d9b50

File tree

1 file changed

+52
-41
lines changed

1 file changed

+52
-41
lines changed

src/fread.c

Lines changed: 52 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,17 @@ static bool eol_one_r; // only true very rarely for \r-only files
4949
// inside the field will be treated as any other regular characters.
5050
// Example: <<...,hello "world",...>>
5151
//
52-
static int quoteRule;
52+
53+
enum quoteRule
54+
{
55+
QUOTE_RULE_EMBEDDED_QUOTES_DOUBLED,
56+
QUOTE_RULE_EMBEDDED_QUOTES_ESCAPED,
57+
QUOTE_RULE_HYBRID,
58+
QUOTE_RULE_IGNORE_QUOTES,
59+
QUOTE_RULE_COUNT
60+
};
61+
62+
static enum quoteRule quoteRule;
5363
static const char* const* NAstrings;
5464
static bool any_number_like_NAstrings = false;
5565
static bool blank_is_a_NAstring = false;
@@ -75,15 +85,15 @@ static freadMainArgs args = { 0 }; // global for use by DTPRINT; static implies
7585

7686
// See header for more explanation.
7787
const char typeName[NUMTYPE][10] = { "drop", "bool8", "bool8", "bool8", "bool8", "bool8", "bool8", "int32", "int64", "float64", "float64", "float64", "int32", "float64", "string" };
78-
int8_t typeSize[NUMTYPE] = { 0, 1, 1, 1, 1, 1, 1, 4, 8, 8, 8, 8, 4, 8 , 8 };
88+
int8_t typeSize[NUMTYPE] = { 0, 1, 1, 1, 1, 1, 1, 4, 8, 8, 8, 8, 4, 8, 8 };
7989

8090
// On AIX, NAN and INFINITY don't qualify as constant literals. See PR #3043
81-
// So we assign them through below init_const_literals function.
91+
// So we assign them through the init function below.
8292
static double NAND;
8393
static double INFD;
8494

8595
// NAN and INFINITY constants are float, so cast to double once up front.
86-
static void init_const_literals(void)
96+
static void init(void)
8797
{
8898
NAND = (double)NAN;
8999
INFD = (double)INFINITY;
@@ -200,7 +210,7 @@ static inline int iminInt( int a, int b) { return a < b ? a : b; }
200210
/** Return value of `x` clamped to the range [lower, upper] */
201211
static inline int64_t clamp_i64t(int64_t x, int64_t lower, int64_t upper)
202212
{
203-
return x < lower ? lower : x > upper? upper : x;
213+
return x < lower ? lower : x > upper ? upper : x;
204214
}
205215

206216

@@ -449,7 +459,7 @@ double copyFile(size_t fileSize) // only called in very very rare cases
449459
return -1.0; // # nocov
450460
memcpy(mmp_copy, mmp, fileSize);
451461
sof = mmp_copy;
452-
eof = (char *)OFFSET_POINTER(mmp_copy, fileSize);
462+
eof = (char*)OFFSET_POINTER(mmp_copy, fileSize);
453463
return wallclock() - tt;
454464
}
455465

@@ -504,7 +514,7 @@ static void Field(FieldParseContext *ctx)
504514
if ((*ch == ' ' && stripWhite) || (*ch == '\0' && ch < eof))
505515
while(*++ch == ' ' || (*ch == '\0' && ch < eof)); // if sep==' ' the space would have been skipped already and we wouldn't be on space now.
506516
const char *fieldStart = ch;
507-
if (*ch != quote || quoteRule == 3 || quote == '\0') {
517+
if (*ch != quote || quoteRule == QUOTE_RULE_IGNORE_QUOTES || quote == '\0') {
508518
// Most common case. Unambiguously not quoted. Simply search for sep|eol. If field contains sep|eol then it should have been quoted and we do not try to heal that.
509519
while(!end_of_field(ch)) ch++; // sep, \r, \n or eof will end
510520
*ctx->ch = ch;
@@ -526,21 +536,21 @@ static void Field(FieldParseContext *ctx)
526536
const char *field_after_NA = end_NA_string(fieldStart);
527537
fieldStart++; // step over opening quote
528538
switch(quoteRule) {
529-
case 0: // quoted with embedded quotes doubled; the final unescaped " must be followed by sep|eol
539+
case QUOTE_RULE_EMBEDDED_QUOTES_DOUBLED: // quoted with embedded quotes doubled; the final unescaped " must be followed by sep|eol
530540
while (*++ch || ch < eof) {
531541
if (*ch == quote) {
532542
if (ch[1] == quote) { ch++; continue; }
533543
break; // found undoubled closing quote
534544
}
535545
}
536546
break;
537-
case 1: // quoted with embedded quotes escaped; the final unescaped " must be followed by sep|eol
547+
case QUOTE_RULE_EMBEDDED_QUOTES_ESCAPED: // quoted with embedded quotes escaped; the final unescaped " must be followed by sep|eol
538548
while (*++ch || ch < eof) {
539549
if (*ch == '\\' && (ch[1] == quote || ch[1] == '\\')) { ch++; continue; }
540550
if (*ch == quote) break;
541551
}
542552
break;
543-
case 2:
553+
case QUOTE_RULE_HYBRID:
544554
// (i) quoted (perhaps because the source system knows sep is present) but any quotes were not escaped at all,
545555
// so look for ", to define the end. (There might not be any quotes present to worry about, anyway).
546556
// (ii) not-quoted but there is a quote at the beginning so it should have been; look for , at the end
@@ -550,7 +560,7 @@ static void Field(FieldParseContext *ctx)
550560
{
551561
const char *ch2 = ch;
552562
while ((*++ch || ch < eof) && *ch != '\n' && *ch != '\r') {
553-
if (*ch == quote && end_of_field(ch + 1)) {ch2 = ch; break;} // (*1) regular ", ending; leave *ch on closing quote
563+
if (*ch == quote && end_of_field(ch + 1)) { ch2 = ch; break; } // (*1) regular ", ending; leave *ch on closing quote
554564
if (*ch == sep) {
555565
// first sep in this field
556566
// if there is a ", afterwards but before the next \n, use that; the field was quoted and it's still case (i) above.
@@ -579,7 +589,7 @@ static void Field(FieldParseContext *ctx)
579589
*ctx->ch = ch;
580590
} else {
581591
*ctx->ch = ch;
582-
if (ch == eof && quoteRule != 2) { target->off--; target->len++; } // test 1324 where final field has open quote but not ending quote; include the open quote like quote rule 2
592+
if (ch == eof && quoteRule != QUOTE_RULE_HYBRID) { target->off--; target->len++; } // test 1324 where final field has open quote but not ending quote; include the open quote like quote rule 2
583593
while(target->len > 0 && ((ch[-1] == ' ' && stripWhite) || ch[-1] == '\0')) { target->len--; ch--; } // test 1551.6; trailing whitespace in field [67,V37] == "\"\"A\"\" ST "
584594
}
585595
// Does end-of-field correspond to end-of-possible-NA?
@@ -829,7 +839,7 @@ static void parse_double_regular(FieldParseContext *ctx)
829839
*/
830840
static void parse_double_extended(FieldParseContext *ctx)
831841
{
832-
init_const_literals();
842+
init();
833843

834844
double* target = ctx->targets[sizeof(double)];
835845
const char *ch = *ctx->ch;
@@ -916,7 +926,7 @@ static void parse_double_extended(FieldParseContext *ctx)
916926
*/
917927
static void parse_double_hexadecimal(FieldParseContext *ctx)
918928
{
919-
init_const_literals();
929+
init();
920930

921931
const char *ch = *ctx->ch;
922932
double *target = ctx->targets[sizeof(double)];
@@ -1280,7 +1290,7 @@ static int detect_types(const char **pch, int ncol, bool *bumped)
12801290
dec = '\0';
12811291
}
12821292
}
1283-
while (++tmpType[field] < CT_STRING && disabled_parsers[tmpType[field]]) {};
1293+
while (++tmpType[field] < CT_STRING && disabled_parsers[tmpType[field]]);
12841294
*bumped = true;
12851295
}
12861296
if (autoDec && dec != '\0') { // double was attempted
@@ -1341,7 +1351,7 @@ int freadMain(freadMainArgs _args)
13411351
if (verbose) DTPRINT(_(" Using %d threads (omp_get_max_threads()=%d, nth=%d)\n"), nth, maxth, args.nth);
13421352
}
13431353

1344-
const uint64_t ui64 = NA_FLOAT64_I64;
1354+
static const uint64_t ui64 = NA_FLOAT64_I64;
13451355
memcpy(&NA_FLOAT64, &ui64, 8);
13461356

13471357
const int64_t nrowLimit = args.nrowLimit;
@@ -1378,7 +1388,9 @@ int freadMain(freadMainArgs _args)
13781388
}
13791389
disabled_parsers[CT_BOOL8_N] = !args.logical01;
13801390
disabled_parsers[CT_BOOL8_Y] = !args.logicalYN;
1381-
disabled_parsers[CT_ISO8601_DATE] = disabled_parsers[CT_ISO8601_TIME] = args.oldNoDateTime; // temporary new option in v1.13.0; see NEWS
1391+
disabled_parsers[CT_ISO8601_DATE] = args.oldNoDateTime; // temporary new option in v1.13.0; see NEWS
1392+
disabled_parsers[CT_ISO8601_TIME] = args.oldNoDateTime;
1393+
13821394
if (verbose) {
13831395
if (*NAstrings == NULL) {
13841396
DTPRINT(_(" No NAstrings provided.\n"));
@@ -1438,7 +1450,7 @@ int freadMain(freadMainArgs _args)
14381450
}
14391451
else if (args.filename) {
14401452
if (verbose) DTPRINT(_(" Opening file %s\n"), args.filename);
1441-
const char* fnam = args.filename;
1453+
const char *fnam = args.filename;
14421454
#ifndef WIN32
14431455
int fd = open(fnam, O_RDONLY);
14441456
if (fd == -1) STOP(_("Couldn't open file %s: %s"), fnam, strerror(errno));
@@ -1497,7 +1509,7 @@ int freadMain(freadMainArgs _args)
14971509
CloseHandle(hFile); // see https://msdn.microsoft.com/en-us/library/windows/desktop/aa366537(v=vs.85).aspx
14981510
if (mmp == NULL) {
14991511
#endif
1500-
int nbit = 8 * sizeof(char *); // #nocov
1512+
int nbit = 8 * sizeof(char*); // #nocov
15011513
STOP(_("Opened %s file ok but could not memory map it. This is a %dbit process. %s."), filesize_to_str(fileSize), nbit, // # nocov
15021514
nbit <= 32 ? _("Please upgrade to 64bit") : _("There is probably not enough contiguous virtual memory available")); // # nocov
15031515
}
@@ -1569,7 +1581,7 @@ int freadMain(freadMainArgs _args)
15691581
}
15701582
if (ch >= sof) {
15711583
const char *lastNewLine = ch; // the start of the final newline sequence.
1572-
while (++ch < eof && isspace(*ch)) {};
1584+
while (++ch < eof && isspace(*ch));
15731585
if (ch == eof) {
15741586
// yes, just whitespace after last newline. Use last newline to put final \0
15751587
eof = lastNewLine;
@@ -1705,7 +1717,7 @@ int freadMain(freadMainArgs _args)
17051717
if (verbose) DTPRINT(_(" sep='\\n' passed in meaning read lines as single character column\n"));
17061718
sep = 127; // ASCII DEL: a character different from \r, \n and \0 that isn't in the data
17071719
whiteChar = 0;
1708-
quoteRule = 3; // Ignore quoting
1720+
quoteRule = QUOTE_RULE_IGNORE_QUOTES;
17091721
ncol = 1;
17101722
int thisLine = 0;
17111723
while (ch < eof && thisLine++ < jumpLines) {
@@ -1730,13 +1742,13 @@ int freadMain(freadMainArgs _args)
17301742
//topSep = args.sep;
17311743
if (verbose) DTPRINT(_(" Using supplied sep '%s'\n"), args.sep == '\t' ? "\\t" : seps);
17321744
}
1733-
int topNumLines = 0; // the most number of lines with the same number of fields, so far
1734-
int topNumFields = 1; // how many fields that was, to resolve ties
1735-
int topQuoteRule = -1; // which quote rule that was
1736-
int topSkip = 0; // how many rows to auto-skip
1745+
int topNumLines = 0; // the most number of lines with the same number of fields, so far
1746+
int topNumFields = 1; // how many fields that was, to resolve ties
1747+
enum quoteRule topQuoteRule = -1; // which quote rule that was
1748+
int topSkip = 0; // how many rows to auto-skip
17371749
const char *topStart = NULL;
17381750

1739-
for (quoteRule = quote ? 0 : 3; quoteRule < 4; quoteRule++) { // #loop_counter_not_local_scope_ok
1751+
for (quoteRule = quote ? QUOTE_RULE_EMBEDDED_QUOTES_DOUBLED : QUOTE_RULE_IGNORE_QUOTES; quoteRule < QUOTE_RULE_COUNT; quoteRule++) { // #loop_counter_not_local_scope_ok
17401752
// quote rule in order of preference.
17411753
// when top is tied the first wins, so do all seps for the first quoteRule, then all seps for the second quoteRule, etc
17421754
for (int s = 0; s < nseps; s++) {
@@ -1746,7 +1758,7 @@ int freadMain(freadMainArgs _args)
17461758
// if (verbose) DTPRINT(_(" Trying sep='%c' with quoteRule %d ...\n"), sep, quoteRule);
17471759

17481760
if (fill) {
1749-
if (quoteRule > 1 && quote) continue; // turn off self-healing quote rule when filling
1761+
if (quoteRule > QUOTE_RULE_EMBEDDED_QUOTES_ESCAPED && quote) continue; // turn off self-healing quote rule when filling
17501762
int firstRowNcol = countfields(&ch);
17511763
int thisncol = 0, maxncol = firstRowNcol, thisRow = 0;
17521764
while (ch < eof && ++thisRow < jumpLines) { // TODO: rename 'jumpLines' to 'jumpRows'
@@ -1801,7 +1813,7 @@ int freadMain(freadMainArgs _args)
18011813
if ((thisBlockLines > topNumLines && lastncol > 1) || // more lines wins even with fewer fields, so long as number of fields >= 2
18021814
(thisBlockLines == topNumLines &&
18031815
lastncol > topNumFields && // when number of lines is tied, choose the sep which separates it into more columns
1804-
(quoteRule < 2 || quoteRule <= topQuoteRule) && // for test 1834 where every line contains a correctly quoted field contain sep
1816+
(quoteRule < QUOTE_RULE_HYBRID || quoteRule <= topQuoteRule) && // for test 1834 where every line contains a correctly quoted field contain sep
18051817
(topNumFields <= 1 || sep != ' '))) {
18061818
topNumLines = thisBlockLines;
18071819
topNumFields = lastncol;
@@ -1829,10 +1841,10 @@ int freadMain(freadMainArgs _args)
18291841
sep = topSep;
18301842
// no self healing quote rules, as we don't have >1 field to disambiguate
18311843
// choose between quote rules 0 and 1 (embedded quotes doubled vs. escaped): whichever gets furthest into the file after 100 rows
1832-
for (quoteRule = 0; quoteRule <= 1; quoteRule++) { // #loop_counter_not_local_scope_ok
1844+
for (quoteRule = QUOTE_RULE_EMBEDDED_QUOTES_DOUBLED; quoteRule <= QUOTE_RULE_EMBEDDED_QUOTES_ESCAPED; quoteRule++) { // #loop_counter_not_local_scope_ok
18331845
int thisRow = 0, thisncol = 0;
18341846
ch = pos;
1835-
while (ch < eof && ++thisRow < jumpLines && (thisncol = countfields(&ch)) >= 0) {};
1847+
while (ch < eof && ++thisRow < jumpLines && (thisncol = countfields(&ch)) >= 0);
18361848
if (thisncol < 0) continue; // invalid file; e.g. unescaped quote inside quoted field
18371849
if (!firstJumpEnd || ch > firstJumpEnd) {
18381850
firstJumpEnd = ch;
@@ -1843,7 +1855,7 @@ int freadMain(freadMainArgs _args)
18431855
}
18441856

18451857
quoteRule = topQuoteRule;
1846-
if (quoteRule > 1 && quote) {
1858+
if (quoteRule > QUOTE_RULE_EMBEDDED_QUOTES_ESCAPED && quote) {
18471859
DTWARN(_("Found and resolved improper quoting in first %d rows. If the fields are not quoted (e.g. field separator does not appear within any field), try quote=\"\" to avoid this warning."), jumpLines);
18481860
// TODO: include line number and text in warning. Could loop again with the standard quote rule to find the line that fails.
18491861
}
@@ -1890,8 +1902,8 @@ int freadMain(freadMainArgs _args)
18901902
DTPRINT(_(" File copy in RAM took %.3f seconds.\n"), time_taken);
18911903
else if (tt > 0.5) // # nocov
18921904
DTPRINT(_("Avoidable file copy in RAM took %.3f seconds. %s.\n"), time_taken, msg); // # nocov. not warning as that could feasibly cause CRAN tests to fail, say, if test machine is heavily loaded
1893-
pos = sof + (pos - (const char *)mmp);
1894-
firstJumpEnd = sof + (firstJumpEnd - (const char *)mmp);
1905+
pos = sof + (pos - (const char*)mmp);
1906+
firstJumpEnd = sof + (firstJumpEnd - (const char*)mmp);
18951907
} else {
18961908
if (verbose) DTPRINT(_(" 1-column file ends with 2 or more end-of-line. Restoring last eol using extra byte in cow page.\n"));
18971909
eof++;
@@ -2153,8 +2165,9 @@ int freadMain(freadMainArgs _args)
21532165
DTPRINT(_(" Initial alloc = %"PRId64" rows (%"PRId64" + %d%%) using bytes/max(mean-2*sd,min) clamped between [1.1*estn, 2.0*estn]\n"),
21542166
allocnrow, estnrow, (int)(100.0 * allocnrow / estnrow - 100.0));
21552167
DTPRINT(" =====\n"); // # notranslate
2168+
} else {
2169+
if (sampleLines > allocnrow) INTERNAL_STOP("sampleLines(%"PRId64") > allocnrow(%"PRId64")", sampleLines, allocnrow); // # nocov
21562170
}
2157-
if (sampleLines > allocnrow) INTERNAL_STOP("sampleLines(%"PRId64") > allocnrow(%"PRId64")", sampleLines, allocnrow); // # nocov
21582171
}
21592172
if (nrowLimit < allocnrow) {
21602173
if (verbose) DTPRINT(_(" Alloc limited to lower nrows=%"PRId64" passed in.\n"), nrowLimit);
@@ -2241,7 +2254,7 @@ int freadMain(freadMainArgs _args)
22412254
if (type[j] < tmpType[j]) {
22422255
if (strcmp(typeName[tmpType[j]], typeName[type[j]]) != 0) {
22432256
DTWARN(_("Attempt to override column %d%s%.*s%s of inherent type '%s' down to '%s' ignored. Only overrides to a higher type are currently supported. If this was intended, please coerce to the lower type afterwards."),
2244-
j + 1, colNames ? " <<" : "", colNames?(colNames[j].len) : 0, colNames ? (colNamesAnchor + colNames[j].off) : "", colNames ? ">>" : "", // #4644
2257+
j + 1, colNames ? " <<" : "", colNames ? (colNames[j].len) : 0, colNames ? (colNamesAnchor + colNames[j].off) : "", colNames ? ">>" : "", // #4644
22452258
typeName[tmpType[j]], typeName[type[j]]);
22462259
}
22472260
type[j] = tmpType[j];
@@ -2362,10 +2375,8 @@ int freadMain(freadMainArgs _args)
23622375
.threadn = me,
23632376
.quoteRule = quoteRule,
23642377
.stopTeam = &stopTeam,
2365-
#ifndef DTPY
23662378
.nStringCols = nStringCols,
23672379
.nNonStringCols = nNonStringCols
2368-
#endif
23692380
};
23702381
if ((rowSize8 && !ctx.buff8) || (rowSize4 && !ctx.buff4) || (rowSize1 && !ctx.buff1)) {
23712382
stopTeam = true;
@@ -2449,7 +2460,7 @@ int freadMain(freadMainArgs _args)
24492460
fun[IGNORE_BUMP(thisType)](&fctx);
24502461
if (*tch != sep) break;
24512462
int8_t thisSize = size[j];
2452-
if (thisSize) ((char **) targets)[thisSize] += thisSize; // 'if' for when rereading to avoid undefined NULL+0
2463+
if (thisSize) ((char**) targets)[thisSize] += thisSize; // 'if' for when rereading to avoid undefined NULL+0
24532464
tch++;
24542465
j++;
24552466
}
@@ -2463,7 +2474,7 @@ int freadMain(freadMainArgs _args)
24632474
}
24642475
else if (eol(&tch) && j < ncol) { // j<ncol needed for #2523 (erroneous extra comma after last field)
24652476
int8_t thisSize = size[j];
2466-
if (thisSize) ((char **) targets)[thisSize] += thisSize;
2477+
if (thisSize) ((char**) targets)[thisSize] += thisSize;
24672478
j++;
24682479
if (j > max_col) max_col = j;
24692480
if (j == ncol) { tch++; myNrow++; continue; } // next line. Back up to while (tch<nextJumpStart). Usually happens, fastest path
@@ -2533,7 +2544,7 @@ int freadMain(freadMainArgs _args)
25332544
// guess is insufficient out-of-sample, type is changed to negative sign and then bumped. Continue to
25342545
// check that the new type is sufficient for the rest of the column (and any other columns also in out-of-sample bump status) to be
25352546
// sure a single re-read will definitely work.
2536-
while (++absType < CT_STRING && disabled_parsers[absType]) {};
2547+
while (++absType < CT_STRING && disabled_parsers[absType]);
25372548

25382549
if(args.readInt64As != CT_INT64 && absType == CT_INT64)
25392550
thisType = TOGGLE_BUMP(args.readInt64As);
@@ -2633,7 +2644,7 @@ int freadMain(freadMainArgs _args)
26332644
ctx.nRows = myNrow;
26342645
orderBuffer(&ctx);
26352646
if (myStopEarly) {
2636-
if (quoteRule < 3) {
2647+
if (quoteRule < QUOTE_RULE_IGNORE_QUOTES) {
26372648
quoteRule++;
26382649
if (quoteRuleBumpedCh == NULL) {
26392650
// for warning message if the quote rule bump does in fact manage to heal it, e.g. test 1881

0 commit comments

Comments
 (0)